xpath及response.text返回数据乱码问题处理
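1. 在读取 response.text 之前,将 response.encoding 设置为页面的实际编码,例如 response.encoding = 'utf-8'(GBK 页面则设置为 'gbk')
2. 对已经乱码的字符串使用 .encode('iso-8859-1').decode('gbk') 重新解码(响应头未声明 charset 时,requests 默认按 ISO-8859-1 解码文本)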
案例
from lxml import etree
import requests
import os
# Directory that receives the downloaded images.
dirName = "img_dir"
# exist_ok=True makes creation idempotent and removes the race between a
# separate existence check and the mkdir call.
os.makedirs(dirName, exist_ok=True)
#url = "https://pic.netbian.com/4kfengjing/"
# Request headers carrying a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/97.0.4692.99 Safari/537.36"
    ),
}
# Crawl listing pages 1 and 2 of the 4kfengjing category. For every list item,
# open its detail page, locate the image URL there, and save the image bytes
# into ``dirName`` under the item's title.
BASE_URL = "https://pic.netbian.com"
# Seconds to wait per request; without a timeout requests.get() can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

for page in range(1, 3):
    # The first listing page has no index suffix; later pages use index_<n>.html.
    if page == 1:
        url = f"{BASE_URL}/4kfengjing/"
    else:
        url = f"{BASE_URL}/4kfengjing/index_{page}.html"

    # Fetch the listing page source.
    print(f"----------正在进行第{page}页的数据处理------------")
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    # The encoding must be set before .text is read so the titles decode as GBK.
    response.encoding = "gbk"
    tree = etree.HTML(response.text)

    # Extract each item's title and the link to its detail page.
    for li in tree.xpath('//div[@class="slist"]/ul/li'):
        titles = li.xpath('./a/b/text()')
        hrefs = li.xpath('./a/@href')
        if not titles or not hrefs:
            # Items without the expected nodes would otherwise raise
            # IndexError on the [0] lookup; skip them explicitly.
            print(f"第{page}页存在无法解析的条目,已跳过")
            continue
        li_name = titles[0] + ".jpg"
        li_small_src = BASE_URL + hrefs[0]

        # The detail page holds the image URL that is actually downloaded.
        detail = requests.get(
            li_small_src, headers=headers, timeout=REQUEST_TIMEOUT
        )
        detail.raise_for_status()
        tree_b = etree.HTML(detail.text)
        img_srcs = tree_b.xpath(
            '//div[@id="main"]/div[2]/div[1]/div[2]/a/img/@src'
        )
        if not img_srcs:
            print(li_name, "未找到图片地址,已跳过")
            continue
        li_true_src = BASE_URL + img_srcs[0]

        img_data = requests.get(
            li_true_src, headers=headers, timeout=REQUEST_TIMEOUT
        )
        # Do not write an HTTP error body to disk as if it were an image.
        img_data.raise_for_status()

        # The title comes from the remote page; neutralize path separators so
        # it cannot point outside dirName or into a missing subdirectory.
        safe_name = li_name.replace("/", "_").replace("\\", "_")
        img_path = os.path.join(dirName, safe_name)
        with open(img_path, "wb") as fp:
            fp.write(img_data.content)
        print(li_name, "已完成数据获取!")