一: requests模块的基本使用:
- response.text存在编码问题,原因是requests底层会自己推导编码,然后进行解码,如果推导出的编码和实际编码不一致就会出现乱码,需要提前指定:response.encoding = "utf-8"
- response.content获取的是二进制类型(bytes),需要自行解码:response.content.decode('gbk')
import requests
url = "https://www.baidu.com"
response = requests.get(url)

# Option 1: let requests decode the body via response.text, pinning the
# encoding first so a wrongly guessed charset cannot garble the text.
# response.encoding = "utf-8"
# print(response.text)

# Option 2: take the raw bytes from response.content and decode them
# explicitly. bytes.decode() defaults to UTF-8.
html = response.content.decode()
print(html)

# Save the explicitly decoded text rather than response.text, which depends
# on whatever charset requests guessed for this response. The file encoding
# is pinned as well, because open() otherwise uses the platform default.
with open("baidu.html", "w", encoding="utf-8") as f:
    f.write(html)
1:response响应的其他常用属性和方法:
import requests
url = "http://www.baidu.com"
response = requests.get(url)

# 1: URL of the response
print(response.url)

# 2: URL of the request
print(response.request.url)

# 3: cookies set by the response
print(response.cookies)

# 4: cookies sent with the request
# NOTE: _cookies is a private attribute of the prepared request.
print(response.request._cookies)

# 5: response headers
print(response.headers)

# 6: request headers
print(response.request.headers)

# 7: parse the response body as JSON into Python objects.
# This page returns HTML, so decoding fails; the decoding error raised by
# requests is a ValueError subclass, which is caught here so the script
# finishes instead of ending in a traceback.
try:
    print(response.json())
except ValueError:
    print("response body is not valid JSON")
http://www.baidu.com/
http://www.baidu.com/
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
<RequestsCookieJar[]>
{'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 'Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html', 'Date': 'Wed, 18 Nov 2020 15:10:45 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:57 GMT', 'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
{'User-Agent': 'python-requests/2.25.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
2:练习—抓取一张网络上的图片:
import requests
url = "https://img.alicdn.com/tfs/TB1MaLKRXXXXXaWXFXXXXXXXXXX-480-260.png"
response = requests.get(url)

# An image is binary data: open the target file in binary mode and store the
# raw bytes as they arrived, without decoding them.
image_bytes = response.content
with open("图片1.png", "wb") as image_file:
    image_file.write(image_bytes)
3:携带请求头发送请求:
import requests
url = "https://www.baidu.com"

# Send a browser-style User-Agent header with the request.
user_agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
headers = {'User-Agent': user_agent}

response = requests.get(url, headers=headers)

# Store the raw response bytes; binary mode needs no decoding step.
page_bytes = response.content
with open("百度.html", "wb") as page_file:
    page_file.write(page_bytes)
4:发送带参数请求:
# Two ways to send query parameters:
#   1. embed them directly in the URL
#   2. build a dict of parameters and pass it via `params` (used below)
url = "https://www.baidu.com/s?"

user_agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
headers = {'User-Agent': user_agent}
params_dict = {"wd": "python"}

response = requests.get(url, params=params_dict, headers=headers)

# Show the decoded page, then keep the raw bytes on disk.
body = response.content
print(body.decode())
with open("python.html", 'wb') as out_file:
    out_file.write(body)
二: requests模块的深入处理:
- 发送post请求,requests.post(self.url, headers=self.headers, data=self.parames),注意里面是data。
- 反向代理: 浏览器不知道目标服务器的ip,但是知道nginx的ip, 由nginx转发目标服务器。
- 正向代理:为了伪装客户端的ip地址,先将请求发送给代理服务器,再由代理服务器转发给目标服务器,目标服务器很难识别客户端的地址信息。
1: 金山翻译爬虫:
import json
import requests
class JinshanSpider(object):
    """Client for the iciba (Jinshan) translation AJAX endpoint.

    Posts a word as form data to the endpoint and extracts the translated
    text from the JSON reply.
    """

    def __init__(self, word=None):
        """Store the word to translate and prepare the request settings.

        Args:
            word: Text to send for translation.
        """
        # No trailing comma after `word`: `self.word = word,` would store a
        # 1-tuple instead of the string itself.
        self.word = word
        self.url = "http://fy.iciba.com/ajax.php?a=fy"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'}
        # Form fields sent in the POST body; 'w' carries the word.
        # NOTE(review): 'f' and 't' presumably select the source and target
        # language — confirm against the endpoint.
        self.parames = {
            'f': 'auto',
            't': 'auto',
            'w': self.word
        }

    def send_request(self):
        """POST the form data and return the response body decoded as UTF-8."""
        response = requests.post(self.url, headers=self.headers, data=self.parames)
        return response.content.decode()

    def get_message(self, response):
        """Extract the translation from a JSON response string.

        Args:
            response: JSON text as returned by send_request().

        Returns:
            The value stored under content -> out in the parsed JSON.

        Raises:
            ValueError: If the text is not valid JSON.
            KeyError: If the 'content' or 'out' key is missing.
        """
        response_dict = json.loads(response)
        return response_dict['content']['out']

    def run(self):
        """Send the request, parse the reply and return the translation."""
        response = self.send_request()
        return self.get_message(response)
if __name__ == '__main__':
    # Translate a sample word and print the result.
    spider = JinshanSpider('牛')
    print(spider.run())
2: 代理:
- 透明代理
- 匿名代理
- 高匿代理
import requests
url = "https://www.baidu.com"

headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36',
}

# requests selects the proxy by the scheme of the target URL. The URL above
# is https, so an "https" entry is required; with only an "http" entry this
# request would not go through the proxy at all.
proxies = {
    "http": "http://123.57.84.116:8118",
    "https": "http://123.57.84.116:8118",
}

response = requests.get(url, headers=headers, proxies=proxies)
print(response.content.decode())
3:cookie
案例:模拟码云登录:
import requests
url = "https://gitee.com/ren_shan_wen"

# Carry the login state by sending the cookie string as a plain request
# header next to the User-Agent.
user_agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36'
headers = {
    'User-Agent': user_agent,
    'Cookie': '里面是cookie',
}

response = requests.get(url, headers=headers)

# Keep the raw page bytes on disk.
page_bytes = response.content
with open("github.html", "wb") as page_file:
    page_file.write(page_bytes)
import requests
url = "https://gitee.com/ren_shan_wen"

headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36',
}

cookie_str = '里面是cookie'

# Turn "k1=v1; k2=v2" into {"k1": "v1", "k2": "v2"}:
#   - split on "; " to get one "name=value" fragment per cookie
#   - split each fragment on the FIRST "=" only, so values that themselves
#     contain "=" are kept whole
#   - a fragment without "=" becomes a name with an empty value rather than
#     raising IndexError
cookie_dict = {
    name: value
    for name, _, value in (pair.partition("=") for pair in cookie_str.split("; "))
}

response = requests.get(url, headers=headers, cookies=cookie_dict)

with open("github.html", "wb") as f:
    f.write(response.content)
上面代码有问题:cookie如果过期,那么需要手动更换。
思路:
1:首先,向登录页面发送请求,获取登录页面中的token值。
2:携带上一步请求得到的token值,向目标地址发送登录请求,此时GitHub后台向前端发送的cookie信息会被我们代码中的Session对象保存起来。
3:向个人中心页面发送请求,保存个人中心页面。(Session对象中的Cookie信息起作用了)
import os
import re

import requests

# Flow of this script:
#   1. GET https://github.com/login and keep the response.
#   2. Extract the authenticity_token value from the page with a regex.
#   3. Build the login form body.
#   4. POST the form to https://github.com/session to log in.
#      4.1 Cookies set by the responses are stored on the session object
#          and sent with later requests made through it.
#   5. GET the profile page https://github.com/TmacChenQian.
#      5.1 Save the page to a local HTML file.

# 1. Request the login page.
start_url = "https://github.com/login"

# 1.1 A session object keeps cookies between requests.
session = requests.session()

# 1.2 Request headers.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
}
response = session.get(start_url, headers=headers)

# 1.3 Decode the response body to text.
response_str = response.content.decode()

# 2. Extract the authenticity_token value.
#    (...)    capture group holding the token
#    .*?      non-greedy, stops at the first closing quote
#    group(1) returns the text captured by the first group
match = re.search(r'name="authenticity_token" value="(.*?)" />', response_str)
if match is None:
    # Fail with a clear message rather than an AttributeError on None when
    # the login page does not contain the expected field.
    raise RuntimeError("authenticity_token not found in the GitHub login page")
token = match.group(1)
print(token)

# 3. Build the login form body.
# Credentials are read from the environment so that no account name or
# password is stored in the source file. Set GITHUB_LOGIN and
# GITHUB_PASSWORD before running; a missing variable raises KeyError.
post_body = {
    "commit": "Sign in",
    "authenticity_token": token,
    "ga_id": "",
    "login": os.environ["GITHUB_LOGIN"],
    "password": os.environ["GITHUB_PASSWORD"],
    "webauthn-support": "supported",
    "webauthn-iuvpaa-support": "supported",
}

# 4. POST the form to log in.
login_url = "https://github.com/session"
session.post(login_url, headers=headers, data=post_body)
# 4.1 On success the session now holds the login cookies.

# 5. Request the profile page through the same session.
profile_url = "https://github.com/TmacChenQian"
profile_response = session.get(profile_url, headers=headers)

# 5.1 Save the page locally. The file encoding is pinned to UTF-8 because
# open() otherwise uses the platform default, which may not be able to
# encode the decoded page text.
with open("github2.html", "w", encoding="utf-8") as f:
    f.write(profile_response.content.decode())
# <title>TmacChenQian (Ai1en)</title> in the saved page indicates that the
# profile page was fetched successfully.