At its core, a crawler simulates a client making frequent requests to a server, receives the responses, and parses and processes the response data. The conventional serial approach runs synchronously and blocks: one task must finish completely before the next can start, which is very inefficient. The focused crawlers used most often are relatively IO-heavy (and therefore blocking) when handling data, so an asynchronous solution is worth considering.
1. Synchronous serial: after a task is submitted, nothing else runs until that task finishes and returns its result; only then does the next one start. This is quite inefficient!
```python
'''
Synchronous serial crawling
'''
import os
import time
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]

os.makedirs('response', exist_ok=True)  # make sure the output directory exists

# synchronous serial crawl: fetch one page at a time
start = time.time()
for url_info in urls:
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)
t = time.time() - start
print(t)  # 4.652341365814209
```
```python
'''
Asynchronous crawling with multiple processes
'''
import os
import time
import requests
from multiprocessing import Process

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]


def get_html(url_info):
    print(os.getppid(), os.getpid())  # parent PID and this worker's own PID
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    p_list = []
    for url_info in urls:
        # one process per task
        p = Process(target=get_html, args=(url_info,))
        p_list.append(p)
        p.start()
    # block until every process has finished
    for p in p_list:
        p.join()
    t = time.time() - start
    print(t)  # 3.1241235733032227
```
```python
'''
Asynchronous crawling with a process pool
'''
import os
import time
import requests
from concurrent.futures import ProcessPoolExecutor

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]


def get_html(url_info):
    print(os.getppid(), os.getpid())
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    return response, path


def done_callback(task):
    # runs in the parent process once the future completes
    response, path = task.result()
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    ps = ProcessPoolExecutor(8)
    for url_info in urls:
        task = ps.submit(get_html, url_info)
        task.add_done_callback(done_callback)
    ps.shutdown()  # wait for all submitted tasks to finish
    t = time.time() - start
    print(t)  # 3.589127779006958
```
```python
'''
Asynchronous crawling with multiple threads
'''
import os
import time
import requests
from threading import Thread, active_count

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]


def get_html(url_info):
    print(os.getppid(), os.getpid())  # all threads share the same process IDs
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    t_list = []
    for url_info in urls:
        # one thread per task
        t = Thread(target=get_html, args=(url_info,))
        t_list.append(t)
        t.start()
    print(active_count())  # number of live threads
    # block until every thread has finished
    for t in t_list:
        t.join()
    t = time.time() - start
    print(t)  # 1.2163612842559814
```
```python
'''
Asynchronous crawling with a thread pool
'''
import os
import time
import requests
from threading import active_count
from concurrent.futures import ThreadPoolExecutor

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]


def get_html(url_info):
    print(os.getppid(), os.getpid())
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    return response, path


def done_callback(task):
    # runs in a pool thread once the future completes
    response, path = task.result()
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    ts = ThreadPoolExecutor(8)
    for url_info in urls:
        task = ts.submit(get_html, url_info)
        task.add_done_callback(done_callback)
    print(active_count())  # number of live threads
    ts.shutdown()  # wait for all submitted tasks to finish
    t = time.time() - start
    print(t)  # 1.2402942180633545
```
Another kind of thread pool:
```python
'''
Asynchronous thread pool via multiprocessing.dummy.Pool
'''
import os
import time
import requests
from multiprocessing.dummy import Pool  # despite the name, this is a pool of threads

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
urls = [
    ('百度知道', 'https://zhidao.baidu.com/daily?fr=daohang'),
    ('新浪', 'https://www.sina.com.cn/'),
    ('腾讯网', 'https://www.qq.com/'),
    ('简书', 'https://www.jianshu.com/'),
    ('今日头条', 'https://www.toutiao.com/'),
    ('新浪财经', 'https://finance.sina.com.cn/'),
    ('东方财富', 'http://www.eastmoney.com/'),
    ('襄阳家教网', 'http://www.jiajiao100.com/'),
]


def get_html(url_info):
    print(os.getppid(), os.getpid())
    url = url_info[1]
    path = os.path.join('response', url_info[0] + '.html')
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    os.makedirs('response', exist_ok=True)  # make sure the output directory exists
    start = time.time()
    pool = Pool(8)
    pool.map(get_html, urls)  # blocks until every task has completed
    t = time.time() - start
    print(t)  # 0.7495629787445068
```
Comparison: in these test runs the process-based approaches took the longest among the asynchronous variants, while the thread-based approaches were the fastest. Although opening multiple processes or threads does speed up multi-task workloads, real runs are constrained by other factors (process startup cost, network latency, and so on) and may fall short of the expected gains.
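For reference, here are the single-run timings printed by the examples above. They depend heavily on network conditions and hardware, so treat them as rough indicators only:

- Synchronous serial: 4.65 s
- Multiprocessing (one process per task): 3.12 s
- ProcessPoolExecutor(8): 3.59 s
- Multithreading (one thread per task): 1.22 s
- ThreadPoolExecutor(8): 1.24 s
- multiprocessing.dummy Pool(8): 0.75 s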
2. Working around blocking with asynchronous processing is the general idea. Next, compare asynchronous multiprocessing/process pools with multithreading/thread pools:
(1) Solving synchronous calls with multithreading/multiprocessing
Benefit: use multiple threads (or processes) on the server side. The point of multithreading (or multiprocessing) is to give every connection its own thread (or process), so that a block on any one connection does not affect the others.
- Drawback: we cannot spawn processes or threads without limit. When the system must respond to hundreds or thousands of simultaneous connection requests, both multithreading and multiprocessing eat heavily into system resources and degrade the system's responsiveness, and the threads and processes themselves become more prone to hanging. One common mitigation is to cap concurrency with a semaphore, as sketched below.
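As a minimal sketch of that mitigation, the following caps how many threads may run a request at once. The limit of 4 and the helper name `fetch_limited` are illustrative choices, not from the original code:

```python
import requests
from threading import Thread, BoundedSemaphore

MAX_CONCURRENCY = 4  # illustrative cap; tune to your workload
sem = BoundedSemaphore(MAX_CONCURRENCY)


def fetch_limited(url):
    # at most MAX_CONCURRENCY threads hold the semaphore, so at most
    # that many requests are in flight at the same time
    with sem:
        return requests.get(url, timeout=10)


threads = [Thread(target=fetch_limited, args=(u,))
           for u in ['https://www.sina.com.cn/', 'https://www.qq.com/']]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

This bounds resource usage even when the number of submitted tasks is large, at the cost of queueing the excess work.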
(2) Solving synchronous calls with thread/process pools
Benefit: many programmers reach for a "thread pool" or "connection pool". A thread pool aims to reduce how often threads are created and destroyed: it maintains a reasonable number of threads and hands new tasks to idle ones, which keeps system overhead low.
- Drawback: thread pools and connection pools only ease, to a degree, the resource cost of frequent IO calls. A "pool" always has an upper bound; once requests far exceed that bound, a pooled system responds little better than one without a pool. So anyone using a pool must consider the scale of requests it will face and size the pool accordingly.
A thread pool or connection pool may relieve some of the pressure, but it cannot solve everything. In short, the multithreading model handles small-scale request loads conveniently and efficiently, but it hits a bottleneck at large scale: while blocked on IO it cannot make full use of the CPU, so non-blocking interfaces are needed to solve this problem.
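As a pointer in that direction, here is a minimal non-blocking sketch of the same kind of crawl using coroutines. It assumes the third-party aiohttp package is installed (`pip install aiohttp`); the function names and the URL subset are illustrative, not part of the original post:

```python
'''
Asynchronous coroutine crawling (non-blocking IO) -- a sketch, not from the original post
'''
import asyncio
import aiohttp

urls = [
    'https://www.sina.com.cn/',
    'https://www.qq.com/',
    'https://finance.sina.com.cn/',
]


async def fetch(session, url):
    # await yields control while waiting on the network, so a single
    # thread can keep many requests in flight at once
    async with session.get(url) as response:
        return await response.text()


async def main():
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, u) for u in urls))
        for url, page in zip(urls, pages):
            print(url, len(page))


if __name__ == '__main__':
    asyncio.run(main())
```

Unlike the thread and process variants above, this version never blocks a worker on IO: the event loop switches between coroutines whenever one is waiting on the network.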