Why use it
- Speed
- Coping with anti-crawler measures
Multithreading
- Complexity
    - Safety of shared resources and data: protect them with locks
    - Atomicity: some data operations are inherently mutually exclusive
    - Synchronization and waiting: wait(), notify(), notifyAll()
    - Deadlock: threads holding locks on resources the others need
    - Fault tolerance: an error in any thread can bring the whole program down
- Multithreading in Python
    - Multithreading is supported
    - Threads map directly onto native threads (Java threads are mapped onto native threads by the JVM)
    - GIL (Global Interpreter Lock): only one thread executes Python bytecode at a time, so multiple cores are barely used (effectively single-threaded for CPU-bound work)
    - Suited to IO-bound workloads rather than CPU-bound ones
    - Python is mostly used for offline data processing, not for serving online concurrent requests (which is where C++/Java are used)
- Implementation (see the thread-pool sketch after this list)
    - Create a thread pool: threads = []
    - Make sure the URL queue is thread-safe: Queue / deque
    - Take a URL off the queue and hand it to a thread: pop()/get(), threading.Thread
    - If the thread pool is full, loop and wait until some thread finishes
    - Remove finished download threads from the pool
    - Once the current level of URLs is exhausted, t.join() to wait for all threads, then start crawling the next level
- Advantages
    - Shared memory space, so data exchange is cheap
    - Good CPU utilization across multiple CPUs (in Python the GIL limits this for CPU-bound work)
    - Easy to develop
    - Low thread creation/destruction overhead
    - Download errors and blocking have less impact on overall crawl speed, so downloads go faster
    - For sites without anti-crawler limits the speedup is substantial
- Drawbacks
    - For sites with anti-crawler measures the speedup is limited
    - More complexity and higher demands on the code
    - The more threads, the less time each one gets; more frequent context switches add overhead
    - Resource contention between threads gets fiercer
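To make the implementation steps above concrete, here is a minimal, self-contained thread-pool sketch. It is not the crawler itself (that follows below); fetch(), the example.com URLs, the pool size and the sleep are placeholders for illustration.

# Minimal thread-pool sketch: a thread-safe queue of URLs, a bounded pool of
# workers, a lock protecting a shared counter, and join() at the end.
import threading
import time
from queue import Queue

url_queue = Queue()               # queue.Queue is thread-safe on its own
counter_lock = threading.Lock()   # protects the shared counter below
num_done = 0
MAX_THREADS = 4

def fetch(url):
    # placeholder for the real download; sleep stands in for network IO
    time.sleep(0.1)
    global num_done
    with counter_lock:            # shared data is only touched under the lock
        num_done += 1

def worker():
    while True:
        url = url_queue.get()
        if url is None:           # sentinel: no more work
            url_queue.task_done()
            break
        fetch(url)
        url_queue.task_done()

for u in ['http://example.com/%d' % i for i in range(20)]:
    url_queue.put(u)

threads = []
for _ in range(MAX_THREADS):
    t = threading.Thread(target=worker, daemon=True)
    t.start()
    threads.append(t)

url_queue.join()                  # wait until every queued URL is processed
for _ in threads:
    url_queue.put(None)           # tell each worker to exit
for t in threads:
    t.join()                      # the t.join() step from the list above
print('%d pages fetched' % num_done)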
multi_thread_mfw
1 # -*- coding: utf-8 -*-
2
3 import urllib3
4 import os
5 from collections import deque
6 import json
7 from lxml import etree
8 import hashlib
9 from bloom_filter import BloomFilter
10
11 import threading
12 import time
13
14 class CrawlBSF:
15 request_headers = {
16 'host': "www.mafengwo.cn",
17 'connection': "keep-alive",
18 'cache-control': "no-cache",
19 'upgrade-insecure-requests': "1",
20 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
21 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
22 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
23 }
24
25 cur_level = 0
26 max_level = 5
27 iter_width = 50
28 downloaded_urls = []
29
30 def __init__(self, url, dir_name):
31 self.dir_name = dir_name
32 self.du_md5_file_name = dir_name + '/download.txt'
33 self.du_url_file_name = dir_name + '/urls.txt'
34
35 self.bloom_downloaded_urls = BloomFilter(1024 * 1024 * 16, 0.01)
36 self.bloom_url_queue = BloomFilter(1024 * 1024 * 16, 0.01)
37
38 self.cur_queue = deque()
39 self.child_queue = deque()
40
41 self.root_url = url
42 self.cur_queue.append(url)
43 self.du_file = open(self.du_url_file_name, 'a+')
44 try:
45 self.dumd5_file = open(self.du_md5_file_name, 'r')
46 self.downloaded_urls = self.dumd5_file.readlines()
47 self.dumd5_file.close()
48 for urlmd5 in self.downloaded_urls:
49                 self.bloom_downloaded_urls.add(urlmd5.strip())
50 except IOError:
51 print( "File not found")
52 finally:
53 self.dumd5_file = open(self.du_md5_file_name, 'a+')
54
55 def enqueueUrl(self, url):
56         if url not in self.bloom_url_queue and hashlib.md5(url.encode('utf8')).hexdigest() not in self.bloom_downloaded_urls:
57 self.child_queue.append(url)
58 self.bloom_url_queue.add(url)
59
60 def dequeuUrl(self):
61 try:
62 url = self.cur_queue.popleft()
63 return url
64 except IndexError:
65 return None
66
67 def close(self):
68 self.dumd5_file.close()
69 self.du_file.close()
70
71
72 # Global variables
73 num_downloaded_pages = 0
74
75 #download the page content
76 def get_page_content(cur_url):
77 global num_downloaded_pages
78 print( "downloading %s at level %d" % (cur_url, crawler.cur_level))
79 try:
80 http = urllib3.PoolManager()
81 r = http.request('GET', cur_url, headers = CrawlBSF.request_headers)
82 html_page = r.data
83 filename = cur_url[7:].replace('/', '_')
84 fo = open("%s/%s.html" % (crawler.dir_name, filename), 'wb+')
85 fo.write(html_page)
86 fo.close()
87 except IOError as err:
88 print(err)
89 return
90 except Exception as err:
91 print( err )
92 return
93 # print( 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list')
94
95 # save page and set bloomfilter
96 dumd5 = hashlib.md5(cur_url.encode('utf8')).hexdigest()
97 crawler.downloaded_urls.append(dumd5)
98 crawler.dumd5_file.write(dumd5 + '\r\n')
99 crawler.du_file.write(cur_url + '\r\n')
100 crawler.bloom_downloaded_urls.add(dumd5)
101 num_downloaded_pages += 1
102
103 html = etree.HTML(html_page.lower().decode('utf-8'))
104 hrefs = html.xpath(u"//a")
105
106 for href in hrefs:
107 try:
108 if 'href' in href.attrib:
109 val = href.attrib['href']
110 if val.find('javascript:') != -1:
111 continue
112 if val.startswith('http://') is False:
113 if val.startswith('/'):
114 val = 'http://www.mafengwo.cn' + val
115 else:
116 continue
117 if val[-1] == '/':
118 val = val[0:-1]
119 # if hashlib.md5(val).hexdigest() not in self.downloaded_urls:
120 crawler.enqueueUrl(val)
121 # else:
122 # print( 'Skip %s' % (val))
123 except ValueError:
124 continue
125
126 def start_crawl():
127     # If this is the first page (the start url), download it synchronously (blocking) in the main thread;
128     # later pages are crawled asynchronously by spawning child threads
129 is_root_page = True
130 threads = []
131 max_threads = 10
132
133 CRAWL_DELAY = 0.6
134
135 while True:
136 url = crawler.dequeuUrl()
137         # current level exhausted: wait for all running threads, then move on to the next level
138 if url is None:
139 crawler.cur_level += 1
140 for t in threads:
141 t.join()
142 if crawler.cur_level == crawler.max_level:
143 break
144 if len(crawler.child_queue) == 0:
145 break
146 crawler.cur_queue = crawler.child_queue
147 crawler.child_queue = deque()
148 continue
149
150 # looking for an empty thread from pool to crawl
151 if is_root_page is True:
152 get_page_content(url)
153 is_root_page = False
154 else:
155 while True:
156 # first remove all finished running threads
157 for t in threads:
158 if not t.is_alive():
159 threads.remove(t)
160 if len(threads) >= max_threads:
161 time.sleep(CRAWL_DELAY)
162 continue
163 try:
164 t = threading.Thread(target=get_page_content, name=None, args=(url,))
165 threads.append(t)
166 # set daemon so main thread can exit when receives ctrl-c
167 t.setDaemon(True)
168 t.start()
169 time.sleep(CRAWL_DELAY)
170 break
171 except Exception as err:
172 print( "Error: unable to start thread", err)
173 raise
174
175 if __name__ == '__main__':
176 start_time = time.time()
177 dir_name = 'htmls'
178     # create the directory used to store downloaded pages if it does not exist yet
179 if not os.path.exists(dir_name):
180 os.makedirs(dir_name)
181
182 crawler = CrawlBSF("http://www.mafengwo.cn", dir_name)
183 start_crawl()
184 print( '%d pages downloaded, time cost %0.2f seconds' % (num_downloaded_pages, time.time()-start_time))
Multiprocessing
- Purpose
    - Keep the number of threads per process under control
    - Isolate groups of threads from each other to reduce resource contention
    - In some environments, let a single machine present multiple IPs
- Limitations
    - Does not get around the network bandwidth bottleneck
    - Pointless on a single machine with a single IP (normally combined with a distributed setup)
    - Data exchange between processes is more expensive than between threads
- Inter-process communication (IPC) (see the multiprocessing sketch after this list)
    - Pipes (PIPE)
    - Signals: complex
    - Message queues: POSIX and System V
    - Shared memory: fastest, but needs semaphores for synchronization and mutual exclusion between processes
    - Semaphores: used for synchronization
    - Sockets: easy to standardize, also work across machines
- Approaches
    - Client/Server (C/S) model
        - One server process enqueues and dequeues URLs; enqueueing checks whether the URL has already been downloaded
        - It also tracks the current crawl status and progress
        - Several crawler processes fetch URLs from the server process and send newly discovered URLs back to it
        - Sockets are used for the IPC
        - Advantages: runs fast, easy to scale out
    - Database model
        - Multiple crawler processes read the URL list from a database, using the database as the queue
        - URLs are fetched and added purely through database operations
        - Advantage: easy to develop, only the crawler program itself needs to be written
- Implementation
    - MySQLConnectionPool manages the MySQL connections shared by multiple threads
    - __init__ checks for and creates the database and table automatically when the class is instantiated
    - The Cursor class
        - SELECT ... FOR UPDATE takes a row lock so that several processes cannot dequeue the same URL
        - connection.commit(): transactions are used and autocommit is off by default, so changes must be committed explicitly
    - Table fields
        - status: download state
        - md5: MD5 hash of the URL
        - depth
        - queue_time
        - done_time
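None of the IPC mechanisms listed above are used directly by the code in this post, which goes through MySQL instead. Purely as an illustration of process-to-process message passing, here is a minimal sketch using multiprocessing.Queue (a pipe plus locks under the hood); the worker body and the example URLs are placeholders.

# Minimal IPC sketch: a parent process hands URLs to worker processes through one
# multiprocessing.Queue and reads results back through a second one.
from multiprocessing import Process, Queue

def worker(task_q, result_q):
    while True:
        url = task_q.get()
        if url is None:                  # sentinel: stop this worker
            break
        # a real worker would download here; we just echo the URL back
        result_q.put((url, 'done'))

if __name__ == '__main__':
    task_q, result_q = Queue(), Queue()
    workers = [Process(target=worker, args=(task_q, result_q)) for _ in range(3)]
    for p in workers:
        p.start()

    urls = ['http://example.com/%d' % i for i in range(9)]
    for u in urls:
        task_q.put(u)
    for _ in workers:
        task_q.put(None)                 # one sentinel per worker

    for _ in urls:
        print(result_q.get())            # collect one result per URL
    for p in workers:
        p.join()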
dbmanager
1 import mysql.connector
2 import hashlib
3 from mysql.connector import errorcode
4
5
6 class CrawlDatabaseManager:
7
8 DB_NAME = 'mfw_pro_crawl'
9
10 SERVER_IP = 'localhost'
11
12 TABLES = {}
13 # create new table, using sql
14 TABLES['urls'] = (
15 "CREATE TABLE `urls` ("
16 " `index` int(11) NOT NULL AUTO_INCREMENT," # index of queue
17 " `url` varchar(512) NOT NULL,"
18 " `md5` varchar(32) NOT NULL,"
19 " `status` varchar(11) NOT NULL DEFAULT 'new'," # could be new, downloading and finish
20 " `depth` int(11) NOT NULL,"
21 " `queue_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,"
22 " `done_time` timestamp NOT NULL DEFAULT 0 ON UPDATE CURRENT_TIMESTAMP,"
23 " PRIMARY KEY (`index`),"
24 " UNIQUE KEY `md5` (`md5`)"
25 ") ENGINE=InnoDB")
26
27
28 def __init__(self, max_num_thread):
29 # connect mysql server
30 try:
31 self.max_num_thread = max_num_thread
32 cnx = mysql.connector.connect(host=self.SERVER_IP, user='root', password='amei')
33 except mysql.connector.Error as err:
34 if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
35 print( "Something is wrong with your user name or password")
36 elif err.errno == errorcode.ER_BAD_DB_ERROR:
37 print( "Database does not exist")
38 else:
39 print( 'Create Error ' + err.msg)
40 exit(1)
41
42 cursor = cnx.cursor()
43
44 # use database, create it if not exist
45 try:
46 cnx.database = self.DB_NAME
47 except mysql.connector.Error as err:
48 if err.errno == errorcode.ER_BAD_DB_ERROR:
49 # create database and table
50 self.create_database(cursor)
51 cnx.database = self.DB_NAME
52 self.create_tables(cursor)
53 else:
54 print( err)
55 exit(1)
56 finally:
57 cursor.close()
58 cnx.close()
59
60 self.dbconfig = {
61 "database": self.DB_NAME,
62 "user": "root",
63 "host": self.SERVER_IP,
64 "password": "amei"
65 }
66
67 # self.cnxpool = mysql.connector.connect(pool_name="mypool",
68 # pool_size=max_num_thread,
69 # **dbconfig)
70
71
72     # create database
73 def create_database(self, cursor):
74 try:
75 cursor.execute(
76 "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(self.DB_NAME))
77 except mysql.connector.Error as err:
78 print( "Failed creating database: {}".format(err))
79 exit(1)
80
81 def create_tables(self, cursor):
82 for name, ddl in self.TABLES.items():
83 try:
84 cursor.execute(ddl)
85 except mysql.connector.Error as err:
86 if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
87 print( 'create tables error ALREADY EXISTS')
88 else:
89 print( 'create tables error ' + err.msg)
90 else:
91 print( 'Tables created')
92
93
94 # put an url into queue
95 def enqueueUrl(self, url, depth):
96 con = mysql.connector.connect(pool_name="mypool",
97 pool_size=self.max_num_thread,
98 **self.dbconfig)
99 cursor = con.cursor()
100 try:
101 add_url = ("INSERT INTO urls (url, md5, depth) VALUES (%s, %s, %s)")
102 data_url = (url, hashlib.md5(url.encode('utf8')).hexdigest(), depth)
103 cursor.execute(add_url, data_url)
104 # commit this transaction, please refer to "mysql transaction" for more info
105 con.commit()
106 except mysql.connector.Error as err:
107 # print( 'enqueueUrl() ' + err.msg)
108 return
109 finally:
110 cursor.close()
111 con.close()
112
113
114 # get an url from queue
115 def dequeueUrl(self):
116 con = mysql.connector.connect(pool_name="mypool",
117 pool_size=self.max_num_thread,
118 **self.dbconfig)
119 cursor = con.cursor(dictionary=True)
120 try:
121 # use select * for update to lock the rows for read
122 query = ("SELECT `index`, `url`, `depth` FROM urls WHERE status='new' ORDER BY `index` ASC LIMIT 1 FOR UPDATE")
123 cursor.execute(query)
124             row = cursor.fetchone()
125             if row is None:
126                 return None
127 update_query = ("UPDATE urls SET `status`='downloading' WHERE `index`=%d") % (row['index'])
128 cursor.execute(update_query)
129 con.commit()
130 return row
131 except mysql.connector.Error as err:
132 print( 'dequeueUrl() ' + err.msg)
133 return None
134 finally:
135 cursor.close()
136 con.close()
137
138 def finishUrl(self, index):
139 con = mysql.connector.connect(pool_name="mypool",
140 pool_size=self.max_num_thread,
141 **self.dbconfig)
142 cursor = con.cursor()
143 try:
144 # we don't need to update done_time using time.strftime('%Y-%m-%d %H:%M:%S') as it's auto updated
145 update_query = ("UPDATE urls SET `status`='done' WHERE `index`=%d") % (index)
146 cursor.execute(update_query)
147 con.commit()
148 except mysql.connector.Error as err:
149 print( 'finishUrl() ' + err.msg)
150 return
151 finally:
152 cursor.close()
153 con.close()
process_crawl
1 import urllib3
2 from collections import deque
3 import json
4 from lxml import etree
5 from bloom_filter import BloomFilter
6 import threading
7 import time
8 from dbmanager import CrawlDatabaseManager
9
10 from mysql.connector import errorcode
11 import mysql.connector
12
13 import os
14
15 request_headers = {
16 'host': "www.mafengwo.cn",
17 'connection': "keep-alive",
18 'cache-control': "no-cache",
19 'upgrade-insecure-requests': "1",
20 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
21 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
22 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
23 }
24
25 def get_page_content(cur_url, index, depth):
26 print( "downloading %s at level %d" % (cur_url, depth))
27 try:
28 http = urllib3.PoolManager()
29 r = http.request('GET', cur_url, headers=request_headers)
30 html_page = r.data
31 filename = cur_url[7:].replace('/', '_')
32 fo = open("%s%s.html" % (dir_name, filename), 'wb+')
33 fo.write(html_page)
34 fo.close()
35 dbmanager.finishUrl(index)
36     except urllib3.exceptions.HTTPError as err:
37         print('HTTPError:', err)
38         return
39     except IOError as err:
40         print('IOError:', err)
41         return
42     except Exception as err:
43         print('Exception:', err)
44         return
45 # print( 'add ' + hashlib.md5(cur_url.encode('utf8')).hexdigest() + ' to list')
46
47 html = etree.HTML(html_page.lower().decode('utf-8'))
48 hrefs = html.xpath(u"//a")
49
50 for href in hrefs:
51 try:
52 if 'href' in href.attrib:
53 val = href.attrib['href']
54 if val.find('javascript:') != -1:
55 continue
56 if val.startswith('http://') is False:
57 if val.startswith('/'):
58 val = 'http://www.mafengwo.cn' + val
59 else:
60 continue
61 if val[-1] == '/':
62 val = val[0:-1]
63 dbmanager.enqueueUrl(val, depth + 1)
64
65 except ValueError:
66 continue
67
68
69 max_num_thread = 5
70
71 # create instance of Mysql database manager, which is used as a queue for crawling
72 dbmanager = CrawlDatabaseManager(max_num_thread)
73
74 # dir for saving HTML files
75 dir_name = 'dir_process/'
76
77 if os.path.exists(dir_name) is False:
78 os.mkdir(dir_name)
79
80 # put first page into queue
81 dbmanager.enqueueUrl("http://www.mafengwo.cn", 0)
82 start_time = time.time()
83 is_root_page = True
84 threads = []
85
86 # time delay before a new crawling thread is created
87 # use a delay to control the crawling rate, avoiding visiting target website too frequently
88 # the delay throttles the download rate so the target site is not visited too frequently
89 CRAWL_DELAY = 0.6
90
91
92 while True:
93 curtask = dbmanager.dequeueUrl()
94 print ("dequeue")
95     # no more URLs in the queue: wait for the running threads to finish, then stop
96 if curtask is None:
97 print ("no task")
98 for t in threads:
99 t.join()
100 break
101
102 # looking for an empty thread from pool to crawl
103
104 if is_root_page is True:
105 get_page_content(curtask['url'], curtask['index'], curtask['depth'])
106 is_root_page = False
107 else:
108 while True:
109 # first remove all finished running threads
110 for t in threads:
111 if not t.is_alive():
112 threads.remove(t)
113 if len(threads) >= max_num_thread:
114 time.sleep(CRAWL_DELAY)
115 continue
116 try:
117 t = threading.Thread(target=get_page_content, name=None, args=(curtask['url'], curtask['index'], curtask['depth']))
118 threads.append(t)
119 # set daemon so main thread can exit when receives ctrl-c
120 t.setDaemon(True)
121 t.start()
122 time.sleep(CRAWL_DELAY)
123 break
124 except Exception as err :
125 print( "Error: unable to start thread", err )
126 raise
Distributed (multiple machines)
- With QPS < 2 a single machine is enough; a cluster is not worth it
- Evolution
    - A program -> A process -> A message -> A packet ->
    - A protocol -> A network -> A component -> A distributed system
- Benefits
    - Fault tolerance / high availability / recoverability / durability / scalability / predictability
- What a distributed crawler is for
    - Works around the target site's per-IP rate limits
    - Uses more aggregate bandwidth to speed up downloads
    - Distributed storage and backup for large-scale systems
    - Room to grow the data volume
- Characteristics of raw crawl data
    - Small files, on the order of KB
    - A very large number of files
    - Written once, appended incrementally, almost never modified
    - Read sequentially
    - Concurrent file reads and writes
    - Must be able to scale out
- Master-Slave structure
    - One master host manages all the servers (cloud service)
    - With many crawler servers, a central node manages the worker nodes
    - Controls the overall crawl
    - Shares information between crawlers
    - Balances the load
- RPC
    - Socket
        - Three-way handshake to establish the TCP connection
        - Once established, keep-alive
        - The server binds a port (ftp 21 / http 80 / MySQL 3306)
        - The client sends requests to that port on the server
        - The server handles the request and sends the result back to the client
        - Non-blocking listening: send and recv return immediately (see the selectors sketch after this list)
    - IPC payloads are serialized into a stream (binary or string) for transmission
    - With socket-based communication the clients can all run on one machine (multi-process) or be spread across many machines
- Design
    - master + client: the master manages the distributed crawl; several clients, on one machine or many, connect to it
    - master is the server-side main program, client is the crawler client
    - The heartbeat inside the client handles the low-level exchange with the master
    - socket_server and socket_client sit at the TCP layer and do the raw communication
    - MongoDB is deployed on the server
    - Clients fetch their tasks from MongoDB
    - client <-> socket_client <-> socket_server <-> master
- Modules
    - socket_server: the communication server side
    - socket_client: the communication client side
    - master: starts the server, passes in a callback that handles the different message types, and manages the clients
    - client: registers with the master, sends heartbeats, pulls tasks from the database and crawls them
    - protocol_constants: the wire protocol constants
    - mongo_mgr: MongoDB-backed task queue
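The socket_server shown later in this post accepts connections in blocking mode and hands each one to a thread; the non-blocking listening mentioned above is not used anywhere here. As a minimal sketch of that variant, built only on the standard selectors module (the port 20011 and the b'Ack' reply are invented for the example):

# Minimal non-blocking listener: sockets are registered with a selector and
# recv/accept are only called once the selector reports them ready.
import selectors
import socket

sel = selectors.DefaultSelector()

def accept(sock):
    conn, addr = sock.accept()          # the listener is ready, so this does not block
    conn.setblocking(False)
    sel.register(conn, selectors.EVENT_READ, read)

def read(conn):
    data = conn.recv(1024)              # returns immediately on a ready socket
    if data:
        conn.sendall(b'Ack')
    else:                               # empty read: peer closed the connection
        sel.unregister(conn)
        conn.close()

listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
listener.bind(('localhost', 20011))     # demo port, not the crawler's 20010
listener.listen(10)
listener.setblocking(False)
sel.register(listener, selectors.EVENT_READ, accept)

while True:
    for key, _ in sel.select():         # wait until some socket is ready
        key.data(key.fileobj)           # dispatch to accept() or read()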
master
1 import hashlib
2 import networkx as nx  # used by reorder_queue() below for PageRank
3 from socket_server import ServerSocket
4 import protocol_constants as pc
5 import json
6 import time
7 import _thread
8
9 from mongo_mgr import MongoManager
10
11 import signal
12 import sys
13
14 constants = {
15 'reorder_period': 1200, # 20 mins
16 'connection_lost_period': 30, # 30s
17 'status_check_intervel': 5, # 5 sec
18 }
19
20 class CrawlMaster:
21 clients = {}
22
23 server_status = pc.STATUS_RUNNING
24
25 last_rereoder_time = time.time()
26     is_reordering = False  # flag checked by periodical_check()
27 mongo_mgr = MongoManager()
28
29 def __init__(self, mongo_client = None, mongo_host='localhost'):
30 self.server = ServerSocket(self.on_message)
31 self.server.start()
32
33 def on_message(self, msg):
34 print( 'Heart Beat request', msg)
35 request = json.loads(msg)
36 type = request[pc.MSG_TYPE]
37 client_state = {}
38 response = {}
39 response[pc.SERVER_STATUS] = self.server_status
40 if type == pc.REGISTER:
41 client_id = self.get_free_id()
42 client_state['status'] = pc.STATUS_RUNNING
43 client_state['time'] = time.time()
44 self.clients[client_id] = client_state
45 return client_id
46 elif type == pc.UNREGISTER:
47 client_id = request.get(pc.CLIENT_ID)
48 del self.clients[client_id]
49 return json.dumps(response)
50 elif type == pc.LOCATIONS:
51 items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
52 response[pc.MSG_TYPE] = pc.LOCATIONS
53 response[pc.CRAWL_DELAY] = 2
54 response[pc.DATA] = json.dumps(items)
55 return json.dumps(response)
56 elif type == pc.TRIPLES:
57 items = self.mongo_mgr.dequeueItems(request[pc.REQUEST_SIZE])
58 response[pc.MSG_TYPE] = pc.LOCATIONS
59 response[pc.DATA] = json.dumps(items)
60 return json.dumps(response)
61
62 client_id = request.get(pc.CLIENT_ID)
63 if client_id is None:
64 response[pc.ERROR] = pc.ERR_NOT_FOUND
65 return json.dumps(response)
66 if type == pc.HEARTBEAT:
67 if self.server_status is not self.clients[client_id]['status']:
68 if self.server_status == pc.STATUS_RUNNING:
69 response[pc.ACTION_REQUIRED] = pc.RESUME_REQUIRED
70 elif self.server_status == pc.STATUS_PAUSED:
71 response[pc.ACTION_REQUIRED] = pc.PAUSE_REQUIRED
72 elif self.server_status == pc.STATUS_SHUTDOWN:
73 response[pc.ACTION_REQUIRED] = pc.SHUTDOWN_REQUIRED
74 return json.dumps(response)
75 else:
76 client_state['status'] = type
77 client_state['time'] = time.time()
78 self.clients[client_id] = client_state
79
80 return json.dumps(response)
81
82 def get_free_id(self):
83 i = 0
84 for key in self.clients:
85 if i < int(key):
86 break
87 i += 1
88 return str(i)
89
90
91 def reorder_queue(self):
92 g = nx.DiGraph()
93         cursor = self.mongo_mgr.db.urlpr.find()
94 for site in cursor:
95 url = site['url']
96 links = site['links']
97 for link in links:
98 g.add_edge(url, link)
99 pageranks = nx.pagerank(g, 0.9)
100         for url, pr in pageranks.items():
101 print( 'updating %s pr: %f' % (url, pr))
102 record = {'pr': pr}
103             self.mongo_mgr.db.mfw.update_one({'_id': hashlib.md5(url.encode('utf8')).hexdigest()}, {'$set': record}, upsert=False)
104
105
106 def periodical_check(self):
107 while True:
108 clients_status_ok = True
109
110 if self.is_reordering is False and time.time() - self.last_rereoder_time > constants['reorder_period']:
111 self.server_status = pc.STATUS_PAUSED
112 self.is_reordering = True
113
114             for cid, state in list(self.clients.items()):
115 # no heart beat for 2 mins, remove it
116 if time.time() - state['time'] > constants['connection_lost_period']:
117 # remove it from client list
118 # del client[cid]
119 # set client status to be CONNECTION_LIST
120 self.clients[cid]['status'] = pc.STATUS_CONNECTION_LOST
121 continue
122
123 if state['status'] != self.server_status:
124 clients_status_ok = False
125 break
126
127 if clients_status_ok and self.server_status == pc.STATUS_PAUSED and self.is_reordering:
128 self.reorder_queue()
129 self.last_rereoder_time = time.time()
130                 self.is_reordering = False
131 self.server_status = pc.STATUS_RUNNING
132
133 time.sleep(constants['status_check_intervel'])
134
135 def exit_signal_handler(signal, frame):
136 crawl_master.server.close()
137 sys.exit(1)
138
139 crawl_master = CrawlMaster()
140
141 _thread.start_new_thread(crawl_master.periodical_check, ())
142
143 signal.signal(signal.SIGINT, exit_signal_handler)
144 signal.pause()
client_crawler
1 from lxml import etree
2 import threading
3 import time
4 from mongo_redis_mgr import MongoRedisUrlManager
5 import argparse
6 import socket
7
8 import urllib3
9
10 import os
11
12 # from hdfs import *
13 # from hdfs.util import HdfsError
14 from socket_client import SocketClient
15 import protocol_constants as pc
16 import json
17
18 import argparse
19
20 class arguments:
21 pass
22
23 def parse_app_arguments():
24     parser = argparse.ArgumentParser(prog='CrawlerClient', description='Start a crawler client', add_help=False)  # add_help=False so '-h' can be reused for --host
25 parser.add_argument('-h', '--host', type=str, nargs=1, help='Crawler host server address, default is localhost')
26 parser.add_argument('-p', '--host-port', type=int, nargs=1, help='Crawler host server port number, default is 20100')
27 parser.add_argument('-m', '--mongo', type=str, nargs=1, help='Mongo Server address, default is localhost')
28 parser.add_argument('-n', '--mongo-port', type=int, nargs=1, help='Mongo port number, default is 27017')
29 parser.add_argument('-r', '--redis', type=str, nargs=1, help='Redis server address, default is localhost')
30 parser.add_argument('-x', '--redis-port', type=int, nargs=1, help='Redis port number, default is 6379')
31 parser.add_argument('-s', '--server', type=str, nargs=1, help='Server address for all services, including mongo, redis and spider')
32
33 args = arguments()
34
35 parser.parse_args(namespace=args)
36
37 if args.server is not None:
38 args.host = args.mongo = args.redis = args.server
39
40 if args.host is None:
41 args.host = 'localhost'
42
43 if args.mongo is None:
44 args.mongo = 'localhost'
45
46 if args.redis is None:
47 args.redis = 'localhost'
48
49 if args.host_port is None:
50 args.host_port = 9999
51
52 if args.mongo_port is None:
53 args.mongo_port = 27017
54
55 if args.redis_port is None:
56 args.redis_port = 6379
57
58 parse_app_arguments()
59
60
61 def get_page_content(cur_url, depth):
62 global dir_name, dbmanager
63
64 print( "downloading %s at level %d" % (cur_url, depth))
65 links = []
66 try:
67 http = urllib3.PoolManager()
68 r = http.request('GET', cur_url, headers = request_headers)
69 filename = cur_url[7:].replace('/', '_')
70
71 #Write page to local files system
72 fo = open("%s%s.html" % (dir_name, filename), 'wb+')
73 fo.write(r.data)
74 fo.close()
75 dbmanager.finishUrl(cur_url)
76 except IOError as err:
77 print( "get_page_content()", err )
78 raise
79 except Exception as err :
80 print( "get_page_content()", err )
81 raise
82
83 html = etree.HTML(r.data.lower().decode('utf-8'))
84 hrefs = html.xpath(u"//a")
85
86 for href in hrefs:
87 try:
88 if 'href' in href.attrib:
89 val = href.attrib['href']
90 if val.find('javascript:') != -1:
91 continue
92 if val.startswith('http://') is False:
93 if val.startswith('/'):
94 val = 'http://www.mafengwo.cn' + val
95 else:
96 continue
97 if val[-1] == '/':
98 val = val[0:-1]
99 links.append(val)
100 dbmanager.enqueueUrl(val, 'new', depth+1)
101 except ValueError:
102 continue
103
104 dbmanager.set_url_links(cur_url, links)
105
106 def heartbeat():
107 global server_status, run_heartbeat, client_id, hb_period
108 skip_wait = False
109 while run_heartbeat:
110 if skip_wait is False:
111 time.sleep(hb_period)
112 else:
113 skip_wait = False
114 try:
115 hb_request = {}
116 hb_request[pc.MSG_TYPE] = pc.HEARTBEAT
117 hb_request[pc.CLIENT_ID] = client_id
118 print("sending a heartbeat! ", str(hb_request))
119 hb_response_data = socket_client.send(json.dumps(hb_request))
120
121 # should be network error
122 if hb_response_data is None:
123 continue
124
125 # print( 'Heart Beat response', json.dumps(hb_response_data))
126 response = json.loads(hb_response_data)
127
128 err = response.get(pc.ERROR)
129 if err is not None:
130 if err == pc.ERR_NOT_FOUND:
131 register_request = {}
132 register_request[pc.MSG_TYPE] = pc.REGISTER
133 client_id = socket_client.send(json.dumps(register_request))
134
135 # skip heartbeat period and send next heartbeat immediately
136 skip_wait = True
137 heartbeat()
138 return
139 return
140
141 action = response.get(pc.ACTION_REQUIRED)
142 if action is not None:
143 action_request = {}
144                 if action == pc.PAUSE_REQUIRED:
145                     server_status = pc.STATUS_PAUSED
146                     action_request[pc.MSG_TYPE] = pc.PAUSED
147                 elif action == pc.RESUME_REQUIRED:
148                     server_status = pc.STATUS_RUNNING
149                     action_request[pc.MSG_TYPE] = pc.RESUMED
150 elif action == pc.SHUTDOWN_REQUIRED:
151 server_status = pc.SHUTDOWN
152 # stop heartbeat thread
153 return
154 action_request[pc.CLIENT_ID] = client_id
155 socket_client.send(json.dumps(action_request))
156 else:
157 server_status = response[pc.SERVER_STATUS]
158
159 except socket.error as msg:
160 print ("heartbeat error: ", msg)
161 server_status = pc.STATUS_CONNECTION_LOST
162 raise
163
164 def start_heart_beat_thread():
165 try:
166 t = threading.Thread(target=heartbeat, name=None)
167 # set daemon so main thread can exit when receives ctrl-c
168 t.setDaemon(True)
169 t.start()
170 except Exception as err:
171 print( "Error: unable to start thread", err)
172 raise
173
174 def crawl():
175 # thread pool size
176 max_num_thread = 5
177 CRAWL_DELAY = 2
178     global dbmanager, is_root_page, threads, hb_period, run_heartbeat
179
180 while True:
181 if server_status == pc.STATUS_PAUSED:
182 time.sleep(hb_period)
183 continue
184 if server_status == pc.SHUTDOWN:
185 run_heartbeat = False
186 for t in threads:
187 t.join()
188 break
189 try:
190 curtask = dbmanager.dequeueUrl()
191 except Exception:
192 time.sleep(hb_period)
193 continue
194
195         # the queue is empty right now: sleep for one heartbeat period and try again
196 if curtask is None:
197 time.sleep(hb_period)
198 continue
199 else:
200 print( 'current task is: ', curtask['url'], "at depth: ", curtask['depth'])
201
202 # looking for an empty thread from pool to crawl
203
204 if is_root_page is True:
205 get_page_content(curtask['url'], curtask['depth'])
206 is_root_page = False
207 else:
208 while True:
209 # first remove all finished running threads
210 for t in threads:
211 if not t.is_alive():
212 threads.remove(t)
213 if len(threads) >= max_num_thread:
214 time.sleep(CRAWL_DELAY)
215 continue
216 try:
217 t = threading.Thread(target=get_page_content, name=None, args=(curtask['url'], curtask['depth']))
218 threads.append(t)
219 # set daemon so main thread can exit when receives ctrl-c
220 t.setDaemon(True)
221 t.start()
222 time.sleep(CRAWL_DELAY)
223 break
224 except Exception as err:
225 print( "Error: unable to start thread", err)
226 raise
227 def finish():
228 global client_id
229 shutdown_request = {}
230 shutdown_request[pc.MSG_TYPE] = pc.SHUTDOWN
231 shutdown_request[pc.CLIENT_ID] = client_id
232 socket_client.send(json.dumps(shutdown_request))
233
234
235 def init():
236 global client_id
237
238 if os.path.exists(dir_name) is False:
239 os.mkdir(dir_name)
240 dbmanager.clear()
241 dbmanager.enqueueUrl('http://www.mafengwo.cn', 'new', 0 )
242
243 register_request = {}
244 register_request[pc.MSG_TYPE] = pc.REGISTER
245 client_id = socket_client.send(json.dumps(register_request))
246
247
248 # initialize global variables
249 request_headers = {
250 'host': "www.mafengwo.cn",
251 'connection': "keep-alive",
252 'cache-control': "no-cache",
253 'upgrade-insecure-requests': "1",
254 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
255 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
256 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
257 }
258
259
260 # Initialize system variables
261 dir_name = 'mfw/'
262
263 # db manager
264 dbmanager = MongoRedisUrlManager()
265
266 is_root_page = True
267 threads = []
268
269 # use hdfs to save pages
270 # hdfs_client = InsecureClient('http://54.223.92.169:50070', user='ec2-user')
271
272 socket_client = SocketClient('localhost', 20010)
273 client_id = 0
274
275 hb_period = 5
276 run_heartbeat = True
277 server_status = pc.STATUS_RUNNING
278
279 init()
280 start_heart_beat_thread()
281 crawl()
282 finish()
socket_server
1 import socket
2 import sys
3 import _thread
4
5 import signal
6
7 class ServerSocket:
8
9 # @param callback callback function for handling received data
10     # @param host interface to bind to; defaults to localhost
11 # @param port Arbitrary non-privileged port
12 def __init__(self, callback, host='localhost', port=20010):
13 self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
14 self.callback = callback
15 # print( 'Socket created')
16
17 #Bind socket to local host and port
18 try:
19 print("bind ", port)
20 self.s.bind((host, port ))
21 except socket.error as msg:
22 print(msg)
23 sys.exit()
24
25 # print( 'Socket bind complete')
26
27 #Start listening on socket
28 self.s.listen(10)
29 # print( 'Socket now listening')
30
31 def startlistening(self):
32 #now keep talking with the client
33 while True:
34 print( 'Waiting for new connection ... ')
35 # wait to accept a connection - blocking call
36 conn, addr = self.s.accept()
37
38 # print( 'Connected with ' + addr[0] + ':' + str(addr[1]))
39
40 #start new thread takes 1st argument as a function name to be run, second is the tuple of arguments to the function.
41 _thread.start_new_thread(self.clientthread ,(conn,))
42
43 #Function for handling connections. This will be used to create threads
44 def clientthread(self, conn):
45 #Sending message to connected client
46 # conn.send('Welcome to the server. Type something and hit enter\n') #send only takes string
47
48         # handle a single request: receive the message, dispatch it to the callback, send the reply, then close
49
50 #Receiving from client
51 data = conn.recv(1024)
52 reply = self.callback(data.decode('utf8'))
53
54 # print( 'server sends ' + reply)
55
56 conn.sendall(reply.encode('utf8'))
57
58 conn.close()
59
60
61 def start(self):
62 _thread.start_new_thread(self.startlistening, ())
63
64 def close(self):
65 # self.s.shutdown(socket.SHUT_WR)
66 self.s.close()
67
68 def msg_received(data):
69 return 'Ack'
70
71 def exit_signal_handler(signal, frame):
72 pass
73
74 if __name__ == '__main__':
75 server = ServerSocket(msg_received)
76 server.start()
77 signal.signal(signal.SIGINT, exit_signal_handler)
78 signal.pause()
79 server.close()
80 sys.exit(1)
socket_client
1 import socket
2 import sys
3
4 class SocketClient:
5 def __init__(self, server_ip, server_port):
6 self.server_ip = server_ip
7 self.server_port = server_port
8
9 self.families = self.get_constants('AF_')
10 self.types = self.get_constants('SOCK_')
11 self.protocols = self.get_constants('IPPROTO_')
12
13 # print( >>sys.stderr, 'Family :', families[sock.family])
14 # print( >>sys.stderr, 'Type :', types[sock.type])
15 # print( >>sys.stderr, 'Protocol:', protocols[sock.proto])
16 # print( >>sys.stderr)
17
18 def get_constants(self, prefix):
19 """Create a dictionary mapping socket module constants to their names."""
20 return dict( (getattr(socket, n), n)
21 for n in dir(socket)
22 if n.startswith(prefix)
23 )
24
25 def send(self, message):
26 try:
27 # Create a TCP/IP socket
28 print ("connecting to ", self.server_port)
29 self.sock = socket.create_connection((self.server_ip, self.server_port))
30 # Send data
31 print( 'connected! client sends ', message)
32 self.sock.sendall(message.encode('utf8'))
33
34 data = self.sock.recv(1024)
35
36 return data.decode('utf8')
37 except Exception as err:
38 print( 'Get Error Message: ', err ) #Error Code : ' + str(msg[0]) + ' Message ' + msg[1])
39 return None
40 finally:
41 if hasattr(self, 'sock'):
42 self.sock.close()
protocol_constants
1 # msg type, could be REGISTER, UNREGISTER and HEARTBEAT
2 MSG_TYPE = 'TYPE'
3
4 # send register
5 REGISTER = 'REGISTER'
6
7 # unregister client with id assigned by master
8 UNREGISTER = 'UNREGISTER'
9
10 # send heart beat to server with id
11 HEARTBEAT = 'HEARTBEAT'
12
13 # notify master paused with id
14 PAUSED = 'PAUSED'
15
16 # notify master resumed with id
17 RESUMED = 'RESUMED'
18
19 # notify master resumed with id
20 SHUTDOWN = 'SHUTDOWN'
21
22 # get a new location list to crawl
23 LOCATIONS = 'REQUIRE_LOCATION_LIST'
24
25 # get a new triple list to crawl
26 TRIPLES = 'TRIPLES'
27
28 DATA = 'DATA'
29
30 CRAWL_DELAY = 'CRAWL_DELAY'
31
32 # finished list of item
33 FININSHED_ITEMS = 'FINISHED_ITEMS'
34
35 # client id key word
36 CLIENT_ID = 'CLIENT_ID'
37
38 # server status key word
39 ACTION_REQUIRED = 'ACTION_REQUIRED'
40
41 # server require pause
42 PAUSE_REQUIRED = 'PAUSE_REQUIRED'
43
44 # server require pause
45 RESUME_REQUIRED = 'RESUME_REQUIRED'
46
47 # server require shutdown
48 SHUTDOWN_REQUIRED = 'SHUTDOWN_REQUIRED'
49
50 # server status key word
51 SERVER_STATUS = 'SERVER_STATUS'
52
53 # server status values
54 STATUS_RUNNING = 'STATUS_RUNNING'
55
56 STATUS_PAUSED = 'STATUS_PAUSED'
57
58 STATUS_SHUTDOWN = 'STATUS_SHUTDOWN'
59
60 STATUS_CONNECTION_LOST = 'STATUS_CONNECTION_LOST'
61
62 ERROR = 'ERROR'
63
64 # client id not found, then it needs to register itself
65 ERR_NOT_FOUND = 'ERR_NOT_FOUND'
66
67 REQUEST_SIZE = 50
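For reference, a message on the wire is simply a JSON object keyed by these constants; the actual sending and receiving is done by SocketClient and ServerSocket above. A small client-side sketch, with illustrative values:

# Illustrative only: how the constants above combine into wire messages.
import json
import protocol_constants as pc

# a client registers itself...
register_request = json.dumps({pc.MSG_TYPE: pc.REGISTER})
# ...and the master replies with a bare client id string such as '0'
client_id = '0'

# afterwards the client sends periodic heartbeats carrying its id
heartbeat = json.dumps({pc.MSG_TYPE: pc.HEARTBEAT, pc.CLIENT_ID: client_id})

# a typical master reply just reports the server status
reply = json.loads('{"SERVER_STATUS": "STATUS_RUNNING"}')
print(reply[pc.SERVER_STATUS])   # -> STATUS_RUNNING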
mongo_redis_mgr
1 import mysql.connector
2 import hashlib
3 import time
4 from datetime import datetime
5 from datetime import timedelta
6
7 import redis
8 from pymongo import MongoClient
9 from pymongo import IndexModel, ASCENDING, DESCENDING
10
11
12 class MongoRedisUrlManager:
13
14 def __init__(self, server_ip='localhost', client=None, expires=timedelta(days=30)):
15 """
16 client: mongo database client
17 expires: timedelta of amount of time before a cache entry is considered expired
18 """
19 # if a client object is not passed
20 # then try connecting to mongodb at the default localhost port
21 self.client = MongoClient(server_ip, 27017) if client is None else client
22 self.redis_client = redis.StrictRedis(host=server_ip, port=6379, db=0)
23 #create collection to store cached webpages,
24 # which is the equivalent of a table in a relational database
25 self.db = self.client.spider
26
27 # create index if db is empty
28         if self.db.mfw.count() == 0:
29 self.db.mfw.create_index('status')
30
31 def dequeueUrl(self):
32 record = self.db.mfw.find_one_and_update(
33 { 'status': 'new'},
34             { '$set': { 'status' : 'downloading'} }
35             # upsert=False and returning the pre-update document are pymongo's defaults, so no options are needed
36         )
37 if record:
38 return record
39 else:
40 return None
41
42 def enqueueUrl(self, url, status, depth):
43 num = self.redis_client.get(url)
44 if num is not None:
45 self.redis_client.set(url, int(num) + 1 )
46 return
47 self.redis_client.set(url, 1)
48 self.db.mfw.insert({
49 '_id': hashlib.md5(url.encode('utf8')).hexdigest(),
50 'url': url,
51 'status': status,
52 'queue_time': datetime.utcnow(),
53 'depth': depth
54 })
55
56 def finishUrl(self, url):
57 record = {'status': 'done', 'done_time': datetime.utcnow()}
58 self.db.mfw.update({'_id': hashlib.md5(url.encode('utf8')).hexdigest()}, {'$set': record}, upsert=False)
59
60 def clear(self):
61 self.redis_client.flushall()
62 self.db.mfw.drop()
63
64
65 def set_url_links(self, url, links):
66 try:
67 self.db.urlpr.insert({
68 '_id': hashlib.md5(url.encode('utf8')).hexdigest(),
69 'url': url,
70 'links': links
71 })
72 except Exception as err:
73 pass
mongo_mgr
1 import mysql.connector
2 import hashlib
3 import time
4 from datetime import datetime
5 from datetime import timedelta
6
7 import redis
8 from pymongo import MongoClient
9 from pymongo import IndexModel, ASCENDING, DESCENDING
10
11
12 class MongoManager:
13
14 def __init__(self, server_ip='localhost', client=None, expires=timedelta(days=30)):
15 """
16 client: mongo database client
17 expires: timedelta of amount of time before a cache entry is considered expired
18 """
19 # if a client object is not passed
20 # then try connecting to mongodb at the default localhost port
21 self.client = MongoClient(server_ip, 27017) if client is None else client
22 #create collection to store cached webpages,
23 # which is the equivalent of a table in a relational database
24 self.db = self.client.spider
25
26 # create index if db is empty
27         if self.db.locations.count() == 0:
28 self.db.mfw.create_index([("status", ASCENDING)])
29
30 def dequeueItems(self, size):
31         records = list(self.db.mfw.find({'status': 'new'}).limit(size))
32
33 ids = []
34 for record in records:
35 ids.append(record['_id'])
36
37         self.db.mfw.update_many(
38 {
39 '_id': { '$in': ids }
40 },
41 {
42 '$set': {'status': 'downloading'}
43 }
44 )
45
46 if records:
47 return records
48 else:
49 return None
50
51 def finishItems(self, ids):
52         self.db.mfw.update_many(
53 {
54 '_id': { '$in': ids }
55 },
56 {
57 '$set': {'status': 'finish'}
58 }
59 )
60
61 def clear(self):
62 self.db.mfw.drop()
63
64 if __name__ == '__main__':
65 mongo_mgr = MongoManager()
66 records = mongo_mgr.dequeueItems(5)