A typical crawler breaks down into the following steps:

1. Open the target page

2. Parse the page

3. Process/store the data and add newly discovered pages as tasks

For asynchronous crawling, a scheduler is also needed.

For a simple crawler (no complicated CAPTCHAs, and the pages are reachable with requests/urllib once the right cookies and headers are set), one opener and one parser are enough. Processing the data and generating new tasks can live directly in the parser class, and gevent can make the whole thing asynchronous, as sketched below.
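As a minimal sketch of that open/parse/store loop, assuming pages that plain requests can fetch (the fetch and parse helpers here are illustrative placeholders, not part of the project code below):

# minimal synchronous sketch of the open -> parse -> store/new-task loop
# (fetch/parse are hypothetical placeholders, not scrapy_tools code)
import requests
from lxml import html

def fetch(url):
    return requests.get(url).text              # 1. open the page

def parse(text):
    tree = html.document_fromstring(text)
    data = tree.xpath('//title/text()')        # 2. parse out the fields you need
    new_urls = tree.xpath('//a/@href')         #    ...and any follow-up links
    return data, new_urls

todo, results = ['http://httpbin.org/html'], []
while todo:
    data, new_urls = parse(fetch(todo.pop()))
    results.append(data)                       # 3. store data, queue new pages
    todo.extend(u for u in new_urls if u.startswith('http'))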

Project path: ur'D:\python_py\my_scrapy/scrapy_tools'

# Add an __init__.py under scrapy_tools so the directory can be used as a package

itemparse.py

Build the corresponding XPath structure to match the structure of the data to be extracted.


# -*- coding: utf-8 -*-
"""Created on Fri Jul 07 17:24:34 2017
@author: willowj"""
import sys
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')

import gevent
import pandas as pd
import numpy as np
from lxml import html
import time
import codecs
import json


def list_0e(list_):
    if isinstance(list_, list):
        if not list_:
            return None
        else:
            if len(list_) > 1:
                print 'warning : list>1,list[1]:', list_[1]  # , len(list_)
            return list_[0]
    else:
        return list_


class ItemParse(object):
    """docstring for zhihu_topi"""
    name = 'ItemParse'
    base_url = 'https://www.zhihu.com/topic/19551147/top-answers'
    pageN_x = '//div[@class="zm-invite-pager"]//span[last()-1]/a/text()'
    new_urls_x = None
    # below: the node containing one record, plus each field inside it
    items_node_x = '//div[@class="feed-main"]'
    # note: each record's fields are searched within its own node, so the XPaths start with '.'
    item_xs = dict(
        question_name='''.//a[@class='question_link']/text()''',
        #question_href = '''.//a[@class='question_link']/@href''',
        author='.//div[@data-action="/answer/content"]/@data-author-name',
        author_href='''.//a[@class='author-link']/@href''',
        ups_x='.//div[@class="zm-item-vote-info"]/@data-votecount',
        answers_text=".//textarea/text()",
        commentN='.//a[@name="addcomment"]/text()[last()]',
        entry_url='.//div[@data-action="/answer/content"]/@data-entry-url',
        #re:
        #z = re.compile('\.')
    )

    # URL pattern for turning pages
    def getnextpages(self):
        if self.pageN > 1:  # custom paging rule; falsy when there is only one page
            urls = [self.base_url + '?page=%s' % n
                    for n in range(self.pageN, 1, -1)]
            return urls

    def __init__(self, html_):
        #self.item_atrr_xpath()
        self.results = []
        self.new_urls = []
        self.pageN = self.update_page_n(html_)
        self.nextpages = self.getnextpages()
        self.parase(html_)

    def parase(self, html_):
        # prefer XPath, fall back to re; items that are not found become None
        etree = html.document_fromstring(html_)
        items_nodes = etree.xpath(self.items_node_x)
        #results = []
        for ee in items_nodes:
            ee_str = None
            ite = {}
            for item, itemx in self.item_xs.items():
                # re pattern
                if hasattr(itemx, 'findall'):
                    if ee_str is None:
                        ee_str = html.tostring(ee)
                    ite[item] = itemx.findall(ee_str)
                # xpath
                elif isinstance(itemx, str) or isinstance(itemx, unicode):
                    if itemx.startswith('./'):
                        ite[item] = ee.xpath(itemx)
                    else:
                        print item
                        raise ValueError('xpath does not start with "./"')
                else:
                    print item
                    raise TypeError('not an re pattern object or xpath str')
                if len(ite[item]) == 0:
                    ite[item] = None
                elif len(ite[item]) == 1:
                    ite[item] = ite[item][0]
                else:
                    ite[item] = '\n'.join([str(__i) for __i in ite[item]])
            self.results.append(ite)
        #new_url
        if self.new_urls_x:
            self.new_urls.extend(etree.xpath(self.new_urls_x))

    # find out how many pages there are
    def update_page_n(self, html_):
        if self.pageN_x:
            etree = html.document_fromstring(html_)
            pages = etree.xpath(self.pageN_x)
            pages = list_0e(pages)
            if isinstance(pages, str):
                pages = pages.strip()
            if pages and pages.isdigit():
                return int(pages)
            else:
                return 1

    # plain (synchronous) fetch of all remaining pages
    def get_nextpages(self, opener, sleep_sec=None):
        for url in self.nextpages:
            if sleep_sec:
                time.sleep(sleep_sec)
            #if not hasattr(opener, 'get')
            _re = opener.get(url)
            print _re.status_code, _re.url
            self.parase(_re.text)
            print time.time()

    # async control and the save methods live here for now
    # gevent coroutine worker
    def __gevent_get_nextpages(self, opener):
        print id(opener)
        while self.nextpages:
            #start_time = time.time()
            url = self.nextpages.pop()
            print gevent.getcurrent()
            zhihu_re = opener.get(url)
            #gevent.sleep(5)
            print zhihu_re.status_code, url
            self.parase(zhihu_re.text)
            print time.time()

    # gevent coroutine driver
    def get_nextpages_by_gevent(self, opener_class, g_n=4):
        '''param: opener_class: class that creates a page opener
           g_n: number of coroutines, 4 by default'''
        from gevent import monkey; monkey.patch_all()
        start_time = time.time()
        gs = [gevent.spawn(self.__gevent_get_nextpages, opener_class())
              for i in range(g_n)]
        gevent.joinall(gs)
        print time.time() - start_time
        self.save_to_excel()

    def save_to_excel(self, path=None):
        if path:
            save_name = path
        else:
            save_name = u'' + self.name \
                + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                + '.xlsx'
        print save_name
        result_pd = pd.DataFrame(self.results)
        print 'pd ok'
        result_pd.to_excel(u'' + save_name, encoding='gb18030')
        print 'saved to ' + save_name

    def save_to_json(self, path=None):
        if path:
            save_name = path
        else:
            save_name = u'' + self.name \
                + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                + '.json'
        print save_name
        with codecs.open(save_name, 'w', encoding='gb18030') as f:
            f.write(json.dumps(self.results))
        print 'saved to ' + save_name

To use it, subclass ItemParse and override the class attributes and the getnextpages paging method (the Dianping example below does exactly this).
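In outline, a subclass only needs to supply its own XPaths and paging rule; the values below are hypothetical placeholders, not a real site:

# outline of a subclass; the Dianping example further down is the real thing
class MyShopParse(ItemParse):
    name = 'MyShopParse'
    base_url = 'https://example.com/shop/123/reviews'    # hypothetical target
    pageN_x = "//a[@class='PageLink'][last()]/text()"     # where the page count lives
    items_node_x = '//div[@class="item"]'                 # one node per record
    item_xs = dict(title='.//h3/text()')                  # fields, relative XPaths

    def getnextpages(self):
        if self.pageN > 1:
            return [self.base_url + '?page=%s' % n
                    for n in range(self.pageN, 1, -1)]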

web_opener.py

Using requests.Session to keep the connection alive makes fetching roughly twice as fast.
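A rough way to check this for yourself (a hypothetical timing sketch against httpbin.org, not a rigorous benchmark):

# rough timing sketch: one connection per request vs. a reused keep-alive Session
import time
import requests

url = 'http://httpbin.org/get'

t0 = time.time()
for _ in range(10):
    requests.get(url)                 # sets up a new connection every time
print 'plain requests:', time.time() - t0

sess = requests.Session()             # connection is reused across calls
t0 = time.time()
for _ in range(10):
    sess.get(url)
print 'Session:       ', time.time() - t0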

For gevent asynchrony, one session is created per coroutine, so each coroutine fetches pages with its own session without interfering with the others. The method currently lives in itemparse.py.



# -*- coding: utf-8 -*-
"""Created 2017-08-17 17:22
@author: willowj"""
import sys
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')

import requests
#from requests.cookies import (
#    cookiejar_from_dict, extract_cookies_to_jar, RequestsCookieJar, merge_cookies)


class SessionFopener(object):
    """requests-based page opener
    param: headers    class attribute by default; can be passed in at instantiation
           cookie_dic disabled by default
           proxies    none by default"""
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        #'Cookie': 'q',
        #'Host': 'www.zhihu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    }

    def __init__(self, headers=None, cookie_dic=None, proxies=None):
        self.req_s = requests.Session()
        self.req_s.adapters.DEFAULT_RETRIES = 3
        self.req_s.keep_alive = True
        if headers:
            self.req_s.headers = headers
        else:
            self.req_s.headers = self.headers
        if not cookie_dic:
            cookie_dic = {}
        self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)
        if proxies:
            self.req_s.proxies = proxies

    def close(self):
        self.req_s.close()

    def get(self, *arg, **karg):
        return self.req_s.get(*arg, **karg)

    def post(self, *arg, **karg):
        return self.req_s.post(*arg, **karg)

    def set_cookiejar(self, cookie_dic={}):
        self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)

    def add_cookiejar(self, cookie_dic):
        self.req_s.cookies = requests.cookies.merge_cookies(self.req_s.cookies, cookie_dic)

    def set_headers(self, headers={}):
        self.req_s.headers = headers

    def add_headers(self, headers_dic):
        for k, v in headers_dic.items():
            self.req_s.headers[k] = v

    def set_proxies(self, proxies):
        self.req_s.proxies = proxies

    @classmethod
    def cookiejar_from_dict(cls, cookie_dic):
        return requests.cookies.cookiejar_from_dict(cookie_dic)

    def __enter__(self):
        print 'enter'
        return self

    def __exit__(self, *used):
        self.req_s.close()
        del self.req_s
        print 'exit'


if __name__ == '__main__':
    with SessionFopener() as req_o:
        res_p = req_o.get('http://httpbin.org/get')
        print res_p.json()

Example: crawling a Dianping (大众点评) shop's reviews:

You only need to subclass and override the parsing nodes and the page-turning URL pattern.

External links are not handled for now.



# -*- coding: utf-8 -*-
"""Created 2017-08-17 19:33
@author: Administrator"""
import sys
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')

sys.path.append(ur'D:\python_py\my_scrapy')
from scrapy_tools.web_opener import SessionFopener
from scrapy_tools.itemparse import ItemParse


class DzdpItemParse(ItemParse):
    """reviews of 广州酒家(文昌店)"""
    name = u'DzdpItemParse广州酒家'
    base_url = 'https://www.dianping.com/shop/516983/review_more'
    pageN_x = ".//a[@class='PageLink'][last()]/text()"
    new_urls_x = None
    # below: the node containing one record, plus each field inside it
    items_node_x = './/div[@class="comment-list"]/ul/li'
    # note: each record's fields are searched within its own node, so the XPaths start with '.'
    item_xs = dict(
        user_id='''.//*[@class="J_card"]/@user-id''',
        #question_href = '''.//a[@class='question_link']/@href''',
        comm_per=""".//span[@class='comm-per']/text()""",
        total_mark=""".//*[@class="user-info"]/span[1]/@class""",
        taste=""".//*[@class="comment-rst"]/span[1]/text()""",
        environment=""".//*[@class="comment-rst"]/span[2]/text()""",
        sevice=""".//*[@class="comment-rst"]/span[3]/text()""",
        comments_agree='''.//span[@class="heart-num"]/text()''',
        comment_text=""".//*[@class="J_brief-cont"]/text()""",
        comment_date='''.//*[@class="time"]/text()''',
        recommend_food=u'''.//*[@class="comment-recommend"
            and (contains(text(),"推荐") or contains(text(),"喜欢"))][1]/a/text()'''
        # Chinese text has to be given as unicode
        #re:
        #z = re.compile('\.')
    )

    def getnextpages(self):
        if self.pageN > 1:  # custom paging rule; falsy when there is only one page
            urls = [self.base_url + '?pageno=%s' % n
                    for n in range(self.pageN, 1, -1)]
            return urls


open_s = SessionFopener()                       # instantiate an opener
respon_ = open_s.get(DzdpItemParse.base_url)    # fetch the start page
gzjj_item = DzdpItemParse(respon_.text)         # instantiate the parser with the start page's html

# synchronous: use the plain method
gzjj_item.get_nextpages(open_s, sleep_sec=None)
# asynchronous:
#gzjj_item.get_nextpages_by_gevent(SessionFopener)   # pass the opener class, not an instance

Result: opening a single page originally took 0.5279 s; with four coroutines, 613 pages were crawled in 77.71 s, about 0.13 s per page (0.5279 / 0.13 ≈ 4), so throughput improved to roughly 4x.



200 https://www.dianping.com/shop/516983/review_more?pageno=600
1503074965.07
200 https://www.dianping.com/shop/516983/review_more?pageno=602
1503074965.1
200 https://www.dianping.com/shop/516983/review_more?pageno=601
1503074965.14
200 https://www.dianping.com/shop/516983/review_more?pageno=604
1503074965.54
200 https://www.dianping.com/shop/516983/review_more?pageno=607
1503074965.59
200 https://www.dianping.com/shop/516983/review_more?pageno=605
1503074965.64
200 https://www.dianping.com/shop/516983/review_more?pageno=606
1503074965.67
200 https://www.dianping.com/shop/516983/review_more?pageno=611
1503074966.1
200 https://www.dianping.com/shop/516983/review_more?pageno=609
1503074966.15
200 https://www.dianping.com/shop/516983/review_more?pageno=610
1503074966.18
200 https://www.dianping.com/shop/516983/review_more?pageno=608
1503074966.22
200 https://www.dianping.com/shop/516983/review_more?pageno=612
1503074966.7
200 https://www.dianping.com/shop/516983/review_more?pageno=614
1503074966.74
200 https://www.dianping.com/shop/516983/review_more?pageno=615
1503074967.05
200 https://www.dianping.com/shop/516983/review_more?pageno=613
1503074967.09
77.7100000381
DzdpItemParse广州酒家20170819_00_49.xlsx
pd ok
saved to DzdpItemParse广州酒家20170819_00_49.xlsx


For distributed multi-process crawling, or for writing results into a database, a separate scheduler and a module for handing off the data still need to be written.
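As a rough illustration of what that scheduler/storage split could look like (a hypothetical sketch using multiprocessing and sqlite3; the URLs and the parse stand-in are placeholders, and this is not part of scrapy_tools):

# hypothetical sketch: a shared URL queue feeding worker processes,
# with parsed results written to a local sqlite database
import sqlite3
from multiprocessing import Process, JoinableQueue

def worker(task_q, db_path):
    conn = sqlite3.connect(db_path)
    conn.execute('CREATE TABLE IF NOT EXISTS items (url TEXT, data TEXT)')
    while True:
        url = task_q.get()
        if url is None:                          # poison pill: stop this worker
            task_q.task_done()
            break
        data = 'parsed:' + url                   # stand-in for open + parse
        conn.execute('INSERT INTO items VALUES (?, ?)', (url, data))
        conn.commit()
        task_q.task_done()
    conn.close()

if __name__ == '__main__':
    q = JoinableQueue()
    workers = [Process(target=worker, args=(q, 'items.db')) for _ in range(4)]
    for w in workers:
        w.start()
    for n in range(1, 11):
        q.put('https://example.com/page/%s' % n) # hypothetical task URLs
    for _ in workers:
        q.put(None)
    q.join()
    for w in workers:
        w.join()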