BeautifulSoup is a Python library for parsing HTML and XML documents. It differs from scrapy in that scrapy is a ready-made framework: you mostly fill in the blanks of a fixed structure, while with BeautifulSoup you build the wheel yourself. That makes it more work than scrapy, but also more flexible.
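With scrapy the framework supplies the crawling scaffolding; with BeautifulSoup you write the fetch-and-query steps yourself. A minimal sketch of the BeautifulSoup side (the HTML snippet below is invented purely for illustration):

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup

html = '<div><a class="link" href="/t/1">hello</a></div>'  # toy input
soup = BeautifulSoup(html, 'lxml')   # build the parse tree with the lxml parser
a = soup.find('a', class_='link')    # first <a> tag carrying class "link"
print a.get_text(), a['href']        # -> hello /t/1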
The example below illustrates this by scraping thread listings from Baidu Tieba.
# -*- coding:utf-8 -*-
__author__ = 'fengzhankui'
import urllib2
from bs4 import BeautifulSoup


class Item(object):
    # one record per thread on the list page
    title = None        # thread title
    firstAuthor = None  # original poster
    firstTime = None    # creation time
    reNum = None        # reply count
    content = None      # abstract of the first post
    lastAuthor = None   # author of the latest reply
    lastTime = None     # time of the latest reply


class GetTiebaInfo(object):
    def __init__(self, url):
        self.url = url
        self.pageSum = 5  # number of list pages to crawl
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        # Tieba paginates with pn=0,50,100,...; rewrite the pn value at the
        # end of the base url to generate one url per page
        urls = []
        pns = [str(i * 50) for i in range(pageSum)]
        ul = self.url.split('=')
        for pn in pns:
            ul[-1] = pn
            urls.append('='.join(ul))
        return urls

    def spider(self, urls):
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if not htmlContent:  # request failed, skip this page
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            tagsli = soup.find_all('li', class_=['j_thread_list', 'clearfix'])[2:]
            for tag in tagsli:
                # pinned threads carry no abstract div; skip them
                if tag.find('div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}) is None:
                    continue
                item = Item()
                item.title = tag.find('a', attrs={'class': 'j_th_tit'}).get_text().strip()
                item.firstAuthor = tag.find('span', attrs={'class': 'frs-author-name-wrap'}).a.get_text().strip()
                item.firstTime = tag.find('span', attrs={'title': u'创建时间'}).get_text().strip()
                item.reNum = tag.find('span', attrs={'title': u'回复'}).get_text().strip()
                item.content = tag.find('div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}).get_text().strip()
                item.lastAuthor = tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'}).a.get_text().strip()
                item.lastTime = tag.find('span', attrs={'title': u'最后回复时间'}).get_text().strip()
                items.append(item)
        return items

    def pipelines(self, items):
        # dump the items as tab-separated fields, one thread per line
        with open('tieba.txt', 'a') as fp:
            for item in items:
                fp.write('title:' + item.title.encode('utf8') + '\t')
                fp.write('firstAuthor:' + item.firstAuthor.encode('utf8') + '\t')
                fp.write('firstTime:' + item.firstTime.encode('utf8') + '\t')
                fp.write('reNum:' + item.reNum.encode('utf8') + '\t')
                fp.write('content:' + item.content.encode('utf8') + '\t')
                fp.write('lastAuthor:' + item.lastAuthor.encode('utf8') + '\t')
                fp.write('lastTime:' + item.lastTime.encode('utf8') + '\t')
                fp.write('\n')

    def getResponseContent(self, url):
        try:
            response = urllib2.urlopen(url.encode('utf8'))
        except Exception as e:
            print 'fetch failed:', url, e
        else:
            return response.read()


if __name__ == '__main__':
    url = u'http://tieba.baidu.com/f?kw=战狼2&ie=utf-8&pn=50'
    GetTiebaInfo(url)
Notes on the code:
The example mimics scrapy's structure: an Item class defines the fields, spider() fetches the HTML for each url and extracts the data, and pipelines() writes the results out. Tieba list pages always begin with pinned (sticky) threads. Because BeautifulSoup matches class names with "in" semantics (a tag matches if it carries any of the given classes) rather than "and" semantics, the class match alone cannot be made precise enough to exclude them, which is why the tag loop contains the extra condition that filters out the pinned entries, as the sketch below demonstrates.
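To make the class-matching point concrete, here is a small sketch (the HTML is invented, loosely modeled on the Tieba list markup): passing a list to class_ matches any one of the listed classes, so the pinned thread is caught as well, and the loop has to fall back on a content check.

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup

html = ('<li class="j_thread_list thread_top clearfix">'
        '<a class="j_th_tit">pinned</a></li>'
        '<li class="j_thread_list clearfix">'
        '<a class="j_th_tit">normal</a>'
        '<div class="threadlist_abs">body text</div></li>')
soup = BeautifulSoup(html, 'lxml')

# a list as class_ means "in" semantics: any one matching class is enough,
# so the pinned entry is matched too
lis = soup.find_all('li', class_=['j_thread_list', 'clearfix'])
print len(lis)  # 2 -- pinned included

# the pinned thread carries the same classes plus extras, so no class match
# can exclude it; filter on content instead, as the scraper above does
for li in lis:
    if li.find('div', class_='threadlist_abs') is None:  # no abstract => pinned
        continue
    print li.find('a', class_='j_th_tit').get_text()  # only "normal" survives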