最近看了《哪吒之魔童降世》,有搞笑,有温情,有剧情,有理念,强烈推荐,给国漫点赞。
最近又在学习 Python 爬虫,于是试着爬取了这部电影的豆瓣影评。
涉及:
1. requests请求网页
2. xpath提取数据
3. 爬取遇到“下一页”操作时的处理
4. openpyxl将数据写入excel
5. matplotlib.pyplot画柱状图和圆形分布图
源码:
import os
import traceback
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import matplotlib.pyplot as plt
import openpyxl
import requests
from lxml import etree
class DB_Film_review_Spider(object):
    """Scrape Douban short reviews of a film, chart the ratings, export to Excel.

    Typical use: ``get_url()`` to crawl every reachable page, then
    ``process_data()`` to draw the charts and ``write_excel()`` to save the
    collected rows.

    Attributes:
        page: ``start`` offset of the page that will be requested next.
        total_author / total_star_num / total_comment_time /
        total_recommendation_level / total_text: values accumulated over all
            crawled pages (one list per extracted field).
        level: the five recommendation labels, best to worst.
        num: number of collected reviews per label, parallel to ``level``.
    """

    PAGE_SIZE = 20  # the ``start`` query parameter grows by 20 per page
    REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the crawl
    EXCEL_MAX_COLUMN_WIDTH = 255  # Excel's documented upper limit for a column width

    def __init__(self, url):
        """Remember the first page URL and initialise empty accumulators.

        Args:
            url: URL of the first comments page; later pages are derived from
                it by replacing its ``start`` query parameter.
        """
        self.__url = url
        self.page = 0
        self.total_author = []
        self.total_star_num = []
        self.total_comment_time = []
        self.total_recommendation_level = []
        self.total_text = []
        self.level = ['力荐', '推荐', '还行', '较差', '很差']
        self.num = [0, 0, 0, 0, 0]

    @staticmethod
    def _page_url(url, start):
        """Return ``url`` with its ``start`` query parameter set to ``start``."""
        parts = urlsplit(url)
        query = [(key, value)
                 for key, value in parse_qsl(parts.query, keep_blank_values=True)
                 if key != "start"]
        query.insert(0, ("start", str(start)))
        return urlunsplit(parts._replace(query=urlencode(query)))

    @staticmethod
    def _item_at(values, index):
        """Return ``values[index]``, or ``None`` when the list is too short."""
        return values[index] if index < len(values) else None

    def _count_levels(self):
        """Count the collected recommendation labels, ordered like ``self.level``."""
        position = {name: i for i, name in enumerate(self.level)}
        counts = [0] * len(self.level)
        for label in self.total_recommendation_level:
            i = position.get(label)
            if i is not None:  # labels outside the five known levels are ignored
                counts[i] += 1
        return counts

    # Extract the data of the current page
    def get_data(self):
        """Download the current page and store its fields on the instance.

        Sets ``author``, ``star_num``, ``comment_time``,
        ``recommendation_level`` and ``text`` to the xpath results for this
        page. Each list is extracted independently, so their lengths are not
        guaranteed to match.
        """
        responce = requests.get(self.__url, timeout=self.REQUEST_TIMEOUT)
        xml = etree.HTML(responce.text)
        self.author = xml.xpath('//div[@class="mod-bd"]//div//div//a/@title')  # author
        self.star_num = xml.xpath('//div[@class="mod-bd"]//h3//span[@class="votes"]/text()')  # "useful" votes
        self.comment_time = xml.xpath('//div[@class="mod-bd"]//h3//span[@class="comment-info"]//span[3]/@title')  # comment time
        self.recommendation_level = xml.xpath('//div[@class="mod-bd"]//h3//span[@class="comment-info"]//span[2]/@title')  # recommendation level
        self.text = xml.xpath('//div[@class="mod-bd"]//p//span/text()')  # review text

    # Write to Excel
    def write_excel(self):
        """Write the accumulated reviews to ``哪吒豆瓣影评.xlsx``.

        A new workbook gets column widths and a header row; an existing one is
        loaded and its data rows (from row 2 on) are overwritten. A field
        that is missing for a row is left blank. Any error is reported as a
        traceback instead of being raised.
        """
        try:
            file_path = "哪吒豆瓣影评.xlsx"
            column_headers = ["编号", "作者", "推荐程度", "评论时间", "点赞数", "详细影评"]
            if os.path.exists(file_path):
                wb = openpyxl.load_workbook(file_path)
                ws = wb["Sheet"]
            else:
                wb = openpyxl.Workbook()
                ws = wb.active
                ws.column_dimensions["A"].width = 5  # column widths
                ws.column_dimensions["B"].width = 30
                ws.column_dimensions["C"].width = 10
                ws.column_dimensions["D"].width = 23
                # Was 700, which is above the width Excel accepts.
                ws.column_dimensions["F"].width = self.EXCEL_MAX_COLUMN_WIDTH
                for col, header in enumerate(column_headers, start=1):  # header row
                    ws.cell(row=1, column=col, value=header)
            # Columns 2..6, in header order.
            columns = (
                self.total_author,
                self.total_recommendation_level,
                self.total_comment_time,
                self.total_star_num,
                self.total_text,
            )
            for i in range(len(self.total_author)):
                row = i + 2
                ws.cell(row=row, column=1, value=i + 1)  # running number
                for col, values in enumerate(columns, start=2):
                    # The field lists can be shorter than the author list;
                    # skip the cell rather than abort the export with an
                    # IndexError before the workbook is saved.
                    value = self._item_at(values, i)
                    if value is not None:
                        ws.cell(row=row, column=col, value=value)
            wb.save(file_path)
        except Exception:
            traceback.print_exc()

    # Walk through the pages by deriving each next-page URL
    def get_url(self):
        """Crawl page after page until a page brings no new author.

        Each page's fields are appended to the ``total_*`` lists, and the next
        URL is the starting URL with ``start`` advanced by ``PAGE_SIZE``.
        The crawl stops when every author on a page has already been
        collected (an empty page counts), or when an error occurs; errors are
        reported as a traceback instead of being raised.
        """
        try:
            base_url = self.__url
            while True:
                self.get_data()
                # No author we have not seen yet -> nothing new, stop crawling.
                if set(self.author).issubset(set(self.total_author)):
                    break
                self.total_author = self.total_author + self.author
                self.total_star_num = self.total_star_num + self.star_num
                self.total_comment_time = self.total_comment_time + self.comment_time
                self.total_recommendation_level = (
                    self.total_recommendation_level + self.recommendation_level
                )
                self.total_text = self.total_text + self.text
                self.page = self.page + self.PAGE_SIZE
                self.__url = self._page_url(base_url, self.page)
        except Exception:
            traceback.print_exc()

    # Aggregate the data and draw the charts
    def process_data(self):
        """Count reviews per recommendation level and show a bar and a pie chart.

        ``self.num`` is recomputed from scratch on every call, so calling this
        method repeatedly does not double-count.
        """
        self.num = self._count_levels()
        if not any(self.num):
            # A pie chart cannot be computed from all-zero shares.
            print("没有可用的推荐等级数据,跳过绘图。")
            return

        # Bar chart
        plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font so Chinese labels render
        plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
        plt.bar(range(len(self.level)), self.num, label='推荐等级', tick_label=self.level, fc='r')
        plt.title("推荐情况")
        plt.show()

        # Pie chart
        plt.figure(figsize=(6, 9))  # figure size: width, height
        colors = ['red', 'yellowgreen', 'lightskyblue', 'green', 'gray']
        # Pull the first wedge slightly out of the pie; the value is its offset.
        explode = (0.05, 0, 0, 0, 0)
        # labeldistance: label position as a multiple of the radius
        # autopct:       format of the percentage text; %3.1f%% is one decimal
        #                place with a minimum width of 3, followed by "%"
        # startangle:    angle at which the first wedge starts
        # pctdistance:   distance of the percentage text from the centre
        plt.pie(self.num, explode=explode, labels=self.level, colors=colors,
                labeldistance=1.1, autopct='%3.1f%%', shadow=False,
                startangle=90, pctdistance=0.6
                )
        plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
        plt.legend()
        plt.show()
if __name__ == '__main__':
    # Script entry point: crawl all reachable comment pages, draw the rating
    # charts, then export the collected reviews to Excel.
    first_page_url = (
        "https://movie.douban.com/subject/26794435/comments"
        "?start=0&limit=20&sort=new_score&status=P"
    )
    spider = DB_Film_review_Spider(first_page_url)
    spider.get_url()
    spider.process_data()
    spider.write_excel()
运行结果:
问题:
1. 代码需优化的部分还有很多
2. 仅爬取了10页(共220条)评论信息,后面的需要登录才能进行访问,后续进行处理