该篇文章为"行路难=_="原创
期末的Python考试要写一个爬取网站信息的程序,我就选取了b站番剧索引页面作为目标网页(因为感觉番剧主页的信息太杂了。)
原本打算魔改老师给的范例使用BeautifulSoup库来解析html获取数据的,
但是在运行的时候发现,好像获取不了数据?
原先使用的代码:
app.py (主程序)
import requests
from bs4 import BeautifulSoup
from writetext import TextStorage
from datetime import datetime
import os
class MySpider(object):
    """Scrape anime cover images and titles from a bilibili page (static-HTML attempt)."""

    # Shared default headers; Referer left blank, UA mimics desktop Chrome.
    header = {
        "Referer": "",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
    }

    def __init__(self, url, directory):
        """url: page to crawl; directory: where downloaded covers are stored."""
        self.s = requests.Session()
        self.url = url
        self.dir = directory
        if not os.path.exists(directory):
            # makedirs also creates missing parent directories; mkdir would fail.
            os.makedirs(directory)
        self.idx = 0  # sequence number appended to image file names

    def crawling(self):
        """Fetch the page, parse the bangumi list, save each cover and title."""
        rsp = self.s.get(self.url, headers=MySpider.header)
        soup = BeautifulSoup(rsp.text, "html.parser")
        tag = soup.find("ul", class_="bangumi-list")
        if tag is None:
            # The list is rendered client-side by JavaScript, so it may be
            # absent from the static HTML — report instead of raising
            # AttributeError on tag.find_all below.
            print("未找到番剧列表,页面可能是动态加载的。")
            return
        tags_li = tag.find_all("li")
        with TextStorage() as xs:
            for li in tags_li:
                image_url = "https:%s" % li.a.div.img['src']  # bangumi cover
                print(image_url)
                filename = self.save_image(image_url)
                content = li.find("a", class_="bangumi-title").string  # bangumi title
                url = self.dir + '/' + filename
                xs.write(content, url)

    def save_image(self, image_url):
        """Download image_url into self.dir; return the generated file name."""
        image = self.s.get(image_url, headers=MySpider.header)
        suffix = datetime.now().strftime('%Y%m%d_%H%M')
        name = "img_%s_%d.jpg" % (suffix, self.idx)
        self.idx += 1
        with open(self.dir + '/' + name, 'wb') as file:
            file.write(image.content)
        return name
if __name__ == "__main__":
    # Target page to crawl, and the directory that receives the cover images.
    spider = MySpider("https://www.bilibili.com/anime/index", "./爬取的图片")
    spider.crawling()
writetext.py (文本保存模块)
from datetime import datetime
import os
class TextStorage(object):
    """Context-managed text file storing (content, image_url) records.

    Each record occupies three lines: content, image_url, and a blank
    separator line, so read() can parse the records back.
    """

    def __init__(self):
        # Set to 1 by __enter__; guards cleanup behavior in __exit__.
        self.enter_flag = 0

    def __enter__(self):
        """Open (append mode) a timestamped summary file and return self."""
        self.enter_flag = 1
        suffix = datetime.now().strftime('%Y%m%d_%H%M')
        self.file_name = "./爬取的摘要文档_%s.txt" % (suffix)
        # "a+" both appends and permits reading back via read().
        self.textStorage = open(self.file_name, "a+", encoding='utf-8')
        return self

    def __exit__(self, Type, value, traceback):
        if self.enter_flag:
            self.save()
            self.textStorage.close()
        else:
            # NOTE(review): unreachable via the `with` protocol — __exit__ only
            # runs after __enter__ has set the flag; kept for compatibility.
            os.remove(self.file_name)

    def write(self, content, image_url):
        # BUG FIX: the original wrote "%s%s\n", fusing content and URL onto a
        # single line with no separator, which read() (one field per line plus
        # a discarded third line) could never parse back. Emit three lines.
        self.textStorage.write("%s\n%s\n\n" % (content, image_url))

    def save(self):
        """Flush buffered writes to disk."""
        self.textStorage.flush()

    def read(self):
        """Return all stored records as a list of (content, image_url) tuples."""
        ret = []
        old_pos = self.textStorage.tell()
        self.textStorage.seek(0)
        lines = self.textStorage.readlines()
        self.textStorage.seek(old_pos)  # restore position for further appends
        idx = 0
        content = ""
        image_url = ""
        for line in lines:
            if idx == 0:
                content = line
                idx += 1
            elif idx == 1:
                image_url = line
                idx += 1
                ret.append((content, image_url))
            else:
                # Third line of each record is the blank separator — skip it.
                idx = 0
        return ret

    def seek(self, whence, offset):
        # NOTE(review): parameter names are swapped relative to file.seek's
        # (offset, whence) order — the first argument is forwarded as the offset.
        self.textStorage.seek(whence, offset)
运行起这个代码,什么也没有显示就退出了,说明中间出错了。
在tags_li处设置断点进行调试,发现tags_li的长度为0,tag获取的数据显示为“加载中”
说明了b站番剧索引页面的数据无法通过静态获取。
所以现在通过F12查找页面的API接口。
步骤1:进入目标网页,F12打开开发人员工具
默认是在控制台(Console)或元素(Elements)页面
我们切换到网络(Network)标签页
步骤2:点击F5刷新页面,等待页面加载完成后,
点击这个网络标签页中没有任何一个按钮的空白处
同时按下“Ctrl”+“F”键,将在左边分出一个搜索框
在框里面输入api后点击回车,将显示许多个结果
步骤3:将开发工具页面拉大一点方便看,然后只点击搜索结果中红线划出的URL那行。
随便点一个,就能看到右边的区域显示了标头(Headers)标签页,
我们直接切换到预览(Preview)的标签页。
然后开始查看所有搜索结果里的URL那行的预览页面
(如果搜索结果有折叠起的也必须展开看里面有没有URL)
步骤4:预览页面中,如果显示是图片的,直接下一个。
如果是这样显示的,就点开这个data数组。
如果点开结果是类似这样的,就也不是
直到我们点开一个预览是这样的URL结果:
点开里面的list数组:
好,这下就找到了B站的番剧索引API了。
切换回标头(Headers)标签页,这个API的链接赫然在上。
复制这个API到Python,用我们的requests库来处理它。
于是,按照默认的追番人数排序从大到小的排序的番剧信息就可以爬下来了。
(由于接口信息就给了那么多,所以只能从这里看到追番人数、标题、话数等信息,无法看到番剧介绍,播放次数等内容。)
不过按照更新时间,评分,播放数量,开播时间排序的明显也可以通过这个接口实现,具体实现不进行描述了。
最终程序成品效果:
👇
👇
👇
最后,将完整代码放出:
app.py(主程序)
import requests
from bs4 import BeautifulSoup
from writetext import TextStorage
from datetime import datetime
import os
import json
class MySpider(object):
    """Scrape bangumi info from the bilibili index API and save cover images."""

    # Shared default headers; UA mimics desktop Chrome so the API accepts us.
    header = {
        "Referer": "",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
    }

    def __init__(self, url, directory):
        """url: API endpoint to query; directory: where covers are stored."""
        self.s = requests.Session()
        self.url = url
        self.dir = directory
        if not os.path.exists(directory):
            # makedirs also creates missing parent directories.
            os.makedirs(directory)
        self.idx = 0  # sequence number appended to image file names

    def crawling(self):
        """Query the API, then print/store each entry's info and cover image."""
        print("开始爬取……")
        # Use the shared session (the original called module-level requests.get)
        # so headers, cookies and connection pooling match save_image().
        rsp = self.s.get(self.url, headers=MySpider.header)
        if rsp.status_code != 200:
            # Surface the failure instead of exiting silently.
            print("请求失败,状态码:%d" % rsp.status_code)
            return
        animelist = rsp.json()['data']['list']
        with TextStorage() as xs:
            for li in animelist:
                anime_title = li['title']
                anime_order = li['order']       # follower-count text
                anime_badge = li['badge']       # badge label, e.g. 会员专享
                anime_index = li['index_show']  # episode-count text
                anime_link = li['link']
                image_url = li['cover']
                filename = self.save_image(image_url)
                content = anime_title+"\t"+anime_order+"\n"+anime_index+"\t"+anime_badge + \
                    "\n番剧链接:"+anime_link+"\n封面链接:"+image_url + \
                    "\n封面存放位置:"+self.dir + '/'+filename+"\n"
                print(content)
                xs.write(content)

    def save_image(self, image_url):
        """Download image_url into self.dir; return the generated file name."""
        image = self.s.get(image_url, headers=MySpider.header)
        suffix = datetime.now().strftime('%Y%m%d_%H%M')
        name = "img_%s_%d.jpg" % (suffix, self.idx)
        self.idx += 1
        with open(self.dir + '/' + name, 'wb') as file:
            file.write(image.content)
        return name
if __name__ == "__main__":
    print("将爬取追番人数从高到低的番剧信息。\n不输入或输入非整数默认为20")
    while True:
        inp = None
        try:
            inp = int(input("请输入爬取数量:"))
        except (ValueError, EOFError):
            # Non-integer or empty/EOF input falls through to the default.
            pass
        if isinstance(inp, int):
            if not 1 <= inp < 3142:  # API exposes fewer than 3142 entries
                print("超出范围了!")
                continue
            num = inp
            break
        if inp is None:
            num = 20  # default count when input is not a valid integer
            break
    # BUG FIX: the original URL contained "©right=-1" — the "&copy" prefix of
    # "&copyright" had been rendered as the © HTML entity, corrupting the
    # query string. Restored to "&copyright=-1".
    url = ("https://api.bilibili.com/pgc/season/index/result?season_version=-1"
           "&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1"
           "&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1"
           "&pagesize=%d&type=1") % num
    MySpider(url, "./爬取的图片").crawling()
writetext.py(文本保存模块)
from datetime import datetime
import os
class TextStorage(object):
    """Append-mode text log, created on context entry and closed on exit."""

    def __init__(self):
        # Stays 0 until __enter__ runs; checked during cleanup.
        self.enter_flag = 0

    def __enter__(self):
        """Open a timestamped summary file for appending and return self."""
        self.enter_flag = 1
        stamp = datetime.now().strftime('%Y%m%d_%H%M')
        self.file_name = "./爬取的摘要文档_%s.txt" % (stamp)
        self.textStorage = open(self.file_name, "a+", encoding='utf-8')
        return self

    def __exit__(self, Type, value, traceback):
        if not self.enter_flag:
            # Never entered: discard the (would-be) file and bail out.
            os.remove(self.file_name)
            return
        self.save()
        self.textStorage.close()

    def write(self, content):
        """Append one content string followed by a newline."""
        self.textStorage.write("%s\n" % (content))

    def save(self):
        """Push buffered data out to the file."""
        self.textStorage.flush()

    def read(self):
        """Scan the file from the start, pairing up lines 1 and 2 of every
        three-line group; the third line of each group is skipped. Returns
        the list of pairs and restores the original file position."""
        pairs = []
        saved_pos = self.textStorage.tell()
        self.textStorage.seek(0)
        all_lines = self.textStorage.readlines()
        self.textStorage.seek(saved_pos)
        state = 0
        first = ""
        second = ""
        for ln in all_lines:
            if state == 0:
                first = ln
                state += 1
            elif state == 1:
                second = ln
                state += 1
                pairs.append((first, second))
            else:
                state = 0
        return pairs

    def seek(self, whence, offset):
        # NOTE(review): names look swapped relative to file.seek(offset, whence);
        # the first argument is forwarded as the offset.
        self.textStorage.seek(whence, offset)