Scraping Famous Quotes with Python
Technical route: requests + bs4 + re
- Step 1
First, open the famous-quotes site https://mingyan.supfree.net/search.asp
- Step 2
Then view the page source. The quotes are all stored inside a table tag, which the bs4 library can locate for us:
soup1 = soup.find('table')
Having found the table tag, search inside it for the a tags: stockInfo = soup1.find_all('a')
At this point stockInfo is a bs4 ResultSet (a list of bs4.element.Tag objects), so it has to be converted to str before the re library can be used for precise matching: str1 = str(stockInfo)
(On converting bs4.element.Tag to string, see https://www.jianshu.com/p/d67a3858728c)
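A minimal sketch of that conversion on a toy snippet (the html string below is made up for illustration, not taken from the real site):

from bs4 import BeautifulSoup

# Toy html, just to show the types involved
demo = '<table><a href="honda.asp?id=1" target="_blank">some quote</a></table>'
soup = BeautifulSoup(demo, 'html.parser')
tags = soup.find('table').find_all('a')  # ResultSet of bs4.element.Tag
print(type(tags[0]))                     # <class 'bs4.element.Tag'>
print(str(tags))                         # a plain str that re can search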
Watching how the URL changes from page to page shows that the page number is passed as a query-string parameter, so we can set it through the requests library's params argument and implement paging with a for loop. Here I only scrape the first page:
for i in range(1, 2):
    kv = {'page': i}
    r = requests.get('https://mingyan.supfree.net/search.asp', params=kv)
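To confirm what address the params produce, requests exposes the final URL on the response object:

print(r.url)  # https://mingyan.supfree.net/search.asp?page=1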
- Step 3
Finally, use the re library to pick out the exact text:
contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?)</a>', str1)
authors = re.findall(r'<a href="toyota\.asp\?id=[\u4e00-\u9fa5]+" target="_blank">(.*?)</a>', str1)
Full code
import requests
import re
from bs4 import BeautifulSoup

# Use bs4 and re to pull the text we want out of the html
for i in range(1, 2):
    kv = {'page': i}
    r = requests.get('https://mingyan.supfree.net/search.asp', params=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    soup1 = soup.find('table')
    stockInfo = soup1.find_all('a')
    str1 = str(stockInfo)
    contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?)</a>', str1)
    authors = re.findall(r'<a href="toyota\.asp\?id=[\u4e00-\u9fa5]+" target="_blank">(.*?)</a>', str1)
    print(contents)
    print(authors)
Running this prints the list of quotes followed by the list of authors.
Method 2: extract the text directly with the re library alone
import requests
import re
# from bs4 import BeautifulSoup

for i in range(1, 2):
    kv = {'page': i}
    r = requests.get('https://mingyan.supfree.net/search.asp', params=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    html = r.text
    contents = re.findall(r'<a target="_blank" href="honda\.asp\?id=\d+">(.*?)</a>', html)
    authors = re.findall(r'<a target="_blank" href="toyota\.asp\?id=[\u4e00-\u9fa5]+">(.*?)</a>', html)
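Note that the attribute order in these patterns is target before href, the reverse of method 1. A regex only matches the literal character sequence of the string it is applied to: method 1 searched str(stockInfo), i.e. the tags as bs4 re-serializes them, while this method searches the raw html as the server sent it, and the two apparently disagree on attribute order, so the patterns cannot simply be copied between the methods.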
Follow-up
The earlier code had a flaw: it failed to recognize the Chinese punctuation marks 。 ; , : " " ( ) 、 ? 《 》, so the regular expressions need improving. A character class matching Chinese punctuation: [\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
A character class matching Chinese (CJK) characters: [\u4e00-\u9fa5]
Combining the two matches both Chinese characters and punctuation, as checked in the sketch below: [\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+
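A quick check of the combined class (the sentence below is just a test string, not scraped data):

import re

# One character class covering Chinese punctuation and CJK ideographs
cn = r'[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
print(re.findall(cn, '学而不思则罔,思而不学则殆。——孔子'))
# ['学而不思则罔,思而不学则殆。', '孔子']  (the dash is not in the class, so it splits)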
import requests
import re
from bs4 import BeautifulSoup

# Use bs4 and re to pull the text we want out of the html
for i in range(1, 2):
    kv = {'page': i}
    r = requests.get('https://mingyan.supfree.net/search.asp', params=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    soup1 = soup.find('table')
    stockInfo = soup1.find_all('a')
    str1 = str(stockInfo)
    contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?|[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b].*?)</a>', str1)
    authors = re.findall(r'<a href="toyota\.asp\?id=[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+" target="_blank">([\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+)</a>', str1)
    lists = []
    for j in range(len(contents)):  # j, so the page counter i is not shadowed
        lists.append({'名言content': contents[j], '名人author': authors[j]})
    print(lists)
    print(len(lists))
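One caveat with this pairing: it indexes contents and authors in lockstep and assumes both findall calls returned equally long lists; if a quote matches one pattern but its author escapes the other, the pairs misalign or an IndexError is raised. A zip-based variant (my sketch, not the original code) at least stops cleanly at the shorter list:

# Pair quote and author positionally; zip stops at the shorter list
lists = [{'名言content': c, '名人author': a} for c, a in zip(contents, authors)]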
Running this prints the combined list of {quote, author} entries and its length.
Another small bug turned up, so I revised the regular expressions, restructured the code into functions, and added the ability to save the data to a file in JSON format.
import requests
import re
from bs4 import BeautifulSoup
import json
import time

lists = []

# Use bs4 and re to pull the text we want out of the html
def gethtml(url, kv):
    r = requests.get(url, params=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

def re_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    soup1 = soup.find('table')
    stockInfo = soup1.find_all('a')
    str1 = str(stockInfo)
    contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?|[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+.*?)</a>', str1)
    authors = re.findall(r'<a href="toyota\.asp\?id=[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5_a-zA-Z0-9]+.*?" target="_blank">([\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5_a-zA-Z0-9]+.*?)</a>', str1)
    print(contents)
    print(authors)
    print(len(contents))
    print(len(authors))
    for j in range(len(contents)):
        lists.append({'名言content': contents[j], '名人author': authors[j]})
    return lists

def save_json(lists):
    with open('a.json', 'w+', encoding='utf-8') as f:
        json.dump(lists, f, ensure_ascii=False, indent=0)

def main():
    # note: eval trusts the input; '1,10' evaluates to the tuple (1, 10)
    a, b = eval(input('Enter the page range to scrape (e.g. 1,10 means pages 1 to 10): '))
    start = time.perf_counter()
    for i in range(a, b + 1):
        kv = {'page': i}
        url = 'https://mingyan.supfree.net/search.asp'
        html = gethtml(url, kv)
        lists = re_html(html)
        save_json(lists)
    end = time.perf_counter()
    print(end - start)

main()
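A note on save_json: it reopens a.json in 'w+' mode on every pass through the loop, truncating and rewriting the file each time. Because lists is a module-level list that keeps accumulating across pages, the final rewrite still contains everything, but calling save_json once after the loop would achieve the same result with less I/O.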
Yet another small bug appeared: the patterns could not recognize 【 】 and -. The corresponding pieces are added to the regular expressions below, along with the timer that measures how long the scrape takes.
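Why the hyphen needs care: inside a character class an unescaped - between two characters denotes a range, so it has to be escaped as \- or placed at the edge of the class. A minimal check (test string made up):

import re

# 【 is \u3010 and 】 is \u3011; '\-' is a literal hyphen, not a range
print(re.findall(r'[\u3010\u3011\-]', '【唐】李白-静夜思'))  # ['【', '】', '-']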
import requests
import re
from bs4 import BeautifulSoup
import json
import time

lists = []

# Use bs4 and re to pull the text we want out of the html
def gethtml(url, kv):
    r = requests.get(url, params=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

def re_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    soup1 = soup.find('table')
    stockInfo = soup1.find_all('a')
    str1 = str(stockInfo)
    contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?|[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+.*?)</a>', str1)
    authors = re.findall(r'<a href="toyota\.asp\?id=[\u3010\u3011\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5\-_a-zA-Z0-9]+.*?" target="_blank">([\u3010\u3011\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5\-_a-zA-Z0-9]+.*?)</a>', str1)
    print(contents)
    print(authors)
    print(len(contents))
    print(len(authors))
    for j in range(len(contents)):
        lists.append({'名言content': contents[j], '名人author': authors[j]})
    return lists

def save_json(lists):
    with open('a.json', 'w+', encoding='utf-8') as f:
        json.dump(lists, f, ensure_ascii=False, indent=0)

def main():
    # note: eval trusts the input; '1,10' evaluates to the tuple (1, 10)
    a, b = eval(input('Enter the page range to scrape (e.g. 1,10 means pages 1 to 10): '))
    start = time.perf_counter()
    for i in range(a, b + 1):
        kv = {'page': i}
        print(i)  # progress indicator: current page number
        url = 'https://mingyan.supfree.net/search.asp'
        html = gethtml(url, kv)
        lists = re_html(html)
        save_json(lists)
    end = time.perf_counter()
    print(end - start)

main()
Summary
Scraping Chinese text always seems to hit small snags, but that does not stop us from understanding the principles at work.