python实现爬取名人名言

技术路线:requests-bs4-re

  1. 第一步

首先打开名人名言的网站https://mingyan.supfree.net/search.asp

python怎么把名单提取成列表 python提取人名_python怎么把名单提取成列表

  2. 第二步

然后查看源代码,可以看到,名人名言都存储在table标签内,可以利用bs4库对其进行查找标签

python怎么把名单提取成列表 python提取人名_python_02


先用 soup1 = soup.find('table') 找到table标签,然后在table标签里再寻找a标签:stockInfo = soup1.find_all('a')。此时的stockInfo变量是bs4.element.ResultSet类型(其中每个元素都是bs4.element.Tag),所以需要先转换成str类型才可以用正则表达式re库进行精确查找:str1 = str(stockInfo)(这里涉及到将bs4.element.Tag转换成string,可以参考https://www.jianshu.com/p/d67a3858728c )

这里可以观察到下一页的url是https://mingyan.supfree.net/search.asp?page=2 ,可以用requests库的params参数设置page的值来修改url,再配合for循环就可以实现翻页功能,具体参数设置参考下图。这里我只爬取第一个页面,即用:

# Request the result pages one by one; range(1, 2) yields only page 1.
for i in range(1, 2):
    kv = {'page': i}  # sent by requests as the ?page=<i> query string
    r = requests.get('https://mingyan.supfree.net/search.asp', params=kv)

python怎么把名单提取成列表 python提取人名_json_03


3. 第三步

最后用正则表达式re库对其进行精确查找

# Quote text: inner HTML of each <a> whose href is honda.asp?id=<digits>.
contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?)</a>', str1)
# Author name: inner HTML of each <a> whose href is toyota.asp?id=<id>, where
# the id may only consist of characters in the range U+4E00-U+9FA5.
authors = re.findall(r'<a href="toyota\.asp\?id=[\u4e00-\u9fa5]+" target="_blank">(.*?)</a>', str1)

完整代码

import requests
import re
from bs4 import BeautifulSoup

# Use bs4 and re to pull the text we want out of the HTML.
for i in range(1, 2):  # range(1, 2) yields only page 1
    kv = {'page': i}  # sent by requests as the ?page=<i> query string
    r = requests.get('https://mingyan.supfree.net/search.asp', params=kv)
    r.raise_for_status()  # raise on a 4xx/5xx answer instead of parsing an error page
    r.encoding = r.apparent_encoding  # decode r.text with the encoding detected from the body
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    soup1 = soup.find('table')  # first <table> in the document
    stockInfo = soup1.find_all('a')  # every <a> inside that table
    str1 = str(stockInfo)  # serialise the tags so the regexes below can scan them
    # Quote text: inner HTML of links whose href is honda.asp?id=<digits>.
    contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?)</a>', str1)
    # Author name: inner HTML of links whose href is toyota.asp?id=<id>; the id
    # may only consist of characters in the range U+4E00-U+9FA5.
    authors = re.findall(r'<a href="toyota\.asp\?id=[\u4e00-\u9fa5]+" target="_blank">(.*?)</a>', str1)
    print(contents)
    print(authors)

运行效果如下

python怎么把名单提取成列表 python提取人名_json_04

方法二:直接用re库查找文本内容

import requests
import re
# from bs4 import BeautifulSoup


# Variant without bs4: run the regexes directly on the raw page text.
for i in range(1, 2):  # range(1, 2) yields only page 1
    kv = {'page': i}  # sent by requests as the ?page=<i> query string
    r = requests.get('https://mingyan.supfree.net/search.asp', params=kv)
    r.raise_for_status()  # raise on a 4xx/5xx answer instead of parsing an error page
    r.encoding = r.apparent_encoding  # decode r.text with the encoding detected from the body
    html = r.text
    # These patterns expect target="_blank" before href, the reverse of the
    # attribute order used by the patterns applied to str(<bs4 tags>) elsewhere.
    contents = re.findall(r'<a target="_blank" href="honda\.asp\?id=\d+">(.*?)</a>', html)
    authors = re.findall(r'<a target="_blank" href="toyota\.asp\?id=[\u4e00-\u9fa5]+">(.*?)</a>', html)

后续
之前的代码有点问题,就是没有识别出中文的标点符号 。 ; , : “ ” ( ) 、 ? 《 》,需要改进re正则表达式。匹配中文标点符号的正则表达式: [\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b];匹配中文汉字字符的正则表达式: [\u4e00-\u9fa5]。
即使用下面的字符集同时对中文汉字字符和标点符号进行匹配: [\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+

import requests
import re
from bs4 import BeautifulSoup

# Use bs4 and re to pull the text we want out of the HTML.
for i in range(1, 2):  # range(1, 2) yields only page 1
    kv = {'page': i}  # sent by requests as the ?page=<i> query string
    r = requests.get('https://mingyan.supfree.net/search.asp', params=kv)
    r.raise_for_status()  # raise on a 4xx/5xx answer instead of parsing an error page
    r.encoding = r.apparent_encoding  # decode r.text with the encoding detected from the body
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    soup1 = soup.find('table')  # first <table> in the document
    stockInfo = soup1.find_all('a')  # every <a> inside that table
    str1 = str(stockInfo)  # serialise the tags so the regexes below can scan them
    contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?|[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b].*?)</a>', str1)
    authors = re.findall(r'<a href="toyota\.asp\?id=[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+" target="_blank">([\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+)</a>', str1)
    # Pair quotes and authors with zip() instead of indexing authors[i]:
    # the author pattern is stricter than the quote pattern, so it can match
    # fewer links, and indexing would then raise IndexError. zip() also avoids
    # reusing the page-loop variable `i` as the inner index.
    lists = [
        {'名言content': content, '名人author': author}
        for content, author in zip(contents, authors)
    ]
    print(lists)
    print(len(lists))

效果如下

python怎么把名单提取成列表 python提取人名_python_05

又发现了一点小bug,修改了代码的正则表达式部分;同时把代码拆分成几个函数进行模块化封装,并增加了保存数据的功能,将数据保存为json文件格式。

import requests
import re
from bs4 import BeautifulSoup
import json
import time

# Module-level accumulator: re_html() appends every scraped record to it.
lists = []


# Use bs4 and re to pull the text we want out of the HTML.
def gethtml(url, kv, timeout=10):
    """Download *url* with query parameters *kv* and return the page text.

    Args:
        url: Address to request.
        kv: Mapping that requests encodes into the query string.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 10, so existing two-argument calls keep working.

    Returns:
        The response body decoded with the encoding detected from its content.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within *timeout*.
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would freeze the whole crawl.
    r = requests.get(url, params=kv, timeout=timeout)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

def re_html(html):
    """Parse one results page and append its quote/author pairs to ``lists``.

    Args:
        html: HTML text of one search-results page.

    Returns:
        The module-level ``lists`` object itself, so records accumulate
        across calls instead of being returned per page.

    Raises:
        AttributeError: if the page contains no ``<table>`` (``soup.find``
            then returns ``None``).
        IndexError: if fewer authors than quotes were matched.
    """
    soup = BeautifulSoup(html, 'html.parser')
    soup1 = soup.find('table')  # first <table> in the document
    stockInfo = soup1.find_all('a')  # every <a> inside that table
    str1 = str(stockInfo)  # serialise the tags so the regexes below can scan them
    # NOTE(review): inside a raw string a backslash followed by a newline is
    # NOT a line continuation. Both characters stay in the pattern, and so do
    # the leading spaces of the next source line. Here they fall inside the
    # [...] class, so the class also accepts '+', newline and space.
    contents = re.findall(r'<a href="honda\.asp\?id=\d+" target="_blank">(.*?|[+\
    \u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+.*?)</a>',str1)
    # NOTE(review): the same line breaks cut two escapes in half. A backslash
    # + newline followed by "u300b" / "u201c" is read as literal characters,
    # so U+300B is missing from the href class and U+201C from the
    # captured-text class.
    authors = re.findall(r'<a href="toyota\.asp\?id=[+\
    \u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\+\
    u300b\u4e00-\u9fa5_a-zA-Z0-9]+.*?" target="_blank">([\u3002\uff1b\uff0c\uff1a\+\
    u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5_a-zA-Z0-9]+.*?)</a>',str1)
    # Debug output: both match lists and their sizes, so a mismatch is visible.
    print(contents)
    print(authors)
    print(len(contents))
    print(len(authors))
    for i in range(len(contents)):
        # Pairs by position; authors[i] raises IndexError when authors is shorter.
        lists.append({'名言content':contents[i], '名人author':authors[i]})
    return lists

def save_json(lists):
    """Write *lists* as JSON to ``a.json`` in the current working directory.

    The file is truncated on every call. Non-ASCII characters are written
    verbatim (UTF-8) rather than as ``\\uXXXX`` escapes, and ``indent=0`` puts
    every element on its own line without leading spaces.
    """
    with open('a.json', mode='w+', encoding='utf-8') as out_file:
        json.dump(obj=lists, fp=out_file, indent=0, ensure_ascii=False)

def main():
    """Ask for a page range, scrape every page in it and save the records.

    The range is read from standard input as two integers separated by a
    comma (``1,10`` means pages 1 through 10 inclusive). The elapsed time in
    seconds is printed at the end.

    Raises:
        ValueError: if the input is not two comma-separated integers.
    """
    import ast  # function-scope import: only the prompt parsing needs it

    raw = input('请输入想要爬取的页码范围(例如输入1,10表示1到10页):')
    # SECURITY: this used to be eval(input(...)), which executes any Python
    # typed at the prompt. ast.literal_eval accepts the same "1,10" literal
    # but cannot run code. A full-width comma is normalised first.
    try:
        a, b = ast.literal_eval(raw.strip().replace('\uff0c', ','))
    except (ValueError, SyntaxError, TypeError) as err:
        raise ValueError(
            'expected two page numbers such as 1,10, got {!r}'.format(raw)
        ) from err
    if not (isinstance(a, int) and isinstance(b, int)):
        raise ValueError(
            'page numbers must be integers, got {!r}'.format(raw)
        )

    start = time.perf_counter()
    url = 'https://mingyan.supfree.net/search.asp'  # loop-invariant, hoisted
    for page in range(a, b + 1):
        html = gethtml(url, {'page': page})
        lists = re_html(html)
        # re_html returns the accumulated records, so rewriting the file after
        # each page leaves earlier pages on disk if a later page fails.
        save_json(lists)
    end = time.perf_counter()
    print(end - start)


# Guarded so importing this module does not block on input().
if __name__ == '__main__':
    main()

又产生了一点小BUG:不能识别【】和-。在正则表达式的字符集中添加【】(\u3010\u3011)和-(\-)对应的代码,同时用time.perf_counter()计时,计算爬取所用的时间。

import requests
import re
from bs4 import BeautifulSoup
import json
import time

# Module-level accumulator: re_html() appends every scraped record to it.
lists = []


# Use bs4 and re to pull the text we want out of the HTML.
def gethtml(url, kv, timeout=10):
    """Download *url* with query parameters *kv* and return the page text.

    Args:
        url: Address to request.
        kv: Mapping that requests encodes into the query string.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 10, so existing two-argument calls keep working.

    Returns:
        The response body decoded with the encoding detected from its content.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within *timeout*.
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would freeze the whole crawl.
    r = requests.get(url, params=kv, timeout=timeout)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

# href shapes of the two kinds of link in the results table. They are matched
# against the parsed attribute value, so attribute order, quoting, and the
# characters in the link text no longer matter.
_QUOTE_HREF = re.compile(r'honda\.asp\?id=\d+')
_AUTHOR_HREF = re.compile(r'toyota\.asp\?id=.+', re.S)


def re_html(html):
    """Parse one results page and append its quote/author pairs to ``lists``.

    Links inside the first ``<table>`` are classified by their ``href``:
    ``honda.asp?id=<digits>`` is a quote and ``toyota.asp?id=<anything>`` is
    an author. The link text is taken from the parsed tag, so no character
    whitelist is needed: punctuation, brackets and Latin letters all pass.

    The previous implementation ran whitelist regexes over ``str(tags)``.
    Its patterns were split across lines with backslash-newline inside raw
    strings, which is not a continuation there. That put newline and spaces
    into the character classes and broke the ``\\u300b`` and ``\\u201c``
    escapes. It also indexed ``authors[i]`` and raised IndexError whenever
    fewer authors than quotes matched.

    Args:
        html: HTML text of one search-results page.

    Returns:
        The module-level ``lists`` object with this page's records appended,
        so records accumulate across calls. Each record is a dict with the
        keys ``'名言content'`` and ``'名人author'``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        # soup.find returns None when the page has no table; add nothing
        # rather than crash with AttributeError.
        print('warning: no <table> found in page, nothing extracted')
        return lists

    contents = []
    authors = []
    for link in table.find_all('a'):
        href = link.get('href') or ''
        if _QUOTE_HREF.fullmatch(href):
            contents.append(link.get_text())
        elif _AUTHOR_HREF.fullmatch(href):
            authors.append(link.get_text())

    # Debug output kept from the original: both lists and their sizes.
    print(contents)
    print(authors)
    print(len(contents))
    print(len(authors))
    if len(contents) != len(authors):
        # Pairing is positional, so a mismatch means some pairs may be wrong.
        print('warning: quote/author counts differ, extra items are dropped')

    # zip() stops at the shorter list instead of raising IndexError.
    lists.extend(
        {'名言content': content, '名人author': author}
        for content, author in zip(contents, authors)
    )
    return lists

def save_json(lists, filename='a.json'):
    """Write *lists* as JSON to *filename*, replacing any existing file.

    Args:
        lists: JSON-serialisable object, here the list of scraped records.
        filename: Destination path. Defaults to ``'a.json'`` in the current
            working directory, which is what single-argument callers got
            before this parameter existed.

    Non-ASCII characters are written verbatim (UTF-8) rather than as
    ``\\uXXXX`` escapes, and ``indent=0`` puts every element on its own line
    without leading spaces.
    """
    # 'w' is sufficient: the file is only written, never read back.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(lists, f, ensure_ascii=False, indent=0)

def main():
    """Ask for a page range, scrape every page in it and save the records.

    The range is read from standard input as two integers separated by a
    comma (``1,10`` means pages 1 through 10 inclusive). Each page number is
    printed as it is fetched, and the elapsed time in seconds is printed at
    the end.

    Raises:
        ValueError: if the input is not two comma-separated integers.
    """
    import ast  # function-scope import: only the prompt parsing needs it

    raw = input('请输入想要爬取的页码范围(例如输入1,10表示1到10页):')
    # SECURITY: this used to be eval(input(...)), which executes any Python
    # typed at the prompt. ast.literal_eval accepts the same "1,10" literal
    # but cannot run code. A full-width comma is normalised first.
    try:
        a, b = ast.literal_eval(raw.strip().replace('\uff0c', ','))
    except (ValueError, SyntaxError, TypeError) as err:
        raise ValueError(
            'expected two page numbers such as 1,10, got {!r}'.format(raw)
        ) from err
    if not (isinstance(a, int) and isinstance(b, int)):
        raise ValueError(
            'page numbers must be integers, got {!r}'.format(raw)
        )

    start = time.perf_counter()
    url = 'https://mingyan.supfree.net/search.asp'  # loop-invariant, hoisted
    for page in range(a, b + 1):
        print(page)  # progress indicator
        html = gethtml(url, {'page': page})
        lists = re_html(html)
        # re_html returns the accumulated records, so rewriting the file after
        # each page leaves earlier pages on disk if a later page fails.
        save_json(lists)
    end = time.perf_counter()
    print(end - start)


# Guarded so importing this module does not block on input().
if __name__ == '__main__':
    main()

总结
中文字符的爬取感觉总是有点小问题,但是不妨碍我们对原理的了解。