Extracting data from HTML
- Regular expression basics
- Using the re module
  - findall
  - finditer
  - search
  - match
  - compile
- Scraping specified content from [豆瓣TOP250](https://movie.douban.com/top250?start=0&filter=)
- Scraping specified content from [电影天堂](https://dytt89.com/)
- Bs4
  - Scraping specified content from [电影天堂](https://dytt89.com/)
  - Scraping specified content from [优美图](https://www.youmeitu.com)
- xpath
  - Scraping information from [猪八戒网](https://dalian.zbj.com/)
These websites are constantly being updated, so their page structure and HTML may have changed since these notes were written; the code below is therefore not guaranteed to run as-is.
Regular expression basics
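These notes only rely on a handful of regex features (`\d`, the `+` quantifier, the lazy `.*?`, and capture groups); a minimal sketch of them, using a made-up sample string:

```python
import re

s = "id=1024, name=python"

print(re.findall(r"\d", s))         # \d matches one digit                 -> ['1', '0', '2', '4']
print(re.findall(r"\d+", s))        # +  repeats the previous token        -> ['1024']
print(re.findall(r"name=.*", s))    # .  any character, * zero or more     -> ['name=python']
print(re.findall(r"id=(.*?),", s))  # .*? is a lazy match, () captures     -> ['1024']
```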
Using the re module
findall
```python
import re

# findall returns every non-overlapping match as a list.
# Without the "+" quantifier each digit comes back as a separate one-character string;
# with "+" each whole run of digits comes back as one string.
li = re.findall(r"\d+", "iidfigiegiei454545hgidghi865834658")
print(li)  # ['454545', '865834658']
```
finditer
```python
import re

# When scraping pages it is usually better to work with an iterator of Match objects
# than with a plain list.
# With "+" each match is a whole run of digits; without it each match is a single digit.
li = re.finditer(r"\d+", "iidfigiegiei454545hgidghi865834658")
for i in li:
    print(i.group())
```
search
```python
import re

# search stops at the first match and returns a Match object;
# call .group() on it to get the matched text.
s = re.search(r"\d+", "iidfigiegiei454545hgidghi865834658")
print(s.group())  # 454545
```
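Note that `search` returns `None` when nothing matches, so calling `.group()` unconditionally can raise an `AttributeError`; a small sketch of the usual guard (the input string is made up):

```python
import re

s = re.search(r"\d+", "no digits here")
if s:                     # search returned None here, so guard before calling .group()
    print(s.group())
else:
    print("no match")
```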
match
```python
import re

# match anchors at the start of the string; if the string did not start with a digit,
# match would return None and calling .group() would raise an error.
m = re.match(r"\d+", "454545hgidghi865834658")
print(m.group())  # 454545
```
compile
```python
import re

# Precompile (preload) a pattern so it can be reused
# obj = re.compile(r"\d+")

S = """
<div class='看到过打开工行卡'><span id='1231'>中国联通</span></div>
<div class='gfghkg'><span id='12310'>日光灯管</span></div>
<div class='好多个IE如何特'><span id='1310'>恶女</span></div>
<div class='西游记'><span id='10410'>额从</span></div>
<div class='公开答复广阔的'><span id='2310'>从从</span></div>
"""

# (?P<name>pattern) names a capture group so it can be read back with .group("name")
obj = re.compile(r"<div class='(?P<class>.*?)'><span id='(?P<id>\d+)'>(?P<nei>.*?)</span></div>", re.S)
# re.S lets '.' also match newline characters
resp = obj.finditer(S)
for i in resp:
    print(i.group("class"))
    print(i.group("id"))
    print(i.group("nei"))
```
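A small sketch of what `re.S` actually changes: without it `.` stops at a newline, so a pattern that spans lines only matches once `re.S` is passed (the sample string is made up):

```python
import re

s = "<div>\nhello\n</div>"

print(re.findall(r"<div>.*</div>", s))        # []   '.' will not cross the newline
print(re.findall(r"<div>.*</div>", s, re.S))  # ['<div>\nhello\n</div>']
```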
Scraping specified content from 豆瓣TOP250
```python
import requests
import re
import csv

for page in range(0, 250, 25):
    # Fetch the page source
    start = page
    url = f"https://movie.douban.com/top250?start={start}&filter="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    content = resp.text
    # Parse the page source and pull out the fields we need
    obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<title>.*?)</span>.*?<span class="inq">(?P<jiyu>.*?)</span>', re.S)
    res = obj.finditer(content)
    # Write the extracted fields to a ".csv" file.
    # The url changes on every iteration, so each loop fetches a different page; open the file
    # with "a+" (append). With "w" the file would only keep the rows from the last page fetched.
    f = open("data.csv", mode="a+", encoding="utf-8", newline="")
    csvwrite = csv.writer(f)
    for i in res:
        # print(i.group("title"))
        # print(i.group("jiyu"))
        # To write the match into the ".csv" file, turn the Match object into a dict first
        row = i.groupdict()
        row['title'] = row['title'].strip()
        row['jiyu'] = row['jiyu'].strip()
        csvwrite.writerow(row.values())
    resp.close()
    f.close()
```
Scraping specified content from 电影天堂
```python
import requests
import re
import csv

# Fetch the whole home page
url1 = "https://dytt89.com/"
resp1 = requests.get(url1)
resp1.encoding = 'gbk'
# content1 = resp1.text
resp1.close()

# re.S must not be omitted here
obj1 = re.compile(r'经典大片.*?<ul>(?P<content2>.*?)</ul>', re.S)
obj2 = re.compile(r"<li><a href='(?P<href1>.*?)' title.*?", re.S)
obj3 = re.compile(r'<br />◎片 名 (?P<name>.*?)<br />.*?<li><a href="(?P<href2>.*?)">', re.S)

result1 = obj1.finditer(resp1.text)  # the block under the "经典大片" column
complete_href_list = []
for it in result1:
    content2 = it.group('content2')
    result2 = obj2.finditer(content2)
    for i in result2:
        href1 = i.group('href1')
        href1 = href1.strip("/")
        complete_href = url1 + href1
        complete_href_list.append(complete_href)

# Fetch each movie's download link and write it to a ".csv" file.
# Note: use encoding='gbk', not encoding='gb2312', otherwise some characters come out garbled.
f = open("jfrj.csv", mode='a+', encoding='gbk', newline='')
csvwrite = csv.writer(f)
for url2 in complete_href_list:
    resp2 = requests.get(url2)
    resp2.encoding = 'gbk'
    resp2.close()
    resp3 = obj3.finditer(resp2.text)
    for ii in resp3:
        # print(ii.group('name'))
        # print(ii.group('href2'))
        row = ii.groupdict()
        csvwrite.writerow(row.values())
f.close()
```
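The scrape above works in three stages: obj1 narrows the home page down to the 经典大片 block, obj2 collects the relative link to each movie's detail page from that block, and obj3 pulls the title and the download link out of every detail page.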
Bs4
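Before the scraping example below, a minimal sketch of the BeautifulSoup calls it relies on (`find`, `find_all`, the `class_` keyword argument, `.text` and `.get`); the HTML snippet here is made up purely for illustration:

```python
from bs4 import BeautifulSoup

# Made-up HTML, just to illustrate the calls used in the example below
html = """
<div class="co_content222">
    <ul>
        <li><a href="/i/1.html">Movie A</a></li>
        <li><a href="/i/2.html">Movie B</a></li>
    </ul>
</div>
"""
page = BeautifulSoup(html, "html.parser")       # parse with the built-in HTML parser
div = page.find("div", class_="co_content222")  # first tag with this class (class_ because class is a Python keyword)
for li in div.find_all("li"):                   # every <li> inside that div
    a = li.find("a")
    print(a.text, a.get("href"))                # the tag's text and its href attribute
```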
Scraping specified content from 电影天堂
```python
import requests
from bs4 import BeautifulSoup

url = "https://dytt89.com/"
resp = requests.get(url)
resp.encoding = "gbk"
content = resp.text
page = BeautifulSoup(content, "html.parser")  # specify the HTML parser
resp.close()
result = page.find_all("div", class_="co_content222")  # grab the target blocks; class_ maps to the div's class attribute
for li in result:
    lis = li.find_all("li")  # grab every <li> inside the block
    print(lis[0].text)  # print the text of the first <li>; iterate over lis to get them all
```
Scraping specified content from 优美图
```python
import requests
from bs4 import BeautifulSoup
import time
import random
import os

n = input()  # page number of the category listing to fetch
kindlist = ['weimeitupian','gaoxiaotupian','beijingtupian','jingwutupian','dongmantupian','yinghuatupian','wenzitupian','meinv']  # names of the site's categories
kind = kindlist[random.randint(0, 7)]  # pick a category at random
m = len(kind) + 2  # length of "/{kind}/"; longer hrefs point at individual picture pages
print(kind)
url1 = f"https://www.youmeitu.com/{kind}/list_{n}.html"
url2 = "https://www.youmeitu.com"
resp1 = requests.get(url1)  # fetch the listing page source
resp1.close()
page = BeautifulSoup(resp1.text, "html.parser")
# Get the first div in the page whose class is TypeList
result = page.find("div", class_="TypeList")
# Get the <a> and <img> tags inside that div
ass = result.find_all('a')
imgs = result.find_all("img")
url3s = []
url4s = []
# Build the links to each picture's detail page (collected here, not used further below)
for a in ass:
    href = a.get('href')  # .get() reads an attribute straight off the tag
    # print(len(href))
    if len(href) > m:
        url3 = url2 + href
        url3s.append(url3)
# Build the direct links used to download the images
for b in imgs:
    src = b.get('src')
    url4 = url2 + src
    url4s.append(url4)
# Download the images into the img folder
os.makedirs("img", exist_ok=True)
for i, url5 in enumerate(url4s):
    resp2 = requests.get(url5)
    with open(f"img/{i}", mode="wb") as f:
        f.write(resp2.content)  # write the raw image bytes out as a file
    print(f"{i} over")
    time.sleep(1)
    resp2.close()
```
xpath
XPath is a language for locating content inside XML documents.
HTML can be treated as a subset of XML, which is why XPath can also be used on HTML pages.
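A minimal sketch of the XPath syntax used below, absolute paths, `./` relative paths, `[n]` indexing and `text()`, run against a made-up HTML snippet:

```python
from lxml import etree

# Made-up HTML fragment; etree.HTML wraps it in <html><body> automatically
html = etree.HTML("""
<div>
    <ul>
        <li><a href="/a">first</a></li>
        <li><a href="/b">second</a></li>
    </ul>
</div>
""")

print(html.xpath("//li/a/text()"))        # // searches the whole document  -> ['first', 'second']
print(html.xpath("//li[2]/a/@href"))      # [n] indexes siblings, 1-based   -> ['/b']
lis = html.xpath("/html/body/div/ul/li")  # absolute path from the root
for li in lis:
    print(li.xpath("./a/text()"))         # ./ is relative to the current node
```

Scraping information from 猪八戒网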
```python
import requests
from lxml import etree

url = "https://dalian.zbj.com/search/f/?kw=html"
resp = requests.get(url)
resp.close()
# Parse the page source
html = etree.HTML(resp.text)
# Get the div for each vendor
divs = html.xpath("/html/body/div[6]/div/div/div[2]/div[5]/div[1]/div")
for div in divs:  # pull the fields we want from each vendor's div
    price = div.xpath("./div/div/a[2]/div[2]/div[1]/span[1]/text()")
    num = div.xpath("./div/div/a[2]/div[2]/div[1]/span[2]/text()")
    print(price)
    print(num)
```
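Keep in mind that xpath() always returns a list, so price and num above print as one-element lists; take price[0] to get the bare string. Absolute paths with positional indexes such as div[6] also break easily whenever the site's layout changes, which is exactly the caveat noted at the top of these notes.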