Python Web Scraping
1. Scraping Basics
Install the required modules first (pip install requests lxml pandas).
1.1 Scraping Jingkelong (jkl.com.cn) store information
import requests
from lxml import etree
import pandas as pd
url = "http://www.jkl.com.cn/shop.aspx"  # target page
UA = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}  # spoofed User-Agent
# 1. Get every district
data = requests.get(url=url, headers=UA).text  # response body
data2 = etree.HTML(data)  # parse the HTML into an etree object
regin = data2.xpath('//div[@class="infoLis"]//@href')  # the href attributes under the infoLis container are the district links
# 2. Get the shops in each district
for i in regin:
    url2 = 'http://www.jkl.com.cn/' + i
    data3 = requests.get(url=url2, headers=UA).text  # response body
    data4 = etree.HTML(data3)  # parse
    shop = data4.xpath('//span[@class="con01"]/text()')     # shop names
    address = data4.xpath('//span[@class="con02"]/text()')  # addresses
    number = data4.xpath('//span[@class="con03"]/text()')   # phone numbers
    time = data4.xpath('//span[@class="con04"]/text()')     # opening hours
    # Append this district's rows to a CSV file
    newdata = pd.DataFrame({'店名': shop, '地址': address, '电话号码': number, '营业时间': time})
    newdata.to_csv("e:/pc/01.csv", index=False, header=0, mode='a', encoding='ANSI')  # mode='a' appends; 'ANSI' uses the local Windows codepage
1.2 Batch-downloading images
import requests
from lxml import etree
import os
if not os.path.exists('e:/pc/image'):
    os.makedirs('e:/pc/image/')
url = 'http://www.jkl.com.cn/phoLis.aspx'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text
data2 = etree.HTML(data)
img = data2.xpath('//div[@class="proLis"]//@src')  # image src attributes
for ii in img:
    url2 = 'http://www.jkl.com.cn' + ii
    data3 = requests.get(url=url2, headers=UA).content  # binary content
    name = url2.split('/')[-1]  # file name taken from the URL
    u = 'e:/pc/image/' + name
    with open(u, 'wb') as t:  # write in binary mode
        t.write(data3)
    print(name, '下载成功!!!')
1.3 Batch-downloading files
import requests
from lxml import etree
import os
if not os.path.exists('e:/pc/file'):
    os.makedirs('e:/pc/file/')
url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10009'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text
data2 = etree.HTML(data)
file = data2.xpath('//div[@class="newsLis"]//li//@href')      # file links
fname = data2.xpath('//div[@class="newsLis"]//li//a/text()')  # file names
keys = []
values = []
# Pair each file name with its URL
for i in fname:
    keys.append(i.strip())
for ii in file:
    values.append('http://www.jkl.com.cn' + ii)
dic = dict(zip(keys, values))
for k, v in dic.items():  # iterate over the dict
    su = v.split('.')[-1]  # file extension
    data3 = requests.get(url=v, headers=UA).content
    u = 'e:/pc/file/' + k + '.' + su
    with open(u, 'wb') as t:  # save
        t.write(data3)
    print(k, '下载成功!!!')
1.4 Detecting the total page count
1.4.1 Getting the total page count from the "last page" link
import requests
from lxml import etree
import re
url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10009'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text  # headers must be passed by keyword, or the dict is sent as query params
data2 = etree.HTML(data)
last = data2.xpath('//a[text()="尾页"]/@href')  # the "last page" (尾页) link
# Extract the page count with a regex
if last != []:
    rr = re.search(r"(\d+)", last[0])
    page = rr.group()  # page count found by the regex
else:
    page = 1
print(page)
1.4.2 Enhancing the earlier code (automatic pagination)
import requests
from lxml import etree
import re
url = 'http://www.jkl.com.cn/shop.aspx'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text
data2 = etree.HTML(data)
regin = data2.xpath('//div[@class="infoLis"]//@href')    # district links
name = data2.xpath('//div[@class="infoLis"]//a/text()')  # district names
for i in range(1, len(regin) + 1):  # one iteration per district instead of a hard-coded 12
    url2 = 'http://www.jkl.com.cn/' + regin[i-1]
    data3 = requests.get(url=url2, headers=UA).text
    data4 = etree.HTML(data3)
    last = data4.xpath('//a[text()="尾页"]/@href')
    if last != []:
        page = re.search(r"(\d+)", last[0]).group()
    else:
        page = 1
    print(f"{name[i-1]},{url2},总页数{page}")
1.4.3 Automatic pagination plus batch file download
import requests
from lxml import etree
import re
import os
url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10009'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text
data2 = etree.HTML(data)
fileName = data2.xpath('//div[@class="infoLis"]//a/text()')  # category names
fileUrl = data2.xpath('//div[@class="infoLis"]//@href')      # category links
fileUrl = ['http://www.jkl.com.cn/' + f for f in fileUrl]
dic = dict(zip(fileName, fileUrl))
for n, u in dic.items():
    n = n.replace('/', '.')       # '/' is illegal in a directory name
    n = n.replace('...', '报表')
    # Build the save directory
    uu = 'e:/pc/baobiao/' + n
    if not os.path.exists(uu):
        os.makedirs(uu)
    data3 = requests.get(url=u, headers=UA).text
    data4 = etree.HTML(data3)
    last = data4.xpath('//a[text()="尾页"]/@href')
    if last != []:
        page = re.search(r"(\d+)", last[0]).group()
    else:
        page = 1
    for p in range(1, int(page) + 1):
        # ASP.NET postback fields used by the pager (note the double underscores)
        dd = {
            '__EVENTTARGET': 'AspNetPage1',
            '__EVENTARGUMENT': p,
        }
        data5 = requests.get(url=u, headers=UA, params=dd).text
        data6 = etree.HTML(data5)
        fu = data6.xpath('//div[@class="newsLis"]//li//@href')
        fn = data6.xpath('//div[@class="newsLis"]//li/a/text()')
        fn = [x.strip() for x in fn]  # strip whitespace
        if all(fu):  # skip pages with empty links
            fu = ['http://www.jkl.com.cn' + x for x in fu]
            dic2 = dict(zip(fn, fu))
            for name2, url2 in dic2.items():
                data7 = requests.get(url=url2, headers=UA).content
                suf = url2.split('.')[-1]  # file extension
                url3 = uu + '/' + name2 + "." + suf
                with open(url3, 'wb') as t:
                    t.write(data7)
                print(name2, "下载成功!!!")
1.5 Batch-downloading PPT templates
import requests
from lxml import etree
import os
if not os.path.exists('e:/pc/china'):
    os.makedirs('e:/pc/china')
url = 'https://www.1ppt.com/moban/guoqingjie/'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url, headers=UA)
data.encoding = 'gbk'  # fix garbled Chinese (the site is GBK-encoded)
data2 = etree.HTML(data.text)
fileU = data2.xpath('//ul[@class="tplist"]/li/a/@href')  # detail-page links
fileU = ['https://www.1ppt.com' + f for f in fileU]
fileN = data2.xpath('//ul[@class="tplist"]//@alt')       # template names
dic = dict(zip(fileN, fileU))
for n, u in dic.items():
    data3 = requests.get(u, headers=UA).text
    data4 = etree.HTML(data3)
    url2 = data4.xpath('//ul[@class="downurllist"]//@href')
    url2 = ['https://www.1ppt.com' + x for x in url2]
    url2 = url2[0]  # the download page URL should be a single string
    data5 = requests.get(url2, headers=UA).text
    data6 = etree.HTML(data5)
    url3 = data6.xpath('//ul[@class="downloadlist"]//@href')  # the real download link
    data7 = requests.get(url3[0], headers=UA).content
    suf = url3[0].split('.')[-1]  # file extension
    u = 'e:/pc/china/' + n + "." + suf  # save path (note the slash after 'china')
    with open(u, 'wb') as t:
        t.write(data7)
    print(n + "下载成功!!!")
1.5.1 Revision: automatic pagination
Note: here the URL itself changes when paging.
import requests
from lxml import etree
import os
import re
if not os.path.exists('e:/pc/china'):
    os.makedirs('e:/pc/china')
url = 'https://www.1ppt.com/moban/guoqingjie/'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url, headers=UA)
data.encoding = 'gbk'  # fix garbled Chinese
data2 = etree.HTML(data.text)
# Automatic pagination: read the page count from the "末页" (last page) link
last = data2.xpath('//a[text()="末页"]/@href')
if last != []:
    page = re.search(r"(\d+)", last[0]).group()
else:
    page = 1
for p in range(1, int(page) + 1):
    if p == 1:
        url = 'https://www.1ppt.com/moban/guoqingjie/'
    else:
        url = 'https://www.1ppt.com/moban/guoqingjie/ppt_guoqingjie_' + str(p) + '.html'
    d = requests.get(url, headers=UA)
    d.encoding = 'gbk'  # every page is GBK-encoded, not just the first
    dd = etree.HTML(d.text)
    fileU = dd.xpath('//ul[@class="tplist"]/li/a/@href')
    fileU = ['https://www.1ppt.com' + f for f in fileU]
    fileN = dd.xpath('//ul[@class="tplist"]//@alt')  # read names from the current page, not page 1
    dic = dict(zip(fileN, fileU))
    for n, u in dic.items():
        data3 = requests.get(u, headers=UA).text
        data4 = etree.HTML(data3)
        url2 = data4.xpath('//ul[@class="downurllist"]//@href')
        url2 = ['https://www.1ppt.com' + x for x in url2]
        url2 = url2[0]  # the download page URL should be a single string
        data5 = requests.get(url2, headers=UA).text
        data6 = etree.HTML(data5)
        url3 = data6.xpath('//ul[@class="downloadlist"]//@href')
        data7 = requests.get(url3[0], headers=UA).content
        suf = url3[0].split('.')[-1]  # file extension
        u = 'e:/pc/china/' + n + "." + suf  # save path
        with open(u, 'wb') as t:
            t.write(data7)
        print(n + "下载成功!!!")
1.6 Scraping JD.com reviews and product models (basic)
Note: JD's review data cannot be scraped straight out of the page HTML; it is loaded separately as a JSONP response (visible under XHR in the browser's dev tools).
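The replace() calls below do the job, but a regex is a slightly more robust way to strip a JSONP wrapper; a minimal sketch, assuming the same fetchJSON_comment98 callback name:

import json
import re

def strip_jsonp(text, callback='fetchJSON_comment98'):
    # Capture whatever sits between 'callback(' and the trailing ');'
    m = re.search(re.escape(callback) + r'\((.*)\);?\s*$', text, re.S)
    return json.loads(m.group(1))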
import pandas as pd
import requests
import json
id = input('请输入商品编号:')
n = int(input('请输入要下载评论的页数:'))
url = f"https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={id}&score=0&sortType=5&page={n-1}&pageSize=10&isShadowSku=0&fold=1"
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text
data = data.replace('fetchJSON_comment98(', '')
data = data.replace(');', '')  # strip the JSONP wrapper to get a standard JSON string
dic = json.loads(data)  # JSON string -> dict
# The 'comments' key holds the information we need
data2 = dic['comments']
content = [cc['content'] for cc in data2]      # review text
color = [cc['productColor'] for cc in data2]   # color
size = [cc['productSize'] for cc in data2]     # model/size
data3 = pd.DataFrame({'评价': content, '颜色': color, '型号': size})
data3.index = data3.index + 1
data3.to_excel("e:/pc/京东.xlsx")
1.6.2 Scraping reviews with automatic pagination and a delay
import pandas as pd
import requests
import json
id = input('请输入商品编号:')
url = f"https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={id}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1"
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text
data = data.replace('fetchJSON_comment98(', '')
data = data.replace(');', '')  # strip the JSONP wrapper
dic = json.loads(data)  # JSON string -> dict
maxPage = dic['maxPage']  # total number of comment pages
content, color, size = [], [], []
for p in range(1, maxPage + 1):
    url2 = f'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={id}&score=0&sortType=5&page={p-1}&pageSize=10&isShadowSku=0&fold=1'
    data2 = requests.get(url2, headers=UA).text
    data2 = data2.replace('fetchJSON_comment98(', '')
    data2 = data2.replace(');', '')  # strip the JSONP wrapper
    dic = json.loads(data2)
    # 'comments' holds the information we need
    data3 = dic['comments']
    content += [cc['content'] for cc in data3]
    color += [cc['productColor'] for cc in data3]
    size += [cc['productSize'] for cc in data3]
# Write everything once, after the loop, so later pages don't overwrite earlier ones
data4 = pd.DataFrame({'评价': content, '颜色': color, '型号': size})
data4.index = data4.index + 1
data4.to_excel("e:/pc/京东.xlsx")
Adding a delay: import time, then call time.sleep(3) between page requests.
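A minimal sketch of where that pause fits in the paging loop above (3 seconds is just the note's suggestion, not a measured threshold):

import time

for p in range(1, maxPage + 1):
    # ... fetch and parse page p exactly as above ...
    time.sleep(3)  # pause between pages so the server isn't hammered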
1.7 Scraping Douban data
Dynamic (scroll-to-load) pagination: the data arrives via XHR requests, which can be found in the browser's dev tools.
import pandas as pd
import requests
import json
n = input('您想查看排名前多少位的电影:')
url = 'https://movie.douban.com/j/chart/top_list'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = {
    'type': 11,  # 11 = drama (剧情)
    'interval_id': '100:90',
    'action': '',
    'start': 0,  # note: no stray space in the key name
    'limit': n,
}
data2 = requests.get(url=url, headers=UA, params=data).text  # a JSON string
movies = json.loads(data2)  # JSON string -> list of dicts, one per movie
title = [m['title'] for m in movies]
score = [m['score'] for m in movies]
type = [m['types'] for m in movies]
region = [m['regions'] for m in movies]
date = [m['release_date'] for m in movies]
actor = [m['actors'] for m in movies]
url2 = [m['url'] for m in movies]
data3 = pd.DataFrame({'名称': title, '得分': score, '类型': type, '地区': region, '上映日期': date, '演员': actor, '路径': url2})
data3.index = data3.index + 1
data3.to_excel('e:/pc/豆瓣.xlsx')
1.8 Youdao Dictionary translation: input-driven paging
POST request: the server only sends the data after you send it a request with a body, rather than serving everything up front.
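The notes have no code for this section; below is a minimal sketch of such an input-driven POST, assuming the unsigned fanyi.youdao.com/translate endpoint and the form fields (i, doctype) seen in the browser's XHR panel; the signed translate_o endpoint additionally requires salt/sign parameters:

import requests

word = input('请输入要翻译的内容:')
url = 'http://fanyi.youdao.com/translate'  # assumed endpoint; confirm it in the XHR panel
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
form = {'i': word, 'doctype': 'json'}  # the text travels in the request body, not the URL
dic = requests.post(url, headers=UA, data=form).json()
print(dic['translateResult'][0][0]['tgt'])  # assumed response layout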
1.9 Batch-downloading music
import requests
import json
import os
name = input('请输入要下载的歌手名称:')
n = int(input('请输入要下载的页数:'))
if not os.path.exists(f'e:/pc/{name}'):
    os.makedirs(f'e:/pc/{name}')
UA = {'Cookie': 'Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1663921985; _ga=GA1.2.1757646121.1663921995; _gid=GA1.2.1409077562.1663921995; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1663922141; kw_token=XTF8QQ6Y6WF; _gat=1',
      'csrf': 'XTF8QQ6Y6WF',
      'Referer': 'http://www.kuwo.cn/search/list?key=%E4%BB%A5%E5%86%AC',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
      }
for i in range(1, n + 1):
    url = f'http://www.kuwo.cn/api/www/search/searchMusicBykeyWord?key={name}&pn={i}&rn=30&httpsStatus=1&reqId=a0e843a0-3b1d-11ed-967d-f97f573ba1ab'
    data = requests.get(url=url, headers=UA).text
    data2 = json.loads(data)
    lists = data2['data']['list']  # the songs on this page
    for song in lists:  # a distinct loop variable, so the outer page counter isn't shadowed
        id = song['rid']   # song id
        nn = song['name']  # song title
        url2 = f'http://www.kuwo.cn/api/v1/www/music/playUrl?mid={id}&type=music&httpsStatus=1&reqId=78160f50-3b1f-11ed-805a-931103018992'
        data3 = requests.get(url2, headers=UA).text
        dic = json.loads(data3)
        url3 = dic['data']['url']  # the actual mp3 address
        mp3 = requests.get(url3, headers=UA).content
        u = f'e:/pc/{name}/{nn}.mp3'
        with open(u, 'wb') as t:
            t.write(mp3)
        print(nn, '下载成功!!!')
1.10 Today's weather forecast
Pinyin module: install pypinyin first (pip install pypinyin).
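For reference, lazy_pinyin converts Chinese characters into a list of toneless syllables, which the code below joins into the URL slug:

from pypinyin import lazy_pinyin

print(lazy_pinyin('北京'))            # ['bei', 'jing']
print(''.join(lazy_pinyin('北京')))  # 'beijing', the form used in tianqi.com URLs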
from pypinyin import lazy_pinyin
import requests
from lxml import etree
# Convert the city name to pinyin for the URL
name = input('请输入城市名称:')
city = ''.join(lazy_pinyin(name))
url = f'https://www.tianqi.com/{city}/'  # the site addresses cities by pinyin, so use the converted name
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text
data2 = etree.HTML(data)
name = data2.xpath('//dd[@class="name"]/h1/text()')[0]             # city name
date = data2.xpath('//dd[@class="week"]/text()')[0]                # date
weather = data2.xpath('//dd[@class="weather"]//span/b/text()')[0]  # conditions
c = data2.xpath('//dd[@class="weather"]//span/text()')[0]          # current temperature
nc = data2.xpath('//dd[@class="weather"]//p/b/text()')[0]          # high/low range
qual = data2.xpath('//dd[@class="kongqi"]/h5/text()')[0]           # air quality
print(name, date, weather, c, nc, qual)
1.11 3-, 7-, 10-, 15- and 30-day forecasts
import requests
from lxml import etree
from pypinyin import lazy_pinyin
name = input('请输入城市名称:')
num = input('请输入查询天数:')
if num in ["3", "7", "10", "15", "30"]:
    city = ''.join(lazy_pinyin(name))  # tianqi.com URLs use pinyin, as in 1.10
    url = f'https://www.tianqi.com/{city}/'
    UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    data = requests.get(url=url, headers=UA).text
    data2 = etree.HTML(data)
    n = data2.xpath('//dd[@class="name"]/h1/text()')[0]
    date = data2.xpath('//dd[@class="week"]/text()')[0]
    weather = data2.xpath('//dd[@class="weather"]//span/b/text()')[0]
    c = data2.xpath('//dd[@class="weather"]//span/text()')[0]
    nc = data2.xpath('//dd[@class="weather"]//p/b/text()')[0]
    qual = data2.xpath('//dd[@class="kongqi"]/h5/text()')[0]
    btn = input(f'是否继续查询{name}近{num}日天气预报?是/否:')
    if btn in ('是', 'y', 'Y'):
        url2 = f'https://www.tianqi.com/{city}/{num}/'
        data3 = requests.get(url=url2, headers=UA).text
        data4 = etree.HTML(data3)
        days = data4.xpath('//span[@class="fl"]/text()')       # one date per day
        info = data4.xpath('//div[@class="weaul_z"]//text()')  # five text nodes per day
        for i in range(int(num)):
            d = days[i]
            w = info[5 * i]  # conditions
            c = ''.join(info[5 * i + 1:5 * i + 5])  # temperature range
            print(f'{d}:{w}:{c}')
else:
    print('输入有误')
1.12 CAPTCHA images
import requests
from lxml import etree
url = 'http://www.chaojiying.com/user/reg/'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
data = requests.get(url=url, headers=UA).text
data2 = etree.HTML(data)
img = data2.xpath('//div[@class="login_left width_658"]//@src')[0]  # CAPTCHA image src
img = 'http://www.chaojiying.com' + img
data = requests.get(url=img, headers=UA).content
suf = img.split('.')[-1]  # keep the image's own extension
with open('e:/pc/验证码.' + suf, 'wb') as t:  # save the CAPTCHA with its extension
    t.write(data)
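The notes stop after saving the image; to actually recognize it, one option is Chaojiying's own Python demo client (chaojiying.py, downloadable from their developer docs). A minimal sketch assuming that client, placeholder credentials, and that the image above was saved as a .png; 1902 is one of their generic CAPTCHA type codes:

from chaojiying import Chaojiying_Client  # the demo client file from chaojiying.com

cjy = Chaojiying_Client('用户名', '密码', '软件ID')  # placeholders: fill in your own account
im = open('e:/pc/验证码.png', 'rb').read()  # assumes the CAPTCHA saved above is a .png
print(cjy.PostPic(im, 1902))  # 1902: common 4-6 character alphanumeric type; see their docs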