Python爬虫

一.爬虫基础

安装模块(在命令行中执行):pip install requests lxml pandas openpyxl


1.1爬取京客隆网店店铺信息

import requests
from lxml import etree
import pandas as pd

url = "http://www.jkl.com.cn/shop.aspx"  # landing page that links to every district
# Browser-like User-Agent header sent with every request.
UA = {'user-agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}

# Step 1: fetch the landing page and collect every href found under the
# "infoLis" container; each one is requested below as a district page.
landing_html = requests.get(url=url, headers=UA).text
landing_tree = etree.HTML(landing_html)
regin = landing_tree.xpath('//div[@class="infoLis"]//@href')

# CSV column -> XPath that extracts that column from a district page.
FIELD_XPATHS = {
    '店名': '//span[@class="con01"]/text()',      # shop name
    '地址': '//span[@class="con02"]/text()',      # address
    '电话号码': '//span[@class="con03"]/text()',  # phone number
    '营业时间': '//span[@class="con04"]/text()',  # opening hours
}

# Step 2: visit every district page and append its shops to one CSV file.
for district_href in regin:
    district_html = requests.get(
        url='http://www.jkl.com.cn/' + district_href, headers=UA
    ).text
    district_tree = etree.HTML(district_html)
    rows = pd.DataFrame(
        {column: district_tree.xpath(path) for column, path in FIELD_XPATHS.items()}
    )
    # mode='a' appends to the existing file on every iteration.
    rows.to_csv("e:/pc/01.csv", index=False, header=0, mode='a', encoding='ANSI')

1.2批量下载图片

import requests
from lxml import etree
import os
# Create the download directory on first run.
image_dir_missing = not os.path.exists('e:/pc/image')
if image_dir_missing:
    os.makedirs('e:/pc/image/')

url = 'http://www.jkl.com.cn/phoLis.aspx'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

# Every src attribute under the "proLis" container is treated as an image path.
gallery_tree = etree.HTML(requests.get(url=url, headers=UA).text)
img = gallery_tree.xpath('//div[@class="proLis"]//@src')

for src in img:
    image_url = 'http://www.jkl.com.cn' + src
    payload = requests.get(url=image_url, headers=UA).content  # raw bytes
    name = image_url.rsplit('/', 1)[-1]  # last URL segment becomes the file name
    with open('e:/pc/image/' + name, 'wb') as out:  # binary mode for image data
        out.write(payload)
        print(name, '下载成功!!!')

1.3批量下载文件

import requests
from lxml import etree
import os
# Create the download directory on first run.
file_dir_missing = not os.path.exists('e:/pc/file')
if file_dir_missing:
    os.makedirs('e:/pc/file/')

url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10009'
UA = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

listing_tree = etree.HTML(requests.get(url=url, headers=UA).text)
hrefs = listing_tree.xpath('//div[@class="newsLis"]//li//@href')
titles = listing_tree.xpath('//div[@class="newsLis"]//li//a/text()')

# Pair each whitespace-stripped link text with its absolute URL, by position.
keys = [title.strip() for title in titles]
values = ['http://www.jkl.com.cn' + href for href in hrefs]
dic = dict(zip(keys, values))

for title, link in dic.items():
    extension = link.split('.')[-1]  # text after the last dot of the URL
    body = requests.get(url=link, headers=UA).content
    target = 'e:/pc/file/' + title + '.' + extension
    with open(target, 'wb') as out:
        out.write(body)
        print(title, '下载成功!!!')

1.4智能探测总页数

1.4.1通过尾页获得总页数

import requests
from lxml import etree
import re
url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10009'
UA = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

# headers= must be passed by keyword: the second positional parameter of
# requests.get is `params`, which would put the User-Agent into the query
# string instead of sending it as a header.
data = requests.get(url=url, headers=UA).text
data2 = etree.HTML(data)
last = data2.xpath('//a[text()="尾页"]//@href')  # href of the "last page" link

# The first run of digits in the last-page link is taken as the page count.
# When the link (or any digit in it) is missing, the listing is treated as a
# single page. The result is always an int.
match = re.search(r"(\d+)", last[0]) if last else None
page = int(match.group()) if match else 1
print(page)

1.4.2增强前面代码(智能翻页)

import requests
from lxml import etree
import re

url = 'http://www.jkl.com.cn/shop.aspx'
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

data = requests.get(url=url,headers=UA).text
data2 = etree.HTML(data)
regin = data2.xpath('//div[@class="infoLis"]//@href')    # district page links
name = data2.xpath('//div[@class="infoLis"]//a/text()')  # district link texts

# Walk every district actually present on the page (paired by position)
# instead of assuming there are exactly 12 of them.
for district, href in zip(name, regin):
    url2 = 'http://www.jkl.com.cn/' + href
    data3 = requests.get(url=url2, headers=UA).text
    data4 = etree.HTML(data3)
    last = data4.xpath('//a[text()="尾页"]/@href')  # href of the "last page" link
    # First run of digits in the last-page link = page count; default to 1
    # when the district has no such link.
    match = re.search(r"(\d+)", last[0]) if last else None
    page = int(match.group()) if match else 1
    print(f"{district},{url2},总页数{page}")

1.4.3智能翻页和批量下载文件

import requests
from lxml import etree
import re
import os
url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10009'
UA = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

# headers= is passed by keyword everywhere: the second positional parameter
# of requests.get is `params`, not `headers`.
data = requests.get(url=url, headers=UA).text
data2 = etree.HTML(data)
fileName = data2.xpath('//div[@class="infoLis"]//a/text()')
fileUrl = ['http://www.jkl.com.cn/' + href
           for href in data2.xpath('//div[@class="infoLis"]//@href')]
dic = dict(zip(fileName, fileUrl))  # category link text -> category URL
for n, u in dic.items():
    # Turn the category name into a usable directory name.
    n = n.replace('/','.')
    n = n.replace('...','报表')

    # One sub-directory per category.
    uu = 'e:/pc/baobiao/'+n
    if not os.path.exists(uu):
        os.makedirs(uu)

    data3 = requests.get(url=u, headers=UA).text
    data4 = etree.HTML(data3)
    last = data4.xpath('//a[text()="尾页"]/@href')  # href of the "last page" link
    # First run of digits in the last-page link = page count; default to 1.
    match = re.search(r"(\d+)", last[0]) if last else None
    total_pages = int(match.group()) if match else 1

    for page in range(1, total_pages + 1):
        # NOTE(review): ASP.NET WebForms postbacks normally use the
        # double-underscore fields __EVENTTARGET / __EVENTARGUMENT sent via
        # POST - confirm these query parameters really select the page.
        dd = {
            '_EVENTTARGET': 'AspNetPage1',
            '_EVENTARGUMENT':page
        }
        data5 = requests.get(url=u,headers=UA,params=dd).text
        data6 = etree.HTML(data5)
        fu = data6.xpath('//div[@class="newsLis"]//li//@href')
        fn = [text.strip()
              for text in data6.xpath('//div[@class="newsLis"]//li/a/text()')]
        for name2, href in dict(zip(fn, fu)).items():
            if not href:
                # Entry without a file link: skip only this entry rather
                # than discarding the whole page.
                continue
            url2 = 'http://www.jkl.com.cn' + href
            data7 = requests.get(url=url2,headers=UA).content
            suf = url2.split('.')[-1]  # text after the last dot of the URL
            url3 = uu+'/'+name2+"."+suf
            with open(url3,'wb') as t:
                t.write(data7)
                print(name2,"下载成功!!!")

1.5批量下载ppt模板

import requests
from lxml import etree
import os

if not os.path.exists('e:/pc/china'):
    os.makedirs('e:/pc/china')

url ='https://www.1ppt.com/moban/guoqingjie/'
UA ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

# headers= is passed by keyword everywhere: the second positional parameter
# of requests.get is `params`, not `headers`.
data = requests.get(url=url, headers=UA)
data.encoding='gbk'  # decode the listing as gbk so the Chinese names are not garbled
data2 = etree.HTML(data.text)

fileU = ['https://www.1ppt.com' + href
         for href in data2.xpath('//ul[@class="tplist"]/li/a/@href')]
fileN = data2.xpath('//ul[@class="tplist"]//@alt')
dic = dict(zip(fileN,fileU))  # template name -> detail page URL
for n,u in dic.items():
    # Detail page -> download page.
    data3 = requests.get(url=u, headers=UA).text
    data4 = etree.HTML(data3)
    url2 = data4.xpath('//ul[@class="downurllist"]//@href')
    if not url2:
        continue  # no download page link for this template
    url2 = 'https://www.1ppt.com' + url2[0]  # only the first link is used

    # Download page -> actual file URL.
    data5 = requests.get(url=url2, headers=UA).text
    data6 = etree.HTML(data5)
    url3 = data6.xpath('//ul[@class="downloadlist"]//@href')
    if not url3:
        continue  # no file link on the download page
    data7 = requests.get(url=url3[0], headers=UA).content
    suf = url3[0].split('.')[-1]  # text after the last dot of the file URL
    # The '/' after the directory is required; without it the file is
    # written next to e:/pc/china instead of inside it.
    path = 'e:/pc/china/'+n+"."+suf
    with open(path,'wb') as t:
        t.write(data7)
        print(n+"下载成功!!!")

1.5.1修改-智能翻页

注:该网站翻页时 URL 会发生变化(第 1 页为栏目首页,第 n 页为 ppt_guoqingjie_n.html),因此需要根据页码拼接每一页的 URL。

import requests
from lxml import etree
import os
import re

if not os.path.exists('e:/pc/china'):
    os.makedirs('e:/pc/china')

url ='https://www.1ppt.com/moban/guoqingjie/'
UA ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

# headers= is passed by keyword everywhere: the second positional parameter
# of requests.get is `params`, not `headers`.
data = requests.get(url=url, headers=UA)
data.encoding='gbk'  # decode as gbk so Chinese text is not garbled
data2 = etree.HTML(data.text)

# Page count: first run of digits in the "last page" link, default 1.
last = data2.xpath('//a[text()="末页"]/@href')
match = re.search(r"(\d+)", last[0]) if last else None
total_pages = int(match.group()) if match else 1

for page in range(1, total_pages + 1):
    # Page 1 is the section root; later pages have their own file name.
    if page == 1:
        url = 'https://www.1ppt.com/moban/guoqingjie/'
    else:
        url = 'https://www.1ppt.com/moban/guoqingjie/'+'ppt_guoqingjie_'+str(page)+'.html'
    d = requests.get(url=url, headers=UA)
    d.encoding = 'gbk'  # template names are read from this page, so decode it too
    dd = etree.HTML(d.text)
    fileU = ['https://www.1ppt.com' + href
             for href in dd.xpath('//ul[@class="tplist"]/li/a/@href')]
    # Names must come from the current page (dd), not from the first page,
    # otherwise every page is saved under the first page's names.
    fileN = dd.xpath('//ul[@class="tplist"]//@alt')
    dic = dict(zip(fileN,fileU))  # template name -> detail page URL
    for n,u in dic.items():
        # Detail page -> download page.
        data3 = requests.get(url=u, headers=UA).text
        data4 = etree.HTML(data3)
        url2 = data4.xpath('//ul[@class="downurllist"]//@href')
        if not url2:
            continue  # no download page link for this template
        url2 = 'https://www.1ppt.com' + url2[0]  # only the first link is used

        # Download page -> actual file URL.
        data5 = requests.get(url=url2, headers=UA).text
        data6 = etree.HTML(data5)
        url3 = data6.xpath('//ul[@class="downloadlist"]//@href')
        if not url3:
            continue  # no file link on the download page
        data7 = requests.get(url=url3[0], headers=UA).content
        suf = url3[0].split('.')[-1]  # text after the last dot of the file URL
        path = 'e:/pc/china/'+n+"."+suf
        with open(path,'wb') as t:
            t.write(data7)
            print(n+"下载成功!!!")

1.6爬取京东商城评价及型号(初级)

注:京东的评论数据是通过接口动态加载的,无法直接从商品页面的 HTML 中爬取到,需要请求评论接口(返回 JSONP 格式的数据)再解析。

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-USQJy33o-1664347823627)(E:%5C%E7%AC%94%E8%AE%B02%5Cimages%5Cimage-20220922171608800.png)]

import pandas as pd
import requests
from lxml import etree
import json
# Ask for the product id and the (1-based) comment page to download.
product_id = input('请输入商品编号:')
page_no = int(input('请输入要下载评论的页数:'))
# The endpoint's page parameter is given page_no - 1.
url = f"https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={product_id}&score=0&sortType=5&page={page_no-1}&pageSize=10&isShadowSku=0&fold=1"
UA ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

raw = requests.get(url=url, headers=UA).text
# Strip the callback wrapper so only the JSON text remains.
payload = raw.replace('fetchJSON_comment98(', '').replace(');', '')
dic = json.loads(payload)

# The 'comments' entry holds one record per review.
comments = dic['comments']
table = pd.DataFrame({
    '评价': [entry['content'] for entry in comments],
    '颜色': [entry['productColor'] for entry in comments],
    '型号': [entry['productSize'] for entry in comments],
})
table.index = table.index + 1  # number rows from 1 instead of 0
table.to_excel("e:/pc/京东.xlsx")

1.6.2爬取商城评价智能翻页及延时

import json
import time

import pandas as pd
import requests

product_id = input('请输入商品编号:')
url = f"https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={product_id}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1"
UA ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

# First request: yields both the first page of comments and the page count.
data = requests.get(url=url,headers=UA).text
data = data.replace('fetchJSON_comment98(','')
data = data.replace(');','')  # strip the callback wrapper, leaving JSON text
dic = json.loads(data)
maxPage = dic['maxPage']

# Comments of all pages are accumulated here and written once at the end;
# writing inside the loop would overwrite the file on every page.
content, color, size = [], [], []
for p in range(1,maxPage+1):
    if p > 1:
        # Page 1 was already fetched above; pause before each further
        # request so the endpoint is not hit in a tight loop.
        time.sleep(3)
        url2 = f'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={product_id}&score=0&sortType=5&page={p-1}&pageSize=10&isShadowSku=0&fold=1'
        # headers= by keyword: the second positional parameter of
        # requests.get is `params`, not `headers`.
        data2 = requests.get(url=url2, headers=UA).text
        data2 = data2.replace('fetchJSON_comment98(', '')
        data2 = data2.replace(');', '')
        dic = json.loads(data2)

    # The 'comments' entry holds one record per review.
    for cc in dic['comments']:
        content.append(cc['content'])
        color.append(cc['productColor'])
        size.append(cc['productSize'])

data4 = pd.DataFrame({'评价':content,'颜色':color,'型号':size})
data4.index = data4.index+1  # number rows from 1 instead of 0
data4.to_excel("e:/pc/京东.xlsx")

延时问题:连续请求过快容易被网站限制,可以在每次翻页请求之间加入延时——先 import time,再在循环中调用 time.sleep(3)(暂停 3 秒)。

1.7爬取豆瓣数据

动态翻页(下拉式):XHR

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cY806M36-1664347823628)(E:%5C%E7%AC%94%E8%AE%B02%5Cimages%5Cimage-20220923141632313.png)]

import pandas as pd
import requests
from lxml import etree
import json
n = input('您想查看排名多少前多少位的电影')

url = 'https://movie.douban.com/j/chart/top_list'
UA ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
# Query-string parameters of the chart endpoint.
data={
    'type': 11,  # 11 selects the drama genre
    'interval_id': '100:90',
    'action':'',
    'start': 0,  # key must be exactly 'start' (no stray whitespace)
    'limit': n,
}
data2 = requests.get(url=url,headers=UA,params=data).text  # JSON text
dic = json.loads(data2)  # decoded JSON, iterated below as one mapping per movie

# One spreadsheet column per field taken from every movie record.
data3 = pd.DataFrame({
    '名称': [movie['title'] for movie in dic],
    '得分': [movie['score'] for movie in dic],
    '类型': [movie['types'] for movie in dic],
    '地区': [movie['regions'] for movie in dic],
    '上映日期': [movie['release_date'] for movie in dic],
    '演员': [movie['actors'] for movie in dic],
    '路径': [movie['url'] for movie in dic],
})
data3.index = data3.index+1  # number rows from 1 instead of 0
data3.to_excel('e:/pc/豆瓣.xlsx')

1.8有道词典翻译-输入式翻页

POST 请求:客户端先把参数(例如要翻译的文本)放在请求体中提交给服务器,服务器再根据这些参数返回对应的数据;而不是像 GET 请求那样,直接访问 URL 就能拿到数据。

1.9批量下载音乐

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-01siYhEE-1664347823628)(E:%5C%E7%AC%94%E8%AE%B02%5Cimages%5Cimage-20220923172831544.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1L8RHJg7-1664347823628)(E:%5C%E7%AC%94%E8%AE%B02%5Cimages%5Cimage-20220923171410155.png)]

import requests
import json
import os
name = input('请输入要下载的歌手名称:')
n = int(input('请输入要下载的页数:'))
# One directory per search keyword.
if not os.path.exists(f'e:/pc/{name}'):
    os.makedirs(f'e:/pc/{name}')

# Request headers sent with every call below.
UA = {'Cookie': 'Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1663921985; _ga=GA1.2.1757646121.1663921995; _gid=GA1.2.1409077562.1663921995; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1663922141; kw_token=XTF8QQ6Y6WF; _gat=1',
       'csrf': 'XTF8QQ6Y6WF',
        'Referer': 'http://www.kuwo.cn/search/list?key=%E4%BB%A5%E5%86%AC',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

for page in range(1,n+1):
    # Search results for the keyword, one page (pn) at a time.
    url = f'http://www.kuwo.cn/api/www/search/searchMusicBykeyWord?key={name}&pn={page}&rn=30&httpsStatus=1&reqId=a0e843a0-3b1d-11ed-967d-f97f573ba1ab'
    data = requests.get(url=url,headers=UA).text
    data2 = json.loads(data)
    lists = data2['data']['list']
    for song in lists:
        rid = song['rid']
        nn = song['name']
        # Resolve the playable URL for this track id.
        url2 = f'http://www.kuwo.cn/api/v1/www/music/playUrl?mid={rid}&type=music&httpsStatus=1&reqId=78160f50-3b1f-11ed-805a-931103018992'
        # headers= by keyword: the second positional parameter of
        # requests.get is `params`, which would append the cookie/csrf values
        # to the query string instead of sending them as headers.
        data3 = requests.get(url=url2, headers=UA).text
        dic = json.loads(data3)
        url3 = dic['data']['url']
        mp3 = requests.get(url=url3, headers=UA).content
        u = f'e:/pc/{name}/{nn}.mp3'
        with open(u,'wb')as t:
            t.write(mp3)
            print(nn,'下载成功!!!')

1.10今日天气预报

拼音模块:需要先安装 pypinyin 模块(pip install pypinyin),用于把中文城市名转换成拼音,导入方式如下:

from pypinyin import lazy_pinyin

from pypinyin import lazy_pinyin
import requests
from lxml import etree
# 获得城市拼音
# Convert the entered city name to pinyin; the pieces returned by
# lazy_pinyin are joined into one string used as the URL path segment.
name = input('请输入城市名称:')
city_pinyin = ''.join(lazy_pinyin(name,style=0))

# The URL is built from the pinyin, not from the raw input.
url = f'https://www.tianqi.com/{city_pinyin}/'
UA ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

data = requests.get(url=url,headers=UA).text
data2 = etree.HTML(data)
# First match of each XPath on the city page; [0] raises IndexError when
# the page does not contain the expected element.
name = data2.xpath('//dd[@class="name"]/h1/text()')[0]
date = data2.xpath('//dd[@class="week"]/text()')[0]
weather = data2.xpath('//dd[@class="weather"]//span/b/text()')[0]
c = data2.xpath('//dd[@class="weather"]//span/text()')[0]
nc = data2.xpath('//dd[@class="weather"]//p/b/text()')[0]
qual = data2.xpath('//dd[@class="kongqi"]/h5/text()')[0]
print(qual)

1.11未来3、7、10、15、30天天气预报

import requests
from lxml import etree
# NOTE(review): the input is placed into the URL as typed - presumably the
# site expects the pinyin form of the city name; confirm.
name = input('请输入城市名称:')
num = input('请输入查询天数:')
if num in ["3","7","10","15","30"]:
    url = f'https://www.tianqi.com/{name}/'
    UA ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

    # Today's values: first match of each XPath on the city page.
    data = requests.get(url=url,headers=UA).text
    data2 = etree.HTML(data)
    n= data2.xpath('//dd[@class="name"]/h1/text()')[0]
    date = data2.xpath('//dd[@class="week"]/text()')[0]
    weather = data2.xpath('//dd[@class="weather"]//span/b/text()')[0]
    c = data2.xpath('//dd[@class="weather"]//span/text()')[0]
    nc = data2.xpath('//dd[@class="weather"]//p/b/text()')[0]
    qual = data2.xpath('//dd[@class="kongqi"]/h5/text()')[0]
    btn = input(f'是否继续查询{name}近{num}日天气预报?是/否:')
    if btn in ('是', 'y', 'Y'):
        url2 = f'https://www.tianqi.com/{name}/{num}/'
        data3 = requests.get(url=url2,headers=UA).text
        data4 = etree.HTML(data3)
        # Evaluate each XPath once instead of on every loop iteration.
        days = data4.xpath('//span[@class="fl"]/text()')
        cells = data4.xpath('//div[@class="weaul_z"]//text()')
        # Five consecutive text nodes are consumed per day: the first is
        # printed on its own, the remaining four are concatenated.
        # Never index past what the page actually returned.
        count = min(int(num), len(days), len(cells) // 5)
        for i in range(count):
            d = days[i]
            w = cells[5*i]
            temps = ''.join(cells[5*i + 1:5*i + 5])
            print(f'{d}:{w}:{temps}')
else:
    # Reached only when the day count is not one of the supported values;
    # declining the confirmation prompt above is not an input error.
    print('输入有误')

1.12验证码

import requests
from lxml import etree

url = 'http://www.chaojiying.com/user/reg/'
UA = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

# Parse the registration page and take the first src attribute found under
# the login panel as the captcha image path.
page_tree = etree.HTML(requests.get(url=url, headers=UA).text)
captcha_src = page_tree.xpath('//div[@class="login_left width_658"]//@src')[0]
img = 'http://www.chaojiying.com' + captcha_src

# Fetch the image bytes and store them on disk.
data = requests.get(url=img, headers=UA).content
with open('e:/pc/验证码', 'wb') as out:
    out.write(data)