身为一个有觉悟的渣渣,永远不会停止爬虫的瞎写(内卷)之路。很久没有coding了,so就有了下面这篇分享:一个博客爬虫,图片爬虫,我们都非常熟悉的新浪博客的图片爬虫。为了体现本渣渣的渣渣(弱智)水平,带来了一个异步版本,供大家参考学习,如果异步玩得6,请带带本渣渣!
异步代码是本渣渣抄袭的,不懂不要问本渣渣,因为本渣渣也不会。。。
目标网址:
http://blog.sina.com.cn/s/articlelist_1462278767_0_1.html
几个关键点
1.图片Referer反爬
请求图片时如果没有在请求头中设置Referer,会下载不到想要的图片内容!
headers={
"Referer":url,
"User-Agent":UserAgent().random,
}
2.图片中高清大图的地址获取
高清大图的链接是需要替换的,这里本渣渣直接用replace替换!
img=img.replace("mw690","orignal").replace("bmiddle","orignal").replace("middle","orignal")
附完整源码参考:
#http://blog.sina.com.cn/s/articlelist_1462278767_0_1.html
#新浪博客文章采集
#20210705 by 微信:huguo00289
# -*- coding: UTF-8 -*-
import requests,time
from fake_useragent import UserAgent
from lxml import etree
import os,re
def ua():
    """Build request headers carrying a randomly chosen User-Agent.

    Returns:
        dict: a one-entry header mapping ``{"User-Agent": <random UA string>}``
        produced by ``fake_useragent.UserAgent``.
    """
    headers = {"User-Agent": UserAgent().random}
    return headers
def get_pagenum(num=20):
    """Walk the blog's article-list pages and download the images of every post.

    For each list page the article links are extracted and handed to
    ``get_imgs``.  A post that fails is reported and its URL is appended to
    ``fail_list.txt`` so it can be retried later.

    Args:
        num: number of list pages to crawl, starting at page 1
            (defaults to 20, the value that used to be hard-coded).
    """
    for i in range(1, num + 1):
        print(f">>正在爬取第{i}页数据..")
        url = f"http://blog.sina.com.cn/s/articlelist_1462278767_0_{i}.html"
        try:
            html = requests.get(url=url, headers=ua(), timeout=6).content.decode('utf-8')
        except (requests.RequestException, UnicodeDecodeError) as e:
            # A single unreachable or undecodable list page must not abort the whole crawl.
            print(f"访问出错,错误代码{e}")
            continue
        time.sleep(8)  # throttle between list-page requests
        tree = etree.HTML(html)
        hrefs = tree.xpath('//span[@class="atc_title"]/a/@href')
        print(hrefs)
        for href in hrefs:
            try:
                get_imgs(href)
                time.sleep(2)
            except Exception as e:
                # Best-effort crawl: record the failing post and move on to the next one.
                print(f"访问出错,错误代码{e}")
                with open("fail_list.txt", 'a+', encoding='utf-8') as f:
                    f.write(f'{href}\n')
                print('保存访问失败的图片数据列表链接成功!')
def get_imgs(url):
    """Download every image of one blog post into a folder named after the post.

    The post title (first ``<h2>``) becomes the directory name after characters
    that are illegal in file names are replaced by ``_``.  Images are saved as
    ``1.jpg``, ``2.jpg``, ... following their position in the post.

    Args:
        url: address of the blog post; also sent as the ``Referer`` of the
            image requests, which the image host requires.

    Raises:
        IndexError: if the page contains no ``<h2>`` title.
        requests.RequestException: if a page or image request fails.
    """
    html = requests.get(url=url, headers=ua(), timeout=6).content.decode('utf-8')
    time.sleep(6)  # throttle after fetching the post page
    tree = etree.HTML(html)
    h2 = tree.xpath('//h2/text()')[0]
    print(h2)
    h2 = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", h2)  # replace characters not allowed in file names
    path = f'{h2}/'
    os.makedirs(path, exist_ok=True)
    imgs = tree.xpath('//div[@id="sina_keyword_ad_area2"]//img/@real_src')
    print(imgs)
    headers = {
        "Referer": url,
        "User-Agent": UserAgent().random,
    }
    for i, img in enumerate(imgs, start=1):
        # Swap the thumbnail size segment for the full-size one.  The spelling
        # "orignal" is kept verbatim from the original code — presumably the
        # path segment the image host uses; verify before "correcting" it.
        img = img.replace("mw690", "orignal").replace("bmiddle", "orignal").replace("middle", "orignal")
        r = requests.get(url=img, headers=headers, timeout=6)
        if r.status_code == 200:
            with open(f'{path}{i}.jpg', 'wb') as f:
                f.write(r.content)
            print("下载图片成功!")
        else:
            # Do not store an error page under a .jpg name.
            print(f"下载图片失败,状态码{r.status_code}:{img}")
        time.sleep(1)
if __name__ == "__main__":
    # Start the synchronous crawl only when the file is run as a script.
    get_pagenum()
附异步版本源码参考:
#20210721 by 微信:huguo00289
# -*- coding: UTF-8 -*-
#https://www.52pojie.cn/forum.php?mod=viewthread&tid=1469537&extra=page%3D1%26filter%3Dtypeid%26typeid%3D29
#[Python] 爬取小姐姐写真照的全站异步爬虫,即使设置了反爬我也要爬给你看
import asyncio
import time
import aiohttp
import aiofiles
from lxml import etree
import os
import re
from fake_useragent import UserAgent
from functools import wraps
from asyncio.proactor_events import _ProactorBasePipeTransport
def silence_event_loop_closed(func):
    """Decorate a method so that ``RuntimeError('Event loop is closed')`` is swallowed.

    Args:
        func: the method to wrap; it is called as ``func(self, *args, **kwargs)``.

    Returns:
        A wrapper with the metadata of ``func`` that returns ``func``'s result,
        or ``None`` when ``func`` raised ``RuntimeError`` whose message is
        exactly ``'Event loop is closed'``.  Any other exception, including a
        ``RuntimeError`` with a different message, propagates unchanged.
    """
    @wraps(func)  # keep the wrapped method's name and docstring
    def wrapper(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except RuntimeError as e:
            if str(e) != 'Event loop is closed':
                raise
    return wrapper
# Replace the proactor pipe transport's finalizer with a wrapped version that
# ignores the "Event loop is closed" RuntimeError raised during cleanup.
_ProactorBasePipeTransport.__del__ = silence_event_loop_closed(
    _ProactorBasePipeTransport.__del__
)

# One User-Agent is drawn at import time and shared, together with a fixed
# Referer, by every request the async crawler sends.
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
    'Referer': 'http://blog.sina.com.cn',
}
class Slblog:
    """Asynchronous image downloader for the article lists of one Sina blog.

    The crawl goes list page -> article pages -> images.  Every request uses
    the module-level ``headers`` mapping.  ``write_num`` counts the image
    files written during the run.
    """

    def __init__(self):
        # Number of images written to disk so far.
        self.write_num = 0

    @staticmethod
    def _page_numbers(start, count):
        """Return the ``count`` consecutive page numbers beginning at ``start``."""
        return range(start, start + count)

    @staticmethod
    def _original_size_url(url):
        """Rewrite a thumbnail URL so that it points at the full-size image.

        The replacement text "orignal" is kept verbatim from the original
        code — presumably the spelling the image host uses; verify before
        changing it.
        """
        return url.replace("mw690", "orignal").replace("bmiddle", "orignal").replace("middle", "orignal")

    # Fetch the text of a web page.
    async def get_url(self, url):
        """Return the body text of ``url``, or ``None`` if the status is not 200."""
        async with aiohttp.ClientSession() as client:
            async with client.get(url, headers=headers) as resp:
                if resp.status == 200:
                    return await resp.text()
        return None

    # Extract the article links of a list page and process them concurrently.
    async def html_parse(self, html):
        """Process every article linked from the list page ``html``.

        Args:
            html: text of an article-list page; ``None`` or empty (a failed
                fetch) is reported and skipped.
        """
        if not html:
            print('列表页获取失败,已跳过。')
            return
        semaphore = asyncio.Semaphore(5)  # at most five articles in flight
        url_list = etree.HTML(html).xpath('//span[@class="atc_title"]/a/@href')
        if not url_list:
            # Nothing to schedule; waiting on an empty task set would raise.
            return
        results = await asyncio.gather(
            *(self.img_parse(url, semaphore) for url in url_list),
            return_exceptions=True,
        )
        # One broken article must not stop the others, but it should be visible.
        for url, result in zip(url_list, results):
            if isinstance(result, BaseException):
                print(f'文章处理失败 {url}: {result!r}')

    # Read an article's title and image links, then download the images.
    async def img_parse(self, h_url, sem):
        """Download all images of the article at ``h_url``.

        Args:
            h_url: address of the article page.
            sem: semaphore limiting how many articles are handled at once.
        """
        async with sem:
            h_html = await self.get_url(h_url)
            if not h_html:
                print(f'文章页获取失败,已跳过: {h_url}')
                return
            h_html_parse = etree.HTML(h_html)
            titles = h_html_parse.xpath('//h2/text()')
            if not titles:
                print(f'未找到文章标题,已跳过: {h_url}')
                return
            # Replace characters that are not allowed in file names.
            title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", titles[0])
            img_demo_url = h_html_parse.xpath(
                '//div[@id="sina_keyword_ad_area2"]//img/@real_src')
            # Map each full-size URL to its 1-based position; a URL that occurs
            # more than once keeps its last position, so it is fetched once.
            index_dict = {
                self._original_size_url(d_url): position
                for position, d_url in enumerate(img_demo_url, start=1)
            }
            if not index_dict:
                return
            semaphore = asyncio.Semaphore(5)  # at most five images of this article in flight
            results = await asyncio.gather(
                *(self.img_con(i_url, i_num, title, semaphore)
                  for i_url, i_num in index_dict.items()),
                return_exceptions=True,
            )
            for i_url, result in zip(index_dict, results):
                if isinstance(result, BaseException):
                    print(f'图片下载失败 {i_url}: {result!r}')

    # Fetch the bytes of one image.
    async def img_con(self, url, num, title, semaphore):
        """Fetch the image at ``url`` and store it as ``<title>/<num>.jpg``."""
        async with semaphore:
            async with aiohttp.ClientSession() as client:
                async with client.get(url, headers=headers) as resp:
                    if resp.status == 200:
                        img_con = await resp.read()
                        await self.write_img(img_con, num, title)
                    else:
                        print('请求出错,请尝试调低并发数重新下载!!')

    # Write one image to disk.
    async def write_img(self, img_con, num, title):
        """Write ``img_con`` to ``<title>/<num>.jpg`` and bump ``write_num``.

        The directory ``title`` is created when it does not exist yet.
        """
        os.makedirs(title, exist_ok=True)
        async with aiofiles.open(title + '/' + f'{num}.jpg', 'wb') as f:
            print(f'正在下载{title}/{num}.jpg')
            await f.write(img_con)
            self.write_num += 1

    # Entry point.
    async def main(self, ):
        """Ask for the first page and the page count, then crawl those list pages."""
        q_start_num = input('输入要从第几页开始下载(按Entry默认为1):') or '1'
        start_num = int(q_start_num)
        page_count = int(input('请输入要下载的页数:'))
        print('*' * 74)
        start_time = time.time()
        # Exactly ``page_count`` pages: start_num .. start_num + page_count - 1.
        for num in self._page_numbers(start_num, page_count):
            url = f'http://blog.sina.com.cn/s/articlelist_1462278767_0_{num}.html'
            html = await self.get_url(url)
            print('开始解析下载>>>')
            await self.html_parse(html)
        end_time = time.time()
        print(f'本次共下载图片{self.write_num}张,共耗时{end_time - start_time}秒。')
if __name__ == "__main__":
    # Run the interactive async crawl only when the file is executed as a script.
    a = Slblog()
    asyncio.run(a.main())
·················END·················
你好,我是二大爷,
革命老区外出进城务工人员,
互联网非早期非专业站长,
喜好python,写作,阅读,英语
不入流程序,自媒体,seo . . .