import os
import requests
from bs4 import BeautifulSoup
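# A simple crawler that walks the mzitu.com listing pages, follows each photo
# set, and saves every image under save_dir. The CSS classes and attributes
# referenced below ('next page-numbers', 'pagenavi', 'blur', 'pins',
# 'data-original') reflect the site's markup at the time of writing and may change.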
rooturl = 'https://www.mzitu.com/'
save_dir = 'D:/estimages/'
no_more_pages = 'END'
max_pages = 10
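# max_pages limits how many listing pages are crawled; no_more_pages is the
# sentinel returned by getNextPageUrl() when the pagination runs out.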
# This is a set, so entries are unique - the same image is not downloaded twice.
image_cache = set()
index = len(image_cache)
headers = {
    'Referer': "https://www.mzitu.com/",
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}  # request headers that masquerade as a browser
def getNextPageUrl(html):
    # Locate the pagination bar and pull the link to the next listing page.
    ahref = html.find('a', {'class': 'next page-numbers'})
    if ahref is None:
        print('no more pages')
        return no_more_pages
    else:
        return ahref.get('href')
def findTheNum(navi):
    # The pagination bar lists page numbers in <span> tags; the largest one
    # is the number of pages (one photo per page) in this set.
    lis = navi.find_all('span')
    num = 0
    for span in lis:
        if span.string and span.string.isdigit():
            tmp = int(span.string)
            if tmp > num:
                num = tmp
    return num
def deepSaveImgs(href, saveDir):
    html = BeautifulSoup(requests.get(href, headers=headers).text, features="html.parser")
    # Find the number of pages (photos) in this set.
    total = findTheNum(html.find('div', {'class': 'pagenavi'}))
    print('total of this group is %d.' % total)
    for i in range(1, (total + 1)):
        url = '{}/{}'.format(href, i)  # URL of the page holding the i-th photo
        html = BeautifulSoup(requests.get(url, headers=headers).text, features="html.parser")  # parse that photo page
        img_url = html.find('img', attrs={'class': 'blur'}).get('src')  # actual address of the photo
        if img_url in image_cache:  # skip photos that were already downloaded
            continue
        # The site blocks hotlinking, so the Referer header is reset to the current
        # page; you can see this under Request Headers in the browser's F12 network panel.
        new_headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            'Referer': url
        }
        img = requests.get(img_url, headers=new_headers)  # fetch the actual photo
        print(img.url)
        with open('{}/{}'.format(saveDir, img.url.split("/")[-1]), 'wb') as jpg:  # write the photo to a local file
            jpg.write(img.content)
        image_cache.add(img_url)
def saveImgs(html, mainidx):
    lis = html.find('ul', {'id': 'pins'}).find_all('li')
    subidx = 1
    for link in lis:
        # step 1: save this cover image, and create the folder.
        a = link.find('a')
        href = a.get('href')
        img = a.find('img').get('data-original')
        print('cover image: ' + img)
        tag = '{}{}/{}/'.format(save_dir, mainidx, subidx)
        if not os.path.exists(tag):
            os.makedirs(tag)
        with open('{}/{}'.format(tag, "coverImg_" + img.split("/")[-1]), 'wb') as jpg:  # write the cover image to a local file
            jpg.write(requests.get(img, headers=headers).content)  # send the same headers so the request is not rejected
        if img not in image_cache:
            image_cache.add(img)
        # step 2: enter the new page and save its photos.
        deepSaveImgs(href, tag)  # walk through the whole photo set
        subidx = subidx + 1
if __name__ == '__main__':
    url = rooturl
    idx = 1
    while True:
        print("next page: " + url)
        html = BeautifulSoup(requests.get(url, headers=headers).text, features="html.parser")
        saveImgs(html, idx)  # process the current listing page
        if idx >= max_pages:
            break
        idx = idx + 1
        url = getNextPageUrl(html)  # get the next listing page
        if url == no_more_pages:
            break