python爬虫二：网易博客的图片

原创

qq61b6d41d3f9e6 2021-12-27 13:59:26 ©著作权

©著作权归作者所有：来自51CTO博客作者qq61b6d41d3f9e6的原创作品，请联系作者获取转载授权，否则将追究法律责任

下面讲解我的爬虫程序：

在D:\wa中新建一个记事本文件1.txt，把要下载的网易博客文章链接逐行写进去，每行一个（注意：必须是单篇博客的链接，而不是目录页面的链接）。运行程序后，每篇博客中的图片都会被下载下来：程序会为每篇博客在D:\wa目录下新建一个子目录，目录名取自该篇博客网页的title。

代码：

#coding=utf-8
import re
import urllib.request
import os

# Titles already used for a folder during this run. Kept at module level so
# that it survives across get() calls; two pages with the same title then end
# up in distinct folders instead of the second one being dropped.
_seen_titles = set()


def get(url):
    """Download every JPG image referenced by a single blog page.

    The page is fetched and decoded as GBK, a folder named after the page
    title is created under the module-level ``thepath`` directory, and each
    distinct ``http://...jpg`` URL found in the HTML is saved there as
    ``<title><n>.jpg``. Failures are reported by printing ``error`` and the
    URL; nothing is raised to the caller.

    Args:
        url: Address of one blog post. Strings shorter than 9 characters
            (blank or junk lines from the list file) are ignored.

    Returns:
        None.
    """
    if len(url) < 9:  # too short to be a URL: skip blank/junk lines
        return
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('gbk')
    except Exception:
        # Best effort: a bad URL, network failure or undecodable page only
        # skips this entry. `Exception` (unlike a bare except) still lets
        # KeyboardInterrupt stop the script.
        print('error')
        print(url)
        return

    match = re.search("<title>.*</title>", html)
    if match is None:  # page without a single-line <title> element
        print('error')
        print(url)
        return
    # Strip the leading "<title>" (7 chars) and the last 25 chars: the
    # closing tag plus a fixed suffix that the author counted by hand on the
    # target site's titles.
    title = match.group()[7:-25]
    # Characters that Windows forbids in file names would make mkdir fail.
    title = re.sub(r'[\\/:*?"<>|]', '_', title)
    if not title:
        print('error')
        print(url)
        return
    while title in _seen_titles:  # same title seen earlier in this run
        title = title + 'a'
    _seen_titles.add(title)

    # dict.fromkeys removes duplicates while keeping page order, so the
    # numbering of the saved files is deterministic.
    pic_urls = list(dict.fromkeys(re.findall('http://.{1,100}jpg', html)))

    path = os.path.join(thepath, title)
    try:
        os.mkdir(path)
    except OSError:
        # Typically the folder is left over from a previous run; skip the
        # page rather than mixing new files into it.
        print('error')
        print(url)
        return

    count = 1
    for pic_url in pic_urls:
        try:
            with urllib.request.urlopen(pic_url, timeout=10) as response:
                pic = response.read()
        except Exception:
            continue  # one broken image must not abort the whole page
        file = os.path.join(path, title + str(count) + '.jpg')
        # Image data is bytes, so the file must be opened in binary mode.
        with open(file, 'wb') as out:
            out.write(pic)
        count += 1

    if not os.listdir(path):
        # Nothing was saved: remove only the folder created above
        # (os.removedirs would also prune empty parent directories).
        os.rmdir(path)
        print('error')
        print(url)

# Working directory: holds the URL list (1.txt) and receives one image
# folder per blog post created by get().
thepath = 'D:\\wa'
# Read the list through a context manager so the file handle is closed as
# soon as the lines are in memory.
with open(thepath + '\\' + '1.txt', 'r') as url_file:
    fp = url_file.readlines()
for line in fp:
    if line == '\n':
        # The first completely empty line is treated as the end of the list.
        print('none')
        break
    get(line.strip('\n'))