一个人无聊，写了个小爬虫爬取不可描述图片……

代码太短，就暂时先往这里贴一下做备份吧。

注：这是很严肃的技术研究，当然爬下来的图片我会带着批判性的眼光审查一遍的…… :)

 

#! /usr/bin/python

import chardet
import urllib3
import uuid
import os
import logging
import time
import sys
import re
import threading
from bs4 import BeautifulSoup


"""
http://www.qiubaichengren.com/1.html
"""


class PageNotFoundException(BaseException):
    """Signals that a requested web page answered with HTTP status 404.

    The base class is ``BaseException`` rather than ``Exception``, so a
    generic ``except Exception:`` handler does not intercept this error.
    """


class ResponseStatusException(BaseException):
    """Signals an HTTP response whose status is neither 200 nor 404.

    The offending status code is passed as the exception argument. The base
    class is ``BaseException`` rather than ``Exception``, so a generic
    ``except Exception:`` handler does not intercept this error.
    """


class QiuBaiChengRenSpider:
    """Crawl numbered list pages and download every image found on them.

    Pages are requested one after another from a seed URL pattern until the
    site answers 404. The images of a page are downloaded concurrently, one
    thread per image, and stored below ``img_save_dir``.
    """

    # Connection pool shared by every instance and every download thread.
    http_pool_manager = urllib3.PoolManager()

    # Directory the downloaded images are written to.
    img_save_dir = 'D:/QiuBaiChengRen/'

    logger = logging.getLogger('QiuBaiChengRenSpider')

    # How many times in a row a list page may fail to download before the
    # crawl gives up, and how many seconds to wait between the attempts.
    max_fetch_attempts = 3
    retry_delay = 1

    # Characters removed from generated file names: whitespace, '+', ',',
    # quotes, and the characters Windows forbids in a file name.
    _illegal_name_chars = re.compile(r'''[\s+,"'\\/:*?<>|]''')

    def __init__(self):
        self.init_log()

    def init_log(self):
        """Send the spider's log output to stdout at DEBUG level."""
        # The logger is shared by all instances; attaching a handler per
        # instance would print every message once per instance.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stdout))
        self.logger.setLevel(logging.DEBUG)

    def get(self, url):
        """Download ``url`` and return the raw response body.

        Returns:
            The response body on HTTP 200, or ``''`` when the request itself
            failed (connection error, timeout, ...).

        Raises:
            PageNotFoundException: the server answered 404.
            ResponseStatusException: the server answered any other non-200
                status.
        """
        try:
            http_response = self.http_pool_manager.request('GET', url)
            status = http_response.status
            body = http_response.data
        except Exception:
            self.logger.warning(u'获取网页的时候发生了异常: %s', url,
                                exc_info=True)
            return ''

        # The status checks live outside the ``try`` so that these errors
        # always reach the caller, whatever their base class is.
        if status == 404:
            raise PageNotFoundException('404')
        if status != 200:
            raise ResponseStatusException(status)
        return body

    def extract_img(self, html_doc):
        """Return the ``<img>`` tags located inside ``div.mala-text``."""
        bs = BeautifulSoup(html_doc, 'lxml')
        return bs.select('div.mala-text img')

    def save_img(self, img_tag):
        """Download the image referenced by ``img_tag`` into ``img_save_dir``.

        The file name is built from the tag's ``alt`` text, a random UUID and
        the extension of the image URL. Failures are logged and the image is
        skipped; nothing is raised, because this method runs as a thread
        target where nobody could handle the error.
        """
        img_link = (img_tag.get('src') or '').strip()
        if not img_link:
            self.logger.info('Skip img without src: %s', img_tag)
            return

        # Take the extension from the URL path only, not from a query string
        # or fragment.
        url_path = img_link.split('#', 1)[0].split('?', 1)[0]
        extension = os.path.splitext(url_path)[1]
        file_name = (img_tag.get('alt') or '') + '___' + uuid.uuid4().hex + extension
        # Sanitise the file name only, so the target directory stays intact.
        file_name = self._illegal_name_chars.sub('', file_name)
        save_name = os.path.join(self.img_save_dir, file_name)

        self.logger.info('Save img: %s %s', save_name, img_link)

        try:
            img_byte = self.get(img_link)
        except (PageNotFoundException, ResponseStatusException) as exc:
            self.logger.info('Skip img %s: HTTP status %s', img_link, exc)
            return
        if not img_byte:
            return

        try:
            self._ensure_save_dir()
            with open(save_name, 'wb') as img_file:
                img_file.write(img_byte)
        except (IOError, OSError) as exc:
            self.logger.error('Cannot write %s: %s', save_name, exc)

    def _ensure_save_dir(self):
        """Create ``img_save_dir`` if it does not exist yet."""
        if os.path.isdir(self.img_save_dir):
            return
        try:
            os.makedirs(self.img_save_dir)
        except OSError:
            # Another download thread may have created it in the meantime.
            if not os.path.isdir(self.img_save_dir):
                raise

    def _save_all(self, imgs):
        """Download ``imgs`` in parallel and wait until all are finished."""
        workers = []
        try:
            for img in imgs:
                worker = threading.Thread(target=self.save_img, args=(img,))
                worker.start()
                workers.append(worker)
        finally:
            # Join per page so the number of live threads stays bounded by
            # the number of images on a single page.
            for worker in workers:
                worker.join()

    def list_visitor(self, seed):
        """Walk the list pages 1, 2, 3, ... and download their images.

        Args:
            seed: URL pattern containing a ``%(page)d`` placeholder that is
                replaced by the page number.

        The crawl stops at the first page that answers 404, on any other
        error, or after ``max_fetch_attempts`` consecutive failed downloads
        of the same page. All started downloads have finished on return.
        """
        page = 1
        failures = 0
        while True:
            try:
                url = seed % {'page': page}
                self.logger.info('Begin process:%s', url)

                html_doc = self.get(url)
                if not html_doc:
                    # Retry the same page, but only a bounded number of
                    # times, so a dead connection cannot loop forever.
                    failures += 1
                    if failures >= self.max_fetch_attempts:
                        self.logger.error('Giving up on %s after %d attempts',
                                          url, failures)
                        break
                    time.sleep(self.retry_delay)
                    continue
                failures = 0

                self._save_all(self.extract_img(html_doc))
                page += 1
            except PageNotFoundException:
                # Running past the last page is the normal end of the crawl.
                self.logger.info('404')
                break
            except ResponseStatusException as exc:
                self.logger.error('Unexpected HTTP status %s, stopping', exc)
                break
            except KeyboardInterrupt:
                self.logger.info('Interrupted, stopping')
                break
            except Exception:
                self.logger.exception('Unexpected error, stopping')
                break


if __name__ == '__main__':
    # The %(page)d placeholder is filled in with the page number by the
    # spider while it walks the list pages.
    seed_url = 'http://www.qiubaichengren.com/%(page)d.html'
    spider = QiuBaiChengRenSpider()
    spider.list_visitor(seed_url)