一个人无聊,写了个小爬虫爬取不可描述图片....
代码太短,就暂时先往这里贴一下做备份吧。
注:这是很严肃的技术研究,当然爬下来的图片我会带着批判性的眼光审查一遍的.... :)
#! /usr/bin/python
"""Small crawler that downloads the images found on a paginated listing.

The listing pages look like http://www.qiubaichengren.com/1.html,
http://www.qiubaichengren.com/2.html, ... and are walked in order until the
server answers with HTTP 404.
"""
import logging
import os
import re
import sys
import threading
import time
import uuid

import chardet
import urllib3
from bs4 import BeautifulSoup


class PageNotFoundException(BaseException):
    """Raised when a page request is answered with HTTP 404."""
    # NOTE: kept on BaseException (not Exception) so that the class hierarchy
    # seen by existing ``except`` clauses does not change.


class ResponseStatusException(BaseException):
    """Raised when a page request is answered with a status other than 200/404."""
    # NOTE: kept on BaseException for the same backward-compatibility reason.


class QiuBaiChengRenSpider:
    """Walks the listing pages and saves every matching image to disk."""

    # Shared connection pool used for every HTTP request.
    http_pool_manager = urllib3.PoolManager()
    # Directory the downloaded images are written to.
    img_save_dir = 'D:/QiuBaiChengRen/'
    logger = logging.getLogger('QiuBaiChengRenSpider')
    # How many times a listing page is re-requested after a transport error
    # before the crawl is abandoned, and how long to wait between attempts.
    max_page_retries = 3
    retry_delay_seconds = 1
    # Characters removed from generated file names: the ones the original
    # pattern stripped (whitespace, '+', ',', quotes) plus the characters that
    # are not legal in a Windows file name.
    _illegal_name_chars = re.compile(r'[\s+,"\'\\/:*?<>|]')

    def __init__(self):
        self.init_log()

    def init_log(self):
        """Send DEBUG-and-above log records of this spider to stdout."""
        # The logger is shared by all instances; attach the handler only once
        # so that creating several spiders does not duplicate every message.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stdout))
        self.logger.setLevel(logging.DEBUG)

    def get(self, url):
        """Fetch ``url`` with a GET request.

        Returns:
            The response body (bytes) when the status is 200, or the empty
            string ``''`` when the request itself raised an exception.

        Raises:
            PageNotFoundException: the status is 404.
            ResponseStatusException: the status is neither 200 nor 404.
        """
        try:
            http_response = self.http_pool_manager.request('GET', url)
        except Exception:
            # Message reads: "an exception occurred while fetching the page".
            self.logger.info(u'获取网页的时候发生了异常', exc_info=True)
            return ''
        if http_response.status == 404:
            raise PageNotFoundException('404')
        if http_response.status != 200:
            raise ResponseStatusException(http_response.status)
        return http_response.data

    def extract_img(self, html_doc):
        """Return the ``img`` tags selected by ``div.mala-text img``."""
        bs = BeautifulSoup(html_doc, 'lxml')
        return bs.select('div.mala-text img')

    def save_img(self, img_tag):
        """Download the image referenced by ``img_tag`` into ``img_save_dir``.

        The file name is built from the tag's ``alt`` text, a random hex id
        and the extension of the image link. Nothing is written when the tag
        has no ``src`` or when the download fails at the transport level.

        Raises:
            PageNotFoundException: the image link answers with 404.
            ResponseStatusException: the image link answers with another
                non-200 status.
        """
        img_link = (img_tag.get('src') or '').strip()
        if not img_link:
            self.logger.info('Skip img without src: %s', img_tag)
            return

        raw_name = (
            (img_tag.get('alt') or '')
            + '___'
            + uuid.uuid4().hex
            + os.path.splitext(img_link)[1]
        )
        # Strip characters that are not legal in a file name. Only the file
        # name is cleaned, so a save directory containing spaces stays intact.
        file_name = self._illegal_name_chars.sub('', raw_name)
        save_name = os.path.join(self.img_save_dir, file_name)
        self.logger.info('Save img: %s %s', save_name, img_link)

        img_byte = self.get(img_link)
        if img_byte == '':
            return

        # Several worker threads may get here at once; exist_ok makes the
        # directory creation safe to repeat.
        os.makedirs(self.img_save_dir, exist_ok=True)
        with open(save_name, 'wb') as img_file:
            img_file.write(img_byte)

    def _save_img_worker(self, img_tag):
        """Thread entry point: run ``save_img`` and log failures instead of dying."""
        try:
            self.save_img(img_tag)
        except (PageNotFoundException, ResponseStatusException) as err:
            self.logger.info('Skip img, unexpected HTTP status: %r', err)
        except Exception:
            self.logger.exception('Failed to save img: %s', img_tag)

    def list_visitor(self, seed):
        """Crawl the listing pages produced by ``seed`` and save their images.

        Args:
            seed: URL template containing a ``%(page)d`` placeholder; pages
                are requested starting from 1.

        The crawl stops at the first page that answers 404, on any other
        unexpected HTTP status or error, or when a page keeps failing at the
        transport level after ``max_page_retries`` retries. All download
        threads are joined before returning.
        """
        threads = []
        page = 1
        failures = 0
        while True:
            try:
                url = seed % {'page': page}
                self.logger.info('Begin process:%s', url)
                html_doc = self.get(url)
                if html_doc == '':
                    # Transport error: retry the same page, but with a delay
                    # and an upper bound so a dead network cannot spin forever.
                    failures += 1
                    if failures > self.max_page_retries:
                        self.logger.info(
                            'Giving up on %s after %d failed attempts',
                            url, failures)
                        break
                    time.sleep(self.retry_delay_seconds)
                    continue
                failures = 0
                for img in self.extract_img(html_doc):
                    worker = threading.Thread(
                        target=self._save_img_worker, args=(img,))
                    worker.start()
                    threads.append(worker)
                page += 1
            except PageNotFoundException:
                # Running past the last listing page is the normal way out.
                self.logger.info('404')
                break
            except ResponseStatusException as err:
                self.logger.info('Stop crawling, unexpected HTTP status: %r', err)
                break
            except KeyboardInterrupt:
                self.logger.info('Interrupted, waiting for running downloads')
                break
            except Exception:
                self.logger.exception('Stop crawling, unexpected error')
                break
        for worker in threads:
            worker.join()


if __name__ == '__main__':
    spider = QiuBaiChengRenSpider()
    spider.list_visitor('http://www.qiubaichengren.com/%(page)d.html')