Hi everyone, I'm 天空之城. Today I have a little treat for you: a walkthrough of using the Scrapy framework to crawl Douban Top 250 book data and store it in a MySQL database.

Without further ado, here is the code.

First, an overview: the directory structure of the whole Scrapy project is sketched below, right after the two setup commands.



First, create the Scrapy project from the command line: scrapy startproject douban

Then generate the spider: scrapy genspider db douban.com
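These two commands give you a project skeleton roughly like the following (db.py is the spider created by genspider; start.py is a small launcher script added by hand later):

douban/
├── scrapy.cfg
├── start.py
└── douban/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── db.py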

Below is the code of the spider at spiders/db.py:

import scrapy
import bs4

# In Scrapy, every spider follows roughly the same code structure shown below.
from ..items import DoubanItem


class DbSpider(scrapy.Spider):
    name = 'db'
    allowed_domains = ['book.douban.com']
    # start_urls = ['https://book.douban.com/top250?start=0']

    start_urls = []

    for x in range(3):
        url = 'https://book.douban.com/top250?start=' + str(x * 25)
        start_urls.append(url)
        # Add the URLs of the first 3 pages of the Douban Top 250 books to start_urls.

    def parse(self, response):
    # parse is the default callback for handling the response.
        bs = bs4.BeautifulSoup(response.text, 'html.parser')
        # Parse the response with BeautifulSoup.
        datas = bs.find_all('tr', class_="item")
        # Use find_all to extract the <tr class="item"> elements, which contain the book information.
        # for data in datas:
        # # Loop over each row.
        #     title = data.find_all('a')[1]['title']
        #     # Extract the book title.
        #     publish = data.find('p', class_='pl').text
        #     # Extract the publishing info.
        #     score = data.find('span', class_='rating_nums').text
        #     # Extract the rating.
        #     print([title, publish, score])
        #     # Print the values above.

        for data in datas:
            # Loop over each row.
            item = DoubanItem()
            # Instantiate the DoubanItem class.
            item['title'] = data.find_all('a')[1]['title']
            # Extract the book title and put it into the item's title field.
            item['publish'] = data.find('p', class_='pl').text
            # Extract the publishing info and put it into the item's publish field.
            item['score'] = data.find('span', class_='rating_nums').text
            # Extract the rating and put it into the item's score field.
            print(item['title'])
            # Print the book title.
            yield item
            # yield item hands the populated item over to the engine.
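As a side note, the same three fields can also be extracted with Scrapy's built-in CSS selectors, which removes the extra BeautifulSoup dependency. This is only a sketch that mirrors the bs4 lookups above and assumes the same page structure:

    # Sketch of an alternative parse() using Scrapy's own selectors.
    def parse(self, response):
        for row in response.css('tr.item'):
            item = DoubanItem()
            # the <a> tag carrying a title attribute, like find_all('a')[1]['title']
            item['title'] = row.css('a[title]::attr(title)').get()
            # text of <p class="pl">, like find('p', class_='pl').text
            item['publish'] = row.css('p.pl::text').get()
            # text of <span class="rating_nums">, like find('span', class_='rating_nums').text
            item['score'] = row.css('span.rating_nums::text').get()
            yield item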

Below is the code in pipelines.py; it covers both saving to the MySQL database and saving to an Excel file (the Excel version is kept commented out).

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter



'''
import openpyxl


class DoubanPipeline(object):
    # A DoubanPipeline class that is responsible for processing items.
    def __init__(self):
        # __init__ runs automatically when the class is instantiated.
        self.wb = openpyxl.Workbook()
        # Create a workbook.
        self.ws = self.wb.active
        # Get the active worksheet.
        self.ws.append(['书名', '出版信息', '得分'])
        # Use append to add the header row: book title, publishing info, score.

    def process_item(self, item, spider):
        # process_item is the default method for handling items, just as parse is the default method for handling responses.
        line = [item['title'], item['publish'], item['score']]
        # Put the book title, publishing info and score into a list and assign it to line.
        self.ws.append(line)
        # Use append to add that row of data to the worksheet.
        return item
        # Hand the item back to the engine; if other item pipelines still need it, the engine will dispatch it.

    def close_spider(self, spider):
        # close_spider runs when the spider finishes.
        self.wb.save('./douban.xlsx')
        # Save the file.
        self.wb.close()
        # Close the workbook.
'''
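The Excel pipeline above is disabled because it shares its class name with the MySQL pipeline below. If you want to run both at once, give the Excel version its own name (DoubanExcelPipeline is just a hypothetical choice) and register both in settings.py with their own priorities; a sketch:

# settings.py sketch: run both pipelines; lower numbers run earlier.
ITEM_PIPELINES = {
    'douban.pipelines.DoubanExcelPipeline': 300,  # hypothetical name for the Excel pipeline above
    'douban.pipelines.DoubanPipeline': 400,       # the MySQL pipeline below
}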

# 1. In pipelines.py, first import the item class and the pymysql module
from .items import DoubanItem
import pymysql
# 2. Then connect to the database and write into it. Here the MySQL database and table were created in advance rather than from this code.
class DoubanPipeline(object):
    def __init__(self):
        # 1. Open the database connection
        self.connect = pymysql.connect(
            # localhost connects to the local MySQL server
            host='localhost',
            # MySQL port
            port=3306,
            # database user
            user='root',
            # local database password
            passwd='qwer123456',
            # database name (not a table name)
            db='doubanfilm',
            # character encoding
            charset='utf8'
        )
        # 2. Create a cursor, which is used to operate on tables.
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # 3. Write the item into the database (synchronous write by default).
        # A parameterized query is used so quotes in the scraped text cannot break the SQL.
        insert_sql = "INSERT INTO film2(title, publish, score) VALUES (%s, %s, %s)"
        self.cursor.execute(insert_sql, (item['title'], item['publish'], item['score']))

        # 4. Commit the write
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
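Once a crawl has finished, you can sanity-check the table directly with pymysql. A minimal sketch, reusing the same connection parameters as the pipeline above:

import pymysql

# Count the rows that the pipeline wrote into film2.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='qwer123456', db='doubanfilm', charset='utf8')
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM film2")
print(cursor.fetchone()[0])   # number of saved rows
cursor.close()
conn.close()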

Below is the code in items.py:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
# Import scrapy.
class DoubanItem(scrapy.Item):
# Define a DoubanItem class that inherits from scrapy.Item.
    title = scrapy.Field()
    # Field for the book title.
    publish = scrapy.Field()
    # Field for the publishing info.
    score = scrapy.Field()
    # Field for the rating.
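For reference, a scrapy.Item behaves much like a dict, which is why the spider can write item['title'] and the pipelines can read it back. A tiny sketch (the value is made up, purely to show the access pattern):

item = DoubanItem()
item['title'] = 'Example title'   # hypothetical value
print(item['title'])              # Example title
print(dict(item))                 # {'title': 'Example title'}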

Below is start.py, which is used to launch the spider (equivalent to running scrapy crawl db from the project root):

from scrapy import cmdline

# cmdline.execute("scrapy crawl db".split())
cmdline.execute(['scrapy','crawl','db'])
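If you prefer not to go through cmdline, Scrapy's CrawlerProcess can launch the spider programmatically as well; a minimal sketch:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings so the pipelines and feed export stay active.
process = CrawlerProcess(get_project_settings())
process.crawl('db')   # the spider name defined in DbSpider.name
process.start()       # blocks until the crawl finishes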

Below is the code in settings.py:

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douban (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.DoubanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban.middlewares.DoubanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'douban.pipelines.DoubanPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


# Also export the scraped items to a CSV feed.
FEED_URI = './storage/data/%(name)s.csv'
FEED_FORMAT = 'csv'
FEED_EXPORT_ENCODING = 'ansi'
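Note that in newer Scrapy releases (2.1 and later) the FEED_URI / FEED_FORMAT pair is deprecated in favour of the single FEEDS setting; a rough equivalent of the three lines above would be:

FEEDS = {
    './storage/data/%(name)s.csv': {
        'format': 'csv',
        'encoding': 'ansi',   # same encoding as above; 'utf-8-sig' is a common Excel-friendly alternative
    },
}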

Below is the code for creating the table:

import pymysql
conn=pymysql.connect(host='127.0.0.1',port=3306,user='root',passwd='qwer123456',db='doubanfilm',charset='utf8')

cursor=conn.cursor()

# Create the table film2 that will hold the scraped data; the name is up to you, but it must match the one used in the pipeline's INSERT statement.
sql = """create table film2(
        m_id int primary key auto_increment,
        title varchar (200) not null,
        publish varchar(200) not null,
        score varchar(10) not null
        )"""

cursor.execute(sql)

cursor.close()
conn.close()
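The doubanfilm database itself was created by hand here; if you would rather create it from code as well, a minimal sketch along the same lines:

import pymysql

# Connect without selecting a database, then create it if it is missing.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='qwer123456', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS doubanfilm DEFAULT CHARACTER SET utf8")
cursor.close()
conn.close()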

Now let's look at the results. The screenshot below shows the data in the MySQL database.

[Screenshot: rows stored in the MySQL film2 table]


And here is the data saved to the Excel file:

[Screenshot: data saved in douban.xlsx]