Hi everyone, I'm 天空之城. Today's little treat: I'll walk you through using the Scrapy framework to crawl Douban Top250 book data and store it in a MySQL database.
Without further ado, straight to the code.
The figure below shows the directory structure of the whole Scrapy project.
First, create the Scrapy project from the console: scrapy startproject douban
Then create the spider: scrapy genspider db douban.com
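In case the screenshot of the project tree doesn't come through, the layout after running the two commands above (plus the start.py launcher added later) looks roughly like this:

douban/
    scrapy.cfg
    start.py
    douban/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            db.py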
Below is the spider code in spiders/db.py:
import scrapy
import bs4
# In Scrapy, every spider follows roughly the same code structure:
from ..items import DoubanItem

class DbSpider(scrapy.Spider):
    name = 'db'
    allowed_domains = ['book.douban.com']
    # start_urls = ['https://book.douban.com/top250?start=0']
    start_urls = []
    for x in range(3):
        url = 'https://book.douban.com/top250?start=' + str(x * 25)
        start_urls.append(url)
    # Add the first 3 pages of the Douban Top250 books list to start_urls.

    def parse(self, response):
        # parse is the default method for handling the response.
        bs = bs4.BeautifulSoup(response.text, 'html.parser')
        # Parse the response with BeautifulSoup.
        datas = bs.find_all('tr', class_="item")
        # Use find_all to grab the <tr class="item"> elements, which hold the book information.
        # for data in datas:
        #     # Loop over the rows.
        #     title = data.find_all('a')[1]['title']
        #     # Extract the book title.
        #     publish = data.find('p', class_='pl').text
        #     # Extract the publication info.
        #     score = data.find('span', class_='rating_nums').text
        #     # Extract the rating.
        #     print([title, publish, score])
        #     # Print the information above.
        for data in datas:
            # Loop over the rows.
            item = DoubanItem()
            # Instantiate the DoubanItem class.
            item['title'] = data.find_all('a')[1]['title']
            # Extract the book title and store it in the item's title field.
            item['publish'] = data.find('p', class_='pl').text
            # Extract the publication info and store it in the item's publish field.
            item['score'] = data.find('span', class_='rating_nums').text
            # Extract the rating and store it in the item's score field.
            print(item['title'])
            # Print the book title.
            yield item
            # yield item hands the item over to the engine.
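As a side note, the same fields could also be pulled out with Scrapy's built-in CSS selectors instead of BeautifulSoup. This is only a rough sketch; the selector paths (tr.item, div.pl2, p.pl, span.rating_nums) are assumptions based on the same markup that the BeautifulSoup code above targets:

    def parse(self, response):
        for row in response.css('tr.item'):
            item = DoubanItem()
            # The book link's title attribute carries the full book name (assumed markup).
            item['title'] = row.css('div.pl2 a::attr(title)').get()
            item['publish'] = row.css('p.pl::text').get()
            item['score'] = row.css('span.rating_nums::text').get()
            yield item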
Below is the code in pipelines.py, which covers both saving to the MySQL database and saving to an Excel file.
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
# class DoubanPipeline:
# def process_item(self, item, spider):
# return item
'''
import openpyxl

class DoubanPipeline(object):
    # Define a DoubanPipeline class that is responsible for processing items.
    def __init__(self):
        # The init method runs automatically when the class is instantiated.
        self.wb = openpyxl.Workbook()
        # Create a workbook.
        self.ws = self.wb.active
        # Get the active worksheet.
        self.ws.append(['书名', '出版信息', '得分'])
        # Use append to write the header row (book title, publication info, score).

    def process_item(self, item, spider):
        # process_item is the default method for handling items, just as parse is the default method for handling responses.
        line = [item['title'], item['publish'], item['score']]
        # Put the title, publication info and score into a list and assign it to line.
        self.ws.append(line)
        # Use append to add that row of data to the worksheet.
        return item
        # Return the item to the engine; if other item pipelines still need it, the engine will dispatch it.

    def close_spider(self, spider):
        # close_spider runs when the spider finishes.
        self.wb.save('./douban.xlsx')
        # Save the file.
        self.wb.close()
        # Close the file.
'''
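# Note: to use the Excel pipeline above instead, remove the triple quotes around it and
# comment out (or rename) the MySQL DoubanPipeline below, since both classes share the same
# name; ITEM_PIPELINES in settings.py already points at douban.pipelines.DoubanPipeline.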
# 1. In pipelines.py, first import the item class and the pymysql module.
from .items import DoubanItem
import pymysql
# 2. Then connect to the database and write into it. Here I created the MySQL database and table beforehand rather than creating them in the code.
class DoubanPipeline(object):
    def __init__(self):
        # 1. Open the database connection.
        self.connect = pymysql.connect(
            # localhost connects to the local database server
            host='localhost',
            # MySQL port number
            port=3306,
            # database user name
            user='root',
            # local database password
            passwd='qwer123456',
            # database name
            db='doubanfilm',
            # character encoding
            charset='utf8'
        )
        # 2. Create a cursor, which is used to operate on the tables.
        self.cursor = self.connect.cursor()
    def process_item(self, item, spider):
        # 3. Write the item into the database (synchronously by default).
        # Use a parameterized query so that titles containing quotes cannot break the SQL.
        insert_sql = "INSERT INTO film2(title, publish, score) VALUES (%s, %s, %s)"
        self.cursor.execute(insert_sql, (item['title'], item['publish'], item['score']))
        # 4. Commit the transaction.
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
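To quickly confirm that rows are landing in MySQL, a small standalone check script can be run; it simply reuses the same connection parameters as the pipeline above (just a sketch):

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='qwer123456', db='doubanfilm', charset='utf8')
cursor = conn.cursor()
# Fetch a few rows from the film2 table that the pipeline writes to.
cursor.execute("SELECT title, publish, score FROM film2 LIMIT 5")
for row in cursor.fetchall():
    print(row)
cursor.close()
conn.close()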
Below is the code in items.py:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# class DoubanItem(scrapy.Item):
# # define the fields for your item here like:
# # name = scrapy.Field()
# pass
class DoubanItem(scrapy.Item):
    # Define a DoubanItem class that inherits from scrapy.Item.
    title = scrapy.Field()
    # Field for the book title.
    publish = scrapy.Field()
    # Field for the publication info.
    score = scrapy.Field()
    # Field for the rating.
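A DoubanItem instance behaves much like a dictionary, which is what the spider and pipeline code above rely on; for example (the values here are only illustrative):

item = DoubanItem()
item['title'] = '红楼梦'
print(item['title'])   # 红楼梦
print(dict(item))      # {'title': '红楼梦'}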
Below is start.py, which is used to launch the spider:
from scrapy import cmdline
# cmdline.execute("scrapy crawl db".split())
cmdline.execute(['scrapy','crawl','db'])
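Running start.py is equivalent to typing scrapy crawl db in the terminal from the project root; you can also append -o books.csv there to get an extra CSV export on the fly.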
Below is the code in settings.py:
# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'douban'
SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douban (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'douban.middlewares.DoubanSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'douban.middlewares.DoubanDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'douban.pipelines.DoubanPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Also export the scraped items to a CSV file.
FEED_URI = './storage/data/%(name)s.csv'
FEED_FORMAT = 'csv'
# utf-8-sig writes a BOM so that Excel opens the CSV with Chinese characters displayed correctly.
FEED_EXPORT_ENCODING = 'utf-8-sig'
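In newer Scrapy versions (2.1+), the FEED_URI / FEED_FORMAT pair has been superseded by the FEEDS setting; a roughly equivalent form would be:

FEEDS = {
    './storage/data/%(name)s.csv': {
        'format': 'csv',
        'encoding': 'utf-8-sig',
    },
}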
Below is the code used to create the table:
import pymysql
conn=pymysql.connect(host='127.0.0.1',port=3306,user='root',passwd='qwer123456',db='doubanfilm',charset='utf8')
cursor=conn.cursor()
# Create the film2 table that will hold the scraped data
# (pick any name you like, but it must match the table used in the pipeline's INSERT statement).
sql = """create table film2(
    m_id int primary key auto_increment,
    title varchar(200) not null,
    publish varchar(200) not null,
    score varchar(10) not null
)"""
cursor.execute(sql)
cursor.close()
conn.close()
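Note that the doubanfilm database itself must already exist before running this script (for example via CREATE DATABASE doubanfilm CHARACTER SET utf8; in the MySQL shell), since pymysql.connect only selects an existing database.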
Let's look at the results. The first screenshot shows the data stored in the MySQL database.
And here is the data saved to the Excel file.