Python Beginner Web Scraping Tutorial (4): Data Storage
- Storing data in TXT
- Storing data in CSV
  - Reading a file
  - Writing a file
- Storing data in MongoDB
  - Operating MongoDB from Python
  - Visualization tools
- MongoDB scraping practice: the Hupu forum
  - Scraping all posts, with test output
  - Difficulty: getting all the li elements under a ul
  - Difficulty: getting the link inside an a tag
  - The MongoDB class
  - Saving the data
Storing Data in TXT
For the specific storage method, see this article.
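The linked article covers the details; as a minimal sketch, writing scraped text to a TXT file (the file name and content below are made up for illustration) looks like this:
title = "example title"  # hypothetical scraped text
with open('title.txt', 'a', encoding='utf-8') as f:
    f.write(title + '\n')  # append one line per record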
Storing Data in CSV
Reading a file
import csv

with open('test.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        print(row)
        print(row[0])
As you can see, csv.reader converts each row of the file into a list.
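If you would rather access columns by header name than by index, csv.DictReader returns each row as a dict; this small sketch assumes test.csv has a header row:
import csv

with open('test.csv', 'r', newline='') as csvfile:
    for row in csv.DictReader(csvfile):  # each row is keyed by the column headers
        print(row)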
Writing a file
import csv

output_list = ['1', '2', '3', '4']
with open('test2.csv', 'a+', newline='') as csvfile:
    w = csv.writer(csvfile)
    w.writerow(output_list)
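To write several rows at once, csv.writer also provides writerows, which takes a list of row lists (a small sketch with made-up data):
import csv

rows = [['1', '2', '3', '4'], ['5', '6', '7', '8']]
with open('test2.csv', 'a+', newline='') as csvfile:
    csv.writer(csvfile).writerows(rows)  # one call writes one row per inner list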
Storing Data in MongoDB
For installing the database, refer to the MongoDB installation guide.
Operating MongoDB from Python
Install the required package:
pip install pymongo
Test whether the connection works:
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client.blog_database
collection = db.blog
This first connects to the database client, then to the database blog_database (created automatically if it does not exist), and then to the collection blog (also created if it does not exist). If the script runs without errors, the objects are set up correctly.
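Note that MongoClient connects lazily, so the three lines above will run even if no server is listening. To actually verify that MongoDB is reachable, you can issue a command such as ping; a minimal sketch:
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

client = MongoClient('localhost', 27017, serverSelectionTimeoutMS=2000)
try:
    client.admin.command('ping')  # forces a round trip to the server
    print("MongoDB connection OK")
except ConnectionFailure:
    print("MongoDB server is not reachable")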
Save the scraped blog titles to the database:
import datetime

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

link = "http://www.santostang.com"
html = requests.get(link)

# Connect to the MongoDB database
client = MongoClient('localhost', 27017)
db = client.blog_database
collection = db.blog

soup = BeautifulSoup(html.text, 'html.parser')
title_list = soup.find_all('h1', class_='post-title')
for i in range(len(title_list)):
    url = title_list[i].a['href']
    title = title_list[i].a.text.strip()
    post = {
        "url": url,
        "title": title,
        "date": datetime.datetime.utcnow()
    }
    collection.insert_one(post)
After the script runs successfully, open the Mongo Shell:
Enter show dbs to confirm that the new database has been created.
Enter use blog_database to switch to that database.
Enter db.blog.find().pretty() to query the documents in the collection.
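The same check can also be done from Python; a small sketch against the collection created above:
from pymongo import MongoClient

collection = MongoClient('localhost', 27017).blog_database.blog
print(collection.count_documents({}))  # number of stored posts
for doc in collection.find():          # print every stored document
    print(doc['title'], doc['url'])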
Visualization Tools
MongoDB Scraping Practice: the Hupu Forum
Scraping all posts, with test output
The site to scrape is https://bbs.hupu.com/bxj.
We need to scrape the relevant information of every post.
The code is as follows, including test output:
import time

import requests
from bs4 import BeautifulSoup


def get_page(link):
    r = requests.get(link)
    html = r.content
    html = html.decode('UTF-8')
    soup = BeautifulSoup(html, 'lxml')
    return soup


def get_data(post_list):
    data_list = []
    for post in post_list:
        title = post.find('a', class_='truetit').text.strip()  # get the title
        link1 = post.find('a', class_='truetit')['href']  # get the post link from the a tag
        title_link = "https://bbs.hupu.com/" + link1  # join into a full URL
        author = post.find('a', class_='aulink').text.strip()
        author_link = post.find('a', class_='aulink')['href']
        time = post.find('div', class_='author').contents[1].text.strip()  # note: use contents to get the child tags
        reply = post.find('span', class_='ansour').text.strip().split('/')[0].strip()
        watch = post.find('span', class_='ansour').text.strip().split('/')[1].strip()
        last_reply_time = post.find('div', class_='endreply').a.text.strip()
        last_reply_author = post.find('span', class_='endauthor').text.strip()
        data_list.append(
            [title, title_link, author, author_link, time, reply, watch, last_reply_time, last_reply_author])
    return data_list


link = 'https://bbs.hupu.com/bxj-'
for i in range(10):
    new_link = link + str(i + 1)
    soup = get_page(new_link)
    time.sleep(3)
    # post_list = soup.find('ul', class_='for-list').find('li')
    post_list = soup.select("div.show-list ul li")  # get all the li elements
    # print(len(post_list))
    data_list = get_data(post_list)
    print("Posts on page %s:" % str(i + 1))
    for each in data_list:
        print(each)
Difficulty: getting all the li elements under a ul
Each post in the list has the same structure: an li element inside the ul of div.show-list.
To get the list of li elements so that we can iterate over them, use a CSS selector such as soup.select("div.show-list ul li").
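As a minimal illustration of this selector, here is a toy HTML snippet (made up, not the real Hupu page) showing how select returns every li under the ul:
from bs4 import BeautifulSoup

html = """
<div class="show-list">
  <ul>
    <li><a class="truetit" href="/12345.html">Post one</a></li>
    <li><a class="truetit" href="/67890.html">Post two</a></li>
  </ul>
</div>
"""
soup = BeautifulSoup(html, 'lxml')
items = soup.select("div.show-list ul li")  # CSS selector: all li inside the ul
print(len(items))  # 2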
Difficulty: getting the link inside an a tag
The attribute can be accessed with square brackets, for example link1 = post.find('a', class_='truetit')['href'] retrieves the link.
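An alternative is the get method, which returns None instead of raising a KeyError when the attribute is missing; a tiny sketch with a made-up tag:
from bs4 import BeautifulSoup

a_tag = BeautifulSoup('<a class="truetit" href="/12345.html">Post one</a>',
                      'lxml').find('a', class_='truetit')
print(a_tag['href'])      # '/12345.html'; raises KeyError if the attribute were missing
print(a_tag.get('href'))  # same value, but returns None when the attribute is absent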
The MongoDB Class
The comments explain each method:
from pymongo import MongoClient


class MongoAPI(object):
    def __init__(self, db_ip, db_port, db_name, table_name):
        self.db_ip = db_ip
        self.db_port = db_port
        self.db_name = db_name
        self.table_name = table_name
        self.conn = MongoClient(host=self.db_ip, port=self.db_port)
        self.db = self.conn[self.db_name]
        self.table = self.db[table_name]

    def get_one(self, query):
        # Get a single matching document, excluding the _id field
        return self.table.find_one(query, projection={"_id": False})

    # Get all documents in the collection that match the query
    def get_all(self, query):
        return self.table.find(query)

    # Add a document to the collection
    def add(self, kv_dict):
        return self.table.insert_one(kv_dict)

    # Delete matching documents from the collection
    def delete(self, query):
        return self.table.delete_many(query)

    # Check whether the collection contains a matching document
    def check_exist(self, query):
        ret = self.table.find_one(query)
        return ret is not None

    # Update a matching document; create it if it does not exist (upsert)
    def update(self, query, kv_dict):
        self.table.update_one(query, {
            '$set': kv_dict
        }, upsert=True)
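A quick usage sketch of the class, assuming a local MongoDB instance on the default port (the sample document is made up):
hupu_post = MongoAPI("localhost", 27017, "hupu", "post")
hupu_post.add({"title": "hello", "post_link": "https://bbs.hupu.com/12345.html"})
print(hupu_post.check_exist({"title": "hello"}))       # True
print(hupu_post.get_one({"title": "hello"}))           # the document without its _id
hupu_post.update({"title": "hello"}, {"reply": "10"})  # upsert the reply field
hupu_post.delete({"title": "hello"})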
Saving the Data
The modified scraping script is:
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from MongoAPI.MongoDB import MongoAPI


def get_page(link):
    r = requests.get(link)
    html = r.content
    html = html.decode('UTF-8')
    soup = BeautifulSoup(html, 'lxml')
    return soup


def get_data(post_list):
    data_list = []
    for post in post_list:
        title = post.find('a', class_='truetit').text.strip()  # get the title
        link1 = post.find('a', class_='truetit')['href']  # get the post link from the a tag
        title_link = "https://bbs.hupu.com/" + link1  # join into a full URL
        author = post.find('a', class_='aulink').text.strip()
        author_link = post.find('a', class_='aulink')['href']
        time = post.find('div', class_='author').contents[1].text.strip()  # note: use contents to get the child tags
        reply = post.find('span', class_='ansour').text.strip().split('/')[0].strip()
        watch = post.find('span', class_='ansour').text.strip().split('/')[1].strip()
        last_reply_time = post.find('div', class_='endreply').a.text.strip()
        last_reply_author = post.find('span', class_='endauthor').text.strip()
        data_list.append(
            [title, title_link, author, author_link, time, reply, watch, last_reply_time, last_reply_author])
    return data_list


hupu_post = MongoAPI("localhost", 27017, "hupu", "post")
link = 'https://bbs.hupu.com/bxj-'
for i in range(10):
    new_link = link + str(i + 1)
    soup = get_page(new_link)
    # post_list = soup.find('ul', class_='for-list').find('li')
    post_list = soup.select("div.show-list ul li")  # get all the li elements
    # print(len(post_list))
    data_list = get_data(post_list)
    print("Posts on page %s:" % str(i + 1))
    for each in data_list:
        print(each)
        hupu_post.update({"post_link": each[1]}, {
            "title": each[0],
            "post_link": each[1],
            "author": each[2],
            "author_page": each[3],
            "start_date": str(each[4]),
            "reply": each[5],
            "view": each[6],
            "last_reply_time": str(each[7]),
            "last_reply_author": each[8]
        })
    time.sleep(3)
    print("Finished page %s; sleeping for 3 seconds" % str(i + 1))
The results are saved successfully: