Prerequisite: create the stock database and the sina_news table in your local MySQL environment before running the script.
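The exact schema is not spelled out above, so here is a minimal setup sketch. The column sizes and the UNIQUE key on link are assumptions; the unique key is what makes repeat inserts fail so the except branch later in the script can skip rows that are already stored.

import pymysql

# Connect without selecting a database so the database itself can be created first
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='', charset='utf8')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS stock DEFAULT CHARACTER SET utf8")
cur.execute("""
    CREATE TABLE IF NOT EXISTS stock.sina_news (
        id    INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        link  VARCHAR(500),
        UNIQUE KEY uk_link (link)  -- assumed: duplicate links fail on insert and get skipped
    ) DEFAULT CHARACTER SET utf8
""")
cur.close()
conn.close()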
import datetime
import urllib.request
import pymysql
from bs4 import BeautifulSoup  # this import fails if BeautifulSoup (bs4) is not installed
# Custom function that fetches the article links from the given page
def getUrl(url):
    # Define a headers tuple holding the User-Agent copied from the browser, so the request looks like a browser
    headers = ('User-Agent',
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
    html = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    # print(html)
    bs = BeautifulSoup(html, 'lxml')
    # Use BeautifulSoup's select() to find all the matching <a> tags
    links = bs.select('.list04 > li > p > a')
    return links

import sys
if __name__ == '__main__':
    # Open the database connection; entries already stored will be skipped
    db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='stock', charset='utf8')
    cursor = db.cursor()
    # Pages to crawl; raise the upper bound of range() to loop over as many pages as needed
    for i in range(1, 2):
        # note: the URL has no '{}' placeholder yet, so .format(i) leaves it unchanged
        url = 'https://finance.sina.com.cn/stock/'.format(i)
        # Get the link elements on this page
        linklist = getUrl(url)
        # texts stores the article titles
        texts = []
        # links stores the article URLs
        links = []
        # Walk linklist and collect the title and href of each element
        for link in linklist:
            texts.append(link.text.strip())
            links.append(link.get('href'))
        # Pair titles and links with zip, then write them out
        for text, link in zip(texts, links):
            text = text.strip().replace("原 \n ", "")
            text = text.strip().replace("转 \n ", "")
            data = {'title': text, 'link': link}
            # print(data)
            try:
                # Parameterized query avoids quoting problems when a title contains quotes
                sql_insert = "INSERT INTO sina_news(title, link) VALUES (%s, %s)"
                cursor.execute(sql_insert, (text, link))
                db.commit()
            except Exception as err:
                # Skip rows that fail to insert (e.g. duplicates already in the table)
                continue
    cursor.close()
    db.close()
    print('All Finished!')
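After a run, a quick read-back can confirm that rows landed in sina_news. This is a minimal check using the same connection settings as above, not part of the original script:

import pymysql

db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='stock', charset='utf8')
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM sina_news")
print('rows stored:', cursor.fetchone()[0])
cursor.execute("SELECT title, link FROM sina_news LIMIT 5")
for title, link in cursor.fetchall():
    print(title, link)
cursor.close()
db.close()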