爬虫笔记1--爬取墨迹天气
最近由于需要,写了一个简单的墨迹天气爬取脚本,主要功能为爬取墨迹天气数据,然后将其存入 MySQL 数据库。
1、功能
本代码主要功能为:爬取墨迹天气数据,将数据保存到MySQL数据库。其中数据库的建立脚本和数据的插入脚本在第二部分代码块中,此处不赘述。此处简单说明一下如何使用 requests_html 库爬取数据。
requests_html 是 2018 年发布的一个 Python 库,非常适合新手用来爬取数据,以下以墨迹天气为例加以说明。
此处爬取墨迹天气中 空气质量综合评价(良)和CO指数(5),其代码如下所示:
# -*- coding:utf-8 -*-
from requests_html import HTMLSession
def GetAqi(city):
    """Fetch the overall air-quality rating and the CO reading for a city.

    ``city`` is the URL slug of a Guangdong city on tianqi.moji.com
    (for example ``'pingshan-district'``).  The result is a dict with the
    two keys ``"空气质量综合评价"`` and ``"CO指数"``, each holding the text
    of the first element matched by the corresponding CSS selector.
    """
    page = HTMLSession().get('https://tianqi.moji.com/aqi/china/guangdong/' + city)

    # CSS selectors copied from Chrome DevTools ("Copy selector").
    desc_nodes = page.html.find('#aqi_desc')
    co_nodes = page.html.find(
        '#aqi_info > div.aqi_info_item > ul > li:nth-child(6) > span'
    )

    result = {}
    result["空气质量综合评价"] = desc_nodes[0].text
    result["CO指数"] = co_nodes[0].text
    return result
if __name__ == '__main__':
    # Demo run: print the AQI summary for Pingshan District.
    print(GetAqi('pingshan-district'))
爬取结果如下:
需要注意的是如何获取特定字段在 HTML 中的路径。此处直接在 Chrome 浏览器中通过如下方法获取:Inspect -> Elements -> 选中“良”对应的最下层标签 -> Copy -> Copy selector,将拷贝的内容粘贴到上述代码的 seldesc 中;同样地,将 CO 对应的路径粘贴到 selco 中即可。具体操作如下图所示:
2、代码
MySQL数据库脚本:
CREATE SCHEMA `weather`;

# Table weather.weather
# Columns: id, cid, Fevn, Ftemp, Fcond, Fhumi, Futime, Fwind, ts
CREATE TABLE `weather`.`weather` (
    `id`     INT         NOT NULL,  # row id
    `cid`    INT         NOT NULL,  # city id
    `Fevn`   VARCHAR(20) NOT NULL,  # overall environment index
    `Ftemp`  TINYINT(1)  NOT NULL,  # temperature
    `Fcond`  VARCHAR(20) NOT NULL,  # weather condition
    `Fhumi`  TINYINT(1)  NOT NULL,  # humidity
    `Futime` DATETIME    NULL,      # time the weather data was last updated
    `Fwind`  VARCHAR(20) NOT NULL,  # wind
    `ts`     TIMESTAMP   NULL,      # time the row was written
    PRIMARY KEY (`id`)
)
COMMENT = '该表存放id,城市id,环境综合指数,温度,天气状况,湿度,天气更新时间,风速,写入数据时间戳';
# Table weather.city
# Columns: id, Sname, Lname
CREATE TABLE `weather`.`city` (
    `id`    INT          NOT NULL,  # city id
    `Sname` VARCHAR(50)  NOT NULL,  # short city name
    `Lname` VARCHAR(200) NOT NULL,  # full (absolute) city name
    PRIMARY KEY (`id`)
)
COMMENT = '城市id,城市名称缩写,城市名称绝对地址';
# Example row: '1', 'pingshan-district', '中国广东省深圳市坪山区'
# Table weather.aqi
# Columns: id, cid, val, desc, pm10, pm25, no2, so2, o3, co, ts
#
# The numeric readings use SMALLINT rather than TINYINT: a signed TINYINT
# only holds -128..127, while AQI and pollutant values regularly exceed 127,
# which would make the INSERT fail (strict mode) or be clipped to 127.
CREATE TABLE `weather`.`aqi` (
    `id`   INT         NOT NULL,  # row id
    `cid`  INT         NOT NULL,  # city id
    `val`  SMALLINT    NOT NULL,  # index value
    `desc` VARCHAR(10) NOT NULL,  # index description; reserved word, keep the backticks
    `pm10` SMALLINT    NOT NULL,  # PM10
    `pm25` SMALLINT    NOT NULL,  # PM2.5
    `no2`  SMALLINT    NOT NULL,  # NO2
    `so2`  SMALLINT    NOT NULL,  # SO2
    `o3`   SMALLINT    NOT NULL,  # O3
    `co`   SMALLINT    NOT NULL,  # CO
    `ts`   TIMESTAMP   NOT NULL,  # time the row was written
    PRIMARY KEY (`id`)
)
COMMENT = '该表存放天气综合指数,id,城市id,指数值,指数描述,pm10,pm2.5,NO2,SO2,O3,CO';
Python脚本:
# -*- coding:utf-8 -*-
import time,datetime
from requests_html import HTMLSession
import pymysql
import traceback
# Notice: requests_html is only supported by python3.6
# javascript:void(0)
# https://github.com/kennethreitz/requests-html
class MysqlClass():
    """Small PyMySQL helper for the ``weather`` schema (city / weather / aqi).

    Connection settings are plain class attributes so they can be overridden
    per instance or per subclass.  Every public method that touches the
    database opens its own connection and always closes it again, even when
    an exception escapes.

    All values are passed to the server as query parameters instead of being
    formatted into the SQL text, so quotes in scraped text can neither break
    nor inject SQL.
    """

    db = None            # connection opened by OpenDB(), None when closed
    host = 'localhost'
    usr = 'root'
    pwd = 'YourPwd'
    dbname = 'weather'
    port = 3306
    charset = 'utf8'

    def _connect(self):
        """Return a new connection built from the class-level settings."""
        # Keyword arguments only: PyMySQL 1.0+ rejects positional ones.
        return pymysql.connect(
            host=self.host,
            user=self.usr,
            password=self.pwd,
            database=self.dbname,
            port=self.port,
            charset=self.charset,
        )

    def ShowVersion(self):
        """Print the MySQL server version using a throw-away connection."""
        conn = self._connect()
        try:
            with conn.cursor() as cursor:
                cursor.execute("SELECT VERSION()")
                data = cursor.fetchone()
            print("Database version : %s " % data)
        finally:
            conn.close()

    def OpenDB(self):
        """Open a connection and store it in ``self.db``."""
        self.db = self._connect()

    def CloseDB(self):
        """Close ``self.db`` if it is open and reset it to ``None``."""
        if self.db is not None:
            try:
                self.db.close()
            finally:
                self.db = None

    def ExcuteSQL(self, str_sql):
        """Run one SQL statement in its own connection and commit it.

        On failure the transaction is rolled back and the traceback is
        printed; the error is not re-raised.
        """
        self.OpenDB()
        try:
            with self.db.cursor() as cursor:
                cursor.execute(str_sql)
            self.db.commit()
        except Exception:
            self.db.rollback()
            traceback.print_exc()
        finally:
            self.CloseDB()

    def GetMaxId(self, tableName):
        """Return ``max(id)`` of ``tableName`` using the already open ``self.db``.

        Returns ``None`` for an empty table and ``0`` if the query fails
        (the traceback is printed in that case).

        Raises:
            ValueError: if ``tableName`` is not a plain identifier.  Table
                names cannot be sent as query parameters, so they are
                validated before being placed into the SQL text.
        """
        if not str(tableName).isidentifier():
            raise ValueError('invalid table name: %r' % (tableName,))
        maxnum = 0
        try:
            with self.db.cursor() as cursor:
                cursor.execute("select max(id) from `%s`" % tableName)
                row = cursor.fetchone()
            maxnum = row[0]  # fetchone() returns a 1-tuple
        except Exception:
            self.db.rollback()
            traceback.print_exc()
        return maxnum

    def _next_id(self, table):
        """Return the next free id for ``table`` (``max(id) + 1``, or 1)."""
        # NOTE: max(id)+1 is only safe while a single writer uses the table.
        current = self.GetMaxId(table)
        return 1 if current is None else current + 1

    def _insert(self, sql, params, data_dict):
        """Execute a parameterized INSERT on ``self.db`` and commit it.

        On failure, print ``data_dict`` and the traceback, then roll back.
        """
        try:
            with self.db.cursor() as cursor:
                cursor.execute(sql, params)
            self.db.commit()
        except Exception:
            print('error', data_dict)
            self.db.rollback()
            traceback.print_exc()

    def GetCidBySname(self, city):
        """Return the ``city.id`` whose ``Sname`` equals ``city``.

        Falls back to ``1`` when the city is unknown or the query fails,
        printing a message or the traceback respectively.
        """
        self.OpenDB()
        cid = 1
        try:
            with self.db.cursor() as cursor:
                cursor.execute("select id from city where Sname=%s", (city,))
                row = cursor.fetchone()
            if row is None:
                print('city not found, falling back to cid 1:', city)
            else:
                cid = row[0]
        except Exception:
            self.db.rollback()
            traceback.print_exc()
        finally:
            self.CloseDB()
        return cid

    def Insert_City(self, data_dict):
        """Insert one row into the ``city`` table.

        Args:
            data_dict: mapping with the keys ``'Sname'`` and ``'Lname'``.

        Raises:
            SystemExit: if a city with the same ``Sname`` already exists
                (the process-terminating behaviour of the original code is
                kept; the connection is now closed first).
        """
        self.OpenDB()
        try:
            num = self._next_id('city')

            # Refuse duplicates.
            with self.db.cursor() as cursor:
                cursor.execute('select * from city where Sname=%s',
                               (data_dict['Sname'],))
                existing = cursor.fetchall()
            if len(existing) > 0:
                raise SystemExit(data_dict['Sname'] + ' is here!')

            self._insert(
                "INSERT INTO city(id,Sname,Lname) VALUES (%s,%s,%s)",
                (num, data_dict['Sname'], data_dict['Lname']),
                data_dict,
            )
        finally:
            self.CloseDB()

    def Insert_Weather(self, cid, data_dict):
        """Insert one row into the ``weather`` table.

        Args:
            cid: id of the city the reading belongs to.
            data_dict: mapping with the keys ``'Fevn'``, ``'Ftemp'``,
                ``'Fcond'``, ``'Fhumi'``, ``'Futime'`` and ``'Fwind'``.
        """
        self.OpenDB()
        try:
            num = self._next_id('weather')
            ts_str = time.strftime('%Y-%m-%d %H:%M:%S')
            self._insert(
                "INSERT INTO weather(id,cid,Fevn,Ftemp,Fcond,Fhumi,Futime,Fwind,ts) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (num, cid, data_dict['Fevn'], data_dict['Ftemp'],
                 data_dict['Fcond'], data_dict['Fhumi'], data_dict['Futime'],
                 data_dict['Fwind'], ts_str),
                data_dict,
            )
        finally:
            self.CloseDB()

    def Insert_Aqi(self, cid, data_dict):
        """Insert one row into the ``aqi`` table.

        Args:
            cid: id of the city the reading belongs to.
            data_dict: mapping with the keys ``'val'``, ``'desc'``,
                ``'pm10'``, ``'pm25'``, ``'no2'``, ``'so2'``, ``'o3'``
                and ``'co'``.
        """
        self.OpenDB()
        try:
            num = self._next_id('aqi')
            ts_str = time.strftime('%Y-%m-%d %H:%M:%S')
            # `desc` is a reserved word in MySQL and must stay in backticks.
            self._insert(
                "INSERT INTO aqi(id,cid,val,`desc`,pm10,pm25,no2,so2,o3,co,ts) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (num, cid, data_dict['val'], data_dict['desc'],
                 data_dict['pm10'], data_dict['pm25'], data_dict['no2'],
                 data_dict['so2'], data_dict['o3'], data_dict['co'], ts_str),
                data_dict,
            )
        finally:
            self.CloseDB()
def InsertCity(Sname, Lname):
    """Store a city (short name ``Sname``, full name ``Lname``) in the city table."""
    MysqlClass().Insert_City({'Sname': Sname, 'Lname': Lname})
def GetWeather(city, province='guangdong', timeout=10):
    """Scrape the current weather summary for one city from tianqi.moji.com.

    Args:
        city: URL slug of the city, e.g. ``'pingshan-district'``.
        province: URL slug of the province; defaults to ``'guangdong'``,
            the value that used to be hard-coded.
        timeout: seconds to wait for the HTTP response, so a stalled
            request cannot block the caller forever.

    Returns:
        A list of six strings in the order environment index, temperature,
        condition, update time, humidity, wind, e.g.
        ``['61 良', '25', '晴', '今天21:23更新', '湿度 75%', '南风2级']``.

    Raises:
        IndexError: if one of the expected elements is missing from the page.
    """
    url = 'https://tianqi.moji.com/weather/china/' + province + '/' + city
    # All selectors share this prefix (copied from Chrome "Copy selector").
    base = 'body > div.wrap.clearfix.wea_info > div.left > '
    selectors = (
        base + 'div.wea_alert.clearfix > ul > li > a > em',  # environment index
        base + 'div.wea_weather.clearfix > em',              # temperature
        base + 'div.wea_weather.clearfix > b',               # condition
        base + 'div.wea_weather.clearfix > strong',          # update time
        base + 'div.wea_about.clearfix > span',              # humidity
        base + 'div.wea_about.clearfix > em',                # wind
    )

    session = HTMLSession()
    try:
        r = session.get(url, timeout=timeout)
        listweather = []
        for selector in selectors:
            found = r.html.find(selector)
            if not found:
                raise IndexError('no element matches %r on %s' % (selector, url))
            listweather.append(found[0].text)
        return listweather
    finally:
        # This function is called repeatedly by a long-running loop;
        # release the session's pooled connections every time.
        session.close()
def GetAqi(city, province='guangdong', timeout=10):
    """Scrape the air-quality readings for one city from tianqi.moji.com.

    Args:
        city: URL slug of the city, e.g. ``'pingshan-district'``.
        province: URL slug of the province; defaults to ``'guangdong'``,
            the value that used to be hard-coded.
        timeout: seconds to wait for the HTTP response, so a stalled
            request cannot block the caller forever.

    Returns:
        A list of eight strings in the order AQI value, AQI description,
        PM10, PM2.5, NO2, SO2, O3, CO, e.g.
        ``['61', '良', '61', '55', '12', '3', '42', '6']``.

    Raises:
        IndexError: if one of the expected elements is missing from the page.
    """
    url = 'https://tianqi.moji.com/aqi/china/' + province + '/' + city
    # The six pollutant readings are the 1st..6th <li> of the same list,
    # in the order PM10, PM2.5, NO2, SO2, O3, CO.
    item = '#aqi_info > div.aqi_info_item > ul > li:nth-child(%d) > span'
    selectors = ['#aqi_value', '#aqi_desc'] + [item % i for i in range(1, 7)]

    session = HTMLSession()
    try:
        r = session.get(url, timeout=timeout)
        listaqi = []
        for selector in selectors:
            found = r.html.find(selector)
            if not found:
                raise IndexError('no element matches %r on %s' % (selector, url))
            listaqi.append(found[0].text)
        return listaqi
    finally:
        # This function is called repeatedly by a long-running loop;
        # release the session's pooled connections every time.
        session.close()
def SaveWeatherInfo(city):
    """Scrape weather and AQI data for ``city`` and store both in MySQL.

    The weather list is converted into the dict expected by
    ``MysqlClass.Insert_Weather`` and the AQI list into the dict expected by
    ``MysqlClass.Insert_Aqi``; the city id is looked up by its short name.
    """
    print('update WeatherInfo per 30min')

    # Weather, refreshed about every 30 minutes, e.g.
    # ['61 良', '25', '晴', '今天21:23更新', '湿度 75%', '南风2级']
    weather_raw = GetWeather(city)
    # Drop the two leading and two trailing characters around "HH:MM"
    # and combine the rest with today's date.
    update_time = time.strftime('%Y-%m-%d ') + weather_raw[3][2:-2] + ':00'
    # "湿度 75%" -> "75%" -> "75"
    humidity_text = weather_raw[4].split(' ')[1][:-1]
    weather_row = {
        'Fevn': weather_raw[0],
        'Ftemp': int(weather_raw[1]),
        'Fcond': weather_raw[2],
        'Fhumi': int(humidity_text),
        'Futime': update_time,
        'Fwind': weather_raw[5],
    }

    # AQI, refreshed about every hour, e.g.
    # ['61', '良', '61', '55', '12', '3', '42', '6']
    aqi_raw = GetAqi(city)
    aqi_row = {
        'val': int(aqi_raw[0]),
        'desc': str(aqi_raw[1]),
        'pm10': int(aqi_raw[2]),
        'pm25': int(aqi_raw[3]),
        'no2': int(aqi_raw[4]),
        'so2': int(aqi_raw[5]),
        'o3': int(aqi_raw[6]),
        'co': int(aqi_raw[7]),
    }

    store = MysqlClass()
    city_id = store.GetCidBySname(city)
    store.Insert_Weather(city_id, weather_row)
    store.Insert_Aqi(city_id, aqi_row)
def GetTimestamp():
    """Return the current Unix timestamp in whole seconds as an ``int``.

    Reads the clock directly instead of formatting the local time to a
    string and parsing it back, which was needlessly indirect and ambiguous
    during the repeated hour at the end of daylight-saving time.
    """
    return int(time.time())
if __name__ == '__main__':
    city = 'pingshan-district'
    # If the city row does not exist yet, insert it here (or by hand):
    # InsertCity(city, "中国广东省深圳市坪山区")

    # Wait until the wall clock reaches xx:00 or xx:30 so every sample is
    # aligned to a half-hour boundary.  The previous condition contained the
    # tuple ``(strT1, strT1[14:16] != '30')``, which is always truthy, so
    # xx:30 was never accepted as a start time.
    strT1 = time.strftime('%Y-%m-%d %H:%M:') + '00'
    while strT1[14:16] not in ('00', '30'):
        time.sleep(30)
        # Seconds are pinned to 00 so ts1 sits exactly on the boundary.
        strT1 = time.strftime('%Y-%m-%d %H:%M:') + '00'
    ts1 = time.mktime(time.strptime(strT1, '%Y-%m-%d %H:%M:%S'))

    while True:
        SaveWeatherInfo(city)
        # Moji refreshes roughly every half hour, so one sample per 1800 s
        # is enough.
        while GetTimestamp() < (ts1 + 1800):
            time.sleep(20)
        ts1 = ts1 + 1800
KeepAlive脚本:(防止程序异常挂掉)
#!/bin/sh
# Watchdog: every 30 seconds check whether PrintMoJi.py is running and
# start it again if it is not.

# The crawler is launched by relative path, so work from its directory.
# Stop right away if the directory is missing instead of looping uselessly.
cd /home/xg/code/PowerPredict/ || exit 1

# `while true` replaces `while [ i=1 ]`, which only tested that the string
# "i=1" is non-empty and never compared the variable.
while true
do
    echo "I'm KeepAlive!"
    # pgrep -f matches against the full command line and never reports
    # itself, so no temporary `ps` dump file is needed.
    if pgrep -f "PrintMoJi.py" >/dev/null 2>&1
    then
        echo "MoJiWeather is running!"
    else
        echo "MoJiWeather is dead!"
        python3.6 PrintMoJi.py &
    fi
    sleep 30
done
将该shell脚本添加到Linux开机启动项中,设置为后台运行就可以达到防止程序异常挂掉的目的了,具体操作见:Linux下防止程序挂掉的shell脚本
3、说明
本代码当前测试环境为python3.6.3,MySQL 5.7.13
参考文献:
如何用Python爬数据?(一)网页抓取
requests-html GitHub网址:https://github.com/kennethreitz/requests-html
注:requests-html当前只支持python3.6.x