最近领导要求我在负荷预测中加入天气信息,于是今天动手写了一个爬虫来抓取历史天气数据。目前看来可以正常使用;不过今天是我第一天学习爬虫,能满足任务要求我就很满意了。
目标网址
历史天气查询|历史天气预报查询|历史气温查询|过去天气查询_历史天气查询网 (tianqi.com):https://lishi.tianqi.com/
导入包
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Column layout of the result table: date, daily high, daily low,
# precipitation level and wind level.
columns = ['date', 'UP_temperature', 'LOW_temperature', 'water', 'winds']

# Module-level table that the scraper fills one row at a time.
data_df = pd.DataFrame(columns=columns)

# Row label to use for the next row written into data_df.
noum = 0
设置headers
# HTTP request headers sent with every page download; the user-agent value
# is a desktop Chrome browser string.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}
一些处理函数(后面会用到)
def printYu(str_1):
    """Map a weather description to an ordinal precipitation level.

    Despite its name, this function prints nothing; it only returns a value.
    Keyword groups are tested in order and the first group that matches
    wins, so a description mentioning several intensities is classified by
    the earliest group in the table, not by the heaviest one.

    Args:
        str_1: Weather description text, e.g. '小雨' or '多云'.

    Returns:
        int: 0 when no rain or snow is mentioned, 1 for light rain/snow,
        2 for showers, 3 for moderate, 4 for heavy, 5 for storm-level
        rain/snow (暴雨/暴雪). Any other mention of rain or snow (for
        example 雨夹雪) is reported as 1.
    """
    levels = (
        (('小雨', '小雪'), 1),
        (('阵雨', '阵雪'), 2),
        (('中雨', '中雪'), 3),
        (('大雨', '大雪'), 4),
        # Storm-level precipitation used to fall through to 0.
        (('暴雨', '暴雪'), 5),
        # Catch-all: rain or snow of unspecified intensity is still
        # precipitation. NOTE(review): level 1 is an assumed default.
        (('雨', '雪'), 1),
    )
    for keywords, level in levels:
        if any(keyword in str_1 for keyword in keywords):
            return level
    return 0
def process_wind(str_2):
    """Convert a wind description into a numeric wind level.

    Args:
        str_2: Wind description text, e.g. '东北风 3级' or '微风'.

    Returns:
        0.5 when the text contains '微' (light breeze); otherwise the last
        run of decimal digits in the text as an int (so '3-4级' gives 4 and
        '10级' gives 10); 0 when the text contains no digits at all.
    """
    # Check the breeze marker first: a bare '微风' is only two characters
    # long and would otherwise be treated as "no information".
    if '微' in str_2:
        return 0.5

    # Blank out every non-digit so that whole numbers survive as tokens;
    # this keeps multi-digit levels intact and tolerates trailing text.
    digit_runs = ''.join(
        ch if ch.isdecimal() else ' ' for ch in str_2
    ).split()
    if not digit_runs:
        return 0
    return int(digit_runs[-1])
主要操作的代码
def get_date(url, noum):
    """Download one page and append its daily weather rows to ``data_df``.

    Every ``ul.thrui li`` element of the page is split on newlines and its
    fields are parsed into one row of the module-level ``data_df``.

    Args:
        url: Address of the page to download.
        noum: Row label for the first row stored by this call; it is
            incremented after each stored row.

    Returns:
        int: The next unused row label, to be passed to the following call.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If a temperature field cannot be parsed as an integer.
    """
    # Without a timeout requests can block forever on a stalled connection.
    res = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as data.
    res.raise_for_status()
    res.encoding = 'utf-8'

    soup = BeautifulSoup(res.text, 'lxml')
    for data in soup.select('ul.thrui li'):
        temp_list = data.text.split('\n')
        # Fields 1..5 are read below; skip entries that are too short to
        # hold them rather than failing with IndexError.
        if len(temp_list) < 6:
            continue

        # Drop the last four characters of the date field
        # (presumably a weekday suffix; confirm against the page).
        date = str(temp_list[1][:-4])
        # Drop the trailing character of each temperature field
        # (presumably the degree sign; confirm against the page).
        UP_temperature = int(temp_list[2][:-1])
        LOW_temperature = int(temp_list[3][:-1])
        water = int(printYu(temp_list[4]))
        winds = process_wind(temp_list[5])

        print(temp_list[1][:-4])  # date
        print(temp_list[2][:-1])  # daily high
        print(temp_list[3][:-1])  # daily low
        print(printYu(temp_list[4]))  # precipitation level
        print(temp_list[5][-2:-1])  # second-to-last character of wind text

        # Assigning to a new label through .loc appends the row.
        data_df.loc[noum, :] = [date, UP_temperature, LOW_temperature, water, winds]
        print("*" * 100)
        noum += 1
    return noum
主函数
if __name__ == '__main__':
    # Build the 'YYYYMM' tags for January 2023 through April 2024.
    data_23 = []
    for year, last_month in (('2023', 12), ('2024', 4)):
        for month in range(1, last_month + 1):
            data_23.append(year + str(month).zfill(2))
    print(data_23)

    # Fetch each monthly page in turn, carrying the running row label
    # from one call to the next.
    for month_tag in data_23:
        url = "https://lishi.tianqi.com/beijing/{}.html".format(month_tag)
        noum = get_date(url=url, noum=noum)
        print(url)

    # Write the accumulated table to disk.
    data_df.to_csv('./beijing.csv')