最近花了大概两个月时间学习Python以及爬虫,初见成效,完成了一个基于高德API的全类POI爬取小项目,特此分享。
(关于我的各项方法以及py文件命名方面可能不是很规范,欢迎大家批评指正)
主方法(获取数据,插入数据库,查询数据是否已爬取(方便随时停止随时开始)):
from db_base import get_sub_cat
from get_data import get_data
from pgsql_insert import handle_data, insert_list
from get_adcode import get_adcode
from get_list_from_pg import get_list
def main_poi():
    """Crawl every AMap POI category for every selected district, resumably.

    For each district adcode returned by ``get_adcode()`` the 18 top-level
    POI categories are crawled one by one.  Two bookkeeping tables make the
    job restartable at any time:

    * ``base.code_adcode_for_get`` records each finished
      (category types + adcode) combination;
    * ``base.adcode_for_get`` records each fully finished district.

    Finished work is skipped on the next run.
    """
    table_name = '**_**'  # target table for the POI rows (placeholder, set your own)
    adcode_table = "base.adcode_for_get"
    # The same schema-qualified name is used for both the lookup and the
    # insert, so the progress marker always lands in the table that the
    # "already crawled" check reads.
    code_adcode_table = "base.code_adcode_for_get"
    # AMap groups POI types into 18 top-level categories, numbered 01..18.
    top_level_categories = [f"{number:02d}" for number in range(1, 19)]

    for adcode in get_adcode():
        # Re-read the progress table on every pass so the check sees the
        # current database state.
        if adcode in get_list("adcode", adcode_table):
            print("该区已爬取,跳过")
            continue

        for category in top_level_categories:
            # Every sub-category code is followed by '|' (including the last
            # one); the trailing separator is kept because the resulting
            # string is also the key stored in the progress table.
            types = "".join(code + "|" for code in get_sub_cat(category))
            code_adcode = types + adcode
            if code_adcode in get_list("code_adcode", code_adcode_table):
                print("该条数据已爬取,跳过")
                continue

            list_poi = get_data(types, adcode)
            handle_data(table_name, list_poi)  # store the POI rows
            # Remember that this category/district combination is done.
            insert_list(code_adcode_table, "code_adcode", code_adcode)

        # All categories handled: mark the whole district as done.
        insert_list(adcode_table, "adcode", adcode)

    print("爬取完成")
获取网页数据方法:
import requests
from db_base import get_sub_cat
def get_data(types, adcode, amap_keys=None):
    """Fetch all POI pages for one category group inside one district.

    Args:
        types: AMap type codes joined with ``|``.
        adcode: District adcode used as the ``region`` filter.
        amap_keys: Optional list of AMap web-service keys.  When omitted,
            the ``list_amap_key`` placeholder below is used.

    Returns:
        A list with one entry per fetched page; each entry is the ``pois``
        list of that response.

    Raises:
        ValueError: If no API key is configured.
        RuntimeError: If every configured key has been rejected.
    """
    # Fill in your own keys (apply on the AMap open platform).  Several keys
    # are recommended because each one has a daily request quota.
    list_amap_key = []
    keys = list(amap_keys) if amap_keys else list_amap_key
    if not keys:
        raise ValueError(
            "no AMap API key configured: fill in list_amap_key or pass amap_keys"
        )

    endpoint = "https://restapi.amap.com/v5/place/text"
    page_size = 25  # AMap returns at most 25 POIs per page
    max_page = 99
    pages = []
    key_index = 0
    page_num = 1

    while page_num <= max_page:
        # Passing the query through ``params`` lets requests URL-encode the
        # '|' separators in ``types`` and guarantees a well-formed
        # ``region`` parameter.
        params = {
            "key": keys[key_index],
            "types": types,
            "region": adcode,
            "page_num": page_num,
            "citylimit": "true",
            "extensions": "all",
            "show_fields": "navi,business",
            "page_size": page_size,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        print(response.url)
        payload = response.json()

        if payload['status'] == '1':
            pages.append(payload['pois'])
            # A page that is not full is the last one: stop instead of
            # idling through the remaining page numbers.
            if str(payload['count']) != str(page_size):
                break
            page_num += 1
        else:
            # Request rejected: switch to the next key and retry the SAME
            # page so that no page is silently lost.
            # NOTE(review): any failure is treated as a key problem; checking
            # the response's error code would avoid burning keys on other
            # errors - confirm against the AMap error-code table.
            key_index += 1
            if key_index >= len(keys):
                raise RuntimeError(
                    "all AMap API keys were rejected; last response: {}".format(payload)
                )

    return pages
获取adcode方法:
import psycopg2
def get_adcode():
    """Ask for a province or city name and return its district adcodes.

    The name is looked up in ``base.base_poi_city``.  All adcodes sharing
    the province prefix (first two digits) or the city prefix (first four
    digits) are then selected, and codes ending in ``00`` or ``01`` are
    dropped.

    Returns:
        A list of adcode strings.

    Raises:
        ValueError: If the entered name is not found in the table.
    """
    city = input("请输入你要查询的城市(请输入省或者市并在后面加上省或市):")
    conn = psycopg2.connect()  # database connection settings go here
    try:
        with conn.cursor() as curs:
            # Parameterised query: the name comes straight from user input.
            curs.execute(
                "select adcode from base.base_poi_city where name = %s",
                (city,),
            )
            row = curs.fetchone()
            if row is None:
                raise ValueError(
                    "no adcode found for {!r} in base.base_poi_city".format(city)
                )
            adcode1 = str(row[0])

            # Province-level codes look like XX0000: keep only the two
            # province digits so that prefectures numbered 10 and above
            # (e.g. 1310xx) are matched too.  Anything else is treated as
            # belonging to a city and matched on its four-digit prefix.
            if adcode1[2:6] == '0000':
                prefix = adcode1[:2]
            else:
                prefix = adcode1[:4]

            curs.execute(
                "select adcode from base.base_poi_city where adcode like %s",
                (prefix + "%",),
            )
            rows = curs.fetchall()
    finally:
        conn.close()

    # Drop codes whose last two digits are 00 or 01.
    # NOTE(review): presumably these are the city-level row and the generic
    # "municipal district" row - confirm that no real district uses 01.
    return [found[0] for found in rows if found[0][4:6] not in ('00', '01')]
获取poi分类(关于poi分类可见上面链接):
import psycopg2
def get_sub_cat(num_cat):
    """Return every POI type code that starts with ``num_cat``.

    Args:
        num_cat: Code prefix as a string, e.g. ``'05'`` for one of the
            top-level categories.

    Returns:
        A list of the matching ``code`` values from ``base.base_poi_cat``.
    """
    conn = psycopg2.connect()  # database connection settings go here
    try:
        with conn.cursor() as curs:
            # The prefix is passed as a bound parameter rather than being
            # concatenated into the statement.
            curs.execute(
                "select code from base.base_poi_cat where code like %s",
                (num_cat + "%",),
            )
            rows = curs.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()
    return [row[0] for row in rows]
插入数据库方法:
import traceback
from src.utils import common
import psycopg2
import math
from src.utils import zuobiaozhuanhuan as zbzh
# Columns filled from the POI record itself, in the order produced by
# _build_poi_row(); UPDATE_TIME is appended last at insert time.
_POI_COLUMNS = (
    "ID", "BIG_TYPE", "MOD_TYPE", "SMALL_TYPE", "NAME", "XY",
    "PNAME", "PCODE", "CITYNAME", "CITYCODE", "ADNAME", "ADCODE",
    "ADDRESS", "GRIDCODE", "TYPECODE", "TEL", "XY84",
)


def _gcj02_to_bd09(lng, lat):
    """Convert a GCJ-02 (AMap) coordinate to Baidu BD-09 (common approximation)."""
    x_pi = math.pi * 3000.0 / 180.0
    z = math.sqrt(lng * lng + lat * lat) + 0.00002 * math.sin(lat * x_pi)
    theta = math.atan2(lat, lng) + 0.000003 * math.cos(lng * x_pi)
    return z * math.cos(theta) + 0.0065, z * math.sin(theta) + 0.006


def _optional_field(poi, group, key, missing_message):
    """Return ``poi[group][key]``, or '' (after printing a note) when absent."""
    try:
        return poi[group][key]
    except (KeyError, TypeError):
        print(missing_message)
        return ""


def _build_poi_row(poi):
    """Turn one AMap POI dict into a tuple of strings matching _POI_COLUMNS."""
    lng_text, lat_text = str(poi['location']).split(',')[:2]
    gcj_lng, gcj_lat = float(lng_text), float(lat_text)
    # AMap coordinates are GCJ-02 ("Mars") coordinates: decode to WGS-84.
    wgs_lng, wgs_lat = zbzh.gcj02_to_wgs84(gcj_lng, gcj_lat)
    # BD-09 is defined as an offset from GCJ-02, so the Baidu conversion is
    # fed the raw AMap coordinate, not the WGS-84 one.
    bd_lng, bd_lat = _gcj02_to_bd09(gcj_lng, gcj_lat)

    type_parts = str(poi['type']).split(';')
    values = (
        poi['id'],
        type_parts[0],
        type_parts[1],
        type_parts[2],
        poi['name'],
        str(bd_lng) + ',' + str(bd_lat),
        poi['pname'],
        poi['pcode'],
        poi['cityname'],
        poi['citycode'],
        poi['adname'],
        poi['adcode'],
        poi['address'],
        # Optional values are resolved per record, so a POI without them
        # stores '' instead of inheriting the previous POI's value.
        _optional_field(poi, 'navi', 'gridcode', '该条无地理格信息'),
        poi['typecode'],
        _optional_field(poi, 'business', 'tel', '该条无电话信息'),
        str(wgs_lng) + ',' + str(wgs_lat),
    )
    return tuple(str(value) for value in values)


def handle_data(table_name, list_result0):
    """Insert crawled POI pages into ``table_name``.

    Args:
        table_name: Target table.  It is spliced into the statement as-is,
            so it must come from trusted configuration.
        list_result0: List of pages as returned by ``get_data``; each page
            is a list of AMap POI dicts.

    Each POI is inserted and committed on its own.  A row that fails to
    save is reported with its traceback and skipped, so one bad row does
    not lose the rest of the batch.
    """
    insert_sql = (
        "insert into " + table_name
        + "(" + ",".join(_POI_COLUMNS) + ",UPDATE_TIME)"
        + " values (" + ",".join(["%s"] * (len(_POI_COLUMNS) + 1)) + ")"
    )
    conn = psycopg2.connect()  # database connection settings go here
    try:
        with conn.cursor() as cursor:
            for page in list_result0:
                for poi in page:
                    row = _build_poi_row(poi)
                    try:
                        update_time = common.get_now()
                        # Bound parameters keep quotes inside POI names or
                        # addresses from breaking (or altering) the statement.
                        cursor.execute(insert_sql, row + (update_time,))
                        conn.commit()
                    except Exception:
                        print("保存数据出错: {}".format(traceback.format_exc()))
                        # Discard the failed transaction so that the next
                        # insert starts from a clean state.
                        conn.rollback()
        print('数据导入成功')
    finally:
        conn.close()
def insert_list(table_name, table_feild, data):
    """Insert one value into a single-column bookkeeping table.

    Used to record which adcodes / category+adcode combinations have
    already been crawled.

    Args:
        table_name: Target table.  It is spliced into the statement as-is,
            so it must come from trusted code.
        table_feild: Column to fill (also spliced in as-is).
        data: Value to store; it is converted to text and sent as a bound
            parameter.
    """
    statement = f"insert into {table_name} ({table_feild}) values (%s)"
    conn = psycopg2.connect()  # database connection settings go here
    try:
        with conn.cursor() as cursor:
            cursor.execute(statement, (str(data),))
        conn.commit()
    finally:
        # Release the connection even when the insert fails.
        conn.close()
获取数据展示: