最近花了大概两个月时间学习 Python 以及爬虫,初见成效,完成了一个基于高德 API 的全类 POI 爬取小项目,特此分享。

(关于我的各项方法以及py文件命名方面可能不是很规范,欢迎大家批评指正)      

         主方法(获取数据,插入数据库,查询数据是否已爬取(方便随时停止随时开始)):

from db_base import get_sub_cat
from get_data import get_data
from pgsql_insert import handle_data, insert_list
from get_adcode import get_adcode
from get_list_from_pg import get_list


def main_poi():
    """Crawl every POI category for every selected district and store the rows.

    The districts come from ``get_adcode()``. For each district, each of the
    18 top-level categories is fetched with ``get_data`` and written with
    ``handle_data``. Progress is recorded in two bookkeeping tables so the
    crawl can be stopped and resumed at any time:

    * ``base.code_adcode_for_get`` - one row per finished (types, district)
    * ``base.adcode_for_get``      - one row per finished district
    """
    table_name = '**_**'  # target table for the POI rows (placeholder)
    done_adcode_table = "base.adcode_for_get"
    # The same schema-qualified name is used for both the lookup and the
    # insert; otherwise the marker could land in a table the lookup never
    # reads and finished work would be crawled again.
    done_types_table = "base.code_adcode_for_get"

    list_adcode = get_adcode()  # districts to crawl

    # The 18 top-level POI categories, used to build the request filter.
    list_num_cat = ['01', '02', '03', '04', '05', '06', '07', '08', '09',
                    '10', '11', '12', '13', '14', '15', '16', '17', '18']
    # Category codes do not depend on the district, so query them once.
    # The trailing '|' is kept on purpose: the string is also part of the
    # progress key, and keys already stored were built with it.
    list_types = [
        ''.join(code + '|' for code in get_sub_cat(num_cat))
        for num_cat in list_num_cat
    ]

    # Load the progress markers once and keep them current in memory
    # instead of re-reading both tables on every iteration.
    done_adcodes = set(get_list("adcode", done_adcode_table))
    done_code_adcodes = set(get_list("code_adcode", done_types_table))

    for adcode in list_adcode:
        if adcode in done_adcodes:  # district already finished
            print("该区已爬取,跳过")
            continue

        for types in list_types:
            code_adcode = types + adcode
            if code_adcode in done_code_adcodes:  # this category is finished
                print("该条数据已爬取,跳过")
                continue

            list_poi = get_data(types, adcode)
            handle_data(table_name, list_poi)  # insert the POI rows
            # Record types+adcode so this combination is skipped on resume.
            insert_list(done_types_table, 'code_adcode', code_adcode)
            done_code_adcodes.add(code_adcode)

        # Every category is done: mark the whole district as finished.
        insert_list(done_adcode_table, 'adcode', adcode)
        done_adcodes.add(adcode)
    print("爬取完成")

        获取网页数据方法:

import requests
from db_base import get_sub_cat


def get_data(types, adcode):
    """Fetch all POIs of the given types inside one district from AMap.

    Pages are requested one after another (25 POIs per page, pages 1..99)
    until a page comes back with fewer than 25 entries. When a request is
    rejected (``status != '1'``) the next key is selected and the SAME page
    is requested again, so no page is silently lost.

    Args:
        types: '|'-separated AMap POI type codes.
        adcode: district code used as the ``region`` filter.

    Returns:
        A list with one element per fetched page; each element is the
        ``pois`` list of that page's JSON response.

    Raises:
        IndexError: if no key is configured or every key has been rejected.
    """
    list_result_poi = []  # returned to the caller for insertion into the DB

    # Put your AMap keys here. Several keys are recommended because one
    # account is limited to 5000 requests per day.
    list_amap_key = []
    if not list_amap_key:
        raise IndexError("list_amap_key is empty: add at least one AMap key")
    key_index = 0
    key = list_amap_key[key_index]

    page_size = 25  # AMap returns at most 25 POIs per page
    page_num = 1
    while page_num < 100:
        # NOTE(review): 'citylimit' and 'extensions' look like v3 parameter
        # names; the v5 endpoint may expect 'city_limit' - confirm against
        # the AMap documentation before relying on the region restriction.
        url = ("https://restapi.amap.com/v5/place/text?"
               "key={}&types={}&region={}&page_num={}"
               "&citylimit=true&extensions=all&show_fields=navi,business&page_size={}"
               ).format(key, types, adcode, page_num, page_size)
        print(url)
        js = requests.get(url).json()

        if js['status'] == '1':  # request succeeded
            list_result_poi.append(js['pois'])
            # A short page means this was the last one.
            if str(js['count']) != str(page_size):
                break
            page_num += 1
        else:  # request failed: switch key and retry the same page
            key_index += 1
            if key_index >= len(list_amap_key):
                raise IndexError(
                    "all AMap keys were rejected while fetching page {} "
                    "for adcode {}".format(page_num, adcode)
                )
            key = list_amap_key[key_index]

    return list_result_poi

        获取adcode方法(根据输入的省或市获取下级区域的adcode):

import psycopg2


def get_adcode():
    """Ask for a province or city name and return the adcodes beneath it.

    The name is looked up in ``base.base_poi_city``. Its adcode is reduced
    to a prefix (2 digits for a province, 4 digits otherwise) and every
    adcode starting with that prefix is selected. Codes whose last two
    digits are '00' or '01' are dropped from the result.

    Returns:
        list of adcode strings.

    Raises:
        IndexError: if the entered name is not found in the table.
    """
    conn = psycopg2.connect()  # database connection settings go here
    try:
        city = input("请输入你要查询的城市(请输入省或者市并在后面加上省或市):")

        # The name is free text typed by the user, so it is passed as a
        # bound parameter instead of being concatenated into the SQL.
        with conn.cursor() as curs:
            curs.execute(
                "select adcode from base.base_poi_city where name = %s",
                (city,),
            )
            row = curs.fetchone()
        if row is None:
            raise IndexError("no adcode found for {!r}".format(city))
        parent_adcode = row[0]

        # Province codes end in '0000' and are identified by their first two
        # digits; anything else is narrowed to its first four digits. A
        # three-digit prefix such as '440' would miss every adcode whose
        # third digit is not 0 (e.g. 4412xx under 440000).
        if parent_adcode[2:6] == '0000':
            prefix = parent_adcode[:2]
        else:
            prefix = parent_adcode[:4]

        with conn.cursor() as curs:
            curs.execute(
                "select adcode from base.base_poi_city where adcode like %s",
                (prefix + '%',),
            )
            data = curs.fetchall()
    finally:
        conn.close()

    # Keep only codes whose last two digits are neither '00' nor '01'.
    return [row[0] for row in data if row[0][4:6] not in ('00', '01')]

        获取poi分类(关于poi分类可见上面链接):

import psycopg2


def get_sub_cat(num_cat):
    """Return every POI category code that starts with ``num_cat``.

    Args:
        num_cat: code prefix as a string, e.g. '01'.

    Returns:
        list of the ``code`` values from ``base.base_poi_cat`` matching the
        prefix, in the order the database returns them.
    """
    conn = psycopg2.connect()  # database connection settings go here
    try:
        with conn.cursor() as curs:
            # Bound parameter instead of string concatenation; the '%'
            # wildcard is part of the value, not of the SQL text.
            curs.execute(
                "select code from base.base_poi_cat where code like %s",
                (num_cat + '%',),
            )
            rows = curs.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    return [row[0] for row in rows]

        插入数据库方法:

import traceback
from src.utils import common
import psycopg2
import math
from src.utils import zuobiaozhuanhuan as zbzh


# Column order of the insert statement issued by handle_data.
_POI_COLUMNS = (
    "ID", "BIG_TYPE", "MOD_TYPE", "SMALL_TYPE", "NAME", "XY",
    "PNAME", "PCODE", "CITYNAME", "CITYCODE", "ADNAME", "ADCODE",
    "UPDATE_TIME", "ADDRESS", "GRIDCODE", "TYPECODE", "TEL", "XY84",
)


def _gcj02_to_bd09(lng, lat):
    """Convert a GCJ-02 coordinate (as delivered by AMap) to BD-09 (Baidu)."""
    x_pi = 3.14159265358979324 * 3000.0 / 180.0
    z = math.sqrt(lng * lng + lat * lat) + 0.00002 * math.sin(lat * x_pi)
    theta = math.atan2(lat, lng) + 0.000003 * math.cos(lng * x_pi)
    return z * math.cos(theta) + 0.0065, z * math.sin(theta) + 0.006


def _poi_to_record(poi):
    """Flatten one AMap POI dict into a column -> string mapping.

    UPDATE_TIME is not filled in here; the caller adds it at insert time.
    """
    gcj_lng, gcj_lat = (float(v) for v in str(poi['location']).split(',')[:2])

    # GCJ-02 ("Mars") coordinate -> WGS-84.
    wgs_lng, wgs_lat = zbzh.gcj02_to_wgs84(gcj_lng, gcj_lat)
    # The Baidu formula is defined on GCJ-02 input, so it is applied to the
    # raw AMap coordinate and not to the WGS-84 result.
    bd_lng, bd_lat = _gcj02_to_bd09(gcj_lng, gcj_lat)

    # 'type' is 'big;middle;small'; pad so that a shorter value yields empty
    # strings instead of aborting the whole import.
    big_type, mid_type, small_type = (str(poi['type']).split(';') + ['', '', ''])[:3]

    # Optional fields start empty for EVERY record, so a POI without them
    # never inherits the value of the previous POI.
    try:
        gridcode = poi['navi']['gridcode']
    except (KeyError, TypeError):
        gridcode = ""
        print('该条无地理格信息')
    try:
        tel = poi['business']['tel']
    except (KeyError, TypeError):
        tel = ""
        print('该条无电话信息')

    record = {
        "ID": poi['id'],
        "BIG_TYPE": big_type,
        "MOD_TYPE": mid_type,
        "SMALL_TYPE": small_type,
        "NAME": poi['name'],
        "XY": str(bd_lng) + ',' + str(bd_lat),
        "PNAME": poi['pname'],
        "PCODE": poi['pcode'],
        "CITYNAME": poi['cityname'],
        "CITYCODE": poi['citycode'],
        "ADNAME": poi['adname'],
        "ADCODE": poi['adcode'],
        "ADDRESS": poi['address'],
        "GRIDCODE": gridcode,
        "TYPECODE": poi['typecode'],
        "TEL": tel,
        "XY84": str(wgs_lng) + ',' + str(wgs_lat),
    }
    # Every value is stored in its string form.
    return {column: str(value) for column, value in record.items()}


def handle_data(table_name, list_result0):
    """Insert crawled POIs into ``table_name``; adapt the columns to your needs.

    Args:
        table_name: target table. It is inserted into the SQL text as-is,
            so it must come from trusted code, never from user input.
        list_result0: list of pages as returned by ``get_data``; each page
            is a list of AMap POI dicts.

    Each POI is inserted and committed on its own. A failing insert is
    printed with its traceback and rolled back, and the import continues
    with the next POI. A POI lacking a mandatory key (e.g. 'location',
    'id') still raises, as before.
    """
    conn = psycopg2.connect()  # database connection settings go here
    cursor = conn.cursor()
    # Values are sent as bound parameters so that quotes inside names or
    # addresses cannot break (or inject into) the statement.
    sql = (
        "insert into "
        + table_name
        + "(" + ",".join(_POI_COLUMNS) + ")"
        + " values (" + ",".join(["%s"] * len(_POI_COLUMNS)) + ")"
    )
    try:
        for page in list_result0:
            for poi in page:
                record = _poi_to_record(poi)
                try:
                    record["UPDATE_TIME"] = common.get_now()
                    params = [record[column] for column in _POI_COLUMNS]
                    print(sql, params)
                    cursor.execute(sql, params)
                    conn.commit()
                except Exception:
                    print("保存数据出错: {}".format(traceback.format_exc()))
                    # Discard the failed transaction so the next insert can
                    # run on this connection.
                    conn.rollback()
        print('数据导入成功')
    finally:
        cursor.close()
        conn.close()


def insert_list(table_name, table_feild, data):
    """Insert one value into one column; used to record finished crawl units.

    Args:
        table_name: target table. Placed into the SQL text as-is, so it
            must come from trusted code.
        table_feild: target column name, likewise placed in as-is.
        data: value to store; it is sent in its string form as a bound
            parameter.
    """
    conn = psycopg2.connect()  # database connection settings go here
    try:
        with conn.cursor() as cursor:
            # Identifiers cannot be bound as parameters, the value can.
            cursor.execute(
                f"insert into {table_name} ({table_feild}) values (%s)",
                (str(data),),
            )
        conn.commit()
    finally:
        # Close the connection even when the insert fails.
        conn.close()

        获取数据展示:

python爬取高德地图车辆数据 爬取高德poi_sql