House Price Data Scraping and Analysis
- 1. Scraping the URLs
- 2. Scraping house details from the URLs
- 3. Data processing
- 4. Correlation analysis
- 5. Principal component analysis
- Result images
The code is uploaded purely as a record; there is plenty of room for improvement.
1. Scraping the URLs

Crawl all listing URLs:
1. Fetch the district links.
2. From each district link, fetch the road links.
3. If a road still has more than 5 result pages, fetch the price-band links as well.

Every URL harvested is stored in the database with a hierarchical level number (see the sketch after the table below):
| Level number | Category | URL |
| --- | --- | --- |
| 1 | City | URL |
| 1-1 | District | URL 1 |
| 1-1-1 | Road | URL 2 |
| 1-1-1-1 | Price band | URL 3 |
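Each entry that get_url collects maps a page URL to a (name, url, level number, page count) tuple. A minimal sketch of one stored entry, using a seed row that also appears in main_1 below:

# key = page URL; value = (display name, URL, hierarchical level number, page count)
list_url = {
    'https://xm.anjuke.com/sale/siming/': ('思明', 'https://xm.anjuke.com/sale/siming/', '1-1', 8),
}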
import requests
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings()  # suppress HTTPS certificate warnings
from sqlalchemy import create_engine
import pandas as pd
def gethtml(url):
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
try:
r=requests.get(url,headers=headers,timeout=3,verify=False)
r.raise_for_status()
r.encoding='utf8'
return r.text
except Exception as e:
print('error of ' , e.args)
return ''
def get_url(url,list_url,M):
html=gethtml(url[1])
demo=BeautifulSoup(html,'html.parser')
div=demo.find_all('div',attrs={'class':'items'})
    try:
        if M==1:
            hf=div[0].find_all('a')  # district links
        elif M==2:
            hf=div[0].find('div',attrs={'class':'sub-items'}).find_all('a')  # road links
        elif M==3:
            hf=div[1].find_all('a')  # price-band links
        elif M==4:
            hf=div[2].find_all('a')  # area links
        elif M==5:
            hf=div[3].find_all('a')  # layout links
        else:
            pass
    except Exception as e:
        print('error',e.args)
    # work out the number of result pages
    page=999
    try:
        page=len(demo.find('div',attrs={'class':'sale-left'}).find('div',attrs={'class':'multi-page'}).find_all(['a','i']))-2
    except:
        try:
            num=len(demo.find('div',attrs={'class':'sale-left'}).find('ul',attrs={'id':'houselist-mod-new'}).find_all('a'))  # number of listings shown after filtering
            if num==0:
                page=0
            else:
                page=1
        except:
            page=0
if page>0:
try:
count=0
for i in hf:
count=count+1
list_url[i.attrs['href']]=(i.text,i.attrs['href'],url[2]+'-'+str(count),page)
except Exception as e:
print('error',e.args)
def save_mysql(data,table_url='hours_url_6'):
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')  # create the DB engine
    df=pd.DataFrame(data).T
    df.columns=['name','url','num','page']
    pd.io.sql.to_sql(df,table_url,con=conn,if_exists='append',index=False)  # write to MySQL; if_exists can be 'replace' or 'append'
    conn.dispose()
def main():
    # categories: {'district':0,'price':1,'area':2,'layout':3}
    # list_url = {key: [title, url, url_no, page]}, where key = url
    url='https://xm.anjuke.com/sale/'
    url_no='1'  # hierarchical level number of the URL
    ls_1={}
    list_url={}
    get_url(['城市',url,url_no],list_url,1)  # fetch the district links
    save_mysql(list_url)
ls=list_url.copy()
for depth in range(2,6):
list_url.clear()
ls_1.clear()
count=0
for i in ls:
count+=1
url=ls[i]
get_url(url,list_url,depth)
            if len(list_url)>0:
                save_mysql(list_url,'hours_url_8')  # the get_id flag lives in the ID table used in part 2, not here
for k in list_url:
if list_url[k][3]>5:
ls_1[k]=list_url[k]
list_url.clear()
            print('\r','depth: %s, progress: %s/%s' % (depth,count,len(ls)),end='')
        print('\r','depth: %s, URLs with more than 5 pages: %s' % (depth,len(ls_1)))
ls=ls_1.copy()
def main_1():
    # resume the crawl from specific districts (already-finished ones are commented out)
    list_url={#'https://xm.anjuke.com/sale/siming/': ('思明', 'https://xm.anjuke.com/sale/siming/', '1-1', 8),
#'https://xm.anjuke.com/sale/huli/': ('湖里', 'https://xm.anjuke.com/sale/huli/', '1-2', 8),
#'https://xm.anjuke.com/sale/jimei/': ('集美', 'https://xm.anjuke.com/sale/jimei/', '1-3', 8),
#'https://xm.anjuke.com/sale/haicang/': ('海沧', 'https://xm.anjuke.com/sale/haicang/', '1-4', 8),
#'https://xm.anjuke.com/sale/tongana/': ('同安', 'https://xm.anjuke.com/sale/tongana/', '1-5', 8),
'https://xm.anjuke.com/sale/xiangana/': ('翔安', 'https://xm.anjuke.com/sale/xiangana/', '1-6', 8),
'https://xm.anjuke.com/sale/xiamenzhoubian/': ('厦门周边', 'https://xm.anjuke.com/sale/xiamenzhoubian/', '1-7', 8)}
ls=list_url.copy()
ls_1={}
list_url.clear()
for depth in range(2,6):
list_url.clear()
ls_1.clear()
count=0
for i in ls:
count+=1
url=ls[i]
get_url(url,list_url,depth)
if len(list_url)>0:
save_mysql(list_url,'hours_url_9')
for k in list_url:
if list_url[k][3]>5:
ls_1[k]=list_url[k]
list_url.clear()
            print('\r','depth: %s, progress: %s/%s' % (depth,count,len(ls)),end='')
        print('\r','depth: %s, URLs with more than 5 pages: %s' % (depth,len(ls_1)))
ls=ls_1.copy()
main()
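To sanity-check what main() wrote, the URL table can be read back over the same connection string. A sketch, assuming the hours_url_8 table used above:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')
check = pd.read_sql('select name, url, num, page from hours_url_8', con=engine)  # columns as set by save_mysql
print(check.head())
engine.dispose()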
2. Scraping house details from the URLs

Fetch each listing's details from its ID, first checking whether the data has already been scraped.
import pandas as pd
import pymysql
from bs4 import BeautifulSoup
import requests
import re
from sqlalchemy import create_engine
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def get_url(url):
try:
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
req=requests.get(url,headers=headers,timeout=3,verify=False)
req.raise_for_status()
req.encoding='utf-8'
return req.text
    except requests.HTTPError:
        # raise_for_status() fired, so the response object exists and has a status code
        error_no=req.status_code
        if error_no==404 or error_no==503:
            return error_no
        print('error','HTTP status',error_no)
        return 1
    except Exception as e:
        # timeouts / connection errors leave no response to inspect
        print('error',e.args)
        return 1
def html_demo(html,list_data):
pat=re.compile(r'[\:]')
demo=BeautifulSoup(html,'html.parser')
    div=demo.find('div',attrs={'class':'wrapper'})
    # property attributes
    li=demo.find_all('li',attrs={'class':'houseInfo-detail-item'})
for i in li:
div=i.find_all('div')
count=1
for j in div:
if count==1:
a=''.join(pat.sub('',j.text.strip()).split())
elif count==2:
b=''.join(pat.sub('',j.text.strip()).split())
list_data[a]=b
else:
continue
count+=1
    # agent info
    b_info=demo.find('div',attrs={'class':'broker-wrap'})
    brokercard_name=b_info.find('div',attrs={'class':'brokercard-name'}).text.split()[0]
    list_data['销售']=brokercard_name
    # ratings: three label/value pairs of <em> tags
em=b_info.find_all('em')
for i in range(3):
list_data[pat.sub('',em[i*2].text)]=em[i*2+1].text
    # agency name
try:
gs_name=b_info.find('a',attrs={'class':'text-overflow'}).attrs['title'].split(':')
list_data[gs_name[0]]=gs_name[1]
except:
pass
def save_mysql(df,database='hours_data'):
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')  # create the DB engine
    pd.io.sql.to_sql(df,database,con=conn,if_exists='append',index=False)  # write to MySQL; if_exists can be 'replace' or 'append'
conn.dispose()
def update_id(id_num,database='hours_url_id'):
conn=pymysql.connect('localhost','root','root123',charset='utf8',database='gethtml')
cursor=conn.cursor()
sql="update %s set get_id=1 where ID='%s'" %(database,id_num)
cursor.execute(sql)
conn.commit()
cursor.close()
conn.close()
def get_id(id_table):
    conn=pymysql.connect('localhost','root','root123',database='gethtml',charset='utf8')
    cursor=conn.cursor()
    sql='select distinct ID from %s where get_id is null ' % id_table
    cursor.execute(sql)
    data_id=cursor.fetchall()
    cursor.close()  # close before returning; anything after return never runs
    conn.close()
    return data_id
def data_exists(table_name,url_no):
    conn=pymysql.connect('localhost','root','root123',charset='utf8',database='gethtml')
    cursor=conn.cursor()
    sql='select ID from %s where ID="%s"' %(table_name,url_no)
    cursor.execute(sql)
    count=cursor.rowcount  # arraysize is the fetch batch size, not the number of matching rows
    cursor.close()
    conn.close()
    return count
def main():
    data=get_id('hours_url_id')
    start_url='https://xm.anjuke.com/prop/view/'
    count=0
    n=len(data)
    for id in data:
        count+=1
        print('\r','progress: %s/%s' % (count,n),end='')
        count_id=data_exists('hours_data',id[0])  # check the scraped-data table, not the ID table every ID comes from
        if count_id >= 1:
            update_id(id[0])  # already scraped: just flag it
            continue
        try:
            url=start_url+id[0]
            html=get_url(url)
            if html==1:  # other errors: skip
                continue
            elif html==404:  # listing no longer exists
                pass
            elif html==503:
                print('page requires verification: 503')
                break
            else:
                list_data={}  # fresh dict per listing so fields cannot leak between houses
                html_demo(html,list_data)
                list_data['ID']=id[0]
                df=pd.DataFrame(list_data,index=id)
                save_mysql(df)
                update_id(id[0])
        except Exception as e:
            print('error',url,e.args)
            continue
main()
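html_demo reads each attribute as a label/value pair of div tags inside li.houseInfo-detail-item. A self-contained illustration of that parsing; the HTML snippet is an assumed shape for one item, not captured from the site:

import re
from bs4 import BeautifulSoup

snippet = '<li class="houseInfo-detail-item"><div>建筑面积:</div><div>89平方米</div></li>'
li = BeautifulSoup(snippet, 'html.parser').find('li')
# strip both half- and full-width colons, as the pat regex above does
label, value = [re.sub(r'[::]', '', d.text.strip()) for d in li.find_all('div')]
print(label, value)  # 建筑面积 89平方米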
3. Data processing

The data-cleaning step for the house data (the hours_* tables).
import numpy as np
import pandas as pd
import pymysql
def get_hours_data():
conn=pymysql.connect('localhost','root','root123',database='gethtml',charset='utf8')
sql='select * from %s ' % 'hours_data'
df=pd.read_sql(sql,conn,index_col='ID')
conn.close()
return df
def hours_data_extract(df):
    df.drop_duplicates(keep='first',inplace=True)  # drop fully duplicated rows
    df=df[~df.index.duplicated()]  # drop rows with duplicated IDs
    if df['建筑面积'].str[-3:].drop_duplicates(keep='first').count()>1:
        print('建筑面积 units are inconsistent')
    if df['房屋单价'].str[-4:].drop_duplicates(keep='first').count()>1:
        print('房屋单价 units are inconsistent')
    # repair bad rows
    df=df[(df.所在楼层.str[:1]!='共') & (df.所在楼层.str[:1]!='地')]
    # encode the columns as numeric values
    df.建筑面积=df.建筑面积.str[:-3].astype(float)
    df.房屋单价=df.房屋单价.str[:-4].astype(int)
    df.建造年代=df.建造年代.str[:-1].astype(int)
    df.所在位置=df.所在位置.str.split('-').str[0].replace(['思明','湖里','集美','海沧','同安','翔安','厦门周边'],[1,2,3,4,5,6,7])
    df.房屋类型=df.房屋类型.replace(['普通住宅','平房','公寓','暂无'],[1,2,3,4])
    df[['室', '厅', '卫']] = df['房屋户型'].str.extract(r'(\d+)室(\d+)厅(\d+)卫').astype(int)
    df['均面积']=df['建筑面积']/(df['室']+df['厅'])  # average area per room
    df['楼高']=df.loc[:,'所在楼层'].str[4:].str[:-2].astype(int)
    df.楼高=df.楼高.replace([90,74],[9,7])  # repair two known typos ('共90层'/'共74层' should be 9/7 storeys)
    df['层级']=df.loc[:,'所在楼层'].str[:2].replace(['低层','中层','高层'],[1,2,3]).astype(int)
df.装修程度=df.装修程度.replace(['精装修','简单装修','豪华装修','毛坯'],[1,2,3,4])
df.产权年限=df.产权年限.replace(['70年','50年','40年'],[1,2,3])
df.配套电梯=df.配套电梯.replace(['无','有'],[0,1])
df.房本年限=df.房本年限.replace(['满五年','满二年','不满二年','暂无'],[1,2,3,4])
df.产权性质=df.产权性质.replace(['商品房住宅','商住两用','单位集体自建房','使用权','保障性住房','动迁配套房','其他'],[1,2,3,4,5,6,7])
df.唯一住房=df.唯一住房.replace(['是','否','暂无'],[1,0,0])
df.房屋朝向=df.房屋朝向.replace(['东','南','西','北','东西','南北','东南','东北','西南','西北'],[1,2,3,4,5,6,7,8,9,10])
df['price_group']=df.房屋单价//10000
    # drop columns that are not needed downstream
df=df.drop(['所属小区','房屋户型','参考月供','所在楼层','参考首付','销售','房源','服务','评价','工商注册名称','一手房源'],axis=1)
return df
def getdata():
df=get_hours_data()
data=hours_data_extract(df)
return data
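Part 4 imports this file as the module hoursdata, so a typical usage sketch (assuming the file is saved as hoursdata.py) is:

import hoursdata

df = hoursdata.getdata()  # cleaned frame with the engineered columns
print(df[['建筑面积','房屋单价','均面积','楼高','层级']].describe())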
4. Correlation analysis

Analyze the correlations between the variables in the cleaned data.
import hoursdata
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
df=hoursdata.getdata()
#print(df.count())
df=df[(df.产权性质==1) & (df.房屋类型==1)]
#print(df.drop_describe())
#df[['所在位置']]=df[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
#df[['房屋朝向']]=df[['房屋朝向']].apply(preprocessing.LabelEncoder().fit_transform)
print(df[['均面积','建筑面积']].describe())
df_des=df.均面积.describe([.10,.90])
df_1=df[(df['均面积']>=df_des['10%']) & (df['均面积']<=df_des['90%'])].copy()
#df_1=df[(df['建筑面积']>=df_des[4]) & (df['建筑面积']<=df_des[6])].copy()
print('****************\n',df_1[['均面积','建筑面积']].describe())
#plt.show()
plt.hist(df_1.建筑面积,bins=np.linspace(0,500,100))
plt.hist(df_1.均面积,bins=np.linspace(0,100,100))
plt.show()
plt.scatter(df_1.建筑面积,df_1.房屋单价)
plt.xlabel('面积',fontproperties='SimHei')
plt.ylabel('单价',fontproperties='SimHei')
plt.title('价位',fontproperties='SimHei')
plt.show()
plt.scatter(df_1.建筑面积,df_1.卫)
plt.xlabel('面积',fontproperties='SimHei')
plt.ylabel('卫个数',fontproperties='SimHei')
plt.show()
a=df_1.房屋单价.corr(df_1.建筑面积)
print(a)
cor=df_1.corr()
print(type(cor))
cor=cor[abs(cor)>0.3]  # keep only |r| > 0.3
cor=cor.replace(1.0,np.nan).dropna(axis=0,how='all').dropna(axis=1,how='all')  # np.nan, not None: replace(x,None) pad-fills instead of nulling
print(cor[abs(cor)>.30])
#print(df_1)
plt.scatter(df.建造年代,df.楼高)
plt.ylabel('高度',fontproperties='SimHei')
plt.title('年代与高度',fontproperties='SimHei')
plt.xlim(1970,2020)
plt.show()
plt.hist(df.建造年代,bins=np.linspace(1970,2020))
plt.ylabel('在售数量',fontproperties='SimHei')
plt.title('年份出售量',fontproperties='SimHei')
plt.show()
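The 10%/90% trim on 均面积 above can be factored into a small helper; a sketch equivalent to the describe([.10,.90]) bounds used earlier:

def trim_by_quantile(s, lo=0.10, hi=0.90):
    # keep rows whose value lies inside the [lo, hi] quantile band
    ql, qh = s.quantile(lo), s.quantile(hi)
    return (s >= ql) & (s <= qh)

# equivalent to the df_1 filter above:
# df_1 = df[trim_by_quantile(df['均面积'])].copy()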
5. Principal component analysis

PCA reduces the features to two dimensions so the price bands can be inspected visually.
import pandas as pd
import pymysql
import numpy as np
import sklearn.decomposition as dp
import matplotlib.pyplot as plt
from sklearn import preprocessing
def get_data(data_table):
conn=pymysql.connect('localhost','root','root123',charset='utf8',database='gethtml')
sql="select * from %s where 产权性质='商品房住宅'" % data_table
df=pd.read_sql(sql,conn,index_col='ID')
conn.close()
return df
def data_revision(df):
    df=df.drop_duplicates(keep='first')  # drop fully duplicated rows
    df=df[~df.index.duplicated()]  # drop rows with duplicated IDs
    if df['建筑面积'].str[-3:].drop_duplicates(keep='first').count()>1:
        print('建筑面积 units are inconsistent')
    df['建筑面积']=df['建筑面积'].str[:-3]
    df['建筑面积']=df['建筑面积'].astype(float)
    if df['房屋单价'].str[-4:].drop_duplicates(keep='first').count()>1:
        print('房屋单价 units are inconsistent')
    df['房屋单价']=df['房屋单价'].str[:-4]
    df['房屋单价']=df['房屋单价'].astype(int)
    df['建造年代']=df['建造年代'].str[:-1]
    df['建造年代']=df['建造年代'].astype(int)
    df['所在位置']=df['所在位置'].str.split('-').str[0]
    df=df.drop(['参考月供'],axis=1)
return df
def data_pca(df):
    y=df['房屋单价']
    X=df[['建造年代','建筑面积','所在位置']]
    # label-encode the location so PCA gets a numeric matrix
    X[['所在位置']]=X[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
    # bin the unit price into three bands
    bins=[0,20000,50000,100000]
    labels=[1,2,3]
    y=pd.cut(y,bins,right=True,labels=labels)
    pca=dp.PCA(n_components=2)  # reduce to two components
    train_X=pca.fit_transform(X)  # project the raw features
    x_=[]
    y_=[]
    for i in range(3,0,-1):  # one scatter per price band, highest band first
        for j in range(len(train_X)):
            if y.iloc[j]==i:  # iloc: y keeps the ID index, so access must be positional
                x_.append(train_X[j][0])
                y_.append(train_X[j][1])
        plt.scatter(x_,y_)
        x_.clear()
        y_.clear()
    plt.show()
def main():
df=get_data('hours_data')
data=data_revision(df)
data_pca(data)
# for i in data.columns:
# print(data[i].head())
# print(data.count())
main()
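A possible refinement, not applied above: standardize the features before PCA, since 建造年代, 建筑面积 and the encoded 所在位置 sit on very different scales and PCA is scale-sensitive. A sketch:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def pca_2d(X):
    # z-score each column first so no single feature dominates the components
    X_std = StandardScaler().fit_transform(X)
    return PCA(n_components=2).fit_transform(X_std)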
Result images

The dataset contains 30,000+ listings; a few of the resulting plots are kept here as a record.