House Price Data Scraping and Analysis
- 1. Scraping the URLs
- 2. Scraping house details from the URLs
- 3. Data processing
- 4. Correlation analysis
- 5. Principal component analysis
- Result images
The code is uploaded purely as a record; there is plenty of room for improvement.
1. Scraping the URLs

Crawl all listing URLs:
1. Fetch the district links.
2. From each district link, fetch the road links.
3. If a road still has more than 5 result pages, fetch the price-band links as well.

Every URL harvested is stored in the database with a hierarchical level number (see the sketch after the table below):
| Level number | Category | URL |
| --- | --- | --- |
| 1 | City | URL |
| 1-1 | District | URL 1 |
| 1-1-1 | Road | URL 2 |
| 1-1-1-1 | Price band | URL 3 |
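Each entry that get_url collects maps a page URL to a (name, url, level number, page count) tuple. A minimal sketch of one stored entry, using a seed row that also appears in main_1 below:

# key = page URL; value = (display name, URL, hierarchical level number, page count)
list_url = {
    'https://xm.anjuke.com/sale/siming/': ('思明', 'https://xm.anjuke.com/sale/siming/', '1-1', 8),
}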
import requests
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings()  # suppress HTTPS certificate warnings
from sqlalchemy import create_engine
import pandas as pd
def gethtml(url):
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
try:
r=requests.get(url,headers=headers,timeout=3,verify=False)
r.raise_for_status()
r.encoding='utf8'
return r.text
except Exception as e:
print('error of ' , e.args)
return ''
def get_url(url,list_url,M):
html=gethtml(url[1])
demo=BeautifulSoup(html,'html.parser')
div=demo.find_all('div',attrs={'class':'items'})
    try:
        if M==1:
            hf=div[0].find_all('a')  # district links
        elif M==2:
            hf=div[0].find('div',attrs={'class':'sub-items'}).find_all('a')  # road links
        elif M==3:
            hf=div[1].find_all('a')  # price-band links
        elif M==4:
            hf=div[2].find_all('a')  # area links
        elif M==5:
            hf=div[3].find_all('a')  # layout links
        else:
            pass
    except Exception as e:
        print('error',e.args)
    # work out the number of result pages
    page=999
    try:
        page=len(demo.find('div',attrs={'class':'sale-left'}).find('div',attrs={'class':'multi-page'}).find_all(['a','i']))-2
    except:
        try:
            num=len(demo.find('div',attrs={'class':'sale-left'}).find('ul',attrs={'id':'houselist-mod-new'}).find_all('a'))  # number of listings shown after filtering
            if num==0:
                page=0
            else:
                page=1
        except:
            page=0
if page>0:
try:
count=0
for i in hf:
count=count+1
list_url[i.attrs['href']]=(i.text,i.attrs['href'],url[2]+'-'+str(count),page)
except Exception as e:
print('error',e.args)
def save_mysql(data,table_url='hours_url_6'):
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')  # create the DB engine
    df=pd.DataFrame(data).T
    df.columns=['name','url','num','page']
    pd.io.sql.to_sql(df,table_url,con=conn,if_exists='append',index=False)  # write to MySQL; if_exists can be 'replace' or 'append'
    conn.dispose()
def main():
    # categories: {'district':0,'price':1,'area':2,'layout':3}
    # list_url = {key: [title, url, url_no, page]}, where key = url
    url='https://xm.anjuke.com/sale/'
    url_no='1'  # hierarchical level number of the URL
    ls_1={}
    list_url={}
    get_url(['城市',url,url_no],list_url,1)  # fetch the district links
    save_mysql(list_url)
ls=list_url.copy()
for depth in range(2,6):
list_url.clear()
ls_1.clear()
count=0
for i in ls:
count+=1
url=ls[i]
get_url(url,list_url,depth)
            if len(list_url)>0:
                save_mysql(list_url,'hours_url_8')  # the get_id flag lives in the ID table used in part 2, not here
for k in list_url:
if list_url[k][3]>5:
ls_1[k]=list_url[k]
list_url.clear()
            print('\r','depth: %s, progress: %s/%s' % (depth,count,len(ls)),end='')
        print('\r','depth: %s, URLs with more than 5 pages: %s' % (depth,len(ls_1)))
ls=ls_1.copy()
def main_1():
    # resume the crawl from specific districts (already-finished ones are commented out)
    list_url={#'https://xm.anjuke.com/sale/siming/': ('思明', 'https://xm.anjuke.com/sale/siming/', '1-1', 8),
#'https://xm.anjuke.com/sale/huli/': ('湖里', 'https://xm.anjuke.com/sale/huli/', '1-2', 8),
#'https://xm.anjuke.com/sale/jimei/': ('集美', 'https://xm.anjuke.com/sale/jimei/', '1-3', 8),
#'https://xm.anjuke.com/sale/haicang/': ('海沧', 'https://xm.anjuke.com/sale/haicang/', '1-4', 8),
#'https://xm.anjuke.com/sale/tongana/': ('同安', 'https://xm.anjuke.com/sale/tongana/', '1-5', 8),
'https://xm.anjuke.com/sale/xiangana/': ('翔安', 'https://xm.anjuke.com/sale/xiangana/', '1-6', 8),
'https://xm.anjuke.com/sale/xiamenzhoubian/': ('厦门周边', 'https://xm.anjuke.com/sale/xiamenzhoubian/', '1-7', 8)}
ls=list_url.copy()
ls_1={}
list_url.clear()
for depth in range(2,6):
list_url.clear()
ls_1.clear()
count=0
for i in ls:
count+=1
url=ls[i]
get_url(url,list_url,depth)
if len(list_url)>0:
save_mysql(list_url,'hours_url_9')
for k in list_url:
if list_url[k][3]>5:
ls_1[k]=list_url[k]
list_url.clear()
            print('\r','depth: %s, progress: %s/%s' % (depth,count,len(ls)),end='')
        print('\r','depth: %s, URLs with more than 5 pages: %s' % (depth,len(ls_1)))
ls=ls_1.copy()
main()
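To sanity-check what main() wrote, the URL table can be read back over the same connection string. A sketch, assuming the hours_url_8 table used above:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')
check = pd.read_sql('select name, url, num, page from hours_url_8', con=engine)  # columns as set by save_mysql
print(check.head())
engine.dispose()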
2. Scraping house details from the URLs

Fetch each listing's details from its ID, first checking whether the data has already been scraped.
import pandas as pd
import pymysql
from bs4 import BeautifulSoup
import requests
import re
from sqlalchemy import create_engine
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def get_url(url):
try:
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
req=requests.get(url,headers=headers,timeout=3,verify=False)
req.raise_for_status()
req.encoding='utf-8'
return req.text
    except requests.HTTPError:
        # raise_for_status() fired, so the response object exists and has a status code
        error_no=req.status_code
        if error_no==404 or error_no==503:
            return error_no
        print('error','HTTP status',error_no)
        return 1
    except Exception as e:
        # timeouts / connection errors leave no response to inspect
        print('error',e.args)
        return 1
def html_demo(html,list_data):
pat=re.compile(r'[\:]')
demo=BeautifulSoup(html,'html.parser')
    div=demo.find('div',attrs={'class':'wrapper'})
    # property attributes
    li=demo.find_all('li',attrs={'class':'houseInfo-detail-item'})
for i in li:
div=i.find_all('div')
count=1
for j in div:
if count==1:
a=''.join(pat.sub('',j.text.strip()).split())
elif count==2:
b=''.join(pat.sub('',j.text.strip()).split())
list_data[a]=b
else:
continue
count+=1
    # agent info
    b_info=demo.find('div',attrs={'class':'broker-wrap'})
    brokercard_name=b_info.find('div',attrs={'class':'brokercard-name'}).text.split()[0]
    list_data['销售']=brokercard_name
    # ratings: three label/value pairs of <em> tags
em=b_info.find_all('em')
for i in range(3):
list_data[pat.sub('',em[i*2].text)]=em[i*2+1].text
    # agency name
try:
gs_name=b_info.find('a',attrs={'class':'text-overflow'}).attrs['title'].split(':')
list_data[gs_name[0]]=gs_name[1]
except:
pass
def save_mysql(df,database='hours_data'):
    conn = create_engine('mysql+pymysql://root:root123@localhost:3306/gethtml?charset=utf8')  # create the DB engine
    pd.io.sql.to_sql(df,database,con=conn,if_exists='append',index=False)  # write to MySQL; if_exists can be 'replace' or 'append'
conn.dispose()
def update_id(id_num,database='hours_url_id'):
conn=pymysql.connect('localhost','root','root123',charset='utf8',database='gethtml')
cursor=conn.cursor()
sql="update %s set get_id=1 where ID='%s'" %(database,id_num)
cursor.execute(sql)
conn.commit()
cursor.close()
conn.close()
def get_id(id_table):
    conn=pymysql.connect('localhost','root','root123',database='gethtml',charset='utf8')
    cursor=conn.cursor()
    sql='select distinct ID from %s where get_id is null ' % id_table
    cursor.execute(sql)
    data_id=cursor.fetchall()
    cursor.close()  # close before returning; anything after return never runs
    conn.close()
    return data_id
def data_exists(table_name,url_no):
    conn=pymysql.connect('localhost','root','root123',charset='utf8',database='gethtml')
    cursor=conn.cursor()
    sql='select ID from %s where ID="%s"' %(table_name,url_no)
    cursor.execute(sql)
    count=cursor.rowcount  # arraysize is the fetch batch size, not the number of matching rows
    cursor.close()
    conn.close()
    return count
def main():
    data=get_id('hours_url_id')
    start_url='https://xm.anjuke.com/prop/view/'
    count=0
    n=len(data)
    for id in data:
        count+=1
        print('\r','progress: %s/%s' % (count,n),end='')
        count_id=data_exists('hours_data',id[0])  # check the scraped-data table, not the ID table every ID comes from
        if count_id >= 1:
            update_id(id[0])  # already scraped: just flag it
            continue
        try:
            url=start_url+id[0]
            html=get_url(url)
            if html==1:  # other errors: skip
                continue
            elif html==404:  # listing no longer exists
                pass
            elif html==503:
                print('page requires verification: 503')
                break
            else:
                list_data={}  # fresh dict per listing so fields cannot leak between houses
                html_demo(html,list_data)
                list_data['ID']=id[0]
                df=pd.DataFrame(list_data,index=id)
                save_mysql(df)
                update_id(id[0])
        except Exception as e:
            print('error',url,e.args)
            continue
main()
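html_demo reads each attribute as a label/value pair of div tags inside li.houseInfo-detail-item. A self-contained illustration of that parsing; the HTML snippet is an assumed shape for one item, not captured from the site:

import re
from bs4 import BeautifulSoup

snippet = '<li class="houseInfo-detail-item"><div>建筑面积:</div><div>89平方米</div></li>'
li = BeautifulSoup(snippet, 'html.parser').find('li')
# strip both half- and full-width colons, as the pat regex above does
label, value = [re.sub(r'[::]', '', d.text.strip()) for d in li.find_all('div')]
print(label, value)  # 建筑面积 89平方米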
3. Data processing

The data-cleaning step for the house data (the hours_* tables).
import numpy as np
import pandas as pd
import pymysql
def get_hours_data():
conn=pymysql.connect('localhost','root','root123',database='gethtml',charset='utf8')
sql='select * from %s ' % 'hours_data'
df=pd.read_sql(sql,conn,index_col='ID')
conn.close()
return df
def hours_data_extract(df):
    df.drop_duplicates(keep='first',inplace=True)  # drop fully duplicated rows
    df=df[~df.index.duplicated()]  # drop rows with duplicated IDs
    if df['建筑面积'].str[-3:].drop_duplicates(keep='first').count()>1:
        print('建筑面积 units are inconsistent')
    if df['房屋单价'].str[-4:].drop_duplicates(keep='first').count()>1:
        print('房屋单价 units are inconsistent')
    # repair bad rows
    df=df[(df.所在楼层.str[:1]!='共') & (df.所在楼层.str[:1]!='地')]
    # encode the columns as numeric values
    df.建筑面积=df.建筑面积.str[:-3].astype(float)
    df.房屋单价=df.房屋单价.str[:-4].astype(int)
    df.建造年代=df.建造年代.str[:-1].astype(int)
    df.所在位置=df.所在位置.str.split('-').str[0].replace(['思明','湖里','集美','海沧','同安','翔安','厦门周边'],[1,2,3,4,5,6,7])
    df.房屋类型=df.房屋类型.replace(['普通住宅','平房','公寓','暂无'],[1,2,3,4])
    df[['室', '厅', '卫']] = df['房屋户型'].str.extract(r'(\d+)室(\d+)厅(\d+)卫').astype(int)
    df['均面积']=df['建筑面积']/(df['室']+df['厅'])  # average area per room
    df['楼高']=df.loc[:,'所在楼层'].str[4:].str[:-2].astype(int)
    df.楼高=df.楼高.replace([90,74],[9,7])  # repair two known typos ('共90层'/'共74层' should be 9/7 storeys)
    df['层级']=df.loc[:,'所在楼层'].str[:2].replace(['低层','中层','高层'],[1,2,3]).astype(int)
df.装修程度=df.装修程度.replace(['精装修','简单装修','豪华装修','毛坯'],[1,2,3,4])
df.产权年限=df.产权年限.replace(['70年','50年','40年'],[1,2,3])
df.配套电梯=df.配套电梯.replace(['无','有'],[0,1])
df.房本年限=df.房本年限.replace(['满五年','满二年','不满二年','暂无'],[1,2,3,4])
df.产权性质=df.产权性质.replace(['商品房住宅','商住两用','单位集体自建房','使用权','保障性住房','动迁配套房','其他'],[1,2,3,4,5,6,7])
df.唯一住房=df.唯一住房.replace(['是','否','暂无'],[1,0,0])
df.房屋朝向=df.房屋朝向.replace(['东','南','西','北','东西','南北','东南','东北','西南','西北'],[1,2,3,4,5,6,7,8,9,10])
df['price_group']=df.房屋单价//10000
    # drop columns that are not needed downstream
df=df.drop(['所属小区','房屋户型','参考月供','所在楼层','参考首付','销售','房源','服务','评价','工商注册名称','一手房源'],axis=1)
return df
def getdata():
df=get_hours_data()
data=hours_data_extract(df)
return data
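Part 4 imports this file as the module hoursdata, so a typical usage sketch (assuming the file is saved as hoursdata.py) is:

import hoursdata

df = hoursdata.getdata()  # cleaned frame with the engineered columns
print(df[['建筑面积','房屋单价','均面积','楼高','层级']].describe())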
4. Correlation analysis

Analyze the correlations between the variables in the cleaned data.
import hoursdata
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
df=hoursdata.getdata()
#print(df.count())
df=df[(df.产权性质==1) & (df.房屋类型==1)]
#print(df.drop_describe())
#df[['所在位置']]=df[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
#df[['房屋朝向']]=df[['房屋朝向']].apply(preprocessing.LabelEncoder().fit_transform)
print(df[['均面积','建筑面积']].describe())
df_des=df.均面积.describe([.10,.90])
df_1=df[(df['均面积']>=df_des['10%']) & (df['均面积']<=df_des['90%'])].copy()
#df_1=df[(df['建筑面积']>=df_des[4]) & (df['建筑面积']<=df_des[6])].copy()
print('****************\n',df_1[['均面积','建筑面积']].describe())
#plt.show()
plt.hist(df_1.建筑面积,bins=np.linspace(0,500,100))
plt.hist(df_1.均面积,bins=np.linspace(0,100,100))
plt.show()
plt.scatter(df_1.建筑面积,df_1.房屋单价)
plt.xlabel('面积',fontproperties='SimHei')
plt.ylabel('单价',fontproperties='SimHei')
plt.title('价位',fontproperties='SimHei')
plt.show()
plt.scatter(df_1.建筑面积,df_1.卫)
plt.xlabel('面积',fontproperties='SimHei')
plt.ylabel('卫个数',fontproperties='SimHei')
plt.show()
a=df_1.房屋单价.corr(df_1.建筑面积)
print(a)
cor=df_1.corr()
print(type(cor))
cor=cor[abs(cor)>0.3]  # keep only |r| > 0.3
cor=cor.replace(1.0,np.nan).dropna(axis=0,how='all').dropna(axis=1,how='all')  # np.nan, not None: replace(x,None) pad-fills instead of nulling
print(cor[abs(cor)>.30])
#print(df_1)
plt.scatter(df.建造年代,df.楼高)
plt.ylabel('高度',fontproperties='SimHei')
plt.title('年代与高度',fontproperties='SimHei')
plt.xlim(1970,2020)
plt.show()
plt.hist(df.建造年代,bins=np.linspace(1970,2020))
plt.ylabel('在售数量',fontproperties='SimHei')
plt.title('年份出售量',fontproperties='SimHei')
plt.show()
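The 10%/90% trim on 均面积 above can be factored into a small helper; a sketch equivalent to the describe([.10,.90]) bounds used earlier:

def trim_by_quantile(s, lo=0.10, hi=0.90):
    # keep rows whose value lies inside the [lo, hi] quantile band
    ql, qh = s.quantile(lo), s.quantile(hi)
    return (s >= ql) & (s <= qh)

# equivalent to the df_1 filter above:
# df_1 = df[trim_by_quantile(df['均面积'])].copy()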
5. Principal component analysis

PCA reduces the features to two dimensions so the price bands can be inspected visually.
import pandas as pd
import pymysql
import numpy as np
import sklearn.decomposition as dp
import matplotlib.pyplot as plt
from sklearn import preprocessing
def get_data(data_table):
conn=pymysql.connect('localhost','root','root123',charset='utf8',database='gethtml')
sql="select * from %s where 产权性质='商品房住宅'" % data_table
df=pd.read_sql(sql,conn,index_col='ID')
conn.close()
return df
def data_revision(df):
    df=df.drop_duplicates(keep='first')  # drop fully duplicated rows
    df=df[~df.index.duplicated()]  # drop rows with duplicated IDs
    if df['建筑面积'].str[-3:].drop_duplicates(keep='first').count()>1:
        print('建筑面积 units are inconsistent')
    df['建筑面积']=df['建筑面积'].str[:-3]
    df['建筑面积']=df['建筑面积'].astype(float)
    if df['房屋单价'].str[-4:].drop_duplicates(keep='first').count()>1:
        print('房屋单价 units are inconsistent')
    df['房屋单价']=df['房屋单价'].str[:-4]
    df['房屋单价']=df['房屋单价'].astype(int)
    df['建造年代']=df['建造年代'].str[:-1]
    df['建造年代']=df['建造年代'].astype(int)
    df['所在位置']=df['所在位置'].str.split('-').str[0]
    df=df.drop(['参考月供'],axis=1)
return df
def data_pca(df):
    y=df['房屋单价']
    X=df[['建造年代','建筑面积','所在位置']]
    # label-encode the location so PCA gets a numeric matrix
    X[['所在位置']]=X[['所在位置']].apply(preprocessing.LabelEncoder().fit_transform)
    # bin the unit price into three bands
    bins=[0,20000,50000,100000]
    labels=[1,2,3]
    y=pd.cut(y,bins,right=True,labels=labels)
    pca=dp.PCA(n_components=2)  # reduce to two components
    train_X=pca.fit_transform(X)  # project the raw features
    x_=[]
    y_=[]
    for i in range(3,0,-1):  # one scatter per price band, highest band first
        for j in range(len(train_X)):
            if y.iloc[j]==i:  # iloc: y keeps the ID index, so access must be positional
                x_.append(train_X[j][0])
                y_.append(train_X[j][1])
        plt.scatter(x_,y_)
        x_.clear()
        y_.clear()
    plt.show()
def main():
df=get_data('hours_data')
data=data_revision(df)
data_pca(data)
# for i in data.columns:
# print(data[i].head())
# print(data.count())
main()
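A possible refinement, not applied above: standardize the features before PCA, since 建造年代, 建筑面积 and the encoded 所在位置 sit on very different scales and PCA is scale-sensitive. A sketch:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def pca_2d(X):
    # z-score each column first so no single feature dominates the components
    X_std = StandardScaler().fit_transform(X)
    return PCA(n_components=2).fit_transform(X_std)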
Result images

The dataset contains 30,000+ listings; a few of the resulting plots are kept here as a record.