一、选题背景:
共享单车自2015年起在国内掀起热潮,目前已经逐渐成为一种新的出行方式。在日常生活的短途出行和代步出行中,人们更加倾向于选择共享单车,这样既可以免去等车的时间和花费,也更加方便快捷、经济实惠。共享单车的出现为我们的生活带来了诸多便利。
二、数据说明:
Kaggle的Bike Sharing Demand项目提供了美国某城市2011年到2012年的共享单车数据集,该数据集包括租车日期、租车季节、气温、空气湿度等字段。
三、实施过程及代码:
导入库
1 import numpy as np
2 import pandas as pd
3 import calendar
4 import seaborn as sn
5 import matplotlib.pyplot as plt
查看数据大小
1 # Load the Kaggle train/test CSV files and print each frame's (rows, columns) shape
2 train=pd.read_csv("train.csv")
3 test=pd.read_csv("test.csv")
4 print('训练数据集:',train.shape,'测试数据集:',test.shape)
1 # Preview the first rows of the training set
2 train.head()
1 # Preview the first rows of the test set
2 test.head()
比较上述的两个表,我们可以知道test比train少了“casual”,“registered”,“count”三个字段
查看数据总体情况
1 #查看数据总体信息
2 print('训练集数据信息: ',train.info(),'测试集数据信息: ',test.info())
数据清洗
时间特征处理
1 #时间特征处理
2 #创建一个新的表框
3 periodDf=train[['datetime','season','holiday','workingday','count']]
4 #避免报错
5 periodDf.is_copy = None
6 #日期处理,把日期提取出来(用匿名函数分离出来)
7 periodDf['date']=periodDf['datetime'].apply(lambda x: x.split()[0])
8 periodDf['time']=periodDf['datetime'].apply(lambda x: x.split()[1])
9 periodDf['year']=periodDf['date'].apply(lambda x: x.split('-')[0])
10 periodDf['month']=periodDf['date'].apply(lambda x: x.split('-')[1])
11 periodDf['day']=periodDf['date'].apply(lambda x: x.split('-')[2])
12 periodDf['hour']=periodDf['time'].apply(lambda x: x.split(':')[0])
13 #星期
14 periodDf['weekday']=periodDf['datetime'].apply(lambda x: pd.to_datetime(x).weekday())
15 #看看处理后的periodDf
16 periodDf.head()
绘图
1 #绘图
2 fig1=plt.figure(figsize=(16,4))
3 ax1=plt.subplot(111)
4 df1=periodDf.groupby(['month','year']).sum().unstack()['count']#unstack(),将列索引变为行索引
5 df1.plot(kind='area',ax=ax1,alpha=0.6)
6 ax1.set_title('2011-2012 bikes sharing demand by month')
7 ax1.set_xlabel('Figure 1')
8 ax1.set_xticks(list(range(12)))
9 ax1.set_xticklabels(['Jan','Feb','Mar','Apr','May','June','July','Aug','Sep','Oct','Nov','DeC'])
10 ax1.set_xlim(0,11)
通过上图分析:我们可以看到2012年共享单车的租借数量比2011年是有提升的,一年中6-10月是租借的高峰期。
节假日和非节假日租车情况
1 # Figure 2: box plot of rental counts, holiday vs. non-holiday
2 fig2=plt.figure(figsize=(16,6))
3 ax2=plt.subplot(111)
4 df2=periodDf[['count','holiday']]
5 df2.boxplot(by='holiday',ax=ax2)  # one box per value of 'holiday'
6 ax2.set_title('2011-2012 bike sharing demand by holiday')
7 ax2.set_xlabel('Figure 2')  # the x-label is used as the figure caption
8 ax2.set_xticklabels(['Non holiday','holiday'],rotation='horizontal')  # holiday is coded 0/1, so the boxes appear in this order
9 ax2.set_ylim(0,800)  # fixed y-range; anything above 800 is not shown
工作日和周末的租车情况
1 # Figure 3: box plot of rental counts for each day of the week
2 fig3=plt.figure(figsize=(16,6))
3 ax3=plt.subplot(111)
4 df3=periodDf[['count','weekday']]
5 df3.boxplot(by='weekday',ax=ax3)  # one box per weekday value
6 ax3.set_title('2011-2012 bike sharing demand by weekday')
7 ax3.set_xlabel('Figure 3')  # the x-label is used as the figure caption
8 ax3.set_xticklabels(['Mon','Tue','Wed','Thu','Fri','Sat','Sun'], rotation='horizontal')  # weekday() numbers Monday as 0, so labels run Mon..Sun
9 ax3.set_ylim(0,800)  # fixed y-range; anything above 800 is not shown
租车数量随小时变化趋势(按节假日、工作日区分)
1 #租车数量随季节变化趋势
2 fig5=plt.figure(figsize=(14,4))
3 ax5=plt.subplot(111)
4 df51=periodDf.groupby(['hour','holiday']).mean().unstack()['count'].rename(columns={0:'Non holiday',1:'holiday'})
5 df52=periodDf.groupby(['hour','workingday']).mean().unstack()['count'].rename(columns={0:'weekend',1:'workingday'})
6 df51.plot(ax=ax5,style=':,')
7 df52.plot(ax=ax5,style='-o')
8 ax5.set_title('2011-2012 bike sharing demand by hours')
9 ax5.set_xlabel('figure 5')
10 ax5.set_xticks(list(range(24)))
11 ax5.set_xticklabels(list(range(24)))
12 ax5.set_xlim(0,23)
13 ax5.legend()
14 plt.show()
非时间特征处理
1 # Collect the weather, temperature, humidity and wind-speed columns together with the rental count
2 climateDf=train[['weather','temp','atemp','humidity','windspeed','count']]
3 climateDf=pd.concat([climateDf,periodDf['hour']],axis=1)  # append the (string) hour column, aligned on the row index
1 #查看天气和风速对租车数量的影响
2 fig,axes=plt.subplots(1,2,figsize=(20,6))
3 ax6=plt.subplot(1,2,1)
4 df11=climateDf.groupby('weather').sum()['count']
5 df12=climateDf.groupby('weather').mean()['count']
6 df1=pd.concat([df11,df12],axis=1).reset_index()
7 df1.columns=['weather','sum','mean']
8 df1['sum'].plot(kind='bar',width=0.4,ax=ax6,alpha=0.6,label='')
9 df1['mean'].plot(style='r-',alpha=0.6,ax=ax6,secondary_y=True,label='mean')
10
11 ax6.set_xlabel('weather')
12 ax6.set_xticks(df1.index)
13 ax6.set_xticklabels(['sunny&cloudy','Fog&overcast','light rain&light snow','bad weather'], rotation='horizontal')
14 ax6.set_ylabel('total')
15 ax6.right_ax.set_ylabel('mean')
16 ax6.set_title('2011-2012 bike sharing demand by weather')
17 ax7=plt.subplot(1,2,2)
18
19 df21=climateDf.groupby('windspeed').sum()['count']
20 df22=climateDf.groupby('windspeed').mean()['count']
21 df2=pd.concat([df21,df22],axis=1).reset_index()
22 df2.columns=['windspeed','sum','mean']
23 df2['sum'].plot(kind='area',ax=ax7,alpha=0.7,color='orange',label='')
24 df2['mean'].plot(style='-',alpha=0.6,color='red',ax=ax7,secondary_y=True,label='mean')
25 ax7.set_xlabel('windspeed')
26 ax7.set_ylabel('total')
27 ax7.right_ax.set_ylabel('mean')
28 ax7.set_title('2011-2012 bike sharing demand by windspeed')
29 plt.show()
左图柱状图反映了不同天气下租车总数的变化,大雨大雪大雾这种恶劣天气下最低。折线图反映了各种天气下的平均租车数量,异常的是平均数量在恶劣天气下反而显著增加;
右图反映了随着风速变大,租车的总数量趋向于0,但平均租车数却最高。
1 train[train['weather']==4]  # rows recorded under weather code 4, the 'bad weather' category in the chart above
1 train[train['windspeed']>50]  # rows whose wind speed exceeds 50
通过查看原始数据发现,这类极端情况的记录极少(仅为个例),样本量过小使平均值失去代表性,因此造成了上述异常现象
湿度、温度对租车数量的影响
1 #查看湿度、温度对租车数量的影响
2 fig=plt.subplots(1,2,figsize=(20,8))
3
4 ax1=plt.subplot(1,2,1)
5 df1=climateDf[['humidity','count']]
6 ax1.scatter(df1['humidity'],df1['count'],s=df1['count']/5, c=df1['count'],marker='.',alpha=0.6)
7 ax1.set_title('2011-2012 bike sharing demand by humidity')
8 ax1.set_xlabel('humidity')
9 ax1.set_ylabel('count')
10
11 ax2=plt.subplot(1,2,2)
12 df2=climateDf[['temp','count']]
13 ax2.scatter(df2['temp'],df2['count'],s=df1['count']/5, c=df1['count'],marker='.',alpha=0.6)
14 ax2.set_title('2011-2012 bike sharing demand by temperature')
15 ax2.set_xlabel('temperature')
16 ax2.set_ylabel('count')
17 plt.show()
租车数量最多的湿度在30-40附近;温度过高时租车数量会减少,最适合的温度在25-30左右
租车数量和其它变量的相关性
1 #查看租车数量和其它变量的相关性
2 df=pd.concat([periodDf.iloc[:,-5:].astype(int),train.iloc[:,1:]],axis=1)
3 corrDf=df.corr()
4 mask=np.array(corrDf)
5 mask[np.tril_indices_from(mask)]=False
6 fig=plt.figure(figsize=(15,15))
7 sn.heatmap(corrDf,mask=mask,annot=True,square=True)
8 plt.show()
完整代码
1 import numpy as np
2 import pandas as pd
3 import calendar
4 import seaborn as sn
5 import matplotlib.pyplot as plt
6
7 #查看数据大小
8 train=pd.read_csv("train.csv")
9 test=pd.read_csv("test.csv")
10 print('训练数据集:',train.shape,'测试数据集:',test.shape)
11
12 #查看数据情况
13 train.head()
14 test.head()
15
16 #查看数据总体信息
17 print('训练集数据信息: ',train.info(),'测试集数据信息: ',test.info())
18
19
20 #时间特征处理
21 #创建一个新的表框
22 periodDf=train[['datetime','season','holiday','workingday','count']]
23 #避免报错
24 periodDf.is_copy = None
25 #日期处理,把日期提取出来(用匿名函数分离出来)
26 periodDf['date']=periodDf['datetime'].apply(lambda x: x.split()[0])
27 periodDf['time']=periodDf['datetime'].apply(lambda x: x.split()[1])
28 periodDf['year']=periodDf['date'].apply(lambda x: x.split('-')[0])
29 periodDf['month']=periodDf['date'].apply(lambda x: x.split('-')[1])
30 periodDf['day']=periodDf['date'].apply(lambda x: x.split('-')[2])
31 periodDf['hour']=periodDf['time'].apply(lambda x: x.split(':')[0])
32 #星期
33 periodDf['weekday']=periodDf['datetime'].apply(lambda x: pd.to_datetime(x).weekday())
34 #看看处理后的periodDf
35 periodDf.head()
36
37
38 #绘图
39 fig1=plt.figure(figsize=(16,4))
40 ax1=plt.subplot(111)
41 df1=periodDf.groupby(['month','year']).sum().unstack()['count']#unstack(),将列索引变为行索引
42 df1.plot(kind='area',ax=ax1,alpha=0.6)
43 ax1.set_title('2011-2012 bikes sharing demand by month')
44 ax1.set_xlabel('Figure 1')
45 ax1.set_xticks(list(range(12)))
46 ax1.set_xticklabels(['Jan','Feb','Mar','Apr','May','June','July','Aug','Sep','Oct','Nov','DeC'])
47 ax1.set_xlim(0,11)
48
49
50 # Figure 2: box plot of rental counts, holiday vs. non-holiday
51 fig2=plt.figure(figsize=(16,6))
52 ax2=plt.subplot(111)
53 df2=periodDf[['count','holiday']]
54 df2.boxplot(by='holiday',ax=ax2)  # one box per value of 'holiday'
55 ax2.set_title('2011-2012 bike sharing demand by holiday')
56 ax2.set_xlabel('Figure 2')  # the x-label is used as the figure caption
57 ax2.set_xticklabels(['Non holiday','holiday'],rotation='horizontal')  # holiday is coded 0/1, so the boxes appear in this order
58 ax2.set_ylim(0,800)  # fixed y-range; anything above 800 is not shown
59
60
61 #工作日和周末的租车情况
62 fig3=plt.figure(figsize=(16,6))
63 ax3=plt.subplot(111)
64 df3=periodDf[['count','weekday']]
65 df3.boxplot(by='weekday',ax=ax3)
66 ax3.set_title('2011-2012 bike sharing demand by weekday')
67 ax3.set_xlabel('Figure 3')
68 ax3.set_xticklabels(['Mon','Tue','Wed','Thu','Fri','Sat','Sun'], rotation='horizontal')
69 ax3.set_ylim(0,800)
70 fig4=plt.figure(figsize=(14,4))
71 ax4=plt.subplot(111)
72 df4=periodDf.groupby(['hour', 'season']).mean().unstack()['count']
73 df4.columns=['Spring','Summer','Fall','Winter']
74 df4.plot(ax=ax4, style='--.')
75 ax4.set_title('2011-2012 bike sharing demand by hours')
76 ax4.set_xlabel('Figure 4')
77 ax4.set_xticks(list(range(24)))
78 ax4.set_xticklabels(list(range(24)))
79 ax4.set_xlim(0,23)
80
81
82 #租车数量随季节变化趋势
83 fig5=plt.figure(figsize=(14,4))
84 ax5=plt.subplot(111)
85 df51=periodDf.groupby(['hour','holiday']).mean().unstack()['count'].rename(columns={0:'Non holiday',1:'holiday'})
86 df52=periodDf.groupby(['hour','workingday']).mean().unstack()['count'].rename(columns={0:'weekend',1:'workingday'})
87 df51.plot(ax=ax5,style=':,')
88 df52.plot(ax=ax5,style='-o')
89 ax5.set_title('2011-2012 bike sharing demand by hours')
90 ax5.set_xlabel('figure 5')
91 ax5.set_xticks(list(range(24)))
92 ax5.set_xticklabels(list(range(24)))
93 ax5.set_xlim(0,23)
94 ax5.legend()
95 plt.show()
96
97
98 #天气、温度、湿度、风速信息统计
99 climateDf=train[['weather','temp','atemp','humidity','windspeed','count']]
100
101
102 #查看天气和风速对租车数量的影响
103 fig,axes=plt.subplots(1,2,figsize=(20,6))
104 ax6=plt.subplot(1,2,1)
105 df11=climateDf.groupby('weather').sum()['count']
106 df12=climateDf.groupby('weather').mean()['count']
107 df1=pd.concat([df11,df12],axis=1).reset_index()
108 df1.columns=['weather','sum','mean']
109 df1['sum'].plot(kind='bar',width=0.4,ax=ax6,alpha=0.6,label='')
110 df1['mean'].plot(style='r-',alpha=0.6,ax=ax6,secondary_y=True,label='mean')
111 ax6.set_xlabel('weather')
112 ax6.set_xticks(df1.index)
113 ax6.set_xticklabels(['sunny&cloudy','Fog&overcast','light rain&light snow','bad weather'], rotation='horizontal')
114 ax6.set_ylabel('total')
115 ax6.right_ax.set_ylabel('mean')
116 ax6.set_title('2011-2012 bike sharing demand by weather')
117 ax7=plt.subplot(1,2,2)
118 df21=climateDf.groupby('windspeed').sum()['count']
119 df22=climateDf.groupby('windspeed').mean()['count']
120 df2=pd.concat([df21,df22],axis=1).reset_index()
121 df2.columns=['windspeed','sum','mean']
122 df2['sum'].plot(kind='area',ax=ax7,alpha=0.7,color='orange',label='')
123 df2['mean'].plot(style='-',alpha=0.6,color='red',ax=ax7,secondary_y=True,label='mean')
124 ax7.set_xlabel('windspeed')
125 ax7.set_ylabel('total')
126 ax7.right_ax.set_ylabel('mean')
127 ax7.set_title('2011-2012 bike sharing demand by windspeed')
128 plt.show()
129 climateDf=pd.concat([climateDf,periodDf['hour']],axis=1)
130 train[train['weather']==4]
131 train[train['windspeed']>50]
132
133
134 #查看湿度、温度对租车数量的影响
135 fig=plt.subplots(1,2,figsize=(20,8))
136 ax1=plt.subplot(1,2,1)
137 df1=climateDf[['humidity','count']]
138 ax1.scatter(df1['humidity'],df1['count'],s=df1['count']/5, c=df1['count'],marker='.',alpha=0.6)
139 ax1.set_title('2011-2012 bike sharing demand by humidity')
140 ax1.set_xlabel('humidity')
141 ax1.set_ylabel('count')
142 ax2=plt.subplot(1,2,2)
143 df2=climateDf[['temp','count']]
144 ax2.scatter(df2['temp'],df2['count'],s=df1['count']/5, c=df1['count'],marker='.',alpha=0.6)
145 ax2.set_title('2011-2012 bike sharing demand by temperature')
146 ax2.set_xlabel('temperature')
147 ax2.set_ylabel('count')
148 plt.show()
149
150
151 #查看租车数量和其它变量的相关性
152 df=pd.concat([periodDf.iloc[:,-5:].astype(int),train.iloc[:,1:]],axis=1)
153 corrDf=df.corr()
154 mask=np.array(corrDf)
155 mask[np.tril_indices_from(mask)]=False
156 fig=plt.figure(figsize=(15,15))
157 sn.heatmap(corrDf,mask=mask,annot=True,square=True)
158 plt.show()
总结
经过上述的可视化分析,我们对共享单车租车数据有了大致的把握,对数据特征之间的关系有了初步的了解。季节、小时、月份、是否工作日、天气状况、温度、湿度、风速等特征与总体需求量存在相关性。总的来说,效果还是蛮不错的,后面会再多加强这方面的知识。