目录
1.数据组合
1.1连接
1.1.1添加行
1.1.2添加列
1.1.3不同索引下的连接操作
1.2合并多个数据集
1.2.1一对一合并
1.2.2多对一合并
1.2.3多对多合并
2.缺失数据
2.1查找和统计缺失数据
2.2清理缺失数据
2.2.1重新编码/替换
2.2.2前值填充
2.2.3后值填充
2.2.4插值
2.2.5删除缺失值
2.3缺失值计算
3.整理数据
3.1包含值而非变量的列
3.1.1固定一列
3.1.2固定多列
3.2包含多个变量的列
3.2.1单独拆分和添加列
3.2.2在单个步骤中进行拆分和组合
3.3行与列中的变量
3.4一张表中的多个观测单元(归一化)
3.5跨多张表的观测单元
3.5.1使用循环加载多个文件
3.5.2使用列表推导加载多个文件
1.数据组合
1.1连接
1.1.1添加行
import pandas as pd
df1=pd.read_csv('concat_1.csv')
df2=pd.read_csv('concat_2.csv')
df3=pd.read_csv('concat_3.csv')
print(df1)
print(df2)
print(df3)
#concat将3个dataframe连接在一起,是原始索引的简单堆叠
row_concat=pd.concat([df1,df2,df3])
print(row_concat)
#获取第四行
print(row_concat.iloc[3,])
#将series添加到dataframe,要先将series转化为dataframe
new_row_df=pd.DataFrame([['n1','n2','n3','n4']],columns=['A','B','C','D'])
print(new_row_df)
print(pd.concat([df1,new_row_df]))
#concat连接多个对象,append连接一个对象
print(df1.append(df2))
print(df1.append(new_row_df))
#使用python字典添加数据行
data_dict={'A':'n1','B':'n2','C':'n3','D':'n4'}
print(df1.append(data_dict,ignore_index=True))
#当向dataframe添加一个字典时,必须使用参数ignore_index
#简单的连接或追加数据,可以使用ignore_index参数重置连接后的行索引
row_concat_i=pd.concat([df1,df2,df3],ignore_index=True)
print(row_concat_i)
1.1.2添加列
#axis=0.按行连接数据;axis=1,按列连接数据
col_concat=pd.concat([df1,df2,df3],axis=1)
print(col_concat)
#按列明提取数据子集
print(col_concat['A'])
#直接添加一列
col_concat['new_col_list']=['n1','n2','n3','n4']
print(col_concat)
col_concat['new_col_series']=pd.Series(['n1','n2','n3','n4'])
print(col_concat)
#ignore_index重置列索引
print(pd.concat([df1,df2,df3],axis=1,ignore_index=True))
1.1.3不同索引下的连接操作
1连接具有不同列的行
# 1连接具有不同列的行
df1.columns=['A','B','C','D']
df2.columns=['E','F','G','H']
df3.columns=['A','C','F','H']
print(df1)
print(df2)
print(df3)
row_concat=pd.concat([df1,df2,df3])
print(row_concat)
print(pd.concat([df1,df2,df3],join='inner'))
print(pd.concat([df1,df3],ignore_index=False,join='inner'))
2.连接具有不同行的列
df1.index=[0,1,2,3]
df2.index=[4,5,6,7]
df3.index=[0,2,5,7]
print(df1)
print(df2)
print(df3)
col_concat=pd.concat([df1,df2,df3],axis=1)
print(col_concat)
print(pd.concat([df1,df3],axis=1,join='inner'))
1.2合并多个数据集
person=pd.read_csv('survey_person.csv')
site=pd.read_csv('survey_site.csv')
survey=pd.read_csv('survey_survey.csv')
visited=pd.read_csv('survey_visited.csv')
print(person)
print(site)
print(survey)
print(visited)
1.2.1一对一合并
visited_subset=visited.loc[[0,2,6]]
#merge函数的参数how默认值为inner
o2o_merge=site.merge(visited_subset,left_on='name',right_on='site')
print(o2o_merge)
1.2.2多对一合并
m2o_merge=site.merge(visited,left_on='name',right_on='site')
print(m2o_merge)
1.2.3多对多合并
ps=person.merge(survey,left_on='ident',right_on='person')
vs=visited.merge(survey,left_on='ident',right_on='taken')
print(ps)
print(vs)
ps_vs=ps.merge(vs,left_on=['ident','taken','quant','reading'],
right_on=['person','ident','quant','reading'])
#print(ps_vs)
print(ps_vs.loc[0,])
2.缺失数据
2.1查找和统计缺失数据
ebola=pd.read_csv('country_timeseries.csv')
#统计非缺失值的个数
print(ebola.count())
num_rows=ebola.shape[0]
num_missing=num_rows-ebola.count()
print(num_missing)
#统计缺失值的总数或特定列中缺失值的个数,count_nonzero函数和isnull方法
import numpy as np
print(np.count_nonzero(ebola.isnull()))
print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))
#value_counts方法获取缺失值个数
print(ebola.Cases_Guinea.value_counts(dropna=False).head())
2.2清理缺失数据
2.2.1重新编码/替换
#把缺失值重新编码为0
print(ebola.fillna(0).iloc[0:10,0:5])
2.2.2前值填充
print(ebola.fillna(method='ffill').iloc[0:30,0:5])
2.2.3后值填充
print(ebola.fillna(method='bfill').iloc[:,0:5].tail())
2.2.4插值
print(ebola.interpolate().iloc[0:10,0:5])
2.2.5删除缺失值
print(ebola.shape)
ebola_dropna=ebola.dropna()
print(ebola_dropna.shape)
2.3缺失值计算
ebola['Cases_multiple']=ebola['Cases_Guinea']+ebola['Cases_Liberia']+ebola['Cases_SierraLeone']
ebola_subset=ebola.loc[:,['Cases_Guinea','Cases_Liberia','Cases_SierraLeone','Cases_multiple']]
print(ebola_subset.head(n=10))
print(ebola.Cases_Guinea.sum(skipna=True))
print(ebola.Cases_Guinea.sum(skipna=False))
3.整理数据
3.1包含值而非变量的列
3.1.1固定一列
import pandas as pd
pew=pd.read_csv('pew.csv')
#只显示前几列
print(pew.iloc[:,0:6])
#melt函数可以把dataframe重塑成整洁的数据格式
pew_long=pd.melt(pew,id_vars='religion')
print(pew_long)
pew_long=pd.melt(pew,id_vars='religion',var_name='income',value_name='count')
print(pew_long)
3.1.2固定多列
billboard=pd.read_csv('billboard.csv')
#查看前几行和前几列
print(billboard.iloc[0:5,0:16])
billboard_long=pd.melt(billboard,id_vars=['year','artist','track','time','date.entered'],var_name='week',value_name='rating')
print(billboard_long.head())
3.2包含多个变量的列
ebola=pd.read_csv('country_timeseries.csv')
print(ebola.columns)
#输出所选行
print(ebola.iloc[:5,[0,1,2,3,10,11]])
ebola_long=pd.melt(ebola,id_vars=['Date','Day'])
print(ebola_long)
3.2.1单独拆分和添加列
#获取variable列
#访问字符串方法
#依据分隔符拆分列
variable_split=ebola_long.variable.str.split('_')
print(variable_split[:5])
#整个容器
print(type(variable_split))
#容器中的第一个元素
print(type(variable_split[0]))
#使用get方法对各行取想要的索引
status_values=variable_split.str.get(0)
country_values=variable_split.str.get(1)
print(status_values[5:])
print(country_values[-5:])
ebola_long['status']=status_values
ebola_long['country']=country_values
print(ebola_long.head())
3.2.2在单个步骤中进行拆分和组合
variable_split=ebola_long.variable.str.split('_',expand=True)
variable_split.columns=['status','country']
ebola_parsed=pd.concat([ebola_long,variable_split],axis=1)
print(ebola_parsed)
ebola_long['status'],ebola_long['country']=zip(*ebola_long.variable.str.split('_'))
print(ebola_long.head())
3.3行与列中的变量
weather=pd.read_csv('weather.csv')
print(weather.iloc[:5,:11])
weather_melt=pd.melt(weather,id_vars=['id','year','month','element'],var_name='day',value_name='temp')
print(weather_melt)
weather_tidy=weather_melt.pivot_table(index=['id','year','month','day'],columns='element',values='temp')
weather_tidy_flat=weather_tidy.reset_index()
print(weather_tidy_flat.head())
weather_tidy=weather_melt.pivot_table(index=['id','year','month','day'],columns='element',values='temp').reset_index()
print(weather_tidy.head())
3.4一张表中的多个观测单元(归一化)
print(billboard_long.head())
billboard_songs=billboard_long[['year','artist','track','time']]
print(billboard_songs.shape)
billboard_songs=billboard_songs.drop_duplicates()
print(billboard_songs.shape)billboard_songs['id']=range(len(billboard_songs))
print(billboard_songs.head(10))
#把歌曲dataframe合并到源数据集
billboard_ratings=billboard_long.merge(billboard_songs,on=['year','artist','track','time'])
print(billboard_ratings.shape)
print(billboard_ratings.head())
billboard_ratings=billboard_ratings[['id','date.entered','week','rating']]
print(billboard_ratings.head())
3.5跨多张表的观测单元
import os
import urllib
#只下载前2个数据集
with open('raw_data_urls.txt','r') as data_urls:
for line,url in enumerate(data_urls):
if line==2:
break
fn=url.split('/')[-1].strip()
fp=os.path.join(fn)
print(url)
print(fp)
urllib.request.urlretrieve(url,fp)
import glob
nyc_taxi_data=glob.glob('fhv_*')
print(nyc_taxi_data)
taxi1=pd.read_csv(nyc_taxi_data[0])
taxi2=pd.read_csv(nyc_taxi_data[1])
print(taxi1.head(n=2))
print(taxi2.head(n=2))
print(taxi1.shape)
print(taxi2.shape)
taxi=pd.concat([taxi1,taxi2])
print(taxi.shape)
3.5.1使用循环加载多个文件
#创建一个待添加元素的空列表
list_taxi_df=[]
#循环遍历每个csv文件名
for csv_filename in nyc_taxi_data:
#可以选择输出文件名以便调试
print(csv_filename)
#把csv文件加载到dataframe中
df=pd.read_csv(csv_filename)
#把dataframe添加到列表中
list_taxi_df.append(df)
#输出列表中dataframe的个数
print(len(list_taxi_df))
#第一个元素的类型
print(type(list_taxi_df[0]))
#查看第一个dataframe的前5行数据
print(list_taxi_df[0].head())
taxi_loop_concat=pd.concat(list_taxi_df)
print(taxi_loop_concat.shape)
#手动加载连接和循环加载连接得到的结果一样吗
print(taxi.equals(taxi_loop_concat))
3.5.2使用列表推导加载多个文件
#不带注释的循环代码
list_taxi_df=[]
for csv_filename in nyc_taxi_data:
df=pd.read_csv(csv_filename)
list_taxi_df.append(df)
#使用列表推导式重写
list_taxi_df_comp=[pd.read_csv(data) for data in nyc_taxi_data]
print(type(list_taxi_df))
taxi_loop_concat_comp=pd.concat(list_taxi_df_comp)
print(taxi_loop_concat_comp.equals(taxi_loop_concat))