数据分析项目-合集-day02
需求:
- 导入文件,查看原始数据
- 将人口数据和各州的简称数据进行合并
- 将合并的数据中重复的abbreviation进行删除
- 查看存在缺失数据的列
- 找到有哪些state/region使得state的值为NaN,进行取重操作
- 为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
- 合并各州面积数据areas
- 我们会发现area(sq.m)这一列有缺失数据,找出是哪行
- 去除含有缺失数据的行
- 找出2010年的全民人口数据
- 计算各州的人口密度
- 排序,并找出人口密度最高的州
import numpy as np
import pandas as pd
from pandas import DataFrame
import shutil
#导入其他文件夹的源数据,导入一次即可
#shutil.copytree(r"C:\01-工作相关\第2阶段-03-21-数据分析资料\数据分析项目-合集",r"数据分析项目-合集")
#导入文件,查看原始数据
abb=pd.read_csv("../data/state-abbrevs.csv") #state(州的全称) #abbreviation(州的简称)
abb.head()
state | abbreviation | |
0 | Alabama | AL |
1 | Alaska | AK |
2 | Arizona | AZ |
3 | Arkansas | AR |
4 | California | CA |
area=pd.read_csv("../data/state-areas.csv") #state(州的全称) #area(州的面积)
area.head()
state | area (sq. mi) | |
0 | Alabama | 52423 |
1 | Alaska | 656425 |
2 | Arizona | 114006 |
3 | Arkansas | 53182 |
4 | California | 163707 |
pop=pd.read_csv("../data/state-population.csv") #state/region(州的简称) #age年龄 #year时间 #population人口
pop.head()
state/region | ages | year | population | |
0 | AL | under18 | 2012 | 1117489.0 |
1 | AL | total | 2012 | 4817528.0 |
2 | AL | under18 | 2010 | 1130966.0 |
3 | AL | total | 2010 | 4785570.0 |
4 | AL | under18 | 2011 | 1125763.0 |
#将人口数据和各州的简称数据进行合并
abb_pop=pd.merge(abb,pop,left_on="abbreviation",right_on="state/region",how="outer")
abb_pop.head()
state | abbreviation | state/region | ages | year | population | |
0 | Alabama | AL | AL | under18 | 2012 | 1117489.0 |
1 | Alabama | AL | AL | total | 2012 | 4817528.0 |
2 | Alabama | AL | AL | under18 | 2010 | 1130966.0 |
3 | Alabama | AL | AL | total | 2010 | 4785570.0 |
4 | Alabama | AL | AL | under18 | 2011 | 1125763.0 |
#将合并的数据中重复的abbreviation进行删除
abb_pop.drop(labels="abbreviation",axis=1,inplace=True)
abb_pop.head()
state | state/region | ages | year | population | |
0 | Alabama | AL | under18 | 2012 | 1117489.0 |
1 | Alabama | AL | total | 2012 | 4817528.0 |
2 | Alabama | AL | under18 | 2010 | 1130966.0 |
3 | Alabama | AL | total | 2010 | 4785570.0 |
4 | Alabama | AL | under18 | 2011 | 1125763.0 |
#查看存在缺失数据的列
#方式1:isnull, notll,any,all
abb_pop.isnull().any(axis=0)
#state,population这两列是存在空值的
state True
state/region False
ages False
year False
population True
dtype: bool
#方式2:
abb_pop.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 state 2448 non-null object
1 state/region 2544 non-null object
2 ages 2544 non-null object
3 year 2544 non-null int64
4 population 2524 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB
#找到有哪些state/region使得state的值为NaN,进行取重操作(将state中的空值对应的简称找到,且对简称进行去重)
abb_pop
state | state/region | ages | year | population | |
0 | Alabama | AL | under18 | 2012 | 1117489.0 |
1 | Alabama | AL | total | 2012 | 4817528.0 |
2 | Alabama | AL | under18 | 2010 | 1130966.0 |
3 | Alabama | AL | total | 2010 | 4785570.0 |
4 | Alabama | AL | under18 | 2011 | 1125763.0 |
... | ... | ... | ... | ... | ... |
2539 | NaN | USA | total | 2010 | 309326295.0 |
2540 | NaN | USA | under18 | 2011 | 73902222.0 |
2541 | NaN | USA | total | 2011 | 311582564.0 |
2542 | NaN | USA | under18 | 2012 | 73708179.0 |
2543 | NaN | USA | total | 2012 | 313873685.0 |
2544 rows × 5 columns
#思路:可以将state这一列中的空值对应的行数据取出,从该行数据中就可以取出简称的值
#1.将state中的空值定位
abb_pop["state"].isnull()
#2.将上述布尔值作为源数据的行索引
abb_pop.loc[abb_pop["state"].isnull()] #将state中空对应的行数据取出
#3.将简称取出
abb_pop.loc[abb_pop["state"].isnull()]["state/region"]
#4.对简称去重
abb_pop.loc[abb_pop["state"].isnull()]["state/region"].unique()
#结论:只有PR和USA对应的全称数据为空值
array(['PR', 'USA'], dtype=object)
#为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
#思考:填充该需求中的空值可不可以使用fillna?
#- 不可以。fillna可以使用空的紧邻值做填充。fillna(value="xxx")使用指定的值填充空值
#(而这里需要填充的有两种,PR 和USA 对应的全称,无法统一填充)
# 使用给元素赋值的方式进行填充!
#1.先给USA的全称对应的空值进行批量赋值
#1.1将USA对应的行数据找到(行数据中就存在state的空值)
abb_pop["state/region"]=="USA"
abb_pop.loc[abb_pop["state/region"]=="USA"] #将USA对应的行数据取出
state | state/region | ages | year | population | |
2496 | NaN | USA | under18 | 1990 | 64218512.0 |
2497 | NaN | USA | total | 1990 | 249622814.0 |
2498 | NaN | USA | total | 1991 | 252980942.0 |
2499 | NaN | USA | under18 | 1991 | 65313018.0 |
2500 | NaN | USA | under18 | 1992 | 66509177.0 |
2501 | NaN | USA | total | 1992 | 256514231.0 |
2502 | NaN | USA | total | 1993 | 259918595.0 |
2503 | NaN | USA | under18 | 1993 | 67594938.0 |
2504 | NaN | USA | under18 | 1994 | 68640936.0 |
2505 | NaN | USA | total | 1994 | 263125826.0 |
2506 | NaN | USA | under18 | 1995 | 69473140.0 |
2507 | NaN | USA | under18 | 1996 | 70233512.0 |
2508 | NaN | USA | total | 1995 | 266278403.0 |
2509 | NaN | USA | total | 1996 | 269394291.0 |
2510 | NaN | USA | total | 1997 | 272646932.0 |
2511 | NaN | USA | under18 | 1997 | 70920738.0 |
2512 | NaN | USA | under18 | 1998 | 71431406.0 |
2513 | NaN | USA | total | 1998 | 275854116.0 |
2514 | NaN | USA | under18 | 1999 | 71946051.0 |
2515 | NaN | USA | total | 2000 | 282162411.0 |
2516 | NaN | USA | under18 | 2000 | 72376189.0 |
2517 | NaN | USA | total | 1999 | 279040181.0 |
2518 | NaN | USA | total | 2001 | 284968955.0 |
2519 | NaN | USA | under18 | 2001 | 72671175.0 |
2520 | NaN | USA | total | 2002 | 287625193.0 |
2521 | NaN | USA | under18 | 2002 | 72936457.0 |
2522 | NaN | USA | total | 2003 | 290107933.0 |
2523 | NaN | USA | under18 | 2003 | 73100758.0 |
2524 | NaN | USA | total | 2004 | 292805298.0 |
2525 | NaN | USA | under18 | 2004 | 73297735.0 |
2526 | NaN | USA | total | 2005 | 295516599.0 |
2527 | NaN | USA | under18 | 2005 | 73523669.0 |
2528 | NaN | USA | total | 2006 | 298379912.0 |
2529 | NaN | USA | under18 | 2006 | 73757714.0 |
2530 | NaN | USA | total | 2007 | 301231207.0 |
2531 | NaN | USA | under18 | 2007 | 74019405.0 |
2532 | NaN | USA | total | 2008 | 304093966.0 |
2533 | NaN | USA | under18 | 2008 | 74104602.0 |
2534 | NaN | USA | under18 | 2013 | 73585872.0 |
2535 | NaN | USA | total | 2013 | 316128839.0 |
2536 | NaN | USA | total | 2009 | 306771529.0 |
2537 | NaN | USA | under18 | 2009 | 74134167.0 |
2538 | NaN | USA | under18 | 2010 | 74119556.0 |
2539 | NaN | USA | total | 2010 | 309326295.0 |
2540 | NaN | USA | under18 | 2011 | 73902222.0 |
2541 | NaN | USA | total | 2011 | 311582564.0 |
2542 | NaN | USA | under18 | 2012 | 73708179.0 |
2543 | NaN | USA | total | 2012 | 313873685.0 |
#1.2 将USA对应的全称空的对应行索引取出
indexs=abb_pop.loc[abb_pop["state/region"]=="USA"].index
abb_pop.iloc[indexs]
abb_pop.loc[indexs,"state"]="United States"
#2.将PR的全称进行赋值
abb_pop["state/region"]=="PR"
indexs2=abb_pop.loc[abb_pop["state/region"]=="PR"].index
abb_pop.loc[indexs2,"state"]="Puerto Rico"
#- 合并各州面积数据areas
#- 我们会发现area(sq.m)这一列有缺失数据,找出是哪行
#- 去除含有缺失数据的行
#- 找出2010年的全民人口数据
#- 计算各州的人口密度
#- 排序,并找出人口密度最高的州
#- 合并各州面积数据areas
abb_pop_area=pd.merge(abb_pop,area,how="outer")
abb_pop_area
state | state/region | ages | year | population | area (sq. mi) | |
0 | Alabama | AL | under18 | 2012 | 1117489.0 | 52423.0 |
1 | Alabama | AL | total | 2012 | 4817528.0 | 52423.0 |
2 | Alabama | AL | under18 | 2010 | 1130966.0 | 52423.0 |
3 | Alabama | AL | total | 2010 | 4785570.0 | 52423.0 |
4 | Alabama | AL | under18 | 2011 | 1125763.0 | 52423.0 |
... | ... | ... | ... | ... | ... | ... |
2539 | United States | USA | total | 2010 | 309326295.0 | NaN |
2540 | United States | USA | under18 | 2011 | 73902222.0 | NaN |
2541 | United States | USA | total | 2011 | 311582564.0 | NaN |
2542 | United States | USA | under18 | 2012 | 73708179.0 | NaN |
2543 | United States | USA | total | 2012 | 313873685.0 | NaN |
2544 rows × 6 columns
#- 我们会发现area(sq.m)这一列有缺失数据,找出是哪行
abb_pop_area["area (sq. mi)"].isnull()
indexs3=abb_pop_area.loc[abb_pop_area["area (sq. mi)"].isnull()].index
#- 去除含有缺失数据的行
abb_pop_area.drop(labels=indexs3,axis=0,inplace=True)
abb_pop_area
state | state/region | ages | year | population | area (sq. mi) | |
0 | Alabama | AL | under18 | 2012 | 1117489.0 | 52423.0 |
1 | Alabama | AL | total | 2012 | 4817528.0 | 52423.0 |
2 | Alabama | AL | under18 | 2010 | 1130966.0 | 52423.0 |
3 | Alabama | AL | total | 2010 | 4785570.0 | 52423.0 |
4 | Alabama | AL | under18 | 2011 | 1125763.0 | 52423.0 |
... | ... | ... | ... | ... | ... | ... |
2491 | Puerto Rico | PR | under18 | 2010 | 896945.0 | 3515.0 |
2492 | Puerto Rico | PR | under18 | 2011 | 869327.0 | 3515.0 |
2493 | Puerto Rico | PR | total | 2011 | 3686580.0 | 3515.0 |
2494 | Puerto Rico | PR | under18 | 2012 | 841740.0 | 3515.0 |
2495 | Puerto Rico | PR | total | 2012 | 3651545.0 | 3515.0 |
2496 rows × 6 columns
#- 找出2010年的全民人口数据(基于df做条件查询)
abb_pop_area.query('ages=="total" & year==2010')
state | state/region | ages | year | population | area (sq. mi) | |
3 | Alabama | AL | total | 2010 | 4785570.0 | 52423.0 |
91 | Alaska | AK | total | 2010 | 713868.0 | 656425.0 |
101 | Arizona | AZ | total | 2010 | 6408790.0 | 114006.0 |
189 | Arkansas | AR | total | 2010 | 2922280.0 | 53182.0 |
197 | California | CA | total | 2010 | 37333601.0 | 163707.0 |
283 | Colorado | CO | total | 2010 | 5048196.0 | 104100.0 |
293 | Connecticut | CT | total | 2010 | 3579210.0 | 5544.0 |
379 | Delaware | DE | total | 2010 | 899711.0 | 1954.0 |
389 | District of Columbia | DC | total | 2010 | 605125.0 | 68.0 |
475 | Florida | FL | total | 2010 | 18846054.0 | 65758.0 |
485 | Georgia | GA | total | 2010 | 9713248.0 | 59441.0 |
570 | Hawaii | HI | total | 2010 | 1363731.0 | 10932.0 |
581 | Idaho | ID | total | 2010 | 1570718.0 | 83574.0 |
666 | Illinois | IL | total | 2010 | 12839695.0 | 57918.0 |
677 | Indiana | IN | total | 2010 | 6489965.0 | 36420.0 |
762 | Iowa | IA | total | 2010 | 3050314.0 | 56276.0 |
773 | Kansas | KS | total | 2010 | 2858910.0 | 82282.0 |
858 | Kentucky | KY | total | 2010 | 4347698.0 | 40411.0 |
869 | Louisiana | LA | total | 2010 | 4545392.0 | 51843.0 |
954 | Maine | ME | total | 2010 | 1327366.0 | 35387.0 |
965 | Montana | MT | total | 2010 | 990527.0 | 147046.0 |
1050 | Nebraska | NE | total | 2010 | 1829838.0 | 77358.0 |
1061 | Nevada | NV | total | 2010 | 2703230.0 | 110567.0 |
1146 | New Hampshire | NH | total | 2010 | 1316614.0 | 9351.0 |
1157 | New Jersey | NJ | total | 2010 | 8802707.0 | 8722.0 |
1242 | New Mexico | NM | total | 2010 | 2064982.0 | 121593.0 |
1253 | New York | NY | total | 2010 | 19398228.0 | 54475.0 |
1338 | North Carolina | NC | total | 2010 | 9559533.0 | 53821.0 |
1349 | North Dakota | ND | total | 2010 | 674344.0 | 70704.0 |
1434 | Ohio | OH | total | 2010 | 11545435.0 | 44828.0 |
1445 | Oklahoma | OK | total | 2010 | 3759263.0 | 69903.0 |
1530 | Oregon | OR | total | 2010 | 3837208.0 | 98386.0 |
1541 | Maryland | MD | total | 2010 | 5787193.0 | 12407.0 |
1626 | Massachusetts | MA | total | 2010 | 6563263.0 | 10555.0 |
1637 | Michigan | MI | total | 2010 | 9876149.0 | 96810.0 |
1722 | Minnesota | MN | total | 2010 | 5310337.0 | 86943.0 |
1733 | Mississippi | MS | total | 2010 | 2970047.0 | 48434.0 |
1818 | Missouri | MO | total | 2010 | 5996063.0 | 69709.0 |
1829 | Pennsylvania | PA | total | 2010 | 12710472.0 | 46058.0 |
1914 | Rhode Island | RI | total | 2010 | 1052669.0 | 1545.0 |
1925 | South Carolina | SC | total | 2010 | 4636361.0 | 32007.0 |
2010 | South Dakota | SD | total | 2010 | 816211.0 | 77121.0 |
2021 | Tennessee | TN | total | 2010 | 6356683.0 | 42146.0 |
2106 | Texas | TX | total | 2010 | 25245178.0 | 268601.0 |
2117 | Utah | UT | total | 2010 | 2774424.0 | 84904.0 |
2202 | Vermont | VT | total | 2010 | 625793.0 | 9615.0 |
2213 | Virginia | VA | total | 2010 | 8024417.0 | 42769.0 |
2298 | Washington | WA | total | 2010 | 6742256.0 | 71303.0 |
2309 | West Virginia | WV | total | 2010 | 1854146.0 | 24231.0 |
2394 | Wisconsin | WI | total | 2010 | 5689060.0 | 65503.0 |
2405 | Wyoming | WY | total | 2010 | 564222.0 | 97818.0 |
2490 | Puerto Rico | PR | total | 2010 | 3721208.0 | 3515.0 |
#- 计算各州的人口密度(人口./面积)
abb_pop_area["midu"]=abb_pop_area["population"]/abb_pop_area["area (sq. mi)"]
abb_pop_area
state | state/region | ages | year | population | area (sq. mi) | midu | |
0 | Alabama | AL | under18 | 2012 | 1117489.0 | 52423.0 | 21.316769 |
1 | Alabama | AL | total | 2012 | 4817528.0 | 52423.0 | 91.897221 |
2 | Alabama | AL | under18 | 2010 | 1130966.0 | 52423.0 | 21.573851 |
3 | Alabama | AL | total | 2010 | 4785570.0 | 52423.0 | 91.287603 |
4 | Alabama | AL | under18 | 2011 | 1125763.0 | 52423.0 | 21.474601 |
... | ... | ... | ... | ... | ... | ... | ... |
2491 | Puerto Rico | PR | under18 | 2010 | 896945.0 | 3515.0 | 255.176387 |
2492 | Puerto Rico | PR | under18 | 2011 | 869327.0 | 3515.0 | 247.319203 |
2493 | Puerto Rico | PR | total | 2011 | 3686580.0 | 3515.0 | 1048.813656 |
2494 | Puerto Rico | PR | under18 | 2012 | 841740.0 | 3515.0 | 239.470839 |
2495 | Puerto Rico | PR | total | 2012 | 3651545.0 | 3515.0 | 1038.846373 |
2496 rows × 7 columns
#- 排序,并找出人口密度最高的州
abb_pop_area.sort_values(by="midu",axis=0,ascending=False).iloc[0]["state"]
'District of Columbia'