import pandas as pd
# Two-row demo frame indexed by scientist name. 'Name' is supplied in the data
# but is absent from the requested columns, so the frame has no 'Name' column.
row_labels = ['Rosaline Franklin', 'William Gosset']
kept_columns = ['Occupation', 'Born', 'Died', 'Age']
raw_data = {
    'Name': ['Rosaline Franklin', 'William Gosset'],
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
    'Age': [37, 61],
}
persons = pd.DataFrame(data=raw_data, index=row_labels, columns=kept_columns)
print(persons)
Occupation Born Died Age
Rosaline Franklin Chemist 1920-07-25 1958-04-16 37
William Gosset Statistician 1876-06-13 1937-10-16 61
# Select one row by its index label; the result is a Series whose index is
# the frame's column names.
first_row = persons.loc['Rosaline Franklin']
1. Series基本操作
1.1 Series属性
# Inspect the row Series: its type, its contents, its labels (index and
# keys() print the same Index), the raw values array, and the first label.
print(type(first_row))
print(first_row)
print(first_row.index)
print(first_row.keys())
print(first_row.values)
print(first_row.index[0])
<class 'pandas.core.series.Series'>
Occupation Chemist
Born 1920-07-25
Died 1958-04-16
Age 37
Name: Rosaline Franklin, dtype: object
Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')
Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')
['Chemist' '1920-07-25' '1958-04-16' 37]
Occupation
1.2 Series方法
import pandas as pd
# The conventional `plt` alias refers to the pyplot interface, not to the
# top-level matplotlib package (which has no show()/figure() functions).
import matplotlib.pyplot as plt
# Two-row demo frame indexed by scientist name. 'Name' is supplied in the data
# but is absent from the requested columns, so the frame has no 'Name' column.
row_labels = ['Rosaline Franklin', 'William Gosset']
kept_columns = ['Occupation', 'Born', 'Died', 'Age']
raw_data = {
    'Name': ['Rosaline Franklin', 'William Gosset'],
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
    'Age': [37, 61],
}
persons = pd.DataFrame(data=raw_data, index=row_labels, columns=kept_columns)
print(persons)
Occupation Born Died Age
Rosaline Franklin Chemist 1920-07-25 1958-04-16 37
William Gosset Statistician 1876-06-13 1937-10-16 61
# Pull the 'Age' column out as a Series and exercise common Series methods.
ages = persons['Age']
print(type(ages))
print(ages)
# Basic descriptive statistics.
print(ages.mean())
print(ages.std())
print(ages.min())
print(ages.max())
# Largest age first.
print(ages.sort_values(ascending=False))
print("***"*8)
# Series.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the two Series the same way and keeps the repeated index labels.
print(pd.concat([ages, ages]))
<class 'pandas.core.series.Series'>
Rosaline Franklin 37
William Gosset 61
Name: Age, dtype: int64
49.0
16.97056274847714
37
61
William Gosset 61
Rosaline Franklin 37
Name: Age, dtype: int64
************************
Rosaline Franklin 37
William Gosset 61
Rosaline Franklin 37
William Gosset 61
Name: Age, dtype: int64
# Draw a histogram of the ages. As the last expression of a notebook cell the
# returned Axes object is echoed and the figure is rendered inline.
ages.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f742b86f290>
![ages.hist() 输出的直方图](output_8_1.png)
1.3 Series 过滤条件
import pandas as pd
# Load the scientists table from a relative CSV path and take its 'Age'
# column as a Series.
scientists = pd.read_csv('./data/scientists.csv')
ages = scientists['Age']
print(ages)
0 37
1 61
2 90
3 66
4 56
5 45
6 41
7 77
Name: Age, dtype: int64
# Summary statistics of the ages: count, mean, std, min, quartiles and max.
print(ages.describe())
count 8.000000
mean 59.125000
std 18.325918
min 37.000000
25% 44.000000
50% 58.500000
75% 68.750000
max 90.000000
Name: Age, dtype: float64
# Comparing a Series with a scalar yields a boolean Series (see the printed
# type). Indexing with that mask keeps only the rows where it is True.
above_mean = ages > ages.mean()
print(ages[above_mean])
print(above_mean)
print(type(above_mean))
1 61
2 90
3 66
7 77
Name: Age, dtype: int64
0 False
1 True
2 True
3 True
4 False
5 False
6 False
7 True
Name: Age, dtype: bool
<class 'pandas.core.series.Series'>
# A plain list of booleans, one per row, also works as a mask: the rows
# marked False (positions 3 and 4) are left out.
print(ages[[True,True,True,False,False,True,True,True]])
0 37
1 61
2 90
5 45
6 41
7 77
Name: Age, dtype: int64
1.4 Series 向量操作
import pandas as pd
# Load the scientists table from a relative CSV path and take its 'Age'
# column as a Series.
scientists = pd.read_csv('./data/scientists.csv')
ages = scientists['Age']
print(ages)
0 37
1 61
2 90
3 66
4 56
5 45
6 41
7 77
Name: Age, dtype: int64
# Adding two Series with the same index adds them element by element.
print(ages+ages)
0 74
1 122
2 180
3 132
4 112
5 90
6 82
7 154
Name: Age, dtype: int64
# Multiplication is element-wise as well, so this squares every age.
print(ages*ages)
0 1369
1 3721
2 8100
3 4356
4 3136
5 2025
6 1681
7 5929
Name: Age, dtype: int64
# A scalar operand is applied to every element.
print(ages+100)
0 137
1 161
2 190
3 166
4 156
5 145
6 141
7 177
Name: Age, dtype: int64
# Series arithmetic aligns on index labels: the right-hand Series only has
# labels 0 and 1, so every other row becomes NaN and the result is float64.
print(ages+pd.Series([1,100]))
0 38.0
1 161.0
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
dtype: float64
import numpy as np
# Left disabled on purpose: a NumPy array has no index to align on, and its
# length (3) does not match the 8 ages, so this addition raises an error.
#print(ages + np.array([1,2,3])) # error
2.DataFrame基本操作
2.1DataFrame的条件过滤
import pandas as pd
# Load the full scientists table from a relative CSV path and display it.
scientists = pd.read_csv('./data/scientists.csv')
print(scientists)
Name Born Died Age Occupation
0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist
4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist
5 John Snow 1813-03-15 1858-06-16 45 Physician
6 Alan Turing 1912-06-23 1954-06-07 41 Computer Scientist
7 Johann Gauss 1777-04-30 1855-02-23 77 Mathematician
# Boolean-mask filtering on the whole frame: keep the rows whose 'Age'
# exceeds the column mean.
print(scientists[scientists['Age'] > scientists['Age'].mean()])
Name Born Died Age Occupation
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist
7 Johann Gauss 1777-04-30 1855-02-23 77 Mathematician
# .loc with a list of index labels returns the rows in the order given,
# repeats included. The original list mixed booleans with integers
# ([True, True, 0, 1]); that is not a valid boolean mask for 8 rows and only
# selected rows 1, 1, 0, 1 because True compares equal to 1. Spell the labels
# out explicitly so the lookup does not depend on that coincidence.
print(scientists.loc[[1, 1, 0, 1]])
Name Born Died Age Occupation
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
# Positional selection: the rows at integer positions 1, 3 and 4.
print(scientists.iloc[[1,3,4]])
Name Born Died Age Occupation
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist
4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist
# Rows older than the mean age, restricted to the 'Age' and 'Name' columns
# (in that order), selected in a single .loc call.
older_than_mean = scientists['Age'] > scientists['Age'].mean()
print(scientists.loc[older_than_mean, ['Age', 'Name']])
Age Name
1 61 William Gosset
2 90 Florence Nightingale
3 66 Marie Curie
7 77 Johann Gauss
# Same filter as a named intermediate, then take the 2nd and 3rd of the
# remaining rows by position.
older_than_mean = scientists['Age'] > scientists['Age'].mean()
age_and_name = scientists.loc[older_than_mean, ['Age', 'Name']]
print(age_and_name.iloc[[1, 2]])
Age Name
2 90 Florence Nightingale
3 66 Marie Curie
2.2DataFrame的向量操作
import pandas as pd
# Two small frames with identical shape and labels, used below to show
# element-wise DataFrame arithmetic.
d1 = pd.DataFrame(dict(a=[1, 2], b=[4, 5]))
d2 = pd.DataFrame(dict(a=[4, 5], b=[1, 2]))
print(d1)
print(d2)
a b
0 1 4
1 2 5
a b
0 4 1
1 5 2
# Frames with matching row and column labels add cell by cell.
print(d1+d2)
a b
0 5 5
1 7 7
# Cell-by-cell product of the two frames.
print(d1*d2)
a b
0 4 4
1 10 10
# Cell-by-cell power: each d1 value raised to the matching d2 value.
print(d1**d2)
a b
0 1 4
1 32 25
2.3向DataFrame添加列
import pandas as pd
# Load the full scientists table from a relative CSV path and display it.
scientists = pd.read_csv('./data/scientists.csv')
print(scientists)
Name Born Died Age Occupation
0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist
4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist
5 John Snow 1813-03-15 1858-06-16 45 Physician
6 Alan Turing 1912-06-23 1954-06-07 41 Computer Scientist
7 Johann Gauss 1777-04-30 1855-02-23 77 Mathematician
# Both date columns come out of the CSV as plain strings (dtype object).
print(scientists['Born'].dtype)
print(scientists['Died'].dtype)
object
object
# Parse the string dates into datetime values using an explicit
# year-month-day format; each result is still a Series.
born_datetime = pd.to_datetime(scientists['Born'],format='%Y-%m-%d')
died_datetime = pd.to_datetime(scientists['Died'],format='%Y-%m-%d')
print(type(born_datetime))
<class 'pandas.core.series.Series'>
# Assigning to a column name that does not exist yet appends a new column.
scientists['born_dt'] = born_datetime
scientists['died_dt'] = died_datetime
# The new column carries the parsed datetime dtype rather than object.
print(scientists['born_dt'].dtype)
scientists.head()
datetime64[ns]
 | Name | Born | Died | Age | Occupation | born_dt | died_dt |
0 | Rosaline Franklin | 1920-07-25 | 1958-04-16 | 37 | Chemist | 1920-07-25 | 1958-04-16 |
1 | William Gosset | 1876-06-13 | 1937-10-16 | 61 | Statistician | 1876-06-13 | 1937-10-16 |
2 | Florence Nightingale | 1820-05-12 | 1910-08-13 | 90 | Nurse | 1820-05-12 | 1910-08-13 |
3 | Marie Curie | 1867-11-07 | 1934-07-04 | 66 | Chemist | 1867-11-07 | 1934-07-04 |
4 | Rachel Carson | 1907-05-27 | 1964-04-14 | 56 | Biologist | 1907-05-27 | 1964-04-14 |
2.4直接修改DataFrame的值
import pandas as pd
# Load the full scientists table from a relative CSV path and display it.
scientists = pd.read_csv('./data/scientists.csv')
print(scientists)
Name Born Died Age Occupation
0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist
4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist
5 John Snow 1813-03-15 1858-06-16 45 Physician
6 Alan Turing 1912-06-23 1954-06-07 41 Computer Scientist
7 Johann Gauss 1777-04-30 1855-02-23 77 Mathematician
import random
# Draw one random age in [30, 100] per row. The list is sized from the frame
# itself, so the column assignment always has a matching length.
ageList = [random.randint(30, 100) for _ in range(len(scientists))]
# Assigning a list to an existing column overwrites its values in place.
# (The original statement had no right-hand side and did not parse.)
scientists['Age'] = ageList
print(scientists)
Name Born Died Age Occupation
0 Rosaline Franklin 1920-07-25 1958-04-16 75 Chemist
1 William Gosset 1876-06-13 1937-10-16 99 Statistician
2 Florence Nightingale 1820-05-12 1910-08-13 86 Nurse
3 Marie Curie 1867-11-07 1934-07-04 67 Chemist
4 Rachel Carson 1907-05-27 1964-04-14 71 Biologist
5 John Snow 1813-03-15 1858-06-16 35 Physician
6 Alan Turing 1912-06-23 1954-06-07 78 Computer Scientist
7 Johann Gauss 1777-04-30 1855-02-23 61 Mathematician
2.5删除DataFrame的列
import pandas as pd
# Load the scientists table from a relative CSV path and list its column
# labels.
scientists = pd.read_csv('./data/scientists.csv')
print(scientists.columns)
Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')
# drop() returns a new frame without the named columns; the original frame
# is printed afterwards to show that it still has all of its columns.
scientists_drop = scientists.drop(columns=['Age', 'Died'])
print(scientists)
Name Born Died Age Occupation
0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist
4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist
5 John Snow 1813-03-15 1858-06-16 45 Physician
6 Alan Turing 1912-06-23 1954-06-07 41 Computer Scientist
7 Johann Gauss 1777-04-30 1855-02-23 77 Mathematician
# The returned copy is the one that lacks the 'Age' and 'Died' columns.
print(scientists_drop)
Name Born Occupation
0 Rosaline Franklin 1920-07-25 Chemist
1 William Gosset 1876-06-13 Statistician
2 Florence Nightingale 1820-05-12 Nurse
3 Marie Curie 1867-11-07 Chemist
4 Rachel Carson 1907-05-27 Biologist
5 John Snow 1813-03-15 Physician
6 Alan Turing 1912-06-23 Computer Scientist
7 Johann Gauss 1777-04-30 Mathematician