常规数据处理
- 部分向量处理函数
Name | structure | function |
order() | 返回使向量x按取值由小到大排列的各元素序号(下标) | |
unique() | 去除向量中的重复项 | |
rev() | 反转函数,将向量元素的排列顺序颠倒 | |
sort() | sort(a,decreasing=) | 排序函数,默认升序 |
duplicated() | 逐个判断元素是否与前面的元素重复,返回逻辑向量 | |
diff() | diff(目标向量,lag=) | 向量差分函数 |
矩阵处理:
1、选取矩阵子集
利用行列序号选取
**#利用行列序号选取**
#利用[row,col]切片选取
x=matrix(1:25,ncol=5)
> x[2,5] #选取第二行第五列的元素
[1] 22
> x[3,] #选取第三行的所有元素
[1] 3 8 13 18 23
> x[,5] #选取第五列的所有元素
[1] 21 22 23 24 25
> x[,-5] #利用负号剔除第五列,选取其余各列的所有行,返回结果保留表的结构
[,1] [,2] [,3] [,4]
[1,] 1 6 11 16
[2,] 2 7 12 17
[3,] 3 8 13 18
[4,] 4 9 14 19
[5,] 5 10 15 20
> x[2,c(3,4)] #利用向量实现多个元素选取
[1] 12 17
利用行列序号矩阵选取
#利用行列序号矩阵选取矩阵子集
> x=matrix(-12:12,ncol=5)
> x
[,1] [,2] [,3] [,4] [,5]
[1,] -12 -7 -2 3 8
[2,] -11 -6 -1 4 9
[3,] -10 -5 0 5 10
[4,] -9 -4 1 6 11
[5,] -8 -3 2 7 12
> index=cbind(c(1,2,5),c(3,4,4));index
[,1] [,2]
[1,] 1 3
[2,] 2 4
[3,] 5 4
> x[index] #选取(1,3)、(2,4)、(5,4)的元素
[1] -2 4 7
利用向量序号选取
#利用向量序号选取矩阵子集
> x=matrix
> x=matrix(0:35,ncol=6)
> x[1]
[1] 0
> x[5]
[1] 4
> x[6]
[1] 5
> x[20:35]
[1] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
利用逻辑条件选取
#利用逻辑条件选取子集
> x=matrix(1:36,ncol=6)
> y=x>16;y #返回逻辑值矩阵y
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] FALSE FALSE FALSE TRUE TRUE TRUE
[2,] FALSE FALSE FALSE TRUE TRUE TRUE
[3,] FALSE FALSE FALSE TRUE TRUE TRUE
[4,] FALSE FALSE FALSE TRUE TRUE TRUE
[5,] FALSE FALSE TRUE TRUE TRUE TRUE
[6,] FALSE FALSE TRUE TRUE TRUE TRUE
> x[y] #返回符合y条件的x值
[1] 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
> x[median(x)] #median()表示取中值;此处median(x)=18.5,作为下标时被截断为18,返回的是第18个元素
[1] 18
> x[median(x)]=0 #median(x)=18.5作下标时被截断为18,即把第18个元素(并非中值本身)赋值为0
> x
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1 7 13 19 25 31
[2,] 2 8 14 20 26 32
[3,] 3 9 15 21 27 33
[4,] 4 10 16 22 28 34
[5,] 5 11 17 23 29 35
[6,] 6 12 0 24 30 36
- 数据框的选取
关于数据子集的选取,
利用data()引用数据集,
利用[]切片选取,
利用$符号可以选取指定字段的数据。
head()、subset()、tail()、names()等函数,结合逻辑条件表达式,可以实现条件选取
> data(longley);str(longley) #获取数据集
'data.frame': 16 obs. of 7 variables:
$ GNP.deflator: num 83 88.5 88.2 89.5 96.2 ...
$ GNP : num 234 259 258 285 329 ...
$ Unemployed : num 236 232 368 335 210 ...
$ Armed.Forces: num 159 146 162 165 310 ...
$ Population : num 108 109 110 111 112 ...
$ Year : int 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 ...
$ Employed : num 60.3 61.1 60.2 61.2 63.2 ...
> longley[1:3,] #选取前3行
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1947 83.0 234.289 235.6 159.0 107.608 1947 60.323
1948 88.5 259.426 232.5 145.6 108.632 1948 61.122
1949 88.2 258.054 368.2 161.6 109.773 1949 60.171
> head(longley,3) #选取指定数据集的前3行
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1947 83.0 234.289 235.6 159.0 107.608 1947 60.323
1948 88.5 259.426 232.5 145.6 108.632 1948 61.122
1949 88.2 258.054 368.2 161.6 109.773 1949 60.171
> head(longley,5)
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1947 83.0 234.289 235.6 159.0 107.608 1947 60.323
1948 88.5 259.426 232.5 145.6 108.632 1948 61.122
1949 88.2 258.054 368.2 161.6 109.773 1949 60.171
1950 89.5 284.599 335.1 165.0 110.929 1950 61.187
1951 96.2 328.975 209.9 309.9 112.075 1951 63.221
> tail(longley,2) #tail(),选取最后两行
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1961 115.7 518.173 480.6 257.2 127.852 1961 69.331
1962 116.9 554.894 400.7 282.7 130.081 1962 70.551
> head(longley) #默认前6行
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1947 83.0 234.289 235.6 159.0 107.608 1947 60.323
1948 88.5 259.426 232.5 145.6 108.632 1948 61.122
1949 88.2 258.054 368.2 161.6 109.773 1949 60.171
1950 89.5 284.599 335.1 165.0 110.929 1950 61.187
1951 96.2 328.975 209.9 309.9 112.075 1951 63.221
1952 98.1 346.999 193.2 359.4 113.270 1952 63.639
> names(longley) #显示字段名称
[1] "GNP.deflator" "GNP" "Unemployed" "Armed.Forces" "Population"
[6] "Year" "Employed"
> longley$Year #选取指定字段
[1] 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962
> head(longley["GNP"],3) #选取指定数据集的指定字段的前三行数据,保留数据表结构
GNP
1947 234.289
1948 259.426
1949 258.054
> head(longley$GNP,3) #注意与前一种选法区别,结果结构不一样
[1] 234.289 259.426 258.054
> longley[[2]]
[1] 234.289 259.426 258.054 284.599 328.975 346.999 365.385 363.112 397.469 419.180
[11] 442.769 444.546 482.704 502.601 518.173 554.894
> longley[2]
GNP
1947 234.289
1948 259.426
1949 258.054
1950 284.599
1951 328.975
1952 346.999
1953 365.385
1954 363.112
1955 397.469
1956 419.180
1957 442.769
1958 444.546
1959 482.704
1960 502.601
1961 518.173
1962 554.894
> #后者保留了结构
> head(longley[2],3) #利用[]+索引的方式选择指定字段的数据
GNP
1947 234.289
1948 259.426
1949 258.054
> longley[1:3,c("GNP","Population")] #利用向量函数,实现多字段指定行数数据选取
GNP Population
1947 234.289 107.608
1948 259.426 108.632
1949 258.054 109.773
> Y1960=longley[1960,]
> Y1960=longley[1960,];Y1960
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
NA NA NA NA NA NA NA NA
> Y1960=longley["1960",];Y1960
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1960 114.2 502.601 393.1 251.4 125.368 1960 69.564
#注意加入""的区别:数值1960被当作行序号(数据只有16行,越界返回NA),字符串"1960"则按行名匹配
> longley[c("1955","1960")] #缺少逗号时按列名选取,因不存在这两列而报错(undefined columns selected),按行名选取须加逗号
> longley[c("1955","1960"),]
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1955 101.2 397.469 290.4 304.8 117.388 1955 66.019
1960 114.2 502.601 393.1 251.4 125.368 1960 69.564
> subset(longley,GNP>350&Population>110)
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1953 99.0 365.385 187.0 354.7 115.094 1953 64.989
1954 100.0 363.112 357.8 335.0 116.219 1954 63.761
1955 101.2 397.469 290.4 304.8 117.388 1955 66.019
1956 104.6 419.180 282.2 285.7 118.734 1956 67.857
1957 108.4 442.769 293.6 279.8 120.445 1957 68.169
1958 110.8 444.546 468.1 263.7 121.950 1958 66.513
1959 112.6 482.704 381.3 255.2 123.366 1959 68.655
1960 114.2 502.601 393.1 251.4 125.368 1960 69.564
1961 115.7 518.173 480.6 257.2 127.852 1961 69.331
1962 116.9 554.894 400.7 282.7 130.081 1962 70.551
- 字段添加
可以通过rbind()向指定数据集添加行数据,或通过cbind()添加列数据(字段)
> data(longley)
> gnpPop=round(longley[,"GNP"]/longley[,"Population"],2)
> longley=cbind(longley,gnp.Pop=gnpPop)
> head(longley)
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed gnp.Pop
1947 83.0 234.289 235.6 159.0 107.608 1947 60.323 2.18
1948 88.5 259.426 232.5 145.6 108.632 1948 61.122 2.39
1949 88.2 258.054 368.2 161.6 109.773 1949 60.171 2.35
1950 89.5 284.599 335.1 165.0 110.929 1950 61.187 2.57
1951 96.2 328.975 209.9 309.9 112.075 1951 63.221 2.94
1952 98.1 346.999 193.2 359.4 113.270 1952 63.639 3.06