R 重要函数1
- 处理循环 - R不仅有for/while循环,还有更强大的“一句话”函数。
- 排序
- 总结数据信息
lapply
- 可以循环处理列表中的每一个元素
- lapply(参数):lapply(列表, 函数/函数名, 其他参数)
- 总是返回一个列表
- sapply:简化结果
- 结果列表元素 长度均为1, 返回向量
- 结果列表元素 长度相同且大于1, 返回矩阵
str()函数,将R对象以整洁的形式展现
> str(lapply)
function (X, FUN, ...)
>
> x <- list(a=1:10, b=c(11,21,31,41,51))
>
> lapply(x,mean)
$a
[1] 5.5
$b
[1] 31
>
> y <- 1:4
> y
[1] 1 2 3 4
>
> lapply(y, runif)
[[1]]
[1] 0.9258423
[[2]]
[1] 0.1017509 0.8193295
[[3]]
[1] 0.4439033 0.8459882 0.6488867
[[4]]
[1] 0.9457995 0.6919540 0.2560230 0.7049408
> lapply(y, runif, min=0, max=100)
[[1]]
[1] 35.7373
[[2]]
[1] 71.53044 77.24861
[[3]]
[1] 34.95911 89.55134 88.55318
[[4]]
[1] 5.153867 10.121085 42.380309 75.263465
>
> z <- list(a=matrix(1:6,2,3), b=matrix(4:7, 2,2))
匿名函数
> lapply(z, function(m) m[1,])
$a
[1] 1 3 5
$b
[1] 4 6
sapply 化简
> xx <- list(a=1:10, b=c(11,21,31,41,51))
> lapply(xx,mean)
$a
[1] 5.5
$b
[1] 31
> sapply(xx,mean)
a b
5.5 31.0
> # 结果是向量
> class(sapply(xx,mean))
[1] "numeric"
apply
- 沿着数组的某一维度处理数据
- 例如:将函数用于矩阵的行或列
- 虽然与for/while循环的效率相似,但是只用一句话就可以完成
- apply(参数): apply(数组,维度,函数/函数名)
> x <- matrix(1:16, 4,4)
> x
[,1] [,2] [,3] [,4]
[1,] 1 5 9 13
[2,] 2 6 10 14
[3,] 3 7 11 15
[4,] 4 8 12 16
按第二个维度(即按列)求平均
> apply(x,2,mean)
[1] 2.5 6.5 10.5 14.5
> apply(x,2,sum)
[1] 10 26 42 58
>
> apply(x,1,mean)
[1] 7 8 9 10
> apply(x,1,sum)
[1] 28 32 36 40
>
> rowSums(x)
[1] 28 32 36 40
> rowMeans(x)
[1] 7 8 9 10
> colSums(x)
[1] 10 26 42 58
> colMeans(x)
[1] 2.5 6.5 10.5 14.5
从标准正态分布中随机抽取100个数: rnorm(100)
> y <-matrix(rnorm(100), 10, 10)
> y
[,1] [,2] [,3] [,4]
[1,] -0.42408345 -0.81191312 -0.4349958 0.9117361
[2,] -1.59500688 -1.37715262 -0.9208207 0.2169574
[3,] -0.02884541 -0.60309179 2.6446003 -1.5699809
[4,] -1.33869371 1.14138910 0.2897329 -1.0382647
[5,] -0.08813003 -1.79020861 0.4934611 1.2258229
[6,] 0.35357543 1.62036017 2.0647605 -0.7446514
[7,] 0.06826431 -0.01523161 1.4100426 0.4365272
[8,] 0.17936838 0.10653279 -0.1215581 1.3045328
[9,] -0.10656597 -0.23300177 -1.7992047 1.5249087
[10,] 0.19372157 -1.40774281 0.9179701 0.1454203
[,5] [,6] [,7] [,8]
[1,] 0.92221426 1.7177518 -0.1749652 -1.26730726
[2,] -0.03996861 0.7869630 -1.3943303 0.11080946
[3,] 0.54047580 0.6427981 -1.4902075 0.56682238
[4,] 0.73704820 -0.2443959 1.8903175 -0.10904068
[5,] 0.85082790 0.7765309 -1.8429869 1.66472836
[6,] -0.78412681 -0.2577596 -0.4798235 -0.72776087
[7,] -1.05074824 -0.5878802 -0.4436067 -1.75696499
[8,] 0.52615765 0.2661636 0.4968588 -0.59706833
[9,] -1.31384555 0.1837208 -0.8989840 -0.97109103
[10,] -1.27582614 -0.4912126 -0.4111826 -0.06176029
[,9] [,10]
[1,] 0.021664887 -0.9130532
[2,] -1.319827987 -0.6903143
[3,] 0.003699993 -0.4236587
[4,] 2.118854720 0.1540048
[5,] 0.911901519 0.9230440
[6,] 0.670066295 1.0743565
[7,] 0.559020830 0.8903212
[8,] 0.815196681 0.8093940
[9,] -0.925934039 -1.6096589
[10,] -0.196218579 -0.3277840
数据的百分位点 quantile
> apply(y,1, quantile, probs=c(0.25, 0.75))
[,1] [,2] [,3] [,4] [,5]
25% -0.7176838 -1.36282146 -0.5582335 -0.2105571 0.05726776
75% 0.6892183 0.07311494 0.5602357 1.0403039 0.92025839
[,6] [,7] [,8] [,9] [,10]
25% -0.6657765 -0.5518118 0.1247417 -1.2281569 -0.47120509
75% 0.9732839 0.5283974 0.7385849 -0.1381749 0.09362514
>
> z <- array(rnorm(2*3*4), c(2,3,4))
> # 沿第三维求平均 (保留第1、2维)
> apply(z, c(1,2), mean)
[,1] [,2] [,3]
[1,] 0.6195860 -0.3119071 -0.9599445
[2,] -0.2581552 0.5928777 -0.4447871
> # 沿第二维求平均 (保留第1、3维)
> apply(z, c(1,3), mean)
[,1] [,2] [,3] [,4]
[1,] -0.7371599 0.3604719 -0.1596728 -0.3333267
[2,] 0.4932399 0.2285235 -0.5752525 -0.2932637
> # 沿第一维求平均 (保留第2、3维)
> apply(z, c(2,3), mean)
[,1] [,2] [,3] [,4]
[1,] 0.081461439 0.5454456 -0.29642436 0.3923788
[2,] -0.004320375 0.9781844 -0.88212155 0.4701987
[3,] -0.443021081 -0.6401369 0.07615789 -1.8024630
mapply
- mapply
- lapply的多元版本
- mapply(参数): mapply(函数/函数名, 数据, 函数相关参数)
> list(rep(1,4), rep(2,3), rep(3,2), rep(4,1))
[[1]]
[1] 1 1 1 1
[[2]]
[1] 2 2 2
[[3]]
[1] 3 3
[[4]]
[1] 4
>
> mapply(rep, 1:4, 4:1)
[[1]]
[1] 1 1 1 1
[[2]]
[1] 2 2 2
[[3]]
[1] 3 3
[[4]]
[1] 4
自定义函数
> s <- function(n,mean, std) {
+ rnorm(n, mean, std)
+ }
> s(4,0,1)
[1] 1.497657 1.698591 1.001766 1.504275
>
> mapply(s, 1:5, 5:1, 2)
[[1]]
[1] 6.085853
[[2]]
[1] 3.4970236 0.7516018
[[3]]
[1] 1.545612 1.638842 2.926176
[[4]]
[1] 2.2733057 1.7033031 -0.5363114 0.9900924
[[5]]
[1] -2.8560507 1.1328330 -0.1063901 3.0666459 2.5156422
等价
> list(s(1,5,2), s(2,4,2),s(3,3,2),s(4,2,2),s(5,1,2))
[[1]]
[1] 5.901506
[[2]]
[1] 3.721798 3.939584
[[3]]
[1] 6.893767 3.712145 1.356954
[[4]]
[1] 2.9130755 0.6228036 0.3546602 2.8658280
[[5]]
[1] 1.1939391 3.3872741 -2.1148133 0.6025163 -1.9011238
tapply
- tapply
- 对向量的子集进行操作
- tapply(参数):tapply(向量,因子/因子列表,函数/函数名)
代码
> # 共15个元素
> x <- c(rnorm(5), runif(5), rnorm(5,1))
> x
[1] 0.32817627 2.93822645 0.42105552 0.13388377
[5] -0.75003397 0.86811817 0.06303492 0.24657806
[9] 0.19771753 0.35881366 1.68465900 0.56996848
[13] -0.07091078 -0.69957398 2.44112173
> # 因子
> f <- gl(3,5)
> f
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Levels: 1 2 3
>
> tapply(x,f, mean)
1 2 3
0.6142616 0.3468525 0.7850529
> tapply(x,f, mean, simplify = FALSE)
$`1`
[1] 0.6142616
$`2`
[1] 0.3468525
$`3`
[1] 0.7850529
split
- split
- 根据 因子或因子列表 将向量或其他对象 分组
- 通常与 lapply 一起使用
- split(参数): split(向量/列表/数据框, 因子/因子列表)
根据因子split
> x <- c(rnorm(5), runif(5), rnorm(5,1))
> f <- gl(3,5)
>
> # 返回一个列表
> split(x,f)
$`1`
[1] -0.7844589 -0.3206112 0.8029451 1.0301777 0.2514358
$`2`
[1] 0.1920543 0.2009230 0.3023645 0.5293238 0.6970664
$`3`
[1] 0.827199 -0.293274 1.734497 1.498040 1.649147
通常 split 和lapply 一起使用
> lapply(split(x,f), mean)
$`1`
[1] 0.1958977
$`2`
[1] 0.3843464
$`3`
[1] 1.083122
airquality
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
>
> s <- split(airquality, airquality$Month)
# 上面是月份,下面是每个月份记录数目
> table(airquality$Month)
5 6 7 8 9
31 30 31 31 30
> lapply(s, function(x) colMeans(x[,c('Ozone', 'Wind', 'Temp')]))
$`5`
Ozone Wind Temp
NA 11.62258 65.54839
$`6`
Ozone Wind Temp
NA 10.26667 79.10000
$`7`
Ozone Wind Temp
NA 8.941935 83.903226
$`8`
Ozone Wind Temp
NA 8.793548 83.967742
$`9`
Ozone Wind Temp
NA 10.18 76.90
> sapply(s, function(x) colMeans(x[,c('Ozone', 'Wind', 'Temp')]))
5 6 7 8 9
Ozone NA NA NA NA NA
Wind 11.62258 10.26667 8.941935 8.793548 10.18
Temp 65.54839 79.10000 83.903226 83.967742 76.90
去除NA
> sapply(s, function(x) colMeans(x[,c('Ozone', 'Wind', 'Temp')], na.rm = TRUE))
5 6 7 8 9
Ozone 23.61538 29.44444 59.115385 59.961538 31.44828
Wind 11.62258 10.26667 8.941935 8.793548 10.18000
Temp 65.54839 79.10000 83.903226 83.967742 76.90000
排序
- 排序
- sort: 对向量进行排序,返回 排好序的内容
- order: 返回排好序的内容的下标, 可以指定多个排序标准
> x <- data.frame(v1=1:5, v2=c(10,7,9,6,8), v3=11:15, v4=c(1,1,2,2,1))
> x
v1 v2 v3 v4
1 1 10 11 1
2 2 7 12 1
3 3 9 13 2
4 4 6 14 2
5 5 8 15 1
>
> sort(x$v2)
[1] 6 7 8 9 10
> sort(x$v2, decreasing = TRUE)
[1] 10 9 8 7 6
>
> order(x$v2)
[1] 4 2 5 3 1
> x[order(x$v2),]
v1 v2 v3 v4
4 4 6 14 2
2 2 7 12 1
5 5 8 15 1
3 3 9 13 2
1 1 10 11 1
>
> x[order(x$v4, x$v2), ]
v1 v2 v3 v4
2 2 7 12 1
5 5 8 15 1
1 1 10 11 1
4 4 6 14 2
3 3 9 13 2
>
> x[order(x$v4, x$v2,decreasing = TRUE), ]
v1 v2 v3 v4
3 3 9 13 2
4 4 6 14 2
1 1 10 11 1
5 5 8 15 1
2 2 7 12 1
总结数据信息
head() tail()
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
> tail(airquality)
Ozone Solar.R Wind Temp Month Day
148 14 20 16.6 63 9 25
149 30 193 6.9 70 9 26
150 NA 145 13.2 77 9 27
151 14 191 14.3 75 9 28
152 18 131 8.0 76 9 29
153 20 223 11.5 68 9 30
>
> head(airquality,10)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
7 23 299 8.6 65 5 7
8 19 99 13.8 59 5 8
9 8 19 20.1 61 5 9
10 NA 194 8.6 69 5 10
> tail(airquality,10)
Ozone Solar.R Wind Temp Month Day
144 13 238 12.6 64 9 21
145 23 14 9.2 71 9 22
146 36 139 10.3 81 9 23
147 7 49 10.3 69 9 24
148 14 20 16.6 63 9 25
149 30 193 6.9 70 9 26
150 NA 145 13.2 77 9 27
151 14 191 14.3 75 9 28
152 18 131 8.0 76 9 29
153 20 223 11.5 68 9 30
>
> summary(airquality)
Ozone Solar.R Wind
Min. : 1.00 Min. : 7.0 Min. : 1.700
1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400
Median : 31.50 Median :205.0 Median : 9.700
Mean : 42.13 Mean :185.9 Mean : 9.958
3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500
Max. :168.00 Max. :334.0 Max. :20.700
NA's :37 NA's :7
Temp Month Day
Min. :56.00 Min. :5.000 Min. : 1.0
1st Qu.:72.00 1st Qu.:6.000 1st Qu.: 8.0
Median :79.00 Median :7.000 Median :16.0
Mean :77.88 Mean :6.993 Mean :15.8
3rd Qu.:85.00 3rd Qu.:8.000 3rd Qu.:23.0
Max. :97.00 Max. :9.000 Max. :31.0
>
> str(airquality)
'data.frame': 153 obs. of 6 variables:
$ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
$ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
$ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
$ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
$ Month : int 5 5 5 5 5 5 5 5 5 5 ...
$ Day : int 1 2 3 4 5 6 7 8 9 10 ...
>
> table(airquality$Month)
5 6 7 8 9
31 30 31 31 30
>
> table(airquality$Ozone)
1 4 6 7 8 9 10 11 12 13 14 16 18 19 20
1 1 1 3 1 3 1 3 2 4 4 4 4 1 4
21 22 23 24 27 28 29 30 31 32 34 35 36 37 39
4 1 6 2 1 3 1 2 1 3 1 2 2 2 2
40 41 44 45 46 47 48 49 50 52 59 61 63 64 65
1 1 3 2 1 1 1 1 1 1 2 1 1 2 1
66 71 73 76 77 78 79 80 82 84 85 89 91 96 97
1 1 2 1 1 2 1 1 1 1 2 1 1 1 2
108 110 115 118 122 135 168
1 1 1 1 1 1 1
> table(airquality$Ozone, useNA = 'ifany')
1 4 6 7 8 9 10 11 12 13 14 16
1 1 1 3 1 3 1 3 2 4 4 4
18 19 20 21 22 23 24 27 28 29 30 31
4 1 4 4 1 6 2 1 3 1 2 1
32 34 35 36 37 39 40 41 44 45 46 47
3 1 2 2 2 2 1 1 3 2 1 1
48 49 50 52 59 61 63 64 65 66 71 73
1 1 1 1 2 1 1 2 1 1 1 2
76 77 78 79 80 82 84 85 89 91 96 97
1 1 2 1 1 1 1 2 1 1 1 2
108 110 115 118 122 135 168 <NA>
1 1 1 1 1 1 1 37
>
> table(airquality$Month, airquality$Day)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
8 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
9 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
23 24 25 26 27 28 29 30 31
5 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 0
7 1 1 1 1 1 1 1 1 1
8 1 1 1 1 1 1 1 1 1
9 1 1 1 1 1 1 1 1 0
>
> any(is.na(airquality$Ozone))
[1] TRUE
> sum(is.na(airquality$Ozone))
[1] 37
>
> all(airquality$Month<12)
[1] TRUE
>
> titanic <- as.data.frame(Titanic)
> head(titanic)
Class Sex Age Survived Freq
1 1st Male Child No 0
2 2nd Male Child No 0
3 3rd Male Child No 35
4 Crew Male Child No 0
5 1st Female Child No 0
6 2nd Female Child No 0
> dim(titanic)
[1] 32 5
> summary(titanic)
Class Sex Age Survived Freq
1st :8 Male :16 Child:16 No :16 Min. : 0.00
2nd :8 Female:16 Adult:16 Yes:16 1st Qu.: 0.75
3rd :8 Median : 13.50
Crew:8 Mean : 68.78
3rd Qu.: 77.00
Max. :670.00
> # 交叉表
> x <- xtabs(Freq ~Class + Age, data=titanic)
> # 排版更扁平
> ftable(x)
Age Child Adult
Class
1st 6 319
2nd 24 261
3rd 79 627
Crew 0 885
> object.size(airquality)
5496 bytes
> print(object.size(airquality), units='Kb')
5.4 Kb
小结
- 一句话 循环: lapply(sapply, split)/apply/mapply/tapply
- 排序: sort/order
- 总结数据信息: head/tail/summary/str/table/xtabs/ftable/object.size