R 重要函数1

+处理循环 - R不仅有for/while循环,还有更强大的“一句话”函数。

  • 排序
  • 总结数据信息

lapply

  • 可以循环处理列表中的每一个元素
  • lapply(参数):lapply(列表, 函数/函数名, 其他参数)
  • 总是返回一个列表
  • sapply:简化结果
    • 结果列表元素 长度均为1, 返回向量
    • 结果列表元素 长度相同且大于1, 返回矩阵

str()函数,将R对象以整洁的形式展现

> str(lapply)
function (X, FUN, ...)  
> 
> x <- list(a=1:10, b=c(11,21,31,41,51))
> 
> lapply(x,mean)
$a
[1] 5.5

$b
[1] 31

> 
> y <- 1:4
> y
[1] 1 2 3 4
> 
> lapply(y, runif)
[[1]]
[1] 0.9258423

[[2]]
[1] 0.1017509 0.8193295

[[3]]
[1] 0.4439033 0.8459882 0.6488867

[[4]]
[1] 0.9457995 0.6919540 0.2560230 0.7049408

> lapply(y, runif, min=0, max=100)
[[1]]
[1] 35.7373

[[2]]
[1] 71.53044 77.24861

[[3]]
[1] 34.95911 89.55134 88.55318

[[4]]
[1]  5.153867 10.121085 42.380309 75.263465

> 
> z <- list(a=matrix(1:6,2,3), b=matrix(4:7, 2,2))

匿名函数

> lapply(z, function(m) m[1,])
$a
[1] 1 3 5

$b
[1] 4 6

sapply 化简

> xx <- list(a=1:10, b=c(11,21,31,41,51))
> lapply(xx,mean)
$a
[1] 5.5

$b
[1] 31

> sapply(xx,mean)
   a    b 
 5.5 31.0 

> # 结果是向量
> class(sapply(xx,mean))
[1] "numeric"

apply

  • 沿着数组的某一维度处理数据
    • 例如:将函数用于矩阵的行或列
    • 虽然与for/while循环的效率相似,但是只用一句话就可以完成
  • apply(参数): apply(数组,维度,函数/函数名)
    > x <- matrix(1:16, 4,4)
    > x
         [,1] [,2] [,3] [,4]
    [1,]    1    5    9   13
    [2,]    2    6   10   14
    [3,]    3    7   11   15
    [4,]    4    8   12   16

按第二个维度(即对每一列)求平均

> apply(x,2,mean)
[1]  2.5  6.5 10.5 14.5
> apply(x,2,sum)
[1] 10 26 42 58
> 
> apply(x,1,mean)
[1]  7  8  9 10
> apply(x,1,sum)
[1] 28 32 36 40
> 
> rowSums(x)
[1] 28 32 36 40
> rowMeans(x)
[1]  7  8  9 10
> colSums(x)
[1] 10 26 42 58
> colMeans(x)
[1]  2.5  6.5 10.5 14.5

从标准正态分布中随机抽取100个数: rnorm(100)

> y <-matrix(rnorm(100), 10, 10)
> y
             [,1]        [,2]       [,3]       [,4]
 [1,] -0.42408345 -0.81191312 -0.4349958  0.9117361
 [2,] -1.59500688 -1.37715262 -0.9208207  0.2169574
 [3,] -0.02884541 -0.60309179  2.6446003 -1.5699809
 [4,] -1.33869371  1.14138910  0.2897329 -1.0382647
 [5,] -0.08813003 -1.79020861  0.4934611  1.2258229
 [6,]  0.35357543  1.62036017  2.0647605 -0.7446514
 [7,]  0.06826431 -0.01523161  1.4100426  0.4365272
 [8,]  0.17936838  0.10653279 -0.1215581  1.3045328
 [9,] -0.10656597 -0.23300177 -1.7992047  1.5249087
[10,]  0.19372157 -1.40774281  0.9179701  0.1454203
             [,5]       [,6]       [,7]        [,8]
 [1,]  0.92221426  1.7177518 -0.1749652 -1.26730726
 [2,] -0.03996861  0.7869630 -1.3943303  0.11080946
 [3,]  0.54047580  0.6427981 -1.4902075  0.56682238
 [4,]  0.73704820 -0.2443959  1.8903175 -0.10904068
 [5,]  0.85082790  0.7765309 -1.8429869  1.66472836
 [6,] -0.78412681 -0.2577596 -0.4798235 -0.72776087
 [7,] -1.05074824 -0.5878802 -0.4436067 -1.75696499
 [8,]  0.52615765  0.2661636  0.4968588 -0.59706833
 [9,] -1.31384555  0.1837208 -0.8989840 -0.97109103
[10,] -1.27582614 -0.4912126 -0.4111826 -0.06176029
              [,9]      [,10]
 [1,]  0.021664887 -0.9130532
 [2,] -1.319827987 -0.6903143
 [3,]  0.003699993 -0.4236587
 [4,]  2.118854720  0.1540048
 [5,]  0.911901519  0.9230440
 [6,]  0.670066295  1.0743565
 [7,]  0.559020830  0.8903212
 [8,]  0.815196681  0.8093940
 [9,] -0.925934039 -1.6096589
[10,] -0.196218579 -0.3277840

数据的百分位点 quantile

> apply(y,1, quantile, probs=c(0.25, 0.75))
          [,1]        [,2]       [,3]       [,4]       [,5]
25% -0.7176838 -1.36282146 -0.5582335 -0.2105571 0.05726776
75%  0.6892183  0.07311494  0.5602357  1.0403039 0.92025839
          [,6]       [,7]      [,8]       [,9]       [,10]
25% -0.6657765 -0.5518118 0.1247417 -1.2281569 -0.47120509
75%  0.9732839  0.5283974 0.7385849 -0.1381749  0.09362514
> 
> z <- array(rnorm(2*3*4), c(2,3,4))
> # 求第三维的平均
> apply(z, c(1,2), mean)
           [,1]       [,2]       [,3]
[1,]  0.6195860 -0.3119071 -0.9599445
[2,] -0.2581552  0.5928777 -0.4447871
> # 求第二维的平均
> apply(z, c(1,3), mean)
           [,1]      [,2]       [,3]       [,4]
[1,] -0.7371599 0.3604719 -0.1596728 -0.3333267
[2,]  0.4932399 0.2285235 -0.5752525 -0.2932637
> # 求第一维度平均
> apply(z, c(2,3), mean)
             [,1]       [,2]        [,3]       [,4]
[1,]  0.081461439  0.5454456 -0.29642436  0.3923788
[2,] -0.004320375  0.9781844 -0.88212155  0.4701987
[3,] -0.443021081 -0.6401369  0.07615789 -1.8024630

mapply

  • mapply
    • lapply的多元版本
    • mapply(参数): mapply(函数/函数名, 数据, 函数相关参数)
> list(rep(1,4), rep(2,3), rep(3,2), rep(4,1))
[[1]]
[1] 1 1 1 1

[[2]]
[1] 2 2 2

[[3]]
[1] 3 3

[[4]]
[1] 4

> 
> mapply(rep, 1:4, 4:1)
[[1]]
[1] 1 1 1 1

[[2]]
[1] 2 2 2

[[3]]
[1] 3 3

[[4]]
[1] 4

自定义函数

> s <- function(n,mean, std) {
+   rnorm(n, mean, std)
+ }
> s(4,0,1)
[1] 1.497657 1.698591 1.001766 1.504275
> 
> mapply(s, 1:5, 5:1, 2)
[[1]]
[1] 6.085853

[[2]]
[1] 3.4970236 0.7516018

[[3]]
[1] 1.545612 1.638842 2.926176

[[4]]
[1]  2.2733057  1.7033031 -0.5363114  0.9900924

[[5]]
[1] -2.8560507  1.1328330 -0.1063901  3.0666459  2.5156422

等价

> list(s(1,5,2), s(2,4,2),s(3,3,2),s(4,2,2),s(5,1,2))
[[1]]
[1] 5.901506

[[2]]
[1] 3.721798 3.939584

[[3]]
[1] 6.893767 3.712145 1.356954

[[4]]
[1] 2.9130755 0.6228036 0.3546602 2.8658280

[[5]]
[1]  1.1939391  3.3872741 -2.1148133  0.6025163 -1.9011238

tapply

  • tapply
    • 对向量的子集进行操作
    • tapply(参数):tapply(向量,因子/因子列表,函数/函数名)

代码

> # 共15个元素
> x <- c(rnorm(5), runif(5), rnorm(5,1))
> x
 [1]  0.32817627  2.93822645  0.42105552  0.13388377
 [5] -0.75003397  0.86811817  0.06303492  0.24657806
 [9]  0.19771753  0.35881366  1.68465900  0.56996848
[13] -0.07091078 -0.69957398  2.44112173
> # 因子
> f <- gl(3,5)
> f
 [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Levels: 1 2 3
> 
> tapply(x,f, mean)
        1         2         3 
0.6142616 0.3468525 0.7850529 
> tapply(x,f, mean, simplify = FALSE)
$`1`
[1] 0.6142616

$`2`
[1] 0.3468525

$`3`
[1] 0.7850529

split

  • split
    • 根据 因子或因子列表 将向量或其他对象 分组
    • 通常与 lapply 一起使用
    • split(参数): split(向量/列表/数据框, 因子/因子列表)

根据因子split

> x <- c(rnorm(5), runif(5), rnorm(5,1))
> f <- gl(3,5)
> 
> # 返回一个列表
> split(x,f)
$`1`
[1] -0.7844589 -0.3206112  0.8029451  1.0301777  0.2514358

$`2`
[1] 0.1920543 0.2009230 0.3023645 0.5293238 0.6970664

$`3`
[1]  0.827199 -0.293274  1.734497  1.498040  1.649147

通常 split 和lapply 一起使用

> lapply(split(x,f), mean)
$`1`
[1] 0.1958977

$`2`
[1] 0.3843464

$`3`
[1] 1.083122

airquality

> head(airquality)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
> 
> s <- split(airquality, airquality$Month)
 
# 上面是月份,下面是每个月份记录数目
> table(airquality$Month)

 5  6  7  8  9 
31 30 31 31 30 

> lapply(s, function(x) colMeans(x[,c('Ozone', 'Wind', 'Temp')]))
$`5`
   Ozone     Wind     Temp 
      NA 11.62258 65.54839 

$`6`
   Ozone     Wind     Temp 
      NA 10.26667 79.10000 

$`7`
    Ozone      Wind      Temp 
       NA  8.941935 83.903226 

$`8`
    Ozone      Wind      Temp 
       NA  8.793548 83.967742 

$`9`
Ozone  Wind  Temp 
   NA 10.18 76.90 


> sapply(s, function(x) colMeans(x[,c('Ozone', 'Wind', 'Temp')]))
             5        6         7         8     9
Ozone       NA       NA        NA        NA    NA
Wind  11.62258 10.26667  8.941935  8.793548 10.18
Temp  65.54839 79.10000 83.903226 83.967742 76.90

去除NA

> sapply(s, function(x) colMeans(x[,c('Ozone', 'Wind', 'Temp')], na.rm = TRUE))
             5        6         7         8        9
Ozone 23.61538 29.44444 59.115385 59.961538 31.44828
Wind  11.62258 10.26667  8.941935  8.793548 10.18000
Temp  65.54839 79.10000 83.903226 83.967742 76.90000

排序

  • 排序
    • sort: 对向量进行排序,返回 排好序的内容
    • order: 返回排好序后各元素在原向量中的下标, 可同时指定多个排序标准
> x <- data.frame(v1=1:5, v2=c(10,7,9,6,8), v3=11:15, v4=c(1,1,2,2,1))
> x
  v1 v2 v3 v4
1  1 10 11  1
2  2  7 12  1
3  3  9 13  2
4  4  6 14  2
5  5  8 15  1
> 
> sort(x$v2)
[1]  6  7  8  9 10
> sort(x$v2, decreasing = TRUE)
[1] 10  9  8  7  6
> 
> order(x$v2)
[1] 4 2 5 3 1
> x[order(x$v2),]
  v1 v2 v3 v4
4  4  6 14  2
2  2  7 12  1
5  5  8 15  1
3  3  9 13  2
1  1 10 11  1
> 
> x[order(x$v4, x$v2), ]
  v1 v2 v3 v4
2  2  7 12  1
5  5  8 15  1
1  1 10 11  1
4  4  6 14  2
3  3  9 13  2
> 
> x[order(x$v4, x$v2,decreasing = TRUE), ]
  v1 v2 v3 v4
3  3  9 13  2
4  4  6 14  2
1  1 10 11  1
5  5  8 15  1
2  2  7 12  1

总结数据信息

head() tail()

> head(airquality)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
> tail(airquality)
    Ozone Solar.R Wind Temp Month Day
148    14      20 16.6   63     9  25
149    30     193  6.9   70     9  26
150    NA     145 13.2   77     9  27
151    14     191 14.3   75     9  28
152    18     131  8.0   76     9  29
153    20     223 11.5   68     9  30
> 
> head(airquality,10)
   Ozone Solar.R Wind Temp Month Day
1     41     190  7.4   67     5   1
2     36     118  8.0   72     5   2
3     12     149 12.6   74     5   3
4     18     313 11.5   62     5   4
5     NA      NA 14.3   56     5   5
6     28      NA 14.9   66     5   6
7     23     299  8.6   65     5   7
8     19      99 13.8   59     5   8
9      8      19 20.1   61     5   9
10    NA     194  8.6   69     5  10
> tail(airquality,10)
    Ozone Solar.R Wind Temp Month Day
144    13     238 12.6   64     9  21
145    23      14  9.2   71     9  22
146    36     139 10.3   81     9  23
147     7      49 10.3   69     9  24
148    14      20 16.6   63     9  25
149    30     193  6.9   70     9  26
150    NA     145 13.2   77     9  27
151    14     191 14.3   75     9  28
152    18     131  8.0   76     9  29
153    20     223 11.5   68     9  30
> 
> summary(airquality)
     Ozone           Solar.R           Wind       
 Min.   :  1.00   Min.   :  7.0   Min.   : 1.700  
 1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400  
 Median : 31.50   Median :205.0   Median : 9.700  
 Mean   : 42.13   Mean   :185.9   Mean   : 9.958  
 3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500  
 Max.   :168.00   Max.   :334.0   Max.   :20.700  
 NA's   :37       NA's   :7                       
      Temp           Month            Day      
 Min.   :56.00   Min.   :5.000   Min.   : 1.0  
 1st Qu.:72.00   1st Qu.:6.000   1st Qu.: 8.0  
 Median :79.00   Median :7.000   Median :16.0  
 Mean   :77.88   Mean   :6.993   Mean   :15.8  
 3rd Qu.:85.00   3rd Qu.:8.000   3rd Qu.:23.0  
 Max.   :97.00   Max.   :9.000   Max.   :31.0  

> 
> str(airquality)
'data.frame': 153 obs. of  6 variables:
 $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
 $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
 $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
 $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
 $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
 $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
> 
> table(airquality$Month)

 5  6  7  8  9 
31 30 31 31 30 
> 
> table(airquality$Ozone)

  1   4   6   7   8   9  10  11  12  13  14  16  18  19  20 
  1   1   1   3   1   3   1   3   2   4   4   4   4   1   4 
 21  22  23  24  27  28  29  30  31  32  34  35  36  37  39 
  4   1   6   2   1   3   1   2   1   3   1   2   2   2   2 
 40  41  44  45  46  47  48  49  50  52  59  61  63  64  65 
  1   1   3   2   1   1   1   1   1   1   2   1   1   2   1 
 66  71  73  76  77  78  79  80  82  84  85  89  91  96  97 
  1   1   2   1   1   2   1   1   1   1   2   1   1   1   2 
108 110 115 118 122 135 168 
  1   1   1   1   1   1   1 
> table(airquality$Ozone, useNA = 'ifany')

   1    4    6    7    8    9   10   11   12   13   14   16 
   1    1    1    3    1    3    1    3    2    4    4    4 
  18   19   20   21   22   23   24   27   28   29   30   31 
   4    1    4    4    1    6    2    1    3    1    2    1 
  32   34   35   36   37   39   40   41   44   45   46   47 
   3    1    2    2    2    2    1    1    3    2    1    1 
  48   49   50   52   59   61   63   64   65   66   71   73 
   1    1    1    1    2    1    1    2    1    1    1    2 
  76   77   78   79   80   82   84   85   89   91   96   97 
   1    1    2    1    1    1    1    2    1    1    1    2 
 108  110  115  118  122  135  168 <NA> 
   1    1    1    1    1    1    1   37 
> 
> table(airquality$Month, airquality$Day)

    1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
  5 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1
  6 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1
  7 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1
  8 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1
  9 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1

    23 24 25 26 27 28 29 30 31
  5  1  1  1  1  1  1  1  1  1
  6  1  1  1  1  1  1  1  1  0
  7  1  1  1  1  1  1  1  1  1
  8  1  1  1  1  1  1  1  1  1
  9  1  1  1  1  1  1  1  1  0
> 
> any(is.na(airquality$Ozone))
[1] TRUE
> sum(is.na(airquality$Ozone))
[1] 37
> 
> all(airquality$Month<12)
[1] TRUE
> 
> titanic <- as.data.frame(Titanic)
> head(titanic)
  Class    Sex   Age Survived Freq
1   1st   Male Child       No    0
2   2nd   Male Child       No    0
3   3rd   Male Child       No   35
4  Crew   Male Child       No    0
5   1st Female Child       No    0
6   2nd Female Child       No    0
> dim(titanic)
[1] 32  5
> summary(titanic)
  Class       Sex        Age     Survived      Freq       
 1st :8   Male  :16   Child:16   No :16   Min.   :  0.00  
 2nd :8   Female:16   Adult:16   Yes:16   1st Qu.:  0.75  
 3rd :8                                   Median : 13.50  
 Crew:8                                   Mean   : 68.78  
                                          3rd Qu.: 77.00  
                                          Max.   :670.00  
> # 交叉表
> x <- xtabs(Freq ~Class + Age, data=titanic)
> # 排版更扁平
> ftable(x)
      Age Child Adult
Class                
1st           6   319
2nd          24   261
3rd          79   627
Crew          0   885
> object.size(airquality)
5496 bytes
> print(object.size(airquality), units='Kb')
5.4 Kb

小结

  • 一句话 循环: lapply(sapply, split)/apply/mapply/tapply
  • 排序: sort/order
  • 总结数据信息: head/tail/summary/str/table/xtabs/ftable/object.size