10 dplyr 分组

分组可以做很多事情，但最常用的，肯定还是分组统计。

.by 在函数中分组
- summarise(..., .by) 统计个别统计量
- summ(..., .by) 快速生成大量变量的多个统计量
- mutate(..., .by) 根据分组统计量更改或新建变量
group_by() 在函数前构造分组数据
ungroup() 取消数据分组

10.1 `summarise(..., .by)`

library(tidyverse)

Warning: package 'ggplot2' was built under R version 4.3.3

Warning: package 'stringr' was built under R version 4.3.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

diamonds %>%
  summarise(
    price = mean(price),
    .by = clarity
  )

# A tibble: 8 × 2
  clarity price
  <ord>   <dbl>
1 SI2     5063.
2 SI1     3996.
3 VS1     3839.
4 VS2     3925.
5 VVS2    3284.
6 VVS1    2523.
7 I1      3924.
8 IF      2865.

diamonds %>%
  summarise(
    price = mean(price),
    .by = c(cut, clarity)
  )

# A tibble: 40 × 3
   cut       clarity price
   <ord>     <ord>   <dbl>
 1 Ideal     SI2     4756.
 2 Premium   SI1     4455.
 3 Good      VS1     3801.
 4 Premium   VS2     4550.
 5 Good      SI2     4580.
 6 Very Good VVS2    3038.
 7 Very Good VVS1    2459.
 8 Very Good SI1     3932.
 9 Fair      VS2     4175.
10 Very Good VS1     3805.
# ℹ 30 more rows

10.2 `summ(..., .by)`

diamonds %>%
  summ(price, .by = clarity)

# A tibble: 8 × 9
  clarity variable type      n unique  mean    sd   min   max
  <ord>   <chr>    <chr> <int>  <int> <dbl> <dbl> <int> <int>
1 SI2     price    int    9194   4904 5063. 4260.   326 18804
2 SI1     price    int   13065   5380 3996. 3799.   326 18818
3 VS1     price    int    8171   3926 3839. 4012.   327 18795
4 VS2     price    int   12258   5051 3925. 4042.   334 18823
5 VVS2    price    int    5066   2409 3284. 3822.   336 18768
6 VVS1    price    int    3655   1623 2523. 3335.   336 18777
7 I1      price    int     741    632 3924. 2807.   345 18531
8 IF      price    int    1790    902 2865. 3920.   369 18806

diamonds %>%
  summ(price, .by = c(cut, clarity))

# A tibble: 40 × 10
   cut       clarity variable type      n unique  mean    sd   min   max
   <ord>     <ord>   <chr>    <chr> <int>  <int> <dbl> <dbl> <int> <int>
 1 Ideal     SI2     price    int    2598   1922 4756. 4252.   326 18804
 2 Premium   SI1     price    int    3575   2107 4455. 4071.   326 18797
 3 Good      VS1     price    int     648    548 3801. 3703.   327 18340
 4 Premium   VS2     price    int    3357   1868 4550. 4457.   334 18823
 5 Good      SI2     price    int    1081    880 4580. 3901.   335 18788
 6 Very Good VVS2    price    int    1235    791 3038. 3768.   336 18211
 7 Very Good VVS1    price    int     789    511 2459. 3317.   336 18777
 8 Very Good SI1     price    int    3240   2107 3932. 3708.   337 18818
 9 Fair      VS2     price    int     261    240 4175. 3532.   337 18565
10 Very Good VS1     price    int    1775   1304 3805. 3864.   338 18500
# ℹ 30 more rows

diamonds %>%
  summ(x:z, .by = clarity)

# A tibble: 24 × 9
   clarity variable type      n unique  mean    sd   min   max
   <ord>   <chr>    <chr> <int>  <int> <dbl> <dbl> <dbl> <dbl>
 1 SI2     x        dbl    9194    492  6.40 1.06   0     9.51
 2 SI2     y        dbl    9194    490  6.40 1.18   0    58.9 
 3 SI2     z        dbl    9194    320  3.95 0.660  0     8.06
 4 SI1     x        dbl   13065    480  5.89 1.04   3.88  8.9 
 5 SI1     y        dbl   13065    471  5.89 1.04   3.84  8.87
 6 SI1     z        dbl   13065    301  3.64 0.645  0     5.49
 7 VS1     x        dbl    8171    466  5.57 1.06   0     8.83
 8 VS1     y        dbl    8171    456  5.58 1.10   0    31.8 
 9 VS1     z        dbl    8171    297  3.44 0.725  0    31.8 
10 VS2     x        dbl   12258    490  5.66 1.09   0     9.66
# ℹ 14 more rows

10.3 `mutate(..., .by)`

diamonds %>%
  mutate(
    price = mean(price),
    .by = clarity
  )

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55 5063.  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61 3996.  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65 3839.  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58 3925.  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58 5063.  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57 3284.  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57 2523.  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55 3996.  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61 3925.  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61 3839.  4     4.05  2.39
# ℹ 53,930 more rows

diamonds %>%
  mutate(
    mean_price = mean(price),
    .by = clarity
  )

# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z mean_price
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>      <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43      5063.
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31      3996.
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31      3839.
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63      3925.
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75      5063.
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48      3284.
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47      2523.
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53      3996.
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49      3925.
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39      3839.
# ℹ 53,930 more rows

diamonds %>%
  mutate(
    price3g = cut(price, 3),
    .by = color
  )

# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z price3g    
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <fct>      
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43 (308,6.46e…
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31 (308,6.46e…
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31 (308,6.46e…
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63 (316,6.5e+…
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75 (317,6.46e…
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48 (317,6.46e…
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47 (316,6.5e+…
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53 (319,6.49e…
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49 (308,6.46e…
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39 (319,6.49e…
# ℹ 53,930 more rows

10.4 `group_by()`

diamonds %>%
  group_by(cut)

# A tibble: 53,940 × 10
# Groups:   cut [5]
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

diamonds %>%
  group_by(cut) %>%
  summarise(price = mean(price))

# A tibble: 5 × 2
  cut       price
  <ord>     <dbl>
1 Fair      4359.
2 Good      3929.
3 Very Good 3982.
4 Premium   4584.
5 Ideal     3458.

diamonds %>%
  group_by(cut) %>%
  summ(price)

# A tibble: 5 × 9
  cut       variable type      n unique  mean    sd   min   max
  <ord>     <chr>    <chr> <int>  <int> <dbl> <dbl> <int> <int>
1 Fair      price    int    1610   1267 4359. 3560.   337 18574
2 Good      price    int    4906   3086 3929. 3682.   327 18788
3 Very Good price    int   12082   5840 3982. 3936.   336 18818
4 Premium   price    int   13791   6014 4584. 4349.   326 18823
5 Ideal     price    int   21551   7281 3458. 3808.   326 18806

diamonds %>%
  group_by(clarity) %>%
  summ(x:z)

# A tibble: 24 × 9
   clarity variable type      n unique  mean    sd   min   max
   <ord>   <chr>    <chr> <int>  <int> <dbl> <dbl> <dbl> <dbl>
 1 I1      x        dbl     741    319  6.76 1.03   4.33 10.7 
 2 I1      y        dbl     741    324  6.71 1.03   4.29 10.5 
 3 I1      z        dbl     741    249  4.21 0.718  0     6.98
 4 SI2     x        dbl    9194    492  6.40 1.06   0     9.51
 5 SI2     y        dbl    9194    490  6.40 1.18   0    58.9 
 6 SI2     z        dbl    9194    320  3.95 0.660  0     8.06
 7 SI1     x        dbl   13065    480  5.89 1.04   3.88  8.9 
 8 SI1     y        dbl   13065    471  5.89 1.04   3.84  8.87
 9 SI1     z        dbl   13065    301  3.64 0.645  0     5.49
10 VS2     x        dbl   12258    490  5.66 1.09   0     9.66
# ℹ 14 more rows

diamonds %>%
  group_by(cut) %>%
  summ(price, .by = clarity)

Error in `summarise()`:
! Can't supply `.by` when `.data` is a grouped data frame.

diamonds %>%
  group_by(color) %>%
  mutate(
    price3g = cut(price, 3)
  )

# A tibble: 53,940 × 11
# Groups:   color [7]
   carat cut       color clarity depth table price     x     y     z price3g    
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <fct>      
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43 (308,6.46e…
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31 (308,6.46e…
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31 (308,6.46e…
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63 (316,6.5e+…
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75 (317,6.46e…
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48 (317,6.46e…
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47 (316,6.5e+…
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53 (319,6.49e…
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49 (308,6.46e…
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39 (319,6.49e…
# ℹ 53,930 more rows