分组可以做很多事情,但最常用的,肯定还是分组统计。
.by
在函数中分组
summarise(..., .by)
统计个别统计量
summ(..., .by)
快速生成大量变量的多个统计量
mutate(..., .by)
根据分组统计量更改或新建变量
group_by()
在函数前构造分组数据
ungroup()
取消数据分组
summarise(..., .by)
Warning: package 'ggplot2' was built under R version 4.3.3
Warning: package 'stringr' was built under R version 4.3.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
diamonds %>%
summarise(
price = mean(price),
.by = clarity
)
# A tibble: 8 × 2
clarity price
<ord> <dbl>
1 SI2 5063.
2 SI1 3996.
3 VS1 3839.
4 VS2 3925.
5 VVS2 3284.
6 VVS1 2523.
7 I1 3924.
8 IF 2865.
diamonds %>%
summarise(
price = mean(price),
.by = c(cut, clarity)
)
# A tibble: 40 × 3
cut clarity price
<ord> <ord> <dbl>
1 Ideal SI2 4756.
2 Premium SI1 4455.
3 Good VS1 3801.
4 Premium VS2 4550.
5 Good SI2 4580.
6 Very Good VVS2 3038.
7 Very Good VVS1 2459.
8 Very Good SI1 3932.
9 Fair VS2 4175.
10 Very Good VS1 3805.
# ℹ 30 more rows
summ(..., .by)
diamonds %>%
summ(price, .by = clarity)
# A tibble: 8 × 9
clarity variable type n unique mean sd min max
<ord> <chr> <chr> <int> <int> <dbl> <dbl> <int> <int>
1 SI2 price int 9194 4904 5063. 4260. 326 18804
2 SI1 price int 13065 5380 3996. 3799. 326 18818
3 VS1 price int 8171 3926 3839. 4012. 327 18795
4 VS2 price int 12258 5051 3925. 4042. 334 18823
5 VVS2 price int 5066 2409 3284. 3822. 336 18768
6 VVS1 price int 3655 1623 2523. 3335. 336 18777
7 I1 price int 741 632 3924. 2807. 345 18531
8 IF price int 1790 902 2865. 3920. 369 18806
diamonds %>%
summ(price, .by = c(cut, clarity))
# A tibble: 40 × 10
cut clarity variable type n unique mean sd min max
<ord> <ord> <chr> <chr> <int> <int> <dbl> <dbl> <int> <int>
1 Ideal SI2 price int 2598 1922 4756. 4252. 326 18804
2 Premium SI1 price int 3575 2107 4455. 4071. 326 18797
3 Good VS1 price int 648 548 3801. 3703. 327 18340
4 Premium VS2 price int 3357 1868 4550. 4457. 334 18823
5 Good SI2 price int 1081 880 4580. 3901. 335 18788
6 Very Good VVS2 price int 1235 791 3038. 3768. 336 18211
7 Very Good VVS1 price int 789 511 2459. 3317. 336 18777
8 Very Good SI1 price int 3240 2107 3932. 3708. 337 18818
9 Fair VS2 price int 261 240 4175. 3532. 337 18565
10 Very Good VS1 price int 1775 1304 3805. 3864. 338 18500
# ℹ 30 more rows
diamonds %>%
summ(x:z, .by = clarity)
# A tibble: 24 × 9
clarity variable type n unique mean sd min max
<ord> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 SI2 x dbl 9194 492 6.40 1.06 0 9.51
2 SI2 y dbl 9194 490 6.40 1.18 0 58.9
3 SI2 z dbl 9194 320 3.95 0.660 0 8.06
4 SI1 x dbl 13065 480 5.89 1.04 3.88 8.9
5 SI1 y dbl 13065 471 5.89 1.04 3.84 8.87
6 SI1 z dbl 13065 301 3.64 0.645 0 5.49
7 VS1 x dbl 8171 466 5.57 1.06 0 8.83
8 VS1 y dbl 8171 456 5.58 1.10 0 31.8
9 VS1 z dbl 8171 297 3.44 0.725 0 31.8
10 VS2 x dbl 12258 490 5.66 1.09 0 9.66
# ℹ 14 more rows
mutate(..., .by)
diamonds %>%
mutate(
price = mean(price),
.by = clarity
)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 5063. 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 3996. 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 3839. 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 3925. 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 5063. 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 3284. 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 2523. 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 3996. 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 3925. 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 3839. 4 4.05 2.39
# ℹ 53,930 more rows
diamonds %>%
mutate(
mean_price = mean(price),
.by = clarity
)
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z mean_price
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 5063.
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 3996.
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 3839.
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 3925.
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 5063.
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 3284.
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 2523.
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 3996.
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 3925.
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 3839.
# ℹ 53,930 more rows
diamonds %>%
mutate(
price3g = cut(price, 3),
.by = color
)
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z price3g
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <fct>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 (308,6.46e…
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 (308,6.46e…
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 (308,6.46e…
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 (316,6.5e+…
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 (317,6.46e…
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 (317,6.46e…
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 (316,6.5e+…
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 (319,6.49e…
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 (308,6.46e…
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 (319,6.49e…
# ℹ 53,930 more rows
group_by()
diamonds %>%
group_by(cut)
# A tibble: 53,940 × 10
# Groups: cut [5]
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
diamonds %>%
group_by(cut) %>%
summarise(price = mean(price))
# A tibble: 5 × 2
cut price
<ord> <dbl>
1 Fair 4359.
2 Good 3929.
3 Very Good 3982.
4 Premium 4584.
5 Ideal 3458.
diamonds %>%
group_by(cut) %>%
summ(price)
# A tibble: 5 × 9
cut variable type n unique mean sd min max
<ord> <chr> <chr> <int> <int> <dbl> <dbl> <int> <int>
1 Fair price int 1610 1267 4359. 3560. 337 18574
2 Good price int 4906 3086 3929. 3682. 327 18788
3 Very Good price int 12082 5840 3982. 3936. 336 18818
4 Premium price int 13791 6014 4584. 4349. 326 18823
5 Ideal price int 21551 7281 3458. 3808. 326 18806
diamonds %>%
group_by(clarity) %>%
summ(x:z)
# A tibble: 24 × 9
clarity variable type n unique mean sd min max
<ord> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 I1 x dbl 741 319 6.76 1.03 4.33 10.7
2 I1 y dbl 741 324 6.71 1.03 4.29 10.5
3 I1 z dbl 741 249 4.21 0.718 0 6.98
4 SI2 x dbl 9194 492 6.40 1.06 0 9.51
5 SI2 y dbl 9194 490 6.40 1.18 0 58.9
6 SI2 z dbl 9194 320 3.95 0.660 0 8.06
7 SI1 x dbl 13065 480 5.89 1.04 3.88 8.9
8 SI1 y dbl 13065 471 5.89 1.04 3.84 8.87
9 SI1 z dbl 13065 301 3.64 0.645 0 5.49
10 VS2 x dbl 12258 490 5.66 1.09 0 9.66
# ℹ 14 more rows
diamonds %>%
group_by(cut) %>%
summ(price, .by = clarity)
Error in `summarise()`:
! Can't supply `.by` when `.data` is a grouped data frame.
diamonds %>%
group_by(color) %>%
mutate(
price3g = cut(price, 3)
)
# A tibble: 53,940 × 11
# Groups: color [7]
carat cut color clarity depth table price x y z price3g
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <fct>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 (308,6.46e…
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 (308,6.46e…
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 (308,6.46e…
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 (316,6.5e+…
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 (317,6.46e…
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 (317,6.46e…
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 (316,6.5e+…
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 (319,6.49e…
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 (308,6.46e…
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 (319,6.49e…
# ℹ 53,930 more rows