Grouping and Summarizing in R

Fundamentals of Data Science

Author

Jeremy Teitelbaum

Loading the penguins from csv

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

penguins = read_csv("data/penguins-raw.csv")

Rows: 344 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): studyName, Species, Region, Island, Stage, Individual ID, Clutch C...
dbl  (7): Sample Number, Culmen Length (mm), Culmen Depth (mm), Flipper Leng...
date (1): Date Egg

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

penguins |> summarise_all(class)

# A tibble: 1 × 17
  studyName `Sample Number` Species   Region    Island    Stage  `Individual ID`
  <chr>     <chr>           <chr>     <chr>     <chr>     <chr>  <chr>          
1 character numeric         character character character chara… character      
# ℹ 10 more variables: `Clutch Completion` <chr>, `Date Egg` <chr>,
#   `Culmen Length (mm)` <chr>, `Culmen Depth (mm)` <chr>,
#   `Flipper Length (mm)` <chr>, `Body Mass (g)` <chr>, Sex <chr>,
#   `Delta 15 N (o/oo)` <chr>, `Delta 13 C (o/oo)` <chr>, Comments <chr>

Focus on the relevant numerical variables

focus <- c('Species','Island','Sex','Culmen Length (mm)','Culmen Depth (mm)','Flipper Length (mm)','Body Mass (g)')
simplified <- penguins |> select(all_of(focus))

Clean up the column names

Using a named vector

edited_columns <- c(species = "Species", island = "Island", culmen_length='Culmen Length (mm)', culmen_depth = 'Culmen Depth (mm)', flipper_length = 'Flipper Length (mm)', body_mass = 'Body Mass (g)')
rename(simplified, all_of(edited_columns))

# A tibble: 344 × 7
   species      island Sex   culmen_length culmen_depth flipper_length body_mass
   <chr>        <chr>  <chr>         <dbl>        <dbl>          <dbl>     <dbl>
 1 Adelie Peng… Torge… MALE           39.1         18.7            181      3750
 2 Adelie Peng… Torge… FEMA…          39.5         17.4            186      3800
 3 Adelie Peng… Torge… FEMA…          40.3         18              195      3250
 4 Adelie Peng… Torge… <NA>           NA           NA               NA        NA
 5 Adelie Peng… Torge… FEMA…          36.7         19.3            193      3450
 6 Adelie Peng… Torge… MALE           39.3         20.6            190      3650
 7 Adelie Peng… Torge… FEMA…          38.9         17.8            181      3625
 8 Adelie Peng… Torge… MALE           39.2         19.6            195      4675
 9 Adelie Peng… Torge… <NA>           34.1         18.1            193      3475
10 Adelie Peng… Torge… <NA>           42           20.2            190      4250
# ℹ 334 more rows

Renaming with a function:. Note the escape codes (double ).

fixer <- function(n) {
        a <- gsub(" \\(mm\\)","",n)
        a <- gsub(" \\(g\\)","",a)
        a <- gsub(" ","_",a)
        return(tolower(a))
}
simplified <- rename_with(simplified,fixer)

Simplify factor names

species <- simplified |>
    select(`species`) |>
    unique()
simplified <- simplified |> mutate(`species` = tolower(word(`species`, 1)))
simplified <- simplified |> mutate(`island` = tolower(`island`))
simplified <- simplified |> mutate(`sex` = tolower(`sex`))

Missing Values

Note use of anonymous function.

simplified |> summarize_all(\(x) sum(is.na(x)))

# A tibble: 1 × 7
  species island   sex culmen_length culmen_depth flipper_length body_mass
    <int>  <int> <int>         <int>        <int>          <int>     <int>
1       0      0    11             2            2              2         2

simplified |> summarize(across(culmen_length:body_mass,  \(x) sum(is.na(x))))

# A tibble: 1 × 4
  culmen_length culmen_depth flipper_length body_mass
          <int>        <int>          <int>     <int>
1             2            2              2         2

simplified <- simplified |> filter(if_all(names(simplified), \(x) !is.na(x)))

Standardizing

R can do it pretty efficiently using across and mutate.

simplified |> mutate(across(culmen_length:body_mass, list(std=\(x) (x-mean(x))/sd(x))))

# A tibble: 333 × 11
   species island    sex    culmen_length culmen_depth flipper_length body_mass
   <chr>   <chr>     <chr>          <dbl>        <dbl>          <dbl>     <dbl>
 1 adelie  torgersen male            39.1         18.7            181      3750
 2 adelie  torgersen female          39.5         17.4            186      3800
 3 adelie  torgersen female          40.3         18              195      3250
 4 adelie  torgersen female          36.7         19.3            193      3450
 5 adelie  torgersen male            39.3         20.6            190      3650
 6 adelie  torgersen female          38.9         17.8            181      3625
 7 adelie  torgersen male            39.2         19.6            195      4675
 8 adelie  torgersen female          41.1         17.6            182      3200
 9 adelie  torgersen male            38.6         21.2            191      3800
10 adelie  torgersen male            34.6         21.1            198      4400
# ℹ 323 more rows
# ℹ 4 more variables: culmen_length_std <dbl>, culmen_depth_std <dbl>,
#   flipper_length_std <dbl>, body_mass_std <dbl>

Grouping

simplified |> select(sex:body_mass) |> group_by(sex) |> summarize(across(culmen_length:body_mass,mean))

# A tibble: 2 × 5
  sex    culmen_length culmen_depth flipper_length body_mass
  <chr>          <dbl>        <dbl>          <dbl>     <dbl>
1 female          42.1         16.4           197.     3862.
2 male            45.9         17.9           205.     4546.

by_sex_and_species <- simplified |> group_by(species,sex) |> summarize(across(culmen_length:body_mass,mean))

`summarise()` has grouped output by 'species'. You can override using the
`.groups` argument.

You can select using filter.

by_sex_and_species |> filter(sex=='male')

# A tibble: 3 × 6
# Groups:   species [3]
  species   sex   culmen_length culmen_depth flipper_length body_mass
  <chr>     <chr>         <dbl>        <dbl>          <dbl>     <dbl>
1 adelie    male           40.4         19.1           192.     4043.
2 chinstrap male           51.1         19.3           200.     3939.
3 gentoo    male           49.5         15.7           222.     5485.

Making a pivot table.

by_sex_and_species |>
    select(species, sex, culmen_length) |>
    pivot_wider(names_from = species, values_from = culmen_length)

# A tibble: 2 × 4
  sex    adelie chinstrap gentoo
  <chr>   <dbl>     <dbl>  <dbl>
1 female   37.3      46.6   45.6
2 male     40.4      51.1   49.5

Making a function for each possibility

ptable <- function(value) {
    by_sex_and_species |>
        select(species, sex, {{ value }}) |>
        pivot_wider(names_from = species, values_from = {{ value }})
}
ptable("body_mass")

# A tibble: 2 × 4
  sex    adelie chinstrap gentoo
  <chr>   <dbl>     <dbl>  <dbl>
1 female  3369.     3527.  4680.
2 male    4043.     3939.  5485.