Regexps in R

Fundamentals of Data Science

Author

Jeremy Teitelbaum

Regexps in R

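The regular expression syntax is the same, but the wrapper functions are different: in R we use the `str_*` functions from stringr, which is loaded as part of the tidyverse.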

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# A literal pattern: only the strings that contain "Hel" are displayed,
# with the matched text highlighted.
c("Help", "Hero", "Hello", "Friend") |> str_view("Hel")

[1] │ <Hel>p
[3] │ <Hel>lo

# `.*` is greedy, so the match extends to the last "l" it can reach
# (compare "Hel" in "Help" with "Hell" in "Hello").
str_view(
    string = c("Help", "Hero", "Hello", "Friend"),
    pattern = "He.*l"
)

[1] │ <Hel>p
[3] │ <Hell>o

# Highlight every run of ASCII letters in the sentence.
# A trailing `[a-z]?` would add nothing here: the greedy `+` already consumes
# every consecutive letter, so an optional letter after it could only ever
# match the empty string.
str_view(c("Now is the time for us to rise up"), "[A-Za-z]+")

[1] │ <Now> <is> <the> <time> <for> <us> <to> <rise> <up>

# str_detect() gives one logical value per input string: does it contain
# at least one word (`\w+` between word boundaries)?
c("Now is the", "time to rise up") |> str_detect("\\b\\w+\\b")

[1] TRUE TRUE

# Same word pattern, but str_view() shows every match inside each string.
str_view(
    string = c("Now is the", "time to rise up"),
    pattern = "\\b\\w+\\b"
)

[1] │ <Now> <is> <the>
[2] │ <time> <to> <rise> <up>

# str_count() reports how many matches (here: words) each string contains.
c("Now is the", "time to rise up") |> str_count("\\b\\w+\\b")

[1] 3 4

# Read the text as a character vector with one element per line.
gettysburg <- read_lines(file = "data/gettysburg.txt")
# str_extract() keeps only the first match in each element; elements with
# no match (presumably the blank lines between paragraphs) come back as NA.
gettysburg |> str_extract(pattern = "\\b(\\w+)\\b")

[1] "Four" NA     "Now"  NA     "But"

# str_match_all() returns a list (one entry per input string), which is
# awkward to work with directly.
words <- gettysburg[1] |> str_match_all("\\b\\w+\\b")

filenames <- read_lines(file = "data/filenames.txt")
# Two capture groups: an ID made of three lowercase letters followed by
# five digits, and the file extension.
matches <- str_match_all(
    string = filenames,
    pattern = ".*_([a-z]{3}[0-9]{5})_.*\\.(qmd|Rmd|pdf)"
)

# separate_wider_regex() splits a tibble column using regular expressions
# (see also separate_wider_delim(), its delimiter-based counterpart)

filenames <- tibble(names = read_lines("data/filenames.txt"))
# Only the named pieces (netid, extension) become new columns; the unnamed
# pieces are matched but not kept. cols_remove = FALSE retains `names`.
filenames <- separate_wider_regex(
    filenames,
    names,
    patterns = c(
        ".*_",
        netid = "[a-z]{3}[0-9]{5}",
        "_.*\\.",
        extension = "qmd|Rmd|pdf"
    ),
    cols_remove = FALSE
)
# Taken together, the patterns must match each file name in full.
# If some names fail to match (for example after dropping pdf from
# extension), pass too_few = "debug" to get diagnostic information.
mutate(filenames, new_name = str_c(netid, ".", extension))

# A tibble: 40 × 4
   netid    extension names                                             new_name
   <chr>    <chr>     <chr>                                             <chr>   
 1 aft85126 qmd       HW2 - R - QMD_aft85126_attempt_2023-09-24-18-40-… aft8512…
 2 pez35105 qmd       HW2 - R - QMD_pez35105_attempt_2023-09-23-23-21-… pez3510…
 3 qty84085 pdf       HW2 - R - QMD_qty84085_attempt_2023-09-23-23-21-… qty8408…
 4 min29847 qmd       HW2 - R - QMD_min29847_attempt_2023-09-24-00-57-… min2984…
 5 imk48906 qmd       HW2 - R - QMD_imk48906_attempt_2023-09-24-13-30-… imk4890…
 6 uwc08078 qmd       HW2 - R - QMD_uwc08078_attempt_2023-09-24-00-03-… uwc0807…
 7 kld62064 Rmd       HW2 - R - QMD_kld62064_attempt_2023-09-24-18-53-… kld6206…
 8 mnr42924 qmd       HW2 - R - QMD_mnr42924_attempt_2023-09-24-22-44-… mnr4292…
 9 kzs45796 qmd       HW2 - R - QMD_kzs45796_attempt_2023-09-12-11-29-… kzs4579…
10 vhy10473 qmd       HW2 - R - QMD_vhy10473_attempt_2023-09-24-22-34-… vhy1047…
# ℹ 30 more rows