Skip to content

Instantly share code, notes, and snippets.

@njtierney
Created July 17, 2025 00:59
Show Gist options
  • Save njtierney/fa265bb558d255cc22bbbb188e65aedf to your computer and use it in GitHub Desktop.
Save njtierney/fa265bb558d255cc22bbbb188e65aedf to your computer and use it in GitHub Desktop.
library(tidyverse)

# Build one 0/1 indicator ("dummy") vector per distinct non-missing value.
#
# vec:      an atomic vector (e.g. a character column) to encode.
# col_name: optional prefix for the output names. When NULL, the expression
#           supplied as `vec` is used, so `create_dummies(x)` gives `x_<value>`.
#
# Returns a named list with one numeric vector per distinct non-missing value
# of `vec`. Each vector has the same length as `vec` and is 1 where `vec`
# equals that value and 0 everywhere else (including where `vec` is NA).
# An empty or all-NA `vec` returns an empty named list.
create_dummies <- function(vec, col_name = NULL) {
  # %||% is the "null pipe", this is the equivalent of:
  # if (is.null(col_name)) {
  #   col_name <- deparse1(substitute(vec))
  # }
  # deparse1() always yields a single string; plain deparse() splits long
  # expressions into several strings, which would be recycled across the
  # output names and give inconsistent prefixes.
  col_name <- col_name %||% deparse1(substitute(vec))

  # Drop NAs by subsetting rather than na.omit(), so the values we iterate
  # over do not carry "na.action"/class attributes.
  unique_vals <- unique(vec[!is.na(vec)])

  # Nothing to encode: return early, because paste0() below would still
  # produce one name ("<col_name>_") and setNames() would then fail on the
  # length mismatch.
  if (length(unique_vals) == 0L) {
    return(setNames(list(), character(0)))
  }

  dummy_list <- lapply(
    unique_vals,
    # %in% never returns NA, so missing entries of `vec` come out as 0
    \(x) as.numeric(vec %in% x)
  )

  # set the names to be named after the unique values
  # this could also be converted into a number sequence with
  # seq_along(unique_vals) instead of "unique_vals" here
  # setNames(dummy_list, paste0(col_name, "_", seq_along(unique_vals)))
  setNames(dummy_list, paste0(col_name, "_", unique_vals))
}

x <- c("1", NA, "2", NA, "3")
# with no col_name given, the prefix is taken from the variable name ("x");
# NA entries get 0 in every dummy vector
create_dummies(x)
#> $x_1
#> [1] 1 0 0 0 0
#> 
#> $x_2
#> [1] 0 0 1 0 0
#> 
#> $x_3
#> [1] 0 0 0 0 1
# the prefix follows whatever the variable happens to be called
any_name <- c("1", NA, "2", NA, "3")
create_dummies(any_name)
#> $any_name_1
#> [1] 1 0 0 0 0
#> 
#> $any_name_2
#> [1] 0 0 1 0 0
#> 
#> $any_name_3
#> [1] 0 0 0 0 1
# passing col_name explicitly overrides the variable name as the prefix
create_dummies(x, "zz")
#> $zz_1
#> [1] 1 0 0 0 0
#> 
#> $zz_2
#> [1] 0 0 1 0 0
#> 
#> $zz_3
#> [1] 0 0 0 0 1

# Append dummy (0/1) columns for the selected columns of a data frame.
#
# data: a data frame or tibble.
# cols: tidyselect specification of the columns to encode; defaults to every
#       character column.
#
# Returns `data` with the indicator vectors produced by create_dummies() for
# each selected column bound on as extra columns.
add_dummies <- function(data, cols = where(is.character)) {
  # resolve the tidyselect spec to plain column names
  selected <- names(select(data, {{ cols }}))

  # one named list of indicator vectors per selected column,
  # prefixed with that column's name
  per_column <- map(selected, \(nm) create_dummies(data[[nm]], nm))

  # collapse the list-of-lists to a single level, then attach as new columns
  bind_cols(data, list_flatten(per_column))
}

# Example data: four character columns; `x` and `x2` contain missing values
dat <- tibble(
  x = c("1", NA, "2", NA, "3"),
  x2 = c("1", NA, "1", NA, "3"),
  y = c("A", "B", "A", "B", "C"),
  w = c("X", "Y", "X", "Y", "Z")
)

# with no `cols` argument, every character column is encoded
dat |> 
  add_dummies()
#> # A tibble: 5 × 15
#>   x     x2    y     w       x_1   x_2   x_3  x2_1  x2_3   y_A   y_B   y_C   w_X
#>   <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1     1     A     X         1     0     0     1     0     1     0     0     1
#> 2 <NA>  <NA>  B     Y         0     0     0     0     0     0     1     0     0
#> 3 2     1     A     X         0     1     0     1     0     1     0     0     1
#> 4 <NA>  <NA>  B     Y         0     0     0     0     0     0     1     0     0
#> 5 3     3     C     Z         0     0     1     0     1     0     0     1     0
#> # ℹ 2 more variables: w_Y <dbl>, w_Z <dbl>

# encode only the named columns (any tidyselect expression works here)
dat |> 
  add_dummies(c(x, y))
#> # A tibble: 5 × 10
#>   x     x2    y     w       x_1   x_2   x_3   y_A   y_B   y_C
#>   <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1     1     A     X         1     0     0     1     0     0
#> 2 <NA>  <NA>  B     Y         0     0     0     0     1     0
#> 3 2     1     A     X         0     1     0     1     0     0
#> 4 <NA>  <NA>  B     Y         0     0     0     0     1     0
#> 5 3     3     C     Z         0     0     1     0     0     1

# a single column: rows where x2 is NA get 0 in every dummy
dat |> 
  add_dummies(c(x2))
#> # A tibble: 5 × 6
#>   x     x2    y     w      x2_1  x2_3
#>   <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 1     1     A     X         1     0
#> 2 <NA>  <NA>  B     Y         0     0
#> 3 2     1     A     X         1     0
#> 4 <NA>  <NA>  B     Y         0     0
#> 5 3     3     C     Z         0     1

Created on 2025-07-17 with reprex v2.1.1

Session info

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.5.1 (2025-06-13)
#>  os       macOS Sonoma 14.5
#>  system   aarch64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Australia/Brisbane
#>  date     2025-07-17
#>  pandoc   3.4 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
#>  quarto   1.7.31 @ /usr/local/bin/quarto
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package      * version    date (UTC) lib source
#>  cli            3.6.5      2025-04-23 [1] CRAN (R 4.5.0)
#>  digest         0.6.37     2024-08-19 [1] CRAN (R 4.5.0)
#>  dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.5.0)
#>  evaluate       1.0.4      2025-06-18 [1] CRAN (R 4.5.0)
#>  farver         2.1.2      2024-05-13 [1] CRAN (R 4.5.0)
#>  fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.5.0)
#>  forcats      * 1.0.0      2023-01-29 [1] CRAN (R 4.5.0)
#>  fs             1.6.6      2025-04-12 [1] CRAN (R 4.5.0)
#>  generics       0.1.4      2025-05-09 [1] CRAN (R 4.5.0)
#>  ggplot2      * 3.5.2      2025-04-09 [1] CRAN (R 4.5.0)
#>  glue           1.8.0      2024-09-30 [1] CRAN (R 4.5.0)
#>  gtable         0.3.6      2024-10-25 [1] CRAN (R 4.5.0)
#>  hms            1.1.3      2023-03-21 [1] CRAN (R 4.5.0)
#>  htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.5.0)
#>  knitr          1.50       2025-03-16 [1] CRAN (R 4.5.0)
#>  lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.5.0)
#>  lubridate    * 1.9.4      2024-12-08 [1] CRAN (R 4.5.0)
#>  magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.5.0)
#>  pillar         1.11.0     2025-07-04 [1] CRAN (R 4.5.0)
#>  pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.5.0)
#>  purrr        * 1.0.4.9000 2025-05-04 [1] Github (tidyverse/purrr@9c8beb4)
#>  R6             2.6.1      2025-02-15 [1] CRAN (R 4.5.0)
#>  RColorBrewer   1.1-3      2022-04-03 [1] CRAN (R 4.5.0)
#>  readr        * 2.1.5      2024-01-10 [1] CRAN (R 4.5.0)
#>  reprex         2.1.1      2024-07-06 [1] CRAN (R 4.5.0)
#>  rlang          1.1.6      2025-04-11 [1] CRAN (R 4.5.0)
#>  rmarkdown      2.29       2024-11-04 [1] CRAN (R 4.5.0)
#>  rstudioapi     0.17.1     2024-10-22 [1] CRAN (R 4.5.0)
#>  scales         1.4.0      2025-04-24 [1] CRAN (R 4.5.0)
#>  sessioninfo    1.2.3      2025-02-05 [1] CRAN (R 4.5.0)
#>  stringi        1.8.7      2025-03-27 [1] CRAN (R 4.5.0)
#>  stringr      * 1.5.1      2023-11-14 [1] CRAN (R 4.5.0)
#>  tibble       * 3.3.0      2025-06-08 [1] CRAN (R 4.5.0)
#>  tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.5.0)
#>  tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.5.0)
#>  tidyverse    * 2.0.0      2023-02-22 [1] CRAN (R 4.5.0)
#>  timechange     0.3.0      2024-01-18 [1] CRAN (R 4.5.0)
#>  tzdb           0.5.0      2025-03-15 [1] CRAN (R 4.5.0)
#>  utf8           1.2.6      2025-06-08 [1] CRAN (R 4.5.0)
#>  vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.5.0)
#>  withr          3.0.2      2024-10-28 [1] CRAN (R 4.5.0)
#>  xfun           0.52       2025-04-02 [1] CRAN (R 4.5.0)
#>  yaml           2.3.10     2024-07-26 [1] CRAN (R 4.5.0)
#> 
#>  [1] /Users/nick_1/Library/R/arm64/4.5/library
#>  [2] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
#>  * ── Packages attached to the search path.
#> 
#> ──────────────────────────────────────────────────────────────────────────────
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment