Last active
September 12, 2019 21:27
-
-
Save tylerlittlefield/389acc5593b6fca3514c2be22ab6f258 to your computer and use it in GitHub Desktop.
Grabbing the context of regular expression matches
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringr) | |
library(dplyr) | |
library(purrr) | |
library(rlang) | |
library(text2vec) # for movie reviews data | |
# Prefer stringr over base R here: str_extract() returns NA for elements that
# are NA or have no match, whereas the base R equivalent below drops those
# elements and can return a zero-length result:
# regmatches(string, regexpr(x, string, ignore.case = ignore_case))
#' Extract a regular-expression match together with its surrounding text
#'
#' Builds the regex `.{0,n_before}(?:pattern).{0,n_after}` and extracts it from
#' `string` with stringr, so each result is the match plus up to `n_before`
#' characters before it and up to `n_after` characters after it.
#'
#' @param string Character vector to search.
#' @param pattern Regular expression to locate. It is wrapped in a
#'   non-capturing group, so alternations such as `"good|bad"` receive context
#'   on both sides of every branch.
#' @param n_before,n_after Maximum number of characters of context to keep
#'   before and after the match.
#' @param ignore_case Passed to `stringr::regex()`; `TRUE` makes the match
#'   case-insensitive.
#' @param first If `TRUE`, extract only the first match of each element with
#'   `stringr::str_extract()`; otherwise extract every match with
#'   `stringr::str_extract_all()`.
#' @return The result of `stringr::str_extract()` when `first = TRUE`, else the
#'   result of `stringr::str_extract_all()`.
pattern_context <- function(string, pattern, n_before = 10, n_after = 10,
                            ignore_case = TRUE, first = TRUE) {
  # Without the (?: ... ) group, "a|b" would expand to ".{0,n}a|b.{0,n}", i.e.
  # "context + a" OR "b + context", losing one side of the context.
  context_regex <- stringr::regex(
    paste0(".{0,", n_before, "}(?:", pattern, ").{0,", n_after, "}"),
    ignore_case = ignore_case
  )
  if (first) {
    stringr::str_extract(string, context_regex)
  } else {
    stringr::str_extract_all(string, context_regex)
  }
}
#' Add `<column>_context` columns to a data frame
#'
#' Applies `pattern_context()` to each selected column of `.data`, names every
#' result `<column>_context`, binds the results onto `.data`, and returns the
#' combined data frame with its columns ordered by name.
#'
#' @param .data A data frame.
#' @param ... Arguments forwarded to `pattern_context()` (e.g. `pattern`,
#'   `n_before`, `n_after`).
#' @param cols Tidyselect specification of the columns to search; defaults to
#'   every column.
#' @return `.data` with one additional `_context` column per selected column,
#'   columns sorted by name.
pattern_context_df <- function(.data, ..., cols = everything()) {
  selected <- dplyr::select(.data, {{ cols }})
  contexts <- dplyr::bind_cols(purrr::map(selected, pattern_context, ...))
  names(contexts) <- paste0(names(contexts), "_context")

  combined <- dplyr::bind_cols(.data, contexts)
  # Sorting by name places each "<col>_context" right after its source "<col>".
  combined[, order(colnames(combined))]
}
# Example: build one context column per pattern inside mutate()
as_tibble(movie_review) %>%
  mutate(
    bad_context = pattern_context(review, "bad"),
    good_context = pattern_context(review, "good")
  ) %>%
  select(review:good_context)
#> # A tibble: 5,000 x 3
#> review bad_context good_context
#> <chr> <chr> <chr>
#> 1 With all this stuff going down at ~ drugs are bad m'~ <NA>
#> 2 "\\\"The Classic War of the Worlds~ <NA> <NA>
#> 3 The film starts with a manager (Ni~ oments is badly ~ ivers the goods w~
#> 4 "It must be assumed that those who~ <NA> ng of the Good Fr~
#> 5 "Superbly trashy and wondrously un~ <NA> <NA>
#> 6 I dont know why people think this ~ is such a bad mo~ " a pretty good p~
#> 7 This movie could have been very go~ "me really bad m~ been very good, b~
#> 8 I watched this video at a friend's~ cience is bad, a~ <NA>
#> 9 A friend of mine bought this film ~ <NA> <NA>
#> 10 "<br /><br />This movie is full of~ <NA> <NA>
#> # ... with 4,990 more rows
# Example: add a context column for every column (the default `cols`)
as_tibble(movie_review) %>%
  select(id, review) %>%
  pattern_context_df("good")
#> # A tibble: 5,000 x 4
#> id id_context review review_context
#> <chr> <chr> <chr> <chr>
#> 1 5814_8 <NA> With all this stuff going down at~ <NA>
#> 2 2381_9 <NA> "\\\"The Classic War of the World~ <NA>
#> 3 7759_3 <NA> The film starts with a manager (N~ ivers the goods wi~
#> 4 3630_4 <NA> "It must be assumed that those wh~ ng of the Good Fri~
#> 5 9495_8 <NA> "Superbly trashy and wondrously u~ <NA>
#> 6 8196_8 <NA> I dont know why people think this~ " a pretty good pl~
#> 7 7166_2 <NA> This movie could have been very g~ been very good, bu~
#> 8 10633~ <NA> I watched this video at a friend'~ <NA>
#> 9 319_1 <NA> A friend of mine bought this film~ <NA>
#> 10 8713_~ <NA> "<br /><br />This movie is full o~ <NA>
#> # ... with 4,990 more rows
# Example: add a context column only for the columns named in `cols`
as_tibble(movie_review) %>%
  select(id, review) %>%
  pattern_context_df("good", cols = review)
#> # A tibble: 5,000 x 3
#> id review review_context
#> <chr> <chr> <chr>
#> 1 5814_8 With all this stuff going down at the mome~ <NA>
#> 2 2381_9 "\\\"The Classic War of the Worlds\\\" by ~ <NA>
#> 3 7759_3 The film starts with a manager (Nicholas B~ ivers the goods wit~
#> 4 3630_4 "It must be assumed that those who praised~ ng of the Good Frid~
#> 5 9495_8 "Superbly trashy and wondrously unpretenti~ <NA>
#> 6 8196_8 I dont know why people think this is such ~ " a pretty good plo~
#> 7 7166_2 This movie could have been very good, but ~ been very good, but~
#> 8 10633_1 I watched this video at a friend's house. ~ <NA>
#> 9 319_1 A friend of mine bought this film for 1, a~ <NA>
#> 10 8713_10 "<br /><br />This movie is full of referen~ <NA>
#> # ... with 4,990 more rows
# Example: add a context column for every column except the excluded ones
as_tibble(movie_review) %>%
  select(id, review) %>%
  pattern_context_df("good", cols = -id)
#> # A tibble: 5,000 x 3
#> id review review_context
#> <chr> <chr> <chr>
#> 1 5814_8 With all this stuff going down at the mome~ <NA>
#> 2 2381_9 "\\\"The Classic War of the Worlds\\\" by ~ <NA>
#> 3 7759_3 The film starts with a manager (Nicholas B~ ivers the goods wit~
#> 4 3630_4 "It must be assumed that those who praised~ ng of the Good Frid~
#> 5 9495_8 "Superbly trashy and wondrously unpretenti~ <NA>
#> 6 8196_8 I dont know why people think this is such ~ " a pretty good plo~
#> 7 7166_2 This movie could have been very good, but ~ been very good, but~
#> 8 10633_1 I watched this video at a friend's house. ~ <NA>
#> 9 319_1 A friend of mine bought this film for 1, a~ <NA>
#> 10 8713_10 "<br /><br />This movie is full of referen~ <NA>
#> # ... with 4,990 more rows
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment