Created
September 6, 2019 19:01
-
-
Save pr130/bc754e20c383ab5e48d4b1f52b7b2575 to your computer and use it in GitHub Desktop.
Find words that are similar to the word in the row directly before them (i.e. near-duplicates in consecutive rows).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Find words that are near-duplicates of the word in the row directly above
# them (e.g. spelling variants of the same species name).

library(stringr)
library(dplyr)
library(stringdist)

# Pairs whose edit distance is strictly below this value count as "similar".
# Raise it to accept pairs that differ more, lower it to be stricter.
dist_threshold <- 2

df <- tibble(word = c("Agrilus pilosivittatus",
                      "Agrilus pilosovittatus",
                      "foo",
                      "bar",
                      "baz",
                      "something else",
                      "some stuff",
                      "some stff"))

# Create a lag variable: lag() shifts the whole column down by one row, so
# each row also carries the word of the row before it. The first row has no
# predecessor and gets NA.
df <- df %>%
  mutate(word_before = lag(word),
         row_num = row_number())

# Look at the new data (see the "shift"?)
df

# Calculate the Levenshtein distance between each word and the word before it.
# The method is spelled out because stringdist() defaults to "osa" (optimal
# string alignment), which is not the same as Levenshtein: it counts a swap of
# two adjacent characters as a single edit.
df <- df %>%
  mutate(dist = stringdist::stringdist(word, word_before, method = "lv"))
df

# Decide which rows to keep based on the Levenshtein distance. filter() drops
# rows whose condition is NA, so the first row (NA distance) never qualifies.
rows_to_keep <- df %>%
  filter(dist < dist_threshold) %>%
  pull(row_num) # get the row_num variable as a vector
rows_to_keep

# We also want the word before each of those rows, so for every row number we
# keep the preceding row as well. c() concatenates the two vectors; unique()
# removes row numbers that show up twice when consecutive rows both qualify.
rows_to_keep     # we want this
rows_to_keep - 1 # "plus" this
all_rows_to_keep <- unique(c(rows_to_keep, rows_to_keep - 1))
sort(all_rows_to_keep) # just to look at it

# Get words!!
df %>%
  filter(row_num %in% all_rows_to_keep) %>%
  select(word)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment