pr130 · September 6, 2019 19:01
diff --git a/find_similar_words.R b/find_similar_words.R
 library(stringr)
 library(dplyr)
 library(stringdist)

 # Example data: each word is compared only with the word on the row directly
 # above it, so near-duplicate spellings must sit on adjacent rows to be found.
 df <- tibble(word = c("Agrilus pilosivittatus",
                       "Agrilus pilosovittatus",
                       "foo",
                       "bar",
                       "baz",
                       "something else",
                       "some stuff",
                       "some stff"))

 # pairs whose distance is strictly below this cutoff count as "similar";
 # adapt here to keep pairs with more / less Levenshtein distance
 dist_cutoff <- 2

 # create a lag variable (this "shifts" the whole variable down by one row)
 # and record each row's position
 df <- df %>%
   mutate(word_before = lag(word)) %>%
   mutate(row_num = row_number())

 # look at the new data (see the "shift"?)
 df

 # calculate the Levenshtein distance between each word and the word before it.
 # method = "lv" is spelled out because stringdist() defaults to "osa"
 # (optimal string alignment), which scores a swap of two adjacent characters
 # as 1 instead of 2 and is therefore not the Levenshtein distance.
 # The first row has no word before it (NA), so its distance is NA.
 df <- df %>%
   mutate(dist = stringdist::stringdist(word, word_before, method = "lv"))

 df

 # decide which rows to keep based on Levenshtein distance
 # (filter() drops rows whose condition is NA, i.e. the first row)
 rows_to_keep <- df %>%
   filter(dist < dist_cutoff) %>%
   pull(row_num) # get the row_num variable as a vector

 rows_to_keep
 # we also want to get the word before each of the rows, so for each row number also keep the row before
 # the c() means we "concatenate" the two vectors

 rows_to_keep # we want this
 rows_to_keep - 1 # "plus" this
 all_rows_to_keep <- c(rows_to_keep, rows_to_keep - 1)

 sort(all_rows_to_keep) # just to look at it

 # get words!!
 df %>%
   filter(row_num %in% all_rows_to_keep) %>%
   select(word)
	library(stringr)
	library(dplyr)
	library(stringdist)

	# Example data: each word is compared only with the word on the row directly
	# above it, so near-duplicate spellings must sit on adjacent rows to be found.
	df <- tibble(word = c("Agrilus pilosivittatus",
	  "Agrilus pilosovittatus",
	  "foo",
	  "bar",
	  "baz",
	  "something else",
	  "some stuff",
	  "some stff"))

	# pairs whose distance is strictly below this cutoff count as "similar";
	# adapt here to keep pairs with more / less Levenshtein distance
	dist_cutoff <- 2

	# create a lag variable (this "shifts" the whole variable down by one row)
	# and record each row's position
	df <- df %>%
	  mutate(word_before = lag(word)) %>%
	  mutate(row_num = row_number())

	# look at the new data (see the "shift"?)
	df

	# calculate the Levenshtein distance between each word and the word before it.
	# method = "lv" is spelled out because stringdist() defaults to "osa"
	# (optimal string alignment), which scores a swap of two adjacent characters
	# as 1 instead of 2 and is therefore not the Levenshtein distance.
	# The first row has no word before it (NA), so its distance is NA.
	df <- df %>%
	  mutate(dist = stringdist::stringdist(word, word_before, method = "lv"))

	df

	# decide which rows to keep based on Levenshtein distance
	# (filter() drops rows whose condition is NA, i.e. the first row)
	rows_to_keep <- df %>%
	  filter(dist < dist_cutoff) %>%
	  pull(row_num) # get the row_num variable as a vector

	rows_to_keep
	# we also want to get the word before each of the rows, so for each row number also keep the row before
	# the c() means we "concatenate" the two vectors

	rows_to_keep # we want this
	rows_to_keep - 1 # "plus" this
	all_rows_to_keep <- c(rows_to_keep, rows_to_keep - 1)

	sort(all_rows_to_keep) # just to look at it

	# get words!!
	df %>%
	  filter(row_num %in% all_rows_to_keep) %>%
	  select(word)