Last active
May 3, 2023 03:49
-
-
Save econandrew/a9930d812eb420b20358 to your computer and use it in GitHub Desktop.
R code for fuzzy sentence matching
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################ ### | |
# FUZZY MATCHING FUNCTIONS #### | |
############################################################################ ### | |
fuzzy_prep_words <- function(words) { | |
# Prepares a list of words for fuzzy matching. All the other fuzzy matching | |
# functions will run word through this. Given a list of sentences, returns | |
# a list of words. | |
words <- unlist(strsplit(tolower(gsub("[[:punct:]]", " ", words)), "\\W+")) | |
return(words) | |
} | |
fuzzy_gen_word_freq <- function(l, fun = identity) { | |
# Returns a word frequency vector based on vector of sentences l and with | |
# frequencies post-processed by fun (e.g. log) | |
fun(sort(table(fuzzy_prep_words(unlist(strsplit(l, ' ')))), decreasing=T))+1 | |
} | |
fuzzy_title_match <- function(a, b, wf) { | |
# Fuzzy matches a performance title based on a custom algorithm tuned for | |
# this purpose. Words are frequency-weighted (like tf-idf). | |
# | |
# Args: | |
# a, b: the two titles to match | |
# wf: a vector of word frequencies as generated by fuzzy_gen_word_freq | |
# | |
# Returns: | |
# A fuzzy match score, higher is better, +Inf for exact match | |
if (a == b) # Shortcut to make faster | |
return (Inf) | |
a.words <- fuzzy_prep_words(a) | |
b.words <- fuzzy_prep_words(b) | |
a.freqs <- sapply(a.words, function(x) { ifelse(is.na(wf[x]), 1, wf[x]) }) | |
b.freqs <- sapply(b.words, function(x) { ifelse(is.na(wf[x]), 1, wf[x]) }) | |
d <- adist(a.words, b.words) | |
a.matches <- 1-apply(d, 1, function(x) { min(x, 2) })/2 | |
b.matches <- 1-apply(d, 2, function(x) { min(x, 2) })/2 | |
matchsum <- min(sum(a.matches * 1/a.freqs), sum(b.matches * 1/b.freqs)) | |
unmatchsum <- sum(floor(1-a.matches) * 1/a.freqs) + sum(floor(1-b.matches) * 1/b.freqs) | |
return(matchsum / unmatchsum) | |
} | |
A <- c( | |
"Plantains, green (large)", | |
"Plantains, yellow", | |
"Plantains, purple (small)", | |
"Beef, minced, lean", | |
"Beef, minced, extra lean", | |
"Beef, steak, filet", | |
"Lamb" | |
) | |
B <- c( | |
"Large Green Plaintains", | |
"Pork", | |
"A yellow plantan", | |
"Lean beef (minced)", | |
"Beef-steak-fillet", | |
"extra lean minced beef" | |
) | |
# Example - outer function needs a vectorised function so there's a little extra work, otherwise this is pretty simple | |
# The scores matrix contains all the pairwise scores. Then it would be a simple matter to pick the best match for each | |
# with details depending on whether there can be multiple matches, whether everything must match, etc. | |
wf <- fuzzy_gen_word_freq(c(A, B)) | |
vectorised_match <- function (L1,L2) { mapply(function(a,b) { fuzzy_title_match(a, b, wf) }, L1, L2) } | |
scores <- outer(A, B, vectorised_match) | |
rownames(scores) <- A | |
colnames(scores) <- B |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Check out @dgrtwo fuzzyjoin::stringdist_join which includes Levenshtein, soundex, etc methods:
https://github.com/dgrtwo/fuzzyjoin/blob/master/R/stringdist_join.R