Last active
February 1, 2022 13:01
-
-
Save gdmcdonald/bacfaafe2cccff18b6a81b319cbc3580 to your computer and use it in GitHub Desktop.
Efficient fuzzy match of two data frames by one common string column in R, outputing a list of the matching and non-matching rows
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Efficient fuzzy match of two data frames by one common column | |
library(dplyr) | |
library(fuzzyjoin) | |
library(stringdist) | |
eff_fuzzy_match<-function(data_frame_A, | |
data_frame_B, | |
by_what, | |
choose_p = 0.1, | |
choose_max_dist = 0.4, | |
best_only = TRUE, | |
make_lower_case = TRUE){ | |
#set matching variables to lowercase if make_lower_case is true | |
if (make_lower_case == TRUE){ | |
data_frame_A[,by_what]<-tolower(data_frame_A[,by_what]) | |
data_frame_B[,by_what]<-tolower(data_frame_B[,by_what]) | |
} | |
#take only unique entries | |
data_frame_A = unique(data_frame_A) | |
data_frame_B = unique(data_frame_B) | |
#Merge data frames with dplyr | |
ExactMatches<-inner_join(x = data_frame_A, | |
y = data_frame_B, | |
by = by_what) | |
#Now look at those which didn't match at all | |
NoMatchesA<-anti_join(x = data_frame_A, | |
y = data_frame_B, | |
by = by_what) | |
NoMatchesB<-anti_join(x = data_frame_B, | |
y = data_frame_A, | |
by = by_what) | |
#FuzzyMatch the remaining ones by string distance | |
FuzzyMatch<-stringdist_inner_join(x = NoMatchesA, | |
y = NoMatchesB, | |
by = by_what, | |
method= 'jw', | |
p = choose_p, | |
max_dist = choose_max_dist) | |
#Add a column specifying the string distance | |
FuzzyMatch$stdist<-stringdist(a = FuzzyMatch[,paste0(by_what,".x")], | |
b = FuzzyMatch[,paste0(by_what,".y")], | |
method = 'jw', | |
p = choose_p) | |
#order by smallest string distance first | |
FuzzyMatch <- FuzzyMatch[order(FuzzyMatch$stdist,decreasing=F),] | |
#take the best matching one only if best_only is TRUE | |
if(best_only==TRUE){ | |
FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".x"))]),] | |
FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".y"))]),] | |
} | |
NotMatchedA<-anti_join(x = NoMatchesA, | |
y = FuzzyMatch, | |
by = setNames(paste0(by_what,".x"), | |
by_what)) | |
NotMatchedB<-anti_join(x = NoMatchesB, | |
y = FuzzyMatch, | |
by = setNames(paste0(by_what,".y"), | |
by_what)) | |
completeReturnList = list(exact = ExactMatches, | |
fuzzy = FuzzyMatch, | |
remainderA = NotMatchedA, | |
remainderB = NotMatchedB) | |
return(completeReturnList) | |
} | |
# #testing: | |
# | |
# dfa = data.frame(FirstNames=c("George", | |
# "Ketut", | |
# "Harriet", | |
# "Zhu", | |
# "Sarika", | |
# "Apple")) | |
# dfb = data.frame(FirstNames=c("GeorGe", | |
# "Ketut", | |
# "Harry", | |
# "Z.", | |
# "Rika", | |
# "Marion")) | |
# eff_fuzzy_match(dfa,dfb,by_what = "FirstNames") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment