econandrew · May 3, 2023 03:49 · ahcyip · Jun 6, 2017
diff --git a/fuzzymatch.R b/fuzzymatch.R
 ############################################################################ ###
 # FUZZY MATCHING FUNCTIONS                                                  ####
 ############################################################################ ###

 fuzzy_prep_words <- function(words) {
   # Splits sentences into lower-case words ready for fuzzy matching. All the
   # other fuzzy matching functions run their text through this.
   #
   # Args:
   #   words: character vector of sentences (titles)
   #
   # Returns:
   #   A character vector of words pooled over all sentences, with punctuation
   #   removed; character(0) when no words are found.

   # Punctuation becomes a space first so "beef-steak" splits into two words.
   cleaned <- tolower(gsub("[[:punct:]]", " ", words))
   tokens <- unlist(strsplit(cleaned, "\\W+"))
   if (is.null(tokens)) {
     return(character(0))
   }
   # strsplit() emits a leading "" when a sentence starts with a separator
   # (e.g. "(minced) beef"); such empty tokens are not words. nzchar(NA) is
   # TRUE, so missing values still pass through unchanged.
   tokens[nzchar(tokens)]
 }

 fuzzy_gen_word_freq <- function(l, fun = identity) {
   # Builds the word-frequency table used to weight words in
   # fuzzy_title_match.
   #
   # Args:
   #   l: character vector of sentences (titles)
   #   fun: function applied to the sorted count table before 1 is added
   #        (e.g. log, to dampen very common words)
   #
   # Returns:
   #   fun(counts) + 1 as a table named by word, sorted by decreasing count.

   # fuzzy_prep_words() already splits on every run of non-word characters,
   # so no separate split on spaces is needed beforehand.
   words <- fuzzy_prep_words(l)
   # Empty tokens (left over when a sentence starts with punctuation) are not
   # words and must not get a frequency entry.
   counts <- sort(table(words[nzchar(words)]), decreasing = TRUE)
   fun(counts) + 1
 }

 fuzzy_title_match <- function(a, b, wf) {
   # Fuzzy matches two titles with a custom word-level algorithm. Every word
   # is paired with its closest word (edit distance) in the other title and
   # weighted by the inverse of its frequency (like tf-idf), so rare words
   # count for more than common ones.
   #
   # Args:
   #   a, b: the two titles to match (single strings)
   #   wf: a vector of word frequencies as generated by fuzzy_gen_word_freq;
   #       words missing from it get frequency 1
   #
   # Returns:
   #   A fuzzy match score, higher is better. +Inf for an exact match or when
   #   every word of both titles has a counterpart within edit distance 1,
   #   0 when either title contains no words, NA when either title is NA.

   if (anyNA(a) || anyNA(b)) {
     return(NA_real_)
   }
   if (isTRUE(a == b)) { # Shortcut to make faster
     return(Inf)
   }

   a.words <- fuzzy_prep_words(a)
   b.words <- fuzzy_prep_words(b)
   # Empty tokens (from leading punctuation) are not words; keeping them
   # would add a bogus unmatched word to the score.
   a.words <- a.words[nzchar(a.words)]
   b.words <- b.words[nzchar(b.words)]
   # Nothing to compare: avoids a 0-extent distance matrix and 0/0 below.
   if (length(a.words) == 0 || length(b.words) == 0) {
     return(0)
   }

   # Inverse-frequency weight per word; unknown words default to frequency 1.
   word_weights <- function(words) {
     freqs <- as.numeric(wf[words])
     freqs[is.na(freqs)] <- 1
     1 / freqs
   }
   a.weights <- word_weights(a.words)
   b.weights <- word_weights(b.words)

   # Closest-counterpart distance, capped at 2, mapped to a match strength:
   # distance 0 -> 1, distance 1 -> 0.5, distance >= 2 -> 0.
   d <- adist(a.words, b.words)
   a.matches <- 1 - pmin(apply(d, 1, min), 2) / 2
   b.matches <- 1 - pmin(apply(d, 2, min), 2) / 2

   matchsum <- min(sum(a.matches * a.weights), sum(b.matches * b.weights))
   # floor() keeps only the completely unmatched words (strength 0).
   unmatchsum <- sum(floor(1 - a.matches) * a.weights) +
     sum(floor(1 - b.matches) * b.weights)
   matchsum / unmatchsum
 }

 # Two lists naming roughly the same items with different wording, word order
 # and spelling.
 A <- c(
   "Plantains, green (large)",
   "Plantains, yellow",
   "Plantains, purple (small)",
   "Beef, minced, lean",
   "Beef, minced, extra lean",
   "Beef, steak, filet",
   "Lamb"
 )

 B <- c(
   "Large Green Plaintains",
   "Pork",
   "A yellow plantan",
   "Lean beef (minced)",
   "Beef-steak-fillet",
   "extra lean minced beef"
 )

 # Example: score every title in A against every title in B. outer() hands
 # its function two equal-length vectors holding all the pairings, so the
 # scalar matcher is wrapped with mapply() to walk them element by element.
 # The resulting matrix holds all pairwise scores (rows = A, columns = B);
 # choosing the best match per item is then straightforward, with details
 # depending on whether multiple matches are allowed, whether everything
 # must match, etc.

 wf <- fuzzy_gen_word_freq(c(A, B))
 vectorised_match <- function(L1, L2) {
   mapply(fuzzy_title_match, L1, L2, MoreArgs = list(wf = wf))
 }
 scores <- outer(A, B, vectorised_match)
 dimnames(scores) <- list(A, B)
	############################################################################ ###
	# FUZZY MATCHING FUNCTIONS ####
	############################################################################ ###

	fuzzy_prep_words <- function(words) {
	  # Splits sentences into lower-case words ready for fuzzy matching. All the
	  # other fuzzy matching functions run their text through this.
	  #
	  # Args:
	  #   words: character vector of sentences (titles)
	  #
	  # Returns:
	  #   A character vector of words pooled over all sentences, with punctuation
	  #   removed; character(0) when no words are found.

	  # Punctuation becomes a space first so "beef-steak" splits into two words.
	  cleaned <- tolower(gsub("[[:punct:]]", " ", words))
	  tokens <- unlist(strsplit(cleaned, "\\W+"))
	  if (is.null(tokens)) {
	    return(character(0))
	  }
	  # strsplit() emits a leading "" when a sentence starts with a separator
	  # (e.g. "(minced) beef"); such empty tokens are not words. nzchar(NA) is
	  # TRUE, so missing values still pass through unchanged.
	  tokens[nzchar(tokens)]
	}

	fuzzy_gen_word_freq <- function(l, fun = identity) {
	  # Builds the word-frequency table used to weight words in
	  # fuzzy_title_match.
	  #
	  # Args:
	  #   l: character vector of sentences (titles)
	  #   fun: function applied to the sorted count table before 1 is added
	  #        (e.g. log, to dampen very common words)
	  #
	  # Returns:
	  #   fun(counts) + 1 as a table named by word, sorted by decreasing count.

	  # fuzzy_prep_words() already splits on every run of non-word characters,
	  # so no separate split on spaces is needed beforehand.
	  words <- fuzzy_prep_words(l)
	  # Empty tokens (left over when a sentence starts with punctuation) are not
	  # words and must not get a frequency entry.
	  counts <- sort(table(words[nzchar(words)]), decreasing = TRUE)
	  fun(counts) + 1
	}

	fuzzy_title_match <- function(a, b, wf) {
	  # Fuzzy matches two titles with a custom word-level algorithm. Every word
	  # is paired with its closest word (edit distance) in the other title and
	  # weighted by the inverse of its frequency (like tf-idf), so rare words
	  # count for more than common ones.
	  #
	  # Args:
	  #   a, b: the two titles to match (single strings)
	  #   wf: a vector of word frequencies as generated by fuzzy_gen_word_freq;
	  #       words missing from it get frequency 1
	  #
	  # Returns:
	  #   A fuzzy match score, higher is better. +Inf for an exact match or when
	  #   every word of both titles has a counterpart within edit distance 1,
	  #   0 when either title contains no words, NA when either title is NA.

	  if (anyNA(a) || anyNA(b)) {
	    return(NA_real_)
	  }
	  if (isTRUE(a == b)) { # Shortcut to make faster
	    return(Inf)
	  }

	  a.words <- fuzzy_prep_words(a)
	  b.words <- fuzzy_prep_words(b)
	  # Empty tokens (from leading punctuation) are not words; keeping them
	  # would add a bogus unmatched word to the score.
	  a.words <- a.words[nzchar(a.words)]
	  b.words <- b.words[nzchar(b.words)]
	  # Nothing to compare: avoids a 0-extent distance matrix and 0/0 below.
	  if (length(a.words) == 0 || length(b.words) == 0) {
	    return(0)
	  }

	  # Inverse-frequency weight per word; unknown words default to frequency 1.
	  word_weights <- function(words) {
	    freqs <- as.numeric(wf[words])
	    freqs[is.na(freqs)] <- 1
	    1 / freqs
	  }
	  a.weights <- word_weights(a.words)
	  b.weights <- word_weights(b.words)

	  # Closest-counterpart distance, capped at 2, mapped to a match strength:
	  # distance 0 -> 1, distance 1 -> 0.5, distance >= 2 -> 0.
	  d <- adist(a.words, b.words)
	  a.matches <- 1 - pmin(apply(d, 1, min), 2) / 2
	  b.matches <- 1 - pmin(apply(d, 2, min), 2) / 2

	  matchsum <- min(sum(a.matches * a.weights), sum(b.matches * b.weights))
	  # floor() keeps only the completely unmatched words (strength 0).
	  unmatchsum <- sum(floor(1 - a.matches) * a.weights) +
	    sum(floor(1 - b.matches) * b.weights)
	  matchsum / unmatchsum
	}

	# Two lists naming roughly the same items with different wording, word order
	# and spelling.
	A <- c(
	  "Plantains, green (large)",
	  "Plantains, yellow",
	  "Plantains, purple (small)",
	  "Beef, minced, lean",
	  "Beef, minced, extra lean",
	  "Beef, steak, filet",
	  "Lamb"
	)

	B <- c(
	  "Large Green Plaintains",
	  "Pork",
	  "A yellow plantan",
	  "Lean beef (minced)",
	  "Beef-steak-fillet",
	  "extra lean minced beef"
	)

	# Example: score every title in A against every title in B. outer() hands
	# its function two equal-length vectors holding all the pairings, so the
	# scalar matcher is wrapped with mapply() to walk them element by element.
	# The resulting matrix holds all pairwise scores (rows = A, columns = B);
	# choosing the best match per item is then straightforward, with details
	# depending on whether multiple matches are allowed, whether everything
	# must match, etc.

	wf <- fuzzy_gen_word_freq(c(A, B))
	vectorised_match <- function(L1, L2) {
	  mapply(fuzzy_title_match, L1, L2, MoreArgs = list(wf = wf))
	}
	scores <- outer(A, B, vectorised_match)
	dimnames(scores) <- list(A, B)