Skip to content

Instantly share code, notes, and snippets.

@zsiec
Created March 31, 2011 08:23
Show Gist options
  • Save zsiec/896025 to your computer and use it in GitHub Desktop.
Save zsiec/896025 to your computer and use it in GitHub Desktop.
# Mongoid document for an account, with fuzzy duplicate detection.
#
# Every save enqueues a similarity job (and a geocoding job when the billing
# address changed) unless +skip_callbacks+ is set. Similarity results are kept
# in the Redis-style STORE as one hash per record, keyed "<id>:similar", whose
# fields are the other record ids and whose values are
# "<total_score>:<comma separated reason codes>".
class AccountRecord
  require 'levenshtein_damerau'
  include Mongoid::Document
  extend Mongoid::Geo::Near

  field :salesforce
  field :name
  field :phone
  field :billing_address
  field :name_soundex
  field :phone_compressed
  field :location, :type => Array, :geo => true
  field :dismissed, :type => Boolean, :default => false

  geo_index :location

  # These "validations" only derive the normalised fields; they add no errors.
  validate :generate_soundex
  validate :compress_phone

  after_save :queue_geocoding
  after_save :queue_similarity

  # When truthy, the after_save hooks enqueue nothing.
  attr_accessor :skip_callbacks

  # Upper bound of every individual (unweighted) score.
  GLOBAL_SCORE_SCALE = 10

  # Edit distance at or above which a string comparison scores 0.
  # We can itemize this per field if that becomes necessary.
  GLOBAL_DISTANCE_THRESHOLD = 10

  # Weights applied to the individual scores.
  NAME_DISTANCE_WEIGHT = 3
  NAME_SOUNDEX_DISTANCE_WEIGHT = 2
  PHONE_DISTANCE_WEIGHT = 1
  PROXIMITY_WEIGHT = 2

  # The best total score a pair of records can reach.
  MAX_SCORE = ((GLOBAL_SCORE_SCALE * NAME_DISTANCE_WEIGHT) + (GLOBAL_SCORE_SCALE * NAME_SOUNDEX_DISTANCE_WEIGHT) + (GLOBAL_SCORE_SCALE * PHONE_DISTANCE_WEIGHT) + (GLOBAL_SCORE_SCALE * PROXIMITY_WEIGHT))

  # Reason codes: criterion => [numeric code kept in STORE, display text].
  REASON_CODES = { :location => [1, "Due to physical proximity"], :name => [2, "Due to the names being similar"], :name_soundex => [3, "Due to the names sounding similar"], :phone => [4, "Due to the phone numbers being similar"] }

  # Scores this record against every AccountRecord and writes each pair that
  # reaches the minimum score into STORE, in both directions.
  #
  # percentage - how far below MAX_SCORE (in percent) a pair may score and
  #              still be recorded (default 20).
  #
  # Returns whatever AccountRecord.all.each returns.
  def aggregate_similar_records(percentage = 20)
    close_to_me =
      if location.present?
        self.class.only(:id).where(:location.within => { "$center" => [ [ location.first, location.last ], 0.05 ] })
      else
        []
      end
    min_score = get_min_score(percentage)
    own_key = "#{id}:similar"

    AccountRecord.all.each do |r|
      # A record is never stored as similar to itself, so skip the scoring too.
      next if id == r.id

      # Compute the individual scores.
      name_match_score = distance_score(r.name, name)
      name_soundex_match_score = distance_score(r.name_soundex, name_soundex)
      phone_match_score = distance_score(r.phone_compressed, phone_compressed)
      proximity_match_score = location_score(close_to_me, r)

      # Compute the weighted total.
      total_score = (name_match_score * NAME_DISTANCE_WEIGHT) +
                    (name_soundex_match_score * NAME_SOUNDEX_DISTANCE_WEIGHT) +
                    (phone_match_score * PHONE_DISTANCE_WEIGHT) +
                    (proximity_match_score * PROXIMITY_WEIGHT)

      # Is the pair worth considering at all?
      next unless total_score.to_i >= min_score.to_i

      # If so, record why.
      reasons = []
      reasons.push REASON_CODES[:name][0] if name_match_score > 0
      reasons.push REASON_CODES[:name_soundex][0] if name_soundex_match_score > 0
      reasons.push REASON_CODES[:phone][0] if phone_match_score > 0
      reasons.push REASON_CODES[:location][0] if proximity_match_score > 0
      entry = "#{total_score}:#{reasons.join(",")}"

      # Add to the redis store, both directions.
      # NOTE(review): hgetall presumably returns String keys; if ids are not
      # Strings these guards never match and entries are always overwritten.
      # Confirm the intended behaviour before changing the lookup.
      other_key = "#{r.id}:similar"
      STORE.hset own_key, r.id, entry unless STORE.hgetall(own_key).has_key?(r.id)
      STORE.hset other_key, id, entry unless STORE.hgetall(other_key).has_key?(id)
    end
    #Old absolute tests
    #same_name = self.class.only(:id).where(:name_soundex => name_soundex)
    #same_phone = self.class.only(:id).where(:phone_compressed => phone_compressed)
    #metaphone and physical distance
    #results = (close_to_me + same_name + same_phone).map(&:id).uniq - [id]
    #results.each do |other_id|
    # STORE.sadd "#{id}:similar", other_id unless STORE.sismember("#{id}:similar", other_id)
    # STORE.sadd "#{other_id}:similar", id unless STORE.sismember("#{other_id}:similar", id)
    #end
  end

  # Proximity score for +record+: all or nothing, no granularity.
  #
  # results - collection of nearby records (responds to any? and include?).
  # record  - the candidate being scored.
  #
  # Returns GLOBAL_SCORE_SCALE when +record+ is in +results+, otherwise 0.
  def location_score(results, record)
    return 0 unless results.any?
    # The original tested an undefined local `r` here, raising NameError
    # whenever +results+ was non-empty.
    results.include?(record) ? GLOBAL_SCORE_SCALE : 0
  end

  # Converts the edit distance between two strings into a score.
  #
  # The score falls linearly from GLOBAL_SCORE_SCALE (distance 0) to 0
  # (distance >= GLOBAL_DISTANCE_THRESHOLD), rounded up and never negative.
  # Returns 0 when either value is blank.
  #
  # NOTE(review): +distance+ is not defined in this class; presumably it is
  # supplied by the levenshtein_damerau library - confirm.
  def distance_score(a, b)
    return 0 if a.blank? || b.blank?
    remaining = (GLOBAL_DISTANCE_THRESHOLD - distance(a, b)).to_f
    score = ((remaining / GLOBAL_DISTANCE_THRESHOLD.to_f) * GLOBAL_SCORE_SCALE.to_f).ceil
    [score, 0].max
  end

  # Lowest total score that still counts as a match: MAX_SCORE lowered by
  # +percentage+ percent of MAX_SCORE (the reduction is rounded up).
  def get_min_score(percentage)
    MAX_SCORE - (MAX_SCORE * (percentage.to_f / 100)).ceil
  end

  # Returns the ids (as stored in STORE) of the records similar to this one.
  def similar_records
    STORE.hgetall("#{id}:similar").keys
    #self.class.find(STORE.("#{id}:similar"))
  end

  # Returns the display texts explaining why +other_id+ was matched with this
  # record; an empty array when no match is stored or no reason was recorded.
  def reasons_for_match(other_id)
    stored = STORE.hget("#{id}:similar", other_id.to_s)
    return [] if stored.nil?

    # Stored as "<score>:<codes>". The codes part is missing when no reason
    # applied, in which case split leaves nil at index 1.
    codes = stored.split(":")[1].to_s.split(",")

    reason_texts = []
    codes.each do |code|
      REASON_CODES.each_value do |reason_code, reason_text|
        reason_texts.push reason_text if reason_code.to_i == code.to_i
      end
    end
    reason_texts
  end

  private

  # after_save hook: enqueue geocoding when the billing address changed.
  def queue_geocoding
    return if skip_callbacks
    Resque.enqueue(Job::Geocode, id) if billing_address_changed?
  end

  # after_save hook: enqueue the similarity job for this record.
  def queue_similarity
    return if skip_callbacks
    Resque.enqueue(Job::Similarity, id)
  end

  # Derives name_soundex from name via Text::Metaphone, falling back to the
  # raw name when no code is produced.
  def generate_soundex
    return unless name.present?
    self.name_soundex = Text::Metaphone.metaphone(name)
    self.name_soundex ||= name
  end

  # Derives phone_compressed by stripping everything but the digits 0-9.
  def compress_phone
    return unless phone.present?
    self.phone_compressed = phone.gsub(/[^0-9]/, '')
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment