Created
March 31, 2011 08:23
-
-
Save zsiec/896025 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mongoid document for an account, plus a weighted "possible duplicate"
# scoring pass whose results are cached in the Redis-style STORE.
#
# For every other AccountRecord a score is computed from four signals
# (name edit distance, phonetic-name edit distance, phone edit distance and
# physical proximity). Pairs scoring at or above a threshold are written to
# the hash "<id>:similar" as  other_id => "<total_score>:<reason,codes>".
class AccountRecord
  require 'levenshtein_damerau'

  include Mongoid::Document
  extend Mongoid::Geo::Near

  field :salesforce
  field :name
  field :phone
  field :billing_address
  field :name_soundex
  field :phone_compressed
  field :location, :type => Array, :geo => true
  field :dismissed, :type => Boolean, :default => false

  geo_index :location

  # NOTE(review): these validate hooks never add errors; they are used to
  # derive name_soundex / phone_compressed before the document is saved.
  validate :generate_soundex
  validate :compress_phone

  after_save :queue_geocoding
  after_save :queue_similarity

  # Set to a truthy value to suppress the after_save Resque jobs.
  attr_accessor :skip_callbacks

  # Each individual signal is scored on a 0..GLOBAL_SCORE_SCALE scale.
  GLOBAL_SCORE_SCALE = 10

  # Edit distance at (or beyond) which a string comparison scores 0.
  # A single global threshold for now; it can be itemized per field if needed.
  GLOBAL_DISTANCE_THRESHOLD = 10

  # Weights applied to each signal when summing the total score.
  NAME_DISTANCE_WEIGHT = 3
  NAME_SOUNDEX_DISTANCE_WEIGHT = 2
  PHONE_DISTANCE_WEIGHT = 1
  PROXIMITY_WEIGHT = 2

  # Best total score a pair can reach (every signal at full scale).
  MAX_SCORE = GLOBAL_SCORE_SCALE * (NAME_DISTANCE_WEIGHT +
                                    NAME_SOUNDEX_DISTANCE_WEIGHT +
                                    PHONE_DISTANCE_WEIGHT +
                                    PROXIMITY_WEIGHT)

  # Reason codes: signal => [numeric code stored in STORE, display text].
  REASON_CODES = {
    :location     => [1, "Due to physical proximity"],
    :name         => [2, "Due to the names being similar"],
    :name_soundex => [3, "Due to the names sounding similar"],
    :phone        => [4, "Due to the phone numbers being similar"]
  }.freeze

  # Numeric reason code => display text, derived from REASON_CODES.
  REASON_TEXT_BY_CODE = Hash[REASON_CODES.values].freeze

  # Scores this record against every AccountRecord and stores each pair whose
  # total score reaches the threshold, in both records' "<id>:similar" hashes.
  #
  # An earlier implementation used exact soundex/phone matches kept in Redis
  # sets; it was replaced by the weighted score computed here.
  #
  # percentage - how far below MAX_SCORE (in percent) a pair may score and
  #              still be recorded (default 20).
  def aggregate_similar_records(percentage = 20)
    close_to_me = nearby_records
    min_score = get_min_score(percentage)

    AccountRecord.all.each do |r|
      # A record is never stored as similar to itself.
      next if r.id == id

      # Individual signal scores.
      name_match_score = distance_score(r.name, name)
      name_soundex_match_score = distance_score(r.name_soundex, name_soundex)
      phone_match_score = distance_score(r.phone_compressed, phone_compressed)
      proximity_match_score = location_score(close_to_me, r)

      # Weighted total.
      total_score = (name_match_score * NAME_DISTANCE_WEIGHT) +
                    (name_soundex_match_score * NAME_SOUNDEX_DISTANCE_WEIGHT) +
                    (phone_match_score * PHONE_DISTANCE_WEIGHT) +
                    (proximity_match_score * PROXIMITY_WEIGHT)

      # Is the pair worth considering at all?
      next if total_score < min_score

      # If so, record why (every signal that contributed).
      reasons = []
      reasons.push REASON_CODES[:name][0] if name_match_score > 0
      reasons.push REASON_CODES[:name_soundex][0] if name_soundex_match_score > 0
      reasons.push REASON_CODES[:phone][0] if phone_match_score > 0
      reasons.push REASON_CODES[:location][0] if proximity_match_score > 0
      entry = "#{total_score}:#{reasons.join(',')}"

      # HSETNX writes only when the field is absent. This replaces a guard
      # that fetched the whole hash per candidate and looked the raw id up
      # among Redis's string keys; ids are stringified here so the
      # "already recorded" check compares like with like.
      STORE.hsetnx "#{id}:similar", r.id.to_s, entry
      STORE.hsetnx "#{r.id}:similar", id.to_s, entry
    end
  end

  # Proximity signal: all or nothing, no granularity.
  #
  # results - collection of records considered close (responds to include?).
  # record  - the candidate record being scored.
  #
  # Returns GLOBAL_SCORE_SCALE when record is in results, otherwise 0.
  def location_score(results, record)
    results.include?(record) ? GLOBAL_SCORE_SCALE : 0
  end

  # String-similarity signal based on edit distance.
  #
  # Returns 0 when either value is blank; otherwise a score that is
  # GLOBAL_SCORE_SCALE at distance 0 and falls linearly to 0 at
  # GLOBAL_DISTANCE_THRESHOLD (rounded up, never negative).
  #
  # NOTE(review): `distance` is not defined in this class; presumably it is
  # provided by the 'levenshtein_damerau' require - confirm.
  def distance_score(a, b)
    return 0 unless a.present? && b.present?

    remaining = (GLOBAL_DISTANCE_THRESHOLD - distance(a, b)).to_f
    score = (remaining / GLOBAL_DISTANCE_THRESHOLD.to_f * GLOBAL_SCORE_SCALE.to_f).ceil
    [score, 0].max
  end

  # Lowest total score that still counts as a match: MAX_SCORE reduced by
  # `percentage` percent (the reduction is rounded up).
  def get_min_score(percentage)
    MAX_SCORE - (MAX_SCORE * (percentage.to_f / 100)).ceil
  end

  # Ids (as stored in STORE, i.e. strings) of the records recorded as similar
  # to this one.
  def similar_records
    STORE.hkeys("#{id}:similar")
  end

  # Human-readable reasons why other_id was recorded as similar to this
  # record, in the order the codes were stored.
  #
  # Returns an Array of Strings; empty when no entry exists for other_id or
  # the stored entry carries no reason codes.
  def reasons_for_match(other_id)
    entry = STORE.hget("#{id}:similar", other_id.to_s)
    return [] if entry.nil?

    # Entry format is "<total_score>:<code,code,...>"; the code list may be
    # missing entirely, hence the to_s before splitting.
    codes = entry.split(":")[1].to_s.split(",")
    codes.map { |code| REASON_TEXT_BY_CODE[code.to_i] }.compact
  end

  private

  # Records (id only) whose location lies within a 0.05 radius of this
  # record's location, loaded once so the scoring loop does not re-evaluate
  # the criteria per candidate. Empty when this record has no location.
  def nearby_records
    return [] unless location.present?

    center = [location.first, location.last]
    self.class.only(:id).where(:location.within => { "$center" => [center, 0.05] }).to_a
  end

  # after_save: enqueue geocoding when the billing address changed.
  def queue_geocoding
    return if skip_callbacks
    Resque.enqueue(Job::Geocode, id) if billing_address_changed?
  end

  # after_save: enqueue the similarity pass for this record.
  def queue_similarity
    return if skip_callbacks
    Resque.enqueue(Job::Similarity, id)
  end

  # Derives name_soundex from name via Text::Metaphone, falling back to the
  # raw name when the metaphone result is blank (nil or empty).
  def generate_soundex
    return unless name.present?
    self.name_soundex = Text::Metaphone.metaphone(name).presence || name
  end

  # Derives phone_compressed by stripping every non-digit from phone.
  def compress_phone
    return unless phone.present?
    self.phone_compressed = phone.gsub(/[^0-9]/, '')
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment