Created
November 13, 2020 12:28
-
-
Save calvinclaus/affe120fa1f334e36377c814c6c56854 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "json"
require "ostruct"

require "httparty"
# Helpers for cleaning up, splitting, comparing and gender-classifying
# human names. All methods are class-level; the class holds no state.
class HumanNameUtils
  # Options handed to APICache.get in .gender.
  # 60 * 60 * 24 * 60 evaluates to 5_184_000 (60 days if the unit is
  # seconds - TODO confirm against the APICache documentation).
  CACHE_SETTINGS = {
    cache: 60 * 60 * 24 * 60,
    valid: 60 * 60 * 24 * 60,
    period: 0.1,
    timeout: 60,
    fail: nil,
  }.freeze

  # Tokens that are stripped from names by .cleanup_name (matched
  # case-insensitively). Duplicates are intentional: see TITLE_PATTERNS.
  TITLES = ["★", "ツ", "☛", "✪", "PHD", "FESC", "LLM", "Dr.-medic", "Lic.", "medic.stom.", "medic", "stom.", "et dent.", "F.E.B.U", "FEBU", "Dent.", "Medizinalrat", "Febs", "FEBVS", "F.E.B.V.S", "MBBCH", "m.b.b.ch", "f.e.b.s.", "Prim.", "Univ", "Priv", "FECSM", "DI Dr.", "Dr.-Ing", "Dipl.-Kfm.", "Prof.Dr.h.c.", "Prof.h.c.", "Lssbb", "Assoc", "CIPD", "LL.M", "CEng", "FRAeS", "Dr.med.", "Dr.h.c", "Prof.", "Dr. Dr.", "hp H-P", "OA", "MR", "HR", "OÄ", "OA Dr.", "DDr.", "Ddr", "DDDr", "Dr.", "Dr", "Doz.", "Dr.med.dent", "dr-phil", "phil", "med.", "Med.", "Med", "med", "Doc", "MD", "PD", "Phd", "phd", "PHD", "PhD", "Ph.D.", "MLE", "M.D.", "EDiR", "DNB", "FRCR", "MMag.", "mag.", "Mag.", "Mag.a", "Univ.", "univ.", "Dipl", "Ing", "Ing DI", "DI", "FH", "EMBA", "MMBA", "MBA", "ESQ", "mpa", "mba", "Mba", "BSc", "BSC", "bsc", "MSc", "MSC", "msc", "CFA", "Bc.", "D.", "M.Sc.", "MPH", "Exec.", "exec.", ".h.c.", "Dr", "dr", "Med", "med", "MD", "PD", "Prof", "MA", "MAS", "Mas", "MAs", "PMP", "B.A.", "BA", "Tcm", "M.A.", "LLB", "Mcipd", "CMgr", "MCMI", "Ilm", "-6st", "AssocCIPD", "CIWM", "BFP", "FCA", "ACEL", "MCIPP", "FdSc", "ACMI", "LCGI", "CPA", "CMA", "PG", "Dip", "HRM", "SFHEA", "PMP", "Cfcipd", "FPC", "Fcipd", "SFHEA", "CA", "ACIPD", "fCMgr", "MCIPS", "Chartered", "CertRP", "Prince2", "SHRM", "FRICS", "Komm.-Rat", "Vkfm", "MSs", "Akad", "FDL", "GPM", "IM", "Business Manager", "akadfdl", "BM", "VW"].freeze

  # Regex fragment matching a token boundary: line start/end, a space, or
  # one of . | , ! : - _
  WORD_ENDS = "($|^| |\\.|\\||\\,|\\!|\\:|\\-|\\_)".freeze

  # Characters that are replaced by a space before title removal.
  PUNCTUATION_PATTERN = %r{[®!?:/,_|(¯`*•´)]}.freeze

  # A dash that stands on its own between two token boundaries.
  LONE_DASH_PATTERN = Regexp.new(WORD_ENDS + "-" + WORD_ENDS, Regexp::IGNORECASE).freeze

  # One regex per title, longest title first, built once at load time.
  # Duplicates are deliberately NOT removed: each match consumes its
  # surrounding boundary characters, so a repeated title ("Dr Dr Smith")
  # is only fully stripped by a second pass with the same pattern.
  TITLE_PATTERNS = TITLES.map(&:downcase).sort_by(&:size).reverse.map do |title|
    Regexp.new(WORD_ENDS + Regexp.escape(title) + WORD_ENDS, Regexp::IGNORECASE)
  end.freeze

  # Surname particles (de, de la, von der, van 't, v.d., ...) preceded by a
  # space or hyphen, plus the word that follows them.
  PARTICLE_PATTERN = %r{(-| )(de ?(la|l')?|von ?(der|den|de)?|van( |-)?(der|den|de|het|('|`|´|’|‘)? ?t)?|v\. ?d\.|v/d) ?\w*}i.freeze

  # Strips emoji, parenthesised parts, punctuation, lone dashes, known
  # titles and dots from a name and collapses whitespace.
  #
  # @param name [String] raw name
  # @return [String] cleaned name (may be empty)
  def self.cleanup_name(name)
    name = RemoveEmoji::Sanitize.call(name)
    name = remove_in_parens_unless_everything_in_parens(name)
    name = name.gsub(PUNCTUATION_PATTERN, " ")
    name = collapse_spaces(name.gsub(LONE_DASH_PATTERN, " "))
    TITLE_PATTERNS.each do |pattern|
      name = collapse_spaces(name.gsub(pattern, " "))
    end
    collapse_spaces(name.tr(".", " "))
  end

  # Removes everything from the first "(" to the last ")" (greedy match).
  # If nothing would remain, the original string is returned unchanged.
  #
  # @param orig_name [String]
  # @return [String]
  def self.remove_in_parens_unless_everything_in_parens(orig_name)
    name = collapse_spaces(orig_name.gsub(/\(.*\)/, " "))
    return orig_name if name.blank?

    name
  end

  # Capitalizes the last space-separated word and re-joins the words with
  # single spaces. Returns "" for an empty or whitespace-only input instead
  # of raising.
  #
  # @param str [String]
  # @return [String]
  def self.capitalize_last_word(str)
    words = str.to_s.split(" ")
    return "" if words.empty?

    words[-1] = words.last.capitalize
    words.join(" ")
  end

  # Cleans a full name and splits it into a first and a last name.
  #
  # The first name is the first hyphen part of the first word; the last
  # name is the last hyphen part of the last word, unless a surname
  # particle is found in the *original* name, in which case the particle
  # plus the following word become the last name ("van Gogh").
  # A one-letter last name gets a trailing dot ("B.").
  #
  # @param name [String] raw name
  # @return [OpenStruct] with #first and #last (both "" when the cleaned
  #   name is empty)
  def self.clean_and_split_name(name)
    original_name = name
    name = cleanup_name(name)
    words = name.split(" ")
    # to_s guards against an empty cleaned name and tokens made of hyphens only.
    first = words.first.to_s.split("-").first.to_s.capitalize
    last = words.last.to_s.split("-").last.to_s.capitalize

    match = original_name.match(PARTICLE_PATTERN)
    if match
      particle = match[0].sub(/^-/, "").sub(/('|`|´|’|‘)/, "’")
      last = if particle.split(" ").size > 1
        capitalize_last_word(particle.strip.downcase)
      else
        # NOTE(review): PARTICLE_PATTERN has no boundary after the particle,
        # so an ordinary surname such as "Dean" or "Vandenberg" also lands
        # here and is removed from the name ("Peter Dean" => last "Peter").
        # Behavior kept as-is; confirm the intended result before changing.
        remaining = collapse_spaces(name.gsub(particle.strip, " "))
        remaining.split(" ").last.to_s.split("-").last.to_s.capitalize
      end
    end

    res = OpenStruct.new(first: first, last: last)
    res.last = res.last + "." if res.last.length == 1
    res
  end

  # Looks up the gender for a first name via gender-api.com (responses are
  # fetched through APICache with CACHE_SETTINGS).
  #
  # @param first_name [String, nil]
  # @param country [String, nil] sent as the country parameter; "ALL"
  #   omits it, a blank value falls back to "DE"
  # @param min_accuracy [Integer] minimum accepted "accuracy"
  # @param min_samples [Integer] below this "samples" count the stricter
  #   min_accuracy_under_min_samples applies
  # @param min_accuracy_under_min_samples [Integer]
  # @return [Object] the "gender" field of the API response
  # @raise [GenderNotFound] when no response was obtained or the result is
  #   not confident enough
  def self.gender(first_name: nil, country: "DE", min_accuracy: 70, min_samples: 50, min_accuracy_under_min_samples: 90)
    params = {name: first_name, key: ENV['GENDER_API_KEY']}
    params[:country] = country unless country == "ALL"
    params[:country] = "DE" if country.blank?

    url = "https://gender-api.com/get"
    gender_data = retry_if_nil do
      APICache.get(url + params.to_json, CACHE_SETTINGS) do
        http_response = HTTParty.get(url, query: params)
        # Presumably turned into a nil result by APICache because of
        # `fail: nil` in CACHE_SETTINGS - TODO confirm.
        raise "Gender API Request Failed" if http_response.code != 200

        JSON.parse(http_response.body)
      end
    end
    raise GenderNotFound, gender_data if gender_data.nil?

    # A payload without these fields is treated as zero confidence rather
    # than crashing on `nil < Integer`.
    accuracy = gender_data["accuracy"] || 0
    samples = gender_data["samples"] || 0
    too_uncertain = accuracy < min_accuracy ||
                    (samples < min_samples && accuracy < min_accuracy_under_min_samples)
    raise GenderNotFound, gender_data if too_uncertain

    gender_data["gender"]
  end

  # Cleans a name, downcases it, expands German umlauts/sharp s and
  # transliterates the rest via I18n.
  #
  # @param name [String]
  # @return [String]
  def self.standardize_name_for_comparison(name)
    name = cleanup_name(name).downcase
    name = name.gsub(/ü/, "ue")
    name = name.gsub(/ö/, "oe")
    name = name.gsub(/ä/, "ae")
    name = name.gsub(/ß/, "ss")
    I18n.transliterate(name)
  end

  # Splits a standardized name into its words (separated by space or
  # hyphen). The separators themselves and empty fragments are not part of
  # the result, so "anna-maria" and "anna maria" yield the same tokens.
  #
  # @param name [String]
  # @return [Array<String>]
  def self.prepare_name_for_fuzzy_equal(name)
    standardize_name_for_comparison(name).split(/[ -]/).reject(&:empty?)
  end

  # Whether two names plausibly denote the same person: every multi-letter
  # word of the shorter name occurs in the longer one, and the same holds
  # for the initials of all words.
  #
  # @param n1 [String, Array<String>]
  # @param n2 [String, Array<String>]
  # @param prepared [Boolean] true when both arguments are already token
  #   arrays from .prepare_name_for_fuzzy_equal
  # @return [Boolean]
  def self.fuzzy_equal?(n1, n2, prepared: false)
    return false if n1.blank? || n2.blank?

    unless prepared
      n1 = prepare_name_for_fuzzy_equal(n1)
      n2 = prepare_name_for_fuzzy_equal(n2)
    end
    full_words_match = longer_includes_all_shorter?(
      n1.select { |w| w.size > 1 },
      n2.select { |w| w.size > 1 },
    )
    return false unless full_words_match

    longer_includes_all_shorter?(n1.map { |w| w.chars.first }, n2.map { |w| w.chars.first })
  end

  # True when every element of the shorter array is contained in the longer
  # one. With equal sizes, a2 is treated as the shorter array.
  #
  # @param a1 [Array]
  # @param a2 [Array]
  # @return [Boolean]
  def self.longer_includes_all_shorter?(a1, a2)
    shorter, longer = a1.size < a2.size ? [a1, a2] : [a2, a1]
    shorter.all? { |word| longer.include?(word) }
  end

  # Calls the block until it returns a non-nil value, at most max_tries
  # times in total, sleeping between attempts.
  #
  # @param max_tries [Integer] total number of attempts (at least one is
  #   always made)
  # @param delay [Numeric] seconds to sleep between attempts
  # @return [Object, nil] the first non-nil block result, or nil
  def self.retry_if_nil(max_tries: 3, delay: 1)
    tries = 0
    res = nil
    loop do
      res = yield
      tries += 1
      break if !res.nil? || tries >= max_tries

      pp "Sleeping in rety_if_nil"
      sleep(delay)
    end
    res
  end

  # Trims the string and collapses runs of spaces into one.
  def self.collapse_spaces(str)
    str.strip.gsub(/ +/, " ")
  end
  private_class_method :collapse_spaces
end
# Raised when no gender could be determined for a name. The data that led
# to the failure (possibly nil) is available via #gender_data.
class GenderNotFound < StandardError
  attr_accessor :gender_data

  # @param gender_data [Object, nil] payload attached to the error; optional
  #   so that a bare `raise GenderNotFound` works as well
  def initialize(gender_data = nil)
    # Pass a real message to StandardError; without it #message is only the
    # class name.
    super("Gender could not be determined")
    self.gender_data = gender_data
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment