Created
November 12, 2020 20:29
-
-
Save tlemburg/e0a983cc2bf0828f757b86e97a99b474 to your computer and use it in GitHub Desktop.
Grab country phone codes out of Wikipedia article
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Downloads the Wikipedia "List of country calling codes" article, reads the
# cells of its first `table.wikitable`, and prints a YAML mapping of two-letter
# country codes to calling-code strings (sorted by country code) on stdout.
require 'nokogiri'
require 'open-uri'
require 'yaml'

# URI.open (not Kernel#open) is required: since Ruby 3.0, open-uri no longer
# makes Kernel#open fetch URLs. The block form closes the handle when done.
doc = URI.open('https://en.wikipedia.org/wiki/List_of_country_calling_codes') do |io|
  Nokogiri::HTML.parse(io)
end

table = doc.css('table.wikitable').first
# Fail with a readable message instead of a NoMethodError on nil if the page
# layout changed and the table can no longer be found.
abort 'Could not find a table.wikitable element in the downloaded page' unless table

cells = table.css('td')

# put known overrides here. Last updated/analyzed 2020-11-12
data_hash = {
  'VA' => '379', # Vatican City is listed on wikipedia in both 379 and also in 39, which is Italy.
                 # stick with the more specific one
  'KZ' => '7',   # Kazakhstan is listed also with +997 on wikipedia but these are only for SIM
                 # cards ICCID
  'XV' => nil    # refers to international networks that are outside of country boundaries
}

cells.each do |cell|
  text = cell.text.strip

  if text.start_with?('+1: North American Numbering Plan countries and territories')
    # Every pair of consecutive capital letters in this cell is taken as a
    # country code, and each of them is mapped to '1'.
    text.scan(/[A-Z]{2}/).each do |alpha2|
      # key? rather than ||= so that an explicit nil override (e.g. 'XV') is kept.
      data_hash[alpha2] = '1' unless data_hash.key?(alpha2)
    end
  elsif text.start_with?(/\+[2-9]/)
    # Skip cells that contain no two-capital-letter sequence at all.
    next unless text.match?(/[A-Z]{2}/)

    # Cell text is expected to look like "+<code>: AA, BB"; the code is the part
    # before the first ':' without its leading '+', and the country codes are
    # the comma-separated entries after the last ':'.
    parts = text.split(':')
    code = parts.first.delete_prefix('+')
    parts.last.strip.split(', ').each do |alpha2|
      # First assignment wins; overrides and earlier cells are never replaced.
      data_hash[alpha2] = code unless data_hash.key?(alpha2)
    end
  end
end

puts data_hash.sort.to_h.to_yaml
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment