Last active
December 10, 2015 19:28
-
-
Save tommetge/4481169 to your computer and use it in GitHub Desktop.
Encyclopedia WOT scraper for figuring out how many points of view (POV) abound in the series
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
require 'rubygems'
require 'pp' # Kernel#pp is only autoloaded from Ruby 2.5 onwards
require 'mechanize'
# Utility methods
# Adds every count in +second_pov+ to the matching entry of +first_pov+.
#
# +first_pov+ is modified in place and returned. Counts are coerced with
# +to_i+, so a key that is missing (or nil) on either side counts as 0.
#
# first_pov  - Hash of POV label => count; the accumulator that is mutated.
# second_pov - Hash of POV label => count to add in.
#
# Returns the same Hash object that was passed as +first_pov+.
def merge_povs(first_pov, second_pov)
  second_pov.each do |label, count|
    first_pov[label] = first_pov[label].to_i + count.to_i
  end
  first_pov
end
# Scraper methods
# Fetches one chapter page and counts the "<name> POV" markers in it.
#
# The text of every <p> element is flattened into one string and scanned
# line by line for runs that start with an alphanumeric character and end
# in " POV". Digits glued to the end of a name are removed before counting,
# so "Moiraine1 POV" is tallied as "Moiraine POV" (presumably these digits
# are footnote markers fused to the name by the text flattening).
#
# agent       - object responding to +get(url)+ (a Mechanize agent).
# chapter_url - String URL of the chapter page.
#
# Returns a Hash of POV label => number of occurrences in the chapter.
def get_chapter_povs(agent, chapter_url)
  puts " Fetching #{chapter_url}..."
  chapter = agent.get(chapter_url)
  labels = chapter.search("p").text.scan(/[a-zA-Z0-9].* POV/)
  labels.each_with_object({}) do |raw_label, counts|
    # Only digits directly following a non-digit and directly preceding the
    # final " POV" are stripped; the rest of the label is left untouched.
    label = raw_label.sub(/(?<=\D)\d+(?= POV\z)/, "")
    counts[label] = counts.fetch(label, 0) + 1
  end
end
# Collects POV counts for every chapter linked from one book page.
#
# Chapter links are taken from ordered lists ("ol li a"), followed by
# prologues and other extras from unordered lists ("ul li a"). Anchors that
# carry no href attribute are skipped. A chapter that fails to download or
# parse is reported on stderr and contributes nothing, so one bad page does
# not abort the whole book.
#
# agent         - object responding to +get(url)+ (a Mechanize agent).
# starting_page - String base URL of the site.
# book_url      - String path of the book page, relative to +starting_page+.
#
# Returns a Hash of POV label => count summed over the book's chapters.
def get_book_pov(agent, starting_page, book_url)
  puts "Fetching #{book_url}..."
  book = agent.get("#{starting_page}/#{book_url}")

  # Scrape chapter URLs, then add prologues and friends
  hrefs = ["ol li a", "ul li a"].flat_map do |selector|
    book.search(selector).map do |link|
      href = link.attributes["href"]
      href && href.value
    end
  end
  chapters = hrefs.compact

  book_povs = {}
  chapters.each do |chapter_url|
    povs = begin
      get_chapter_povs(agent, chapter_url)
    rescue StandardError => e
      # Best effort: keep going, but say which chapter was dropped and why.
      warn " Skipping #{chapter_url}: #{e.class}: #{e.message}"
      {}
    end
    merge_povs(book_povs, povs)
  end
  book_povs
end
# The real business: scraping books for POVs
# Scrapes every book linked from the index page and prints POV counts.
#
# Book links are the anchors found under "ol li a" on the index page;
# anchors without an href attribute are skipped. The per-book counts are
# pretty-printed as each book finishes, followed by the grand total.
#
# starting_page - String base URL of the site (defaults to the
#                 Encyclopaedia WoT address).
# agent         - object responding to +get(url)+; defaults to a new
#                 Mechanize agent, built only when none is supplied.
#
# Returns the Hash of POV label => count summed over all books.
def main(starting_page = "http://encyclopaedia-wot.org", agent = Mechanize.new)
  page = agent.get(starting_page)
  hrefs = page.search('ol li a').map do |link|
    href = link.attributes["href"]
    href && href.value
  end
  books = hrefs.compact

  final_povs = {}
  books.each do |book|
    povs = get_book_pov(agent, starting_page, book)
    pp povs
    merge_povs(final_povs, povs)
  end
  pp final_povs
  final_povs
end
# Run the scraper only when this file is executed directly, so that it can
# also be loaded from another script without starting a scrape.
main if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sorted, for easier scanning:
{"Abaldar Yulan POV"=>1,
"Adelorna Bastine POV"=>1,
"Alliandre POV"=>1,
"Almen Bunt POV"=>1,
"Alteima POV"=>1,
"Alviarin Freidhen POV"=>4,
"Alviarin POV"=>1,
"Androl POV"=>4,
"Aran'gar POV"=>2,
"Arymilla POV"=>1,
"Asmodean POV"=>1,
"Asne Zeramene POV"=>1,
"Assid Bakuun POV"=>1,
"Aviendha POV"=>13,
"Bain POV"=>1,
"Barmellin POV"=>1,
"Barriga POV"=>1,
"Bayle Domon POV"=>4,
"Bayrd POV"=>1,
"Beonin POV"=>1,
"Bertome Saighan POV"=>1,
"Bethamin Zeami POV"=>1,
"Birgitte POV"=>2,
"Cadsuane Melaidhrin POV"=>1,
"Cadsuane POV"=>15,
"Chulein POV"=>1,
"Cyndane POV"=>3,
"Dain Bornhald POV"=>2,
"Daved Hanlon POV"=>2,
"Davram Bashere POV"=>2,
"Delana Mosalaine POV"=>2,
"Demandred POV"=>5,
"Demira Eriff POV"=>2,
"Dyelin Taravin POV"=>1,
"Eamon Valda POV"=>3,
"Eben Hopwil POV"=>1,
"Egeanin Sarna POV"=>4,
"Egeanin Tamarath POV"=>1,
"Egwene POV"=>110,
"Egwene al'Vere POV"=>1,
"Elaida POV"=>3,
"Elaida a'Roihan POV"=>4,
"Elayne POV"=>66,
"Elenia Sarand POV"=>1,
"Ellorien Traemane POV"=>1,
"Elza Penfell POV"=>3,
"Ethenielle POV"=>1,
"Faile POV"=>23,
"Falendre POV"=>1,
"Falion Bhoda POV"=>2,
"Fortuona POV"=>1,
"Furyk Karede POV"=>4,
"Gabrelle POV"=>1,
"Galad Damodred POV"=>1,
"Galad POV"=>15,
"Galina Casban POV"=>5,
"Galina POV"=>2,
"Gareth Bryne POV"=>4,
"Gawyn POV"=>16,
"Geofram Bornhald POV"=>5,
"Gholam POV"=>1,
"Graendal POV"=>9,
"Hadnan Kadere POV"=>2,
"Harine POV"=>1,
"High Lady Suroth Sabelle Meldarath POV"=>1,
"Isam POV"=>1,
"Isam/Luc POV"=>1,
"Ituralde POV"=>5,
"Jaichim Carridin POV"=>4,
"Jaret Byar POV"=>1,
"Jesse Bilal POV"=>1,
"Joline Maza POV"=>1,
"Katerine Alruddin POV"=>2,
"Kennar Miraj POV"=>2,
"Lan POV"=>4,
"Leane Sharif POV"=>1,
"Leilwin POV"=>1,
"Lelaine Akashi POV"=>1,
"Liandrin POV"=>4,
"Loial POV"=>1,
"Luan Norwelyn POV"=>1,
"Maeric POV"=>1,
"Malenarin Rai POV"=>1,
"Masema Dagar POV"=>1,
"Mat POV"=>90,
"Merana Ambrey POV"=>3,
"Mesaana POV"=>2,
"Mili Skane POV"=>1,
"Min POV"=>29,
"Moghedien POV"=>7,
"Moiraine POV"=>8,
"Moiraine1 POV"=>1,
"Morgase POV"=>11,
"Morgase Trakand POV"=>1,
"Moridin POV"=>2,
"Myrelle POV"=>1,
"Nesune Bihara POV"=>2,
"Noal Charin POV"=>1,
"Nynaeve POV"=>57,
"Olver POV"=>1,
"Omni POV"=>7,
"Osan'gar POV"=>3,
"Padan Fain POV"=>10,
"Pedron Niall POV"=>5,
"Perrin POV"=>132,
"Pevara POV"=>5,
"Raefar Kisman POV"=>1,
"Rahvin POV"=>1,
"Rand POV"=>202,
"Reanne Corly POV"=>2,
"Renald Fanwar POV"=>1,
"Rhadam Asunawa POV"=>1,
"Rodel Ituralde POV"=>5,
"Romanda Cassin POV"=>2,
"Romanda POV"=>1,
"Saerin Asnobar POV"=>2,
"Sahra Covenry POV"=>1,
"Samitsu POV"=>1,
"Sammael POV"=>2,
"Sarene Nemdahl POV"=>1,
"Seaine Herimon POV"=>3,
"Seanchan POV"=>1,
"Seeker POV"=>1,
"Semirhage POV"=>2,
"Sevanna POV"=>6,
"Shaidar Haran POV"=>1,
"Shalon POV"=>2,
"Sheriam Bayanar POV"=>2,
"Sheriam POV"=>1,
"Siuan POV"=>4,
"Siuan Sanche POV"=>10,
"Sorilea POV"=>1,
"Sulin POV"=>1,
"Suroth POV"=>2,
"Talmanes POV"=>5,
"Tarna Feir POV"=>1,
"The Watcher4 POV"=>1,
"Thom Merrilin POV"=>4,
"Timna POV"=>1,
"Toveine Gazal POV"=>2,
"Tuon POV"=>6,
"Tylee Khirgan POV"=>1,
"Varek POV"=>1,
"Verin Mathwin POV"=>3,
"Verin POV"=>2,
"Vilnar Barada POV"=>1,
"Weilin Aldragoran POV"=>1,
"Yukiri POV"=>1}