Created
March 26, 2009 13:51
-
-
Save thinkcreate/86104 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require File.dirname(__FILE__) + '/models' | |
# Drives a Selenium-controlled browser through the WoningNet site: logs in,
# opens the stored search query and walks the paginated result list, handing
# the HTML of every home-detail page to the caller.
class ActiveWoningnet
  # Creates the Selenium client for @address and starts a browser session.
  def initialize
    @address = "<address known>"
    @b = Selenium::Client::Driver.new("localhost", 4444, "*firefox", @address, 10000)
    @b.start_new_browser_session
  end

  # Opens the site root and logs in unless the page already offers the
  # "Naar Mijn WoningNet" link, in which case that link is followed instead.
  #
  # Returns self so calls can be chained.
  def login
    @b.open '/'
    if @b.text?("Naar Mijn WoningNet")
      @b.click "link=Naar Mijn WoningNet", :wait_for => :page
    else
      @b.type "txtRegNr", "<your reg number>"
      @b.type "txtPostcode", "<your pc>"
      @b.type "txtPassword", get_password
      @b.click 'Submit', :wait_for => :page
      # The result of this check used to be discarded; report a failed login
      # instead of silently carrying on.
      warn 'login: expected account name not found on page, login may have failed' unless @b.text?("<your name>")
    end
    self
  end

  # Logs in, opens the stored search result and visits every home-detail page,
  # yielding its HTML source to the block (if one is given).
  #
  # opts - Hash of options:
  #        :start_page - 1-based number of the first page to scrape (default 1);
  #                      earlier pages are skipped.
  #        :max_pages  - maximum number of pages to scrape (default 1).
  #
  # Returns nil early when the result list ends before :start_page is reached.
  # The browser session is always closed afterwards, even on errors.
  def paginate_query(opts={})
    opts = { :start_page => 1, :max_pages => 1 }.merge(opts)
    puts 'paginate_query: Using opts:'+opts.inspect
    login
    @b.click "link=Toon zoekresultaat", :wait_for => :page

    # The results come in 3 sections:
    #  - 4 results
    #  - banner
    #  - another 6 results
    # so generate enough xpath expressions to cover both result sections.
    section_templates = %w(
      //div[4]/span[%d]/a/div/p/span[1]
      //div[6]/span[%d]/a/div/p/span[1]
    )
    home_detail_links = section_templates.map { |xpath| (1..6).map { |ix| xpath % ix } }.flatten
    adres_label   = "//div[@id='details']/h3/b"
    nextpage_link = "//div[@id='volgende']/a"

    (opts[:start_page] - 1).times do |ix|
      puts "Skipping page #{ix+1}"
      # End of collection reached while skipping: nothing to scrape.
      return unless goto_next_page(nextpage_link)
    end

    1.upto(opts[:max_pages]) do |ix|
      puts "paginate_query: beginning of page #{ix}"
      home_detail_links.each do |link|
        # Don't break on a missing link: it may exist in the other section.
        next unless @b.element?(link)
        @b.click link, :wait_for => :page
        puts "Op pagina: #{@b.text_content(adres_label)}"
        yield @b.get_html_source if block_given?
        @b.go_back :wait_for => :page
      end
      break unless goto_next_page(nextpage_link)
    end
  ensure
    close_session
  end

  # Closes the current Selenium browser session.
  def close_session
    @b.close_current_browser_session
  end

  protected

  # Follows the "next page" link when it is present.
  #
  # Returns true when the link was clicked, false when there is no next page.
  def goto_next_page(locator)
    return false unless @b.element?(locator)
    @b.click locator, :wait_for => :page
    true
  end

  # Prompts for the site password on the terminal with echo disabled.
  #
  # Returns the entered line without its trailing newline, or "" when STDIN
  # is at EOF (gets returns nil there, which used to raise NoMethodError).
  def get_password
    print 'Password for site: '
    `stty -echo`
    begin
      STDIN.gets.to_s.chomp
    ensure
      # Always restore terminal echo and finish the prompt line.
      `stty echo`
      puts
    end
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module WoningNet | |
%w(hpricot dm-core dm-timestamps dm-validations).each{|gem| require gem} | |
# A draw on a given date; it groups the homes that belong to it.
class Draw
  include DataMapper::Resource

  # Columns
  property :id,   Serial
  property :date, Date

  # Homes of this draw, ordered by creation time (oldest first).
  has n, :homes, :order => [:created_at.asc]
end
# A rental home scraped from a WoningNet detail page, stored together with the
# response counts seen for it over time.
class Home
  include DataMapper::Resource
  include Comparable

  property :id,                 Serial
  property :uid,                Text
  property :adres,              Text
  property :postcode,           Text
  property :plaats,             Text
  property :stadsdeel,          Text
  property :brutohuur_in_cents, Integer
  property :aantal_kamers,      Integer
  property :oppervlakte,        Integer
  property :created_at,         DateTime
  property :updated_at,         DateTime
  property :long,               Float
  property :lat,                Float
  property :ok,                 Boolean, :default => true
  property :draw_id,            Integer

  # Geocode a freshly created record and persist the coordinates.
  after :create do
    set_coordinates
    save
  end

  has n, :response_counts, :order => [:created_at.asc] # .last really meaning 'last added'
  belongs_to :draw

  validates_is_unique :adres

  # Builds an unsaved Home (with one ResponseCount attached) from the HTML
  # source of a home-detail page.
  #
  # html - String with the page source.
  #
  # Returns the new Home instance.
  def self.build_from_source(html)
    doc = Hpricot(html)

    # Take the captured query string itself; String#scan would hand back a
    # nested array here. nil when the print link is absent.
    uid = html[/printwoningdetails\.asp\?(.*)&',/, 1]

    adres = (doc/"div#details h3 b").inner_html
    postcode, plaats = (doc/"div#details p:eq(0) b").inner_html.scan(/(\d{4} [A-Z]{2}) (.+)$/).flatten

    # The optional "Wijk/Stadsdeel " prefix must be a group, not a character
    # class: a class swallowed the first letter of names such as "Slotervaart".
    stadsdeel = (doc/"div#details p:eq(1) b").inner_html.scan(/ +(?:Wijk\/Stadsdeel )?(.+)$/).flatten.first

    # Each value sits in the div that directly follows the div holding its label.
    value_exp = "div[text()='%s'] + div"
    value_of  = lambda { |label| (doc/(value_exp % label)).inner_html }

    aantal_reacties = value_of.call('Reacties tot nu toe').to_i
    # Amounts look like "1.234,56": drop the thousands dot as well as the
    # decimal comma to get cents. to_s keeps a missing field from raising.
    brutohuur_in_cents = value_of.call('Bruto huur').split.last.to_s.delete('.,').to_i
    aantal_kamers = value_of.call('Totaal aantal kamers').to_i
    oppervlakte   = value_of.call('Totale oppervlakte').split.first.to_i

    home = new(:uid => uid,
               :adres => adres, :postcode => postcode,
               :plaats => plaats, :stadsdeel => stadsdeel,
               :brutohuur_in_cents => brutohuur_in_cents,
               :aantal_kamers => aantal_kamers, :oppervlakte => oppervlakte)
    home.response_counts << ResponseCount.new(:count => aantal_reacties)
    home
  end

  # Scrapes the stored query of the account used and saves every home found.
  # When a home cannot be saved, the record with the same address (if any)
  # gets the newly scraped response count appended instead.
  #
  # opts - Hash passed through to ActiveWoningnet#paginate_query.
  def self.scrape!(opts={})
    ActiveWoningnet.new.paginate_query(opts) do |page|
      home = Home.build_from_source(page)
      home.draw = Draw.all.last
      next if home.save

      # Presumably already exists - update responses.
      known = Home.first(:adres => home.adres)
      # The save may have failed for a reason other than a duplicate address.
      next unless known
      known.response_counts << home.response_counts.last
      known.save
    end
  end

  # Looks up coordinates for this home and sets the long/lat attributes.
  # Tries the full address first, then the address with its last dash-separated
  # addition (like 'HS' or 'RE') removed; falls back to nils when both fail.
  #
  # Returns [long, lat].
  def set_coordinates
    full_address = "#{adres}, #{plaats}"
    # Strip only the trailing "-XX" addition; an address without a dash is
    # kept unchanged instead of collapsing to an empty string.
    simple_address = "#{adres.to_s.sub(/-[^-]*\z/, '')}, #{plaats}"

    long, lat = begin
      Google::Geo.new.locate(full_address).coordinates
    rescue
      # Nested on purpose: a second sibling `rescue` clause would never catch
      # an error raised inside the first rescue body.
      begin
        Google::Geo.new.locate(simple_address).coordinates
      rescue
        [nil, nil]
      end
    end

    attribute_set(:long, long)
    attribute_set(:lat, lat)
    [long, lat]
  end
end
# One recorded response count for a home; a home collects several of these
# over time.
class ResponseCount
  include DataMapper::Resource

  # Columns
  property :id,         Serial
  property :count,      Integer
  property :created_at, DateTime
  property :updated_at, DateTime
  property :home_id,    Integer

  # The home this count was recorded for.
  belongs_to :home
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment