Skip to content

Instantly share code, notes, and snippets.

@thinkcreate
Created March 26, 2009 13:51
Show Gist options
  • Save thinkcreate/86104 to your computer and use it in GitHub Desktop.
Save thinkcreate/86104 to your computer and use it in GitHub Desktop.
require File.dirname(__FILE__) + '/models'
# Drives a Selenium-controlled browser against the WoningNet site: signs in,
# opens the stored search result and visits every home detail page it lists.
class ActiveWoningnet
  # Creates the Selenium client and starts a new browser session.
  # NOTE(review): the positional arguments look like server host, port,
  # browser launcher, base URL and timeout -- confirm against selenium-client.
  def initialize
    @address = "<address known>"
    @b = Selenium::Client::Driver.new("localhost", 4444, "*firefox", @address, 10000)
    @b.start_new_browser_session
  end

  # Opens the start page. When the page shows the text "Naar Mijn WoningNet"
  # that link is followed; otherwise the login form is filled in and submitted.
  #
  # @return [ActiveWoningnet] self
  def login
    @b.open '/'

    if @b.text?("Naar Mijn WoningNet")
      @b.click "link=Naar Mijn WoningNet", :wait_for => :page
      return self
    end

    @b.type "txtRegNr", "<your reg number>"
    @b.type "txtPostcode", "<your pc>"
    # The password is asked for only after the first two fields are typed.
    password = get_password
    @b.type "txtPassword", password
    @b.click 'Submit', :wait_for => :page
    @b.text?("<your name>") # check if we got in (the result is not used)
    self
  end

  # Logs in, opens the search result and yields the HTML source of each home
  # detail page to the given block. The browser session is always closed
  # afterwards, also on an early return or an exception.
  #
  # @param opts [Hash] :start_page (default 1) is the first result page to
  #   scrape, :max_pages (default 1) is the number of result pages to visit
  # @yield [String] HTML source of one detail page
  def paginate_query(opts={})
    opts = {:start_page => 1, :max_pages => 1}.merge(opts)
    puts 'paginate_query: Using opts:' + opts.inspect
    login
    @b.click "link=Toon zoekresultaat", :wait_for => :page

    # A result page has two sections separated by a banner (4 results, then
    # another 6). Build six candidate locators per section so every slot is
    # covered; slots that do not exist on the page are skipped further down.
    section_templates = [
      "//div[4]/span[%d]/a/div/p/span[1]",
      "//div[6]/span[%d]/a/div/p/span[1]"
    ]
    home_detail_links = []
    section_templates.each do |template|
      1.upto(6) { |position| home_detail_links << (template % position) }
    end

    address_heading = "//div[@id='details']/h3/b"
    next_page = "//div[@id='volgende']/a"

    pages_to_skip = opts[:start_page] - 1
    pages_to_skip.times do |skipped|
      puts "Skipping page #{skipped + 1}"
      # Running out of pages while skipping means there is nothing to scrape.
      return unless @b.element?(next_page)
      @b.click next_page, :wait_for => :page
    end

    1.upto(opts[:max_pages]) do |page_number|
      puts "paginate_query: beginning of page #{page_number}"

      home_detail_links.each do |detail_link|
        # A missing slot is skipped, not a stop signal: the other section
        # may still have entries.
        if @b.element?(detail_link)
          @b.click detail_link, :wait_for => :page
          puts "Op pagina: #{@b.text_content(address_heading)}"
          yield @b.get_html_source if block_given?
          @b.go_back :wait_for => :page
        end
      end

      # Move on to the next result page, or stop when there is none.
      break unless @b.element?(next_page)
      @b.click next_page, :wait_for => :page
    end
  ensure
    close_session
  end

  # Closes the current browser session.
  def close_session
    @b.close_current_browser_session
  end

  protected

  # Prompts for the site password on the terminal with echo switched off.
  # Echo is restored and a newline is printed afterwards, whatever happens.
  #
  # @return [String] the entered line without its trailing newline
  def get_password
    print 'Password for site: '
    `stty -echo`
    STDIN.gets.chomp
  ensure
    `stty echo`
    puts
  end
end
module WoningNet
%w(hpricot dm-core dm-timestamps dm-validations).each{|gem| require gem}
# DataMapper model for a draw: a dated record that owns a collection of homes.
# NOTE(review): what a "draw" stands for in the domain is not visible here.
class Draw
include DataMapper::Resource
# Auto-incrementing primary key.
property :id, Serial
property :date, Date
# Homes attached to this draw, ordered by ascending created_at (oldest first).
has n, :homes, :order => [:created_at.asc]
end
# DataMapper model for one rental home scraped from a WoningNet detail page.
# Homes are unique per address (adres) and keep a history of response counts.
class Home
  include DataMapper::Resource
  # NOTE(review): Comparable is mixed in but no <=> is defined in this class
  # as shown here -- confirm it is provided elsewhere.
  include Comparable

  property :id, Serial
  property :uid, Text
  property :adres, Text
  property :postcode, Text
  property :plaats, Text
  property :stadsdeel, Text
  property :brutohuur_in_cents, Integer
  property :aantal_kamers, Integer
  property :oppervlakte, Integer
  property :created_at, DateTime
  property :updated_at, DateTime
  property :long, Float
  property :lat, Float
  property :ok, Boolean, :default=>true
  property :draw_id, Integer

  # Geocode every newly created home and persist the coordinates.
  after :create do
    self.set_coordinates
    self.save
  end

  has n, :response_counts, :order => [:created_at.asc] # .last really meaning 'last added'
  belongs_to :draw
  validates_is_unique :adres

  # Builds an unsaved Home from the HTML source of a detail page.
  #
  # @param html [String] HTML source of one home detail page
  # @return [Home] new, unsaved record carrying one ResponseCount with the
  #   number of responses shown on the page
  def self.build_from_source(html)
    doc = Hpricot(html)

    # Query string of the print link. Take the first capture as a plain String
    # (nil when the link is absent) instead of storing scan's nested Array,
    # and match the dot in ".asp" literally.
    uid = html[/printwoningdetails\.asp\?(.*)&',/, 1]

    adres = (doc/"div#details h3 b").inner_html
    postcode, plaats = (doc/"div#details p:eq(0) b").inner_html.scan(/(\d{4} [A-Z]{2}) (.+)$/).flatten

    # Strip an optional literal "Wijk/Stadsdeel " prefix. This must be a group:
    # a character class would swallow the first letter of names such as "West".
    stadsdeel = (doc/"div#details p:eq(1) b").inner_html.scan(/ +(?:Wijk\/Stadsdeel )?(.+)$/).flatten.first

    # Each value sits in the div that directly follows the div with its label.
    value_exp = "div[text()='%s'] + div"
    aantal_reacties = (doc/(value_exp % 'Reacties tot nu toe')).inner_html.to_i
    # Last word of the cell with the decimal comma removed gives cents.
    # to_s keeps a missing cell at 0, like the other numeric fields.
    brutohuur_in_cents = (doc/(value_exp % 'Bruto huur')).inner_html.split.last.to_s.gsub(',', '').to_i
    aantal_kamers = (doc/(value_exp % 'Totaal aantal kamers')).inner_html.to_i
    oppervlakte = (doc/(value_exp % 'Totale oppervlakte')).inner_html.split.first.to_i

    home = new(:uid => uid,
               :adres => adres, :postcode => postcode,
               :plaats => plaats, :stadsdeel => stadsdeel,
               :brutohuur_in_cents => brutohuur_in_cents,
               :aantal_kamers => aantal_kamers, :oppervlakte => oppervlakte)
    home.response_counts << ResponseCount.new(:count => aantal_reacties)
    home
  end

  # Scrapes the stored query of the account used and saves every home found.
  # When a home cannot be saved, its fresh response count is appended to the
  # stored home with the same address instead.
  #
  # @param opts [Hash] passed on to ActiveWoningnet#paginate_query
  def self.scrape!(opts={})
    ActiveWoningnet.new.paginate_query(opts) do |page|
      home = Home.build_from_source(page)
      home.draw = Draw.all.last
      unless home.save
        # already exists - update responses
        known = Home.first({:adres=>home.adres})
        known.response_counts << home.response_counts.last
        known.save
      end
    end
  end

  # Looks up the coordinates of this home and assigns them to long and lat.
  # The full address is tried first, then the address without its last
  # dash-separated part; both attributes become nil when neither lookup works.
  #
  # @return [Array] the pair [long, lat]
  def set_coordinates
    full_address = "#{self.adres}, #{self.plaats}"
    # address with additions like 'HS' and 'RE' removed; join the remaining
    # parts explicitly rather than interpolating an Array
    street = self.adres.split('-')[0..-2].join('-')
    simple_address = "#{street}, #{self.plaats}"

    # The fallback needs its own begin/rescue: a sibling rescue clause never
    # handles an exception raised inside another rescue body.
    long, lat = begin
      Google::Geo.new.locate(full_address).coordinates
    rescue
      begin
        Google::Geo.new.locate(simple_address).coordinates
      rescue
        [nil, nil]
      end
    end

    attribute_set(:long, long)
    attribute_set(:lat, lat)
    [long, lat]
  end
end
# DataMapper model holding one recorded response count for a home.
# Home.build_from_source creates one per scraped page, so a home accumulates
# a series of these over time.
class ResponseCount
include DataMapper::Resource
# Auto-incrementing primary key.
property :id, Serial
# Number of responses read from the detail page.
property :count, Integer
# Home.response_counts is ordered by created_at.
property :created_at, DateTime
property :updated_at, DateTime
# Key column for the association declared below.
property :home_id, Integer
belongs_to :home
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment