Created
February 21, 2015 13:09
Revisions
-
joshcrews created this gist
Feb 21, 2015. There are no files selected for viewing.
# This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters. Learn more
# about bidirectional Unicode characters
# Original file line number Diff line number Diff line change @@ -0,0 +1,79 @@

# Scrapes the ECFA member directory: for each letter A-Z it loads the member
# search listing, follows every member's detail link, collects a fixed set of
# fields, and finally writes everything to output.csv.

require 'rubygems'
require 'mechanize'
require 'csv'

BASE_URL = 'http://www.ecfa.org/'.freeze

# Detail-page labels holding amounts; thousands separators (commas) are removed.
AMOUNT_SELECTORS = {
  cash_donations: '#BaseContent_Content_lblCashDonations',
  noncash_donations: '#BaseContent_Content_lblNonCashDonations',
  other_revenue: '#BaseContent_Content_lblOtherRevenue',
  net_assets: '#BaseContent_Content_lblNetAssets'
}.freeze

# Detail-page labels copied verbatim (whitespace-stripped).
TEXT_SELECTORS = {
  contact_phone: '#BaseContent_Content_lblContactInfoPhone',
  website: '#BaseContent_Content_lblContactInfoWebsite',
  executive_director: '#BaseContent_Content_lblContact'
}.freeze

# Detail-page tables whose non-empty cells are joined with ", ".
LIST_SELECTORS = {
  ministry_types: '#BaseContent_Content_lstMinistryTypes td',
  ministry_sectors: '#BaseContent_Content_lstMinistrySectors td'
}.freeze

# Column order of the generated CSV (also used for the header line).
ROW_HEADERS = [
  :name,
  :city,
  :state,
  :detail_url,
  :cash_donations,
  :noncash_donations,
  :other_revenue,
  :net_assets,
  :contact_phone,
  :website,
  :executive_director,
  :ministry_types,
  :ministry_sectors
].freeze

# Whitespace-stripped text of everything on +page+ matching +selector+.
def stripped_text(page, selector)
  page.search(selector).text.strip
end

# Non-empty, stripped texts of the nodes matching +selector+, joined by ", ".
def joined_cells(page, selector)
  page.search(selector).map { |cell| cell.text.strip }.reject(&:empty?).join(', ')
end

# Builds the record for one listing row and its detail page.
#
# @param agent [Mechanize] agent used to fetch the detail page
# @param cells [Nokogiri::XML::NodeSet] the <td> cells of one listing row;
#   cells 0, 2 and 3 are read as name, city and state, cell 4 must contain
#   the link to the detail page (cell 1 is not used)
# @return [Hash{Symbol => String}] the scraped fields
# @raise [StandardError] if an expected cell/link is missing or the fetch fails
def scrape_charity(agent, cells)
  charity = {
    name: cells[0].text.strip,
    city: cells[2].text.strip,
    state: cells[3].text.strip,
    detail_path: cells[4].css('a').attr('href').text.strip
  }

  detail_page = agent.get(charity[:detail_path])
  charity[:detail_url] = "#{BASE_URL}#{charity[:detail_path]}"

  AMOUNT_SELECTORS.each do |field, selector|
    charity[field] = stripped_text(detail_page, selector).delete(',')
  end
  TEXT_SELECTORS.each do |field, selector|
    charity[field] = stripped_text(detail_page, selector)
  end
  LIST_SELECTORS.each do |field, selector|
    charity[field] = joined_cells(detail_page, selector)
  end

  charity
end

# Scrapes every member listed under one first letter.
#
# Rows that cannot be scraped are reported on stdout and skipped, so one bad
# row does not abort the whole run.
#
# @param agent [Mechanize] shared agent
# @param letter [String] first letter passed to the member search
# @return [Array<Hash>] one record per successfully scraped row
def scrape_letter(agent, letter)
  listing = agent.get("#{BASE_URL}MemberSearch.aspx?FirstLetter=#{letter}")

  listing.search('#BaseContent_Content_GridViewData tr').each_with_object([]) do |row, found|
    cells = row.css('td')
    next if cells.empty? # rows without <td> cells carry no member data

    print '.'
    begin
      found << scrape_charity(agent, cells)

      #
      # optional sleep so as to go easy on their servers
      # sleep 2
    rescue StandardError => e
      # Report why the row failed instead of silently swallowing the error.
      puts "fail on #{row.text}: #{e.class}: #{e.message}"
    end
  end
end

# One agent for the whole run; creating it per letter was loop-invariant work.
agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }

charities = ('A'..'Z').flat_map do |letter|
  puts "\nstarting #{letter}"
  scrape_letter(agent, letter)
end

CSV.open('output.csv', 'wb') do |csv|
  csv << ROW_HEADERS.map(&:to_s)
  charities.each { |charity| csv << charity.values_at(*ROW_HEADERS) }
end