Skip to content

Instantly share code, notes, and snippets.

@joshcrews
Created February 21, 2015 13:09

Revisions

  1. joshcrews created this gist Feb 21, 2015.
    79 changes: 79 additions & 0 deletions gistfile1.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,79 @@
    require 'rubygems'
    require 'mechanize'
    require 'csv'

    # Returns the whitespace-stripped text of whatever +selector+ matches on +page+.
    def label_text(page, selector)
      page.search(selector).text.strip
    end

    # Same as label_text, with every comma removed (used for the money fields).
    def amount_text(page, selector)
      label_text(page, selector).delete(',')
    end

    # Joins the stripped, non-empty texts of the nodes matched by +selector+
    # into a single ", "-separated string.
    def list_text(page, selector)
      page.search(selector).map { |cell| cell.text.strip }.reject(&:empty?).join(', ')
    end

    # Builds the record for one listing row.
    #
    # +agent+ is the Mechanize agent used to fetch the row's detail page and
    # +cells+ is the row's <td> node set. Name, city, state and the detail link
    # are read from the cells; every other field is read from the detail page.
    # Raises (NoMethodError, or whatever the fetch raises) when the row does not
    # have the expected cells/link or the detail page cannot be retrieved; the
    # caller is expected to rescue that.
    def scrape_charity(agent, cells)
      charity = {
        name: cells.first.text.strip,
        city: cells[2].text.strip,
        state: cells[3].text.strip,
        detail_path: cells[4].css('a').attr('href').text.strip,
      }

      charity_page = agent.get(charity[:detail_path])

      charity[:detail_url] = "http://www.ecfa.org/#{charity[:detail_path]}"

      charity[:cash_donations] = amount_text(charity_page, '#BaseContent_Content_lblCashDonations')
      charity[:noncash_donations] = amount_text(charity_page, '#BaseContent_Content_lblNonCashDonations')
      charity[:other_revenue] = amount_text(charity_page, '#BaseContent_Content_lblOtherRevenue')
      charity[:net_assets] = amount_text(charity_page, '#BaseContent_Content_lblNetAssets')
      charity[:contact_phone] = label_text(charity_page, '#BaseContent_Content_lblContactInfoPhone')
      charity[:website] = label_text(charity_page, '#BaseContent_Content_lblContactInfoWebsite')
      charity[:executive_director] = label_text(charity_page, '#BaseContent_Content_lblContact')

      charity[:ministry_types] = list_text(charity_page, '#BaseContent_Content_lstMinistryTypes td')
      charity[:ministry_sectors] = list_text(charity_page, '#BaseContent_Content_lstMinistrySectors td')

      charity
    end

    # Every successfully scraped charity, in the order encountered.
    charities = []

    # One agent for the whole run; it does not depend on the letter, so there is
    # no reason to rebuild it 26 times.
    agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }

    ('A'..'Z').each do |letter|
      puts "\nstarting #{letter}"

      url = "http://www.ecfa.org/MemberSearch.aspx?FirstLetter=#{letter}"

      # A listing page that cannot be fetched must not abort the run: nothing
      # is written to disk until every letter is done, so a crash here would
      # throw away everything collected so far.
      begin
        page = agent.get(url)
      rescue StandardError => e
        puts "fail on letter #{letter}: #{e.class}: #{e.message}"
        next
      end

      page.search('#BaseContent_Content_GridViewData tr').each do |row|
        cells = row.css('td')
        # Rows without any <td> carry no charity data.
        next if cells.first.nil?
        print "."

        begin
          charities << scrape_charity(agent, cells)
        rescue StandardError => e
          # Best effort: report the row together with the reason and move on.
          puts "fail on #{row.text}: #{e.class}: #{e.message}"
        end

        #
        # optional sleep so as to go easy on their servers
        # (also after a failed row, so errors do not turn into rapid-fire requests)
        #
        sleep 2
      end
    end

    # Keys of each charity record to export, in output-column order. The same
    # list supplies the CSV header line and the per-record value lookup.
    row_headers = %i[
      name
      city
      state
      detail_url
      cash_donations
      noncash_donations
      other_revenue
      net_assets
      contact_phone
      website
      executive_director
      ministry_types
      ministry_sectors
    ]

    # Write output.csv: a header line built from row_headers, then one line per
    # scraped charity holding that charity's values in the same column order
    # (a key the record lacks yields nil for that column).
    CSV.open("output.csv", "wb") do |out|
      out << row_headers.map { |header| header.to_s }

      charities.each { |record| out << record.values_at(*row_headers) }
    end