Skip to content

Instantly share code, notes, and snippets.

@svolle
Last active August 29, 2015 14:01
Show Gist options
  • Select an option

  • Save svolle/5f1d1e2457bc1cd7c9c9 to your computer and use it in GitHub Desktop.

Select an option

Save svolle/5f1d1e2457bc1cd7c9c9 to your computer and use it in GitHub Desktop.
Brain dead script to get "developer density" in main French cities.
#!/bin/env python
"""
Brain dead script that scrapes Wikipedia for the top French cities and their population,
then queries GitHub to get an estimated developer population for each city.
"""
__author__ = 'svolle'
import requests
import github3
from bs4 import BeautifulSoup
from getpass import getpass
import logging
import time
import csv
import sys
import re
# French-language Wikipedia: base host and the page ranking the most populated communes.
WIKI_HOST = 'http://fr.wikipedia.org'
CITIES_LIST_PAGE_URL = ''.join(
    (WIKI_HOST, '/wiki/Liste_des_communes_de_France_les_plus_peupl%C3%A9es')
)

# How many cities to keep from the top of the ranking, and where the report is written.
NUM_CITIES = 20
OUTPUT_FILE = 'results.csv'

# Collected data, keyed by city name, e.g.
# { 'Lyon': { 'population': 123456, 'developers': 123456 } }
results = dict()

# Module logger: everything from DEBUG upwards is echoed to standard output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(stream=sys.stdout))
def get_top_cities_list(response):
    """Extract the most populated cities from the Wikipedia ranking page.

    Args:
        response: HTTP response for the ranking page; only its ``text``
            attribute (the page HTML) is read.

    Returns:
        A list of at most ``NUM_CITIES`` ``(city_name, city_href)`` tuples,
        in the order the links appear in the ranking table.

    Raises:
        RuntimeError: if the ``Classement_des_communes`` section or the table
            that follows it cannot be located in the page.
    """
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed alongside BeautifulSoup.
    doc = BeautifulSoup(response.text, 'html.parser')

    anchor = doc.find(id='Classement_des_communes')
    parent = anchor.parent if anchor is not None else None
    section = parent.parent if parent is not None else None
    cities_table = section.find('table') if section is not None else None
    if cities_table is None:
        raise RuntimeError('Could not locate the cities ranking table')

    # The city link is the bold anchor in the first cell of each row.
    city_links = cities_table.select('td:first-child b a')[:NUM_CITIES]
    # Return a real list (not a one-shot iterator) so callers can re-iterate.
    return [(city_link.text, city_link['href']) for city_link in city_links]
def get_larger_area_population(city_name, city_url):
    """Fetch a city's urban-area ("aire urbaine") population from Wikipedia.

    Args:
        city_name: City name, used only for log messages.
        city_url: Path of the city's page, appended to ``WIKI_HOST``.

    Returns:
        The population as an ``int``, or ``None`` when the page could not be
        fetched or the figure could not be found/parsed.
    """
    logger.debug('Getting larger city population for city %s', city_name)
    response = requests.get(WIKI_HOST + city_url)
    if response.status_code > 200:
        logger.debug('Error while getting city page. Skipping.')
        return None

    doc = BeautifulSoup(response.text, 'html.parser')
    try:
        # Locate the "aire urbaine" link, walk up two levels, then take the
        # first <span> found inside a <td> under that ancestor.
        population_node = doc.find('a', text='aire urbaine').parent.parent.select('td span')[0]
        # Strip whitespace and the "hab." suffix before converting.
        population = int(re.sub(r'\s|hab\.', '', population_node.text))
    except (AttributeError, IndexError, ValueError):
        # Scraping is best-effort: a missing link (AttributeError), missing
        # cell (IndexError) or non-numeric text (ValueError) skips the city.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        logger.debug('Could not get additional information for city %s', city_name)
        return None

    logger.debug('Got population %d for city %s', population, city_name)
    return population
def get_developer_population(city_name, client=None):
    """Estimate how many GitHub users declare ``city_name`` as their location.

    Runs a user search for ``location:'<city_name>'``, reads the search's
    ``total_count``, then pauses so the next call stays within the search
    rate limit.

    Args:
        city_name: City name to search for.
        client: Optional object exposing ``search_users`` and ``rate_limit``
            (for example the session returned by ``github3.login``). When
            omitted, the module-level ``github3`` functions are used.

    Returns:
        The total number of matching users reported by the search, or 0 when
        the search yields no user.
    """
    gh = github3 if client is None else client
    developers = 0
    logger.debug('Getting developers population from Github')
    search_iterator = gh.search_users("location:'%s'" % city_name)
    # Pull a single result: presumably total_count is only filled in once the
    # first page of results has been requested.
    for _ in search_iterator:
        developers = search_iterator.total_count
        break
    logger.debug('Got %d developers for city %s from Github', developers, city_name)

    search_rate_limit = gh.rate_limit()['resources']['search']
    logger.debug('Github rate limitation: %d', search_rate_limit['remaining'])
    if search_rate_limit['remaining'] == 0:
        logger.debug('Hit Github rate limitation.')
        resets_at = search_rate_limit['reset']
        # Clamp at zero: the reset time may already be in the past, and
        # time.sleep() rejects negative durations.
        wait_for = max(int(resets_at - time.time()) + 1, 0)
        logger.debug('Waiting for %d seconds', wait_for)
        time.sleep(wait_for)
    else:
        # Small fixed pause between searches.
        time.sleep(2)
    return developers


def write_results(results):
    """Write the collected figures to ``OUTPUT_FILE`` as CSV.

    Args:
        results: Mapping of city name to a dict with integer ``'population'``
            and ``'developers'`` entries.

    The file gets one header row followed by one row per city: name,
    population, developers, and developers per 1000 inhabitants rounded to
    two decimals (0.00 when the population is 0).
    """
    logger.debug('Writing results to CSV file')
    rows = []
    for city_name, city_data in results.items():
        population = city_data['population']
        developers = city_data['developers']
        # True division keeps the ratio exact; guard against a zero population.
        per_thousand = developers * 1000 / population if population else 0.0
        rows.append([city_name, population, developers, '%.2f' % per_thousand])

    # newline='' is what the csv module expects; UTF-8 keeps accented city
    # names intact regardless of the platform default encoding.
    with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['City', 'Population', 'Developers', 'Developer per 1000 inhabitants'])
        writer.writerows(rows)


def main():
    """Prompt for GitHub credentials, gather per-city figures, write the CSV.

    Raises:
        RuntimeError: if the Wikipedia ranking page cannot be fetched.
    """
    user = input('Github user: ')
    password = ''
    while not password:
        password = getpass('Password for {0}: '.format(user))
    # Keep the session returned by login() and hand it to the search helper;
    # the credentials are otherwise never used by the module-level github3
    # functions. NOTE(review): per the github3.py API docs - confirm for the
    # installed version.
    client = github3.login(user, password)

    logger.debug('Getting list of top cities')
    response = requests.get(CITIES_LIST_PAGE_URL)
    if response.status_code > 200:
        raise RuntimeError('Could not get cities list page')

    for top_city_name, top_city_url in get_top_cities_list(response):
        population = get_larger_area_population(top_city_name, top_city_url)
        if population is None:
            # No usable population figure: skip the city entirely.
            continue
        developer_population = get_developer_population(top_city_name, client)
        results[top_city_name] = {'population': population, 'developers': developer_population}

    write_results(results)
    # Use the configured module logger; the root logger has no handler here,
    # so logging.debug() would never be displayed.
    logger.debug('All done')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment