Last active
August 29, 2015 14:01
-
-
Save svolle/5f1d1e2457bc1cd7c9c9 to your computer and use it in GitHub Desktop.
Brain dead script to get "developer density" in main French cities.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/env python | |
| """ | |
| Brain dead script that scrapes Wikipedia for the top French cities and their population, | |
| then queries GitHub to get an estimated city developer population. | |
| """ | |
| __author__ = 'svolle' | |
| import requests | |
| import github3 | |
| from bs4 import BeautifulSoup | |
| from getpass import getpass | |
| import logging | |
| import time | |
| import csv | |
| import sys | |
| import re | |
# --- Scraping configuration -------------------------------------------------
# Base host of the French Wikipedia; relative article links are appended to it.
WIKI_HOST = 'http://fr.wikipedia.org'
# Article holding the ranking table of the most populated French communes.
CITIES_LIST_PAGE_URL = '{0}{1}'.format(
    WIKI_HOST, '/wiki/Liste_des_communes_de_France_les_plus_peupl%C3%A9es')
# How many cities to keep from the top of the ranking table.
NUM_CITIES = 20
# Name of the CSV report written at the end of the run.
OUTPUT_FILE = 'results.csv'

# Collected data, keyed by city name, e.g.
# { 'Lyon': { 'population': 123456, 'developers': 123456 } }
results = dict()

# --- Logging ----------------------------------------------------------------
# Send every message of this module, down to DEBUG, to standard output.
logger = logging.getLogger(__name__)
_stdout_handler = logging.StreamHandler(sys.stdout)
logger.setLevel(logging.DEBUG)
logger.addHandler(_stdout_handler)
def get_top_cities_list(response):
    """Extract the top cities from the Wikipedia ranking page.

    :param response: HTTP response for ``CITIES_LIST_PAGE_URL``; only its
        ``text`` attribute (the page HTML) is read.
    :returns: a list of at most ``NUM_CITIES`` ``(city_name, city_href)``
        tuples, in the order the links appear in the ranking table.
    :raises RuntimeError: if the ranking section or its table cannot be
        located in the page.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    doc = BeautifulSoup(response.text, 'html.parser')
    try:
        # The element carrying the section id sits two levels below the
        # container whose first table is the ranking.
        section_anchor = doc.find(id='Classement_des_communes')
        cities_table = section_anchor.parent.parent.find('table')
        city_links = cities_table.select('td:first-child b a')[:NUM_CITIES]
    except AttributeError as err:
        # find() returned None somewhere along the chain.
        raise RuntimeError('Could not locate the cities ranking table') from err
    # Return a real list (not a one-shot iterator) so callers may iterate it
    # more than once or take its length.
    return [(link.text, link['href']) for link in city_links]
def get_larger_area_population(city_name, city_url):
    """Fetch a city's Wikipedia page and read its "aire urbaine" population.

    This is best effort: any failure is logged and reported as ``None`` so the
    caller can skip the city.

    :param city_name: display name of the city, used for log messages only.
    :param city_url: link to the city article, appended to ``WIKI_HOST``.
    :returns: the population as an ``int``, or ``None`` when the page could
        not be fetched or the figure could not be found or parsed.
    """
    logger.debug('Getting larger city population for city %s', city_name)
    try:
        # A timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(WIKI_HOST + city_url, timeout=30)
    except requests.RequestException:
        logger.debug('Error while getting city page. Skipping.')
        return None
    if response.status_code != 200:
        logger.debug('Error while getting city page. Skipping.')
        return None

    try:
        doc = BeautifulSoup(response.text, 'html.parser')
        # Locate the "aire urbaine" link, then the first <td><span> in the
        # element two levels above it, which holds the figure.
        area_link = doc.find('a', text='aire urbaine')
        population_node = area_link.parent.parent.select('td span')[0]
        # Drop whitespace separators and the "hab." unit before converting.
        population = int(re.sub(r'\s|hab\.', '', population_node.text))
    except (AttributeError, IndexError, ValueError):
        # AttributeError: link not found; IndexError: no matching cell;
        # ValueError: the cell text is not a plain number.
        logger.debug('Could not get additional information for city %s', city_name)
        return None

    logger.debug('Got population %d for city %s', population, city_name)
    return population
def get_developer_population(city_name, client=None):
    """Return the number of GitHub users whose location matches a city.

    After the search, the function sleeps to respect the search rate limit:
    2 seconds normally, or until the reported reset time when no search
    requests remain.

    :param city_name: city to look for in the users' ``location`` field.
    :param client: optional object exposing ``search_users`` and
        ``rate_limit`` (for example the client returned by ``github3.login``).
        When omitted, the module-level ``github3`` functions are used, which
        do not carry the credentials of a previous ``github3.login`` call.
    :returns: the total number of matching users (0 when there is none).
    """
    api = github3 if client is None else client
    developers = 0
    logger.debug('Getting developers population from Github')
    # Double quotes make a multi-word city ("Le Havre") a single phrase.
    search_iterator = api.search_users('location:"%s"' % city_name)
    # total_count is read after the first result is fetched; one is enough.
    for _ in search_iterator:
        developers = search_iterator.total_count
        break
    logger.debug('Got %d developers for city %s from Github', developers, city_name)

    search_rate_limit = api.rate_limit()['resources']['search']
    remaining = search_rate_limit['remaining']
    logger.debug('Github rate limitation: %d', remaining)
    if remaining == 0:
        logger.debug('Hit Github rate limitation.')
        # Clamp at zero: the reset time may already be in the past, and
        # time.sleep() rejects negative values.
        wait_for = max(0, int(search_rate_limit['reset'] - time.time()) + 1)
        logger.debug('Waiting for %d seconds', wait_for)
        time.sleep(wait_for)
    else:
        time.sleep(2)
    return developers


def write_results(results):
    """Write the collected figures to ``OUTPUT_FILE`` as CSV.

    :param results: mapping of city name to a dict with the integer keys
        ``'population'`` and ``'developers'``.
    """
    logger.debug('Writing results to CSV file')
    # newline='' is what the csv module expects; UTF-8 keeps accented city
    # names intact regardless of the platform default encoding.
    with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['City', 'Population', 'Developers', 'Developer per 1000 inhabitants'])
        for city_name, city_data in results.items():
            population = city_data['population']
            developers = city_data['developers']
            # Exact per-1000 ratio; guard against an empty population.
            ratio = developers * 1000.0 / population if population else 0.0
            writer.writerow([city_name, population, developers, '%.2f' % ratio])


def main():
    """Prompt for GitHub credentials, gather the data and write the CSV report.

    :raises RuntimeError: if the cities ranking page cannot be fetched.
    """
    user = input('Github user: ')
    password = False
    # Keep asking until a non-empty password is entered.
    while not password:
        password = getpass('Password for {0}: '.format(user))
    # Keep the authenticated client; the searches must go through it for the
    # credentials to be used.
    client = github3.login(user, password)

    logger.debug('Getting list of top cities')
    response = requests.get(CITIES_LIST_PAGE_URL, timeout=30)
    if response.status_code != 200:
        raise RuntimeError('Could not get cities list page')

    for top_city_name, top_city_url in get_top_cities_list(response):
        population = get_larger_area_population(top_city_name, top_city_url)
        if population is None:
            # No usable population figure: skip the city entirely.
            continue
        developer_population = get_developer_population(top_city_name, client=client)
        results[top_city_name] = {'population': population, 'developers': developer_population}

    write_results(results)
    # Use the module logger; the root logger would drop a DEBUG message.
    logger.debug('All done')
# Run the scraper only when executed as a script, not when imported.
if '__main__' == __name__:
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment