Last active
May 21, 2020 18:15
-
-
Save max-kov/10fb29e3716a1756e4995fee72c6da4d to your computer and use it in GitHub Desktop.
Cambridge colleges admissions scraper and analyser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import colorsys | |
import pandas as pd | |
from scipy.stats import beta | |
from matplotlib import pyplot as plt | |
import math | |
import numpy as np | |
import matplotlib.patches as mpatches | |
# Column (college) names that load_bound_df drops from every year's CSV.
# Empty by default, so every college is analysed.
excluded_colleges = []
def rgb_tuble_to_html(color):
    """Render an RGB colour with float channels as an HTML ``#rrggbb`` string.

    Each channel is multiplied by 255 and floored to an integer; the first
    three resulting values are formatted as two-digit lowercase hex.

    Args:
        color: Iterable of numeric channel values (e.g. the tuple returned by
            ``colorsys.hsv_to_rgb``).

    Returns:
        The colour as a string such as ``"#7f3fbf"``.
    """
    def to_byte(channel):
        # Scale the fractional channel onto the 0..255 integer range.
        return int(math.floor(channel * 255.))

    return "#{:02x}{:02x}{:02x}".format(*(to_byte(channel) for channel in color))
def load_bound_df(year, excluded=None):
    """Load one year's admissions CSV and bound each college's offer rate.

    Reads ``"<year>.csv"``, whose first column is headed ``"-"`` and holds the
    statistic names, with one further column per college. For every college
    the 10th and 90th percentiles of ``Beta(offers + 1, rejections + 1)`` are
    computed, where offers are direct plus pool offers and rejections are
    total applications minus total offers.

    Args:
        year: Value interpolated into the file name ``"{}.csv"``.
        excluded: Optional iterable of column (college) names to drop before
            processing; names not present in the file are ignored. Defaults
            to the module-level ``excluded_colleges`` list.

    Returns:
        A DataFrame with the columns ``college``, ``lower`` and ``upper`` and
        one row per remaining college, in the order the colleges appear in
        the CSV header.
    """
    if excluded is None:
        excluded = excluded_colleges
    excluded = set(excluded)

    df = pd.read_csv("{}.csv".format(year))
    # Filter with an ordered list rather than a set difference: a set would
    # scramble the columns, making the row order of the result arbitrary and
    # different from run to run.
    df = df[[column for column in df.columns if column not in excluded]]
    df = df.set_index("-")
    # Transpose so that each row is a college and each column a statistic.
    df = df.T

    df["Total Applications"] = df["Direct applications"] + df["Open applications"]
    df["Total Offers"] = df["Direct offers"] + df["Pool offers by other Colleges"]
    df["Total Rejections"] = df["Total Applications"] - df["Total Offers"]

    # Getting the 80% confidence interval using a Beta distribution
    rows = []
    for college, row in df.iterrows():
        lower, upper = beta.ppf(
            [0.1, 0.9], row["Total Offers"] + 1, row["Total Rejections"] + 1
        )
        rows.append((college, lower, upper))

    return pd.DataFrame(rows, columns=["college", "lower", "upper"])
# Draw one bar chart per year showing every college's lower/upper bound,
# with the colleges ordered by their lower bound.
# NOTE(review): the original indentation was lost; plt.show() is assumed to
# belong inside the loop (one window per year) - confirm.
for year in (2017, 2016, 2015):
    year_intervals = load_bound_df(year)
    by_college = year_intervals.set_index("college")
    ranked = by_college.sort_values("lower")
    year_2017_overview_plot = ranked.plot(kind="bar", title="{} Overview".format(year))
    plt.show()
years_to_analyze = [2013, 2014, 2015, 2016, 2017]

# Load each year's interval table, tag every row with its year (as a string)
# and stack the tables into a single frame.
interval_dfs = [
    load_bound_df(year).assign(year=str(year)) for year in years_to_analyze
]
interval_df = pd.concat(interval_dfs)
ax = plt.axes()

# Picked these based on the overview of 2015,2016,2017
selected_colleges = [
    u'Jesus College',
    u'Churchill College',
    u'Magdalene College',
    u'Emmanuel College',
    u'Gonville and Caius College',
    u'Pembroke College',
    u'Clare College',
    u"Queens' College",
]

# One evenly spaced hue per college so every pair of lines gets its own colour.
hues = np.linspace(0, 1., num=len(selected_colleges), endpoint=False)

legend_patches = []
for college, hue in zip(selected_colleges, hues):
    matches_college = interval_df["college"] == college
    college_vals = interval_df[matches_college].set_index("year")
    color = rgb_tuble_to_html(colorsys.hsv_to_rgb(hue, 0.8, 0.8))
    # Dotted line for the lower bound, solid line for the upper bound.
    for bound, line_style in (("lower", ":"), ("upper", "-")):
        college_vals[bound].plot(style=line_style, ax=ax, color=color)
    legend_patches.append(mpatches.Patch(color=color, label=college))

plt.legend(handles=legend_patches)
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import csv | |
import json | |
import os | |
# Download the admissions chart data for each year and save it as
# "<year>.csv": a header row ("-" followed by the chart's x-axis categories)
# and then one row per chart series (series name followed by its values).
for year_num in range(2010, 2018):
    year = str(year_num)
    # `year` comes from range() above, so splicing it into the shell command
    # cannot inject anything.
    curl_req = "curl 'https://www.undergraduate.study.cam.ac.uk/apply/statistics' -H 'origin: https://www.undergraduate.study.cam.ac.uk' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: en-US,en;q=0.8,ru;q=0.6,lv;q=0.4' -H 'cookie: X-Mapping-iejmlgke=D5A918F0CE6670C7B94E800FC55B2597; has_js=1' -H 'upgrade-insecure-requests: 1' -H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36' -H 'content-type: application/x-www-form-urlencoded' -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' -H 'cache-control: max-age=0' -H 'authority: www.undergraduate.study.cam.ac.uk' -H 'referer: https://www.undergraduate.study.cam.ac.uk/apply/statistics' -H 'dnt: 1' --data 'period=year&year=" + year + "&app%5Bapplications%5D=applications&open%5Bopen%5D=open&off%5Boffers%5D=offers&winter%5Bwinter%5D=winter&acc%5Bacceptances%5D=acceptances&summer%5Bsummer%5D=summer&what=university&college=Christ%27s+College&course=Anglo-Saxon%2C+Norse%2C+and+Celtic&group=college&op=Show+graph&form_build_id=form-bRA6Yr2VG3zK5k1Fv170KMfcvDrohtLUlmxgYFDJCB0&form_id=cam_app_charts_my_form_1' --compressed"

    # Close the pipe as soon as the response has been read instead of
    # leaving it open until garbage collection.
    with os.popen(curl_req) as pipe:
        html = pipe.read()

    soup = BeautifulSoup(html, "html.parser")
    chart = soup.find("div", {"class": "chart"})
    if chart is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise RuntimeError("No chart data found in the response for year " + year)
    json_data = chart.get("data-chart")
    data = json.loads(json_data)

    colleges = ["-"] + data["xAxis"][0]["categories"]

    with open(year + ".csv", 'w') as csvfile:
        parsed_data = csv.writer(csvfile)
        parsed_data.writerow(colleges)
        # Iterate the series directly; this avoids parallel lists and a
        # module-level variable named `type` that shadowed the builtin.
        for series in data["series"]:
            parsed_data.writerow([series["name"]] + series["data"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment