Created
January 28, 2017 17:56
-
-
Save jmsword/f198672f2755abbde83212cfb3d63f09 to your computer and use it in GitHub Desktop.
Education
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import pandas as pd | |
import sqlite3 as lite | |
import csv | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import statsmodels.api as sm | |
#Where the data is on the web | |
url = 'http://web.archive.org/web/20110514112442/http://unstats.un.org/unsd/demographic/products/socind/education.htm' | |
r = requests.get(url) | |
soup = BeautifulSoup(r.content, "lxml") | |
#Specific table we want to target | |
table = soup('table')[6] | |
#Narrowing down the table to just the rows we want | |
raw_data = [] | |
rows = table.find_all('tr')[8:191] | |
for row in rows: | |
cols = row.find_all('td') | |
cols = [ele.text.strip() for ele in cols] | |
raw_data.append([ele for ele in cols]) | |
#Data we need is in indicies: 0,1,7,10 | |
data_set = [] | |
for i in raw_data: | |
country = str(i[0]) | |
year = str(i[1]) | |
men = int(i[7]) | |
women = int(i[10]) | |
total = int(i[4]) | |
clean_data = [country, year, men, women, total] | |
data_set.append(clean_data) | |
#Create dataframe to put HTML data into | |
df_columns = ['Country', 'Year', 'Men', 'Women', 'Total'] | |
df = pd.DataFrame(data_set, index=None, columns=df_columns) | |
con = lite.connect('gdp.db') | |
cur = con.cursor() | |
with con: | |
cur.execute('CREATE TABLE gdp (Country TEXT, _1999 REAL, _2000 REAL, _2001 REAL, _2002 REAL, _2003 REAL, _2004 REAL, _2005 REAL, _2006 REAL, _2007 REAL, _2008 REAL, _2009 REAL, _2010 REAL )') | |
with open('API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv') as inputFile: | |
next(inputFile) | |
next(inputFile) | |
next(inputFile) | |
next(inputFile) | |
header = next(inputFile) | |
inputReader = csv.reader(inputFile) | |
for line in inputReader: | |
with con: | |
cur.execute('INSERT INTO gdp (Country, _1999, _2000, _2001, _2002, _2003, _2004, _2005, _2006, _2007, _2008, _2009, _2010) VALUES ("' + line[0] + '","' + '","'.join(line[43:55]) + '");') | |
#Get gdp table as pandas dataframe | |
with con: | |
cur.execute('SELECT * FROM gdp') | |
rows = cur.fetchall() | |
cols = [desc[0] for desc in cur.description] | |
gdp = pd.DataFrame(rows, columns=cols) | |
#Replace blank values with 'NaN' | |
gdp.replace('', np.nan, inplace=True) | |
#Merge gdp and educational attainment dataframes on country name | |
#Several countries were dropped because the names didn't match up perfectly. In a perfect world I would look | |
#through the list manually and change the names to match up, but for the purpose of this assignment I am just | |
#going to work with the countries that matched to save time. | |
combined = pd.merge(df, gdp, on='Country', how='inner') | |
#Drop rows with 'NaN' values | |
combined.dropna(inplace=True) | |
#Rename columns to match year education attainment and gdb values | |
#rewrite at some point to a function to remove '_' from column name instead | |
#of doing it manually | |
combined.rename(columns={'_1999': '1999', | |
'_2000': '2000', | |
'_2001': '2001', | |
'_2002': '2002', | |
'_2003': '2003', | |
'_2004': '2004', | |
'_2005': '2005', | |
'_2006': '2006', | |
'_2007': '2007', | |
'_2008': '2008', | |
'_2009': '2009', | |
'_2010': '2010', | |
}, inplace=True) | |
#Log transformation on gdp numbers | |
for col in combined.columns[5:17]: | |
combined[col] = np.log(combined[col]) | |
#Match gdp numbers with corresponding education year in a new column | |
for row in combined.index: | |
for col in combined.columns: | |
if np.all(combined.ix[row]['Year'] == col): | |
combined.ix[row,'log_gdp'] = combined.ix[row,col] | |
#ANALYSIS | |
total_edu = combined['Total'] | |
gdp = combined['log_gdp'] | |
x = np.matrix(total_edu).transpose() | |
y = np.matrix(gdp).transpose() | |
X = sm.add_constant(x) | |
model = sm.OLS(y,X) | |
f = model.fit() | |
print ('P-Values: ', f.pvalues) | |
print ('R-Squared: ', f.rsquared) | |
print("With an R-Squared value of: ", f.rsquared, ", there appears to be little correlation between school life expentancy and a country's GDP." \ | |
"I believe this is because with increasing globalization and the movement of labor across boarders, local education attainment " \ | |
"is becoming less and less important to a country's productivity.") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment