jmsword · January 28, 2017 17:56
diff --git a/education.py b/education.py
 from bs4 import BeautifulSoup
 import requests
 import pandas as pd
 import sqlite3 as lite
 import csv
 import numpy as np
 import matplotlib.pyplot as plt
 import numpy as np
 import statsmodels.api as sm

 # Archived copy of the UN education statistics page that holds the data
 url = 'http://web.archive.org/web/20110514112442/http://unstats.un.org/unsd/demographic/products/socind/education.htm'
 r = requests.get(url)

 soup = BeautifulSoup(r.content, "lxml")

 # The table we are after is the seventh <table> element on the page
 table = soup.find_all('table')[6]

 # Keep only the <tr> rows at positions 8-190 of that table and reduce each
 # one to the stripped text of its <td> cells
 rows = table.find_all('tr')[8:191]
 raw_data = [
     [cell.text.strip() for cell in row.find_all('td')]
     for row in rows
 ]

 # Fields taken from each scraped row, by cell index:
 #   0 -> country, 1 -> year, 7 -> men, 10 -> women, 4 -> total
 # The three figures are converted to int; country and year stay strings.
 data_set = [
     [str(rec[0]), str(rec[1]), int(rec[7]), int(rec[10]), int(rec[4])]
     for rec in raw_data
 ]

 # Load the cleaned rows into a dataframe with named columns
 df_columns = ['Country', 'Year', 'Men', 'Women', 'Total']
 df = pd.DataFrame(data_set, columns=df_columns, index=None)

 # Build the local SQLite table `gdp` (one row per country, one REAL column
 # per year 1999-2010) from the GDP CSV export.
 con = lite.connect('gdp.db')
 cur = con.cursor()

 with con:
     # Start from a clean table so that re-running the script neither fails
     # with "table gdp already exists" nor piles up duplicate rows.
     cur.execute('DROP TABLE IF EXISTS gdp')
     cur.execute('CREATE TABLE gdp (Country TEXT, _1999 REAL, _2000 REAL, _2001 REAL, _2002 REAL, _2003 REAL, _2004 REAL, _2005 REAL, _2006 REAL, _2007 REAL, _2008 REAL, _2009 REAL, _2010 REAL )')

 # newline='' is how the csv module expects its input file to be opened
 with open('API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv', newline='') as inputFile:
     # Skip the four lines that precede the column header, then read the
     # header line itself so the reader starts on the first data row.
     for _ in range(4):
         next(inputFile)
     header = next(inputFile)
     inputReader = csv.reader(inputFile)
     # Values are bound through placeholders rather than pasted into the SQL
     # text, so quotes inside a CSV field cannot break or alter the statement.
     insert_sql = ('INSERT INTO gdp (Country, _1999, _2000, _2001, _2002, _2003, _2004, _2005, _2006, _2007, _2008, _2009, _2010) '
                   'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)')
     # A single transaction for the whole load instead of one commit per row
     with con:
         for line in inputReader:
             if not line:
                 # csv.reader yields [] for a blank line; nothing to insert
                 continue
             # Field 0 is the country name; fields 43-54 are presumably the
             # 1999-2010 values, matching the table columns - verify against
             # the CSV header if the export layout changes.
             cur.execute(insert_sql, [line[0]] + line[43:55])

 # Pull the whole gdp table back out of SQLite as a pandas dataframe, taking
 # the column headers from the cursor's result description
 with con:
     cur.execute('SELECT * FROM gdp')
     cols = [name for name, *_ in cur.description]
     rows = cur.fetchall()
     gdp = pd.DataFrame(rows, columns=cols)

 # Swap blank ('') entries for NaN
 gdp = gdp.replace('', np.nan)

 # Inner-join the education and gdp dataframes on the country name.
 # Countries whose names are not spelled identically in both sources fall out
 # of the join; reconciling them by hand was skipped to save time on this
 # assignment, so only the countries that matched are analysed.
 combined = df.merge(gdp, how='inner', on='Country')

 # Discard every row that still contains a NaN
 combined.dropna(inplace=True)

 # Strip the leading '_' from the gdp year columns ('_1999' -> '1999', ...,
 # '_2010' -> '2010') so the column names line up with the education 'Year'
 # values. The mapping is generated rather than typed out year by year.
 year_labels = {'_' + str(yr): str(yr) for yr in range(1999, 2011)}
 combined.rename(columns=year_labels, inplace=True)


 # Log transformation on the gdp year columns (positions 5-16 of the merged
 # frame). Values are cast to float first so np.log receives a numeric
 # column even if the data came back from SQLite with object dtype.
 for col in combined.columns[5:17]:
     combined[col] = np.log(combined[col].astype(float))

 # Copy each row's log gdp for its own education 'Year' into a new 'log_gdp'
 # column. Label-based .loc is used because DataFrame.ix was removed in
 # pandas 1.0. Rows whose 'Year' has no matching column are left untouched
 # (NaN in 'log_gdp').
 for row in combined.index:
     year = combined.loc[row, 'Year']
     if year in combined.columns:
         combined.loc[row, 'log_gdp'] = combined.loc[row, year]

 #ANALYSIS
 # Ordinary least squares fit of log gdp (dependent variable) on the 'Total'
 # education figure, with an intercept term added.

 total_edu = combined['Total']
 gdp = combined['log_gdp']

 # Plain n-by-1 float arrays are used instead of np.matrix, which NumPy no
 # longer recommends; the shapes match what the transposed matrices had.
 x = np.asarray(total_edu, dtype=float).reshape(-1, 1)
 y = np.asarray(gdp, dtype=float).reshape(-1, 1)

 X = sm.add_constant(x)
 model = sm.OLS(y, X)
 f = model.fit()

 print('P-Values: ', f.pvalues)
 print('R-Squared: ', f.rsquared)

 # The literals below are joined by implicit concatenation; the first one
 # ends with a space so the two sentences do not run together.
 print("With an R-Squared value of: ", f.rsquared, ", there appears to be little correlation between school life expentancy and a country's GDP. "
       "I believe this is because with increasing globalization and the movement of labor across boarders, local education attainment "
       "is becoming less and less important to a country's productivity.")
	from bs4 import BeautifulSoup
	import requests
	import pandas as pd
	import sqlite3 as lite
	import csv
	import numpy as np
	import matplotlib.pyplot as plt
	import numpy as np
	import statsmodels.api as sm

	# Archived copy of the UN education statistics page that holds the data
	url = 'http://web.archive.org/web/20110514112442/http://unstats.un.org/unsd/demographic/products/socind/education.htm'
	r = requests.get(url)

	soup = BeautifulSoup(r.content, "lxml")

	# The table we are after is the seventh <table> element on the page
	table = soup('table')[6]

	# Keep only the <tr> rows at positions 8-190 of that table and reduce each
	# one to the stripped text of its <td> cells. Written as a comprehension so
	# the per-row work is unambiguously inside the loop.
	rows = table.find_all('tr')[8:191]
	raw_data = [
	    [cell.text.strip() for cell in row.find_all('td')]
	    for row in rows
	]

	# Fields taken from each scraped row, by cell index:
	#   0 -> country, 1 -> year, 7 -> men, 10 -> women, 4 -> total
	# The three figures are converted to int; country and year stay strings.
	data_set = []
	for i in raw_data:
	    country = str(i[0])
	    year = str(i[1])
	    men = int(i[7])
	    women = int(i[10])
	    total = int(i[4])
	    data_set.append([country, year, men, women, total])

	# Load the cleaned rows into a dataframe with named columns
	df_columns = ['Country', 'Year', 'Men', 'Women', 'Total']
	df = pd.DataFrame(data_set, index=None, columns=df_columns)

	# Build the local SQLite table `gdp` (one row per country, one REAL column
	# per year 1999-2010) from the GDP CSV export.
	con = lite.connect('gdp.db')
	cur = con.cursor()

	with con:
	    # Start from a clean table so that re-running the script neither fails
	    # with "table gdp already exists" nor piles up duplicate rows.
	    cur.execute('DROP TABLE IF EXISTS gdp')
	    cur.execute('CREATE TABLE gdp (Country TEXT, _1999 REAL, _2000 REAL, _2001 REAL, _2002 REAL, _2003 REAL, _2004 REAL, _2005 REAL, _2006 REAL, _2007 REAL, _2008 REAL, _2009 REAL, _2010 REAL )')

	# newline='' is how the csv module expects its input file to be opened
	with open('API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv', newline='') as inputFile:
	    # Skip the four lines that precede the column header, then read the
	    # header line itself so the reader starts on the first data row.
	    for _ in range(4):
	        next(inputFile)
	    header = next(inputFile)
	    inputReader = csv.reader(inputFile)
	    # Values are bound through placeholders rather than pasted into the SQL
	    # text, so quotes inside a CSV field cannot break or alter the statement.
	    insert_sql = ('INSERT INTO gdp (Country, _1999, _2000, _2001, _2002, _2003, _2004, _2005, _2006, _2007, _2008, _2009, _2010) '
	                  'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)')
	    # A single transaction for the whole load instead of one commit per row
	    with con:
	        for line in inputReader:
	            if not line:
	                # csv.reader yields [] for a blank line; nothing to insert
	                continue
	            # Field 0 is the country name; fields 43-54 are presumably the
	            # 1999-2010 values, matching the table columns - verify against
	            # the CSV header if the export layout changes.
	            cur.execute(insert_sql, [line[0]] + line[43:55])

	# Pull the whole gdp table back out of SQLite as a pandas dataframe, taking
	# the column headers from the cursor's result description. All four
	# statements belong to the `with con:` body.
	with con:
	    cur.execute('SELECT * FROM gdp')
	    rows = cur.fetchall()
	    cols = [desc[0] for desc in cur.description]
	    gdp = pd.DataFrame(rows, columns=cols)

	# Swap blank ('') entries for NaN
	gdp.replace('', np.nan, inplace=True)

	# Inner-join the education and gdp dataframes on the country name.
	# Countries whose names are not spelled identically in both sources fall out
	# of the join; reconciling them by hand was skipped to save time on this
	# assignment, so only the countries that matched are analysed.
	combined = df.merge(gdp, how='inner', on='Country')

	# Discard every row that still contains a NaN
	combined.dropna(inplace=True)

	# Strip the leading '_' from the gdp year columns ('_1999' -> '1999', ...,
	# '_2010' -> '2010') so the column names line up with the education 'Year'
	# values. The mapping is generated rather than typed out year by year.
	year_labels = {'_' + str(yr): str(yr) for yr in range(1999, 2011)}
	combined.rename(columns=year_labels, inplace=True)


	# Log transformation on the gdp year columns (positions 5-16 of the merged
	# frame). Values are cast to float first so np.log receives a numeric
	# column even if the data came back from SQLite with object dtype.
	for col in combined.columns[5:17]:
	    combined[col] = np.log(combined[col].astype(float))

	# Copy each row's log gdp for its own education 'Year' into a new 'log_gdp'
	# column. Label-based .loc is used because DataFrame.ix was removed in
	# pandas 1.0. Rows whose 'Year' has no matching column are left untouched
	# (NaN in 'log_gdp').
	for row in combined.index:
	    year = combined.loc[row, 'Year']
	    if year in combined.columns:
	        combined.loc[row, 'log_gdp'] = combined.loc[row, year]

	#ANALYSIS
	# Ordinary least squares fit of log gdp (dependent variable) on the 'Total'
	# education figure, with an intercept term added.

	total_edu = combined['Total']
	gdp = combined['log_gdp']

	# Plain n-by-1 float arrays are used instead of np.matrix, which NumPy no
	# longer recommends; the shapes match what the transposed matrices had.
	x = np.asarray(total_edu, dtype=float).reshape(-1, 1)
	y = np.asarray(gdp, dtype=float).reshape(-1, 1)

	X = sm.add_constant(x)
	model = sm.OLS(y, X)
	f = model.fit()

	print('P-Values: ', f.pvalues)
	print('R-Squared: ', f.rsquared)

	# The literals below are joined by implicit concatenation; the first one
	# ends with a space so the two sentences do not run together.
	print("With an R-Squared value of: ", f.rsquared, ", there appears to be little correlation between school life expentancy and a country's GDP. "
	      "I believe this is because with increasing globalization and the movement of labor across boarders, local education attainment "
	      "is becoming less and less important to a country's productivity.")