srishtis · October 9, 2018 11:45
diff --git a/hpp_missing_values_3.py b/hpp_missing_values_3.py
 # function to scale a column
 def norm_minmax(col):
     """Rescale ``col`` linearly so its minimum maps to 0 and its maximum to 1.

     Args:
         col: An object exposing ``min()`` and ``max()`` and supporting
             elementwise arithmetic (the script passes DataFrame columns).

     Returns:
         ``(col - min) / (max - min)``. When every value is identical the
         range is zero; instead of dividing 0 by 0 (which yields NaN for
         the whole column) the result is all zeros as floats.
     """
     low = col.min()
     span = col.max() - low
     shifted = col - low
     # Only a scalar span can be compared against zero directly; a
     # non-scalar span (e.g. one value per column of a frame) falls through
     # to the plain elementwise division.
     if getattr(span, "ndim", 0) == 0 and span == 0:
         # shifted is already all zeros here; multiplying by 0.0 just makes
         # the dtype float while keeping any NaN entries as NaN.
         return shifted * 0.0
     return shifted / span
    
 # Impute missing LotFrontage values with a linear regression on the other columns.
 # NOTE(review): the original note called LotFrontage an "area"; the Ames data
 # dictionary appears to describe it as linear feet of street connected to the
 # property - confirm.
 # Intuitively it should be highly correlated to variables like LotArea,
 # and it should also depend on LotShape and LotConfig.

 # Convert categoricals to dummies.
 # 'SalePrice' is dropped for now because the target here is 'LotFrontage'.
 total_df_dummy = pd.get_dummies(total_df.drop('SalePrice',axis=1))
 # Min-max scale every column except the regression target.
 for col in total_df_dummy.drop('LotFrontage',axis=1).columns:
     total_df_dummy[col] = norm_minmax(total_df_dummy[col])

 # dropna() removes every row containing a NaN in any column, so the rows
 # whose LotFrontage is missing are excluded from the training set.
 frontage_train = total_df_dummy.dropna()
 frontage_train_y = frontage_train.LotFrontage
 frontage_train_X = frontage_train.drop('LotFrontage',axis=1)

 # Fit the model.
 lin_reg= linear_model.LinearRegression()
 lin_reg.fit(frontage_train_X, frontage_train_y)

 # Inspect the fitted coefficients, largest first.
 lr_coefs = pd.Series(lin_reg.coef_,index=frontage_train_X.columns)
 print(lr_coefs.sort_values(ascending=False))


 # Use model predictions to populate the nulls.
 nulls_in_lotfrontage = total_df.LotFrontage.isnull()
 # Skip the prediction when nothing is missing (e.g. this section is re-run
 # after the fill): predicting on a zero-row frame would raise.
 if nulls_in_lotfrontage.any():
     # NOTE(review): assumes the other feature columns hold no NaN for these
     # rows - confirm earlier steps have filled them.
     features = total_df_dummy[nulls_in_lotfrontage].drop('LotFrontage',axis=1)
     target = lin_reg.predict(features)

     # Fill the NaN values in the original frame.
     total_df.loc[nulls_in_lotfrontage,'LotFrontage'] = target
	# function to scale a column
	def norm_minmax(col):
	    """Rescale ``col`` linearly so its minimum maps to 0 and its maximum to 1.

	    Args:
	        col: An object exposing ``min()`` and ``max()`` and supporting
	            elementwise arithmetic (the script passes DataFrame columns).

	    Returns:
	        ``(col - min) / (max - min)``. When every value is identical the
	        range is zero; instead of dividing 0 by 0 (which yields NaN for
	        the whole column) the result is all zeros as floats.
	    """
	    low = col.min()
	    span = col.max() - low
	    shifted = col - low
	    # Only a scalar span can be compared against zero directly; a
	    # non-scalar span (e.g. one value per column of a frame) falls through
	    # to the plain elementwise division.
	    if getattr(span, "ndim", 0) == 0 and span == 0:
	        # shifted is already all zeros here; multiplying by 0.0 just makes
	        # the dtype float while keeping any NaN entries as NaN.
	        return shifted * 0.0
	    return shifted / span

	# Impute missing LotFrontage values with a linear regression on the other columns.
	# NOTE(review): the original note called LotFrontage an "area"; the Ames data
	# dictionary appears to describe it as linear feet of street connected to the
	# property - confirm.
	# Intuitively it should be highly correlated to variables like LotArea,
	# and it should also depend on LotShape and LotConfig.

	# Convert categoricals to dummies.
	# 'SalePrice' is dropped for now because the target here is 'LotFrontage'.
	total_df_dummy = pd.get_dummies(total_df.drop('SalePrice',axis=1))
	# Min-max scale every column except the regression target.
	for col in total_df_dummy.drop('LotFrontage',axis=1).columns:
	    total_df_dummy[col] = norm_minmax(total_df_dummy[col])

	# dropna() removes every row containing a NaN in any column, so the rows
	# whose LotFrontage is missing are excluded from the training set.
	frontage_train = total_df_dummy.dropna()
	frontage_train_y = frontage_train.LotFrontage
	frontage_train_X = frontage_train.drop('LotFrontage',axis=1)

	# Fit the model.
	lin_reg= linear_model.LinearRegression()
	lin_reg.fit(frontage_train_X, frontage_train_y)

	# Inspect the fitted coefficients, largest first.
	lr_coefs = pd.Series(lin_reg.coef_,index=frontage_train_X.columns)
	print(lr_coefs.sort_values(ascending=False))


	# Use model predictions to populate the nulls.
	nulls_in_lotfrontage = total_df.LotFrontage.isnull()
	# Skip the prediction when nothing is missing (e.g. this section is re-run
	# after the fill): predicting on a zero-row frame would raise.
	if nulls_in_lotfrontage.any():
	    # NOTE(review): assumes the other feature columns hold no NaN for these
	    # rows - confirm earlier steps have filled them.
	    features = total_df_dummy[nulls_in_lotfrontage].drop('LotFrontage',axis=1)
	    target = lin_reg.predict(features)

	    # Fill the NaN values in the original frame.
	    total_df.loc[nulls_in_lotfrontage,'LotFrontage'] = target