Created
October 9, 2018 11:45
-
-
Save srishtis/4e241d8600570d4df184c5918b026759 to your computer and use it in GitHub Desktop.
HPP missing value treatment part 3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def norm_minmax(col):
    """Scale a column to the [0, 1] range using min-max normalization.

    Args:
        col: A pandas Series (or compatible array-like exposing ``astype``,
            ``min`` and ``max``) holding numeric or boolean values.

    Returns:
        A float column of the same length where the smallest value maps to
        0.0 and the largest to 1.0. A column whose values are all equal is
        returned as all zeros rather than NaN.
    """
    # Cast first: boolean dummy columns do not support the `-` operator.
    col = col.astype(float)
    lowest = col.min()
    span = col.max() - lowest
    # A constant column has zero range; dividing would turn every value into
    # NaN (0/0), so only shift it instead.
    if span == 0:
        return col - lowest
    return (col - lowest) / span
# Impute the missing LotFrontage values with a simple linear regression.
# The author's premise: LotFrontage describes the street connected to the
# property, so it should correlate strongly with LotArea and also depend on
# LotShape and LotConfig.

# One-hot encode the categoricals. 'SalePrice' is left out for now because
# 'LotFrontage' is the target of this auxiliary model.
total_df_dummy = pd.get_dummies(total_df.drop(columns='SalePrice'))

# Min-max scale every predictor column (everything except 'LotFrontage').
for col in total_df_dummy.drop(columns='LotFrontage').columns:
    total_df_dummy[col] = norm_minmax(total_df_dummy[col])

# Training rows are those without any NaN.
# NOTE(review): dropna() discards a row for a NaN in *any* column; this
# presumably relies on LotFrontage being the only column with gaps - confirm.
frontage_train = total_df_dummy.dropna()
frontage_train_X = frontage_train.drop(columns='LotFrontage')
frontage_train_y = frontage_train['LotFrontage']

# Fit the regressor.
lin_reg = linear_model.LinearRegression()
lin_reg.fit(frontage_train_X, frontage_train_y)

# Inspect the fitted coefficients, largest first.
lr_coefs = pd.Series(lin_reg.coef_, index=frontage_train_X.columns)
print(lr_coefs.sort_values(ascending=False))

# Predict LotFrontage for the rows where it is missing ...
nulls_in_lotfrontage = total_df['LotFrontage'].isna()
features = total_df_dummy.loc[nulls_in_lotfrontage].drop(columns='LotFrontage')
target = lin_reg.predict(features)

# ... and write the predictions back into the original frame.
total_df.loc[nulls_in_lotfrontage, 'LotFrontage'] = target
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment