Created
October 9, 2018 13:51
-
-
Save srishtis/b649ce9cb8ad0957308fc280b8dfcf3e to your computer and use it in GitHub Desktop.
Creating features for hpp Kaggle
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Start with the variables that have the highest correlation with the target
# variable: OverallQual, GrLivArea, GarageCars and TotalBsmtSF.

# --- OverallQual ---
# Because it is one of the variables most correlated with the response, add
# quadratic, cubic and square-root terms that may enter the regression equation.
# The integer cast is done once and reused for all three derived columns.
_overall_qual_int = total_df["OverallQual"].astype(int)
total_df["OverallQual_2"] = _overall_qual_int ** 2
total_df["OverallQual_3"] = _overall_qual_int ** 3
total_df["OverallQual_sqrt"] = np.sqrt(_overall_qual_int)

# OverallQual is really a categorical variable in the guise of integers, so the
# original column is re-typed as a string. This must run AFTER the numeric
# features above have been derived from it.
total_df["OverallQual"] = total_df["OverallQual"].astype(str)
# --- GrLivArea ---
# Polynomial (square, cube), square-root and log transforms of GrLivArea.
total_df["GrLivArea_2"] = total_df["GrLivArea"] ** 2
total_df["GrLivArea_3"] = total_df["GrLivArea"] ** 3
total_df["GrLivArea_sqrt"] = np.sqrt(total_df["GrLivArea"])
# log1p computes log(1 + x).
total_df["GrLivArea_log"] = np.log1p(total_df["GrLivArea"])

# Bucket GrLivArea into 6 bins labelled "1".."6"; with an integer bin count
# pd.cut splits the observed range into equal-width intervals.
total_df["GrLivArea_Band"] = pd.cut(
    total_df["GrLivArea"], 6, labels=["1", "2", "3", "4", "5", "6"]
)
# Show which band labels actually occur in the data.
print(total_df["GrLivArea_Band"].unique())
# The bands are essentially categorical labels; store them as plain strings
# instead of the categorical dtype that pd.cut returns.
total_df["GrLivArea_Band"] = total_df["GrLivArea_Band"].astype(str)
# --- TotalBsmtSF ---
# Polynomial (square, cube), square-root and log transforms of TotalBsmtSF.
total_df["TotalBsmtSF_2"] = total_df["TotalBsmtSF"] ** 2
total_df["TotalBsmtSF_3"] = total_df["TotalBsmtSF"] ** 3
total_df["TotalBsmtSF_sqrt"] = np.sqrt(total_df["TotalBsmtSF"])
# log1p computes log(1 + x), so a value of 0 maps to 0 rather than -inf.
total_df["TotalBsmtSF_log"] = np.log1p(total_df["TotalBsmtSF"])

# Flag column 'HasBsmt' derived from TotalBsmtSF: 'Y' when the area is greater
# than 0, otherwise 'N' (the flag is stored as 'Y'/'N' strings, not 1/0).
total_df["HasBsmt"] = np.where(total_df["TotalBsmtSF"] > 0, "Y", "N")

# Bucket TotalBsmtSF into 3 bins labelled "1".."3"; with an integer bin count
# pd.cut splits the observed range into equal-width intervals.
total_df["TotalBsmtSF_Band"] = pd.cut(
    total_df["TotalBsmtSF"], 3, labels=["1", "2", "3"]
)
# Show which band labels actually occur in the data.
print(total_df["TotalBsmtSF_Band"].unique())
# The bands are essentially categorical labels; store them as plain strings
# instead of the categorical dtype that pd.cut returns.
total_df["TotalBsmtSF_Band"] = total_df["TotalBsmtSF_Band"].astype(str)
# --- GarageCars ---
# Polynomial (square, cube), square-root and log transforms of GarageCars.
total_df["GarageCars_2"] = total_df["GarageCars"] ** 2
total_df["GarageCars_3"] = total_df["GarageCars"] ** 3
total_df["GarageCars_sqrt"] = np.sqrt(total_df["GarageCars"])
# log1p computes log(1 + x), so a value of 0 maps to 0 rather than -inf.
total_df["GarageCars_log"] = np.log1p(total_df["GarageCars"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment