Skip to content

Instantly share code, notes, and snippets.

@sxjscience
Last active May 4, 2021 18:35
Show Gist options
  • Save sxjscience/7f2c858132a6a9bff1d3c6f15b9690be to your computer and use it in GitHub Desktop.
Save sxjscience/7f2c858132a6a9bff1d3c6f15b9690be to your computer and use it in GitHub Desktop.

California House Prices Prediction

Install AutoGluon. I used version 0.1 for the submission; you may also try the latest version, which may give better performance.

pip install autogluon

The competition is hosted at https://www.kaggle.com/c/california-house-prices. Download and unpack the data:

kaggle competitions download -c california-house-prices
unzip california-house-prices.zip -d california-house-prices

Run experiments:

# Weighted Ensemble TextPredictor (Multimodal Network) (0.13487)
mkdir -p ag_house_cal
python3 run_house_price.py --mode stack5 --exp_path ag_house_cal 2>&1 | tee -a ag_house_cal/log.txt
import pandas as pd
import numpy as np
import argparse
import os
import json
import random
from autogluon.tabular import TabularPredictor
from autogluon.text import TextPredictor
from autogluon.text.text_prediction.infer_types import infer_column_problem_types
from autogluon.text.text_prediction import constants as _C
def get_parser():
    """Build the command-line parser for the house-price experiment.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing ``--mode``, ``--data_path``, ``--seed``,
        ``--exp_path``, ``--with_tax_values`` and ``--overwrite_exp``.
    """
    # One (flag, add_argument keyword options) pair per option, registered
    # in the order listed.
    option_specs = [
        ('--mode', dict(choices=['stack5', 'weighted', 'single'],
                        default='weighted')),
        ('--data_path', dict(type=str, default='california-house-prices')),
        ('--seed', dict(type=int, default=123)),
        ('--exp_path', dict(default=None)),
        # Integer flag (not store_true) so it can be switched off with 0.
        ('--with_tax_values', dict(default=1, type=int)),
        ('--overwrite_exp', dict(
            action='store_true',
            help='Whether to overwrite the existing experiment.')),
    ]
    cli = argparse.ArgumentParser(
        description='The Basic Example of AutoGluon for House Price Prediction.')
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
def preprocess(df, with_tax_values=True, log_scale_lot=True,
               log_scale_listed_price=True, has_label=True):
    """Return a cleaned, log-scaled copy of a raw house-price table.

    The input frame is left untouched; all changes are made on a copy.

    Parameters
    ----------
    df
        DataFrame containing the columns 'Id', 'Elementary School',
        'Tax assessed value' and 'Annual tax amount', plus 'Lot',
        'Listed Price' and 'Sold Price' whenever the corresponding
        option below is enabled.
    with_tax_values
        If truthy, replace the two tax columns with ``log(x + 1)``;
        otherwise drop both columns.
    log_scale_lot
        If truthy, replace 'Lot' with ``log(Lot + 1)``.
    log_scale_listed_price
        If truthy, replace 'Listed Price' with its natural log, clipped
        below at 0.
    has_label
        If truthy, replace 'Sold Price' with its natural log.

    Returns
    -------
    pandas.DataFrame
        The transformed copy, without the 'Id' column.
    """
    suffix = ' Elementary School'

    def strip_school_suffix(ele):
        # Test for exactly the text that gets sliced off. Checking only for
        # 'Elementary School' (no leading space) while removing
        # len(' Elementary School') characters would eat one extra character
        # from names that lack the space (or blank them out entirely).
        name = str(ele)
        return name[:-len(suffix)] if name.endswith(suffix) else ele

    new_df = df.copy()
    new_df.drop('Id', axis=1, inplace=True)
    new_df['Elementary School'] = new_df['Elementary School'].apply(
        strip_school_suffix)
    if log_scale_lot:
        new_df['Lot'] = np.log(new_df['Lot'] + 1)
    if log_scale_listed_price:
        # Clip at 0 so listed prices below 1 (including 0, whose log is
        # -inf) do not produce negative or infinite features.
        log_listed_price = np.log(new_df['Listed Price']).clip(0, None)
        new_df['Listed Price'] = log_listed_price
    if with_tax_values:
        new_df['Tax assessed value'] = np.log(new_df['Tax assessed value'] + 1)
        new_df['Annual tax amount'] = np.log(new_df['Annual tax amount'] + 1)
    else:
        new_df.drop('Tax assessed value', axis=1, inplace=True)
        new_df.drop('Annual tax amount', axis=1, inplace=True)
    if has_label:
        new_df['Sold Price'] = np.log(new_df['Sold Price'])
    return new_df
def set_seed(seed):
    """Seed the random number generators used by the experiment.

    Python's ``random`` module and NumPy are always seeded. PyTorch and
    MXNet are seeded only when they can be imported, so a missing backend
    no longer makes the whole script fail at seeding time.

    Parameters
    ----------
    seed
        Integer seed applied to every available generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # The deep-learning backends are imported lazily and are optional:
    # not every AutoGluon installation ships both of them.
    try:
        import torch as th
    except ImportError:
        pass
    else:
        th.manual_seed(seed)
    try:
        import mxnet as mx
    except ImportError:
        pass
    else:
        mx.random.seed(seed)
def train(args):
    """Fit the predictor selected by ``args.mode`` and write a submission.

    Reads ``train.csv``, ``test.csv`` and ``sample_submission.csv`` from
    ``args.data_path``, preprocesses the train/test tables, fits a model
    and writes ``submission.csv`` (plus ``leaderboard.csv`` for the tabular
    modes) into ``args.exp_path``.

    Parameters
    ----------
    args
        Parsed command-line namespace. This function reads ``seed``,
        ``data_path``, ``with_tax_values``, ``mode`` and ``exp_path``.
        NOTE: ``overwrite_exp`` is not consulted here.

    Raises
    ------
    NotImplementedError
        If ``args.mode`` is not one of 'weighted', 'stack5' or 'single'.
    """
    set_seed(args.seed)
    train_df = pd.read_csv(os.path.join(args.data_path, 'train.csv'))
    test_df = pd.read_csv(os.path.join(args.data_path, 'test.csv'))
    # For the purpose of generating submission file
    submission_df = pd.read_csv(os.path.join(args.data_path, 'sample_submission.csv'))
    train_df = preprocess(train_df,
                          with_tax_values=args.with_tax_values, has_label=True)
    test_df = preprocess(test_df,
                         with_tax_values=args.with_tax_values, has_label=False)
    label_column = 'Sold Price'
    eval_metric = 'r2'
    # Both tabular modes share the same predictor setup and differ only in
    # the extra options handed to fit().
    tabular_fit_options = {
        'weighted': {},
        'stack5': {'num_bag_folds': 5, 'num_stack_levels': 1},
    }
    if args.mode in tabular_fit_options:
        predictor = TabularPredictor(eval_metric=eval_metric, label=label_column,
                                     path=args.exp_path)
        predictor.fit(train_df, hyperparameters='multimodal',
                      **tabular_fit_options[args.mode])
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_path, 'leaderboard.csv'))
    elif args.mode == 'single':
        predictor = TextPredictor(eval_metric=eval_metric, label=label_column,
                                  path=args.exp_path)
        predictor.fit(train_df, seed=args.seed)
    else:
        raise NotImplementedError(f'Unsupported mode: {args.mode}')
    # The label was log-transformed during preprocessing, so map the
    # predictions back to the original price scale.
    predictions = np.exp(predictor.predict(test_df))
    submission_df['Sold Price'] = predictions
    submission_df.to_csv(os.path.join(args.exp_path, 'submission.csv'), index=False)
if __name__ == '__main__':
    # Script entry point: parse the CLI options, fall back to a
    # mode-specific experiment directory when none was given, then train.
    args = get_parser().parse_args()
    if args.exp_path is None:
        args.exp_path = f'ag_zillow_{args.mode}'
    train(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment