|
import pandas as pd |
|
import numpy as np |
|
import argparse |
|
import os |
|
import json |
|
import random |
|
from autogluon.tabular import TabularPredictor |
|
from autogluon.text import TextPredictor |
|
from autogluon.text.text_prediction.infer_types import infer_column_problem_types |
|
from autogluon.text.text_prediction import constants as _C |
|
|
|
|
|
def get_parser():
    """Build the command-line argument parser for this example script.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing ``--mode``, ``--data_path``, ``--seed``,
        ``--exp_path``, ``--with_tax_values`` and ``--overwrite_exp``.
    """
    cli = argparse.ArgumentParser(
        description='The Basic Example of AutoGluon for House Price Prediction.')
    # Options are registered in this order, which is also the order shown
    # by ``--help``.
    option_specs = (
        ('--mode', {'choices': ['stack5', 'weighted', 'single'],
                    'default': 'weighted'}),
        ('--data_path', {'type': str, 'default': 'california-house-prices'}),
        ('--seed', {'type': int, 'default': 123}),
        ('--exp_path', {'default': None}),
        # Integer switch: any non-zero value is treated as "true" by callers
        # that test it for truthiness.
        ('--with_tax_values', {'default': 1, 'type': int}),
        ('--overwrite_exp', {'action': 'store_true',
                             'help': 'Whether to overwrite the existing experiment.'}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli
|
|
|
|
|
def preprocess(df, with_tax_values=True, log_scale_lot=True,
               log_scale_listed_price=True, has_label=True):
    """Return a cleaned, log-scaled copy of a house-price data frame.

    The input frame is left untouched; all changes are made on a copy.

    Parameters
    ----------
    df
        Data frame with the columns 'Id', 'Elementary School',
        'Tax assessed value' and 'Annual tax amount', plus 'Lot',
        'Listed Price' and 'Sold Price' when the corresponding option
        below is enabled.
    with_tax_values
        If truthy, replace the two tax columns by ``log(x + 1)``;
        otherwise drop both columns.
    log_scale_lot
        If truthy, replace 'Lot' by ``log(Lot + 1)``.
    log_scale_listed_price
        If truthy, replace 'Listed Price' by its natural log, clipped
        below at 0.
    has_label
        If truthy, replace the 'Sold Price' label by its natural log.

    Returns
    -------
    The transformed copy, without the 'Id' column.
    """
    school_suffix = 'Elementary School'

    def _strip_school_suffix(value):
        # Non-string cells (e.g. NaN) are returned unchanged.
        text = str(value)
        if not text.endswith(school_suffix):
            return value
        # Remove exactly the suffix that was tested for, then the separating
        # whitespace. The previous code always cut one extra character, which
        # ate a real letter when no space preceded the suffix.
        return text[:-len(school_suffix)].rstrip()

    new_df = df.copy()
    new_df.drop('Id', axis=1, inplace=True)
    new_df['Elementary School'] = new_df['Elementary School'].apply(_strip_school_suffix)

    if log_scale_lot:
        new_df['Lot'] = np.log(new_df['Lot'] + 1)

    if log_scale_listed_price:
        # Prices below 1 have a negative log (0 gives -inf); the clip maps
        # all of those to 0.
        new_df['Listed Price'] = np.log(new_df['Listed Price']).clip(0, None)

    tax_columns = ('Tax assessed value', 'Annual tax amount')
    if with_tax_values:
        for column in tax_columns:
            new_df[column] = np.log(new_df[column] + 1)
    else:
        for column in tax_columns:
            new_df.drop(column, axis=1, inplace=True)

    if has_label:
        # The label is modelled in log space; callers must apply exp() to
        # predictions to get prices back.
        new_df['Sold Price'] = np.log(new_df['Sold Price'])

    return new_df
|
|
|
|
|
def set_seed(seed):
    """Seed the Python, NumPy, MXNet and PyTorch random number generators.

    MXNet and PyTorch are imported inside the function rather than at
    module level.

    Parameters
    ----------
    seed
        Value passed unchanged to each library's seeding function.
    """
    import mxnet
    import torch

    # The four generators are independent of each other, so the order of
    # these calls does not matter.
    random.seed(seed)
    np.random.seed(seed)
    mxnet.random.seed(seed)
    torch.manual_seed(seed)
|
|
|
|
|
def train(args):
    """Fit an AutoGluon predictor on the house-price data and write a submission.

    Reads ``train.csv``, ``test.csv`` and ``sample_submission.csv`` from
    ``args.data_path``, preprocesses the train and test frames, fits the
    predictor selected by ``args.mode`` and writes ``submission.csv`` into
    ``args.exp_path``. The two tabular modes also write ``leaderboard.csv``
    there.

    Parameters
    ----------
    args
        Namespace providing ``seed``, ``data_path``, ``with_tax_values``,
        ``mode`` ('weighted', 'single' or 'stack5') and ``exp_path``.
        ``exp_path`` must already be a string here; the script entry point
        fills in a default when it is omitted on the command line.

    Raises
    ------
    NotImplementedError
        If ``args.mode`` is not one of the supported modes.
    """
    set_seed(args.seed)

    train_df = pd.read_csv(os.path.join(args.data_path, 'train.csv'))
    test_df = pd.read_csv(os.path.join(args.data_path, 'test.csv'))
    # Loaded only as the template for the submission file.
    submission_df = pd.read_csv(os.path.join(args.data_path, 'sample_submission.csv'))

    train_df = preprocess(train_df,
                          with_tax_values=args.with_tax_values, has_label=True)
    test_df = preprocess(test_df,
                         with_tax_values=args.with_tax_values, has_label=False)

    label_column = 'Sold Price'
    eval_metric = 'r2'

    if args.mode in ('weighted', 'stack5'):
        # Both tabular modes share the 'multimodal' preset; 'stack5' adds
        # 5-fold bagging with one stacking level.
        fit_kwargs = {}
        if args.mode == 'stack5':
            fit_kwargs.update(num_bag_folds=5, num_stack_levels=1)
        predictor = TabularPredictor(eval_metric=eval_metric, label=label_column,
                                     path=args.exp_path)
        predictor.fit(train_df, hyperparameters='multimodal', **fit_kwargs)
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_path, 'leaderboard.csv'))
    elif args.mode == 'single':
        predictor = TextPredictor(eval_metric=eval_metric, label=label_column,
                                  path=args.exp_path)
        predictor.fit(train_df, seed=args.seed)
    else:
        raise NotImplementedError(f'Unsupported mode: {args.mode!r}')

    # preprocess() log-transformed the label, so exp() maps the predictions
    # back to the price scale.
    predictions = np.exp(predictor.predict(test_df))
    submission_df['Sold Price'] = predictions
    submission_df.to_csv(os.path.join(args.exp_path, 'submission.csv'), index=False)
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: parse the command line, then train.
    parser = get_parser()
    args = parser.parse_args()
    # With no --exp_path given, use a directory named after the chosen mode.
    args.exp_path = f'ag_zillow_{args.mode}' if args.exp_path is None else args.exp_path
    train(args)