Created
March 12, 2021 05:55
-
-
Save sxjscience/9cbe4ac99d983770746fc22a6dba7513 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import argparse | |
import os | |
import json | |
import random | |
from autogluon.tabular import TabularPredictor | |
from autogluon.text import TextPredictor | |
from autogluon.text.text_prediction.infer_types import infer_column_problem_types | |
from autogluon.text.text_prediction import constants as _C | |
def get_parser():
    """Build the command-line parser for the house price example.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing ``--mode``, ``--data_path``, ``--seed``,
        ``--exp_path``, ``--with_tax_values`` and ``--overwrite_exp``.
    """
    parser = argparse.ArgumentParser(
        description='The Basic Example of AutoGluon for House Price Prediction.')
    parser.add_argument('--mode', choices=['stack5', 'weighted', 'single'],
                        default='weighted',
                        help='Training recipe: "weighted" fits a TabularPredictor with the '
                             'multimodal hyperparameters, "stack5" additionally uses 5 bag '
                             'folds and 1 stack level, "single" fits a TextPredictor.')
    parser.add_argument('--data_path', type=str, default='kaggle',
                        help='Directory containing train.csv, test.csv and test_label.csv.')
    parser.add_argument('--seed', type=int, default=123,
                        help='Random seed.')
    parser.add_argument('--exp_path', default=None,
                        help='Directory for the predictor and the result files. '
                             'Defaults to ag_zillow_<mode>.')
    # Kept as an int flag (not store_true) so existing "--with_tax_values 0"
    # invocations keep working.
    parser.add_argument('--with_tax_values', default=1, type=int,
                        help='Non-zero keeps the (log-scaled) tax columns, 0 drops them.')
    parser.add_argument('--overwrite_exp', action='store_true',
                        help='Whether to overwrite the existing experiment.')
    return parser
def preprocess(df, with_tax_values=True, log_scale_lot=True, has_label=True):
    """Return a cleaned copy of the raw house price table.

    The input frame is left untouched; all changes are made on a copy.

    Parameters
    ----------
    df
        DataFrame with at least the columns 'Id', 'Elementary School' and
        'Lot', the two tax columns ('Tax assessed value',
        'Annual tax amount'), and 'Sold Price' when ``has_label`` is true.
    with_tax_values
        If truthy, replace both tax columns by ``log(1 + x)``; otherwise
        drop both columns.
    log_scale_lot
        If truthy, replace 'Lot' by ``log(1 + Lot)``.
    has_label
        If truthy, replace 'Sold Price' by its natural logarithm.

    Returns
    -------
    pandas.DataFrame
        The transformed copy, without the 'Id' column.
    """
    school_suffix = ' Elementary School'

    def _strip_school_suffix(name):
        # Test for exactly the suffix that gets removed, so a value that
        # merely ends in "Elementary School" without the separating space
        # does not lose one extra character. Non-matching values
        # (including NaN) are passed through unchanged.
        text = str(name)
        if text.endswith(school_suffix):
            return text[:-len(school_suffix)]
        return name

    new_df = df.copy()
    new_df.drop(columns=['Id'], inplace=True)
    new_df['Elementary School'] = new_df['Elementary School'].apply(_strip_school_suffix)
    if log_scale_lot:
        # log1p(x) == log(x + 1), but stays accurate for small x.
        new_df['Lot'] = np.log1p(new_df['Lot'])
    if with_tax_values:
        new_df['Tax assessed value'] = np.log1p(new_df['Tax assessed value'])
        new_df['Annual tax amount'] = np.log1p(new_df['Annual tax amount'])
    else:
        new_df.drop(columns=['Tax assessed value', 'Annual tax amount'], inplace=True)
    if has_label:
        new_df['Sold Price'] = np.log(new_df['Sold Price'])
    return new_df
def set_seed(seed):
    """Seed the PyTorch, MXNet, NumPy and Python random number generators.

    MXNet and PyTorch are imported lazily, inside the function.
    """
    import mxnet
    import torch

    seeders = (torch.manual_seed, mxnet.random.seed, np.random.seed, random.seed)
    for apply_seed in seeders:
        apply_seed(seed)
def train(args):
    """Fit a predictor on the house price data and write test-set results.

    Reads ``train.csv``, ``test.csv`` and ``test_label.csv`` from
    ``args.data_path``, preprocesses them, fits the predictor selected by
    ``args.mode`` and writes the following files into ``args.exp_path``:

    * ``test_leaderboard.csv`` (modes 'weighted' and 'stack5' only),
    * ``test_predictions.csv`` -- predictions mapped back from log space
      with ``exp``,
    * ``test_score.json`` -- the value returned by ``predictor.evaluate``
      on the labelled test set, stored under the key 'r2'.

    Parameters
    ----------
    args
        Namespace providing ``seed``, ``data_path``, ``with_tax_values``,
        ``mode`` and ``exp_path``. NOTE(review): ``overwrite_exp`` is
        defined by the parser but is not consulted here.

    Raises
    ------
    NotImplementedError
        If ``args.mode`` is not one of 'weighted', 'single' or 'stack5'.
    """
    set_seed(args.seed)

    train_df = pd.read_csv(os.path.join(args.data_path, 'train.csv'))
    test_no_label_df = pd.read_csv(os.path.join(args.data_path, 'test.csv'))
    test_label_df = pd.read_csv(os.path.join(args.data_path, 'test_label.csv'))
    # Attach the labels to the test features; rows are matched on 'Id'.
    test_df = test_no_label_df.merge(test_label_df, how='outer', on='Id')

    train_df = preprocess(train_df, with_tax_values=args.with_tax_values, has_label=True)
    test_no_label_df = preprocess(test_no_label_df,
                                  with_tax_values=args.with_tax_values, has_label=False)
    test_df = preprocess(test_df,
                         with_tax_values=args.with_tax_values, has_label=True)

    label_column = 'Sold Price'
    eval_metric = 'r2'

    if args.mode in ('weighted', 'stack5'):
        # Both tabular modes share the same setup; 'stack5' only adds
        # bagging/stacking options to fit().
        fit_kwargs = {}
        if args.mode == 'stack5':
            fit_kwargs = {'num_bag_folds': 5, 'num_stack_levels': 1}
        predictor = TabularPredictor(eval_metric=eval_metric, label=label_column,
                                     path=args.exp_path)
        predictor.fit(train_df, hyperparameters='multimodal', **fit_kwargs)
        leaderboard = predictor.leaderboard(test_df)
        leaderboard.to_csv(os.path.join(args.exp_path, 'test_leaderboard.csv'))
    elif args.mode == 'single':
        predictor = TextPredictor(eval_metric=eval_metric, label=label_column,
                                  path=args.exp_path)
        predictor.fit(train_df, seed=args.seed)
    else:
        raise NotImplementedError(f'Unsupported mode: {args.mode!r}')

    # The label was log-transformed in preprocess(), so undo that here.
    predictions = np.exp(predictor.predict(test_no_label_df))
    predictions.to_csv(os.path.join(args.exp_path, 'test_predictions.csv'))

    score = predictor.evaluate(test_df)
    with open(os.path.join(args.exp_path, 'test_score.json'), 'w') as of:
        json.dump({'r2': score}, of)
if __name__ == '__main__':
    # Parse the CLI options; when no output directory was given, fall back
    # to one named after the selected mode.
    args = get_parser().parse_args()
    args.exp_path = f'ag_zillow_{args.mode}' if args.exp_path is None else args.exp_path
    train(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment