Last active
November 25, 2021 06:15
-
-
Save yudhiesh/0a0c96955ab53ce65fbff7295e7b3055 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import lightgbm as lgb | |
import mlflow | |
import optuna | |
from optuna.integration import LightGBMPruningCallback | |
from datetime import datetime | |
from sklearn.metrics import roc_auc_score, log_loss | |
from sklearn.datasets import load_breast_cancer | |
from sklearn.model_selection import train_test_split | |
def get_train_valid(dataset):
    """Split a DataFrame into training and hold-out features and targets.

    The last column of ``dataset`` is used as the target and every column
    before it as a feature. A quarter of the rows is held out, and the split
    is seeded with 42 so repeated calls yield the same partition.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    target = dataset.iloc[:, -1]
    features = dataset.iloc[:, :-1]
    (
        train_features,
        holdout_features,
        train_target,
        holdout_target,
    ) = train_test_split(
        features,
        target,
        random_state=42,
        test_size=0.25,
    )
    return train_features, holdout_features, train_target, holdout_target
def get_study_results(study, objective, name):
    """Print a summary of an Optuna study's best trial and return its model.

    Args:
        study: The study whose ``best_trial`` parameters are printed.
        objective: The objective instance holding the best fitted model in
            its ``best_booster`` attribute.
        name: Label used in the printed header.

    Returns:
        The model stored in ``objective.best_booster``.
    """
    print(f"Best trial - {name}:")
    winning_trial = study.best_trial
    print(" Params: ")
    for param_name, param_value in winning_trial.params.items():
        print(f" {param_name}: {param_value}")
    winning_model = objective.best_booster
    print(f"Best validation score: {winning_model.best_score_}")
    return winning_model
def run_experiment(n_trials, dataset, name):
    """Tune the model on ``dataset`` with a fresh Optuna study.

    A new ``Objective`` is built for ``dataset`` and maximized for
    ``n_trials`` trials under a median pruner with 10 warm-up steps. The
    objective's ``callback`` is registered on the study so the objective can
    keep track of the best model.

    Args:
        n_trials: Number of trials to run.
        dataset: Data handed to ``Objective``.
        name: Experiment label, printed title-cased in the banner.

    Returns:
        The tuple ``(study, objective)``.
    """
    print(f"Running experiment for : {name.title()}")
    tuning_objective = Objective(dataset=dataset)
    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    tuning_study = optuna.create_study(
        direction="maximize",
        pruner=median_pruner,
    )
    tuning_study.optimize(
        tuning_objective,
        n_trials=n_trials,
        callbacks=[tuning_objective.callback],
    )
    return tuning_study, tuning_objective
class Objective:
    """Optuna objective that tunes a LightGBM binary classifier.

    An instance is passed to ``study.optimize`` as the objective function and
    its ``callback`` method is passed in ``callbacks``; together they keep the
    fitted model of the best trial so far in ``best_booster``. Every trial is
    recorded as its own MLflow run, and the value returned to Optuna is the
    ROC AUC on the hold-out split.
    """

    def __init__(self, dataset):
        """Store the dataset; no model is available until a trial completes."""
        # Model fitted by the best trial so far (maintained by ``callback``).
        self.best_booster = None
        # Model fitted by the most recent trial that finished training.
        self._booster = None
        self.dataset = dataset

    def __call__(self, trial):
        """Train one candidate model and return its hold-out ROC AUC.

        Args:
            trial: The Optuna trial used to sample hyperparameters and to
                report intermediate AUC values for pruning.

        Returns:
            ROC AUC of the predicted positive-class probabilities on the
            hold-out split.
        """
        X_train, X_test, y_train, y_test = get_train_valid(self.dataset)
        param_grid = {
            # Single-choice categorical: the value is fixed but still shows up
            # in ``trial.params``; early stopping picks the real round count.
            "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_data_in_leaf": trial.suggest_int(
                "min_data_in_leaf", 200, 10000, step=100
            ),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            # Upper bound is 0.9 rather than 0.95: with step=0.1 starting at
            # 0.2 Optuna clamps 0.95 down to 0.9 anyway and emits a warning on
            # every trial. The sampled values are unchanged.
            "bagging_fraction": trial.suggest_float(
                "bagging_fraction", 0.2, 0.9, step=0.1
            ),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            "feature_fraction": trial.suggest_float(
                "feature_fraction", 0.2, 0.9, step=0.1
            ),
        }
        run_name = f"Test_{datetime.now()}"
        with mlflow.start_run(run_name=run_name):
            model = lgb.LGBMClassifier(objective="binary", **param_grid)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_test, y_test)],
                eval_metric=["auc", "binary_logloss"],
                callbacks=[
                    # The ``early_stopping_rounds`` keyword of ``fit`` was
                    # deprecated in LightGBM 3.3 and removed in 4.0; the
                    # callback form works on both.
                    lgb.early_stopping(stopping_rounds=100),
                    # Reports the validation AUC to Optuna so that
                    # unpromising trials can be pruned.
                    LightGBMPruningCallback(trial, "auc"),
                ],
            )
            # log params from optuna trial
            mlflow.log_params(trial.params)
            self._booster = model
            y_pred = model.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, y_pred)
            log_loss_ = log_loss(y_test, y_pred)
            # log metrics here
            mlflow.log_metrics(
                {
                    "log_loss": log_loss_,
                    "roc_auc": roc_auc,
                },
            )
        return roc_auc

    def callback(self, study, trial):
        """Promote the latest model to ``best_booster`` if its trial is the best.

        Intended to be passed in ``callbacks`` to ``study.optimize``, which
        invokes it after every trial with the study and the finished trial.
        """
        try:
            best_trial = study.best_trial
        except ValueError:
            # Optuna raises ValueError while no trial has completed (e.g. the
            # first trials were pruned or failed); there is nothing to promote
            # and the study should keep running.
            return
        # Compare by trial number instead of whole-object equality.
        if best_trial.number == trial.number:
            self.best_booster = self._booster
if __name__ == "__main__":
    # Script entry point: tune on the scikit-learn breast-cancer frame for
    # 100 trials, then print the best trial and keep its model.
    experiment_name = "Data1"
    breast_cancer_frame = load_breast_cancer(as_frame=True).frame
    study, objective = run_experiment(
        n_trials=100,
        dataset=breast_cancer_frame,
        name=experiment_name,
    )
    best_model = get_study_results(
        study=study,
        objective=objective,
        name=experiment_name,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment