# tabular-benchmark-analysis
# %%
import pandas as pd
from pathlib import Path
path = Path('~/dev/tabular-benchmark/analyses/results/benchmark_total.csv').expanduser()
data_all = pd.read_csv(path)
data_all = data_all.dropna(axis='columns', how='all')
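# %%
# Quick sanity check (sketch): overall size of the benchmark table and which
# models it covers, before filtering down to a single dataset.
print(data_all.shape)
print(data_all['model_name'].value_counts())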
# %%
mask = (
    (data_all['dataset_name'] == 'nyc-taxi-green-dec-2016')
    & data_all['model_name'].str.match('.*(GradientBoosting|XGBoost)')
)
data = data_all[mask].reset_index()
# Set negative scores to 0
data.loc[data['mean_test_score'] < 0, 'mean_test_score'] = 0
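# %%
# Sanity check (sketch): how many runs per model ended up clipped to 0, i.e.
# had a negative mean_test_score before the line above.
data.loc[data['mean_test_score'] == 0, 'model_name'].value_counts()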
# %%
# Questions:
# - Why is HistGradientBoosting not that good? It seems I cannot reproduce
#   this; was it a bug in scikit-learn at the time?
#   I do see that the performance of HistGradientBoosting is below XGBoost
#   and GradientBoosting.
data[['model_name', 'mean_test_score']].groupby('model_name').describe()
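# %%
# Complementary view (sketch): summarize each model by its best hyperparameter
# draw rather than by the full distribution over the random search.
data.groupby('model_name')['mean_test_score'].max()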
# %%
# max_features is only set for RandomForest, so it is not useful for checking
# whether feature subsampling improves things here ...
data_all.loc[data_all['model__max_features'].notnull(), 'model_name'].value_counts()
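# %%
# Same kind of check (sketch) for the colsample_* parameters used in the
# parallel-coordinates plots below: which models actually set them.
for col in ['model__colsample_bylevel', 'model__colsample_bytree']:
    print(col)
    print(data_all.loc[data_all[col].notnull(), 'model_name'].value_counts())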
# %%
data.loc[data['model_name'] == 'GradientBoostingTree', 'mean_test_score'].hist()
# %%
data.loc[data['model_name'] == 'HistGradientBoostingTree', 'mean_test_score'].hist()
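# %%
# Overlay the two histograms for easier comparison (sketch, assuming
# matplotlib is available alongside pandas).
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
for name in ['GradientBoostingTree', 'HistGradientBoostingTree']:
    data.loc[data['model_name'] == name, 'mean_test_score'].hist(
        ax=ax, alpha=0.5, label=name
    )
ax.legend()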
# %%
# Which columns mention 'col' (to find the colsample_* parameters)
[x for x in data.columns if 'col' in x]
# %%
parameters = data.loc[
    data['model_name'] == 'XGBoost',
    ['model__learning_rate', 'model__colsample_bylevel',
     'model__colsample_bytree', 'mean_test_score'],
]
import plotly.express as px
fig = px.parallel_coordinates(
    parameters,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()
# %%
data_xgboost = data.loc[data['model_name'] == 'XGBoost']
parameter_columns = [col for col in data.columns if 'model__' in col]
# parameters are relevant if they have more than 1 unique value
relevant_parameters = list(
    data_xgboost[parameter_columns].columns[
        data_xgboost[parameter_columns].nunique() > 1
    ]
)
parameters = data_xgboost[relevant_parameters + ['mean_test_score']]
fig = px.parallel_coordinates(
    parameters,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()
# Summary: the colsample_* parameters do not seem to have a big effect for
# XGBoost ...
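# %%
# Numeric counterpart to the visual impression above (sketch): Spearman rank
# correlation of each varying numeric parameter with the test score.
parameters.select_dtypes('number').corr(method='spearman')['mean_test_score'].sort_values()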
# %%
relevant_parameters
# %%