Created
February 2, 2026 16:23
-
-
Save lesteve/7fd423c4227973338569dd73a340879f to your computer and use it in GitHub Desktop.
tabular-benchmark-analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %%
import pandas as pd
from pathlib import Path

# Load the tabular-benchmark results and drop columns that are entirely NaN.
path = Path('~/dev/tabular-benchmark/analyses/results/benchmark_total.csv').expanduser()
data_all = pd.read_csv(path)
data_all = data_all.dropna(axis='columns', how='all')
# %%
# Restrict to the nyc-taxi dataset and the gradient-boosting family of models.
is_taxi = data_all['dataset_name'] == 'nyc-taxi-green-dec-2016'
is_boosting = data_all['model_name'].str.match('.*(GradientBoosting|XGBoost)')
data = data_all[is_taxi & is_boosting].reset_index()
# Negative scores are floored at 0 so they do not distort summaries and plots.
data['mean_test_score'] = data['mean_test_score'].clip(lower=0)
# %%
# Questions:
# - Why is HistGradientBoosting not that good? I can not reproduce it seems,
#   is it a bug in scikit-learn at the time?
#   I do get that the perf of HistGradientBoosting is below XGBoost and
#   GradientBoosting.
data[['model_name', 'mean_test_score']].groupby('model_name').describe()
# %%
# max_features is only used in RandomForest ... so not useful to see whether
# that improves things ...
data_all.loc[data_all['model__max_features'].notnull(), 'model_name'].value_counts()
# %%
data.loc[data['model_name'] == 'GradientBoostingTree', 'mean_test_score'].hist()
# %%
data.loc[data['model_name'] == 'HistGradientBoostingTree', 'mean_test_score'].hist()
# %%
# Columns whose name mentions 'col' (looking for the colsample_* hyper-parameters).
[x for x in data.columns if 'col' in x]
# %%
import plotly.express as px

# Parallel-coordinates view of a few hand-picked XGBoost hyper-parameters
# against the test score.
xgboost_rows = data['model_name'] == 'XGBoost'
parameters = data.loc[
    xgboost_rows,
    ['model__learning_rate', 'model__colsample_bylevel',
     'model__colsample_bytree', 'mean_test_score'],
]
fig = px.parallel_coordinates(
    parameters,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()
# %%
# Same plot, but with every hyper-parameter that actually varied in the search.
data_xgboost = data.loc[data['model_name'] == 'XGBoost']
parameter_columns = [c for c in data.columns if 'model__' in c]
# parameters are relevant if they have more than 1 unique value
n_unique = data_xgboost[parameter_columns].nunique()
relevant_parameters = list(n_unique.index[n_unique > 1])
parameters = data_xgboost[relevant_parameters + ['mean_test_score']]
fig = px.parallel_coordinates(
    parameters,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()
# Summary there does not seem to be a big effect for XGBoost of the colsample_* parameters ...
# %%
relevant_parameters
# %%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment