Created
February 2, 2026 16:23
-
-
Save lesteve/7fd423c4227973338569dd73a340879f to your computer and use it in GitHub Desktop.
tabular-benchmark-analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %%
import pandas as pd
from pathlib import Path

# Load the tabular-benchmark results and drop columns that are entirely NaN.
path = Path('~/dev/tabular-benchmark/analyses/results/benchmark_total.csv').expanduser()
data_all = pd.read_csv(path)
data_all = data_all.dropna(axis='columns', how='all')
# %%
# Restrict to the nyc-taxi dataset and the gradient-boosting family of models.
is_taxi = data_all['dataset_name'] == 'nyc-taxi-green-dec-2016'
is_boosting = data_all['model_name'].str.match('.*(GradientBoosting|XGBoost)')
data = data_all[is_taxi & is_boosting].reset_index()
# Negative scores are floored at 0 so they do not distort summaries and plots.
data['mean_test_score'] = data['mean_test_score'].clip(lower=0)
# %%
# Questions:
# - Why is HistGradientBoosting not that good? I can not reproduce it seems,
#   is it a bug in scikit-learn at the time?
#   I do get that the perf of HistGradientBoosting is below XGBoost and
#   GradientBoosting.
data[['model_name', 'mean_test_score']].groupby('model_name').describe()
# %%
# max_features is only used in RandomForest ... so not useful to see whether
# that improves things ...
data_all.loc[data_all['model__max_features'].notnull(), 'model_name'].value_counts()
# %%
data.loc[data['model_name'] == 'GradientBoostingTree', 'mean_test_score'].hist()
# %%
data.loc[data['model_name'] == 'HistGradientBoostingTree', 'mean_test_score'].hist()
# %%
# Columns whose name mentions 'col' (looking for the colsample_* hyper-parameters).
[x for x in data.columns if 'col' in x]
# %%
import plotly.express as px

# Parallel-coordinates view of a few hand-picked XGBoost hyper-parameters
# against the test score.
xgboost_rows = data['model_name'] == 'XGBoost'
parameters = data.loc[
    xgboost_rows,
    ['model__learning_rate', 'model__colsample_bylevel',
     'model__colsample_bytree', 'mean_test_score'],
]
fig = px.parallel_coordinates(
    parameters,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()
# %%
# Same plot, but with every hyper-parameter that actually varied in the search.
data_xgboost = data.loc[data['model_name'] == 'XGBoost']
parameter_columns = [c for c in data.columns if 'model__' in c]
# parameters are relevant if they have more than 1 unique value
n_unique = data_xgboost[parameter_columns].nunique()
relevant_parameters = list(n_unique.index[n_unique > 1])
parameters = data_xgboost[relevant_parameters + ['mean_test_score']]
fig = px.parallel_coordinates(
    parameters,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()
# Summary there does not seem to be a big effect for XGBoost of the colsample_* parameters ...
# %%
relevant_parameters
# %%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment