@betatim
Last active December 10, 2025 07:38
Comparing scikit-learn's random forest with lightgbm's implementation.
"""
Benchmark: scikit-learn RandomForest vs LightGBM RandomForest
Compares performance across:
- Number of samples (1K, 10K, 100K, 500K)
- Number of features (10, 50, 200)
- Feature types (numerical, categorical, mixed)
- Number of classes (2, 5, 10)
Includes cases optimized for LightGBM's strengths:
- Native categorical handling (no one-hot encoding needed)
- Large datasets with high-cardinality categorical features
Evaluation metric: Log Loss (cross-entropy) on held-out test data
- Lower is better
- Measures both prediction quality and probability calibration
"""
import time
import warnings
from dataclasses import dataclass
from typing import Literal
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
try:
    import lightgbm as lgb
except ImportError:
    raise ImportError("Please install lightgbm: pip install lightgbm")
# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")
N_JOBS = -1 # Use all cores
N_ESTIMATORS = 100
RANDOM_STATE = 42

@dataclass
class BenchmarkConfig:
    """Configuration for a single benchmark run."""
    name: str
    n_samples: int
    n_features: int
    n_classes: int
    feature_type: Literal["numerical", "categorical", "mixed"]
    n_categorical_features: int = 0
    categorical_cardinality: int = 10
    description: str = ""
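# Illustrative (hypothetical) example of a config object:
#   BenchmarkConfig(name="demo", n_samples=1_000, n_features=10, n_classes=2,
#                   feature_type="numerical")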

def generate_data(config: BenchmarkConfig):
    """Generate synthetic data based on configuration with train/test split."""
    rng = np.random.default_rng(RANDOM_STATE)
    n_numerical = config.n_features - config.n_categorical_features
    n_categorical = config.n_categorical_features
    # Generate numerical features
    if n_numerical > 0:
        X_num = rng.standard_normal((config.n_samples, n_numerical))
    else:
        X_num = np.empty((config.n_samples, 0))
    # Generate categorical features (as integers)
    if n_categorical > 0:
        X_cat = rng.integers(
            0, config.categorical_cardinality, size=(config.n_samples, n_categorical)
        )
    else:
        X_cat = np.empty((config.n_samples, 0), dtype=np.int64)
    # Combine features (categorical columns come after the numerical ones)
    X = np.hstack([X_num, X_cat])
    # Generate target based on feature values (to create some signal)
    # Shuffle column order so all features have equal chance of contributing
    n_signal_features = min(5, config.n_features)
    shuffled_col_indices = rng.permutation(config.n_features)
    signal_col_indices = shuffled_col_indices[:n_signal_features]
    signal = X[:, signal_col_indices].sum(axis=1)
    # qcut bins the signal into n_classes equal-frequency buckets, so classes are balanced
    y = pd.qcut(signal, q=config.n_classes, labels=False).astype(np.int32)
    # Track which columns are categorical
    categorical_indices = list(range(n_numerical, config.n_features))
    # Train/test split (80/20)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    return X_train, X_test, y_train, y_test, categorical_indices
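# For example, the "Small Numerical" config below (1,000 samples, 10 numerical
# features) yields an 80/20 split: X_train of shape (800, 10) and X_test of
# shape (200, 10).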

def benchmark_sklearn(X_train, X_test, y_train, y_test, categorical_indices, config: BenchmarkConfig):
    """Benchmark scikit-learn RandomForestClassifier."""
    # sklearn needs categorical features encoded properly.
    # For fair comparison, we use the data as-is (already ordinal encoded),
    # so sklearn splits on the category codes as if they were ordered numbers.
    clf = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        n_jobs=N_JOBS,
        random_state=RANDOM_STATE,
        max_features="sqrt",
    )
    start = time.perf_counter()
    clf.fit(X_train, y_train)
    fit_time = time.perf_counter() - start
    start = time.perf_counter()
    y_proba = clf.predict_proba(X_test)
    predict_time = time.perf_counter() - start
    # Compute log loss on test set
    score = log_loss(y_test, y_proba)
    return fit_time, predict_time, score

def benchmark_lightgbm(X_train, X_test, y_train, y_test, categorical_indices, config: BenchmarkConfig):
    """Benchmark LightGBM in RandomForest mode."""
    # LightGBM RF parameters
    # (rf mode requires bagging, i.e. bagging_freq > 0 and bagging_fraction < 1)
    params = {
        "boosting_type": "rf",
        "objective": "multiclass" if config.n_classes > 2 else "binary",
        "num_class": config.n_classes if config.n_classes > 2 else 1,
        "n_estimators": N_ESTIMATORS,
        "bagging_freq": 1,
        "bagging_fraction": 0.8,
        # Roughly matches sklearn's max_features="sqrt"
        "feature_fraction": np.sqrt(config.n_features) / config.n_features,
        "n_jobs": N_JOBS,
        "random_state": RANDOM_STATE,
        "verbose": -1,
    }
    # Create DataFrame with proper categorical dtypes for LightGBM
    if categorical_indices:
        df_train = pd.DataFrame(X_train)
        df_test = pd.DataFrame(X_test)
        for idx in categorical_indices:
            df_train[idx] = df_train[idx].astype("category")
            df_test[idx] = df_test[idx].astype("category")
        X_lgb_train = df_train
        X_lgb_test = df_test
    else:
        X_lgb_train = X_train
        X_lgb_test = X_test
    clf = lgb.LGBMClassifier(**params)
    start = time.perf_counter()
    clf.fit(X_lgb_train, y_train, categorical_feature=categorical_indices if categorical_indices else "auto")
    fit_time = time.perf_counter() - start
    start = time.perf_counter()
    y_proba = clf.predict_proba(X_lgb_test)
    predict_time = time.perf_counter() - start
    # Compute log loss on test set
    score = log_loss(y_test, y_proba)
    return fit_time, predict_time, score

def run_benchmark(config: BenchmarkConfig):
    """Run a single benchmark configuration."""
    print(f"\n{'=' * 70}")
    print(f"Benchmark: {config.name}")
    print(f" {config.description}")
    print(f" Samples: {config.n_samples:,} | Features: {config.n_features} "
          f"| Classes: {config.n_classes} | Type: {config.feature_type}")
    if config.n_categorical_features > 0:
        print(f" Categorical: {config.n_categorical_features} features, "
              f"cardinality={config.categorical_cardinality}")
    print("-" * 70)
    # Generate data with train/test split
    print("Generating data...", end=" ", flush=True)
    X_train, X_test, y_train, y_test, categorical_indices = generate_data(config)
    print(f"Done. Train: {X_train.shape}, Test: {X_test.shape}")
    # Benchmark sklearn
    print("Running scikit-learn RandomForest...", end=" ", flush=True)
    sklearn_fit, sklearn_pred, sklearn_score = benchmark_sklearn(
        X_train, X_test, y_train, y_test, categorical_indices, config
    )
    print("Done.")
    # Benchmark LightGBM
    print("Running LightGBM RandomForest...", end=" ", flush=True)
    lgb_fit, lgb_pred, lgb_score = benchmark_lightgbm(
        X_train, X_test, y_train, y_test, categorical_indices, config
    )
    print("Done.")
    # Results
    print("\nResults:")
    print(f" {'Method':<20} {'Fit (s)':<10} {'Predict (s)':<12} {'Log Loss':<12}")
    print(f" {'-' * 54}")
    print(f" {'scikit-learn':<20} {sklearn_fit:<10.3f} {sklearn_pred:<12.3f} {sklearn_score:<12.4f}")
    print(f" {'LightGBM':<20} {lgb_fit:<10.3f} {lgb_pred:<12.3f} {lgb_score:<12.4f}")
    print(f" {'-' * 54}")
    # speedup > 1 means LightGBM was faster, < 1 means scikit-learn was faster
    speedup_fit = sklearn_fit / lgb_fit if lgb_fit > 0 else float("inf")
    speedup_pred = sklearn_pred / lgb_pred if lgb_pred > 0 else float("inf")
    winner_fit = "LightGBM" if speedup_fit > 1 else "sklearn"
    winner_pred = "LightGBM" if speedup_pred > 1 else "sklearn"
    # Lower log loss is better
    winner_score = "sklearn" if sklearn_score < lgb_score else "LightGBM"
    score_diff = abs(sklearn_score - lgb_score)
    print(f" Fit speedup: {speedup_fit:.2f}x ({winner_fit} faster)")
    print(f" Predict speedup: {speedup_pred:.2f}x ({winner_pred} faster)")
    print(f" Log Loss: {winner_score} better by {score_diff:.4f}")
    return {
        "name": config.name,
        "n_samples": config.n_samples,
        "n_features": config.n_features,
        "n_classes": config.n_classes,
        "feature_type": config.feature_type,
        "sklearn_fit": sklearn_fit,
        "sklearn_pred": sklearn_pred,
        "sklearn_score": sklearn_score,
        "lgb_fit": lgb_fit,
        "lgb_pred": lgb_pred,
        "lgb_score": lgb_score,
        "speedup_fit": speedup_fit,
        "speedup_pred": speedup_pred,
    }

def main():
    print("=" * 70)
    print("RandomForest Benchmark: scikit-learn vs LightGBM")
    print(f"Configuration: {N_ESTIMATORS} trees, using all available cores")
    print("=" * 70)
    # Define benchmark configurations
    configs = [
        # Small baseline - should run in a few seconds
        BenchmarkConfig(
            name="Small Numerical",
            n_samples=1_000,
            n_features=10,
            n_classes=2,
            feature_type="numerical",
            description="Quick baseline test with small numerical data",
        ),
        # Medium numerical - tests scaling
        BenchmarkConfig(
            name="Medium Numerical",
            n_samples=10_000,
            n_features=50,
            n_classes=2,
            feature_type="numerical",
            description="Medium-sized numerical dataset",
        ),
        # Large numerical - more substantial test
        BenchmarkConfig(
            name="Large Numerical",
            n_samples=100_000,
            n_features=50,
            n_classes=2,
            feature_type="numerical",
            description="Large numerical dataset",
        ),
        # Multi-class classification
        BenchmarkConfig(
            name="Multi-class (5 classes)",
            n_samples=50_000,
            n_features=30,
            n_classes=5,
            feature_type="numerical",
            description="Multi-class classification problem",
        ),
        # Multi-class with more classes
        BenchmarkConfig(
            name="Multi-class (10 classes)",
            n_samples=50_000,
            n_features=30,
            n_classes=10,
            feature_type="numerical",
            description="Multi-class classification with 10 classes",
        ),
        # Mixed features - moderate categorical cardinality
        BenchmarkConfig(
            name="Mixed Features",
            n_samples=50_000,
            n_features=40,
            n_classes=2,
            feature_type="mixed",
            n_categorical_features=20,
            categorical_cardinality=10,
            description="Mixed numerical and categorical features",
        ),
        # High-cardinality categorical (LightGBM strength)
        BenchmarkConfig(
            name="High-Cardinality Categorical",
            n_samples=100_000,
            n_features=30,
            n_classes=2,
            feature_type="mixed",
            n_categorical_features=15,
            categorical_cardinality=100,
            description="High-cardinality categorical features (LightGBM advantage)",
        ),
        # Pure categorical (LightGBM strength)
        BenchmarkConfig(
            name="Pure Categorical",
            n_samples=100_000,
            n_features=50,
            n_classes=2,
            feature_type="categorical",
            n_categorical_features=50,
            categorical_cardinality=50,
            description="All categorical features (LightGBM native support)",
        ),
        # LightGBM optimal case: Large dataset, many high-cardinality categoricals
        BenchmarkConfig(
            name="LightGBM Optimal",
            n_samples=500_000,
            n_features=100,
            n_classes=3,
            feature_type="mixed",
            n_categorical_features=60,
            categorical_cardinality=200,
            description="Large dataset with many high-cardinality categoricals (LightGBM's sweet spot)",
        ),
        # High-dimensional numerical
        BenchmarkConfig(
            name="High-Dimensional Numerical",
            n_samples=50_000,
            n_features=200,
            n_classes=2,
            feature_type="numerical",
            description="High-dimensional numerical data",
        ),
        # Very large samples
        BenchmarkConfig(
            name="Very Large Dataset",
            n_samples=500_000,
            n_features=50,
            n_classes=2,
            feature_type="numerical",
            description="Very large sample size test",
        ),
    ]
    # Run all benchmarks
    results = []
    for config in configs:
        result = run_benchmark(config)
        results.append(result)
    # Summary table - Timing
    print("\n" + "=" * 80)
    print("SUMMARY - TIMING")
    print("=" * 80)
    print(f"\n{'Benchmark':<30} {'sklearn Fit':<12} {'LightGBM Fit':<12} {'Speedup':<10}")
    print("-" * 70)
    for r in results:
        speedup_str = f"{r['speedup_fit']:.2f}x"
        winner = "✓ LGB" if r['speedup_fit'] > 1 else "✓ SKL"
        print(f"{r['name']:<30} {r['sklearn_fit']:<12.3f} {r['lgb_fit']:<12.3f} {speedup_str:<8} {winner}")
    print("\n" + "-" * 70)
    avg_speedup = np.mean([r['speedup_fit'] for r in results])
    print(f"Average fit speedup: {avg_speedup:.2f}x")
    # Find best cases for each (timing)
    best_for_lgb = max(results, key=lambda r: r['speedup_fit'])
    best_for_sklearn = min(results, key=lambda r: r['speedup_fit'])
    print(f"\nBest case for LightGBM: {best_for_lgb['name']} ({best_for_lgb['speedup_fit']:.2f}x faster)")
    print(f"Best case for sklearn: {best_for_sklearn['name']} ({1/best_for_sklearn['speedup_fit']:.2f}x faster)")
    # Summary table - Predictive Performance (Log Loss)
    print("\n" + "=" * 80)
    print("SUMMARY - PREDICTIVE PERFORMANCE (Log Loss, lower is better)")
    print("=" * 80)
    print(f"\n{'Benchmark':<30} {'sklearn':<12} {'LightGBM':<12} {'Diff':<10} {'Winner':<8}")
    print("-" * 80)
    sklearn_wins = 0
    lgb_wins = 0
    for r in results:
        diff = r['sklearn_score'] - r['lgb_score']
        if r['sklearn_score'] < r['lgb_score']:
            winner = "sklearn"
            sklearn_wins += 1
        elif r['lgb_score'] < r['sklearn_score']:
            winner = "LightGBM"
            lgb_wins += 1
        else:
            winner = "tie"
        print(f"{r['name']:<30} {r['sklearn_score']:<12.4f} {r['lgb_score']:<12.4f} {diff:+.4f} {winner}")
    print("\n" + "-" * 80)
    avg_sklearn_score = np.mean([r['sklearn_score'] for r in results])
    avg_lgb_score = np.mean([r['lgb_score'] for r in results])
    print(f"Average Log Loss - sklearn: {avg_sklearn_score:.4f}, LightGBM: {avg_lgb_score:.4f}")
    print(f"Wins - sklearn: {sklearn_wins}, LightGBM: {lgb_wins}")


if __name__ == "__main__":
    main()
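
Output of one run (timings depend on the machine):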
======================================================================
RandomForest Benchmark: scikit-learn vs LightGBM
Configuration: 100 trees, using all available cores
======================================================================
======================================================================
Benchmark: Small Numerical
Quick baseline test with small numerical data
Samples: 1,000 | Features: 10 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (800, 10), Test: (200, 10)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 0.144 0.012 0.2992
LightGBM 0.348 0.001 0.5132
------------------------------------------------------
Fit speedup: 0.41x (sklearn faster)
Predict speedup: 10.73x (LightGBM faster)
Log Loss: sklearn better by 0.2139
======================================================================
Benchmark: Medium Numerical
Medium-sized numerical dataset
Samples: 10,000 | Features: 50 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (8000, 50), Test: (2000, 50)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 0.465 0.014 0.3598
LightGBM 0.573 0.003 0.6178
------------------------------------------------------
Fit speedup: 0.81x (sklearn faster)
Predict speedup: 4.20x (LightGBM faster)
Log Loss: sklearn better by 0.2580
======================================================================
Benchmark: Large Numerical
Large numerical dataset
Samples: 100,000 | Features: 50 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (80000, 50), Test: (20000, 50)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 6.286 0.041 0.2667
LightGBM 0.815 0.019 0.6066
------------------------------------------------------
Fit speedup: 7.71x (LightGBM faster)
Predict speedup: 2.17x (LightGBM faster)
Log Loss: sklearn better by 0.3400
======================================================================
Benchmark: Multi-class (5 classes)
Multi-class classification problem
Samples: 50,000 | Features: 30 | Classes: 5 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (40000, 30), Test: (10000, 30)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 2.321 0.028 0.8599
LightGBM 3.022 0.042 1.4571
------------------------------------------------------
Fit speedup: 0.77x (sklearn faster)
Predict speedup: 0.66x (sklearn faster)
Log Loss: sklearn better by 0.5972
======================================================================
Benchmark: Multi-class (10 classes)
Multi-class classification with 10 classes
Samples: 50,000 | Features: 30 | Classes: 10 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (40000, 30), Test: (10000, 30)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 2.437 0.048 1.4693
LightGBM 5.832 0.079 2.1385
------------------------------------------------------
Fit speedup: 0.42x (sklearn faster)
Predict speedup: 0.60x (sklearn faster)
Log Loss: sklearn better by 0.6692
======================================================================
Benchmark: Mixed Features
Mixed numerical and categorical features
Samples: 50,000 | Features: 40 | Classes: 2 | Type: mixed
Categorical: 20 features, cardinality=10
----------------------------------------------------------------------
Generating data... Done. Train: (40000, 40), Test: (10000, 40)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 1.388 0.027 0.2516
LightGBM 0.680 0.015 0.5941
------------------------------------------------------
Fit speedup: 2.04x (LightGBM faster)
Predict speedup: 1.76x (LightGBM faster)
Log Loss: sklearn better by 0.3425
======================================================================
Benchmark: High-Cardinality Categorical
High-cardinality categorical features (LightGBM advantage)
Samples: 100,000 | Features: 30 | Classes: 2 | Type: mixed
Categorical: 15 features, cardinality=100
----------------------------------------------------------------------
Generating data... Done. Train: (80000, 30), Test: (20000, 30)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 3.098 0.041 0.1882
LightGBM 0.843 0.027 0.6055
------------------------------------------------------
Fit speedup: 3.67x (LightGBM faster)
Predict speedup: 1.50x (LightGBM faster)
Log Loss: sklearn better by 0.4174
======================================================================
Benchmark: Pure Categorical
All categorical features (LightGBM native support)
Samples: 100,000 | Features: 50 | Classes: 2 | Type: categorical
Categorical: 50 features, cardinality=50
----------------------------------------------------------------------
Generating data... Done. Train: (80000, 50), Test: (20000, 50)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 2.499 0.042 0.2766
LightGBM 0.897 0.027 0.6145
------------------------------------------------------
Fit speedup: 2.79x (LightGBM faster)
Predict speedup: 1.60x (LightGBM faster)
Log Loss: sklearn better by 0.3379
======================================================================
Benchmark: LightGBM Optimal
Large dataset with many high-cardinality categoricals (LightGBM's sweet spot)
Samples: 500,000 | Features: 100 | Classes: 3 | Type: mixed
Categorical: 60 features, cardinality=200
----------------------------------------------------------------------
Generating data... Done. Train: (400000, 100), Test: (100000, 100)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 56.499 0.384 0.3760
LightGBM 4.810 0.257 1.0001
------------------------------------------------------
Fit speedup: 11.75x (LightGBM faster)
Predict speedup: 1.50x (LightGBM faster)
Log Loss: sklearn better by 0.6242
======================================================================
Benchmark: High-Dimensional Numerical
High-dimensional numerical data
Samples: 50,000 | Features: 200 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (40000, 200), Test: (10000, 200)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 6.897 0.028 0.4134
LightGBM 0.895 0.017 0.6599
------------------------------------------------------
Fit speedup: 7.70x (LightGBM faster)
Predict speedup: 1.60x (LightGBM faster)
Log Loss: sklearn better by 0.2465
======================================================================
Benchmark: Very Large Dataset
Very large sample size test
Samples: 500,000 | Features: 50 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (400000, 50), Test: (100000, 50)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.
Results:
Method Fit (s) Predict (s) Log Loss
------------------------------------------------------
scikit-learn 46.691 0.313 0.2193
LightGBM 1.726 0.078 0.6171
------------------------------------------------------
Fit speedup: 27.05x (LightGBM faster)
Predict speedup: 4.02x (LightGBM faster)
Log Loss: sklearn better by 0.3978
================================================================================
SUMMARY - TIMING
================================================================================
Benchmark                      sklearn Fit  LightGBM Fit Speedup
----------------------------------------------------------------------
Small Numerical                0.144        0.348        0.41x    ✓ SKL
Medium Numerical               0.465        0.573        0.81x    ✓ SKL
Large Numerical                6.286        0.815        7.71x    ✓ LGB
Multi-class (5 classes)        2.321        3.022        0.77x    ✓ SKL
Multi-class (10 classes)       2.437        5.832        0.42x    ✓ SKL
Mixed Features                 1.388        0.680        2.04x    ✓ LGB
High-Cardinality Categorical   3.098        0.843        3.67x    ✓ LGB
Pure Categorical               2.499        0.897        2.79x    ✓ LGB
LightGBM Optimal               56.499       4.810        11.75x   ✓ LGB
High-Dimensional Numerical     6.897        0.895        7.70x    ✓ LGB
Very Large Dataset             46.691       1.726        27.05x   ✓ LGB
----------------------------------------------------------------------
Average fit speedup: 5.92x
Best case for LightGBM: Very Large Dataset (27.05x faster)
Best case for sklearn: Small Numerical (2.42x faster)
================================================================================
SUMMARY - PREDICTIVE PERFORMANCE (Log Loss, lower is better)
================================================================================
Benchmark                      sklearn      LightGBM     Diff       Winner
--------------------------------------------------------------------------------
Small Numerical                0.2992       0.5132       -0.2139 sklearn
Medium Numerical               0.3598       0.6178       -0.2580 sklearn
Large Numerical                0.2667       0.6066       -0.3400 sklearn
Multi-class (5 classes)        0.8599       1.4571       -0.5972 sklearn
Multi-class (10 classes)       1.4693       2.1385       -0.6692 sklearn
Mixed Features                 0.2516       0.5941       -0.3425 sklearn
High-Cardinality Categorical   0.1882       0.6055       -0.4174 sklearn
Pure Categorical               0.2766       0.6145       -0.3379 sklearn
LightGBM Optimal               0.3760       1.0001       -0.6242 sklearn
High-Dimensional Numerical     0.4134       0.6599       -0.2465 sklearn
Very Large Dataset             0.2193       0.6171       -0.3978 sklearn
--------------------------------------------------------------------------------
Average Log Loss - sklearn: 0.4527, LightGBM: 0.8568
Wins - sklearn: 11, LightGBM: 0