Comparing scikit-learn's random forest with LightGBM's random-forest implementation.
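The core of the setup: LightGBM is switched into its random-forest mode ("rf" boosting) and configured to approximate scikit-learn's defaults. A minimal, self-contained sketch (toy data only; the parameter choices mirror the full script below):

import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier

# Toy data, purely illustrative
rng = np.random.default_rng(0)
X = rng.standard_normal((1_000, 10))
y = (X[:, :3].sum(axis=1) > 0).astype(int)

sk_rf = RandomForestClassifier(
    n_estimators=100, max_features="sqrt", n_jobs=-1, random_state=42
)
lgb_rf = lgb.LGBMClassifier(
    boosting_type="rf",                  # random-forest mode, not boosting
    n_estimators=100,
    bagging_freq=1,                      # rf mode needs row subsampling enabled
    bagging_fraction=0.8,
    feature_fraction=np.sqrt(10) / 10,   # roughly max_features="sqrt"
    n_jobs=-1,
    random_state=42,
    verbose=-1,
)
sk_rf.fit(X, y)
lgb_rf.fit(X, y)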
| """ | |
| Benchmark: scikit-learn RandomForest vs LightGBM RandomForest | |
| Compares performance across: | |
| - Number of samples (1K, 10K, 100K, 500K) | |
| - Number of features (10, 50, 200) | |
| - Feature types (numerical, categorical, mixed) | |
| - Number of classes (2, 5, 10) | |
| Includes cases optimized for LightGBM's strengths: | |
| - Native categorical handling (no one-hot encoding needed) | |
| - Large datasets with high-cardinality categorical features | |
| Evaluation metric: Log Loss (cross-entropy) on held-out test data | |
| - Lower is better | |
| - Measures both prediction quality and probability calibration | |
| """ | |
import time
import warnings
from dataclasses import dataclass
from typing import Literal

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

try:
    import lightgbm as lgb
except ImportError:
    raise ImportError("Please install lightgbm: pip install lightgbm")

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

N_JOBS = -1  # Use all cores
N_ESTIMATORS = 100
RANDOM_STATE = 42
@dataclass
class BenchmarkConfig:
    """Configuration for a single benchmark run."""

    name: str
    n_samples: int
    n_features: int
    n_classes: int
    feature_type: Literal["numerical", "categorical", "mixed"]
    n_categorical_features: int = 0
    categorical_cardinality: int = 10
    description: str = ""
def generate_data(config: BenchmarkConfig):
    """Generate synthetic data based on configuration with train/test split."""
    rng = np.random.default_rng(RANDOM_STATE)
    n_numerical = config.n_features - config.n_categorical_features
    n_categorical = config.n_categorical_features

    # Generate numerical features
    if n_numerical > 0:
        X_num = rng.standard_normal((config.n_samples, n_numerical))
    else:
        X_num = np.empty((config.n_samples, 0))

    # Generate categorical features (as integers)
    if n_categorical > 0:
        X_cat = rng.integers(
            0, config.categorical_cardinality, size=(config.n_samples, n_categorical)
        )
    else:
        X_cat = np.empty((config.n_samples, 0), dtype=np.int64)

    # Combine features
    X = np.hstack([X_num, X_cat])

    # Generate the target from the feature values (to create some signal).
    # Shuffle the column order so every feature has an equal chance of contributing.
    n_signal_features = min(5, config.n_features)
    shuffled_col_indices = rng.permutation(config.n_features)
    signal_col_indices = shuffled_col_indices[:n_signal_features]
    signal = X[:, signal_col_indices].sum(axis=1)
    y = pd.qcut(signal, q=config.n_classes, labels=False).astype(np.int32)
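    # e.g. pd.qcut([0.1, 0.5, 0.9, 1.3], q=2, labels=False) yields [0, 0, 1, 1]:
    # the classes are (approximately) balanced quantile bins of the signal.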
    # Track which columns are categorical
    categorical_indices = list(range(n_numerical, config.n_features))

    # Train/test split (80/20)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    return X_train, X_test, y_train, y_test, categorical_indices
def benchmark_sklearn(X_train, X_test, y_train, y_test, categorical_indices, config: BenchmarkConfig):
    """Benchmark scikit-learn RandomForestClassifier."""
    # scikit-learn has no native categorical support, so for a fair comparison
    # the categorical columns (already ordinal encoded) are used as-is and
    # treated as numeric.
    clf = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        n_jobs=N_JOBS,
        random_state=RANDOM_STATE,
        max_features="sqrt",
    )

    start = time.perf_counter()
    clf.fit(X_train, y_train)
    fit_time = time.perf_counter() - start

    start = time.perf_counter()
    y_proba = clf.predict_proba(X_test)
    predict_time = time.perf_counter() - start

    # Compute log loss on the test set
    score = log_loss(y_test, y_proba)
    return fit_time, predict_time, score
def benchmark_lightgbm(X_train, X_test, y_train, y_test, categorical_indices, config: BenchmarkConfig):
    """Benchmark LightGBM in RandomForest mode."""
    # LightGBM RF parameters
    params = {
        "boosting_type": "rf",
        "objective": "multiclass" if config.n_classes > 2 else "binary",
        "num_class": config.n_classes if config.n_classes > 2 else 1,
        "n_estimators": N_ESTIMATORS,
        "bagging_freq": 1,
        "bagging_fraction": 0.8,
        "feature_fraction": np.sqrt(config.n_features) / config.n_features,
        "n_jobs": N_JOBS,
        "random_state": RANDOM_STATE,
        "verbose": -1,
    }
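    # NB: LightGBM's "rf" boosting mode requires row subsampling to be enabled
    # (bagging_freq > 0 and bagging_fraction < 1), hence the two bagging
    # parameters above. feature_fraction = sqrt(n)/n approximates scikit-learn's
    # max_features="sqrt", but LightGBM samples features per tree rather than
    # per split, so the two setups are close but not identical.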
    # Create a DataFrame with proper categorical dtypes for LightGBM
    if categorical_indices:
        df_train = pd.DataFrame(X_train)
        df_test = pd.DataFrame(X_test)
        for idx in categorical_indices:
            df_train[idx] = df_train[idx].astype("category")
            df_test[idx] = df_test[idx].astype("category")
        X_lgb_train = df_train
        X_lgb_test = df_test
    else:
        X_lgb_train = X_train
        X_lgb_test = X_test

    clf = lgb.LGBMClassifier(**params)

    start = time.perf_counter()
    clf.fit(X_lgb_train, y_train, categorical_feature=categorical_indices if categorical_indices else "auto")
    fit_time = time.perf_counter() - start

    start = time.perf_counter()
    y_proba = clf.predict_proba(X_lgb_test)
    predict_time = time.perf_counter() - start

    # Compute log loss on the test set
    score = log_loss(y_test, y_proba)
    return fit_time, predict_time, score
def run_benchmark(config: BenchmarkConfig):
    """Run a single benchmark configuration."""
    print(f"\n{'=' * 70}")
    print(f"Benchmark: {config.name}")
    print(f"  {config.description}")
    print(f"  Samples: {config.n_samples:,} | Features: {config.n_features} "
          f"| Classes: {config.n_classes} | Type: {config.feature_type}")
    if config.n_categorical_features > 0:
        print(f"  Categorical: {config.n_categorical_features} features, "
              f"cardinality={config.categorical_cardinality}")
    print("-" * 70)

    # Generate data with train/test split
    print("Generating data...", end=" ", flush=True)
    X_train, X_test, y_train, y_test, categorical_indices = generate_data(config)
    print(f"Done. Train: {X_train.shape}, Test: {X_test.shape}")

    # Benchmark scikit-learn
    print("Running scikit-learn RandomForest...", end=" ", flush=True)
    sklearn_fit, sklearn_pred, sklearn_score = benchmark_sklearn(
        X_train, X_test, y_train, y_test, categorical_indices, config
    )
    print("Done.")

    # Benchmark LightGBM
    print("Running LightGBM RandomForest...", end=" ", flush=True)
    lgb_fit, lgb_pred, lgb_score = benchmark_lightgbm(
        X_train, X_test, y_train, y_test, categorical_indices, config
    )
    print("Done.")

    # Results
    print("\nResults:")
    print(f"  {'Method':<20} {'Fit (s)':<10} {'Predict (s)':<12} {'Log Loss':<12}")
    print(f"  {'-' * 54}")
    print(f"  {'scikit-learn':<20} {sklearn_fit:<10.3f} {sklearn_pred:<12.3f} {sklearn_score:<12.4f}")
    print(f"  {'LightGBM':<20} {lgb_fit:<10.3f} {lgb_pred:<12.3f} {lgb_score:<12.4f}")
    print(f"  {'-' * 54}")

    speedup_fit = sklearn_fit / lgb_fit if lgb_fit > 0 else float("inf")
    speedup_pred = sklearn_pred / lgb_pred if lgb_pred > 0 else float("inf")
    winner_fit = "LightGBM" if speedup_fit > 1 else "sklearn"
    winner_pred = "LightGBM" if speedup_pred > 1 else "sklearn"
    # Lower log loss is better
    winner_score = "sklearn" if sklearn_score < lgb_score else "LightGBM"
    score_diff = abs(sklearn_score - lgb_score)

    print(f"  Fit speedup: {speedup_fit:.2f}x ({winner_fit} faster)")
    print(f"  Predict speedup: {speedup_pred:.2f}x ({winner_pred} faster)")
    print(f"  Log Loss: {winner_score} better by {score_diff:.4f}")

    return {
        "name": config.name,
        "n_samples": config.n_samples,
        "n_features": config.n_features,
        "n_classes": config.n_classes,
        "feature_type": config.feature_type,
        "sklearn_fit": sklearn_fit,
        "sklearn_pred": sklearn_pred,
        "sklearn_score": sklearn_score,
        "lgb_fit": lgb_fit,
        "lgb_pred": lgb_pred,
        "lgb_score": lgb_score,
        "speedup_fit": speedup_fit,
        "speedup_pred": speedup_pred,
    }
def main():
    print("=" * 70)
    print("RandomForest Benchmark: scikit-learn vs LightGBM")
    print(f"Configuration: {N_ESTIMATORS} trees, using all available cores")
    print("=" * 70)

    # Define benchmark configurations
    configs = [
        # Small baseline - should run in a few seconds
        BenchmarkConfig(
            name="Small Numerical",
            n_samples=1_000,
            n_features=10,
            n_classes=2,
            feature_type="numerical",
            description="Quick baseline test with small numerical data",
        ),
        # Medium numerical - tests scaling
        BenchmarkConfig(
            name="Medium Numerical",
            n_samples=10_000,
            n_features=50,
            n_classes=2,
            feature_type="numerical",
            description="Medium-sized numerical dataset",
        ),
        # Large numerical - more substantial test
        BenchmarkConfig(
            name="Large Numerical",
            n_samples=100_000,
            n_features=50,
            n_classes=2,
            feature_type="numerical",
            description="Large numerical dataset",
        ),
        # Multi-class classification
        BenchmarkConfig(
            name="Multi-class (5 classes)",
            n_samples=50_000,
            n_features=30,
            n_classes=5,
            feature_type="numerical",
            description="Multi-class classification problem",
        ),
        # Multi-class with more classes
        BenchmarkConfig(
            name="Multi-class (10 classes)",
            n_samples=50_000,
            n_features=30,
            n_classes=10,
            feature_type="numerical",
            description="Multi-class classification with 10 classes",
        ),
        # Mixed features - moderate categorical cardinality
        BenchmarkConfig(
            name="Mixed Features",
            n_samples=50_000,
            n_features=40,
            n_classes=2,
            feature_type="mixed",
            n_categorical_features=20,
            categorical_cardinality=10,
            description="Mixed numerical and categorical features",
        ),
        # High-cardinality categorical (LightGBM strength)
        BenchmarkConfig(
            name="High-Cardinality Categorical",
            n_samples=100_000,
            n_features=30,
            n_classes=2,
            feature_type="mixed",
            n_categorical_features=15,
            categorical_cardinality=100,
            description="High-cardinality categorical features (LightGBM advantage)",
        ),
        # Pure categorical (LightGBM strength)
        BenchmarkConfig(
            name="Pure Categorical",
            n_samples=100_000,
            n_features=50,
            n_classes=2,
            feature_type="categorical",
            n_categorical_features=50,
            categorical_cardinality=50,
            description="All categorical features (LightGBM native support)",
        ),
        # LightGBM optimal case: large dataset, many high-cardinality categoricals
        BenchmarkConfig(
            name="LightGBM Optimal",
            n_samples=500_000,
            n_features=100,
            n_classes=3,
            feature_type="mixed",
            n_categorical_features=60,
            categorical_cardinality=200,
            description="Large dataset with many high-cardinality categoricals (LightGBM's sweet spot)",
        ),
        # High-dimensional numerical
        BenchmarkConfig(
            name="High-Dimensional Numerical",
            n_samples=50_000,
            n_features=200,
            n_classes=2,
            feature_type="numerical",
            description="High-dimensional numerical data",
        ),
        # Very large samples
        BenchmarkConfig(
            name="Very Large Dataset",
            n_samples=500_000,
            n_features=50,
            n_classes=2,
            feature_type="numerical",
            description="Very large sample size test",
        ),
    ]
    # Run all benchmarks
    results = []
    for config in configs:
        result = run_benchmark(config)
        results.append(result)

    # Summary table - timing
    print("\n" + "=" * 80)
    print("SUMMARY - TIMING")
    print("=" * 80)
    print(f"\n{'Benchmark':<30} {'sklearn Fit':<12} {'LightGBM Fit':<12} {'Speedup':<10}")
    print("-" * 70)
    for r in results:
        speedup_str = f"{r['speedup_fit']:.2f}x"
        winner = "✓ LGB" if r['speedup_fit'] > 1 else "✓ SKL"
        print(f"{r['name']:<30} {r['sklearn_fit']:<12.3f} {r['lgb_fit']:<12.3f} {speedup_str:<8} {winner}")
    print("\n" + "-" * 70)
    avg_speedup = np.mean([r['speedup_fit'] for r in results])
    print(f"Average fit speedup: {avg_speedup:.2f}x")

    # Find the best case for each library (timing)
    best_for_lgb = max(results, key=lambda r: r['speedup_fit'])
    best_for_sklearn = min(results, key=lambda r: r['speedup_fit'])
    print(f"\nBest case for LightGBM: {best_for_lgb['name']} ({best_for_lgb['speedup_fit']:.2f}x faster)")
    print(f"Best case for sklearn: {best_for_sklearn['name']} ({1 / best_for_sklearn['speedup_fit']:.2f}x faster)")

    # Summary table - predictive performance (log loss)
    print("\n" + "=" * 80)
    print("SUMMARY - PREDICTIVE PERFORMANCE (Log Loss, lower is better)")
    print("=" * 80)
    print(f"\n{'Benchmark':<30} {'sklearn':<12} {'LightGBM':<12} {'Diff':<10} {'Winner':<8}")
    print("-" * 80)
    sklearn_wins = 0
    lgb_wins = 0
    for r in results:
        diff = r['sklearn_score'] - r['lgb_score']
        if r['sklearn_score'] < r['lgb_score']:
            winner = "sklearn"
            sklearn_wins += 1
        elif r['lgb_score'] < r['sklearn_score']:
            winner = "LightGBM"
            lgb_wins += 1
        else:
            winner = "tie"
        print(f"{r['name']:<30} {r['sklearn_score']:<12.4f} {r['lgb_score']:<12.4f} {diff:+.4f} {winner}")
    print("\n" + "-" * 80)
    avg_sklearn_score = np.mean([r['sklearn_score'] for r in results])
    avg_lgb_score = np.mean([r['lgb_score'] for r in results])
    print(f"Average Log Loss - sklearn: {avg_sklearn_score:.4f}, LightGBM: {avg_lgb_score:.4f}")
    print(f"Wins - sklearn: {sklearn_wins}, LightGBM: {lgb_wins}")


if __name__ == "__main__":
    main()
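Running the script (filename assumed, e.g. python benchmark_rf.py) produced the output below. Absolute timings depend on hardware and library versions; the relative numbers are what matter.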
======================================================================
RandomForest Benchmark: scikit-learn vs LightGBM
Configuration: 100 trees, using all available cores
======================================================================

======================================================================
Benchmark: Small Numerical
  Quick baseline test with small numerical data
  Samples: 1,000 | Features: 10 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (800, 10), Test: (200, 10)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         0.144      0.012        0.2992
  LightGBM             0.348      0.001        0.5132
  ------------------------------------------------------
  Fit speedup: 0.41x (sklearn faster)
  Predict speedup: 10.73x (LightGBM faster)
  Log Loss: sklearn better by 0.2139

======================================================================
Benchmark: Medium Numerical
  Medium-sized numerical dataset
  Samples: 10,000 | Features: 50 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (8000, 50), Test: (2000, 50)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         0.465      0.014        0.3598
  LightGBM             0.573      0.003        0.6178
  ------------------------------------------------------
  Fit speedup: 0.81x (sklearn faster)
  Predict speedup: 4.20x (LightGBM faster)
  Log Loss: sklearn better by 0.2580

======================================================================
Benchmark: Large Numerical
  Large numerical dataset
  Samples: 100,000 | Features: 50 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (80000, 50), Test: (20000, 50)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         6.286      0.041        0.2667
  LightGBM             0.815      0.019        0.6066
  ------------------------------------------------------
  Fit speedup: 7.71x (LightGBM faster)
  Predict speedup: 2.17x (LightGBM faster)
  Log Loss: sklearn better by 0.3400
======================================================================
Benchmark: Multi-class (5 classes)
  Multi-class classification problem
  Samples: 50,000 | Features: 30 | Classes: 5 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (40000, 30), Test: (10000, 30)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         2.321      0.028        0.8599
  LightGBM             3.022      0.042        1.4571
  ------------------------------------------------------
  Fit speedup: 0.77x (sklearn faster)
  Predict speedup: 0.66x (sklearn faster)
  Log Loss: sklearn better by 0.5972

======================================================================
Benchmark: Multi-class (10 classes)
  Multi-class classification with 10 classes
  Samples: 50,000 | Features: 30 | Classes: 10 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (40000, 30), Test: (10000, 30)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         2.437      0.048        1.4693
  LightGBM             5.832      0.079        2.1385
  ------------------------------------------------------
  Fit speedup: 0.42x (sklearn faster)
  Predict speedup: 0.60x (sklearn faster)
  Log Loss: sklearn better by 0.6692
======================================================================
Benchmark: Mixed Features
  Mixed numerical and categorical features
  Samples: 50,000 | Features: 40 | Classes: 2 | Type: mixed
  Categorical: 20 features, cardinality=10
----------------------------------------------------------------------
Generating data... Done. Train: (40000, 40), Test: (10000, 40)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         1.388      0.027        0.2516
  LightGBM             0.680      0.015        0.5941
  ------------------------------------------------------
  Fit speedup: 2.04x (LightGBM faster)
  Predict speedup: 1.76x (LightGBM faster)
  Log Loss: sklearn better by 0.3425

======================================================================
Benchmark: High-Cardinality Categorical
  High-cardinality categorical features (LightGBM advantage)
  Samples: 100,000 | Features: 30 | Classes: 2 | Type: mixed
  Categorical: 15 features, cardinality=100
----------------------------------------------------------------------
Generating data... Done. Train: (80000, 30), Test: (20000, 30)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         3.098      0.041        0.1882
  LightGBM             0.843      0.027        0.6055
  ------------------------------------------------------
  Fit speedup: 3.67x (LightGBM faster)
  Predict speedup: 1.50x (LightGBM faster)
  Log Loss: sklearn better by 0.4174
======================================================================
Benchmark: Pure Categorical
  All categorical features (LightGBM native support)
  Samples: 100,000 | Features: 50 | Classes: 2 | Type: categorical
  Categorical: 50 features, cardinality=50
----------------------------------------------------------------------
Generating data... Done. Train: (80000, 50), Test: (20000, 50)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         2.499      0.042        0.2766
  LightGBM             0.897      0.027        0.6145
  ------------------------------------------------------
  Fit speedup: 2.79x (LightGBM faster)
  Predict speedup: 1.60x (LightGBM faster)
  Log Loss: sklearn better by 0.3379

======================================================================
Benchmark: LightGBM Optimal
  Large dataset with many high-cardinality categoricals (LightGBM's sweet spot)
  Samples: 500,000 | Features: 100 | Classes: 3 | Type: mixed
  Categorical: 60 features, cardinality=200
----------------------------------------------------------------------
Generating data... Done. Train: (400000, 100), Test: (100000, 100)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         56.499     0.384        0.3760
  LightGBM             4.810      0.257        1.0001
  ------------------------------------------------------
  Fit speedup: 11.75x (LightGBM faster)
  Predict speedup: 1.50x (LightGBM faster)
  Log Loss: sklearn better by 0.6242
======================================================================
Benchmark: High-Dimensional Numerical
  High-dimensional numerical data
  Samples: 50,000 | Features: 200 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (40000, 200), Test: (10000, 200)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         6.897      0.028        0.4134
  LightGBM             0.895      0.017        0.6599
  ------------------------------------------------------
  Fit speedup: 7.70x (LightGBM faster)
  Predict speedup: 1.60x (LightGBM faster)
  Log Loss: sklearn better by 0.2465

======================================================================
Benchmark: Very Large Dataset
  Very large sample size test
  Samples: 500,000 | Features: 50 | Classes: 2 | Type: numerical
----------------------------------------------------------------------
Generating data... Done. Train: (400000, 50), Test: (100000, 50)
Running scikit-learn RandomForest... Done.
Running LightGBM RandomForest... Done.

Results:
  Method               Fit (s)    Predict (s)  Log Loss
  ------------------------------------------------------
  scikit-learn         46.691     0.313        0.2193
  LightGBM             1.726      0.078        0.6171
  ------------------------------------------------------
  Fit speedup: 27.05x (LightGBM faster)
  Predict speedup: 4.02x (LightGBM faster)
  Log Loss: sklearn better by 0.3978
================================================================================
SUMMARY - TIMING
================================================================================

Benchmark                      sklearn Fit  LightGBM Fit Speedup
----------------------------------------------------------------------
Small Numerical                0.144        0.348        0.41x    ✓ SKL
Medium Numerical               0.465        0.573        0.81x    ✓ SKL
Large Numerical                6.286        0.815        7.71x    ✓ LGB
Multi-class (5 classes)        2.321        3.022        0.77x    ✓ SKL
Multi-class (10 classes)       2.437        5.832        0.42x    ✓ SKL
Mixed Features                 1.388        0.680        2.04x    ✓ LGB
High-Cardinality Categorical   3.098        0.843        3.67x    ✓ LGB
Pure Categorical               2.499        0.897        2.79x    ✓ LGB
LightGBM Optimal               56.499       4.810        11.75x   ✓ LGB
High-Dimensional Numerical     6.897        0.895        7.70x    ✓ LGB
Very Large Dataset             46.691       1.726        27.05x   ✓ LGB

----------------------------------------------------------------------
Average fit speedup: 5.92x

Best case for LightGBM: Very Large Dataset (27.05x faster)
Best case for sklearn: Small Numerical (2.42x faster)

================================================================================
SUMMARY - PREDICTIVE PERFORMANCE (Log Loss, lower is better)
================================================================================

Benchmark                      sklearn      LightGBM     Diff       Winner
--------------------------------------------------------------------------------
Small Numerical                0.2992       0.5132       -0.2139 sklearn
Medium Numerical               0.3598       0.6178       -0.2580 sklearn
Large Numerical                0.2667       0.6066       -0.3400 sklearn
Multi-class (5 classes)        0.8599       1.4571       -0.5972 sklearn
Multi-class (10 classes)       1.4693       2.1385       -0.6692 sklearn
Mixed Features                 0.2516       0.5941       -0.3425 sklearn
High-Cardinality Categorical   0.1882       0.6055       -0.4174 sklearn
Pure Categorical               0.2766       0.6145       -0.3379 sklearn
LightGBM Optimal               0.3760       1.0001       -0.6242 sklearn
High-Dimensional Numerical     0.4134       0.6599       -0.2465 sklearn
Very Large Dataset             0.2193       0.6171       -0.3978 sklearn

--------------------------------------------------------------------------------
Average Log Loss - sklearn: 0.4527, LightGBM: 0.8568
Wins - sklearn: 11, LightGBM: 0