Created
April 13, 2026 15:00
-
-
Save betatim/e88fa0747da9ec4465eed4a395a6a66f to your computer and use it in GitHub Desktop.
GridSearchCV on a (random) pipeline. The main takeaway is that fitting the pipeline takes seconds.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Pipeline(StandardScaler, PCA, RandomForest) with GridSearchCV | |
| ============================================================= | |
| GridSearchCV over an all-proxy Pipeline on the full Forest Cover Type | |
| dataset (581K samples, 54 features, 7 classes). | |
| Pipeline: StandardScaler -> PCA -> RandomForestClassifier | |
| All three steps are cuml.accel proxies, so the GridSearchCV patch | |
| moves data to GPU once and all CV splits run on device. | |
| The parameter grid searches over: | |
| - pca__n_components: [20, 40] | |
| - clf__n_estimators: [100, 200, 500] | |
| - clf__max_depth: [10, 20, None] | |
| Data: Forest Cover Type (full, 581K samples). | |
| Usage: | |
| python gridsearch_pipeline_rf.py # CPU | |
| python -m cuml.accel gridsearch_pipeline_rf.py # GPU | |
| Accelerated estimators: | |
| StandardScaler, PCA, RandomForestClassifier | |
| """ | |
| import time | |
| import warnings | |
| from collections import OrderedDict | |
| import numpy as np | |
| from sklearn.datasets import fetch_covtype | |
| from sklearn.decomposition import PCA | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.model_selection import GridSearchCV, train_test_split | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import StandardScaler | |
warnings.filterwarnings("ignore", category=FutureWarning)

# ---------------------------------------------------------------------------
# Timing helpers
# ---------------------------------------------------------------------------
# Phase name -> elapsed seconds, in insertion (execution) order.
_timings = OrderedDict()


class timed:
    """Context manager that records the wall-clock time of its body.

    On exit the elapsed seconds are stored in ``_timings`` under ``name``,
    so ``print_timings()`` can later report every phase in run order.
    """

    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self._t0 = time.perf_counter()
        return self

    def __exit__(self, *exc):
        # Record even when the body raised, mirroring a try/finally timer.
        _timings[self.name] = time.perf_counter() - self._t0
def _accel_mode():
    """Return True when the cuml.accel GPU proxy layer is active.

    Any failure (cuml not installed, probe error) is treated as a plain
    CPU run.
    """
    try:
        import cuml.accel
        return cuml.accel.enabled()
    except Exception:
        # cuml missing or the accel probe failed -> stock sklearn on CPU.
        return False
def _n_jobs():
    """n_jobs=1 is required for GPU optimization; use all cores on CPU."""
    if _accel_mode():
        return 1
    return 10
def print_header(n_train, n_test, n_features, n_classes):
    """Print the run-configuration banner (mode, parallelism, data sizes)."""
    bar = "=" * 70
    mode = "GPU (cuml.accel)" if _accel_mode() else "CPU (sklearn)"
    print(bar)
    print(" Pipeline(Scaler, PCA, RF) with GridSearchCV")
    print(f" Mode: {mode}")
    print(f" n_jobs: {_n_jobs()}")
    print(f" Train: {n_train:,}")
    print(f" Test: {n_test:,}")
    print(f" Features: {n_features}")
    print(f" Classes: {n_classes}")
    print(bar)
def print_timings():
    """Print every phase recorded in ``_timings`` plus the grand total."""
    print()
    print("=" * 70)
    print(" TIMING SUMMARY")
    print("-" * 70)
    total = 0.0
    for name, elapsed in _timings.items():
        total += elapsed
        print(f" {name:40s} {elapsed:>8.2f}s")
    print("-" * 70)
    print(f" {'TOTAL':40s} {total:>8.2f}s")
    print("=" * 70)
# ---------------------------------------------------------------------------
# Main workflow
# ---------------------------------------------------------------------------
# Number of cross-validation folds used by GridSearchCV below.
N_FOLDS = 3
def main():
    """Run the full benchmark.

    Steps: (1) load the Forest Cover Type dataset, (2) build the
    StandardScaler -> PCA -> RandomForestClassifier pipeline and its
    parameter grid, (3) run GridSearchCV, (4) score the held-out test
    set, then print a ranked results table and the timing summary.
    """
    # 1. Load data
    print("\n[1/4] Loading Forest Cover Type dataset (full) ...")
    with timed("Load data"):
        X, y = fetch_covtype(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y,
        )
    n_classes = len(np.unique(y_train))
    print(f" Total: {X.shape[0]:,}")
    print(f" Train: {X_train.shape[0]:,}")
    print(f" Test: {X_test.shape[0]:,}")
    print(f" Features: {X.shape[1]}")
    print_header(X_train.shape[0], X_test.shape[0], X.shape[1], n_classes)

    # 2. Pipeline and parameter grid
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("clf", RandomForestClassifier(random_state=42)),
    ])
    param_grid = {
        "pca__n_components": [20, 40],
        "clf__n_estimators": [100, 200, 500],
        "clf__max_depth": [10, 20, None],
    }
    # Derive the combination count from the grid itself (was hard-coded
    # 2 * 3 * 3) so the printed totals stay correct if the grid is edited.
    n_combos = 1
    for values in param_grid.values():
        n_combos *= len(values)
    total_fits = n_combos * N_FOLDS
    print("\n[2/4] Pipeline: StandardScaler -> PCA -> RandomForestClassifier")
    print("\n Parameter grid:")
    # Iterate the grid instead of hard-coding each key, so additions to
    # param_grid are reported automatically.
    for key, values in param_grid.items():
        print(f" {key}: {values}")
    print(f" Total: {n_combos} combos x {N_FOLDS} folds = {total_fits} fits")

    # 3. Run GridSearchCV
    print("\n[3/4] Running GridSearchCV ...")
    with timed("GridSearchCV"):
        search = GridSearchCV(
            pipe,
            param_grid,
            cv=N_FOLDS,
            scoring="accuracy",
            n_jobs=_n_jobs(),  # 1 under cuml.accel (GPU), 10 on CPU
            refit=True,
        )
        search.fit(X_train, y_train)
    print("\n Best parameters:")
    for param, val in sorted(search.best_params_.items()):
        print(f" {param}: {val}")
    print(f" Best CV accuracy: {search.best_score_:.4f}")

    # 4. Test evaluation
    print("\n[4/4] Test set evaluation ...")
    with timed("Test evaluation"):
        test_acc = search.score(X_test, y_test)
    print(f" Test accuracy: {test_acc:.4f}")

    # Results table, ranked by CV accuracy (rank 1 first).
    cv_results = search.cv_results_
    print("\n" + "-" * 70)
    print(" ALL GRIDSEARCHCV RESULTS (ranked by accuracy)")
    print("-" * 70)
    ranked = np.argsort(cv_results["rank_test_score"])
    for rank, idx in enumerate(ranked, 1):
        p = cv_results["params"][idx]
        score = cv_results["mean_test_score"][idx]
        std = cv_results["std_test_score"][idx]
        fit_time = cv_results["mean_fit_time"][idx]
        depth = str(p["clf__max_depth"])
        print(f" #{rank:<2d} pca={p['pca__n_components']:<3d}"
              f" n_est={p['clf__n_estimators']:<4d}"
              f" depth={depth:<5s}"
              f" acc={score:.4f}(+/-{std:.4f})"
              f" fit={fit_time:.1f}s")
    print_timings()


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ALL GRIDSEARCHCV RESULTS (ranked by accuracy) | |
| ---------------------------------------------------------------------- | |
| #1 pca=20 n_est=500 depth=None acc=0.9356(+/-0.0011) fit=13.4s | |
| #2 pca=40 n_est=500 depth=None acc=0.9351(+/-0.0006) fit=14.5s | |
| #3 pca=20 n_est=200 depth=None acc=0.9351(+/-0.0011) fit=5.5s | |
| #4 pca=40 n_est=200 depth=None acc=0.9349(+/-0.0005) fit=5.7s | |
| #5 pca=20 n_est=100 depth=None acc=0.9346(+/-0.0011) fit=2.7s | |
| #6 pca=40 n_est=100 depth=None acc=0.9340(+/-0.0007) fit=3.0s | |
| #7 pca=40 n_est=500 depth=20 acc=0.9098(+/-0.0009) fit=10.9s | |
| #8 pca=40 n_est=200 depth=20 acc=0.9095(+/-0.0011) fit=4.5s | |
| #9 pca=40 n_est=100 depth=20 acc=0.9091(+/-0.0011) fit=2.4s | |
| #10 pca=20 n_est=500 depth=20 acc=0.9071(+/-0.0015) fit=10.4s | |
| #11 pca=20 n_est=200 depth=20 acc=0.9068(+/-0.0018) fit=4.2s | |
| #12 pca=20 n_est=100 depth=20 acc=0.9062(+/-0.0015) fit=2.3s | |
| #13 pca=40 n_est=100 depth=10 acc=0.7555(+/-0.0013) fit=1.8s | |
| #14 pca=40 n_est=200 depth=10 acc=0.7542(+/-0.0010) fit=3.4s | |
| #15 pca=40 n_est=500 depth=10 acc=0.7536(+/-0.0010) fit=8.0s | |
| #16 pca=20 n_est=500 depth=10 acc=0.7488(+/-0.0014) fit=8.0s | |
| #17 pca=20 n_est=200 depth=10 acc=0.7481(+/-0.0018) fit=3.3s | |
| #18 pca=20 n_est=100 depth=10 acc=0.7468(+/-0.0025) fit=1.9s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Are these results from the implementation in rapidsai/cuml#7843? If so, which commit?