Skip to content

Instantly share code, notes, and snippets.

@betatim
Created April 13, 2026 15:00
Show Gist options
  • Select an option

  • Save betatim/e88fa0747da9ec4465eed4a395a6a66f to your computer and use it in GitHub Desktop.

Select an option

Save betatim/e88fa0747da9ec4465eed4a395a6a66f to your computer and use it in GitHub Desktop.
GridSearchCV on a (random) pipeline. Main takeaway is that fitting the pipeline takes seconds.
"""
Pipeline(StandardScaler, PCA, RandomForest) with GridSearchCV
=============================================================
GridSearchCV over an all-proxy Pipeline on the full Forest Cover Type
dataset (581K samples, 54 features, 7 classes).
Pipeline: StandardScaler -> PCA -> RandomForestClassifier
All three steps are cuml.accel proxies, so the GridSearchCV patch
moves data to GPU once and all CV splits run on device.
The parameter grid searches over:
- pca__n_components: [20, 40]
- clf__n_estimators: [100, 200, 500]
- clf__max_depth: [10, 20, None]
Data: Forest Cover Type (full, 581K samples).
Usage:
python gridsearch_pipeline_rf.py # CPU
python -m cuml.accel gridsearch_pipeline_rf.py # GPU
Accelerated estimators:
StandardScaler, PCA, RandomForestClassifier
"""
import time
import warnings
from collections import OrderedDict
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore", category=FutureWarning)
# ---------------------------------------------------------------------------
# Timing helpers
# ---------------------------------------------------------------------------
_timings = OrderedDict()
class timed:
def __init__(self, name):
self.name = name
def __enter__(self):
self.start = time.perf_counter()
return self
def __exit__(self, *exc):
_timings[self.name] = time.perf_counter() - self.start
def _accel_mode():
try:
from cuml.accel import enabled
return enabled()
except Exception:
return False
def _n_jobs():
"""n_jobs=1 is required for GPU optimization; use all cores on CPU."""
return 1 if _accel_mode() else 10
def print_header(n_train, n_test, n_features, n_classes):
    """Print a banner summarising run mode, parallelism and dataset sizes."""
    if _accel_mode():
        mode = "GPU (cuml.accel)"
    else:
        mode = "CPU (sklearn)"
    bar = "=" * 70
    print(bar)
    print(" Pipeline(Scaler, PCA, RF) with GridSearchCV")
    print(f" Mode: {mode}")
    print(f" n_jobs: {_n_jobs()}")
    print(f" Train: {n_train:,}")
    print(f" Test: {n_test:,}")
    print(f" Features: {n_features}")
    print(f" Classes: {n_classes}")
    print(bar)
def print_timings():
    """Print every phase recorded in ``_timings`` plus the grand total."""
    grand_total = sum(_timings.values())
    wide = "=" * 70
    thin = "-" * 70
    print()
    print(wide)
    print(" TIMING SUMMARY")
    print(thin)
    for label, seconds in _timings.items():
        print(f" {label:40s} {seconds:>8.2f}s")
    print(thin)
    print(f" {'TOTAL':40s} {grand_total:>8.2f}s")
    print(wide)
# ---------------------------------------------------------------------------
# Main workflow
# ---------------------------------------------------------------------------

# Number of cross-validation folds used by GridSearchCV.
N_FOLDS = 3


def main():
    """Run the full benchmark: load data, grid-search the pipeline, evaluate.

    Steps:
      1. Fetch the full Forest Cover Type dataset (network download on first
         run) and make a stratified 80/20 train/test split.
      2. Build the StandardScaler -> PCA -> RandomForest pipeline and grid.
      3. Fit GridSearchCV; ``refit=True`` retrains a final model on the best
         parameters so ``search.score`` works afterwards.
      4. Score on the held-out test set and print a ranked table of every
         parameter combination, then the per-phase timing summary.
    """
    # 1. Load data
    print("\n[1/4] Loading Forest Cover Type dataset (full) ...")
    with timed("Load data"):
        X, y = fetch_covtype(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y,
        )
    n_classes = len(np.unique(y_train))
    print(f" Total: {X.shape[0]:,}")
    print(f" Train: {X_train.shape[0]:,}")
    print(f" Test: {X_test.shape[0]:,}")
    print(f" Features: {X.shape[1]}")
    print_header(X_train.shape[0], X_test.shape[0], X.shape[1], n_classes)

    # 2. Pipeline and parameter grid
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("clf", RandomForestClassifier(random_state=42)),
    ])
    param_grid = {
        "pca__n_components": [20, 40],
        "clf__n_estimators": [100, 200, 500],
        "clf__max_depth": [10, 20, None],
    }
    # Derive the combination count from the grid itself so the printed
    # totals cannot drift out of sync when the grid is edited (previously
    # hard-coded as 2 * 3 * 3).
    n_combos = int(np.prod([len(values) for values in param_grid.values()]))
    total_fits = n_combos * N_FOLDS
    print("\n[2/4] Pipeline: StandardScaler -> PCA -> RandomForestClassifier")
    print("\n Parameter grid:")
    print(f" pca__n_components: {param_grid['pca__n_components']}")
    print(f" clf__n_estimators: {param_grid['clf__n_estimators']}")
    print(f" clf__max_depth: {param_grid['clf__max_depth']}")
    print(f" Total: {n_combos} combos x {N_FOLDS} folds = {total_fits} fits")

    # 3. Run GridSearchCV
    print("\n[3/4] Running GridSearchCV ...")
    with timed("GridSearchCV"):
        search = GridSearchCV(
            pipe,
            param_grid,
            cv=N_FOLDS,
            scoring="accuracy",
            n_jobs=_n_jobs(),
            refit=True,
        )
        search.fit(X_train, y_train)
    print("\n Best parameters:")
    for param, val in sorted(search.best_params_.items()):
        print(f" {param}: {val}")
    print(f" Best CV accuracy: {search.best_score_:.4f}")

    # 4. Test evaluation (scores the refit best estimator)
    print("\n[4/4] Test set evaluation ...")
    with timed("Test evaluation"):
        test_acc = search.score(X_test, y_test)
    print(f" Test accuracy: {test_acc:.4f}")

    # Ranked results table: rank 1 = best mean CV accuracy.
    cv_results = search.cv_results_
    print("\n" + "-" * 70)
    print(" ALL GRIDSEARCHCV RESULTS (ranked by accuracy)")
    print("-" * 70)
    ranked = np.argsort(cv_results["rank_test_score"])
    for rank, idx in enumerate(ranked, 1):
        p = cv_results["params"][idx]
        score = cv_results["mean_test_score"][idx]
        std = cv_results["std_test_score"][idx]
        fit_time = cv_results["mean_fit_time"][idx]
        depth = str(p["clf__max_depth"])  # max_depth may be None -> "None"
        print(f" #{rank:<2d} pca={p['pca__n_components']:<3d}"
              f" n_est={p['clf__n_estimators']:<4d}"
              f" depth={depth:<5s}"
              f" acc={score:.4f}(+/-{std:.4f})"
              f" fit={fit_time:.1f}s")

    print_timings()


if __name__ == "__main__":
    main()
ALL GRIDSEARCHCV RESULTS (ranked by accuracy)
----------------------------------------------------------------------
#1 pca=20 n_est=500 depth=None acc=0.9356(+/-0.0011) fit=13.4s
#2 pca=40 n_est=500 depth=None acc=0.9351(+/-0.0006) fit=14.5s
#3 pca=20 n_est=200 depth=None acc=0.9351(+/-0.0011) fit=5.5s
#4 pca=40 n_est=200 depth=None acc=0.9349(+/-0.0005) fit=5.7s
#5 pca=20 n_est=100 depth=None acc=0.9346(+/-0.0011) fit=2.7s
#6 pca=40 n_est=100 depth=None acc=0.9340(+/-0.0007) fit=3.0s
#7 pca=40 n_est=500 depth=20 acc=0.9098(+/-0.0009) fit=10.9s
#8 pca=40 n_est=200 depth=20 acc=0.9095(+/-0.0011) fit=4.5s
#9 pca=40 n_est=100 depth=20 acc=0.9091(+/-0.0011) fit=2.4s
#10 pca=20 n_est=500 depth=20 acc=0.9071(+/-0.0015) fit=10.4s
#11 pca=20 n_est=200 depth=20 acc=0.9068(+/-0.0018) fit=4.2s
#12 pca=20 n_est=100 depth=20 acc=0.9062(+/-0.0015) fit=2.3s
#13 pca=40 n_est=100 depth=10 acc=0.7555(+/-0.0013) fit=1.8s
#14 pca=40 n_est=200 depth=10 acc=0.7542(+/-0.0010) fit=3.4s
#15 pca=40 n_est=500 depth=10 acc=0.7536(+/-0.0010) fit=8.0s
#16 pca=20 n_est=500 depth=10 acc=0.7488(+/-0.0014) fit=8.0s
#17 pca=20 n_est=200 depth=10 acc=0.7481(+/-0.0018) fit=3.3s
#18 pca=20 n_est=100 depth=10 acc=0.7468(+/-0.0025) fit=1.9s
@csadorf
Copy link
Copy Markdown

csadorf commented Apr 13, 2026

Are these results with the implementation in rapidsai/cuml#7843? If so, which commit?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment