Skip to content

Instantly share code, notes, and snippets.

@DMTSource
Last active March 24, 2021 21:33

Revisions

  1. DMTSource renamed this gist Mar 24, 2021. 1 changed file with 0 additions and 0 deletions.
  2. DMTSource revised this gist Mar 24, 2021. No changes.
  3. DMTSource created this gist Mar 24, 2021.
    96 changes: 96 additions & 0 deletions readme_long_gridsearchcv_example.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,96 @@
    import sklearn.decomposition
    import sklearn.ensemble
    import sklearn.linear_model
    import sklearn.preprocessing
    import sklearn.svm
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    from baikal import Input, Model, make_step
    from baikal.plot import plot_model
    from baikal.sklearn import SKLearnWrapper
    from baikal.steps import Lambda
    from baikal.steps import Stack

    ### ADDED to readme_long_example
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    ###

    # 1. Define the steps
    # Each scikit-learn class is passed through baikal's make_step so that it
    # can be instantiated and called on inputs when the model graph is built.

    # Preprocessing / feature-extraction steps.
    PowerTransformer = make_step(sklearn.preprocessing.PowerTransformer)
    PCA = make_step(sklearn.decomposition.PCA)

    # Classifier steps.
    ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)
    RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
    LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
    SVC = make_step(sklearn.svm.SVC)

    # 2. Build the model
    # Two feature inputs plus the target input that every trainable step
    # receives as its second argument.
    x1 = Input(name="x1")
    x2 = Input(name="x2")
    y_t = Input(name="y_t")

    # First level: three classifiers. Two consume the inputs directly; the
    # third consumes x2 after PowerTransformer followed by PCA.
    extra_trees_out = ExtraTreesClassifier()(x1, y_t)
    random_forest_out = RandomForestClassifier()(x2, y_t)
    x2_components = PCA()(PowerTransformer()(x2))
    logistic_out = LogisticRegression()(x2_components, y_t)

    # Second level: the stacked first-level outputs feed the final SVC.
    first_level_outputs = [extra_trees_out, random_forest_out, logistic_out]
    y_p = SVC()(Stack()(first_level_outputs), y_t)

    model = Model([x1, x2], y_p, y_t)
    plot_model(model, filename="multiple_input_nonlinear_pipeline_example_plot.png")

    # 3. Train the model
    # Load the breast cancer dataset and hold out a test split
    # (random_state=0 keeps the split reproducible).
    dataset = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data,
        dataset.target,
        random_state=0,
    )

    # Let's suppose the dataset is originally split in two:
    # the first 15 columns go to one input, the remaining columns to the other.
    X1_train = X_train[:, :15]
    X2_train = X_train[:, 15:]
    X1_test = X_test[:, :15]
    X2_test = X_test[:, 15:]

    ### ADDED to readme_long_example
    # This replaces the plain ``model.fit([X1_train, X2_train], y_train)`` of the
    # README example with a hyper-parameter search.
    #
    # GridSearchCV cannot be given a baikal Model and a list of input arrays:
    # scikit-learn treats the list [X1_train, X2_train] as an X with 2 samples
    # and compares it with the 426 targets, which raises
    #   ValueError: Found input variables with inconsistent numbers of samples:
    #   [2, 426]
    # The workaround described in baikal's documentation is to wrap a model
    # factory in SKLearnWrapper, feed a single feature matrix, and split that
    # matrix inside the model. The wrapper also provides the ``.model``
    # attribute read from ``best_estimator_`` below.

    # Column index at which the feature matrix is split; same split as the
    # X1_*/X2_* arrays defined in the data-loading section.
    N_X1_FEATURES = 15


    def select_x1(X):
        """Return the leading columns of X, i.e. the features of the x1 branch."""
        return X[:, :N_X1_FEATURES]


    def select_x2(X):
        """Return the trailing columns of X, i.e. the features of the x2 branch."""
        return X[:, N_X1_FEATURES:]


    def build_fn():
        """Build a single-input variant of the stacked model.

        The graph is the same as the two-input model built in section 2,
        except that both branches are sliced from one input by Lambda steps.
        Steps are named explicitly so that the keys of ``param_grid`` and the
        ``output_names`` used at prediction time refer to the same steps on
        every call of this function.

        Returns
        -------
        Model
            An unfitted baikal model taking one feature matrix and one target.
        """
        x = Input()
        y_t = Input()

        x1 = Lambda(select_x1, name="select_x1")(x)
        x2 = Lambda(select_x2, name="select_x2")(x)

        y1 = ExtraTreesClassifier(name="ExtraTreesClassifier_0")(x1, y_t)
        y2 = RandomForestClassifier(name="RandomForestClassifier_0")(x2, y_t)
        z = PowerTransformer(name="PowerTransformer_0")(x2)
        z = PCA(name="PCA_0")(z)
        y3 = LogisticRegression(name="LogisticRegression_0")(z, y_t)

        stacked_features = Stack(name="Stack_0")([y1, y2, y3])
        y_p = SVC(name="SVC_0")(stacked_features, y_t)

        return Model(x, y_p, y_t)


    # Keys follow "<step name>" (replace the whole step) and
    # "<step name>__<parameter>" (set one parameter of that step).
    param_grid = [
        {
            "LogisticRegression_0": [
                LogisticRegression(
                    random_state=0, solver="lbfgs", multi_class="multinomial"
                )
            ],
            "LogisticRegression_0__C": [0.01, 0.1, 1],
            "PCA_0__n_components": [1, 2, 3, 4],
        },
        {
            "RandomForestClassifier_0": [RandomForestClassifier(random_state=0)],
            "RandomForestClassifier_0__n_estimators": [10, 50, 100],
        },
    ]

    # shuffle is left at its default (False), so no random_state is needed.
    # The splitter is passed explicitly instead of relying on GridSearchCV's
    # default choice of splitter for the wrapped model.
    cv = StratifiedKFold(n_splits=3)

    sk_model = SKLearnWrapper(build_fn)
    gscv_baikal = GridSearchCV(
        sk_model,
        param_grid,
        cv=cv,
        scoring="accuracy",
        return_train_score=True,
        verbose=1,
    )
    # One feature matrix with one row per sample, so X and y have consistent
    # lengths; the column split happens inside the model.
    gscv_baikal.fit(X_train, y_train)

    print("Best score:", gscv_baikal.best_score_)
    print("Best parameters", gscv_baikal.best_params_)
    # From here on ``model`` is the refitted single-input baikal model.
    model = gscv_baikal.best_estimator_.model
    ###

    # 4. Use the model
    # The tuned model takes the full feature matrix, not [X1_test, X2_test].
    y_test_pred = model.predict(X_test)

    # We can also query any intermediate outputs:
    outs = model.predict(
        X_test, output_names=["ExtraTreesClassifier_0:0/0", "PCA_0:0/0"]
    )