Last active
March 24, 2021 21:33
Revisions
-
DMTSource renamed this gist
Mar 24, 2021. 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing.
File renamed without changes. -
DMTSource revised this gist
Mar 24, 2021. No changes. There are no files selected for viewing.
-
DMTSource created this gist
Mar 24, 2021. There are no files selected for viewing.
"""Grid-search a stacked, multi-branch baikal model with scikit-learn.

This is baikal's "long" README example (two feature views feeding three
base classifiers whose predictions are stacked into an SVC), extended with
a ``GridSearchCV`` hyper-parameter search.

Why this differs from a naive ``GridSearchCV(model, ...)`` call:

* ``GridSearchCV`` must be able to slice ``X`` by sample index.  Passing
  the two views as ``[X1_train, X2_train]`` makes scikit-learn treat the
  list as an ``X`` with two samples, which raises
  ``ValueError: Found input variables with inconsistent numbers of
  samples: [2, 426]``.  The model therefore takes ONE feature matrix and
  splits it into the two views inside the graph.
* A baikal ``Model`` is not a scikit-learn estimator; it has to be wrapped
  in ``baikal.sklearn.SKLearnWrapper``, which rebuilds the model from a
  ``build_fn`` for each candidate and exposes the fitted model as
  ``.model``.
* Because the model is rebuilt repeatedly, every step gets an explicit
  name so the ``param_grid`` keys always refer to the same steps.
"""

import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.sklearn import SKLearnWrapper
from baikal.steps import Split, Stack
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

# 1. Define the steps
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)
PCA = make_step(sklearn.decomposition.PCA)
SVC = make_step(sklearn.svm.SVC)
PowerTransformer = make_step(sklearn.preprocessing.PowerTransformer)

# Let's suppose the dataset is originally split in two: columns [0, 15)
# form the first view and columns [15, end) form the second view.
SPLIT_COLUMN = 15


# 2. Build the model
def build_fn():
    """Return a fresh, unfitted instance of the stacked model.

    The model has a single feature input so that scikit-learn's
    cross-validation utilities can index it by sample; the two feature
    views are produced inside the graph by a ``Split`` step.

    Returns:
        A baikal ``Model`` mapping the full feature matrix to the SVC
        prediction, trained against the target input ``y_t``.
    """
    x = Input(name="x")
    y_t = Input(name="y_t")

    # NOTE(review): assumes baikal.steps.Split wraps numpy.split, so this
    # yields (x[:, :SPLIT_COLUMN], x[:, SPLIT_COLUMN:]) -- confirm against
    # the installed baikal version.
    x1, x2 = Split([SPLIT_COLUMN], axis=1, name="split")(x)

    y1 = ExtraTreesClassifier(name="extra_trees")(x1, y_t)
    y2 = RandomForestClassifier(name="random_forest")(x2, y_t)
    z = PowerTransformer(name="power_transformer")(x2)
    z = PCA(name="pca")(z)
    y3 = LogisticRegression(name="logreg")(z, y_t)

    stacked_features = Stack(name="stack")([y1, y2, y3])
    y_p = SVC(name="svc")(stacked_features, y_t)

    return Model(x, y_p, y_t)


plot_model(build_fn(), filename="multiple_input_nonlinear_pipeline_example_plot.png")

# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=0
)

# Keys are the explicit step names assigned in build_fn.  A bare step name
# swaps the whole step for the given candidate; "<step>__<param>" sets one
# of its hyper-parameters.
param_grid = [
    {
        "logreg": [
            LogisticRegression(
                random_state=0, solver="lbfgs", multi_class="multinomial"
            )
        ],
        "logreg__C": [0.01, 0.1, 1],
        "pca__n_components": [1, 2, 3, 4],
    },
    {
        "random_forest": [RandomForestClassifier(random_state=0)],
        "random_forest__n_estimators": [10, 50, 100],
    },
]

# shuffle defaults to False, so the folds are deterministic and no
# random_state is needed.
cv = StratifiedKFold(n_splits=3)

gscv_baikal = GridSearchCV(
    SKLearnWrapper(build_fn),
    param_grid,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    verbose=1,
)

# X_train is one 2-D array, so GridSearchCV can slice it per fold; the
# model splits it into the two views internally.
gscv_baikal.fit(X_train, y_train)

print("Best score:", gscv_baikal.best_score_)
print("Best parameters", gscv_baikal.best_params_)

# SKLearnWrapper exposes the underlying fitted baikal Model as `.model`.
model = gscv_baikal.best_estimator_.model

# 4. Use the model
y_test_pred = model.predict(X_test)

# We can also query any intermediate outputs:
outs = model.predict(X_test, output_names=["extra_trees:0/0", "pca:0/0"])