#Required for accessing openml datasets from Lale
!pip install 'liac-arff>=2.4.0'
Looking in indexes: https://kakate%40us.ibm.com:****@na.artifactory.swg-devops.com/artifactory/api/pypi/wcp-nlp-pypi-virtual/simple Requirement already satisfied: liac-arff>=2.4.0 in /Users/kakateus.ibm.com/venv/lale39/lib/python3.9/site-packages (2.5.0)
import lale.datasets.openml
import pandas as pd
(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
'breast-cancer', 'classification', preprocess=True)
import numpy as np
np.bincount(train_y)
array([140, 51])
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.kernel_approximation import Nystroem as Nys
from lale.lib.lale import NoOp
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lale.lib.lale import Hyperopt
from lale.lib.imblearn import SMOTE, CondensedNearestNeighbour, SMOTEENN
from sklearn.metrics import accuracy_score
import lale
lale.wrap_imported_operators()
pipeline_without_correction = MinMaxScaler() >> PCA() >> RandomForestClassifier()
pipeline_without_correction.visualize()
optimizer = Hyperopt(estimator=pipeline_without_correction, max_evals = 10, scoring='roc_auc')
trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))
predictions = trained_optimizer.predict(test_X)
trained_optimizer.summary()
100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00, 1.41s/trial, best loss: -0.6583766233766233]
tid | loss | time | log_loss | status | |
---|---|---|---|---|---|
name | |||||
p0 | 0 | -0.623864 | 0.203896 | 0.567272 | ok |
p1 | 1 | -0.631656 | 0.237808 | 0.566987 | ok |
p2 | 2 | -0.658377 | 0.229631 | 0.558715 | ok |
p3 | 3 | -0.579383 | 0.212269 | 0.628262 | ok |
p4 | 4 | -0.616786 | 0.252034 | 0.567862 | ok |
p5 | 5 | -0.618864 | 0.281458 | 0.585125 | ok |
p6 | 6 | -0.650032 | 0.342633 | 0.556056 | ok |
p7 | 7 | -0.652825 | 0.270016 | 0.558804 | ok |
p8 | 8 | -0.633604 | 0.214408 | 0.578119 | ok |
p9 | 9 | -0.632500 | 0.278336 | 0.576655 | ok |
print(accuracy_score(test_y, predictions))
best_estimator = trained_optimizer.get_pipeline()
best_estimator.visualize()
0.6421052631578947
pipeline_with_correction = SMOTE(operator = MinMaxScaler() >> PCA() >> RandomForestClassifier())
optimizer = Hyperopt(estimator=pipeline_with_correction, max_evals = 10, scoring='roc_auc')
trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))
predictions = trained_optimizer.predict(test_X)
trained_optimizer.summary()
100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00, 1.40s/trial, best loss: -0.6835011143534503]
tid | loss | time | log_loss | status | |
---|---|---|---|---|---|
name | |||||
p0 | 0 | -0.624954 | 0.190958 | 0.616522 | ok |
p1 | 1 | -0.650157 | 0.242815 | 0.655870 | ok |
p2 | 2 | -0.683501 | 0.211413 | 0.666853 | ok |
p3 | 3 | -0.604101 | 0.185956 | 0.599714 | ok |
p4 | 4 | -0.517529 | 0.265853 | 0.655569 | ok |
p5 | 5 | -0.556187 | 0.363249 | 0.635348 | ok |
p6 | 6 | -0.568477 | 0.298747 | 0.648213 | ok |
p7 | 7 | -0.579616 | 0.228733 | 0.658655 | ok |
p8 | 8 | -0.630238 | 0.222252 | 0.617986 | ok |
p9 | 9 | -0.590678 | 0.328617 | 0.641440 | ok |
print(accuracy_score(test_y, predictions))
best_estimator = trained_optimizer.get_pipeline()
best_estimator.visualize()
0.631578947368421
pipeline_with_correction = CondensedNearestNeighbour(operator = MinMaxScaler() >> PCA() >> RandomForestClassifier())
optimizer = Hyperopt(estimator=pipeline_with_correction, max_evals = 10, scoring='roc_auc')
trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))
predictions = trained_optimizer.predict(test_X)
trained_optimizer.summary()
100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:22<00:00, 2.26s/trial, best loss: -0.6813611664876958]
tid | loss | time | log_loss | status | |
---|---|---|---|---|---|
name | |||||
p0 | 0 | -0.642447 | 0.353958 | 0.604473 | ok |
p1 | 1 | -0.598246 | 0.424864 | 0.626900 | ok |
p2 | 2 | -0.679520 | 0.452510 | 0.607253 | ok |
p3 | 3 | -0.581329 | 0.354175 | 0.648877 | ok |
p4 | 4 | -0.626233 | 0.468622 | 0.619565 | ok |
p5 | 5 | -0.644204 | 0.416804 | 0.613639 | ok |
p6 | 6 | -0.649106 | 0.445763 | 0.618898 | ok |
p7 | 7 | -0.599145 | 0.505183 | 0.624448 | ok |
p8 | 8 | -0.584214 | 0.368897 | 0.622633 | ok |
p9 | 9 | -0.681361 | 0.444388 | 0.582320 | ok |
print(accuracy_score(test_y, predictions))
best_estimator = trained_optimizer.get_pipeline()
best_estimator.visualize()
0.6947368421052632
pipeline_with_correction = SMOTEENN(operator = MinMaxScaler() >> PCA() >> RandomForestClassifier())
optimizer = Hyperopt(estimator=pipeline_with_correction, max_evals = 10, scoring='roc_auc')
trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))
predictions = trained_optimizer.predict(test_X)
trained_optimizer.summary()
100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00, 1.26s/trial, best loss: -0.664063068312234]
tid | loss | time | log_loss | status | |
---|---|---|---|---|---|
name | |||||
p0 | 0 | -0.574195 | 0.197406 | 0.874854 | ok |
p1 | 1 | -0.581562 | 0.211810 | 0.968561 | ok |
p2 | 2 | -0.664063 | 0.200366 | 1.025431 | ok |
p3 | 3 | -0.661621 | 0.183795 | 0.721990 | ok |
p4 | 4 | -0.554206 | 0.293048 | 0.857730 | ok |
p5 | 5 | -0.593452 | 0.240670 | 0.826638 | ok |
p6 | 6 | -0.636524 | 0.264110 | 0.809327 | ok |
p7 | 7 | -0.637288 | 0.219459 | 0.894342 | ok |
p8 | 8 | -0.657068 | 0.185566 | 1.327299 | ok |
p9 | 9 | -0.626770 | 0.241434 | 1.025155 | ok |
print(accuracy_score(test_y, predictions))
best_estimator = trained_optimizer.get_pipeline()
best_estimator.visualize()
0.35789473684210527