Guillaume Baudart, Martin Hirzel, Kiran Kate, Pari Ram, and Avi Shinnar
27 March 2020
Examples, documentation, code: https://github.com/ibm/lale
# !pip install --quiet lale
# Load the forest covertypes dataset as dataframes, holding out 10% of the
# rows (test_size=0.1) as the test split.
import lale.datasets
(train_X_all, train_y_all), (test_X, test_y) = lale.datasets.covtype_df(test_size=0.1)
# Report (rows, columns) of the full training features and the test features.
print(f'shape train_X_all {train_X_all.shape}, test_X {test_X.shape}')
shape train_X_all (522910, 54), test_X (58102, 54)
import sklearn.model_selection
# Keep only 10% of the training rows in train_X/train_y; test_size=0.9 sends
# the remaining 90% to other_X/other_y (presumably to keep the later
# experiments fast -- confirm). No random_state is passed, so the sample can
# differ from run to run.
train_X, other_X, train_y, other_y = sklearn.model_selection.train_test_split(
train_X_all, train_y_all, test_size=0.9)
print(f'shape train_X {train_X.shape}, other_X {other_X.shape}')
shape train_X (52291, 54), other_X (470619, 54)
import pandas as pd
# Remove the limit on displayed columns so the wide table is shown in full.
pd.set_option('display.max_columns', None)
# Show the last 10 training rows, with the label placed in a leading 'y'
# column aligned on train_X's index.
pd.concat([pd.DataFrame({'y': train_y}, index=train_X.index),
train_X], axis=1).tail(10)
y | Elevation | Aspect | Slope | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | Wilderness_Area1 | Wilderness_Area2 | Wilderness_Area3 | Wilderness_Area4 | Soil_Type1 | Soil_Type2 | Soil_Type3 | Soil_Type4 | Soil_Type5 | Soil_Type6 | Soil_Type7 | Soil_Type8 | Soil_Type9 | Soil_Type10 | Soil_Type11 | Soil_Type12 | Soil_Type13 | Soil_Type14 | Soil_Type15 | Soil_Type16 | Soil_Type17 | Soil_Type18 | Soil_Type19 | Soil_Type20 | Soil_Type21 | Soil_Type22 | Soil_Type23 | Soil_Type24 | Soil_Type25 | Soil_Type26 | Soil_Type27 | Soil_Type28 | Soil_Type29 | Soil_Type30 | Soil_Type31 | Soil_Type32 | Soil_Type33 | Soil_Type34 | Soil_Type35 | Soil_Type36 | Soil_Type37 | Soil_Type38 | Soil_Type39 | Soil_Type40 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
274665 | 3 | 2354.0 | 130.0 | 23.0 | 285.0 | 80.0 | 277.0 | 250.0 | 220.0 | 86.0 | 874.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
120210 | 2 | 2985.0 | 91.0 | 18.0 | 886.0 | 187.0 | 3180.0 | 244.0 | 209.0 | 88.0 | 828.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
111775 | 2 | 3142.0 | 88.0 | 20.0 | 684.0 | -52.0 | 551.0 | 245.0 | 204.0 | 80.0 | 1082.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
400567 | 3 | 2493.0 | 108.0 | 14.0 | 182.0 | 34.0 | 666.0 | 243.0 | 223.0 | 107.0 | 1294.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
224682 | 2 | 2796.0 | 352.0 | 9.0 | 594.0 | 84.0 | 2955.0 | 205.0 | 225.0 | 158.0 | 1471.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
424723 | 1 | 3126.0 | 197.0 | 13.0 | 85.0 | 10.0 | 5344.0 | 216.0 | 251.0 | 166.0 | 1148.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
445777 | 1 | 2981.0 | 333.0 | 16.0 | 150.0 | 14.0 | 2704.0 | 182.0 | 218.0 | 175.0 | 655.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
388163 | 1 | 3380.0 | 219.0 | 6.0 | 395.0 | 88.0 | 2895.0 | 213.0 | 246.0 | 169.0 | 1224.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
522588 | 7 | 3397.0 | 113.0 | 15.0 | 706.0 | 240.0 | 1507.0 | 245.0 | 223.0 | 103.0 | 1040.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
128441 | 2 | 2831.0 | 155.0 | 21.0 | 85.0 | 27.0 | 4235.0 | 239.0 | 236.0 | 116.0 | 5071.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
from sklearn.decomposition import PCA
from xgboost import XGBClassifier as XGBoost
# Wrap the classes imported above so they can be used with lale's pipeline
# combinators (such as >> below).
lale.wrap_imported_operators()
# Hand-configured two-step pipeline: PCA with 6 components piped into
# XGBoost with 3 estimators.
manual_trainable = PCA(n_components=6) >> XGBoost(n_estimators=3)
manual_trainable.visualize()
%%time
# Fit the hand-configured pipeline on the 10% training sample; the %%time
# cell magic (which must stay on the first line of the cell) reports run time.
manual_trained = manual_trainable.fit(train_X, train_y)
CPU times: user 3.56 s, sys: 672 ms, total: 4.23 s Wall time: 3.88 s
import sklearn.metrics
# Score the hand-configured pipeline on the held-out test split.
manual_y = manual_trained.predict(test_X)
# The .1% format prints the accuracy fraction as a percentage with one decimal.
print(f'accuracy {sklearn.metrics.accuracy_score(test_y, manual_y):.1%}')
accuracy 75.5%
# Display the schema (description, type, default, optimizer range) that the
# wrapped XGBoost operator declares for its n_estimators hyperparameter.
XGBoost.hyperparam_schema('n_estimators')
{'description': 'Number of trees to fit.', 'type': 'integer', 'default': 1000, 'minimumForOptimizer': 500, 'maximumForOptimizer': 1500}
# Print the URL of the documentation page for the wrapped PCA operator.
print(PCA.documentation_url())
https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pca.html
from lale.lib.lale import Hyperopt
import lale.schemas as schemas
# Override the hyperparameter schemas to bound the search: n_components to an
# integer between 2 and 54 (the data has 54 feature columns) and n_estimators
# to an integer between 1 and 10.
CustomPCA = PCA.customize_schema(n_components=schemas.Int(min=2, max=54))
CustomXGBoost = XGBoost.customize_schema(n_estimators=schemas.Int(min=1, max=10))
# Operators are combined without arguments here, leaving their
# hyperparameters for the optimizer to choose.
hpo_planned = CustomPCA >> CustomXGBoost
# Hyperparameter search over the planned pipeline: 10 trials (max_evals),
# cv=3 for cross-validation.
hpo_trainable = Hyperopt(estimator=hpo_planned, max_evals=10, cv=3)
%%time
# Run the Hyperopt search on the training sample; %%time must remain the
# first line of the cell.
hpo_trained = hpo_trainable.fit(train_X, train_y)
100%|███████| 10/10 [04:22<00:00, 26.22s/trial, best loss: -0.8287659271127307] CPU times: user 4min 57s, sys: 20 s, total: 5min 17s Wall time: 4min 53s
# Score the result of the hyperparameter search on the held-out test split.
hpo_y = hpo_trained.predict(test_X)
print(f'accuracy {sklearn.metrics.accuracy_score(test_y, hpo_y):.1%}')
accuracy 84.2%
# Inspect the pipeline returned by get_pipeline() with no argument
# (presumably the best trial of the search -- confirm against the lale docs):
# first as a graph, then as generated Python code including its imports.
hpo_trained.get_pipeline().visualize()
hpo_trained.get_pipeline().pretty_print(ipython_display=True)
from sklearn.decomposition import PCA as CustomPCA
from xgboost import XGBClassifier as CustomXGBoost
import lale
lale.wrap_imported_operators()
custom_pca = CustomPCA(n_components=43, svd_solver="full", whiten=True)
custom_xg_boost = CustomXGBoost(
gamma=0.42208258595069725,
learning_rate=0.6558019595096513,
max_depth=13,
min_child_weight=13,
n_estimators=9,
reg_alpha=0.3590229319214039,
reg_lambda=0.7978279409450941,
subsample=0.6209085649172931,
)
pipeline = custom_pca >> custom_xg_boost
# Display the per-trial table of the search: one row per trial name (p0, p1,
# ...) with its tid, loss, time, log_loss and status.
hpo_trained.summary()
tid | loss | time | log_loss | status | |
---|---|---|---|---|---|
name | |||||
p0 | 0 | -0.754298 | 4.080399 | 1.039077 | ok |
p1 | 1 | -0.774493 | 7.493949 | 0.799467 | ok |
p2 | 2 | -0.725306 | 6.744288 | 0.948600 | ok |
p3 | 3 | -0.783175 | 4.715054 | 1.036146 | ok |
p4 | 4 | -0.759672 | 8.948971 | 0.576866 | ok |
p5 | 5 | -0.823029 | 11.589523 | 0.514666 | ok |
p6 | 6 | -0.783404 | 12.232503 | 0.765154 | ok |
p7 | 7 | -0.828766 | 20.878259 | 0.435281 | ok |
p8 | 8 | -0.724561 | 4.045507 | 0.669205 | ok |
p9 | 9 | -0.731828 | 4.792484 | 1.780335 | ok |
# Find the name of the worst trial, i.e. the row of the summary table with
# the largest loss. Series.idxmax() returns the index label (e.g. 'p8')
# directly on every pandas version, whereas Series.argmax() returns an
# integer position on newer pandas and needed a second summary() call to map
# it back to a label.
worst_name = hpo_trained.summary().loss.idxmax()
print(worst_name)
p8
# Inspect the pipeline of the worst trial by name: as a graph, then as
# generated Python code with the import lines omitted (show_imports=False).
hpo_trained.get_pipeline(worst_name).visualize()
hpo_trained.get_pipeline(worst_name).pretty_print(ipython_display=True, show_imports=False)
custom_pca = CustomPCA(n_components=19, svd_solver="full")
custom_xg_boost = CustomXGBoost(
gamma=0.025801085053521078,
learning_rate=0.5793622466253201,
max_depth=3,
min_child_weight=8,
n_estimators=9,
reg_alpha=0.49646670359671663,
reg_lambda=0.9280083037935846,
subsample=0.5479690370134093,
)
pipeline = custom_pca >> custom_xg_boost
from sklearn.preprocessing import Normalizer as Norm
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as Tree
from sklearn.neighbors import KNeighborsClassifier as KNN
from lale.lib.lale import NoOp
# Wrap the newly imported classes so they work with lale's combinators.
lale.wrap_imported_operators()
# Bound the KNN search to n_neighbors between 1 and 10; the name KNN is
# rebound to the customized operator and used by later cells.
KNN = KNN.customize_schema(n_neighbors=schemas.Int(min=1, max=10))
# Planned pipeline with choices (|): an optional normalization step (Norm or
# NoOp) followed by one of three classifiers. LR has its solver pinned to
# 'liblinear'; the other operators are left unconfigured.
transp_planned = (Norm | NoOp) >> (Tree | LR(solver='liblinear') | KNN)
transp_planned.visualize()
%%time
# Search the planned pipeline with Hyperopt (cv=3, 3 trials) and keep the
# resulting trained pipeline; %%time must remain the first line of the cell.
transp_trained = transp_planned.auto_configure(
train_X, train_y, optimizer=Hyperopt, cv=3, max_evals=3)
100%|█████████| 3/3 [01:25<00:00, 28.55s/trial, best loss: -0.8412346112501562] CPU times: user 1min 27s, sys: 1.34 s, total: 1min 28s Wall time: 1min 27s
# Show which choices and hyperparameters the search selected: as generated
# Python code without import lines, then as a graph.
transp_trained.pretty_print(ipython_display=True, show_imports=False)
transp_trained.visualize()
knn = KNN(algorithm="ball_tree", metric="manhattan", n_neighbors=9)
pipeline = NoOp() >> knn
%%time
# Score the auto-configured pipeline on the held-out test split; the cell is
# timed, so the reported time covers prediction and scoring.
transp_y = transp_trained.predict(test_X)
print(f'accuracy {sklearn.metrics.accuracy_score(test_y, transp_y):.1%}')
accuracy 86.6% CPU times: user 52.4 s, sys: 78.1 ms, total: 52.5 s Wall time: 53 s
# Display the JSON schema attached to the test dataframe (per-column
# descriptions and types).
test_X.json_schema
{'description': 'Features of forest covertypes dataset (classification).', 'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#forest-covertypes', 'type': 'array', 'items': {'type': 'array', 'minItems': 54, 'maxItems': 54, 'items': [{'description': 'Elevation', 'type': 'integer'}, {'description': 'Aspect', 'type': 'integer'}, {'description': 'Slope', 'type': 'integer'}, {'description': 'Horizontal_Distance_To_Hydrology', 'type': 'integer'}, {'description': 'Vertical_Distance_To_Hydrology', 'type': 'integer'}, {'description': 'Horizontal_Distance_To_Roadways', 'type': 'integer'}, {'description': 'Hillshade_9am', 'type': 'integer'}, {'description': 'Hillshade_Noon', 'type': 'integer'}, {'description': 'Hillshade_3pm', 'type': 'integer'}, {'description': 'Horizontal_Distance_To_Fire_Points', 'type': 'integer'}, {'description': 'Wilderness_Area1', 'enum': [0, 1]}, {'description': 'Wilderness_Area2', 'enum': [0, 1]}, {'description': 'Wilderness_Area3', 'enum': [0, 1]}, {'description': 'Wilderness_Area4', 'enum': [0, 1]}, {'description': 'Soil_Type1', 'enum': [0, 1]}, {'description': 'Soil_Type2', 'enum': [0, 1]}, {'description': 'Soil_Type3', 'enum': [0, 1]}, {'description': 'Soil_Type4', 'enum': [0, 1]}, {'description': 'Soil_Type5', 'enum': [0, 1]}, {'description': 'Soil_Type6', 'enum': [0, 1]}, {'description': 'Soil_Type7', 'enum': [0, 1]}, {'description': 'Soil_Type8', 'enum': [0, 1]}, {'description': 'Soil_Type9', 'enum': [0, 1]}, {'description': 'Soil_Type10', 'enum': [0, 1]}, {'description': 'Soil_Type11', 'enum': [0, 1]}, {'description': 'Soil_Type12', 'enum': [0, 1]}, {'description': 'Soil_Type13', 'enum': [0, 1]}, {'description': 'Soil_Type14', 'enum': [0, 1]}, {'description': 'Soil_Type15', 'enum': [0, 1]}, {'description': 'Soil_Type16', 'enum': [0, 1]}, {'description': 'Soil_Type17', 'enum': [0, 1]}, {'description': 'Soil_Type18', 'enum': [0, 1]}, {'description': 'Soil_Type19', 'enum': [0, 1]}, {'description': 'Soil_Type20', 'enum': [0, 
1]}, {'description': 'Soil_Type21', 'enum': [0, 1]}, {'description': 'Soil_Type22', 'enum': [0, 1]}, {'description': 'Soil_Type23', 'enum': [0, 1]}, {'description': 'Soil_Type24', 'enum': [0, 1]}, {'description': 'Soil_Type25', 'enum': [0, 1]}, {'description': 'Soil_Type26', 'enum': [0, 1]}, {'description': 'Soil_Type27', 'enum': [0, 1]}, {'description': 'Soil_Type28', 'enum': [0, 1]}, {'description': 'Soil_Type29', 'enum': [0, 1]}, {'description': 'Soil_Type30', 'enum': [0, 1]}, {'description': 'Soil_Type31', 'enum': [0, 1]}, {'description': 'Soil_Type32', 'enum': [0, 1]}, {'description': 'Soil_Type33', 'enum': [0, 1]}, {'description': 'Soil_Type34', 'enum': [0, 1]}, {'description': 'Soil_Type35', 'enum': [0, 1]}, {'description': 'Soil_Type36', 'enum': [0, 1]}, {'description': 'Soil_Type37', 'enum': [0, 1]}, {'description': 'Soil_Type38', 'enum': [0, 1]}, {'description': 'Soil_Type39', 'enum': [0, 1]}, {'description': 'Soil_Type40', 'enum': [0, 1]}]}, 'minItems': 58102, 'maxItems': 58102}
from lale.lib.lale import categorical
# categorical(max_values=2) builds a column selector; calling it on test_X
# yields the list of matching column names (presumably columns with at most
# 2 distinct values -- confirm against the lale docs).
print(categorical(max_values=2)(test_X))
['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40']
from lale.lib.lale import Project
from lale.lib.lale import ConcatFeatures as Concat
from sklearn.feature_selection import SelectKBest as FeatSel
# Wrap the newly imported classes so they work with lale's combinators.
lale.wrap_imported_operators()
# Branch 1: keep only the columns picked by categorical(max_values=2), then
# apply feature selection (FeatSel is left unconfigured for the search).
binary_prep = Project(columns=categorical(max_values=2)) >> FeatSel
# Branch 2: drop those same columns and optionally normalize the rest.
other_prep = Project(drop_columns=categorical(max_values=2)) >> (Norm | NoOp)
# Combine both branches with &, concatenate their output features, and
# classify with the customized KNN.
nonlin_planned = (binary_prep & other_prep) >> Concat >> KNN
nonlin_planned.visualize()
%%time
# Search the non-linear planned pipeline with Hyperopt (cv=3, 3 trials);
# %%time must remain the first line of the cell.
nonlin_trained = nonlin_planned.auto_configure(
train_X, train_y, optimizer=Hyperopt, cv=3, max_evals=3)
100%|█████████| 3/3 [02:30<00:00, 50.20s/trial, best loss: -0.8618882829755578] CPU times: user 2min 32s, sys: 344 ms, total: 2min 33s Wall time: 2min 35s
# Inspect the selected pipeline as a graph and as generated Python code;
# combinators=False prints it with make_pipeline/make_union calls rather than
# the >> and & operators, and show_imports=False omits the import lines.
nonlin_trained.visualize()
nonlin_trained.pretty_print(ipython_display=True, show_imports=False, combinators=False)
project_0 = Project(columns=lale.lib.lale.categorical(max_values=2))
feat_sel = FeatSel(k=8)
pipeline_0 = make_pipeline(project_0, feat_sel)
project_1 = Project(drop_columns=lale.lib.lale.categorical(max_values=2))
pipeline_1 = make_pipeline(project_1, NoOp())
union = make_union(pipeline_0, pipeline_1)
knn = KNN(algorithm="kd_tree", n_neighbors=7, weights="distance")
pipeline = make_pipeline(union, knn)
%%time
# Score the non-linear auto-configured pipeline on the held-out test split.
nonlin_y = nonlin_trained.predict(test_X)
print(f'accuracy {sklearn.metrics.accuracy_score(test_y, nonlin_y):.1%}')
accuracy 88.6% CPU times: user 5.02 s, sys: 78.1 ms, total: 5.09 s Wall time: 5.13 s
# Rebuild the binary-column branch on its own with k=8 (the value shown in
# the pretty-printed trained pipeline) to look at its output directly.
binary_prep_trainable = Project(columns=categorical(max_values=2)) >> FeatSel(k=8)
binary_prep_trained = binary_prep_trainable.fit(train_X, train_y)
# Transform the first 10 test rows to see which columns survive selection.
binary_prep_trained.transform(test_X.head(10))
Wilderness_Area1 | Wilderness_Area4 | Soil_Type2 | Soil_Type3 | Soil_Type4 | Soil_Type10 | Soil_Type38 | Soil_Type39 | |
---|---|---|---|---|---|---|---|---|
0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
6 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
8 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
9 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |