!pip install 'liac-arff>=2.4.0'
Requirement already satisfied: liac-arff>=2.4.0 in /home/hirzel/python3.6venv/lib/python3.6/site-packages (2.4.0)
import lale.datasets.openml
import pandas as pd
# Fetch the OpenML 'credit-g' classification dataset, already split into
# train and test parts. preprocess=False is passed, and the preview printed
# by this cell shows a mix of string-valued and numeric columns.
(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
'credit-g', 'classification', preprocess=False)
print(f'train_X.shape {train_X.shape}')
# Last expression of the cell: the final five training rows, with the label
# column placed in front of the feature columns.
pd.concat([train_y.tail(), train_X.tail()], axis=1)
train_X.shape (670, 20)
class | checking_status | duration | credit_history | purpose | credit_amount | savings_status | employment | installment_commitment | personal_status | ... | residence_since | property_magnitude | age | other_payment_plans | housing | existing_credits | job | num_dependents | own_telephone | foreign_worker | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
835 | bad | <0 | 12.0 | no credits/all paid | new car | 1082.0 | <100 | 1<=X<4 | 4.0 | male single | ... | 4.0 | car | 48.0 | bank | own | 2.0 | skilled | 1.0 | none | yes |
192 | bad | 0<=X<200 | 27.0 | existing paid | business | 3915.0 | <100 | 1<=X<4 | 4.0 | male single | ... | 2.0 | car | 36.0 | none | own | 1.0 | skilled | 2.0 | yes | yes |
629 | good | no checking | 9.0 | existing paid | education | 3832.0 | no known savings | >=7 | 1.0 | male single | ... | 4.0 | real estate | 64.0 | none | own | 1.0 | unskilled resident | 1.0 | none | yes |
559 | bad | 0<=X<200 | 18.0 | critical/other existing credit | furniture/equipment | 1928.0 | <100 | <1 | 2.0 | male single | ... | 2.0 | real estate | 31.0 | none | own | 2.0 | unskilled resident | 1.0 | none | yes |
684 | good | 0<=X<200 | 36.0 | delayed previously | business | 9857.0 | 100<=X<500 | 4<=X<7 | 1.0 | male single | ... | 3.0 | life insurance | 31.0 | none | own | 2.0 | unskilled resident | 2.0 | yes | yes |
5 rows × 21 columns
from sklearn.preprocessing import Normalizer as Norm
from sklearn.preprocessing import OneHotEncoder as OneHot
from sklearn.linear_model import LogisticRegression as LR
from xgboost import XGBClassifier as XGBoost
from sklearn.svm import LinearSVC
from lale.operators import make_pipeline, make_union
from lale.lib.lale import Project, ConcatFeatures, NoOp
# Make the operators imported above (Norm, OneHot, LR, XGBoost, LinearSVC)
# usable with lale's pipeline combinators.
lale.wrap_imported_operators()

# One column selector per column type.
project_nums = Project(columns={'type': 'number'})
project_cats = Project(columns={'type': 'string'})

# Numeric branch: either Norm or NoOp (a choice left open for the optimizer).
nums_branch = project_nums >> (Norm | NoOp)
# String branch: one-hot encode.
cats_branch = project_cats >> OneHot

# Run both branches side by side and concatenate their outputs ...
features = (nums_branch & cats_branch) >> ConcatFeatures
# ... then hand the result to one of three candidate classifiers.
planned_pipeline = features >> (LR | LinearSVC(dual=False) | XGBoost)
# Draw the planned pipeline defined above.
planned_pipeline.visualize()
import sklearn.metrics
from lale.lib.lale import Hyperopt
# Search the planned pipeline with Hyperopt, using cv=3 and at most 5 trials
# (the progress bar in the cell output counts 5/5 trials).
auto_optimizer = Hyperopt(estimator=planned_pipeline, cv=3, max_evals=5)
auto_trained = auto_optimizer.fit(train_X, train_y)
# Predict on the held-out test split and report accuracy as a percentage.
auto_y = auto_trained.predict(test_X)
print(f'accuracy {sklearn.metrics.accuracy_score(test_y, auto_y):.1%}')
100%|█████████| 5/5 [01:08<00:00, 13.74s/trial, best loss: -0.7507273649370062] accuracy 72.1%
# Retrieve the pipeline selected by the search and draw it.
best_pipeline = auto_trained.get_pipeline()
best_pipeline.visualize()
from lale.pretty_print import ipython_display
# Render the selected pipeline as Python source; show_imports=False leaves
# the import lines out of the rendered code.
ipython_display(best_pipeline, show_imports=False)
project_0 = Project(columns={"type": "number"})
norm = Norm(norm="l1")
project_1 = Project(columns={"type": "string"})
linear_svc = LinearSVC(
dual=False,
C=16757.615906506046,
fit_intercept=False,
tol=0.0006905134087360421,
)
pipeline = (
((project_0 >> norm) & (project_1 >> OneHot()))
>> ConcatFeatures()
>> linear_svc
)
# Display the JSON schema that XGBoost declares for its n_estimators
# hyperparameter.
ipython_display(XGBoost.hyperparam_schema('n_estimators'))
{
"description": "Number of trees to fit.",
"type": "integer",
"default": 1000,
"minimumForOptimizer": 500,
"maximumForOptimizer": 1500,
}
# Display the JSON schema that XGBoost declares for its booster
# hyperparameter.
ipython_display(XGBoost.hyperparam_schema('booster'))
{
"description": "Specify which booster to use.",
"enum": ["gbtree", "gblinear", "dart"],
"default": "gbtree",
}
import jsonschema
import sys
# Demonstrate schema validation: n_estimators is declared with type
# "integer", so constructing XGBoost with n_estimators=0.5 is expected to be
# rejected with a jsonschema.ValidationError.
try:
    XGBoost(n_estimators=0.5, booster='gbtree')
except jsonschema.ValidationError as e:
    # Print only the validation message, on stderr, rather than letting the
    # exception propagate with a full traceback.
    print(e.message, file=sys.stderr)
Invalid configuration for XGBoost(n_estimators=0.5, booster='gbtree') due to invalid value n_estimators=0.5. Schema of argument n_estimators: { "description": "Number of trees to fit.", "type": "integer", "default": 1000, "minimumForOptimizer": 500, "maximumForOptimizer": 1500, } Value: 0.5
import lale.schemas as schemas
# Grove is XGBoost with a customized schema: n_estimators is an integer
# bounded to 2..10, and booster is restricted to 'gbtree'.
Grove = XGBoost.customize_schema(
    n_estimators=schemas.Int(minimum=2, maximum=10),
    booster=schemas.Enum(['gbtree'], default='gbtree'),
)

# Same two-branch preprocessing shape as the earlier pipeline, but with Norm
# fixed on the numeric branch and Grove as the only classifier.
grove_nums = Project(columns={'type': 'number'}) >> Norm
grove_cats = Project(columns={'type': 'string'}) >> OneHot
grove_planned = (grove_nums & grove_cats) >> ConcatFeatures >> Grove
# Search the Grove pipeline with Hyperopt, using cv=3 and at most 10 trials
# (the progress bar in the cell output counts 10/10 trials).
grove_optimizer = Hyperopt(estimator=grove_planned, cv=3, max_evals=10)
grove_trained = grove_optimizer.fit(train_X, train_y)
# Predict on the held-out test split and report accuracy as a percentage.
grove_y = grove_trained.predict(test_X)
print(f'accuracy {sklearn.metrics.accuracy_score(test_y, grove_y):.1%}')
100%|███████| 10/10 [01:34<00:00, 9.48s/trial, best loss: -0.7433202541106129] accuracy 73.9%
grove_best = grove_trained.get_pipeline()
ipython_display(grove_best, show_imports=False)
project_0 = Project(columns={"type": "number"})
norm = Norm(norm="l1")
project_1 = Project(columns={"type": "string"})
xg_boost = XGBoost(
gamma=0.5347651601211355,
learning_rate=0.4698440238677895,
max_depth=9,
min_child_weight=16,
n_estimators=4,
reg_alpha=0.16360717863855023,
reg_lambda=0.9454327638424944,
subsample=0.7366877378057126,
)
pipeline = (
((project_0 >> norm) & (project_1 >> OneHot()))
>> ConcatFeatures()
>> xg_boost
)