!pip install 'liac-arff>=2.4.0'
Requirement already satisfied: liac-arff>=2.4.0 in /home/hirzel/python3.6venv/lib/python3.6/site-packages (2.4.0)
import lale.datasets.openml
import pandas as pd
# Fetch the OpenML 'credit-g' classification dataset as train/test splits.
# preprocess=False is passed here; the preview below shows string-valued
# categorical columns next to numeric ones.
(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(
    'credit-g',
    'classification',
    preprocess=False,
)
# Preview the last training rows with the label placed in front of the features.
pd.concat([train_y.tail(), train_X.tail()], axis="columns")
| | class | checking_status | duration | credit_history | purpose | credit_amount | savings_status | employment | installment_commitment | personal_status | ... | residence_since | property_magnitude | age | other_payment_plans | housing | existing_credits | job | num_dependents | own_telephone | foreign_worker |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
835 | bad | <0 | 12.0 | no credits/all paid | new car | 1082.0 | <100 | 1<=X<4 | 4.0 | male single | ... | 4.0 | car | 48.0 | bank | own | 2.0 | skilled | 1.0 | none | yes |
192 | bad | 0<=X<200 | 27.0 | existing paid | business | 3915.0 | <100 | 1<=X<4 | 4.0 | male single | ... | 2.0 | car | 36.0 | none | own | 1.0 | skilled | 2.0 | yes | yes |
629 | good | no checking | 9.0 | existing paid | education | 3832.0 | no known savings | >=7 | 1.0 | male single | ... | 4.0 | real estate | 64.0 | none | own | 1.0 | unskilled resident | 1.0 | none | yes |
559 | bad | 0<=X<200 | 18.0 | critical/other existing credit | furniture/equipment | 1928.0 | <100 | <1 | 2.0 | male single | ... | 2.0 | real estate | 31.0 | none | own | 2.0 | unskilled resident | 1.0 | none | yes |
684 | good | 0<=X<200 | 36.0 | delayed previously | business | 9857.0 | 100<=X<500 | 4<=X<7 | 1.0 | male single | ... | 3.0 | life insurance | 31.0 | none | own | 2.0 | unskilled resident | 2.0 | yes | yes |
5 rows × 21 columns
from sklearn.preprocessing import Normalizer as Norm
from lale.lib.lale import NoOp
from sklearn.preprocessing import OneHotEncoder as OneHot
from sklearn.linear_model import LogisticRegression as LR
from xgboost import XGBClassifier as XGBoost
from sklearn.svm import LinearSVC
from sklearn.compose import ColumnTransformer
from lale.operators import make_pipeline
# Called after the operator imports above and before the `>>` / `|`
# combinators are applied to them below -- presumably this is what turns the
# imported scikit-learn/XGBoost classes into Lale operators (confirm in the
# Lale documentation).
lale.wrap_imported_operators()
import numpy as np
# Split the feature columns by dtype: columns whose dtype is a NumPy numeric
# type go to num_cols, every remaining column goes to cat_cols.
num_cols = [
    name
    for name, dtype in train_X.dtypes.items()
    if np.issubdtype(dtype, np.number)
]
cat_cols = [name for name in train_X.columns if name not in num_cols]
# Preprocessing step: L1-normalize the numeric columns and one-hot encode
# the remaining (categorical) columns.
column_transformer = ColumnTransformer(
    transformers=[
        ('num_tfm', Norm(norm='l1'), num_cols),
        ('cat_tfm', OneHot(), cat_cols),
    ],
)
# Pipe the preprocessed features into a logistic-regression classifier.
pipeline_trainable = column_transformer >> LR()
pipeline_trainable.visualize()
%%time
# Fit the pipeline on the training split; the object returned by fit() is
# kept under its own name so the trainable pipeline stays available.
pipeline_trained = pipeline_trainable.fit(train_X, train_y)
CPU times: user 1 s, sys: 188 ms, total: 1.19 s
Wall time: 1.06 s
import sklearn.metrics
# Predict on the held-out split and report accuracy as a percentage with
# one decimal place.
predictions = pipeline_trained.predict(test_X)
accuracy = sklearn.metrics.accuracy_score(test_y, predictions)
print(f'accuracy {accuracy:.1%}')
accuracy 71.2%
# Planned pipeline: `|` lists alternative operators, and operators are given
# as classes rather than configured instances (only LinearSVC pins
# dual=False). The choices are resolved later by auto_configure.
preprocessing_choices = ColumnTransformer(
    transformers=[
        ('num_tfm', Norm | NoOp, num_cols),
        ('cat_tfm', OneHot, cat_cols),
    ],
)
classifier_choices = LR | LinearSVC(dual=False) | XGBoost
pipeline_planned = make_pipeline(preprocessing_choices, classifier_choices)
pipeline_planned.visualize()
%%time
from lale.lib.lale import Hyperopt

# Let Hyperopt pick operators and hyperparameters for the planned pipeline
# on the training split, with cv=3 and a budget of 5 evaluations; the result
# replaces the previously trained pipeline.
pipeline_trained = pipeline_planned.auto_configure(
    train_X,
    train_y,
    Hyperopt,
    cv=3,
    max_evals=5,
)
100%|█████████| 5/5 [00:52<00:00, 10.58s/trial, best loss: -0.7507273649370062]
CPU times: user 55.7 s, sys: 3.28 s, total: 59 s
Wall time: 54.6 s
# Draw the pipeline returned by auto_configure.
pipeline_trained.visualize()
# Show the same pipeline as Python source; show_imports=False leaves the
# import lines out of the listing.
pipeline_trained.pretty_print(ipython_display=True, show_imports=False)
# NOTE(review): this listing appears to be the source displayed by the
# pretty_print call above (no imports, matching show_imports=False), i.e. the
# configuration selected by the search rather than a hand-written cell.
# Numeric columns: L1 normalization.
norm = Norm(norm="l1")
column_transformer = ColumnTransformer(
transformers=[
(
"num_tfm",
norm,
[
"duration", "credit_amount", "installment_commitment",
"residence_since", "age", "existing_credits", "num_dependents",
],
),
(
"cat_tfm",
OneHot(),
[
"checking_status", "credit_history", "purpose",
"savings_status", "employment", "personal_status",
"other_parties", "property_magnitude", "other_payment_plans",
"housing", "job", "own_telephone", "foreign_worker",
],
),
]
)
# Classifier: LinearSVC with dual=False as pinned in the planned pipeline;
# the remaining arguments are concrete values for this configuration.
linear_svc = LinearSVC(
dual=False,
C=16757.615906506046,
fit_intercept=False,
tol=0.0006905134087360421,
)
# Same two-stage shape as before: column-wise preprocessing, then classifier.
pipeline = column_transformer >> linear_svc
# Score the auto-configured pipeline on the held-out split, using the same
# accuracy report format as for the first pipeline.
predictions = pipeline_trained.predict(test_X)
accuracy = sklearn.metrics.accuracy_score(test_y, predictions)
print(f'accuracy {accuracy:.1%}')
accuracy 72.1%