#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Required for accessing OpenML datasets from Lale.
# Use the %pip line magic instead of a bare shell `pip`: the magic runs pip
# for the interpreter of the current kernel, so the package lands in the
# environment this notebook actually imports from rather than in whichever
# `pip` happens to be first on PATH.
get_ipython().run_line_magic("pip", "install 'liac-arff>=2.4.0'")


# In[2]:


import lale.datasets.openml
import pandas as pd
# Fetch the 'credit-g' classification dataset through Lale's OpenML helper
# (preprocess=True), then unpack the result into the (X, y) pairs of the
# train and test splits.
credit_g_splits = lale.datasets.openml.fetch(
    'credit-g', 'classification', preprocess=True)
(train_X, train_y), (test_X, test_y) = credit_g_splits


# In[3]:


from sklearn.preprocessing import StandardScaler as Standard, MinMaxScaler as MinMax
from sklearn.decomposition import PCA
from sklearn.kernel_approximation import Nystroem as Nys
from lale.lib.lale import NoOp
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGBoost
from lale.lib.lale import TopKVotingClassifier
from sklearn.metrics import accuracy_score
from lale.lib.lale import Hyperopt
from sklearn.ensemble import VotingClassifier
# Must run after the imports above. Per the Lale documentation, this replaces
# the operator classes imported into this module (scikit-learn, XGBoost) with
# their Lale wrappers, which is presumably what lets the `|` and `>>`
# combinators be applied to them below -- confirm against the Lale docs.
lale.wrap_imported_operators()


# Hyperopt is currently the only supported optimizer. `args_to_optimizer`
# is a dictionary of keyword arguments that Hyperopt accepts, as documented at
# https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.hyperopt.html

# In[4]:


# Planned pipeline in three stages, each a choice (`|`) among alternatives,
# chained together with `>>`: scaling, feature transformation, classifier.
scaling_stage = NoOp | Standard | MinMax
transform_stage = NoOp | PCA | Nys
classifier_stage = LR | RF | XGBoost
planned_pipeline = scaling_stage >> transform_stage >> classifier_stage

# Settings handed to the optimizer through `args_to_optimizer`.
hyperopt_settings = {'max_evals': 25, 'scoring': 'accuracy'}

ensemble = TopKVotingClassifier(
    estimator=planned_pipeline,
    k=3,
    optimizer=Hyperopt,
    args_to_optimizer=hyperopt_settings,
)
ensemble.visualize()


# In[5]:


# Fit the ensemble on the training split. The object returned by fit() is
# kept under its own name, `trained_ensemble`, and is what the later cells
# use for prediction and pipeline inspection.
# NOTE(review): presumably this is where the Hyperopt search configured via
# `args_to_optimizer` runs, so this cell may take a while -- confirm.
trained_ensemble = ensemble.fit(train_X, train_y)


# In[6]:


# Note: you can also pass only the planned pipeline, as below; Hyperopt with
# its default settings is then used as the optimizer.
# NOTE(review): this rebinds `ensemble` to a new object on which fit() has
# not been called; `trained_ensemble` from the earlier cell is a separate
# name and is not replaced by this assignment.
ensemble = TopKVotingClassifier(estimator=planned_pipeline)


# In[7]:


# Predict on the held-out test split and report the accuracy against the
# true test labels.
predictions = trained_ensemble.predict(test_X)
test_accuracy = accuracy_score(test_y, predictions)
print(test_accuracy)


# In[8]:


# Retrieve the pipeline held by the trained ensemble and draw it.
# NOTE(review): get_pipeline() is called without arguments; presumably it
# returns the final voting ensemble built from the top-k pipelines rather
# than a single best trial -- confirm against the TopKVotingClassifier docs.
best_pipeline = trained_ensemble.get_pipeline()
best_pipeline.visualize()


# In[9]:


# Serialize the retrieved pipeline to its JSON representation.
# This is a bare expression: a notebook displays the value as the cell
# output, but when this file is run as a plain script the result is discarded.
best_pipeline.to_json()