#!/usr/bin/env python
# coding: utf-8

# In[1]:


from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')


# ## Naive Bayes

# In[39]:


from sklearn.datasets import load_iris

# Hold out 20% of the iris samples for testing; the fixed seed keeps the
# split reproducible.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=0)

# fit() returns the estimator itself, so each Naive Bayes variant is built
# and trained in a single expression.
gauss_clf = GaussianNB().fit(X_train, y_train)
multi_clf = MultinomialNB().fit(X_train, y_train)
bernl_clf = BernoulliNB().fit(X_train, y_train)


# In[40]:


# Test-set predictions of the three fitted Naive Bayes models, in the order
# Gaussian, Multinomial, Bernoulli.
y_pred_gauss, y_pred_multi, y_pred_bernl = (
    model.predict(X_test) for model in (gauss_clf, multi_clf, bernl_clf))


# In[41]:


# One classification report per model (Gaussian, Multinomial, Bernoulli),
# each terminated by a newline exactly as three separate print calls would.
print(*(classification_report(y_test, y_pred)
        for y_pred in (y_pred_gauss, y_pred_multi, y_pred_bernl)),
      sep='\n')


# ## SVM

# In[42]:


from sklearn.datasets import load_svmlight_file

# Load the ijcnn1 data from its bz2-compressed svmlight file. This rebinds
# X_train / y_train, replacing the iris split used in the Naive Bayes cells.
X_train, y_train = load_svmlight_file('data_set/ijcnn1.bz2')

# Baseline RBF-kernel SVM; C and gamma are left at their defaults.
svc = SVC(random_state=101, kernel='rbf')


# In[43]:


# Timed cell (%%time): 5-fold cross-validated accuracy of the baseline `svc` on
# the ijcnn1 data, run with n_jobs=-1. The per-fold results are bound to
# `scores` and their mean / standard deviation are printed.
get_ipython().run_cell_magic('time', '', "scores = cross_val_score(svc,\n                         X_train,\n                         y_train,\n                         cv=5,\n                         scoring='accuracy',\n                         n_jobs=-1)\nprint(\n    'SVC with rbf kernel -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n    .format(np.mean(scores), np.std(scores)))\n")


# In[44]:


# Timed cell (%%time): randomized hyper-parameter search over C and gamma for
# an RBF-kernel SVC. 10 parameter combinations are sampled and each is scored
# by 5-fold cross-validated accuracy; with refit=True the best combination is
# refitted on the full data. The best parameters and best mean CV accuracy are
# printed.
# The former `iid=True` argument is no longer passed: scikit-learn deprecated
# it in 0.22 and removed it in 0.24, where it raises a TypeError.
get_ipython().run_cell_magic('time', '', "svc_new = SVC(kernel='rbf', random_state=101)\nsearch_dict = {\n    'C': [0.01, 0.1, 1, 10, 100],\n    'gamma': [0.1, 0.01, 0.001, 0.0001]\n}\nsearch_func = RandomizedSearchCV(estimator=svc_new,\n                                 param_distributions=search_dict,\n                                 n_iter=10,\n                                 scoring='accuracy',\n                                 n_jobs=-1,\n                                 refit=True,\n                                 cv=5,\n                                 random_state=101)\nsearch_func.fit(X_train, y_train)\nprint('Best parameters {}'.format(search_func.best_params_))\nprint('Cross validation accuracy: mean= {:.4f}'.format(search_func.best_score_))\n")


# In[45]:


# Timed cell (%%time): evaluate and fit the tuned SVC (C=100, gamma=0.1 are
# hard-coded here rather than read from a search result).
# The cross-validation is now run on `svc_best` itself. Previously the cell
# only fitted `svc_best` and then printed the mean/std of whatever `scores`
# was left over from an earlier cell, so the reported accuracy did not belong
# to this model.
get_ipython().run_cell_magic('time', '', "svc_best = SVC(C=100, gamma=0.1, kernel='rbf', random_state=101)\nscores = cross_val_score(svc_best,\n                         X_train,\n                         y_train,\n                         cv=5,\n                         scoring='accuracy',\n                         n_jobs=-1)\nsvc_best.fit(X_train, y_train)\nprint(\n    'SVC with rbf kernel -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n    .format(np.mean(scores), np.std(scores)))\n")


# ## RandomForest ExtraTrees

# In[86]:


from sklearn.datasets import fetch_covtype

covertype = fetch_covtype()
covertype.data.shape  # bare expression: only displayed when run as a notebook cell


# In[87]:


covertype_x, covertype_y = covertype.data, covertype.target

# First carve off 40% of the rows, then halve that remainder into a test and a
# validation part (60 / 20 / 20 overall). Both splits use the same seed.
(covertype_x_train, covertype_x_test_val,
 covertype_y_train, covertype_y_test_val) = train_test_split(
     covertype_x, covertype_y, test_size=0.4, random_state=42)
(covertype_x_test, covertype_x_val,
 covertype_y_test, covertype_y_val) = train_test_split(
     covertype_x_test_val, covertype_y_test_val, test_size=0.5, random_state=42)

# Human-readable class names; presumably position i names label i + 1 --
# TODO confirm against the dataset description.
covertypes = [
    'Spruce/Fir',
    'Lodgepole Pine',
    'Ponderosa Pine',
    'Cottonwood/Willow',
    'Aspen',
    'Douglas-fir',
    'Krummholz',
]


# In[88]:


# Shapes of the train, validation and test feature matrices, one per line.
print(*(part.shape
        for part in (covertype_x_train, covertype_x_val, covertype_x_test)),
      sep='\n')


# In[80]:


# Timed cell (%%time): 5-fold cross-validated accuracy of a 100-tree random
# forest on the covertype training split. The per-fold results are bound to
# `scores` and their mean / standard deviation are printed.
# The printed label previously misspelled "accuracy" as "accurary".
get_ipython().run_cell_magic('time', '', "rfc = RandomForestClassifier(n_estimators=100, random_state=101)\nscores = cross_val_score(rfc,\n                         covertype_x_train,\n                         covertype_y_train,\n                         cv=5,\n                         scoring='accuracy',\n                         n_jobs=-1)\nprint(\n    'RandomForestClassifier -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n    .format(np.mean(scores), np.std(scores)))\n")


# In[81]:


# Bare expression: shows the per-fold accuracies when run as a notebook cell.
scores


# In[82]:


# Timed cell (%%time): 5-fold cross-validated accuracy of a 100-tree
# extra-trees ensemble on the covertype training split. The per-fold results
# are bound to `scores` and their mean / standard deviation are printed.
# The printed label previously misspelled "accuracy" as "accurary".
get_ipython().run_cell_magic('time', '', "etc = ExtraTreesClassifier(n_estimators=100, random_state=101)\nscores = cross_val_score(etc,\n                         covertype_x_train,\n                         covertype_y_train,\n                         cv=5,\n                         scoring='accuracy',\n                         n_jobs=-1)\nprint(\n    'ExtraTreesClassifier -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n    .format(np.mean(scores), np.std(scores)))\n")


# In[83]:


# Bare expression: shows the per-fold accuracies when run as a notebook cell.
scores


# ## CalibratedClassifierCV

# In[13]:


import pandas as pd
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve


# In[21]:


# An uncalibrated forest plus a sigmoid-calibrated wrapper (5-fold) built
# around the same forest configuration.
rfc = RandomForestClassifier(random_state=101, n_estimators=100)
calibration = CalibratedClassifierCV(rfc, cv=5, method='sigmoid')


# In[84]:


# Fit each model on the training split and keep its class probabilities for
# the test split: raw forest output first, calibrated output second.
prob_raw = rfc.fit(covertype_x_train, covertype_y_train).predict_proba(
    covertype_x_test)
prob_cal = calibration.fit(covertype_x_train, covertype_y_train).predict_proba(
    covertype_x_test)


# In[85]:


get_ipython().run_line_magic('matplotlib', 'inline')

# Position of 'Ponderosa Pine' in `covertypes`, used as the probability column
# to compare between the raw and the calibrated model.
tree_kind = covertypes.index('Ponderosa Pine')
probs = pd.DataFrame({
    'raw': prob_raw[:, tree_kind],
    'calibrated': prob_cal[:, tree_kind],
})
probs


# In[86]:


# Scatter of raw (x axis) against calibrated (y axis) probabilities.
plot = probs.plot.scatter(x='raw',
                          y='calibrated',
                          s=64,
                          c='blue',
                          edgecolors='white')


# ## AdaBoost

# In[87]:


# Timed cell (%%time): 5-fold cross-validated accuracy of a 300-estimator
# AdaBoost classifier on the covertype training split. The per-fold results
# are bound to `scores` and their mean / standard deviation are printed.
# The printed label previously misspelled "accuracy" as "accurary".
get_ipython().run_cell_magic('time', '', "adbc = AdaBoostClassifier(n_estimators=300, random_state=101)\nscores = cross_val_score(adbc,\n                         covertype_x_train,\n                         covertype_y_train,\n                         cv=5,\n                         scoring='accuracy',\n                         n_jobs=-1)\nprint(\n    'Adaboost -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'.format(\n        np.mean(scores), np.std(scores)))\n")


# In[88]:


# Bare expression: shows the per-fold accuracies when run as a notebook cell.
scores


# ## GradientBoost

# In[89]:


# Timed cell (%%time): build a gradient boosting classifier (50 estimators,
# max depth 5, fixed seed) as `gbc` and fit it on the covertype training split.
get_ipython().run_cell_magic('time', '', 'gbc = GradientBoostingClassifier(max_depth=5, n_estimators=50, random_state=101)\ngbc.fit(covertype_x_train, covertype_y_train)\n')


# In[90]:


# Validation accuracy of the gradient boosting model `gbc`. These cells used
# to call `rfc.predict`, i.e. they scored the random forest from the
# calibration section instead of the model trained in this section.
accuracy_score(covertype_y_val, gbc.predict(covertype_x_val))


# In[91]:


# Test accuracy of the same gradient boosting model.
accuracy_score(covertype_y_test, gbc.predict(covertype_x_test))


# ## XGBoost

# In[92]:


import xgboost as xgb

# Multiclass XGBoost model using the soft-probability objective.
# The keyword is `objective`; it was previously misspelled `object`, so the
# setting was not passed as the estimator's objective parameter.
xgb_model = xgb.XGBClassifier(objective='multi:softprob',
                              max_depth=24,
                              gamma=0.1,
                              subsample=0.9,
                              learning_rate=0.01,
                              n_estimators=500,
                              nthread=-1)


# In[93]:


# Timed cell (%%time): fit `xgb_model` on the training split while monitoring
# the 'merror' metric on the validation split; early_stopping_rounds=25 is
# passed so training can stop before all boosting rounds are used.
get_ipython().run_cell_magic('time', '', "xgb_model.fit(covertype_x_train,\n              covertype_y_train,\n              eval_set=[(covertype_x_val, covertype_y_val)],\n              eval_metric='merror',\n              early_stopping_rounds=25,\n              verbose=True)\n")


# In[94]:


# Predict the test split once and reuse the result for all three metrics; the
# same prediction used to be recomputed for every metric.
xgb_test_pred = xgb_model.predict(covertype_x_test)
accuracy_score(covertype_y_test, xgb_test_pred)


# In[95]:


confusion_matrix(covertype_y_test, xgb_test_pred)


# In[96]:


print(classification_report(covertype_y_test, xgb_test_pred))


# ## XGBoost GPU vs CPU

# In[89]:


import xgboost as xgb

# CPU model for the CPU-vs-GPU timing comparison; apart from the tree method
# and device settings it is configured like `xgb_gpu`.
# The keyword is `objective`; it was previously misspelled `object`.
xgb_cpu = xgb.XGBClassifier(objective='multi:softprob',
                            max_depth=8,
                            gamma=0.1,
                            subsample=0.9,
                            learning_rate=0.01,
                            n_estimators=200,
                            nthread=-1)


# In[90]:


# Timed cell (%%time): fit the CPU model on the training split, monitoring
# 'merror' on the validation split with early_stopping_rounds=25.
get_ipython().run_cell_magic('time', '', "xgb_cpu.fit(covertype_x_train,\n            covertype_y_train,\n            eval_set=[(covertype_x_val, covertype_y_val)],\n            eval_metric='merror',\n            early_stopping_rounds=25,\n            verbose=True)\n")


# In[102]:


# GPU model for the timing comparison: same hyper-parameters as `xgb_cpu`, but
# with the 'gpu_hist' tree method on GPU device 0.
# The keyword is `objective`; it was previously misspelled `object`.
xgb_gpu = xgb.XGBClassifier(objective='multi:softprob',
                            max_depth=8,
                            gamma=0.1,
                            subsample=0.9,
                            learning_rate=0.01,
                            n_estimators=200,
                            nthread=-1,
                            tree_method='gpu_hist',
                            gpu_id=0)


# In[103]:


# Timed cell (%%time): fit the GPU model with the same data, metric and
# early-stopping settings as the CPU run so the timings are comparable.
get_ipython().run_cell_magic('time', '', "xgb_gpu.fit(covertype_x_train,\n            covertype_y_train,\n            eval_set=[(covertype_x_val, covertype_y_val)],\n            eval_metric='merror',\n            early_stopping_rounds=25,\n            verbose=True)\n")


# ## LightGBM

# In[7]:


# Copies of the three label vectors shifted down by one, used as the labels
# for the LightGBM datasets and metrics below.
covertype_y_train_1, covertype_y_val_1, covertype_y_test_1 = (
    labels - 1
    for labels in (covertype_y_train, covertype_y_val, covertype_y_test))


# In[7]:


import lightgbm as lgb

# LightGBM training configuration for the multiclass covertype problem;
# num_class is the number of distinct labels in the full target vector.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='multiclass',
    num_class=np.unique(covertype_y).size,
    metric='multi_logloss',
    learning_rate=0.01,
    max_depth=128,
    num_leaves=256,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=10,
    device='cpu',
)


# In[10]:


# Wrap the training and validation splits, with their shifted labels, as
# LightGBM datasets.
train_data = lgb.Dataset(covertype_x_train, label=covertype_y_train_1)
val_data = lgb.Dataset(covertype_x_val, label=covertype_y_val_1)


# In[10]:


# Train for at most 2500 rounds, evaluating on the validation set; progress is
# logged every 500 rounds and early_stopping_rounds is set to 25.
bst = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=val_data,
    num_boost_round=2500,
    early_stopping_rounds=25,
    verbose_eval=500,
)


# In[15]:


# 3-fold shuffled cross-validation with the same parameters and training data;
# progress is logged every 250 rounds and early_stopping_rounds is set to 25.
lgb_cv = lgb.cv(
    params=params,
    train_set=train_data,
    nfold=3,
    shuffle=True,
    num_boost_round=2500,
    early_stopping_rounds=25,
    verbose_eval=250,
)


# In[17]:


# 0-based position of the first minimum in the mean CV multi-logloss history.
nround = int(np.argmin(lgb_cv['multi_logloss-mean']))
nround


# In[13]:


# Class probabilities from the booster at its best iteration; the predicted
# class is the column with the highest probability.
y_probs = bst.predict(covertype_x_test, num_iteration=bst.best_iteration)
y_preds = y_probs.argmax(axis=1)
print(accuracy_score(covertype_y_test_1, y_preds),
      confusion_matrix(covertype_y_test_1, y_preds),
      sep='\n')


# In[14]:


print(classification_report(y_true=covertype_y_test_1, y_pred=y_preds))


# ## LightGBM GPU vs CPU

# In[17]:


import lightgbm as lgb

# LightGBM configuration for the CPU side of the CPU-vs-GPU timing comparison.
# The dict literal used to list 'n_jobs' twice (8, then -1). Python keeps only
# the last value for a repeated key, so the effective setting was already -1;
# the dead first entry has been removed. The resulting dict is unchanged.
params_cpu = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': len(np.unique(covertype_y)),
    'metric': 'multi_logloss',
    'learning_rate': 0.01,
    'max_depth': 128,
    'num_leaves': 256,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
    'n_jobs': -1,
    'device': 'cpu',
}


# In[18]:


# Timed cell (%%time): train the CPU booster for at most 500 rounds with
# `params_cpu`, evaluating on `val_data`; progress is logged every 100 rounds
# and early_stopping_rounds is set to 25.
get_ipython().run_cell_magic('time', '', 'bst_cpu = lgb.train(params_cpu,\n                    train_data,\n                    num_boost_round=500,\n                    valid_sets=val_data,\n                    verbose_eval=100,\n                    early_stopping_rounds=25)\n')


# In[19]:


# LightGBM configuration for the GPU side of the timing comparison.
# NOTE(review): the platform / device ids are specific to the machine this was
# run on -- confirm them before running elsewhere.
params_gpu = dict(
    task='train',
    boosting_type='gbdt',
    objective='multiclass',
    num_class=np.unique(covertype_y).size,
    metric='multi_logloss',
    learning_rate=0.01,
    max_depth=128,
    num_leaves=256,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=10,
    n_jobs=8,
    device='gpu',
    gpu_platform_id=2,
    gpu_device_id=1,
)


# In[20]:


# Timed cell (%%time): train the GPU booster with `params_gpu`, using the same
# data, round limit, logging interval and early-stopping setting as the CPU
# run so the timings are comparable.
get_ipython().run_cell_magic('time', '', 'bst_gpu = lgb.train(params_gpu,\n                    train_data,\n                    num_boost_round=500,\n                    valid_sets=val_data,\n                    verbose_eval=100,\n                    early_stopping_rounds=25)\n')


# ## CatBoost

# In[64]:


from catboost import CatBoostClassifier, Pool

# Reload the covertype data, this time shuffled with a fixed seed, and shift
# the integer labels down by one.
covertype_dataset = fetch_covtype(random_state=101, shuffle=True)
label = covertype_dataset.target.astype(int) - 1

# Collapse columns 10-13 and 14-53 into one integer code per row via argmax
# (presumably these are one-hot indicator groups -- TODO confirm).
wilderness_area = covertype_dataset.data[:, 10:14].argmax(axis=1)
soil_type = covertype_dataset.data[:, 14:54].argmax(axis=1)

# Feature matrix: the first 10 columns followed by the two codes rendered as
# strings. NOTE(review): stacking float and str arrays presumably yields an
# all-string array -- confirm this is what the Pool below should receive.
data = np.hstack((
    covertype_dataset.data[:, :10],
    wilderness_area.astype(str).reshape(-1, 1),
    soil_type.astype(str).reshape(-1, 1),
))


# In[66]:


# Rows 0-14999 train, 15000-19999 validate, 20000-24999 test. Columns 10 and
# 11 are declared categorical in every pool. The test pool carries no labels;
# they are kept separately in `covertype_test_y`.
covertype_train = Pool(data[:15000, :], label[:15000], cat_features=[10, 11])
covertype_val = Pool(data=data[15000:20000, :],
                     label=label[15000:20000],
                     cat_features=[10, 11])
covertype_test_x = Pool(data=data[20000:25000, :],
                        label=None,
                        cat_features=[10, 11])
covertype_test_y = label[20000:25000]


# In[75]:


# CPU CatBoost model: multiclass loss, accuracy as the evaluation metric, and
# use_best_model=True with the validation pool passed to fit().
cbc_cpu = CatBoostClassifier(task_type='CPU',
                             thread_count=-1,
                             loss_function='MultiClass',
                             eval_metric='Accuracy',
                             custom_loss='Accuracy',
                             use_best_model=True,
                             iterations=2500,
                             learning_rate=0.05,
                             depth=8)


# In[76]:


cbc_cpu.fit(covertype_train, eval_set=covertype_val, plot=False, verbose=500)


# In[78]:


# Predicted classes and class probabilities of the CPU model on the test pool.
preds_class_cpu, preds_proba_cpu = (
    cbc_cpu.predict(covertype_test_x),
    cbc_cpu.predict_proba(covertype_test_x),
)


# In[79]:


# Accuracy followed by the confusion matrix, one after the other.
print(accuracy_score(covertype_test_y, preds_class_cpu),
      confusion_matrix(covertype_test_y, preds_class_cpu),
      sep='\n')


# In[80]:


print(classification_report(y_true=covertype_test_y, y_pred=preds_class_cpu))


# In[81]:


# GPU CatBoost model: same settings as `cbc_cpu` except for task_type.
cbc_gpu = CatBoostClassifier(task_type='GPU',
                             thread_count=-1,
                             loss_function='MultiClass',
                             eval_metric='Accuracy',
                             custom_loss='Accuracy',
                             use_best_model=True,
                             iterations=2500,
                             learning_rate=0.05,
                             depth=8)


# In[82]:


cbc_gpu.fit(covertype_train, eval_set=covertype_val, plot=False, verbose=500)


# In[83]:


# Predicted classes and class probabilities of the GPU model on the test pool.
preds_class_gpu, preds_proba_gpu = (
    cbc_gpu.predict(covertype_test_x),
    cbc_gpu.predict_proba(covertype_test_x),
)


# In[84]:


# Accuracy followed by the confusion matrix, one after the other.
print(accuracy_score(covertype_test_y, preds_class_gpu),
      confusion_matrix(covertype_test_y, preds_class_gpu),
      sep='\n')


# In[85]:


print(classification_report(y_true=covertype_test_y, y_pred=preds_class_gpu))


# ## ThunderGBM

# In[11]:


# NOTE(review): the section heading says ThunderGBM, but the module imported
# here is thundersvm -- confirm which library was intended.
import thundersvm


# In[4]:


# Asks IPython for help on `clf` (the `clf?` magic). No `clf` is assigned in
# the visible part of this file -- TODO confirm this cell is leftover scratch
# work.
get_ipython().run_line_magic('pinfo', 'clf')


# In[7]:


# Shell escape: lists the packages installed in the current environment.
get_ipython().system('pip list')


# In[ ]: