#!/usr/bin/env python
# coding: utf-8
"""Notebook export: a tour of classifiers on the iris, ijcnn1 and covertype data.

Sections, in order: Naive Bayes, SVM, RandomForest / ExtraTrees, probability
calibration, AdaBoost, GradientBoosting, XGBoost (CPU / GPU), LightGBM
(CPU / GPU) and CatBoost (CPU / GPU).

The script calls ``get_ipython()`` for the ``%%time`` and ``%matplotlib``
magics, so it has to be executed by IPython / Jupyter rather than by plain
``python``.  The code of every ``%%time`` cell lives in the string passed to
``run_cell_magic`` and runs in the same namespace as the rest of the script.
"""

# In[1]:

from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
    cross_val_score,
)
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
)
import numpy as np
import warnings

# Silences every warning for the rest of the run (including deprecation
# warnings raised by the libraries used below).
warnings.filterwarnings('ignore')

# ## Naive bayes

# In[39]:

from sklearn.datasets import load_iris

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=0)
gauss_clf = GaussianNB()
multi_clf = MultinomialNB()
bernl_clf = BernoulliNB()
gauss_clf.fit(X_train, y_train)
multi_clf.fit(X_train, y_train)
bernl_clf.fit(X_train, y_train)

# In[40]:

y_pred_gauss = gauss_clf.predict(X_test)
y_pred_multi = multi_clf.predict(X_test)
y_pred_bernl = bernl_clf.predict(X_test)

# In[41]:

print(classification_report(y_test, y_pred_gauss))
print(classification_report(y_test, y_pred_multi))
print(classification_report(y_test, y_pred_bernl))

# ## SVM

# In[42]:

from sklearn.datasets import load_svmlight_file

svc = SVC(kernel='rbf', random_state=101)
# NOTE: rebinds X_train / y_train, replacing the iris split created above.
X_train, y_train = load_svmlight_file('data_set/ijcnn1.bz2')

# In[43]:

# Baseline: 5-fold cross-validated accuracy of the default RBF SVC.
get_ipython().run_cell_magic(
    'time', '',
    "scores = cross_val_score(svc,\n"
    " X_train,\n"
    " y_train,\n"
    " cv=5,\n"
    " scoring='accuracy',\n"
    " n_jobs=-1)\n"
    "print(\n"
    " 'SVC with rbf kernel -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n"
    " .format(np.mean(scores), np.std(scores)))\n")

# In[44]:

# Randomized search over C and gamma.  The `iid` argument is intentionally
# not passed: it was removed from scikit-learn's search estimators, and
# passing it makes the constructor fail on current releases.
get_ipython().run_cell_magic(
    'time', '',
    "svc_new = SVC(kernel='rbf', random_state=101)\n"
    "search_dict = {\n"
    " 'C': [0.01, 0.1, 1, 10, 100],\n"
    " 'gamma': [0.1, 0.01, 0.001, 0.0001]\n"
    "}\n"
    "search_func = RandomizedSearchCV(estimator=svc_new,\n"
    " param_distributions=search_dict,\n"
    " n_iter=10,\n"
    " scoring='accuracy',\n"
    " n_jobs=-1,\n"
    " refit=True,\n"
    " cv=5,\n"
    " random_state=101)\n"
    "search_func.fit(X_train, y_train)\n"
    "print('Best parameters {}'.format(search_func.best_params_))\n"
    "print('Cross validation accuracy: mean= {:.4f}'.format(search_func.best_score_))\n")

# In[45]:

# Fit the tuned SVC and cross-validate *it*, so that the printed mean / std
# describe `svc_best` and not the `scores` left over from the default SVC.
# NOTE(review): C=100, gamma=0.1 are hard-coded; presumably they are the best
# parameters reported by the search above -- confirm.
get_ipython().run_cell_magic(
    'time', '',
    "svc_best = SVC(C=100, gamma=0.1, kernel='rbf', random_state=101)\n"
    "svc_best.fit(X_train, y_train)\n"
    "scores = cross_val_score(svc_best,\n"
    " X_train,\n"
    " y_train,\n"
    " cv=5,\n"
    " scoring='accuracy',\n"
    " n_jobs=-1)\n"
    "print(\n"
    " 'SVC with rbf kernel -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n"
    " .format(np.mean(scores), np.std(scores)))\n")

# ## RandomForest ExtraTrees

# In[86]:

from sklearn.datasets import fetch_covtype

covertype = fetch_covtype()
print(covertype.data.shape)

# In[87]:

covertype_x = covertype.data
covertype_y = covertype.target

# 60 % train, then the remaining 40 % is halved into test and validation.
covertype_x_train, covertype_x_test_val, covertype_y_train, covertype_y_test_val = train_test_split(
    covertype_x, covertype_y, test_size=0.4, random_state=42)
covertype_x_test, covertype_x_val, covertype_y_test, covertype_y_val = train_test_split(
    covertype_x_test_val, covertype_y_test_val, test_size=0.5, random_state=42)

covertypes = [
    'Spruce/Fir',
    'Lodgepole Pine',
    'Ponderosa Pine',
    'Cottonwood/Willow',
    'Aspen',
    'Douglas-fir',
    'Krummholz',
]

# In[88]:

print(covertype_x_train.shape)
print(covertype_x_val.shape)
print(covertype_x_test.shape)

# In[80]:

get_ipython().run_cell_magic(
    'time', '',
    "rfc = RandomForestClassifier(n_estimators=100, random_state=101)\n"
    "scores = cross_val_score(rfc,\n"
    " covertype_x_train,\n"
    " covertype_y_train,\n"
    " cv=5,\n"
    " scoring='accuracy',\n"
    " n_jobs=-1)\n"
    "print(\n"
    " 'RandomForestClassifier -> cross validation accurary: mean = {:.4f}, std = {:.4f}'\n"
    " .format(np.mean(scores), np.std(scores)))\n")

# In[81]:

# Per-fold accuracies (a bare expression would show nothing in a script).
print(scores)

# In[82]:

get_ipython().run_cell_magic(
    'time', '',
    "etc = ExtraTreesClassifier(n_estimators=100, random_state=101)\n"
    "scores = cross_val_score(etc,\n"
    " covertype_x_train,\n"
    " covertype_y_train,\n"
    " cv=5,\n"
    " scoring='accuracy',\n"
    " n_jobs=-1)\n"
    "print(\n"
    " 'ExtraTreesClassifier -> cross validation accurary: mean = {:.4f}, std = {:.4f}'\n"
    " .format(np.mean(scores), np.std(scores)))\n")

# In[83]:

print(scores)

# ## CalibrationClassifierCV

# In[13]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve

# In[21]:

rfc = RandomForestClassifier(n_estimators=100, random_state=101)
calibration = CalibratedClassifierCV(rfc, method='sigmoid', cv=5)

# In[84]:

rfc.fit(covertype_x_train, covertype_y_train)
calibration.fit(covertype_x_train, covertype_y_train)
prob_raw = rfc.predict_proba(covertype_x_test)
prob_cal = calibration.predict_proba(covertype_x_test)

# In[85]:

get_ipython().run_line_magic('matplotlib', 'inline')

# Position of the class in `covertypes`, used as the predict_proba column.
tree_kind = covertypes.index('Ponderosa Pine')
probs = pd.DataFrame({
    'raw': prob_raw[:, tree_kind],
    'calibrated': prob_cal[:, tree_kind],
})
print(probs)

# In[86]:

# Raw vs. calibrated probability for the selected class.
plot = probs.plot(kind='scatter',
                  x='raw',
                  y='calibrated',
                  s=64,
                  c='blue',
                  edgecolors='white')

# ## AdaBoost

# In[87]:

get_ipython().run_cell_magic(
    'time', '',
    "adbc = AdaBoostClassifier(n_estimators=300, random_state=101)\n"
    "scores = cross_val_score(adbc,\n"
    " covertype_x_train,\n"
    " covertype_y_train,\n"
    " cv=5,\n"
    " scoring='accuracy',\n"
    " n_jobs=-1)\n"
    "print(\n"
    " 'Adaboost -> cross validation accurary: mean = {:.4f}, std = {:.4f}'.format(\n"
    " np.mean(scores), np.std(scores)))\n")

# In[88]:

print(scores)

# ## GradientBoost

# In[89]:

get_ipython().run_cell_magic(
    'time', '',
    "gbc = GradientBoostingClassifier(max_depth=5, n_estimators=50, random_state=101)\n"
    "gbc.fit(covertype_x_train, covertype_y_train)\n")

# In[90]:

# Evaluate `gbc`, the model fitted in this section, on the validation split.
print(accuracy_score(covertype_y_val, gbc.predict(covertype_x_val)))

# In[91]:

# ... and on the held-out test split.
print(accuracy_score(covertype_y_test, gbc.predict(covertype_x_test)))

# ## XGBoost

# In[92]:

import xgboost as xgb

# NOTE(review): this section targets the xgboost API that accepts
# `eval_metric` / `early_stopping_rounds` in fit() and 1-based class labels;
# newer releases may need those moved to the constructor and 0-based labels
# -- confirm against the installed version.
xgb_model = xgb.XGBClassifier(objective='multi:softprob',
                              max_depth=24,
                              gamma=0.1,
                              subsample=0.9,
                              learning_rate=0.01,
                              n_estimators=500,
                              nthread=-1)

# In[93]:

get_ipython().run_cell_magic(
    'time', '',
    "xgb_model.fit(covertype_x_train,\n"
    " covertype_y_train,\n"
    " eval_set=[(covertype_x_val, covertype_y_val)],\n"
    " eval_metric='merror',\n"
    " early_stopping_rounds=25,\n"
    " verbose=True)\n")

# In[94]:

# Predict once and reuse the result for all three reports below.
xgb_test_pred = xgb_model.predict(covertype_x_test)
print(accuracy_score(covertype_y_test, xgb_test_pred))

# In[95]:

print(confusion_matrix(covertype_y_test, xgb_test_pred))

# In[96]:

print(classification_report(covertype_y_test, xgb_test_pred))

# ## XGBoost gpu vs cpu

# In[89]:

xgb_cpu = xgb.XGBClassifier(objective='multi:softprob',
                            max_depth=8,
                            gamma=0.1,
                            subsample=0.9,
                            learning_rate=0.01,
                            n_estimators=200,
                            nthread=-1)

# In[90]:

get_ipython().run_cell_magic(
    'time', '',
    "xgb_cpu.fit(covertype_x_train,\n"
    " covertype_y_train,\n"
    " eval_set=[(covertype_x_val, covertype_y_val)],\n"
    " eval_metric='merror',\n"
    " early_stopping_rounds=25,\n"
    " verbose=True)\n")

# In[102]:

# Same hyper-parameters as `xgb_cpu`; only the tree method / device differ.
xgb_gpu = xgb.XGBClassifier(objective='multi:softprob',
                            max_depth=8,
                            gamma=0.1,
                            subsample=0.9,
                            learning_rate=0.01,
                            n_estimators=200,
                            nthread=-1,
                            tree_method='gpu_hist',
                            gpu_id=0)

# In[103]:

get_ipython().run_cell_magic(
    'time', '',
    "xgb_gpu.fit(covertype_x_train,\n"
    " covertype_y_train,\n"
    " eval_set=[(covertype_x_val, covertype_y_val)],\n"
    " eval_metric='merror',\n"
    " early_stopping_rounds=25,\n"
    " verbose=True)\n")

# ## LightGBM

# In[7]:

# Shift the labels down by one; the predictions further below are argmax
# column indices and are compared against these shifted labels.
covertype_y_train_1 = covertype_y_train - 1
covertype_y_val_1 = covertype_y_val - 1
covertype_y_test_1 = covertype_y_test - 1

# In[7]:

import lightgbm as lgb

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': len(np.unique(covertype_y)),
    'metric': 'multi_logloss',
    'learning_rate': 0.01,
    'max_depth': 128,
    'num_leaves': 256,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
    'device': 'cpu',
}

# In[10]:

train_data = lgb.Dataset(data=covertype_x_train, label=covertype_y_train_1)
val_data = lgb.Dataset(data=covertype_x_val, label=covertype_y_val_1)

# In[10]:

bst = lgb.train(params,
                train_data,
                num_boost_round=2500,
                valid_sets=val_data,
                verbose_eval=500,
                early_stopping_rounds=25)

# In[15]:

lgb_cv = lgb.cv(params,
                train_data,
                num_boost_round=2500,
                nfold=3,
                shuffle=True,
                verbose_eval=250,
                early_stopping_rounds=25)

# In[17]:

# Index of the first minimum of the mean CV log-loss.
nround = int(np.argmin(lgb_cv['multi_logloss-mean']))
print(nround)

# In[13]:

y_probs = bst.predict(covertype_x_test, num_iteration=bst.best_iteration)
y_preds = np.argmax(y_probs, axis=1)
print(accuracy_score(covertype_y_test_1, y_preds))
print(confusion_matrix(covertype_y_test_1, y_preds))

# In[14]:

print(classification_report(covertype_y_test_1, y_preds))

# ## LightGBM GPU vs CPU

# In[17]:

params_cpu = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': len(np.unique(covertype_y)),
    'metric': 'multi_logloss',
    'learning_rate': 0.01,
    'max_depth': 128,
    'num_leaves': 256,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
    'device': 'cpu',
    # The dict used to list 'n_jobs' twice (8, then -1); the later entry is
    # the one Python keeps, so only that one is retained.
    'n_jobs': -1,
}

# In[18]:

get_ipython().run_cell_magic(
    'time', '',
    "bst_cpu = lgb.train(params_cpu,\n"
    " train_data,\n"
    " num_boost_round=500,\n"
    " valid_sets=val_data,\n"
    " verbose_eval=100,\n"
    " early_stopping_rounds=25)\n")

# In[19]:

# NOTE(review): the platform / device ids below look machine-specific --
# confirm before running on other hardware.
params_gpu = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': len(np.unique(covertype_y)),
    'metric': 'multi_logloss',
    'learning_rate': 0.01,
    'max_depth': 128,
    'num_leaves': 256,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
    'n_jobs': 8,
    'device': 'gpu',
    'gpu_platform_id': 2,
    'gpu_device_id': 1,
}

# In[20]:

get_ipython().run_cell_magic(
    'time', '',
    "bst_gpu = lgb.train(params_gpu,\n"
    " train_data,\n"
    " num_boost_round=500,\n"
    " valid_sets=val_data,\n"
    " verbose_eval=100,\n"
    " early_stopping_rounds=25)\n")

# ## CatBoost

# In[64]:

from catboost import CatBoostClassifier, Pool

covertype_dataset = fetch_covtype(random_state=101, shuffle=True)
label = covertype_dataset.target.astype(int) - 1

# Collapse columns 10-13 and columns 14-53 into a single integer code each
# (argmax over the group; presumably these are one-hot indicator columns --
# confirm), then pass both codes to CatBoost as string categorical features.
wilderness_area = np.argmax(covertype_dataset.data[:, 10:(10 + 4)], axis=1)
soil_type = np.argmax(covertype_dataset.data[:, (10 + 4):(10 + 4 + 40)],
                      axis=1)
data = (covertype_dataset.data[:, :10],
        wilderness_area.reshape(-1, 1).astype(str),
        soil_type.reshape(-1, 1).astype(str))
data = np.hstack(data)

# In[66]:

# Only the first 25,000 shuffled rows are used: 15k train, 5k validation,
# 5k test.  Columns 10 and 11 are the two categorical codes built above.
covertype_train = Pool(data=data[:15000, :],
                       label=label[:15000],
                       cat_features=[10, 11])
covertype_val = Pool(data=data[15000:20000, :],
                     label=label[15000:20000],
                     cat_features=[10, 11])
covertype_test_x = Pool(data=data[20000:25000, :], cat_features=[10, 11])
covertype_test_y = label[20000:25000]

# In[75]:

cbc_cpu = CatBoostClassifier(iterations=2500,
                             learning_rate=0.05,
                             depth=8,
                             custom_loss='Accuracy',
                             eval_metric='Accuracy',
                             use_best_model=True,
                             loss_function='MultiClass',
                             task_type='CPU',
                             thread_count=-1)

# In[76]:

cbc_cpu.fit(covertype_train, eval_set=covertype_val, verbose=500, plot=False)

# In[78]:

preds_class_cpu = cbc_cpu.predict(covertype_test_x)
preds_proba_cpu = cbc_cpu.predict_proba(covertype_test_x)

# In[79]:

print(accuracy_score(covertype_test_y, preds_class_cpu))
print(confusion_matrix(covertype_test_y, preds_class_cpu))

# In[80]:

print(classification_report(covertype_test_y, preds_class_cpu))

# In[81]:

# Same configuration as `cbc_cpu`, trained on the GPU.
cbc_gpu = CatBoostClassifier(iterations=2500,
                             learning_rate=0.05,
                             depth=8,
                             custom_loss='Accuracy',
                             eval_metric='Accuracy',
                             use_best_model=True,
                             loss_function='MultiClass',
                             task_type='GPU',
                             thread_count=-1)

# In[82]:

cbc_gpu.fit(covertype_train, eval_set=covertype_val, verbose=500, plot=False)

# In[83]:

preds_class_gpu = cbc_gpu.predict(covertype_test_x)
preds_proba_gpu = cbc_gpu.predict_proba(covertype_test_x)

# In[84]:

print(accuracy_score(covertype_test_y, preds_class_gpu))
print(confusion_matrix(covertype_test_y, preds_class_gpu))

# In[85]:

print(classification_report(covertype_test_y, preds_class_gpu))

# ## ThunderGBM

# In[11]:

# NOTE(review): the section is titled ThunderGBM but imports `thundersvm`,
# and nothing from the import is used below -- confirm which was intended.
import thundersvm

# In[4]:

# NOTE(review): `clf` is not defined anywhere in this script.
get_ipython().run_line_magic('pinfo', 'clf')

# In[7]:

get_ipython().system('pip list')

# In[ ]: