from google.colab import drive
import os

# Mount Google Drive and make it the working directory.
# (The original imported `os` twice; the duplicate import was removed.)
drive.mount('/content/drive')
print(os.getcwd())
os.chdir("/content/drive/My Drive")
print(os.getcwd())
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True). /content/drive/My Drive /content/drive/My Drive
import os
import numpy as np # linear algebra
import pandas as pd #
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
%matplotlib inline
# Fix the NumPy RNG seed so results are reproducible
np.random.seed(7)
# Load the training data, using the first CSV column (PassengerId) as the index
train = pd.read_csv('train_x.csv', index_col=0)
# Split off the target column; everything else is a feature
y = train['Survived']
features = train.drop(columns=['Survived'])
features.head()
Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|
PassengerId | ||||||||||
1 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
2 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
4 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
5 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
y
PassengerId 1 0 2 1 3 1 4 1 5 0 .. 887 0 888 1 889 0 890 1 891 0 Name: Survived, Length: 891, dtype: int64
# Drop problematic columns: Cabin (mostly null) and Name (nulls plus free text)
features = features.drop(columns=['Cabin', 'Name'])
# Names of the object-dtype (string/categorical) columns
objects = [name for name in features.columns if features[name].dtype == "object"]
objects
['Sex', 'Ticket', 'Embarked']
# Replace nulls in the object columns with the literal string 'None'
features.update(features[objects].fillna('None'))
# Every numeric dtype we expect to encounter in this dataset
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# Names of the numeric columns
numerics = [name for name in features.columns if features[name].dtype in numeric_dtypes]
numerics
['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
# Replace nulls in the numeric columns with 0
features.update(features[numerics].fillna(0))
features.info()  # summary of the cleaned dataset
<class 'pandas.core.frame.DataFrame'> Int64Index: 891 entries, 1 to 891 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pclass 891 non-null int64 1 Sex 891 non-null object 2 Age 891 non-null float64 3 SibSp 891 non-null int64 4 Parch 891 non-null int64 5 Ticket 891 non-null object 6 Fare 891 non-null float64 7 Embarked 891 non-null object dtypes: float64(2), int64(3), object(3) memory usage: 62.6+ KB
X = pd.get_dummies(features) # One-hot encode the remaining object columns
# 70/30 train/validation split with a fixed seed for reproducibility
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size=0.70,test_size=0.30,random_state=0)
X
Pclass | Age | SibSp | Parch | Fare | Sex_female | Sex_male | Ticket_110152 | Ticket_110413 | Ticket_110465 | ... | Ticket_W./C. 6607 | Ticket_W./C. 6608 | Ticket_W./C. 6609 | Ticket_W.E.P. 5734 | Ticket_W/C 14208 | Ticket_WE/P 5735 | Embarked_C | Embarked_None | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PassengerId | |||||||||||||||||||||
1 | 3 | 22.0 | 1 | 0 | 7.2500 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 1 | 38.0 | 1 | 0 | 71.2833 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 3 | 26.0 | 0 | 0 | 7.9250 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 1 | 35.0 | 1 | 0 | 53.1000 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 | 3 | 35.0 | 0 | 0 | 8.0500 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
887 | 2 | 27.0 | 0 | 0 | 13.0000 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
888 | 1 | 19.0 | 0 | 0 | 30.0000 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
889 | 3 | 0.0 | 1 | 2 | 23.4500 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
890 | 1 | 26.0 | 0 | 0 | 30.0000 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
891 | 3 | 32.0 | 0 | 0 | 7.7500 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
891 rows × 692 columns
# DataFrame that will hold one accuracy score per (search strategy, model) pair
model_names = ['SGD', 'Ridge', 'KNN', 'SVM', 'Bagging', 'RndForest', 'LogReg', 'LGB']
cols = ['Case'] + model_names
resul = pd.DataFrame(columns=cols).set_index('Case')
# Start every strategy row at zero; scores get filled in later
for case in ('Standard', 'GridSearch', 'RandomSearch', 'Hyperopt'):
    resul.loc[case] = [0] * len(model_names)
resul.head()
SGD | Ridge | KNN | SVM | Bagging | RndForest | LogReg | LGB | |
---|---|---|---|---|---|---|---|---|
Case | ||||||||
Standard | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
GridSearch | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
RandomSearch | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Hyperopt | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Lo primero que vamos a hacer es definir nuestra función objetivo que debe devolver un diccionario al menos con las etiquetas 'loss' y 'status'.
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer
MAX_EVALS = 500  # maximum number of hyperopt evaluations (reassigned to 100 further down)
N_FOLDS = 10  # number of cross-validation folds used by lgb.cv
def objective(params, n_folds = N_FOLDS):
    """Objective function for Gradient Boosting Machine hyperparameter optimization.

    Runs n-fold cross-validation with LightGBM for the sampled hyperparameters,
    appends the result to the CSV at the module-level `out_file`, and returns a
    dictionary with at least the 'loss' and 'status' keys required by hyperopt.

    Relies on module-level state: ITERATION (call counter), train_set
    (lgb.Dataset) and out_file (path of the results CSV).
    """
    # Keep a running count of evaluations across calls
    global ITERATION
    ITERATION += 1
    # Retrieve subsample from the nested boosting_type choice; default to 1.0
    subsample = params['boosting_type'].get('subsample', 1.0)
    # Flatten the nested boosting_type dict into plain parameters
    params['boosting_type'] = params['boosting_type']['boosting_type']
    params['subsample'] = subsample
    # hp.quniform yields floats, so coerce the integer-valued parameters
    for parameter_name in ['num_leaves', 'subsample_for_bin',
                           'min_child_samples']:
        params[parameter_name] = int(params[parameter_name])
    start = timer()
    # Perform n_folds cross-validation
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000,
                        nfold = n_folds, early_stopping_rounds = 100,
                        metrics = 'auc', seed = 50)
    run_time = timer() - start
    # Best mean AUC across boosting rounds
    best_score = np.max(cv_results['auc-mean'])
    # hyperopt minimizes, so report 1 - AUC as the loss
    loss = 1 - best_score
    # Number of boosting rounds that produced the best CV score
    n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)
    # Append this trial to the CSV log. The original opened the file and
    # never closed it (handle leak); a context manager guarantees the close.
    with open(out_file, 'a') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, ITERATION, n_estimators,
                         run_time])
    # Dictionary with the evaluation information for hyperopt
    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'estimators': n_estimators, 'train_time': run_time,
            'status': STATUS_OK}
Espacio del Dominio: El Dominio representa el rango de valores que queremos evaluar para cada hiperparámetro. En cada iteración de la búsqueda, el algoritmo de optimización bayesiano elegirá un valor para cada hiperparámetro desde el espacio del domino. Cuando hacemos un Random Search o un Grid Search, el espacio del dominio es una cuadrícula (una tabla de valores establecidos). En la optimización bayesiana, la idea es la misma, excepto que este espacio tiene distribuciones de probabilidad para cada hiperparámetro en lugar de valores discretos.
from hyperopt import hp
# Search domain: one probability distribution per hyperparameter.
# boosting_type is a nested choice so that `subsample` is only sampled for
# gbdt/dart; goss does not support bagging, so its subsample is pinned to 1.0.
# Note `objective` later coerces the quniform-sampled values to int.
space = {
'class_weight': hp.choice('class_weight', [None, 'balanced']),
'boosting_type': hp.choice('boosting_type', [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
{'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
{'boosting_type': 'goss', 'subsample': 1.0}]),
'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
'learning_rate': hp.loguniform('learning_rate', np.log(0.01),np.log(0.2)),
'subsample_for_bin': hp.quniform('subsample_for_bin', 20000,300000,1000),
'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0)
}
Aquí se pueden usar diferentes tipos de distribución de dominio (se puede conseguir la lista completa de distribuciones en la documentación de hyperopt):
choice : variables categóricas
quniform : discretas uniformes (números enteros espaciados uniformemente)
uniform : continuas uniformes (floats espaciados uniformemente)
loguniform: logarítmicas continuas uniformes (floats espaciados uniformemente en una escala logaritmica)
from hyperopt import tpe
from hyperopt import Trials
# Optimization algorithm (Tree-structured Parzen Estimator)
tpe_algorithm = tpe.suggest
# Keeps a record of every trial's result
bayes_trials = Trials()
from hyperopt import fmin
# Iteration counter read/updated by `objective`
# (NOTE: `global` at module level is a no-op; the assignment below is what matters)
global ITERATION
ITERATION = 0
MAX_EVALS = 100  # overrides the earlier value of 500 to shorten the search
# Create the lgb Dataset consumed by lgb.cv inside `objective`
train_set = lgb.Dataset(X_train, label = y_train)
# File used to persist each trial's result as the search runs
out_file = './gbm_trials.csv'
# Write the CSV header; the context manager closes the file afterwards
with open(out_file, 'w') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])
# This takes quite a while to run (roughly 1.5 h in the captured output below)
# NOTE(review): recent hyperopt versions expect rstate=np.random.default_rng(50)
# instead of RandomState — confirm against the installed hyperopt version.
best = fmin(fn = objective, space = space, algo = tpe.suggest,
max_evals = MAX_EVALS, trials = bayes_trials,
rstate =np.random.RandomState(50))
5%|▌ | 5/100 [00:04<01:11, 1.34it/s, best loss: 0.13747126630679263]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
6%|▌ | 6/100 [01:44<54:17, 34.65s/it, best loss: 0.13747126630679263]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
8%|▊ | 8/100 [03:39<1:03:27, 41.38s/it, best loss: 0.13747126630679263]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
11%|█ | 11/100 [04:00<24:58, 16.84s/it, best loss: 0.13747126630679263]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
12%|█▏ | 12/100 [04:19<25:53, 17.66s/it, best loss: 0.13747126630679263]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
13%|█▎ | 13/100 [07:12<1:33:45, 64.66s/it, best loss: 0.13023161268556005]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
15%|█▌ | 15/100 [08:55<1:15:35, 53.36s/it, best loss: 0.13023161268556005]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
18%|█▊ | 18/100 [09:19<30:25, 22.26s/it, best loss: 0.13023161268556005]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
23%|██▎ | 23/100 [09:45<08:18, 6.48s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
45%|████▌ | 45/100 [10:31<00:52, 1.05it/s, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
48%|████▊ | 48/100 [12:16<13:49, 15.94s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
51%|█████ | 51/100 [14:29<20:57, 25.67s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
55%|█████▌ | 55/100 [15:54<11:45, 15.67s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
59%|█████▉ | 59/100 [17:28<09:15, 13.55s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
63%|██████▎ | 63/100 [17:51<03:49, 6.19s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
65%|██████▌ | 65/100 [20:32<21:34, 36.98s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
66%|██████▌ | 66/100 [23:34<45:33, 80.41s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
67%|██████▋ | 67/100 [27:12<1:06:52, 121.58s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
68%|██████▊ | 68/100 [29:39<1:08:58, 129.33s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
69%|██████▉ | 69/100 [32:48<1:16:04, 147.25s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
70%|███████ | 70/100 [35:27<1:15:26, 150.90s/it, best loss: 0.1295262033288349]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
71%|███████ | 71/100 [38:04<1:13:47, 152.66s/it, best loss: 0.1292669815564551]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
72%|███████▏ | 72/100 [40:48<1:12:46, 155.94s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
73%|███████▎ | 73/100 [43:00<1:06:58, 148.83s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
74%|███████▍ | 74/100 [44:58<1:00:27, 139.53s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
75%|███████▌ | 75/100 [47:15<57:50, 138.83s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
76%|███████▌ | 76/100 [48:58<51:13, 128.07s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
77%|███████▋ | 77/100 [50:55<47:47, 124.67s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
78%|███████▊ | 78/100 [53:17<47:37, 129.89s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
79%|███████▉ | 79/100 [56:04<49:24, 141.15s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
80%|████████ | 80/100 [57:36<42:09, 126.46s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
81%|████████ | 81/100 [1:01:03<47:38, 150.44s/it, best loss: 0.12762910481331535]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
82%|████████▏ | 82/100 [1:04:35<50:41, 168.98s/it, best loss: 0.12759834682860993]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
83%|████████▎ | 83/100 [1:08:12<51:56, 183.34s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
84%|████████▍ | 84/100 [1:08:32<35:48, 134.30s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
85%|████████▌ | 85/100 [1:11:26<36:33, 146.24s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
86%|████████▌ | 86/100 [1:12:19<27:36, 118.31s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
87%|████████▋ | 87/100 [1:12:39<19:15, 88.88s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
88%|████████▊ | 88/100 [1:14:59<20:50, 104.22s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
90%|█████████ | 90/100 [1:17:07<13:00, 78.04s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
91%|█████████ | 91/100 [1:19:07<13:35, 90.59s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
93%|█████████▎| 93/100 [1:20:48<07:39, 65.63s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
94%|█████████▍| 94/100 [1:22:43<08:02, 80.47s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
95%|█████████▌| 95/100 [1:25:05<08:13, 98.76s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
96%|█████████▌| 96/100 [1:26:46<06:37, 99.42s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
98%|█████████▊| 98/100 [1:31:03<03:25, 102.73s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
99%|█████████▉| 99/100 [1:32:49<01:43, 103.87s/it, best loss: 0.12692223346828602]
/usr/local/lib/python3.7/dist-packages/lightgbm/callback.py:189: UserWarning: Early stopping is not available in dart mode warnings.warn('Early stopping is not available in dart mode')
100%|██████████| 100/100 [1:33:07<00:00, 55.88s/it, best loss: 0.12692223346828602]
Esta función activa el proceso de búsqueda de la mejor combinación. Una vez finalizado el proceso, podemos tomar el objeto Trials (bayes_trials en nuestro caso) y analizar sus resultados:
# Sort the trials so the lowest loss (i.e. highest AUC) comes first
bayes_trials_results = sorted(bayes_trials.results,
                              key=lambda trial: trial['loss'])
bayes_trials_results[:2]
[{'estimators': 1390, 'iteration': 83, 'loss': 0.12692223346828602, 'params': {'boosting_type': 'dart', 'class_weight': 'balanced', 'colsample_bytree': 0.8617558102005193, 'learning_rate': 0.045091115529774406, 'min_child_samples': 40, 'num_leaves': 145, 'reg_alpha': 0.03906368016088817, 'reg_lambda': 0.8457944649575712, 'subsample': 0.5562695107489157, 'subsample_for_bin': 201000}, 'status': 'ok', 'train_time': 216.78248231699945}, {'estimators': 377, 'iteration': 82, 'loss': 0.12759834682860993, 'params': {'boosting_type': 'dart', 'class_weight': 'balanced', 'colsample_bytree': 0.941484025255672, 'learning_rate': 0.08594930906358782, 'min_child_samples': 40, 'num_leaves': 143, 'reg_alpha': 0.14142198699291497, 'reg_lambda': 0.5752120688348917, 'subsample': 0.5992524654733394, 'subsample_for_bin': 247000}, 'status': 'ok', 'train_time': 212.17623683700003}]
# Reload the trial log written during the search, put the best score first,
# and reset the index so positional lookups (results.loc[0, ...]) work.
results = (pd.read_csv('./gbm_trials.csv')
           .sort_values('loss', ascending=True)
           .reset_index(drop=True))
results.head()
loss | params | iteration | estimators | train_time | |
---|---|---|---|---|---|
0 | 0.126922 | {'boosting_type': 'dart', 'class_weight': 'bal... | 83 | 1390 | 216.782482 |
1 | 0.127598 | {'boosting_type': 'dart', 'class_weight': 'bal... | 82 | 377 | 212.176237 |
2 | 0.127629 | {'boosting_type': 'dart', 'class_weight': None... | 72 | 4473 | 163.547625 |
3 | 0.128016 | {'boosting_type': 'dart', 'class_weight': 'bal... | 81 | 484 | 206.337745 |
4 | 0.128408 | {'boosting_type': 'dart', 'class_weight': 'bal... | 97 | 299 | 256.564190 |
import ast
# The params column was serialized to the CSV as a string; safely parse it
# back into a Python dictionary (literal_eval never executes code)
ast.literal_eval(results.loc[0, 'params'])
{'boosting_type': 'dart', 'class_weight': 'balanced', 'colsample_bytree': 0.8617558102005193, 'learning_rate': 0.045091115529774406, 'min_child_samples': 40, 'num_leaves': 145, 'reg_alpha': 0.03906368016088817, 'reg_lambda': 0.8457944649575712, 'subsample': 0.5562695107489157, 'subsample_for_bin': 201000}