# Compare hyperparameter-search strategies (Grid / Randomized / HalvingGrid /
# HalvingRandom) for an XGBoost classifier on the credit-card fraud dataset.
# NOTE(review): written for Google Colab — mounts Drive and chdirs into it.
from google.colab import drive
import os

drive.mount('/content/gdrive')
print(os.getcwd())
os.chdir("/content/gdrive/My Drive")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv  # noqa: F401 (enables the Halving* estimators)
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV

# Load the dataset.
df = pd.read_csv('creditcard.csv')

# Drop the Time column (not used as a feature here).
df = df.drop(columns='Time')

# Standardize Amount to zero mean / unit variance.
df['Amount'] = (df['Amount'] - np.mean(df['Amount'])) / np.std(df['Amount'])
print(df.head())

# The problem is highly imbalanced: undersample the majority class so that
# negatives are 3x the number of positives.
df_ones = df[df['Class'] == 1]   # fraud rows
print(df_ones.shape)
df_zeros = df[df['Class'] == 0]  # non-fraud rows
# BUG FIX: pin random_state so the subsample (and every result below) is
# reproducible across runs; the original sampled a different subset each time.
df_zeros = df_zeros.sample(3 * df_ones.shape[0], random_state=42)
print(df_zeros.shape)

# BUG FIX: the original concatenated with np.concatenate, which funnels the
# frames through a single-dtype ndarray — every column, including the integer
# Class label, came back as float.  pd.concat preserves per-column dtypes.
df_final = pd.concat([df_ones, df_zeros], axis=0, ignore_index=True)
print(df_final.shape)
print(df_final.head())
print(df_final.isnull().sum())

# Split into features X and target y.
y = df_final.Class
X = df_final.drop(columns='Class', axis=1)
print(X.shape, y.shape)

# Hold out 30% for testing; stratify keeps the 3:1 class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
print(X_train.shape, X_test.shape)

model = xgb.XGBClassifier(learning_rate=0.001)

# BUG FIX: the original parameter sets used 'criterion' and 'splitter', which
# are DecisionTreeClassifier hyperparameters; XGBoost does not recognize them
# and silently ignores them, so the three "models" differed only in max_depth.
# Use genuine XGBoost hyperparameters instead.
params_1 = {'max_depth': 5, 'min_child_weight': 1, 'subsample': 1.0}
params_2 = {'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.8}
params_3 = {'max_depth': 10, 'min_child_weight': 10, 'subsample': 0.6}

# Model 1
model.set_params(**params_1).fit(X_train, y_train)
print(f'Accuracy para Modelo 1 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Model 2
model.set_params(**params_2).fit(X_train, y_train)
print(f'Accuracy para Modelo 2 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Model 3
model.set_params(**params_3).fit(X_train, y_train)
print(f'Accuracy para Modelo 3 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')

# Search space for the four tuners.
# BUG FIX: removed 'criterion' — not an XGBoost parameter (see note above);
# keeping it only multiplied the grid size by 2 with identical candidates.
params_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [5, 6, 7],
}

# Exhaustive grid search — observed runtime ~424 s.
grid_cv = GridSearchCV(model, params_grid, scoring="accuracy", n_jobs=-1, cv=3)
grid_cv.fit(X_train, y_train)
print("Mejores Parametros", grid_cv.best_params_)
print("Mejor CV score", grid_cv.best_score_)
print(f'Accuracy del modelo = {round(accuracy_score(y_test, grid_cv.predict(X_test)), 5)}')

# Randomized search — observed runtime ~7 s.
grid_cv = RandomizedSearchCV(model, params_grid, scoring="accuracy", n_jobs=-1, cv=3)
grid_cv.fit(X_train, y_train)
print("Mejores parametros", grid_cv.best_params_)
print("Mejor score de CV", grid_cv.best_score_)
print(f'Accuracy del modelo = {round(accuracy_score(y_test, grid_cv.predict(X_test)), 5)}')

# Successive-halving grid search — observed runtime ~125 s.
halving_cv = HalvingGridSearchCV(model, params_grid, scoring="accuracy", factor=3)
halving_cv.fit(X_train, y_train)
print("Mejores parametros", halving_cv.best_params_)
print("Mejor Score CV", halving_cv.best_score_)
print(f'Accuracy del modelo = {round(accuracy_score(y_test, halving_cv.predict(X_test)), 5)}')

# Successive-halving randomized search — observed runtime ~13 s.
halving_cv = HalvingRandomSearchCV(model, params_grid, scoring="accuracy", factor=3)
halving_cv.fit(X_train, y_train)
print("Mejores parametros", halving_cv.best_params_)
print("Mejor CV score", halving_cv.best_score_)
print(f'Accuracy del modelo = {round(accuracy_score(y_test, halving_cv.predict(X_test)), 5)}')