Exploring the use of XGBoost and its integration with Scikit-Learn.
The objective is to demonstrate XGBoost regression, binary and multiclass classification, cross-validation, and randomized hyperparameter search using scikit-learn utilities.
This notebook is adapted from https://www.kaggle.com/code/stuarthallows/using-xgboost-with-scikit-learn/notebook
numpy
scikit-learn
scipy
xgboost
Run the following cell to install the packages.
#
# Required Packages
# Run this cell to install required packages.
#
%pip install "numpy>=1.19" "pandas>=1.1" "scikit-learn>=0.22.2" "scipy>=1.7" "xgboost>=1.6"
import numpy as np
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import accuracy_score, auc, confusion_matrix, mean_squared_error
from sklearn.model_selection import (
GridSearchCV,
KFold,
RandomizedSearchCV,
cross_val_score,
train_test_split,
)
def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of float
        Scores to summarize (e.g. per-fold RMSE values).
    """
    print(f"Scores: {scores}\nMean: {np.mean(scores):.3f}\nStd: {np.std(scores):.3f}")
def report_best_scores(results, n_top=3):
    """Print the top-ranked models from a fitted search's ``cv_results_``.

    Parameters
    ----------
    results : dict
        The ``cv_results_`` mapping of a fitted GridSearchCV /
        RandomizedSearchCV instance (keys ``rank_test_score``,
        ``mean_test_score``, ``std_test_score``, ``params``).
    n_top : int, optional
        Number of ranks to report (default 3). Ties share a rank, so more
        than ``n_top`` candidates may be printed.
    """
    for rank in range(1, n_top + 1):
        # All candidates whose CV rank equals `rank` (handles ties).
        for candidate in np.flatnonzero(results["rank_test_score"] == rank):
            print(f"Model with rank: {rank}")
            print(
                f"Mean validation score: {results['mean_test_score'][candidate]:.3f} "
                f"(std: {results['std_test_score'][candidate]:.3f})"
            )
            print(f"Parameters: {results['params'][candidate]}")
            print("")
# Regression on the diabetes dataset.
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# "reg:squarederror" replaces the deprecated "reg:linear" alias (same loss);
# the old name only triggers a deprecation warning in XGBoost >= 0.90.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_model.fit(X, y)

# Predict on the training data itself — illustrative only, not a
# generalization estimate (cross-validation follows later).
y_pred = xgb_model.predict(X)

mse = mean_squared_error(y, y_pred)
print(np.sqrt(mse))  # RMSE
print(xgb_model)
[23:05:33] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. 0.2401475171547707 XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, objective='reg:linear', predictor='auto', random_state=42, reg_alpha=0, ...)
# Binary classification on the breast cancer dataset.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Logistic-loss binary classifier; fixed seed for reproducibility.
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X, y)

# Confusion matrix on the training data itself.
y_pred = xgb_model.predict(X)
print(confusion_matrix(y, y_pred))
[[212 0] [ 0 357]]
# Multiclass classification on the wine dataset (3 classes).
wine = load_wine()
X, y = wine.data, wine.target

# softprob yields per-class probabilities; predict() returns the argmax class.
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
xgb_model.fit(X, y)

# Confusion matrix on the training data itself.
y_pred = xgb_model.predict(X)
print(confusion_matrix(y, y_pred))
[[59 0 0] [ 0 71 0] [ 0 0 48]]
Cross-validation using KFold
# Cross-validation on the diabetes dataset.
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Manual 5-fold CV with KFold: fit a fresh model per fold, collect test MSE.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for train_index, test_index in kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # "reg:squarederror" replaces the deprecated "reg:linear" alias.
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)
    scores.append(mean_squared_error(y_test, y_pred))
display_scores(np.sqrt(scores))  # per-fold RMSE

# Same evaluation via cross_val_score; the scorer returns negated MSE,
# so negate back before taking the square root.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
scores = cross_val_score(xgb_model, X, y, scoring="neg_mean_squared_error", cv=5)
display_scores(np.sqrt(-scores))
[23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. [23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. [23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. [23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. [23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. Scores: [63.94113133 61.42459265 67.48347385 69.49735119 59.90352074] Mean: 64.450 Std: 3.599 [23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. [23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. [23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. [23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. [23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. Scores: [62.80101886 65.82933114 62.19849188 66.40701402 67.29879575] Mean: 64.907 Std: 2.029
Hyperparameter searching using RandomizedSearchCV
# Randomized hyperparameter search for an XGBoost regressor on diabetes.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

xgb_model = xgb.XGBRegressor()

# Distributions to sample from; XGBoost defaults noted where relevant.
params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3),  # default 0.1
    "max_depth": randint(2, 6),           # default 3
    "n_estimators": randint(100, 150),    # default 100
    "subsample": uniform(0.6, 0.4),
}

# 200 random candidates, each scored with 3-fold CV.
search_options = dict(
    param_distributions=params,
    random_state=42,
    n_iter=200,
    cv=3,
    verbose=1,
    n_jobs=1,
    return_train_score=True,
)
search = RandomizedSearchCV(xgb_model, **search_options)
search.fit(X, y)

# Show only the single best-ranked configuration.
report_best_scores(search.cv_results_, 1)
Fitting 3 folds for each of 200 candidates, totalling 600 fits Model with rank: 1 Mean validation score: 0.464 (std: 0.009) Parameters: {'colsample_bytree': 0.7516959613604889, 'gamma': 0.09614450940433539, 'learning_rate': 0.042260584879943656, 'max_depth': 2, 'n_estimators': 117, 'subsample': 0.7114361356127834}