The data was obtained from the UCI Machine Learning Repository, submitted by Jock A. Blackard. The dataset contains 581,012 records with 54 attributes. The dataset characteristics can be viewed on the UCI Forest CoverType description page, but for convenience, the summary statistics are replicated below.
Summary Statistics
# import classes and functions
import numpy
from pandas import read_csv
from xgboost import XGBClassifier # xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
Using TensorFlow backend.
# Seed the NumPy RNG so the shuffle below is reproducible across runs.
seed = 7
numpy.random.seed(seed)

# Read the raw covertype data; the file ships without a header row.
dataframe = read_csv("covtype.data", header=None)
dataset = dataframe.values

# Shuffle the rows, then keep only the first 50,000 to cut training time.
dataset = numpy.random.permutation(dataset)
dataset = dataset[:50000]

# Columns 0-53 are the predictors; the final column is the cover-type label.
X = dataset[:, :54].astype(float)
Y = dataset[:, 54]

# Map the raw class labels onto consecutive integers (0..n_classes-1).
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)
The following hyper-parameters are tuned to improve our model.
# XGBOOST
# Grid search over tree count, tree depth, and learning rate.
#
# NOTE(fix): the original used XGBClassifier(nthread=-1) together with
# GridSearchCV(n_jobs=-1). That nests two levels of parallelism — each of
# the N parallel CV workers spawns N xgboost threads, oversubscribing the
# CPU (N*N threads) and slowing the search down. The recommended pattern is
# a single-threaded estimator with the grid search parallelized instead.
model = XGBClassifier(nthread=1)

n_estimators = range(50, 400, 50)                         # number of boosted trees: 50..350
max_depth = range(1, 11, 2)                               # tree depth: 1, 3, 5, 7, 9
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]      # shrinkage / eta
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)

# Stratified 10-fold CV keeps the class distribution in every fold;
# fixed random_state makes the fold assignment reproducible.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# n_jobs=-1: one CV fit per core — this is now the only level of parallelism.
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
results = grid_search.fit(X, encoded_Y)

# Report the best mean cross-validated accuracy and the winning parameters.
print("Best: %f%% using %s" % (results.best_score_ * 100, results.best_params_))
Best: 90.462000% using {'n_estimators': 350, 'learning_rate': 0.3, 'max_depth': 9}