%load_ext watermark
%watermark -p scikit-learn,mlxtend,xgboost
scikit-learn: 1.0 mlxtend : 0.19.0 xgboost : 1.5.0
import pandas as pd
# All four CSV files live in the same directory of the course repository.
_DATASET_URL = 'https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/'


def _read_split(filename):
    """Read one header-less CSV from the dataset directory as a NumPy array."""
    return pd.read_csv(_DATASET_URL + filename, header=None).values


# Labels are flattened to 1-D integer vectors; features keep the 2-D layout
# that pandas returns.
X_train = _read_split('X_train.csv')
y_train = _read_split('y_train.csv').ravel().astype(int)
X_test = _read_split('X_test.csv')
y_test = _read_split('y_test.csv').ravel().astype(int)

# Sanity check: report the shape of every array that was just loaded.
for label, array in (
    ('X_train.shape:', X_train),
    ('y_train.shape:', y_train),
    ('X_test.shape:', X_test),
    ('y_test.shape:', y_test),
):
    print(label, array.shape)
X_train.shape: (9119, 16) y_train.shape: (9119,) X_test.shape: (4492, 16) y_test.shape: (4492,)
from sklearn.model_selection import train_test_split
# Hold out 20% of the training data as a validation set for hyperparameter
# comparison. `stratify=y_train` keeps the class proportions the same in both
# parts, and the fixed `random_state` makes the split reproducible.
X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1, stratify=y_train
)

# Report the size of the training *subset* (not the full training set), so the
# three numbers describe the disjoint splits actually used below. With the
# 9119 training rows loaded above this should be 7295 / 1824 / 4492.
print('Train/Valid/Test sizes:', y_train_sub.shape[0], y_valid.shape[0], y_test.shape[0])
Train/Valid/Test sizes: 9119 1824 4492
Compare hyperparameter settings on the validation set:
from sklearn.neighbors import KNeighborsClassifier
# Candidate 1: k = 5 neighbors, fitted on the training subset only so that the
# validation rows stay unseen.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_sub, y_train_sub)

# `score` returns mean accuracy as a fraction; it is scaled to percent below.
train_acc = knn.score(X_train_sub, y_train_sub)
valid_acc = knn.score(X_valid, y_valid)
print(f"Train Accuracy: {train_acc*100:0.3f}%")
print(f"Valid Accuracy: {valid_acc*100:0.3f}%")
Train Accuracy: 79.657% Valid Accuracy: 71.162%
# Candidate 2: k = 3 neighbors, fitted on the training subset only so that the
# validation rows stay unseen.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_sub, y_train_sub)

# `score` returns mean accuracy as a fraction; it is scaled to percent below.
train_acc = knn.score(X_train_sub, y_train_sub)
valid_acc = knn.score(X_valid, y_valid)
print(f"Train Accuracy: {train_acc*100:0.3f}%")
print(f"Valid Accuracy: {valid_acc*100:0.3f}%")
Train Accuracy: 84.003% Valid Accuracy: 71.930%
# Candidate 3: k = 7 neighbors, fitted on the training subset only so that the
# validation rows stay unseen.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train_sub, y_train_sub)

# `score` returns mean accuracy as a fraction; it is scaled to percent below.
train_acc = knn.score(X_train_sub, y_train_sub)
valid_acc = knn.score(X_valid, y_valid)
print(f"Train Accuracy: {train_acc*100:0.3f}%")
print(f"Valid Accuracy: {valid_acc*100:0.3f}%")
Train Accuracy: 77.478% Valid Accuracy: 69.518%
Choose the best model and train on the whole training set:
# Final model: refit with k = 3 on the complete training set (training subset
# plus validation rows), then evaluate on the held-out test set.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# `score` returns mean accuracy as a fraction; it is scaled to percent below.
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f"Train Accuracy: {train_acc*100:0.3f}%")
print(f"Test Accuracy: {test_acc*100:0.3f}%")
Train Accuracy: 84.965% Test Accuracy: 71.305%