#!/usr/bin/env python
# coding: utf-8

# # Performance Baselines
#
# Script exported from a Jupyter notebook. It loads pre-split train/test CSV
# files, carves a validation split out of the training data, compares a few
# k-nearest-neighbor settings on that validation split, and finally refits the
# chosen setting on the whole training set to report test accuracy.

# In[1]:


# Notebook magics: `get_ipython` is only defined when this runs under
# IPython/Jupyter, so these two lines fail under a plain `python` interpreter.
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-p scikit-learn,mlxtend,xgboost')


# ## Dataset

# Source: https://archive.ics.uci.edu/ml/datasets/Dry+Bean+Dataset

# In[2]:


import pandas as pd


# Common prefix of the four CSV files (features/labels for train and test).
DATA_URL = (
    'https://raw.githubusercontent.com/rasbt/'
    'stat451-machine-learning-fs21/main/hw02-starter/dataset'
)

# The CSV files are read with header=None, i.e. every row is treated as data.
X_train = pd.read_csv(f'{DATA_URL}/X_train.csv', header=None).values
y_train = (
    pd.read_csv(f'{DATA_URL}/y_train.csv', header=None)
    .values.ravel().astype(int)
)

X_test = pd.read_csv(f'{DATA_URL}/X_test.csv', header=None).values
y_test = (
    pd.read_csv(f'{DATA_URL}/y_test.csv', header=None)
    .values.ravel().astype(int)
)

print('X_train.shape:', X_train.shape)
print('y_train.shape:', y_train.shape)
print('X_test.shape:', X_test.shape)
print('y_test.shape:', y_test.shape)


# In[3]:


from sklearn.model_selection import train_test_split


# Hold out 20% of the training data as a validation set; `stratify` keeps the
# label proportions of y_train in both parts, `random_state` fixes the split.
X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1, stratify=y_train
)

# Report the size of the *sub*-training split: that is what the models in the
# comparison below are fitted on (the original printed the full y_train size).
print('Train/Valid/Test sizes:',
      y_train_sub.shape[0], y_valid.shape[0], y_test.shape[0])


# ## Baselines

# Compare hyperparameter settings on validation set:

# In[4]:


from sklearn.neighbors import KNeighborsClassifier


# Evaluate each candidate in the same order as the original notebook cells
# (5, 3, 7); after the loop `knn` is the last fitted model (n_neighbors=7).
for n_neighbors in (5, 3, 7):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train_sub, y_train_sub)

    print(f"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%")
    print(f"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%")


# Choose best model and train on whole training set:

# In[7]:


# NOTE(review): n_neighbors=3 is hard-coded here; presumably it had the best
# validation accuracy in the comparison above -- confirm against the output.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

print(f"Train Accuracy: {model.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {model.score(X_test, y_test)*100:0.3f}%")


# In[ ]:


# In[ ]: