#!/usr/bin/env python
# coding: utf-8

# # **K-Nearest-Neighbor Classification**

# ## Reading the data

# Let's look at the Iris dataset. We have already seen this dataset in the **"Clustering"** case study.
# For each flower, it records four measurements (petal length, petal width, sepal length, sepal width), along with the flower's species.
# In the Clustering case study, we used K-Means to group the flowers into clusters without making use of the labels (unsupervised learning).
#
# In this case study, we will use K-Nearest-Neighbors (KNN) to train a classifier on the labeled flowers. We will then use the trained classifier to predict labels for test flowers.

# In[1]:

import pandas as pd

df = pd.read_csv("iris.csv")
df.sample(10)

# In[2]:

# useful methods for categorical data
df.species.unique()
df.species.value_counts()

# Let's see what different classes or species of flowers are present in the dataset.

# In[3]:

# the describe method does something different when the data is categorical!
df.species.describe()

# So there are 150 data points in total, 50 belonging to each of the three flower species (classes).
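# Note, by the way, that `df.species.unique()` in cell [2] produces no visible output,
# because a notebook only displays the last expression in a cell. Here is a minimal
# sketch of a variant that shows both results explicitly:

# In[ ]:

# print both the distinct species and the per-species counts
print(df.species.unique())
print(df.species.value_counts())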
# ## Visualizing the data

# Since it's difficult to visualize 4 dimensions at once, let's look at pairs of features and see if the data points are separable. Here we plot petal length against petal width, colored by species.

# In[4]:

df.columns

# In[5]:

features = ['petalLength', 'petalWidth', 'sepalLength', 'sepalWidth']

# In[6]:

ax = df[df.species=='setosa'].plot.scatter(x='petalLength', y='petalWidth', c='blue')
df[df.species=='virginica'].plot.scatter(x='petalLength', y='petalWidth', c='red', ax=ax)
df[df.species=='versicolor'].plot.scatter(x='petalLength', y='petalWidth', c='purple', ax=ax)
ax.legend(['setosa', 'virginica', 'versicolor']);

# ## K-Nearest-Neighbor Algorithm

# First, let's train and test on the whole dataset of 150 points.

# In[7]:

# select the training features and labels
Xtrain = df[features]
Ytrain = df.species

# ### Training with K = 5

# In[8]:

from sklearn.neighbors import KNeighborsClassifier

# instantiate the learning model (k = 5)
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model
knn.fit(Xtrain, Ytrain)

# predict on the training set (the same data we trained on!)
Ypred = knn.predict(Xtrain)

# compute (number of correct predictions) / (total number of predictions)
accuracy = sum(Ytrain == Ypred) / len(Ypred)
print("The model accuracy is:", accuracy)

# In[9]:

# a built-in way of measuring accuracy:
from sklearn.metrics import accuracy_score

accuracy_score(Ytrain, Ypred)

# That means that with K = 5, we achieve 96.67% accuracy when testing on the same dataset we trained with.
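# Under the hood, KNN classifies a point by finding the K training points closest to it
# and taking a majority vote among their labels. The following is a minimal from-scratch
# sketch of that idea for a single query point, assuming Euclidean distance and simple
# majority voting (which match scikit-learn's defaults); it is an illustration, not
# scikit-learn's actual implementation.

# In[ ]:

import numpy as np

def knn_predict_one(query, X, y, k=5):
    # distances from the query point to every training point
    dists = np.sqrt(((X - query) ** 2).sum(axis=1))
    # indices of the k closest training points
    nearest = np.argsort(dists)[:k]
    # majority vote among the labels of those k neighbors
    labels, counts = np.unique(y[nearest], return_counts=True)
    return labels[np.argmax(counts)]

# example: classify the first flower, using the whole dataset as training data
knn_predict_one(Xtrain.values[0], Xtrain.values, Ytrain.values, k=5)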
# Let's see what happens with K = 1, i.e. with just 1 nearest neighbor.

# ### Training with K = 1

# In[10]:

knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(Xtrain, Ytrain)
Ypred1 = knn1.predict(Xtrain)
print("Model accuracy:", accuracy_score(Ytrain, Ypred1))

# We see 100% accuracy! WHY???
#
#
# ---
#
# **Quiz problems**

# ## Question 1
#
# (see the Canvas quiz!)

# ## Question 2
#
# This time, we will only use two of the features. What accuracy do we obtain if we use only the `sepalWidth` and `sepalLength` features for classification with KNN and K = 5? Here is some starter code:

# In[ ]:

# YOUR CODE HERE: (define Xtrain and Ytrain here)
Xtrain =
Ytrain =

# In[ ]:

# the following code builds the KNN classifier and computes the accuracy
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtrain, Ytrain)
Ypred = knn.predict(Xtrain)
print("The accuracy is:", accuracy_score(Ytrain, Ypred))

# ## Question 3
#
# Rather than using the same training and testing set, we will do the following:
# - Train using the even-numbered rows (index 0, 2, 4, ...)
# - Test using the odd-numbered rows (index 1, 3, 5, ...)
#
# When we do this with KNN (K = 5), what accuracy do we obtain? Here is some code to get you started:

# In[ ]:

# all the indices (check to see what "everything" is!)
everything = list(df.index)

# train on the even indices, test on the odd indices
training = everything[0::2]
testing = everything[1::2]

# In[ ]:

# YOUR CODE HERE: (define Xtrain, Ytrain, Xtest, and Ytest)
Xtrain =
Ytrain =
Xtest =
Ytest =

# In[ ]:

# build a KNN classifier with K = 5 and compute the accuracy on the test set
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtrain, Ytrain)
Ypred = knn.predict(Xtest)
print("The accuracy is:", accuracy_score(Ytest, Ypred))
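# As an aside (separate from the quiz): scikit-learn also provides `train_test_split`
# for making a *random* train/test split, rather than the even/odd split above.
# A minimal sketch:

# In[ ]:

from sklearn.model_selection import train_test_split

# hold out a third of the rows, chosen at random, for testing
Xtr, Xte, Ytr, Yte = train_test_split(df[features], df.species,
                                      test_size=1/3, random_state=0)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtr, Ytr)
print("Held-out accuracy:", accuracy_score(Yte, knn.predict(Xte)))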