from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Can you tell if a bank note is counterfeit or legitimate?
# Each column is a numeric summary computed from a photograph of a banknote
# (a few numbers calculated for each image); 'Class' distinguishes
# counterfeit from legitimate notes.
banknotes = Table.read_table('banknote.csv')
banknotes
# Visualize 'WaveletVar' vs. 'WaveletCurt', coloring points by the 'Class' label
banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')
# Visualize 'WaveletSkew' vs. 'Entropy', coloring points by the 'Class' label
banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')
# Two attributes have some overlap of classes...what happens with three attributes?
fig = plots.figure(figsize=(8,8))
# NOTE(review): constructing Axes3D(fig) directly is deprecated in recent
# matplotlib versions; fig.add_subplot(projection='3d') is the modern
# spelling -- confirm the installed version before changing.
ax = Axes3D(fig)
# 3D scatter of three wavelet features, with point color driven by 'Class'
ax.scatter(banknotes.column('WaveletSkew'),
banknotes.column('WaveletVar'),
banknotes.column('WaveletCurt'),
c=banknotes.column('Class'),
cmap='viridis',
s=50);
# Breast-cancer data set:
#   Class 1 = malignant (cancer)
#   Class 0 = benign (not cancer)
# The 'ID' column is dropped because it carries no predictive information.
patients = Table.read_table('breast-cancer.csv').drop('ID')
patients.show(5)
# A number of points are layered on top of one another (rows sharing the same
# attribute values), which hides how many patients sit at each spot.
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', colors='Class')
#Function to "jitter" the points (for visualization purposes)
def randomize_column(a, scale=0.09):
    """Return a copy of array `a` with zero-mean Gaussian noise added to each entry.

    Used to "jitter" points for visualization so that overlapping points
    become distinguishable.

    a     -- array of numbers to jitter
    scale -- standard deviation of the noise (default 0.09, small relative
             to the integer-valued attributes being plotted)
    """
    return a + np.random.normal(0.0, scale, size=len(a))
# Build a copy of the two plotted columns with jitter applied, keeping the
# original 'Class' labels for coloring.
jittered = Table().with_columns([
'Bland Chromatin (jittered)',
randomize_column(patients.column('Bland Chromatin')),
'Single Epithelial Cell Size (jittered)',
randomize_column(patients.column('Single Epithelial Cell Size')),
'Class',
patients.column('Class')
])
# Plot the data with the points jittered (columns 0 and 1 of `jittered`)
jittered.scatter(0, 1, colors='Class')
# Keep only the features (attributes) of the data -- drop the class labels
features = patients.drop('Class')
features.show(3)
# create a function to compute the distance between two arrays
def distance(pt1, pt2):
    """Return the Euclidean distance between two points, represented as arrays."""
    # np.sum keeps the reduction vectorized; the builtin sum() would iterate
    # over the numpy array one element at a time at Python speed.
    return np.sqrt(np.sum((pt1 - pt2) ** 2))
# np.array(tuple(row)) converts a table Row to a plain numpy array
row_one_array = np.array(tuple(features.row(1)))
row_one_array
# create a function to compute the distance between two rows in a Table
def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table."""
    # Convert each Row to a plain numpy array before measuring distance.
    p1, p2 = (np.array(tuple(r)) for r in (row1, row2))
    return distance(p1, p2)
# distance between the first and second row
row_distance(features.row(0), features.row(1))
# sanity check: the distance between the first row and itself should be 0
row_distance(features.row(0), features.row(0))
# a function to compute the distance between a whole training set and a given example
# returns the training set with an additional column that has the distance to the example for each row
def distances(training, example):
    """Compute the distance between example and every row in training.

    Returns training augmented with a 'Distance' column.
    """
    # Drop the label column so only the numeric attributes enter the distance.
    attributes = training.drop('Class')
    # Build the array in one shot; the local is named 'dists' (not
    # 'distances') so it does not shadow this function's own name, and the
    # comprehension avoids re-allocating via np.append on every iteration.
    dists = np.array([row_distance(row, example) for row in attributes.rows])
    return training.with_column('Distance', dists)
# Let's look at patient 15 (full row, including the Class label)
patients.take(15)
# The features for row 15 (no label) -- this is the example to classify
example = features.row(15)
example
# Distance between patient 15 and every other patient, nearest first.
# exclude(15) keeps the example itself out of the comparison set.
distances(patients.exclude(15), example).sort('Distance')
# A function that will return a table with the k closest distances to an example
def closest(training, example, k):
    """Return a table of the k nearest neighbors to example."""
    ranked = distances(training, example).sort('Distance')
    # After sorting by distance, the first k rows are the k nearest neighbors.
    return ranked.take(np.arange(k))
# Applying the closest function: the 5 nearest neighbors of example patient 15
closest_table = closest(patients.exclude(15), example, 5)
closest_table
# A function that returns the class label for the class that has the most nearest neighbors
def majority_class(topk):
    """Return the class value that appears most often in topk."""
    counts = topk.group('Class').sort('count', descending=True)
    # The first row of the count-sorted table holds the winning class.
    return counts.column(0).item(0)
# Majority vote among the 5 nearest neighbors collected in closest_table
majority_class(closest_table)
# The full k nearest neighbor classification function
def classify(training, example, k):
    """Return the majority class among the k nearest neighbors of example."""
    neighbors = closest(training, example, k)
    return majority_class(neighbors)
# applying the kNN function to patient 15 (patient 15 is excluded from training)
classify(patients.exclude(15), example, 5)
# Let's look at patient 15 - did we make the correct prediction?
patients.take(15)
# Let's try it for patient 10
new_example = features.row(10)
classify(patients.exclude(10), new_example, 5)
# Did we get it correct? Compare against the stored Class label.
patients.take(10)
# evaluate any patient number: produces a (prediction, true label) tuple
patient_to_use = 6
new_example = features.row(patient_to_use)
classify(patients.exclude(patient_to_use), new_example, 5), patients.take(patient_to_use).column("Class").item(0)
# Show the total number of rows in the data set
patients.num_rows
# Create a training and test set (roughly a 50/50 split)
shuffled = patients.sample(with_replacement=False) # Randomly permute the rows
# NOTE(review): 342 and 683 are hard-coded; they assume the data set has
# exactly 683 rows (cf. patients.num_rows above) -- confirm before reuse.
training_set = shuffled.take(np.arange(342))
test_set = shuffled.take(np.arange(342, 683))
# print the number of points in the training and test set
print(training_set.num_rows)
print(test_set.num_rows)
# create a function that returns the proportion of points correctly classified in the test set
def evaluate_accuracy(training, test, k):
    """Return the proportion of correctly classified examples in the test set.

    training -- labeled table used as the k-NN reference set
    test     -- labeled table whose rows are classified one at a time
    k        -- number of neighbors to consult
    """
    test_attributes = test.drop('Class')
    # Hoist the label column out of the loop: it is loop-invariant.
    labels = test.column('Class')
    # Each comparison is True/False; sum() counts the True values.
    num_correct = sum(
        classify(training, test_attributes.row(i), k) == labels.item(i)
        for i in np.arange(test.num_rows)
    )
    return num_correct / test.num_rows
# evaluate the classifier on the held-out test set using k = 5
evaluate_accuracy(training_set, test_set, 5)
# evaluate the classifier using k = 3
evaluate_accuracy(training_set, test_set, 3)
# evaluate the classifier using k = 11
evaluate_accuracy(training_set, test_set, 11)
# k = 1 with training and testing both on the training set: each point is its
# own nearest neighbor (distance 0), so this should report 1.0 unless
# duplicate feature rows carry different labels -- an overstated accuracy.
evaluate_accuracy(training_set, training_set, 1)
# k = 1 on the real test set, for comparison with the line above
evaluate_accuracy(training_set, test_set, 1)