Support Vector Machine
Using the UCI Breast Cancer Wisconsin dataset (again).
SVM performs noticeably better here than k-nearest neighbors, but the algorithm itself is much more complex.
@TODO: learn linear algebra lol
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
df = pd.read_csv('breast-cancer-wisconsin.data.txt')
# replace unknown values ('?') with an outlier value
df.replace('?',-99999, inplace=True)
# drop the irrelevant id feature
df.drop(['id'], axis=1, inplace=True)
# feature data
X = np.array(df.drop(['class'], axis=1))
# class / label data
y = np.array(df['class'])
# separate training and testing chunks
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# define classifier
clf = svm.SVC()
# train classifier
clf.fit(X_train, y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# test
accuracy = clf.score(X_test, y_test)
print('accuracy:', accuracy)
# typically around 94-97% accuracy without any tuning (the exact number varies with the random train/test split)
# If you want to save the trained classifier, you'd pickle it (see the sketch after the output below)
accuracy: 0.9428571428571428
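To follow up on the pickling comment above, here is a minimal sketch of saving and reloading the trained classifier; the filename svm_clf.pickle is just an assumption.
# save the trained classifier to disk (filename is arbitrary)
with open('svm_clf.pickle', 'wb') as f:
    pickle.dump(clf, f)
# reload it later without retraining
with open('svm_clf.pickle', 'rb') as f:
    loaded_clf = pickle.load(f)
print('reloaded accuracy:', loaded_clf.score(X_test, y_test))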
example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,1,1,2,3,2,1], [1,0,6,1,5,1,2,4,2]])
example_measures = example_measures.reshape(len(example_measures), -1)
prediction = clf.predict(example_measures)
print('example class outputs: ', prediction)
example class outputs: [2 2 4]
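As for the "tweaks" mentioned above, one obvious place to start is the SVC kernel and the C parameter. A minimal sketch with illustrative (not tuned) values:
# illustrative only: try a linear kernel and a smaller C on the same split
clf_linear = svm.SVC(kernel='linear', C=0.5)
clf_linear.fit(X_train, y_train)
print('linear kernel accuracy:', clf_linear.score(X_test, y_test))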