# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri
# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd
# scikit-learn
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc
# Visulization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
mpl.style.use('ggplot')
SVC plot function.*
def plot_svc(svc, X, y, h=0.02, pad=0.25):
x_min, x_max = X[:, 0].min()-pad, X[:, 0].max()+pad
y_min, y_max = X[:, 1].min()-pad, X[:, 1].max()+pad
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.2)
plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)
# Support vectors indicated in plot by vertical lines
sv = svc.support_vectors_
plt.scatter(sv[:,0], sv[:,1], c='k', marker='x', s=100, linewidths='1')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
print('Number of support vectors: ', svc.support_.size)
# Generating random data: 20 observations of 2 features and divide into two classes.
np.random.seed(5)
X = np.random.randn(20,2)
y = np.repeat([1,-1], 10)
X[y == -1] = X[y == -1] +1
plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');
svc = SVC(C=1, kernel='linear')
svc.fit(X, y)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
plot_svc(svc, X, y)
Number of support vectors: 13
# When using a smaller cost parameter (C=0.1) the margin is wider, resulting in more support vectors.
svc2 = SVC(C=0.1, kernel='linear')
svc2.fit(X, y)
plot_svc(svc2, X, y)
Number of support vectors: 16
# Select the optimal C parameter by cross-validation
tuning_parms = [{'C': [0.001, 0.01, 0.1, 1, 5, 10, 100]}]
clf = GridSearchCV(SVC(kernel='linear'), tuning_parms, cv=10, scoring='accuracy')
clf.fit(X, y)
pd.DataFrame(clf.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
mean_test_score | std_test_score | params | |
---|---|---|---|
0 | 0.80 | 0.244949 | {'C': 0.001} |
1 | 0.80 | 0.244949 | {'C': 0.01} |
2 | 0.80 | 0.244949 | {'C': 0.1} |
3 | 0.75 | 0.335410 | {'C': 1} |
4 | 0.75 | 0.335410 | {'C': 5} |
5 | 0.75 | 0.335410 | {'C': 10} |
6 | 0.75 | 0.335410 | {'C': 100} |
# Best parameter according to GridSearchCV.
clf.best_params_
{'C': 0.001}
# Generating test data
np.random.seed(1)
X_test = np.random.randn(20,2)
y_test = np.random.choice([-1,1], 20)
X_test[y_test == 1] = X_test[y_test == 1] -1
plt.scatter(X_test[:,0], X_test[:,1], s=70, c=y_test, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');
# Prediction with C = 0.1
y_pred = svc2.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, y_pred),index=svc.classes_, columns=svc.classes_)
-1 | 1 | |
---|---|---|
-1 | 2 | 6 |
1 | 0 | 12 |
# SVC with C = 0.001
svc3 = SVC(C=0.001, kernel='linear')
svc3.fit(X, y)
y_pred = svc3.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, y_pred), index=svc3.classes_, columns=svc3.classes_)
-1 | 1 | |
---|---|---|
-1 | 2 | 6 |
1 | 0 | 12 |
# Changing the test data so that the classes are really seperable with a hyperplane.
X_test[y_test == 1] = X_test[y_test == 1] -1
plt.scatter(X_test[:,0], X_test[:,1], s=70, c=y_test, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');
svc4 = SVC(C=10.0, kernel='linear')
svc4.fit(X_test, y_test)
plot_svc(svc4, X_test, y_test)
Number of support vectors: 4
# Increase the margin: 1 mis-classfication. Increased bias, lower variance.
svc5 = SVC(C=1, kernel='linear')
svc5.fit(X_test, y_test)
plot_svc(svc5, X_test, y_test)
Number of support vectors: 5
# Generate test data
np.random.seed(8)
X = np.random.randn(200,2)
X[:100] = X[:100] +2
X[101:150] = X[101:150] -2
y = np.concatenate([np.repeat(-1, 150), np.repeat(1,50)])
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=2)
plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');
svm = SVC(C=1.0, kernel='rbf', gamma=1)
svm.fit(X_train, y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma=1, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
plot_svc(svm, X_train, y_train)
Number of support vectors: 51
# Increasing C parameter, allowing more flexibility
svm2 = SVC(C=100, kernel='rbf', gamma=1.0)
svm2.fit(X_train, y_train)
plot_svc(svm2, X_train, y_train)
Number of support vectors: 36
# Set the parameters by cross-validation
tuned_parameters = [{'C': [0.01, 0.1, 1, 10, 100],
'gamma': [0.5, 1,2,3,4]}]
clf = GridSearchCV(SVC(kernel='rbf'), tuned_parameters, cv=10, scoring='accuracy')
clf.fit(X_train, y_train)
pd.DataFrame(clf.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
mean_test_score | std_test_score | params | |
---|---|---|---|
0 | 0.77 | 0.031286 | {'C': 0.01, 'gamma': 0.5} |
1 | 0.77 | 0.031286 | {'C': 0.01, 'gamma': 1} |
2 | 0.77 | 0.031286 | {'C': 0.01, 'gamma': 2} |
3 | 0.77 | 0.031286 | {'C': 0.01, 'gamma': 3} |
4 | 0.77 | 0.031286 | {'C': 0.01, 'gamma': 4} |
5 | 0.77 | 0.031286 | {'C': 0.1, 'gamma': 0.5} |
6 | 0.77 | 0.031286 | {'C': 0.1, 'gamma': 1} |
7 | 0.77 | 0.031286 | {'C': 0.1, 'gamma': 2} |
8 | 0.77 | 0.031286 | {'C': 0.1, 'gamma': 3} |
9 | 0.77 | 0.031286 | {'C': 0.1, 'gamma': 4} |
10 | 0.92 | 0.109332 | {'C': 1, 'gamma': 0.5} |
11 | 0.92 | 0.088671 | {'C': 1, 'gamma': 1} |
12 | 0.90 | 0.090006 | {'C': 1, 'gamma': 2} |
13 | 0.88 | 0.075639 | {'C': 1, 'gamma': 3} |
14 | 0.85 | 0.063842 | {'C': 1, 'gamma': 4} |
15 | 0.92 | 0.076568 | {'C': 10, 'gamma': 0.5} |
16 | 0.89 | 0.101409 | {'C': 10, 'gamma': 1} |
17 | 0.86 | 0.118986 | {'C': 10, 'gamma': 2} |
18 | 0.86 | 0.108321 | {'C': 10, 'gamma': 3} |
19 | 0.87 | 0.097125 | {'C': 10, 'gamma': 4} |
20 | 0.84 | 0.091320 | {'C': 100, 'gamma': 0.5} |
21 | 0.83 | 0.119886 | {'C': 100, 'gamma': 1} |
22 | 0.86 | 0.118986 | {'C': 100, 'gamma': 2} |
23 | 0.87 | 0.116294 | {'C': 100, 'gamma': 3} |
24 | 0.87 | 0.116294 | {'C': 100, 'gamma': 4} |
clf.best_params_
{'C': 1, 'gamma': 0.5}
confusion_matrix(y_test, clf.best_estimator_.predict(X_test))
array([[67, 6], [ 9, 18]])
# 15% of test observations misclassified
clf.best_estimator_.score(X_test, y_test)
0.84999999999999998
svm3 = SVC(C=1, kernel='rbf', gamma=2)
svm3.fit(X_train, y_train)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma=2, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# More flexible model
svm4 = SVC(C=1, kernel='rbf', gamma=50)
svm4.fit(X_train, y_train)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma=50, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
y_train_score3 = svm3.decision_function(X_train)
y_train_score4 = svm4.decision_function(X_train)
false_pos_rate3, true_pos_rate3, _ = roc_curve(y_train, y_train_score3)
roc_auc3 = auc(false_pos_rate3, true_pos_rate3)
false_pos_rate4, true_pos_rate4, _ = roc_curve(y_train, y_train_score4)
roc_auc4 = auc(false_pos_rate4, true_pos_rate4)
fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(14,6))
ax1.plot(false_pos_rate3, true_pos_rate3, label='SVM $\gamma = 1$ ROC curve (area = %0.2f)' % roc_auc3, color='b')
ax1.plot(false_pos_rate4, true_pos_rate4, label='SVM $\gamma = 50$ ROC curve (area = %0.2f)' % roc_auc4, color='r')
ax1.set_title('Training Data')
y_test_score3 = svm3.decision_function(X_test)
y_test_score4 = svm4.decision_function(X_test)
false_pos_rate3, true_pos_rate3, _ = roc_curve(y_test, y_test_score3)
roc_auc3 = auc(false_pos_rate3, true_pos_rate3)
false_pos_rate4, true_pos_rate4, _ = roc_curve(y_test, y_test_score4)
roc_auc4 = auc(false_pos_rate4, true_pos_rate4)
ax2.plot(false_pos_rate3, true_pos_rate3, label='SVM $\gamma = 1$ ROC curve (area = %0.2f)' % roc_auc3, color='b')
ax2.plot(false_pos_rate4, true_pos_rate4, label='SVM $\gamma = 50$ ROC curve (area = %0.2f)' % roc_auc4, color='r')
ax2.set_title('Test Data')
for ax in fig.axes:
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([-0.05, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc="lower right")
# Adding a third class of observations
np.random.seed(8)
XX = np.vstack([X, np.random.randn(50,2)])
yy = np.hstack([y, np.repeat(0,50)])
XX[yy ==0] = XX[yy == 0] +4
plt.scatter(XX[:,0], XX[:,1], s=70, c=yy, cmap=plt.cm.prism)
plt.xlabel('XX1')
plt.ylabel('XX2');
svm5 = SVC(C=1, kernel='rbf')
svm5.fit(XX, yy)
plot_svc(svm5, XX, yy)
Number of support vectors: 133
# Load Khan data set from R ISLR package
islr = importr('ISLR')
khan_rdf = rdata(islr).fetch('Khan')['Khan']
list(khan_rdf.names)
['xtrain', 'xtest', 'ytrain', 'ytest']
# Assign X, y train and test set
X_train = pandas2ri.ri2py(khan_rdf[0])
X_test = pandas2ri.ri2py(khan_rdf[1])
y_train = pandas2ri.ri2py(khan_rdf[2]).astype(int)
y_test = pandas2ri.ri2py(khan_rdf[3]).astype(int)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(63, 2308) (63,) (20, 2308) (20,)
# y_train counts
pd.Series(y_train).value_counts()
2 23 4 20 3 12 1 8 dtype: int64
# y_test counts
pd.Series(y_test).value_counts()
3 6 2 6 4 5 1 3 dtype: int64
# This model gives identical results to the svm() of the R package e1071, also based on libsvm library.
svc = SVC(kernel='linear')
# This model is based on liblinear library and gives 100 score on the test data.
#svc = LinearSVC()
svc.fit(X_train, y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# Traing CM
cm = confusion_matrix(y_train, svc.predict(X_train))
cm_df = pd.DataFrame(cm.T, index=svc.classes_, columns=svc.classes_)
cm_df.index.name = 'Predicted'
cm_df.columns.name = 'True'
print(cm_df)
True 1 2 3 4 Predicted 1 8 0 0 0 2 0 23 0 0 3 0 0 12 0 4 0 0 0 20
# Prediction and test CM
cm = confusion_matrix(y_test, svc.predict(X_test))
cm_df = pd.DataFrame(cm.T, index=svc.classes_, columns=svc.classes_)
cm_df.index.name = 'Predicted'
cm_df.columns.name = 'True'
print(cm_df)
True 1 2 3 4 Predicted 1 3 0 0 0 2 0 6 2 0 3 0 0 4 0 4 0 0 0 5