#!/usr/bin/env python
# coding: utf-8

# # Chapter 9 Support Vector Machines
# - [Lab 9.6.1 Support Vector Classifier](#lab-9.6.1)
# - [Lab 9.6.2 Support Vector Machine](#lab-9.6.2)
# - [Lab 9.6.3 ROC Curves](#lab-9.6.3)
# - [Lab 9.6.4 SVM with Multiple Classes](#lab-9.6.4)
# - [Lab 9.6.5 Application to Gene Expression Data](#lab-9.6.5)

# ### Imports and Configurations

# In[1]:

# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri

# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd

# scikit-learn
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
mpl.style.use('ggplot')

# **SVC plot function.**

# In[2]:

def plot_svc(svc, X, y, h=0.02, pad=0.25):
    # Build a grid over the feature space and shade the predicted decision regions.
    x_min, x_max = X[:, 0].min()-pad, X[:, 0].max()+pad
    y_min, y_max = X[:, 1].min()-pad, X[:, 1].max()+pad
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.2)
    plt.scatter(X[:, 0], X[:, 1], s=70, c=y, cmap=plt.cm.Paired)
    # Support vectors are marked with black 'x' symbols.
    sv = svc.support_vectors_
    plt.scatter(sv[:, 0], sv[:, 1], c='k', marker='x', s=100, linewidths=1)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
    print('Number of support vectors: ', svc.support_.size)

# ### Lab 9.6.1 Support Vector Classifier

# In[3]:

# Generate random data: 20 observations of 2 features, divided into two classes.
np.random.seed(5)
X = np.random.randn(20, 2)
y = np.repeat([1, -1], 10)
X[y == -1] = X[y == -1] + 1

plt.scatter(X[:, 0], X[:, 1], s=70, c=y, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');

# In[4]:

svc = SVC(C=1, kernel='linear')
svc.fit(X, y)

# In[5]:

plot_svc(svc, X, y)

# In[6]:

# A smaller cost parameter (C=0.1) gives a wider margin and more support vectors.
svc2 = SVC(C=0.1, kernel='linear')
svc2.fit(X, y)
plot_svc(svc2, X, y)

# In[7]:

# Select the optimal C parameter by cross-validation.
tuning_parms = [{'C': [0.001, 0.01, 0.1, 1, 5, 10, 100]}]
clf = GridSearchCV(SVC(kernel='linear'), tuning_parms, cv=10, scoring='accuracy')
clf.fit(X, y)
pd.DataFrame(clf.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
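# **Aside (not in the original lab).** For a linear kernel, the fitted hyperplane
# $w \cdot x + b = 0$ and its margins $w \cdot x + b = \pm 1$ can be read directly off
# the classifier's `coef_` and `intercept_` attributes. A minimal sketch, assuming
# `svc` above has already been fit on `X, y`:

# In[ ]:

w = svc.coef_[0]          # weight vector of the separating hyperplane
b = svc.intercept_[0]     # bias term
x1 = np.linspace(X[:, 0].min(), X[:, 0].max(), 50)
# Solve w0*x1 + w1*x2 + b = c for x2, with c = 0 (boundary) and c = +/-1 (margins).
boundary = -(w[0] * x1 + b) / w[1]
upper = -(w[0] * x1 + b - 1) / w[1]
lower = -(w[0] * x1 + b + 1) / w[1]
plt.scatter(X[:, 0], X[:, 1], s=70, c=y, cmap=plt.cm.Paired)
plt.plot(x1, boundary, 'k-')
plt.plot(x1, upper, 'k--')
plt.plot(x1, lower, 'k--')
plt.xlabel('X1')
plt.ylabel('X2');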
# In[8]:

# Best parameter according to GridSearchCV.
clf.best_params_

# In[9]:

# Generate test data.
np.random.seed(1)
X_test = np.random.randn(20, 2)
y_test = np.random.choice([-1, 1], 20)
X_test[y_test == 1] = X_test[y_test == 1] - 1

plt.scatter(X_test[:, 0], X_test[:, 1], s=70, c=y_test, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');

# In[10]:

# Prediction with C = 0.1
y_pred = svc2.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, y_pred), index=svc2.classes_, columns=svc2.classes_)

# In[11]:

# SVC with C = 0.001
svc3 = SVC(C=0.001, kernel='linear')
svc3.fit(X, y)
y_pred = svc3.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, y_pred), index=svc3.classes_, columns=svc3.classes_)

# In[12]:

# Shift the test data further so that the two classes are linearly separable by a hyperplane.
X_test[y_test == 1] = X_test[y_test == 1] - 1

plt.scatter(X_test[:, 0], X_test[:, 1], s=70, c=y_test, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');

# In[13]:

svc4 = SVC(C=10.0, kernel='linear')
svc4.fit(X_test, y_test)
plot_svc(svc4, X_test, y_test)

# In[14]:

# A smaller C (C=1) widens the margin: one misclassification, higher bias but lower variance.
svc5 = SVC(C=1, kernel='linear')
svc5.fit(X_test, y_test)
plot_svc(svc5, X_test, y_test)

# ### Lab 9.6.2 Support Vector Machine

# In[15]:

# Generate data with a non-linear class boundary and split it into train and test sets.
np.random.seed(8)
X = np.random.randn(200, 2)
X[:100] = X[:100] + 2
X[100:150] = X[100:150] - 2
y = np.concatenate([np.repeat(-1, 150), np.repeat(1, 50)])
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=2)

plt.scatter(X[:, 0], X[:, 1], s=70, c=y, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');

# In[16]:

svm = SVC(C=1.0, kernel='rbf', gamma=1)
svm.fit(X_train, y_train)

# In[17]:

plot_svc(svm, X_train, y_train)

# In[18]:

# Increasing the C parameter reduces the number of margin violations, giving a more
# irregular decision boundary at the risk of overfitting the training data.
svm2 = SVC(C=100, kernel='rbf', gamma=1.0)
svm2.fit(X_train, y_train)
plot_svc(svm2, X_train, y_train)

# In[19]:

# Set the parameters by cross-validation.
tuned_parameters = [{'C': [0.01, 0.1, 1, 10, 100], 'gamma': [0.5, 1, 2, 3, 4]}]
clf = GridSearchCV(SVC(kernel='rbf'), tuned_parameters, cv=10, scoring='accuracy')
clf.fit(X_train, y_train)
pd.DataFrame(clf.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

# In[20]:

clf.best_params_

# In[21]:

confusion_matrix(y_test, clf.best_estimator_.predict(X_test))

# In[22]:

# About 15% of test observations are misclassified.
clf.best_estimator_.score(X_test, y_test)

# ### Lab 9.6.3 ROC Curves

# In[23]:

svm3 = SVC(C=1, kernel='rbf', gamma=2)
svm3.fit(X_train, y_train)

# In[24]:

# A more flexible model (larger gamma).
svm4 = SVC(C=1, kernel='rbf', gamma=50)
svm4.fit(X_train, y_train)
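# **Aside (not in the original lab).** The ROC curves in the next cell are built from
# the continuous `decision_function` scores rather than the hard class labels; for a
# binary SVC the predicted label corresponds to the sign of that score. A minimal
# sketch, assuming `svm3` above has already been fit on `X_train, y_train`:

# In[ ]:

scores = svm3.decision_function(X_train)
labels_from_scores = np.where(scores > 0, 1, -1)
# Fraction of training points where the sign of the score matches predict().
print('Agreement with predict():', np.mean(labels_from_scores == svm3.predict(X_train)))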
# In[25]:

y_train_score3 = svm3.decision_function(X_train)
y_train_score4 = svm4.decision_function(X_train)

false_pos_rate3, true_pos_rate3, _ = roc_curve(y_train, y_train_score3)
roc_auc3 = auc(false_pos_rate3, true_pos_rate3)
false_pos_rate4, true_pos_rate4, _ = roc_curve(y_train, y_train_score4)
roc_auc4 = auc(false_pos_rate4, true_pos_rate4)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
ax1.plot(false_pos_rate3, true_pos_rate3,
         label=r'SVM $\gamma = 2$ ROC curve (area = %0.2f)' % roc_auc3, color='b')
ax1.plot(false_pos_rate4, true_pos_rate4,
         label=r'SVM $\gamma = 50$ ROC curve (area = %0.2f)' % roc_auc4, color='r')
ax1.set_title('Training Data')

y_test_score3 = svm3.decision_function(X_test)
y_test_score4 = svm4.decision_function(X_test)

false_pos_rate3, true_pos_rate3, _ = roc_curve(y_test, y_test_score3)
roc_auc3 = auc(false_pos_rate3, true_pos_rate3)
false_pos_rate4, true_pos_rate4, _ = roc_curve(y_test, y_test_score4)
roc_auc4 = auc(false_pos_rate4, true_pos_rate4)

ax2.plot(false_pos_rate3, true_pos_rate3,
         label=r'SVM $\gamma = 2$ ROC curve (area = %0.2f)' % roc_auc3, color='b')
ax2.plot(false_pos_rate4, true_pos_rate4,
         label=r'SVM $\gamma = 50$ ROC curve (area = %0.2f)' % roc_auc4, color='r')
ax2.set_title('Test Data')

for ax in fig.axes:
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([-0.05, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")

# ### Lab 9.6.4 SVM with Multiple Classes

# In[26]:

# Add a third class of observations.
np.random.seed(8)
XX = np.vstack([X, np.random.randn(50, 2)])
yy = np.hstack([y, np.repeat(0, 50)])
XX[yy == 0] = XX[yy == 0] + 4

plt.scatter(XX[:, 0], XX[:, 1], s=70, c=yy, cmap=plt.cm.prism)
plt.xlabel('XX1')
plt.ylabel('XX2');

# In[27]:

# With more than two classes, SVC fits the one-versus-one multiclass scheme.
svm5 = SVC(C=1, kernel='rbf')
svm5.fit(XX, yy)
plot_svc(svm5, XX, yy)

# ### Lab 9.6.5 Application to Gene Expression Data

# In[28]:

# Load the Khan data set from the R ISLR package.
islr = importr('ISLR')
khan_rdf = rdata(islr).fetch('Khan')['Khan']
list(khan_rdf.names)

# In[29]:

# Assign X, y train and test sets.
X_train = pandas2ri.ri2py(khan_rdf[0])
X_test = pandas2ri.ri2py(khan_rdf[1])
y_train = pandas2ri.ri2py(khan_rdf[2]).astype(int)
y_test = pandas2ri.ri2py(khan_rdf[3]).astype(int)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# In[30]:

# y_train class counts
pd.Series(y_train).value_counts()

# In[31]:

# y_test class counts
pd.Series(y_test).value_counts()

# In[32]:

# This model gives results identical to svm() from the R package e1071, which is also
# built on the libsvm library.
svc = SVC(kernel='linear')
# LinearSVC is built on the liblinear library instead and scores 100% on the test data.
# svc = LinearSVC()
svc.fit(X_train, y_train)

# In[33]:

# Training confusion matrix
cm = confusion_matrix(y_train, svc.predict(X_train))
cm_df = pd.DataFrame(cm.T, index=svc.classes_, columns=svc.classes_)
cm_df.index.name = 'Predicted'
cm_df.columns.name = 'True'
print(cm_df)

# In[34]:

# Prediction and test confusion matrix
cm = confusion_matrix(y_test, svc.predict(X_test))
cm_df = pd.DataFrame(cm.T, index=svc.classes_, columns=svc.classes_)
cm_df.index.name = 'Predicted'
cm_df.columns.name = 'True'
print(cm_df)
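# **Aside (not in the original lab).** The confusion matrices above can be summarized
# as a single test error rate. A minimal sketch, assuming `svc`, `X_test`, and `y_test`
# from the cells above are still in scope:

# In[ ]:

y_pred = svc.predict(X_test)
n_errors = int((y_pred != y_test).sum())   # number of misclassified test observations
error_rate = n_errors / y_test.size        # fraction misclassified
print('Test errors: %d of %d (error rate = %.3f)' % (n_errors, y_test.size, error_rate))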