#!/usr/bin/env python
# coding: utf-8

# # Maximum Margin Classifiers - The Support Vector Machine
#
# Given a labeled scatter plot, such as the one below, a natural way to classify points of the two classes is to find a hyper-plane (a straight line in the 2-d case) that separates them.
# In[1]:


import os
import course_utils as bd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import importlib
importlib.reload(bd)
get_ipython().run_line_magic('matplotlib', 'inline')

#For Mac/Linux users
cwd = os.getcwd()
datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'
#For Windows users, hardcode the dir:
#datadir =


# In[3]:


#Generate two well-separated clusters of data
m = [[0.25, 0.25], [0.75, 0.75]]; s = 0.025; n = 20
X1 = pd.DataFrame(np.random.multivariate_normal([m[0][0], m[0][1]], [[s, 0], [0, s]], n), columns=['x1', 'x2'])
X2 = pd.DataFrame(np.random.multivariate_normal([m[1][0], m[1][1]], [[s, 0], [0, s]], n), columns=['x1', 'x2'])

plt.figure(figsize=(8, 8))

#Plot the labeled data
plt.plot(X1['x1'], X1['x2'], 'r.')
plt.plot(X2['x1'], X2['x2'], 'b+');


# The issue with looking for just any separating line is that an infinite number of lines may qualify, even when the gap between the classes is quite small. The plot below shows the same data along with many qualifying lines.
# In[4]:


fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

#Plot the data along with several hyper-planes that separate it
plt.plot(X1['x1'], X1['x2'], 'r.')
plt.plot(X2['x1'], X2['x2'], 'b+')

#We'll use a brute force search for separating lines, starting from an intelligent initial guess.
#For this part of the code we use the hyper-plane with equation X * ws = t. This differs
#from the implementation of bd.plot_dec_line by the sign of t.
success = 0; ws = np.array([1, 1]); t = 1

while (success < 10):
    #Randomly perturb the starting point, then check whether it separates the two classes
    w_i = ws + np.random.normal(0, 1, 2)
    t_i = t + np.random.normal(0, 1, 1)
    if not ((sum((X1.dot(w_i) - t_i) > 0)) or (sum((X2.dot(w_i) - t_i) < 0))):
        bd.plot_dec_line(X1['x1'].min(), X2['x1'].max(), w_i[0], w_i[1], -1 * t_i, 'k-', 't')
        success += 1

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)


# ## Getting the Maximum Margin

# With so many candidate hyper-planes, we need a principled way to choose the optimal separating line. Intuitively, we want a line that is simultaneously as far from the positive instances as it is from the negative instances; in a sense, it should sit in the middle of the empty region separating the classes. We can formalize this by searching for the hyper-plane that maximizes the margin between the positive and negative classes.
#
# ### Some Definitions
#
# Assume training data $(X_i, y_i)$ with labels $y_i \in \{-1, +1\}$. A separating hyper-plane is the set of points $X$ with $W \cdot X + t = 0$, where $W$ is the normal vector of the plane. The two parallel hyper-planes $W \cdot X + t = +1$ and $W \cdot X + t = -1$ bound the margin, whose width is $\frac{2}{\|W\|}$; the training points lying exactly on these planes are the support vectors.
#
# With perfectly separable data, we then define the following objective function.
#
# $$\min_{W, t} \; \frac{1}{2}\|W\|^2 \quad \text{subject to} \quad y_i(W \cdot X_i + t) \geq 1 \;\; \forall i$$
#
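# As a concrete sketch of the margin picture, the cell below fits an (approximately) hard-margin linear SVM to the separable data from above and plots the decision boundary, the margin hyper-planes, and the support vectors. The variable names here (X_sep, y_sep, hard_svm) are illustrative, not from course_utils.

# In[ ]:


from sklearn import svm

#Stack the separable data from above and build +/-1 labels
X_sep = pd.concat([X1, X2], ignore_index=True)
y_sep = np.concatenate([-1 * np.ones(X1.shape[0]), np.ones(X2.shape[0])])

#A very large C approximates the hard-margin SVM
hard_svm = svm.SVC(kernel='linear', C=1e6)
hard_svm.fit(X_sep[['x1', 'x2']], y_sep)
w = hard_svm.coef_[0]
t = hard_svm.intercept_[0]

plt.figure(figsize=(8, 8))
plt.plot(X1['x1'], X1['x2'], 'r.')
plt.plot(X2['x1'], X2['x2'], 'b+')

#Draw the decision boundary (k=0) and the two margin hyper-planes (k=+/-1):
#W . X + t = k  =>  x2 = (k - t - w[0]*x1) / w[1]
xx = np.linspace(X_sep['x1'].min(), X_sep['x1'].max())
for k, style in [(0, 'k-'), (-1, 'k--'), (1, 'k--')]:
    plt.plot(xx, (k - t - w[0] * xx) / w[1], style)

#Circle the support vectors
plt.scatter(hard_svm.support_vectors_[:, 0], hard_svm.support_vectors_[:, 1],
            s=150, facecolors='none', edgecolors='k');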
# In the above we plot the optimal separating hyper-plane (with normal vector $W$) and also the two hyper-planes that define the margin. Circled are the support vectors.
#
# Once we have found $W$ and $t$, classification of a point $X$ is defined as:
#
# $$class(X) = sign(W \cdot X + t)$$
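# A quick sketch of this decision rule, assuming the hard-margin fit from the sketch above (hard_svm, w, t, X_sep): applying the sign rule by hand gives the same labels as the fitted model's predict().

# In[ ]:


#Manual classification: sign(W . X + t)
manual_labels = np.sign(X_sep[['x1', 'x2']].values.dot(w) + t)
print((manual_labels == hard_svm.predict(X_sep[['x1', 'x2']])).all())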
# Let $\xi_i = [1 - y_i(W \cdot X_i + t)]_+$ be the distance by which a point $X_i$ falls on the wrong side of its margin hyper-plane $y_i(W \cdot X_i + t) = 1$, where $[1-z]_+ = \max(0, 1-z)$. This last function is also known as the hinge function. We can then redefine our optimization problem to allow for some misclassification of the training points. Specifically:
#
# $$\min_{W, t, \xi} \; \frac{1}{2}\|W\|^2 + C\sum_i \xi_i \quad \text{subject to} \quad y_i(W \cdot X_i + t) \geq 1 - \xi_i, \;\; \xi_i \geq 0 \;\; \forall i$$
#
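# As a small sketch of the slack terms (the helper below is illustrative, not from course_utils), each $\xi_i$ can be computed directly from the fitted $W$ and $t$; on the perfectly separable data from above every slack is essentially zero.

# In[ ]:


def hinge_slack(X, y, w, t):
    #xi_i = [1 - y_i (W . X_i + t)]_+
    return np.maximum(0.0, 1.0 - y * (X.dot(w) + t))

print(hinge_slack(X_sep[['x1', 'x2']].values, y_sep, w, t).max())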
# When we presented the soft margin version of the SVM, we also introduced a new term $C$, which controls how much training error we tolerate (or, conversely, the size of the margin). $C$ is thus a free parameter that we must choose to get the right fit. One way to think about it: as $C$ increases, the second term of the objective carries more weight, so we become less tolerant of training errors and the margin shrinks.
# In[37]:


from sklearn import svm
import course_utils as bd
import pandas as pd
importlib.reload(bd)

#Generate noisier data than in the last example
m = [[0.25, 0.25], [1.25, 1.25]]; s = 0.35; n = 20
X1 = pd.DataFrame(np.random.multivariate_normal([m[0][0], m[0][1]], [[s, 0], [0, s]], n), columns=['x1', 'x2'])
X2 = pd.DataFrame(np.random.multivariate_normal([m[1][0], m[1][1]], [[s, 0], [0, s]], n), columns=['x1', 'x2'])
X1['Y'] = -1 * np.ones(X1.shape[0])
X2['Y'] = 1 * np.ones(X2.shape[0])
dat = pd.concat([X1, X2], ignore_index=True)

#Now plot the fitted SVM for different levels of C
fig = plt.figure(figsize=(12, 12))
c_s = [1000, 1, 10e-2, 10e-4]

for i, c in enumerate(c_s):
    my_svm = svm.SVC(kernel='linear', C=c)
    my_svm.fit(dat[['x1', 'x2']], dat['Y'])
    w = my_svm.coef_[0]
    t = my_svm.intercept_[0]
    sign = '-' if t < 0 else '+'
    ax = fig.add_subplot(2, 2, i + 1)
    plt.title('C={} : f=<{},{}> X {} {}'.format(c, np.round(w[0], 1), np.round(w[1], 1),
                                                sign, np.abs(np.round(t, 1))))
    bd.plotSVM(dat[['x1', 'x2']], dat['Y'], my_svm)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)


# As we increase $C$, we can see that the norm of $W$ increases and the number of support vectors identified shrinks.
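# A quick sketch to make that observation concrete: refit at each value of C (using dat and c_s from the cell above) and print the norm of W along with the number of support vectors.

# In[ ]:


for c in c_s:
    fit = svm.SVC(kernel='linear', C=c).fit(dat[['x1', 'x2']], dat['Y'])
    print('C={}: ||W||={:.2f}, #SV={}'.format(c, np.linalg.norm(fit.coef_[0]),
                                              fit.support_vectors_.shape[0]))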
# # SVM vs. Logistic Regression

# Both SVM and Logistic Regression find linear separating hyper-planes of the form $class(X) = f(W \cdot X + t)$. The main difference is the loss each one optimizes: the SVM minimizes the hinge loss, while Logistic Regression minimizes the log loss.
# In[10]:


from sklearn import svm, linear_model
from sklearn.metrics import confusion_matrix
import course_utils as bd
import pandas as pd
importlib.reload(bd)

#Generate training and testing data from the same distribution
m = [[0.25, 0.25], [1.25, 1.25]]; s = 2; n = 20

def makeData(m, s, n):
    X1 = pd.DataFrame(np.random.multivariate_normal([m[0][0], m[0][1]], [[s, 0], [0, s]], n), columns=['x1', 'x2'])
    X2 = pd.DataFrame(np.random.multivariate_normal([m[1][0], m[1][1]], [[s, 0], [0, s]], n), columns=['x1', 'x2'])
    X1['Y'] = -1 * np.ones(X1.shape[0])
    X2['Y'] = 1 * np.ones(X2.shape[0])
    return pd.concat([X1, X2], ignore_index=True)

train = makeData(m, s, n)
test = makeData(m, s, n)

#Fit both types of models (a huge C effectively removes regularization from the logistic regression)
logreg = linear_model.LogisticRegression(C=1e30)
logreg.fit(train[['x1', 'x2']], (train['Y'] + 1) / 2)
my_svm = svm.SVC(kernel='linear')
my_svm.fit(train[['x1', 'x2']], train['Y'])

#Compute accuracy from the confusion matrix
def getACC(X_t, Y_t, mod):
    cm = confusion_matrix(mod.predict(X_t), Y_t)
    return (cm[0][0] + cm[1][1]) / float(np.sum(cm))

acc_svm = getACC(test[['x1', 'x2']], test['Y'], my_svm)
acc_lr = getACC(test[['x1', 'x2']], (test['Y'] + 1) / 2, logreg)

#Plot both decision boundaries over the test data
w_svm = my_svm.coef_[0]; a_svm = -w_svm[0] / w_svm[1]; t_svm = my_svm.intercept_[0]
w_lr = logreg.coef_[0]; a_lr = -w_lr[0] / w_lr[1]; t_lr = logreg.intercept_[0]

xx = np.linspace(test[['x1', 'x2']].iloc[:, 0].min(), test[['x1', 'x2']].iloc[:, 0].max())
yy_svm = a_svm * xx - t_svm / w_svm[1]
yy_lr = a_lr * xx - t_lr / w_lr[1]

fig = plt.figure(figsize=(8, 8))
plt.plot(test[(test['Y'] == -1)].iloc[:, 0], test[(test['Y'] == -1)].iloc[:, 1], 'r.')
plt.plot(test[(test['Y'] == 1)].iloc[:, 0], test[(test['Y'] == 1)].iloc[:, 1], 'b+')
plt.plot(xx, yy_svm, label='SVM({})'.format(acc_svm))
plt.plot(xx, yy_lr, label='LR({})'.format(acc_lr))
plt.legend()


# We can see that the two classifiers behave very similarly when we are only looking for a linear decision boundary. The choice between them is usually determined by comfort and familiarity, as well as by the need to exploit the specific properties of each.