Given a labeled scatter plot, such as the one below, a natural way to classify points of the two classes is to find a hyper-plane (a straight line in the 2-d case) that separates them.
import os
import course_utils as bd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import importlib
importlib.reload(bd)
%matplotlib inline
#For Mac users
cwd = os.getcwd()
datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'
#For Windows users, hardcode the dir:
#datadir =
#generate data
m = [[0.25,0.25], [0.75,0.75]]; s = 0.025; n = 20   #cluster means, shared variance, points per class
X1 = pd.DataFrame(np.random.multivariate_normal([m[0][0], m[0][1]], [[s,0],[0,s]], n), columns=['x1','x2'])
X2 = pd.DataFrame(np.random.multivariate_normal([m[1][0], m[1][1]], [[s,0],[0,s]], n), columns=['x1','x2'])
plt.figure(figsize = (8, 8))
#Plot data and any hyper-plane that separates the data
plt.plot(X1['x1'], X1['x2'],'r.')
plt.plot(X2['x1'], X2['x2'],'b+');
The issue with looking for just any separating line is that an infinite number of lines can qualify, even when the gap between the classes is quite small. The plot below shows the same data along with several qualifying lines.
fig = plt.figure(figsize = (8, 8))
ax = fig.add_subplot(111)
#Plot data and any hyper-plane that separates the data
plt.plot(X1['x1'], X1['x2'],'r.')
plt.plot(X2['x1'], X2['x2'],'b+')
#We'll use a brute-force search for this, starting from an intelligent initial guess.
#For this part of the code, we use the hyperplane with equation X1 * ws = t. This differs
#from the implementation of bd.plot_dec_line by the sign of t.
success = 0; ws = np.array([1, 1]); t = 1
while (success < 10):
    #Randomly perturb the starting point and check whether it separates the two classes
    w_i = ws + np.random.normal(0, 1, 2); t_i = t + np.random.normal(0, 1, 1)
    if not ((sum((X1.dot(w_i) - t_i) > 0)) or (sum((X2.dot(w_i) - t_i) < 0))):
        bd.plot_dec_line(X1['x1'].min(), X2['x1'].max(), w_i[0], w_i[1], -1 * t_i, 'k-', 't')
        success += 1
ax.set_xlim(0, 1)
ax.set_ylim(0, 1);
With so many hyperplane options, we need a principled way to choose an optimal separating line. Intuitively, we want a line that is simultaneously as far away from the positive instances as it is from the negative instances. In a sense, we want it to be in the middle of the empty region separating the classes. We can formalize this by searching for a hyperplane that maximizes the margin between the positive and negative classes.
Some Definitions
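Write the separating hyper-plane as $W \cdot X + t = 0$, where $W$ is the normal vector and $t$ the offset (matching the $W \cdot X_i + t$ form used later in this notebook). For any point $X_i$, a quick sketch of its signed distance $d(X_i)$ to the hyper-plane is the projection of $X_i - Q$ onto the unit normal, where $Q$ is an arbitrary point on the hyper-plane:

$$ d(X_i) = \frac{W \cdot (X_i - Q)}{\|W\|_2} = \frac{W \cdot X_i - W \cdot Q}{\|W\|_2} = \frac{W \cdot X_i + t}{\|W\|_2} $$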
The last bit comes from the fact that $W \cdot Q = - t$ since $Q$ is on the line.
Constraining the Problem
By adding constraints to the problem, we make it easier to solve. Let $M^+$ and $M^-$ be the distances from the closest positive and negative points to the hyper-plane. We are free to rescale $t$, $\|W\|_2$, $M^+$ and $M^-$, so we do the following:
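Concretely, we rescale $W$ and $t$ so that the closest point on each side lies on the plane $W \cdot X + t = \pm 1$ (the standard normalization; the exact scale is a free choice). The separation requirement then becomes

$$ y_i \, (W \cdot X_i + t) \ge 1 \quad \forall i $$

and the margin distances are $M^+ = M^- = \frac{1}{\|W\|_2}$, so the total margin is $\frac{2}{\|W\|_2}$.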
With perfectly separable data, we then define the following objective function.
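Since maximizing $\frac{2}{\|W\|_2}$ is the same as minimizing $\|W\|_2$, the hard-margin problem is usually written as

$$ \min_{W,\,t} \ \frac{1}{2}\|W\|_2^2 \quad \text{subject to} \quad y_i \, (W \cdot X_i + t) \ge 1 \ \ \forall i $$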
Now let's stop beating around the bush and actually build one! (One important tip: with SVMs we generally encode a binary outcome as $\{-1, 1\}$ rather than $\{0, 1\}$.)
from sklearn import svm
import course_utils as bd
import pandas as pd
importlib.reload(bd)
m = [[0.25,0.25],[1.75,1.75]]; s = 0.05; n=20
X1=pd.DataFrame(np.random.multivariate_normal([m[0][0],m[0][1]],[[s,0],[0,s]],n), columns=['x1','x2'])
X2=pd.DataFrame(np.random.multivariate_normal([m[1][0],m[1][1]],[[s,0],[0,s]],n), columns=['x1','x2'])
X1['Y'] = -1 * np.ones(X1.shape[0]); X2['Y'] = 1 * np.ones(X2.shape[0])
dat = pd.concat([X1, X2], ignore_index=True)
my_svm = svm.SVC(kernel='linear', C=100)   #a large C keeps the margin essentially hard on separable data (C is discussed below)
my_svm.fit(dat[['x1','x2']], dat['Y'])
bd.plotSVM(dat[['x1','x2']], dat['Y'], my_svm)
In the above we plot the optimal linear hyper-plane $W$ and also the hyper-planes that define the margin. Circled are the support vectors.
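Beyond the plot, the fitted sklearn model exposes the support vectors and the learned hyper-plane directly (the attribute names below are sklearn's own for SVC with a linear kernel):
#Inspect the fitted model: W, t, and the support vectors that were circled above
print('W = {}, t = {}'.format(my_svm.coef_[0], my_svm.intercept_[0]))
print('Support vectors per class: {}'.format(my_svm.n_support_))
print(my_svm.support_vectors_)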
Once we have found $W$ and $t$, classification of a point $X$ is defined as:
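$$ class(X) = \mathrm{sign}(W \cdot X + t) $$

that is, we simply check which side of the hyper-plane $X$ falls on; the magnitude $\frac{|W \cdot X + t|}{\|W\|_2}$ tells us how far from the boundary it is.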
In the above scenarios we identified an optimal separating hyper-plane for data that is cleanly separated by a decision boundary. This is seldom the case in real data, so we need to adapt our formulation to allow for such cases.
Let $\xi_i = [1 - y_i(W\cdot X_i+t)]_+$ be the amount by which a point $X_i$ falls on the wrong side of its margin plane $y_i(W\cdot X_i+t)=1$, where $[1-z]_+ = \max(0, 1-z)$. This last function is also known as the hinge function. We can then redefine our optimization problem to allow for some misclassification of our training points. Specifically:
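Using these slack variables, the standard soft-margin problem is

$$ \min_{W,\,t,\,\xi} \ \frac{1}{2}\|W\|_2^2 + C \sum_i \xi_i \quad \text{subject to} \quad y_i \, (W \cdot X_i + t) \ge 1 - \xi_i, \ \ \xi_i \ge 0 \ \ \forall i $$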
It is often useful to write this as:
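$$ \min_{W,\,t} \ \frac{1}{2}\|W\|_2^2 + C \sum_i \big[\, 1 - y_i \, (W \cdot X_i + t) \,\big]_+ $$

an unconstrained problem: the hinge loss on the training points plus a penalty on $\|W\|_2$.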
When we presented the soft-margin version of the SVM, we also introduced a new term $C$, which controls the amount of error we can tolerate (or, conversely, the size of the margin). $C$ is thus a free parameter that we must choose to get the right fit. One way to think about it: as $C$ increases, the second term in our objective function gets more weight, so we become less tolerant of training errors (and we thus decrease the size of the margin).
from sklearn import svm
import course_utils as bd
import pandas as pd
importlib.reload(bd)
#Generate noisier data than the last example
m = [[0.25,0.25],[1.25,1.25]]; s = 0.35; n=20
X1=pd.DataFrame(np.random.multivariate_normal([m[0][0], m[0][1]], [[s,0], [0,s]], n), columns=['x1','x2'])
X2=pd.DataFrame(np.random.multivariate_normal([m[1][0] ,m[1][1]], [[s,0], [0,s]], n), columns=['x1','x2'])
X1['Y'] = -1 * np.ones(X1.shape[0]); X2['Y'] = 1 * np.ones(X2.shape[0])
dat = pd.concat([X1, X2], ignore_index=True)
#Now plot for different levels of C
fig = plt.figure(figsize = (12, 12))
c_s = [ 1000, 1, 10e-2, 10e-4 ]
for i, c in enumerate(c_s):
    my_svm = svm.SVC(kernel='linear', C=c)
    my_svm.fit(dat[['x1','x2']], dat['Y'])
    w = my_svm.coef_[0]
    t = my_svm.intercept_[0]
    if t < 0:
        sign = '-'
    else:
        sign = '+'
    ax = fig.add_subplot(2, 2, i+1)
    plt.title('C={} : f=<{},{}> X {} {}'.format(c, np.round(w[0],1), np.round(w[1],1), sign, np.abs(np.round(t,1))))
    bd.plotSVM(dat[['x1','x2']], dat['Y'], my_svm)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
As we increase $C$, we can see that the norm of $W$ increases and the number of support vectors identified shrinks.
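We can check this numerically by refitting on the same data and printing $\|W\|_2$ and the support-vector count for each value of $C$ (a quick sketch, reusing dat and c_s from the cell above):
#Refit for each C and report the norm of W and the number of support vectors
for c in c_s:
    fit = svm.SVC(kernel='linear', C=c).fit(dat[['x1','x2']], dat['Y'])
    print('C={}: ||W||_2={}, #support vectors={}'.format(
            c, np.round(np.linalg.norm(fit.coef_[0]), 2), len(fit.support_)))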
Both SVM and Logistic Regression find linear separating hyper-planes of the form $class(X)=f(W\cdot X+t)$.
from sklearn import svm, linear_model
import course_utils as bd
from sklearn.metrics import confusion_matrix
import pandas as pd
importlib.reload(bd)
#Generate training and testing data from the same distribution
m = [[0.25,0.25],[1.25,1.25]]; s = 2; n=20
def makeData(m, s, n):
    X1 = pd.DataFrame(np.random.multivariate_normal([m[0][0],m[0][1]], [[s,0],[0,s]], n), columns=['x1','x2'])
    X2 = pd.DataFrame(np.random.multivariate_normal([m[1][0],m[1][1]], [[s,0],[0,s]], n), columns=['x1','x2'])
    X1['Y'] = -1 * np.ones(X1.shape[0])
    X2['Y'] = 1 * np.ones(X2.shape[0])
    return pd.concat([X1, X2], ignore_index=True)
train = makeData(m, s, n)
test = makeData(m, s, n)
#Fit both types of models
logreg = linear_model.LogisticRegression(C=1e30)   #very large C => effectively unregularized
logreg.fit(train[['x1','x2']], (train['Y']+1)/2)   #map the {-1,1} labels to {0,1} for LR
my_svm = svm.SVC(kernel='linear')
my_svm.fit(train[['x1','x2']], train['Y'])
#Get accuracy
def getACC(X_t, Y_t, mod):
    cm = confusion_matrix(Y_t, mod.predict(X_t))
    return (cm[0][0] + cm[1][1]) / float(np.sum(cm))   #accuracy = (TP + TN) / total
acc_svm = getACC(test[['x1','x2']], test['Y'], my_svm)
acc_lr = getACC(test[['x1','x2']], (test['Y']+1)/2, logreg)
#Plot them
w_svm = my_svm.coef_[0]; a_svm = -w_svm[0] / w_svm[1]; t_svm = my_svm.intercept_[0];
w_lr = logreg.coef_[0]; a_lr = -w_lr[0] / w_lr[1]; t_lr = logreg.intercept_[0];
xx = np.linspace(test[['x1','x2']].iloc[:,0].min(), test[['x1','x2']].iloc[:,0].max())
yy_svm = a_svm * xx - (t_svm) / w_svm[1]
yy_lr = a_lr * xx - (t_lr) / w_lr[1]
fig = plt.figure(figsize = (8, 8))
plt.plot(test[(test['Y']==-1)].iloc[:,0], test[(test['Y']==-1)].iloc[:,1],'r.')
plt.plot(test[(test['Y']==1)].iloc[:,0], test[(test['Y']==1)].iloc[:,1],'b+')
plt.plot(xx, yy_svm, label='SVM({})'.format(acc_svm))
plt.plot(xx, yy_lr, label='LR({})'.format(acc_lr))
plt.legend()
We can see that the two classifiers behave very similarly when we are only looking for a linear decision boundary. The choice between them usually comes down to familiarity and to which model's specific properties we need to exploit (for example, class probabilities from logistic regression, or the margin, support-vector, and kernel machinery of the SVM).