import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.datasets import make_blobs, make_classification, make_moons, make_circles
from sklearn.metrics import accuracy_score
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
sns.set_style('whitegrid')
sns.set_context('paper',font_scale=2)
%config InlineBackend.print_figure_kwargs = {'bbox_inches':None}
Consider a situation where you are trying to learn a system that predicts whether an individual, if granted bail, will commit a crime (label = `yes`, or +1) or not commit a crime (label = `no`, or -1). For simplicity, let us assume that we are only using two features as inputs, `num_past_offenses` (denoted as $x_1$) and `age` (denoted as $x_2$).
Let us first look at all of the data. Of course, this data is entirely made up.
X = np.vstack([np.random.uniform(-1,1,500),np.random.uniform(-1,1,500)]).T
# let us assume this is all the data
plt.scatter(X[:, 0], X[:, 1])
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.tight_layout()
Now we add the label (+1 or -1). Let us assume that the true hypothesis (or the decision boundary) is a line with the following equation:
$$ w_0 + w_1x_1 + w_2x_2 = 0 $$
where $w_0 = 0, w_1 = 1, w_2 = 1$. Thus, the decision boundary becomes:
$$ x_1 + x_2 = 0 $$
Now of course, to draw the line, I will have to express it in the $y = mx + c$ format:
$$ x_2 = -x_1 $$
# get the labels
X_i = np.hstack([np.ones((X.shape[0],1)),X])
w = np.array([[0],[1],[1]])
y = np.array((np.dot(X_i,w) > 0),dtype='int')
y[y == 0] = -1
# let us assume this is all the data
plt.scatter(X[:, 0], X[:, 1],c=y.flatten(), s=24, cmap='jet')
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.plot(np.linspace(-1,1,10), -np.linspace(-1,1,10), '-k')
plt.tight_layout()
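The plot above hard-codes the special case $x_2 = -x_1$. For a general weight vector, the same rearrangement gives $x_2 = -(w_1 x_1 + w_0)/w_2$ (assuming $w_2 \neq 0$). Here is a minimal sketch of a helper that draws the boundary for any $w$; the function name boundary_x2 is just for illustration and is not used elsewhere in the notebook.
# sketch: boundary x2 values for a general weight vector w = [w0, w1, w2]
def boundary_x2(x1, w):
    w0, w1, w2 = np.asarray(w).flatten()
    return -(w1 * x1 + w0) / w2

x1_grid = np.linspace(-1, 1, 10)
plt.plot(x1_grid, boundary_x2(x1_grid, w), '-k')  # same line as above for w = [0, 1, 1]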
During training, we are given only a sample of the data.
traininds = np.random.permutation(X.shape[0])[0:100]
Xtrain = X[traininds,:]
ytrain = y[traininds,:]
plt.scatter(Xtrain[:, 0], Xtrain[:, 1],c=ytrain.flatten(), cmap='jet')
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(Xtrain[:, 0], Xtrain[:, 1],c=ytrain.flatten(), cmap='jet')
ax[0].set_xticks([])
ax[0].set_yticks([])
ax[0].text(0,-1.2,'(a)')
ax[0].plot(np.linspace(-1,1,10), -1.3*np.linspace(-1,1,10)+0.1, '-k')
ax[0].set_ylim([-1,1])
ax[1].scatter(Xtrain[:, 0], Xtrain[:, 1],c=ytrain.flatten(), cmap='jet')
ax[1].set_xticks([])
ax[1].set_yticks([])
ax[1].text(0,-1.2,'(b)')
ax[1].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)+0.5, '-k')
ax[1].set_ylim([-1,1])
ax[2].scatter(Xtrain[:, 0], Xtrain[:, 1],c=ytrain.flatten(), cmap='jet')
ax[2].set_xticks([])
ax[2].set_yticks([])
ax[2].text(0,-1.2,'(c)')
ax[2].plot(np.linspace(-1,1,10), -0.7*np.linspace(-1,1,10)-0.3, '-k')
ax[2].set_ylim([-1,1])
plt.tight_layout()
Now let us assume that the training data is not sampled uniformly; below we keep only the points that lie away from the true boundary, i.e., those with $|w_0 + w_1x_1 + w_2x_2| > 0.3$.
# get the labels
X_i = np.hstack([np.ones((X.shape[0],1)),X])
z_hat = np.abs(np.dot(X_i,w))
sel_inds = np.where(z_hat > 0.3)[0]
X1 = X[sel_inds,:]
y1 = y[sel_inds]
traininds = np.random.permutation(X1.shape[0])[0:100]
Xtrain1 = X1[traininds,:]
ytrain1 = y1[traininds,:]
alphaval=0.0
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(Xtrain1[:, 0], Xtrain1[:, 1],c=ytrain1.flatten(), cmap='jet')
ax[0].scatter(X[:,0],X[:,1],alpha=alphaval,c='k')
ax[0].set_xticks([])
ax[0].set_yticks([])
ax[0].text(0,-1.2,'(a)')
ax[0].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10), '-k')
ax[0].set_ylim([-1,1])
ax[1].scatter(Xtrain1[:, 0], Xtrain1[:, 1],c=ytrain1.flatten(), cmap='jet')
ax[1].scatter(X[:,0],X[:,1],alpha=alphaval,c='k')
ax[1].set_xticks([])
ax[1].set_yticks([])
ax[1].text(0,-1.2,'(b)')
ax[1].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)-0.28, '-k')
ax[1].set_ylim([-1,1])
ax[2].scatter(Xtrain1[:, 0], Xtrain1[:, 1],c=ytrain1.flatten(), cmap='jet')
ax[2].scatter(X[:,0],X[:,1],alpha=alphaval,c='k')
ax[2].set_xticks([])
ax[2].set_yticks([])
ax[2].text(0,-1.2,'(c)')
ax[2].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)+0.28, '-k')
ax[2].set_ylim([-1,1])
plt.tight_layout()
#checking margins
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=50, centers=2,
                  random_state=0, cluster_std=0.60)
y[y==0] = -1
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');
# fit the model
clf = svm.SVC(kernel='linear', C=10)
clf.fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap='jet')
# plot the decision function
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# create grid to evaluate model
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = clf.decision_function(xy).reshape(XX.shape)
# plot decision boundary and margins
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
linestyles=['--', '-', '--'])
# plot support vectors
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
linewidth=1, facecolors='none', edgecolors='k')
plt.show()
clf.dual_coef_
array([[-0.2672062, -0.7327938, 1. ]])
clf.support_vectors_
array([[0.44359863, 3.11530945], [2.33812285, 3.43116792], [2.06156753, 1.96918596]])
The `sklearn` implementation attaches the sign (the label $y_i$) to the $\alpha$s, i.e., `dual_coef_` holds $y_i \alpha_i$ for each support vector.
# calculating w
w = np.sum(clf.dual_coef_.T*clf.support_vectors_,axis=0)
print(w)
[ 0.22967329 -1.37758262]
clf.coef_
array([[ 0.22967329, -1.37758262]])
The key takeaway here is that while the dual optimization formulation of SVMs involves $N$ $\alpha$s, only a few of them are non-zero. These correspond to the training examples that lie on (or, in the soft-margin case, violate) the margin, and are known as the support vectors. The weight vector can then be recovered as $\mathbf{w} = \sum_i \alpha_i y_i \mathbf{x}_i$, which is exactly what the calculation above does.
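As a quick sanity check (not part of the original notebook), we can rebuild the decision function from the support vectors alone, $f(x) = \sum_i \alpha_i y_i \, x_i^\top x + b$, which for a linear kernel is just $w^\top x + b$, and compare it with `clf.decision_function`:
# sketch: reconstruct the linear decision function from the support vectors
f_manual = np.dot(X, w) + clf.intercept_   # w computed above from dual_coef_
f_sklearn = clf.decision_function(X)
print(np.allclose(f_manual, f_sklearn))    # expected: True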
X, y = make_blobs(n_samples=100, centers=2,
random_state=0, cluster_std=1.2)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');
X, y = make_blobs(n_samples=100, centers=2,
random_state=0, cluster_std=0.8)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for axi, C in zip(ax, [10.0, 0.01]):
    clf = svm.SVC(kernel='linear', C=C)
    clf.fit(X, y)
    axi.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet')
    xlim = axi.get_xlim()
    ylim = axi.get_ylim()
    # create grid to evaluate model
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T
    Z = clf.decision_function(xy).reshape(XX.shape)
    # plot decision boundary and margins
    axi.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
                linestyles=['--', '-', '--'])
    # plot support vectors
    axi.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
                linewidth=1, facecolors='none', edgecolors='k')
    axi.set_title('C = {0:g}'.format(C), size=14)
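A numerical way to see the effect of $C$ (a small addition, not in the original): count the support vectors at each setting; with the smaller $C$ the margin is softer and more training points become support vectors.
# sketch: number of support vectors for a harder vs. softer margin
for C in [10.0, 0.01]:
    clf = svm.SVC(kernel='linear', C=C).fit(X, y)
    print('C = {0:g}: {1} support vectors'.format(C, len(clf.support_)))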
Consider the following experiment.
#X, y = make_blobs(n_samples=1000, centers=2,
# random_state=0, cluster_std=1.8)
X, y = make_moons(n_samples=1000,noise=0.5, random_state=0)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');
We repeatedly sample points from the above data set and, each time, train a linear SVM classifier on the sample. Due to the randomness of the sampling, the trained model will differ from run to run. We want to study whether the model is more stable when $C$ is high or when it is low.
C = 10
fig, ax = plt.subplots(4, 2, figsize=(10, 10))
expind = 0
for i in range(4):
    for j in range(2):
        axi = ax[i,j]
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain,ytrain = X[sampleinds,:],y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        axi.scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain, s=50, cmap='jet')
        xlim = axi.get_xlim()
        ylim = axi.get_ylim()
        # create grid to evaluate model
        xx = np.linspace(xlim[0], xlim[1], 30)
        yy = np.linspace(ylim[0], ylim[1], 30)
        YY, XX = np.meshgrid(yy, xx)
        xy = np.vstack([XX.ravel(), YY.ravel()]).T
        Z = clf.decision_function(xy).reshape(XX.shape)
        # plot decision boundary and margins
        axi.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
                    linestyles=['--', '-', '--'])
        # plot support vectors
        axi.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
                    linewidth=1, facecolors='none', edgecolors='k')
        axi.set_title('C = {0:.1f}'.format(C), size=14)
var = []
accs = []
for C in np.linspace(0.1,1,20):
    coefs = []
    tr_ac = []
    for i in range(100):
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain,ytrain = X[sampleinds,:],y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        ypred = clf.predict(Xtrain)
        tr_ac.append(accuracy_score(ytrain,ypred))
        coefs.append(np.array([clf.coef_[0,0],clf.coef_[0,1]]))
    var.append(np.sum(np.std(np.array(coefs),axis=0)))
    accs.append(np.mean(tr_ac))
plt.plot(np.linspace(0.1,1,20),var)
plt.ylabel('Variance')
plt.xlabel('C')
plt.tight_layout()
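The loop above also recorded the mean training accuracy in `accs` but never plotted it; a quick optional sketch shows how it varies with $C$:
# sketch: mean training accuracy (recorded above) as a function of C
plt.plot(np.linspace(0.1,1,20), accs)
plt.ylabel('Mean training accuracy')
plt.xlabel('C')
plt.tight_layout()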
# first generate all data
Xfull = np.vstack([np.random.uniform(-1,1,1000),np.random.uniform(-1,1,1000)]).T
X1 = np.hstack([np.ones((Xfull.shape[0],1)),Xfull,Xfull[:,0:1]**2+Xfull[:,1:2]**2,Xfull[:,0:1]**2])
w = np.array([[1],[1],[1],[3],[2]])
d = np.dot(X1,w)
yfull = np.ones((Xfull.shape[0],1))
yfull[d < 2.5] = -1
plt.scatter(Xfull[:,0],Xfull[:,1],c=yfull.flatten(),cmap='jet')
yfull=yfull.flatten()
# set aside some of it for future testing
sampleinds = np.random.permutation(Xfull.shape[0])
Xtest,ytest = Xfull[sampleinds[0:100],:], yfull[sampleinds[0:100]]
X,y = Xfull[sampleinds[100:],:], yfull[sampleinds[100:]]
trainingaccuracy_m = []
testingaccuracy_m = []
trainingaccuracy_s = []
testingaccuracy_s = []
Cvals = np.linspace(0.1,5,20)
for C in Cvals:
    tr_ac = []
    te_ac = []
    for i in range(100):
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain,ytrain = X[sampleinds,:],y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        # calculate accuracy on training set
        ypred = clf.predict(Xtrain)
        tr_ac.append(accuracy_score(ytrain,ypred))
        # calculate accuracy on testing set
        ypred = clf.predict(Xtest)
        te_ac.append(accuracy_score(ytest,ypred))
    trainingaccuracy_m.append(np.mean(tr_ac))
    trainingaccuracy_s.append(np.std(tr_ac))
    testingaccuracy_m.append(np.mean(te_ac))
    testingaccuracy_s.append(np.std(te_ac))
plt.plot(Cvals,trainingaccuracy_m)
plt.plot(Cvals,testingaccuracy_m,'r')
plt.ylabel('Accuracy')
plt.xlabel('C')
plt.legend(['Accuracy on Training Data','Accuracy on Testing Data'])
plt.tight_layout()
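The standard deviations collected in `trainingaccuracy_s` and `testingaccuracy_s` are not used above; an optional sketch adds them as $\pm 1$ standard deviation bands around the mean curves:
# sketch: same curves as above, with +/- one standard deviation shaded
tr_m, tr_s = np.array(trainingaccuracy_m), np.array(trainingaccuracy_s)
te_m, te_s = np.array(testingaccuracy_m), np.array(testingaccuracy_s)
plt.plot(Cvals, tr_m)
plt.fill_between(Cvals, tr_m - tr_s, tr_m + tr_s, alpha=0.2)
plt.plot(Cvals, te_m, 'r')
plt.fill_between(Cvals, te_m - te_s, te_m + te_s, color='r', alpha=0.2)
plt.ylabel('Accuracy')
plt.xlabel('C')
plt.legend(['Accuracy on Training Data','Accuracy on Testing Data'])
plt.tight_layout()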
np.random.seed(0)
X = np.random.randn(300, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
edgecolors='k')
# fit the model
clf = svm.SVC(kernel='rbf',gamma='auto',C=1)
clf.fit(X, Y)
# sample points on a grid
xx, yy = np.meshgrid(np.linspace(-3, 3, 500),
np.linspace(-3, 3, 500))
# plot the decision function for each datapoint on the grid
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.imshow(Z, interpolation='nearest',
extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
origin='lower', cmap=plt.cm.PuOr_r)
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
linestyles='dashed')
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
edgecolors='k')
plt.xticks(())
plt.yticks(())
plt.axis([-3, 3, -3, 3])
plt.show()
Study the impact of $\gamma$ and $C$
x1 = 0
x2 = np.linspace(-10,10,100)
for gamma in [0.1, 0.5, 0.9, 2]:
    # kernel value between x1 and every point in x2; a fresh array is created
    # on each pass so the previously plotted curves are not overwritten
    k = np.exp(-0.5 * gamma**2 * (x1 - x2)**2)
    plt.plot(x2, k, label=r'$\gamma$=' + str(gamma))
plt.legend()
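Note that scikit-learn parameterizes the RBF kernel as $k(x, x') = \exp(-\gamma \lVert x - x' \rVert^2)$, which differs slightly from the curve plotted above; a small sketch with `sklearn.metrics.pairwise.rbf_kernel` shows the same qualitative narrowing as $\gamma$ grows:
from sklearn.metrics.pairwise import rbf_kernel

# sketch: sklearn's own RBF kernel between the point x1 = 0 and the grid x2
for gamma in [0.1, 0.5, 0.9, 2]:
    k_sk = rbf_kernel(np.array([[0.0]]), x2.reshape(-1, 1), gamma=gamma)
    plt.plot(x2, k_sk.ravel(), label=r'$\gamma$=' + str(gamma))
plt.legend()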
X,Y = make_circles(n_samples=500, noise=0.3, factor=0.5, random_state=1)
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
edgecolors='k')
# fit the model
clf = svm.SVC(kernel='rbf', gamma=400)  # the degree parameter is ignored by the RBF kernel, so it is dropped here
clf.fit(X, Y)
# sample points on a grid
xx, yy = np.meshgrid(np.linspace(-2, 2, 500),
np.linspace(-2, 2, 500))
# plot the decision function for each datapoint on the grid
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.imshow(Z, interpolation='nearest',
extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
origin='lower', cmap=plt.cm.PuOr_r)
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
linestyles='dashed')
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
edgecolors='k')
plt.xticks(())
plt.yticks(())
plt.axis([-2, 2, -2, 2])
plt.show()
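To study the interplay of $\gamma$ and $C$ on this data more directly, here is a small optional sketch (not in the original) that reports training accuracy over a grid of values; very large $\gamma$ combined with large $C$ tends toward memorizing the training set, so held-out data would be needed to judge generalization.
# sketch: training accuracy of RBF SVMs over a small grid of gamma and C
for gamma in [0.1, 1, 10, 400]:
    for C in [0.1, 1, 10]:
        clf = svm.SVC(kernel='rbf', gamma=gamma, C=C).fit(X, Y)
        print('gamma = {0:g}, C = {1:g}: training accuracy = {2:.2f}'.format(
            gamma, C, clf.score(X, Y)))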