#!/usr/bin/env python
# coding: utf-8

# In[201]:


import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.datasets import make_blobs, make_classification, make_moons, make_circles
from sklearn.metrics import accuracy_score
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


import seaborn as sns
sns.set(color_codes=True)
sns.set_style('whitegrid')
sns.set_context('paper', font_scale=2)
get_ipython().run_line_magic('config', "InlineBackend.print_figure_kwargs = {'bbox_inches':None}")


# # Some simple thought experiments
#
# Consider a situation where you are trying to learn a system that predicts whether an individual, if granted bail, will commit a crime (label = `yes` or +1) or not (label = `no` or -1). For simplicity, let us assume that we use only two input features, `num_past_offenses` (denoted as $x_1$) and `age` (denoted as $x_2$).
#
# Let us first look at **all** of the data. Of course, this is all made up.

# In[14]:


X = np.vstack([np.random.uniform(-1,1,500), np.random.uniform(-1,1,500)]).T


# In[111]:


# let us assume this is all the data
plt.scatter(X[:, 0], X[:, 1])
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.tight_layout()


# Now we add the label (+1 or -1). Let us assume that the true hypothesis (or decision boundary) is a line with the following equation:
#
# $$
# w_0 + w_1x_1 + w_2x_2 = 0
# $$
#
# where $w_0 = 0, w_1 = 1, w_2 = 1$. Thus, the decision boundary becomes:
#
# $$
# x_1 + x_2 = 0
# $$
#
# To draw the line, we express it in the $y = mx + c$ form:
#
# $$
# x_2 = -x_1
# $$

# In[112]:


# get the labels by thresholding the true decision function
X_i = np.hstack([np.ones((X.shape[0],1)), X])
w = np.array([[0],[1],[1]])
y = np.array((np.dot(X_i, w) > 0), dtype='int')
y[y == 0] = -1


# In[113]:


# all of the data, colored by label, with the true decision boundary
plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), s=24, cmap='jet')
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.plot(np.linspace(-1,1,10), -np.linspace(-1,1,10), '-k')
plt.tight_layout()


# ## Training phase
# During training, we are given only a sample of the data.

# In[116]:


traininds = np.random.permutation(X.shape[0])[0:100]
Xtrain = X[traininds,:]
ytrain = y[traininds,:]


# In[117]:


plt.scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain.flatten(), cmap='jet')


# ### Which line to choose?

# In[126]:


fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain.flatten(), cmap='jet')
ax[0].set_xticks([])
ax[0].set_yticks([])
ax[0].text(0,-1.2,'(a)')
ax[0].plot(np.linspace(-1,1,10), -1.3*np.linspace(-1,1,10)+0.1, '-k')
ax[0].set_ylim([-1,1])
ax[1].scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain.flatten(), cmap='jet')
ax[1].set_xticks([])
ax[1].set_yticks([])
ax[1].text(0,-1.2,'(b)')
ax[1].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)+0.5, '-k')
ax[1].set_ylim([-1,1])
ax[2].scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain.flatten(), cmap='jet')
ax[2].set_xticks([])
ax[2].set_yticks([])
ax[2].text(0,-1.2,'(c)')
ax[2].plot(np.linspace(-1,1,10), -0.7*np.linspace(-1,1,10)-0.3, '-k')
ax[2].set_ylim([-1,1])
plt.tight_layout()
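# Before asking which line to prefer, here is a minimal sketch of one way to compare the three
# candidate lines above: treat each line $x_2 = m x_1 + c$ as a decision boundary, predict +1
# above it and -1 below it, and measure agreement with the training labels. (The slopes and
# intercepts below are simply read off panels (a), (b), and (c).)

# In[ ]:


candidates = {'(a)': (-1.3, 0.1), '(b)': (-1.0, 0.5), '(c)': (-0.7, -0.3)}
for name, (m, c) in candidates.items():
    # label +1 if the point lies above the candidate line, -1 otherwise
    ypred = np.where(Xtrain[:, 1] - m * Xtrain[:, 0] - c > 0, 1, -1)
    print(name, 'training accuracy:', accuracy_score(ytrain.flatten(), ypred))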
# ### How about this?
#
# Let us assume that the training data is not *uniformly sampled*.

# In[154]:


# get the labels, keeping only points far enough away from the true boundary
X_i = np.hstack([np.ones((X.shape[0],1)), X])
z_hat = np.abs(np.dot(X_i, w))
sel_inds = np.where(z_hat > 0.3)[0]
X1 = X[sel_inds,:]
y1 = y[sel_inds]
traininds = np.random.permutation(X1.shape[0])[0:100]
Xtrain1 = X1[traininds,:]
ytrain1 = y1[traininds,:]


# In[156]:


alphaval = 0.0  # increase this to faintly show the full data set behind the training sample
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(Xtrain1[:, 0], Xtrain1[:, 1], c=ytrain1.flatten(), cmap='jet')
ax[0].scatter(X[:,0], X[:,1], alpha=alphaval, c='k')
ax[0].set_xticks([])
ax[0].set_yticks([])
ax[0].text(0,-1.2,'(a)')
ax[0].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10), '-k')
ax[0].set_ylim([-1,1])
ax[1].scatter(Xtrain1[:, 0], Xtrain1[:, 1], c=ytrain1.flatten(), cmap='jet')
ax[1].scatter(X[:,0], X[:,1], alpha=alphaval, c='k')
ax[1].set_xticks([])
ax[1].set_yticks([])
ax[1].text(0,-1.2,'(b)')
ax[1].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)-0.28, '-k')
ax[1].set_ylim([-1,1])
ax[2].scatter(Xtrain1[:, 0], Xtrain1[:, 1], c=ytrain1.flatten(), cmap='jet')
ax[2].scatter(X[:,0], X[:,1], alpha=alphaval, c='k')
ax[2].set_xticks([])
ax[2].set_yticks([])
ax[2].text(0,-1.2,'(c)')
ax[2].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)+0.28, '-k')
ax[2].set_ylim([-1,1])
plt.tight_layout()


# # Support Vector Machines

# ### Separable case

# In[340]:


# checking margins
X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
y[y == 0] = -1
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');


# In[349]:


# fit the model
clf = svm.SVC(kernel='linear', C=10)
clf.fit(X, y)

plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap='jet')

# plot the decision function
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# create grid to evaluate model
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = clf.decision_function(xy).reshape(XX.shape)

# plot decision boundary and margins
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])
# plot support vectors
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')
plt.show()


# In[343]:


clf.dual_coef_


# In[344]:


clf.support_vectors_


# In[206]:


clf.dual_coef_


# The `sklearn` implementation attaches the sign of the label to the $\alpha$s: `dual_coef_` stores $y_i\alpha_i$ rather than $\alpha_i$.

# In[221]:


# calculating w from the support vectors and their dual coefficients
w = np.sum(clf.dual_coef_.T*clf.support_vectors_, axis=0)
print(w)


# In[222]:


clf.coef_


# The key takeaway here is that while the dual optimization formulation of the SVM involves $N$ $\alpha$s (one per training example), only a few of them are non-zero. These correspond to the training examples that lie on (or, in the soft-margin case, inside) the margin, and are known as the **support vectors**.
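# A quick sanity check (a minimal sketch using the fitted `clf` from above): the recovered `w`
# together with `clf.intercept_` should reproduce `clf.decision_function`, and in this separable
# case the support vectors should sit on the margin, i.e. have decision values of magnitude ~1.

# In[ ]:


b = clf.intercept_[0]

# decision values computed by hand vs. those reported by sklearn
print(np.allclose(X.dot(w) + b, clf.decision_function(X)))

# support vectors should lie (approximately) on the margin
print(clf.decision_function(clf.support_vectors_))

# only a few of the N training points end up as support vectors
print('support vectors:', len(clf.support_), 'out of', X.shape[0], 'training points')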
# ### Non-separable case

# In[346]:


X, y = make_blobs(n_samples=100, centers=2, random_state=0, cluster_std=1.2)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');


# In[358]:


X, y = make_blobs(n_samples=100, centers=2, random_state=0, cluster_std=0.8)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

for axi, C in zip(ax, [10.0, 0.01]):
    clf = svm.SVC(kernel='linear', C=C)
    clf.fit(X, y)
    axi.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet')
    xlim = axi.get_xlim()
    ylim = axi.get_ylim()
    # create grid to evaluate model
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T
    Z = clf.decision_function(xy).reshape(XX.shape)
    # plot decision boundary and margins
    axi.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
                linestyles=['--', '-', '--'])
    # plot support vectors
    axi.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
                linewidth=1, facecolors='none', edgecolors='k')
    axi.set_title('C = {0:.1f}'.format(C), size=14)


# # Bias vs. Variance

# Consider the following experiment.

# In[350]:


#X, y = make_blobs(n_samples=1000, centers=2,
#                  random_state=0, cluster_std=1.8)
X, y = make_moons(n_samples=1000, noise=0.5, random_state=0)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');


# We repeatedly sample points from the above data set and train a linear SVM classifier on each sample. Because of the randomness in sampling, the trained model will differ from run to run. We want to study whether the model is more stable when $C$ is high or when it is low.

# In[355]:


C = 10
fig, ax = plt.subplots(4, 2, figsize=(10, 10))
for i in range(4):
    for j in range(2):
        axi = ax[i, j]
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain, ytrain = X[sampleinds,:], y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        axi.scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain, s=50, cmap='jet')
        xlim = axi.get_xlim()
        ylim = axi.get_ylim()
        # create grid to evaluate model
        xx = np.linspace(xlim[0], xlim[1], 30)
        yy = np.linspace(ylim[0], ylim[1], 30)
        YY, XX = np.meshgrid(yy, xx)
        xy = np.vstack([XX.ravel(), YY.ravel()]).T
        Z = clf.decision_function(xy).reshape(XX.shape)
        # plot decision boundary and margins
        axi.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
                    linestyles=['--', '-', '--'])
        # plot support vectors
        axi.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
                    linewidth=1, facecolors='none', edgecolors='k')
        axi.set_title('C = {0:.1f}'.format(C), size=14)


# ## Testing the stability of models trained on different training sets, for different values of $C$

# In[152]:


var = []
accs = []
for C in np.linspace(0.1, 1, 20):
    coefs = []
    tr_ac = []
    for i in range(100):
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain, ytrain = X[sampleinds,:], y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        ypred = clf.predict(Xtrain)
        tr_ac.append(accuracy_score(ytrain, ypred))
        coefs.append(np.array([clf.coef_[0,0], clf.coef_[0,1]]))
    # total standard deviation of the learned coefficients across the 100 training sets
    var.append(np.sum(np.std(np.array(coefs), axis=0)))
    accs.append(np.mean(tr_ac))


# In[155]:


plt.plot(np.linspace(0.1, 1, 20), var)
plt.ylabel('Variance')
plt.xlabel('C')
plt.tight_layout()
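# The loop above also recorded the mean training accuracy for each $C$ in `accs`, but only the
# coefficient variance was plotted. A minimal sketch to look at that accuracy as well:

# In[ ]:


plt.plot(np.linspace(0.1, 1, 20), accs)
plt.ylabel('Mean training accuracy')
plt.xlabel('C')
plt.tight_layout()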
# # Studying impact of $C$ on performance

# In[119]:


# first generate all the data: a quadratic decision boundary in the original features
Xfull = np.vstack([np.random.uniform(-1,1,1000), np.random.uniform(-1,1,1000)]).T
X1 = np.hstack([np.ones((Xfull.shape[0],1)), Xfull,
                Xfull[:,0:1]**2 + Xfull[:,1:2]**2, Xfull[:,0:1]**2])
w = np.array([[1],[1],[1],[3],[2]])
d = np.dot(X1, w)
yfull = np.ones((Xfull.shape[0],1))
yfull[d < 2.5] = -1
plt.scatter(Xfull[:,0], Xfull[:,1], c=yfull.flatten(), cmap='jet')
yfull = yfull.flatten()


# In[136]:


# set aside some of it for future testing
sampleinds = np.random.permutation(Xfull.shape[0])
Xtest, ytest = Xfull[sampleinds[0:100],:], yfull[sampleinds[0:100]]
X, y = Xfull[sampleinds[100:],:], yfull[sampleinds[100:]]


# In[147]:


trainingaccuracy_m = []
testingaccuracy_m = []
trainingaccuracy_s = []
testingaccuracy_s = []
Cvals = np.linspace(0.1, 5, 20)
for C in Cvals:
    tr_ac = []
    te_ac = []
    for i in range(100):
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain, ytrain = X[sampleinds,:], y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        # calculate accuracy on the training set
        ypred = clf.predict(Xtrain)
        tr_ac.append(accuracy_score(ytrain, ypred))
        # calculate accuracy on the testing set
        ypred = clf.predict(Xtest)
        te_ac.append(accuracy_score(ytest, ypred))
    trainingaccuracy_m.append(np.mean(tr_ac))
    trainingaccuracy_s.append(np.std(tr_ac))
    testingaccuracy_m.append(np.mean(te_ac))
    testingaccuracy_s.append(np.std(te_ac))


# In[148]:


plt.plot(Cvals, trainingaccuracy_m)
plt.plot(Cvals, testingaccuracy_m, 'r')
plt.ylabel('Accuracy')
plt.xlabel('C')
plt.legend(['Accuracy on Training Data', 'Accuracy on Testing Data'])
plt.tight_layout()


# # Kernel SVM

# In[164]:


np.random.seed(0)
X = np.random.randn(300, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors='k')


# In[198]:


# fit the model
clf = svm.SVC(kernel='rbf', gamma='auto', C=1)
clf.fit(X, Y)


# In[199]:


# sample points on a grid
xx, yy = np.meshgrid(np.linspace(-3, 3, 500), np.linspace(-3, 3, 500))

# plot the decision function for each point on the grid
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
           origin='lower', cmap=plt.cm.PuOr_r)
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles='dashed')
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors='k')
plt.xticks(())
plt.yticks(())
plt.axis([-3, 3, -3, 3])
plt.show()


# Exercise: study the impact of $\gamma$ and $C$ on the decision boundary.

# ## The RBF Kernel

# In[239]:


x1 = 0
x2 = np.linspace(-10, 10, 100)
k = np.zeros((x2.shape[0], 1))
for gamma in [0.1, 0.5, 0.9, 2]:
    for i in range(k.shape[0]):
        # RBF kernel k(x, x') = exp(-gamma * ||x - x'||^2), as parameterized in sklearn
        k[i] = np.exp(-gamma*((x1 - x2[i])**2))
    plt.plot(x2, k, label=r'$\gamma$=' + str(gamma))
plt.legend()


# In[211]:


X, Y = make_circles(n_samples=500, noise=0.3, factor=0.5, random_state=1)
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors='k')


# In[237]:


# fit the model (`degree` only applies to the polynomial kernel, so it is omitted here)
clf = svm.SVC(kernel='rbf', gamma=400)
clf.fit(X, Y)


# In[238]:


# sample points on a grid
xx, yy = np.meshgrid(np.linspace(-2, 2, 500), np.linspace(-2, 2, 500))

# plot the decision function for each point on the grid
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
           origin='lower', cmap=plt.cm.PuOr_r)
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles='dashed')
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors='k')
plt.xticks(())
plt.yticks(())
plt.axis([-2, 2, -2, 2])
plt.show()
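# The notebook leaves "study the impact of $\gamma$ and $C$" as an exercise; here is a minimal
# sketch of one way to start (the grids of $\gamma$ and $C$ values below are arbitrary choices):
# fit an RBF SVM on the circles data for each pair and compare training vs. held-out accuracy.

# In[ ]:


from sklearn.model_selection import train_test_split

Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=0.3, random_state=0)
for gamma in [0.1, 1, 10, 100]:
    for C in [0.1, 1, 10]:
        clf = svm.SVC(kernel='rbf', gamma=gamma, C=C)
        clf.fit(Xtr, Ytr)
        tr = accuracy_score(Ytr, clf.predict(Xtr))
        te = accuracy_score(Yte, clf.predict(Xte))
        print('gamma={:>5}, C={:>4}: train acc={:.2f}, test acc={:.2f}'.format(gamma, C, tr, te))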