#!/usr/bin/env python
# coding: utf-8

# In[201]:


import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.datasets import make_blobs, make_classification, make_moons, make_circles
from sklearn.metrics import accuracy_score
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


import seaborn as sns
sns.set(color_codes=True)
sns.set_style('whitegrid')
sns.set_context('paper', font_scale=2)
get_ipython().run_line_magic('config', "InlineBackend.print_figure_kwargs = {'bbox_inches':None}")


# # Some simple thought experiments
#
# Consider a situation where you are trying to learn a system that predicts whether an individual, if granted bail, will commit a crime (label = `yes` or +1) or not (label = `no` or -1). For simplicity, let us assume that we use only two input features, `num_past_offenses` (denoted as $x_1$) and `age` (denoted as $x_2$).
#
# Let us first look at **all** of the data. Of course, this is all made up.

# In[14]:


X = np.vstack([np.random.uniform(-1,1,500), np.random.uniform(-1,1,500)]).T


# In[111]:


# let us assume this is all the data
plt.scatter(X[:, 0], X[:, 1])
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.tight_layout()


# Now we add the label (+1 or -1). Let us assume that the true hypothesis (or decision boundary) is a line with the following equation:
#
# $$
# w_0 + w_1x_1 + w_2x_2 = 0
# $$
#
# where $w_0 = 0, w_1 = 1, w_2 = 1$. Thus, the decision boundary becomes:
#
# $$
# x_1 + x_2 = 0
# $$
#
# To draw the line, we express it in the $y = mx + c$ form:
#
# $$
# x_2 = -x_1
# $$

# In[112]:


# get the labels by thresholding the true decision function
X_i = np.hstack([np.ones((X.shape[0],1)), X])
w = np.array([[0],[1],[1]])
y = np.array((np.dot(X_i, w) > 0), dtype='int')
y[y == 0] = -1


# In[113]:


# all of the data, colored by label, with the true decision boundary
plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), s=24, cmap='jet')
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.plot(np.linspace(-1,1,10), -np.linspace(-1,1,10), '-k')
plt.tight_layout()


# ## Training phase
# During training, we are given only a sample of the data.

# In[116]:


traininds = np.random.permutation(X.shape[0])[0:100]
Xtrain = X[traininds,:]
ytrain = y[traininds,:]


# In[117]:


plt.scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain.flatten(), cmap='jet')


# ### Which line to choose?

# In[126]:


fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain.flatten(), cmap='jet')
ax[0].set_xticks([])
ax[0].set_yticks([])
ax[0].text(0,-1.2,'(a)')
ax[0].plot(np.linspace(-1,1,10), -1.3*np.linspace(-1,1,10)+0.1, '-k')
ax[0].set_ylim([-1,1])
ax[1].scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain.flatten(), cmap='jet')
ax[1].set_xticks([])
ax[1].set_yticks([])
ax[1].text(0,-1.2,'(b)')
ax[1].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)+0.5, '-k')
ax[1].set_ylim([-1,1])
ax[2].scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain.flatten(), cmap='jet')
ax[2].set_xticks([])
ax[2].set_yticks([])
ax[2].text(0,-1.2,'(c)')
ax[2].plot(np.linspace(-1,1,10), -0.7*np.linspace(-1,1,10)-0.3, '-k')
ax[2].set_ylim([-1,1])
plt.tight_layout()
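# Before asking which line to prefer, here is a minimal sketch of one way to compare the three
# candidate lines above: treat each line $x_2 = m x_1 + c$ as a decision boundary, predict +1
# above it and -1 below it, and measure agreement with the training labels. (The slopes and
# intercepts below are simply read off panels (a), (b), and (c).)

# In[ ]:


candidates = {'(a)': (-1.3, 0.1), '(b)': (-1.0, 0.5), '(c)': (-0.7, -0.3)}
for name, (m, c) in candidates.items():
    # label +1 if the point lies above the candidate line, -1 otherwise
    ypred = np.where(Xtrain[:, 1] - m * Xtrain[:, 0] - c > 0, 1, -1)
    print(name, 'training accuracy:', accuracy_score(ytrain.flatten(), ypred))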
# ### How about this?
#
# Let us assume that the training data is not *uniformly sampled*.

# In[154]:


# get the labels, keeping only points far enough away from the true boundary
X_i = np.hstack([np.ones((X.shape[0],1)), X])
z_hat = np.abs(np.dot(X_i, w))
sel_inds = np.where(z_hat > 0.3)[0]
X1 = X[sel_inds,:]
y1 = y[sel_inds]
traininds = np.random.permutation(X1.shape[0])[0:100]
Xtrain1 = X1[traininds,:]
ytrain1 = y1[traininds,:]


# In[156]:


alphaval = 0.0  # increase this to faintly show the full data set behind the training sample
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(Xtrain1[:, 0], Xtrain1[:, 1], c=ytrain1.flatten(), cmap='jet')
ax[0].scatter(X[:,0], X[:,1], alpha=alphaval, c='k')
ax[0].set_xticks([])
ax[0].set_yticks([])
ax[0].text(0,-1.2,'(a)')
ax[0].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10), '-k')
ax[0].set_ylim([-1,1])
ax[1].scatter(Xtrain1[:, 0], Xtrain1[:, 1], c=ytrain1.flatten(), cmap='jet')
ax[1].scatter(X[:,0], X[:,1], alpha=alphaval, c='k')
ax[1].set_xticks([])
ax[1].set_yticks([])
ax[1].text(0,-1.2,'(b)')
ax[1].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)-0.28, '-k')
ax[1].set_ylim([-1,1])
ax[2].scatter(Xtrain1[:, 0], Xtrain1[:, 1], c=ytrain1.flatten(), cmap='jet')
ax[2].scatter(X[:,0], X[:,1], alpha=alphaval, c='k')
ax[2].set_xticks([])
ax[2].set_yticks([])
ax[2].text(0,-1.2,'(c)')
ax[2].plot(np.linspace(-1,1,10), -np.linspace(-1,1,10)+0.28, '-k')
ax[2].set_ylim([-1,1])
plt.tight_layout()


# # Support Vector Machines

# ### Separable case

# In[340]:


# checking margins
X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
y[y == 0] = -1
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');


# In[349]:


# fit the model
clf = svm.SVC(kernel='linear', C=10)
clf.fit(X, y)

plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap='jet')

# plot the decision function
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# create grid to evaluate model
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = clf.decision_function(xy).reshape(XX.shape)

# plot decision boundary and margins
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])
# plot support vectors
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')
plt.show()


# In[343]:


clf.dual_coef_


# In[344]:


clf.support_vectors_


# In[206]:


clf.dual_coef_


# The `sklearn` implementation attaches the sign of the label to the $\alpha$s: `dual_coef_` stores $y_i\alpha_i$ rather than $\alpha_i$.

# In[221]:


# calculating w from the support vectors and their dual coefficients
w = np.sum(clf.dual_coef_.T*clf.support_vectors_, axis=0)
print(w)


# In[222]:


clf.coef_


# The key takeaway here is that while the dual optimization formulation of the SVM involves $N$ $\alpha$s (one per training example), only a few of them are non-zero. These correspond to the training examples that lie on (or, in the soft-margin case, inside) the margin, and are known as the **support vectors**.
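# A quick sanity check (a minimal sketch using the fitted `clf` from above): the recovered `w`
# together with `clf.intercept_` should reproduce `clf.decision_function`, and in this separable
# case the support vectors should sit on the margin, i.e. have decision values of magnitude ~1.

# In[ ]:


b = clf.intercept_[0]

# decision values computed by hand vs. those reported by sklearn
print(np.allclose(X.dot(w) + b, clf.decision_function(X)))

# support vectors should lie (approximately) on the margin
print(clf.decision_function(clf.support_vectors_))

# only a few of the N training points end up as support vectors
print('support vectors:', len(clf.support_), 'out of', X.shape[0], 'training points')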
# ### Non-separable case

# In[346]:


X, y = make_blobs(n_samples=100, centers=2, random_state=0, cluster_std=1.2)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');


# In[358]:


X, y = make_blobs(n_samples=100, centers=2, random_state=0, cluster_std=0.8)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

for axi, C in zip(ax, [10.0, 0.01]):
    clf = svm.SVC(kernel='linear', C=C)
    clf.fit(X, y)
    axi.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet')
    xlim = axi.get_xlim()
    ylim = axi.get_ylim()
    # create grid to evaluate model
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T
    Z = clf.decision_function(xy).reshape(XX.shape)
    # plot decision boundary and margins
    axi.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
                linestyles=['--', '-', '--'])
    # plot support vectors
    axi.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
                linewidth=1, facecolors='none', edgecolors='k')
    axi.set_title('C = {0:.1f}'.format(C), size=14)


# # Bias vs. Variance

# Consider the following experiment.

# In[350]:


#X, y = make_blobs(n_samples=1000, centers=2,
#                  random_state=0, cluster_std=1.8)
X, y = make_moons(n_samples=1000, noise=0.5, random_state=0)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet');


# We repeatedly sample points from the above data set and train a linear SVM classifier on each sample. Because of the randomness in sampling, the trained model will differ from run to run. We want to study whether the model is more stable when $C$ is high or when it is low.

# In[355]:


C = 10
fig, ax = plt.subplots(4, 2, figsize=(10, 10))
for i in range(4):
    for j in range(2):
        axi = ax[i, j]
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain, ytrain = X[sampleinds,:], y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        axi.scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain, s=50, cmap='jet')
        xlim = axi.get_xlim()
        ylim = axi.get_ylim()
        # create grid to evaluate model
        xx = np.linspace(xlim[0], xlim[1], 30)
        yy = np.linspace(ylim[0], ylim[1], 30)
        YY, XX = np.meshgrid(yy, xx)
        xy = np.vstack([XX.ravel(), YY.ravel()]).T
        Z = clf.decision_function(xy).reshape(XX.shape)
        # plot decision boundary and margins
        axi.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
                    linestyles=['--', '-', '--'])
        # plot support vectors
        axi.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
                    linewidth=1, facecolors='none', edgecolors='k')
        axi.set_title('C = {0:.1f}'.format(C), size=14)


# ## Testing the stability of models trained on different training sets, for different values of $C$

# In[152]:


var = []
accs = []
for C in np.linspace(0.1, 1, 20):
    coefs = []
    tr_ac = []
    for i in range(100):
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain, ytrain = X[sampleinds,:], y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        ypred = clf.predict(Xtrain)
        tr_ac.append(accuracy_score(ytrain, ypred))
        coefs.append(np.array([clf.coef_[0,0], clf.coef_[0,1]]))
    # total standard deviation of the learned coefficients across the 100 training sets
    var.append(np.sum(np.std(np.array(coefs), axis=0)))
    accs.append(np.mean(tr_ac))


# In[155]:


plt.plot(np.linspace(0.1, 1, 20), var)
plt.ylabel('Variance')
plt.xlabel('C')
plt.tight_layout()
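# The loop above also recorded the mean training accuracy for each $C$ in `accs`, but only the
# coefficient variance was plotted. A minimal sketch to look at that accuracy as well:

# In[ ]:


plt.plot(np.linspace(0.1, 1, 20), accs)
plt.ylabel('Mean training accuracy')
plt.xlabel('C')
plt.tight_layout()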
# # Studying impact of $C$ on performance

# In[119]:


# first generate all the data: a quadratic decision boundary in the original features
Xfull = np.vstack([np.random.uniform(-1,1,1000), np.random.uniform(-1,1,1000)]).T
X1 = np.hstack([np.ones((Xfull.shape[0],1)), Xfull,
                Xfull[:,0:1]**2 + Xfull[:,1:2]**2, Xfull[:,0:1]**2])
w = np.array([[1],[1],[1],[3],[2]])
d = np.dot(X1, w)
yfull = np.ones((Xfull.shape[0],1))
yfull[d < 2.5] = -1
plt.scatter(Xfull[:,0], Xfull[:,1], c=yfull.flatten(), cmap='jet')
yfull = yfull.flatten()


# In[136]:


# set aside some of it for future testing
sampleinds = np.random.permutation(Xfull.shape[0])
Xtest, ytest = Xfull[sampleinds[0:100],:], yfull[sampleinds[0:100]]
X, y = Xfull[sampleinds[100:],:], yfull[sampleinds[100:]]


# In[147]:


trainingaccuracy_m = []
testingaccuracy_m = []
trainingaccuracy_s = []
testingaccuracy_s = []
Cvals = np.linspace(0.1, 5, 20)
for C in Cvals:
    tr_ac = []
    te_ac = []
    for i in range(100):
        sampleinds = np.random.permutation(X.shape[0])[0:40]
        Xtrain, ytrain = X[sampleinds,:], y[sampleinds]
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(Xtrain, ytrain)
        # calculate accuracy on the training set
        ypred = clf.predict(Xtrain)
        tr_ac.append(accuracy_score(ytrain, ypred))
        # calculate accuracy on the testing set
        ypred = clf.predict(Xtest)
        te_ac.append(accuracy_score(ytest, ypred))
    trainingaccuracy_m.append(np.mean(tr_ac))
    trainingaccuracy_s.append(np.std(tr_ac))
    testingaccuracy_m.append(np.mean(te_ac))
    testingaccuracy_s.append(np.std(te_ac))


# In[148]:


plt.plot(Cvals, trainingaccuracy_m)
plt.plot(Cvals, testingaccuracy_m, 'r')
plt.ylabel('Accuracy')
plt.xlabel('C')
plt.legend(['Accuracy on Training Data', 'Accuracy on Testing Data'])
plt.tight_layout()


# # Kernel SVM

# In[164]:


np.random.seed(0)
X = np.random.randn(300, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors='k')


# In[198]:


# fit the model
clf = svm.SVC(kernel='rbf', gamma='auto', C=1)
clf.fit(X, Y)


# In[199]:


# sample points on a grid
xx, yy = np.meshgrid(np.linspace(-3, 3, 500), np.linspace(-3, 3, 500))

# plot the decision function for each point on the grid
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
           origin='lower', cmap=plt.cm.PuOr_r)
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles='dashed')
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors='k')
plt.xticks(())
plt.yticks(())
plt.axis([-3, 3, -3, 3])
plt.show()


# Exercise: study the impact of $\gamma$ and $C$ on the decision boundary.

# ## The RBF Kernel

# In[239]:


x1 = 0
x2 = np.linspace(-10, 10, 100)
k = np.zeros((x2.shape[0], 1))
for gamma in [0.1, 0.5, 0.9, 2]:
    for i in range(k.shape[0]):
        # RBF kernel k(x, x') = exp(-gamma * ||x - x'||^2), as parameterized in sklearn
        k[i] = np.exp(-gamma*((x1 - x2[i])**2))
    plt.plot(x2, k, label=r'$\gamma$=' + str(gamma))
plt.legend()


# In[211]:


X, Y = make_circles(n_samples=500, noise=0.3, factor=0.5, random_state=1)
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors='k')


# In[237]:


# fit the model (`degree` only applies to the polynomial kernel, so it is omitted here)
clf = svm.SVC(kernel='rbf', gamma=400)
clf.fit(X, Y)


# In[238]:


# sample points on a grid
xx, yy = np.meshgrid(np.linspace(-2, 2, 500), np.linspace(-2, 2, 500))

# plot the decision function for each point on the grid
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
           origin='lower', cmap=plt.cm.PuOr_r)
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles='dashed')
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors='k')
plt.xticks(())
plt.yticks(())
plt.axis([-2, 2, -2, 2])
plt.show()
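# The notebook leaves "study the impact of $\gamma$ and $C$" as an exercise; here is a minimal
# sketch of one way to start (the grids of $\gamma$ and $C$ values below are arbitrary choices):
# fit an RBF SVM on the circles data for each pair and compare training vs. held-out accuracy.

# In[ ]:


from sklearn.model_selection import train_test_split

Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=0.3, random_state=0)
for gamma in [0.1, 1, 10, 100]:
    for C in [0.1, 1, 10]:
        clf = svm.SVC(kernel='rbf', gamma=gamma, C=C)
        clf.fit(Xtr, Ytr)
        tr = accuracy_score(Ytr, clf.predict(Xtr))
        te = accuracy_score(Yte, clf.predict(Xte))
        print('gamma={:>5}, C={:>4}: train acc={:.2f}, test acc={:.2f}'.format(gamma, C, tr, te))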