#!/usr/bin/env python
# coding: utf-8
# ## Simulate expected misclassification rate
# In[2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import multivariate_normal
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_formats = {'retina',}")
plt.style.use('seaborn-white')  # renamed to 'seaborn-v0_8-white' in matplotlib >= 3.6
# #### Model used to generate two-class data:
#
# $y_i = 0,\quad x_i \sim N_{10}(0, I_{10})$
#
# $y_i = 1,\quad x_i \sim N_{10}(\mu, I_{10})\,$ with $\mu = (1,1,1,1,1,0,0,0,0,0)$
#
# $x_i \in \mathbb{R}^{10}$ normally distributed
#
# $y_i$ equally divided between the two classes (balanced dataset)
#
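# #### Bayes error rate (theoretical lower bound)
#
# With equal priors and a shared identity covariance, the Bayes error rate for this model is $\Phi(-\|\mu\|/2)$, and here $\|\mu\| = \sqrt{5}$. A quick computation gives a reference point for the simulated rates below:
# In[ ]:
from scipy.stats import norm
print('Bayes error rate: {:.4f}'.format(norm.cdf(-np.sqrt(5) / 2)))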
# In[3]:
def simulate_clf_error(clf, train_sample_n=100, test_sample_n=2000):
    # _1_ Generate a balanced training sample and fit the classifier
    X_0 = multivariate_normal(mean=np.repeat(0, 10), cov=np.identity(10)).rvs(train_sample_n//2)
    X_1 = multivariate_normal(mean=np.repeat([1, 0], 5), cov=np.identity(10)).rvs(train_sample_n//2)
    X_train = np.r_[X_0, X_1]
    y_train = np.repeat([0, 1], train_sample_n//2)
    clf.fit(X_train, y_train)
    # _2_ Generate a large test set and return the classifier's error rate
    X_test_0 = multivariate_normal(mean=np.repeat(0, 10), cov=np.identity(10)).rvs(test_sample_n//2)
    X_test_1 = multivariate_normal(mean=np.repeat([1, 0], 5), cov=np.identity(10)).rvs(test_sample_n//2)
    X_test = np.r_[X_test_0, X_test_1]
    y_test = np.repeat([0, 1], test_sample_n//2)
    return 1 - clf.score(X_test, y_test)
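# A single call sanity-checks the helper before running the full simulation; the error should land in the same ballpark as the Bayes rate computed above:
# In[ ]:
np.random.seed(0)  # seed only for a reproducible spot check
print('Spot check, logistic regression: {:.4f}'.format(simulate_clf_error(LogisticRegression(C=100))))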
# #### Run simulations
# In[7]:
repeats = 1000
svm_radial = [simulate_clf_error(SVC(kernel='rbf')) for _ in range(repeats)]
svm_linear = [simulate_clf_error(SVC(kernel='linear')) for _ in range(repeats)]
log_regr = [simulate_clf_error(LogisticRegression(C=100)) for _ in range(repeats)]
# #### Average error rate
# In[8]:
print('SVM - radial kernel: {:.4f}'.format(np.mean(svm_radial)))
print('SVM - linear kernel: {:.4f}'.format(np.mean(svm_linear)))
print('Logistic regression: {:.4f}'.format(np.mean(log_regr)))
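# The means alone hide the run-to-run spread; a quick summary with standard deviations makes the comparison clearer:
# In[ ]:
for name, errs in [('SVM - radial kernel', svm_radial),
                   ('SVM - linear kernel', svm_linear),
                   ('Logistic regression', log_regr)]:
    print('{}: mean {:.4f}, std {:.4f}'.format(name, np.mean(errs), np.std(errs)))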
# In[9]:
plt.plot(svm_radial, 'g', alpha=0.4, label='SVM Radial')
plt.plot(svm_linear, 'r', alpha=0.4, label='SVM Linear')
plt.plot(log_regr, 'b', alpha=0.4, label='Logistic Regression')
# Horizontal lines mark each classifier's mean error rate across all runs
plt.hlines(np.mean(np.c_[svm_radial, svm_linear, log_regr], axis=0), 0, repeats, colors=['g', 'r', 'b'])
plt.xlabel('Simulation')
plt.ylabel('Error rate')
plt.title('Simulation: expected misclassification rates')
plt.legend();
# In[10]:
fig, (ax1, ax2, ax3) = plt.subplots(1,3, sharey=True, figsize=(18,5))
ax1.plot(svm_radial, 'g', alpha=0.4, label='SVM Radial')
ax1.hlines(np.mean(svm_radial), 0, repeats, colors='g')
ax1.set_ylabel('Error rate')
ax2.plot(svm_linear, 'r', alpha=0.4, label='SVM Linear')
ax2.hlines(np.mean(svm_linear), 0, repeats, colors='r')
ax3.plot(log_regr, 'b', alpha=0.4, label='Logistic Regression')
ax3.hlines(np.mean(log_regr), 0, repeats, colors='b');
fig.suptitle('Simulation: expected misclassification rates', fontsize=16)
fig.subplots_adjust(wspace=0.02)
for ax in fig.axes:
    ax.set_xlabel('{} simulations'.format(repeats))
    ax.tick_params(labelbottom=False)
    ax.legend()
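# The line plots above show individual runs; a kernel density estimate of the same three samples (using the seaborn import from the setup cell) gives a more direct view of how the error distributions overlap:
# In[ ]:
fig, ax = plt.subplots(figsize=(8, 4))
sns.kdeplot(svm_radial, color='g', label='SVM Radial', ax=ax)
sns.kdeplot(svm_linear, color='r', label='SVM Linear', ax=ax)
sns.kdeplot(log_regr, color='b', label='Logistic Regression', ax=ax)
ax.set_xlabel('Error rate')
ax.set_title('Distribution of simulated error rates')
ax.legend();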