!date
Wed Jan 15 22:45:24 EST 2014
from pykalman.classifier import GenerativeBayes
import pandas as pd
import numpy as np
rnorm = np.random.normal
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.qda import QDA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GMM
from sklearn.cross_validation import train_test_split
%matplotlib inline
import matplotlib.pylab as plt
After seeing this post I wanted to perform the same analysis using scikit-learn. The classifiers used are given below with their parameters. I do not have a Neural Network like the one shown in the post, but I added a few extras. The GenerativeBayes classifier is based on commits for this issue; it trains a Gaussian Mixture Model per class. One thing this notebook shows that I did not see in the post is an out-of-sample score for each classifier, shown on each plot.
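Since GenerativeBayes comes from an in-progress branch rather than a scikit-learn release, here is a minimal sketch of the same idea: fit one GMM per class and classify by the largest log p(x|c) + log p(c). It assumes the old sklearn.mixture.GMM API, where score(X) returns per-sample log-likelihoods; the class name SimpleGenerativeBayes is my own illustration, not the actual implementation.
# Minimal sketch of a per-class GMM classifier in the spirit of GenerativeBayes.
# Assumption: old sklearn.mixture.GMM API where score(X) returns per-sample
# log-likelihoods. SimpleGenerativeBayes is a hypothetical name, not library code.
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.mixture import GMM

class SimpleGenerativeBayes(BaseEstimator, ClassifierMixin):
    def __init__(self, n_components=1):
        self.n_components = n_components

    def fit(self, X, y):
        X, y = np.asarray(X), np.asarray(y)
        self.classes_ = np.unique(y)
        self.models_ = []   # one density model p(x|c) per class
        self.priors_ = []   # class priors p(c)
        for c in self.classes_:
            g = GMM(n_components=self.n_components, covariance_type='full')
            g.fit(X[y == c])
            self.models_.append(g)
            self.priors_.append(np.mean(y == c))
        return self

    def predict(self, X):
        X = np.asarray(X)
        # log p(x|c) + log p(c) for each class; pick the largest per sample
        log_post = np.array([g.score(X) + np.log(p)
                             for g, p in zip(self.models_, self.priors_)])
        return self.classes_[np.argmax(log_post, axis=0)]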
classifiers = [
    DecisionTreeClassifier(),
    KNeighborsClassifier(3),
    LogisticRegression(),
    SVC(kernel="rbf"),
    AdaBoostClassifier(),
    GenerativeBayes(GMM(n_components=1, covariance_type='full', init_params='wc', n_iter=20)),
    GenerativeBayes(GMM(n_components=2, covariance_type='full', init_params='wc', n_iter=20)),
    QDA(),
    RandomForestClassifier()]
clf_names = [
    'Decision Tree',
    'K Neighbors',
    'Logistic Regression',
    'SVC (Gaussian)',
    'Ada Boost',
    'GMM (1 component)',
    'GMM (2 component)',
    'QDA',
    'Random Forest']
def plot_results(classifiers, df):
    plt.figure(figsize=(14, 14))
    X = df[['x', 'y']]
    Y = df['label']
    # Hold out 40% of the data so each classifier gets an out-of-sample score
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.4)
    n_classes = len(Y.unique())
    plot_colors = "brym"
    plot_step = 0.02
    x_min, x_max = X.ix[:, 0].min() - 1, X.ix[:, 0].max() + 1
    y_min, y_max = X.ix[:, 1].min() - 1, X.ix[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    for j, clf in enumerate(classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        ax = plt.subplot(4, 3, j + 1)
        # Predict on the mesh grid to draw the decision surface
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
        # Plot the training points
        for i, color in zip(range(n_classes), plot_colors):
            plt.scatter(X[Y==i].x, X[Y==i].y, c=color, label=i, cmap=plt.cm.Paired)
        # Out-of-sample score in the lower-right corner of each plot
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        plt.title(clf_names[j])
# Simple Linearly Separated
N = 50
p1 = pd.DataFrame(np.hstack((rnorm(loc=2.0, scale=0.5, size=(N,1)),
                             rnorm(loc=2.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p1['label'] = 0
p2 = pd.DataFrame(np.hstack((rnorm(loc=1.0, scale=0.5, size=(N,1)),
                             rnorm(loc=1.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p2['label'] = 1
df = pd.concat([p1, p2])
plot_results(classifiers, df)
# 3 Class Linearly Separated
N = 50
p1 = pd.DataFrame(np.hstack((rnorm(loc=3.0, scale=0.5, size=(N,1)),
                             rnorm(loc=3.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p1['label'] = 0
p2 = pd.DataFrame(np.hstack((rnorm(loc=2.0, scale=0.5, size=(N,1)),
                             rnorm(loc=2.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p2['label'] = 1
p3 = pd.DataFrame(np.hstack((rnorm(loc=1.0, scale=0.5, size=(N,1)),
                             rnorm(loc=1.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p3['label'] = 2
df = pd.concat([p1, p2, p3])
plot_results(classifiers, df)
# XOR pattern (simple)
N = 50
p1 = pd.DataFrame(np.hstack((rnorm(loc=1.0, scale=0.5, size=(N,1)),
                             rnorm(loc=1.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p1['label'] = 0
p2 = pd.DataFrame(np.hstack((rnorm(loc=-1.0, scale=0.5, size=(N,1)),
                             rnorm(loc=1.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p2['label'] = 1
p3 = pd.DataFrame(np.hstack((rnorm(loc=-1.0, scale=0.5, size=(N,1)),
                             rnorm(loc=-1.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p3['label'] = 0
p4 = pd.DataFrame(np.hstack((rnorm(loc=1.0, scale=0.5, size=(N,1)),
                             rnorm(loc=-1.0, scale=0.5, size=(N,1)))),
                  columns=['x','y'])
p4['label'] = 1
df = pd.concat([p1, p2, p3, p4])
plot_results(classifiers, df)