import numpy as np
import matplotlib.pyplot as pl  # pylab is deprecated; keep the pl alias used below
from scipy import stats
from sklearn.datasets import load_digits
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import ParameterGrid  # sklearn.grid_search was removed in sklearn 0.20
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
%matplotlib inline
digits = load_digits()
X = digits.data
y = digits.target
X, y = shuffle(X, y, random_state = 0)
print(X.shape, y.shape)
(1797, 64) (1797,)
y30 = y.copy()
y30[30:] = -1  # keep labels for the first 30 shuffled samples only; -1 marks "unlabeled"
norm_X = normalize(X)  # row-normalized copy of the features (not used below)
# n_components was 150 in the original run; with only 64 input features this keeps
# all 64 components anyway, and newer sklearn rejects n_components > n_features
pca_X = PCA(n_components=64, whiten=False).fit_transform(X)
print(pca_X.shape)
(1797, 64)
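As a quick sanity check on the semi-supervised setup (this cell is my addition, not part of the original run), we can confirm that only the first 30 shuffled samples keep their labels:

# added sanity check: 30 labeled points, 1767 marked unlabeled (-1)
print((y30 != -1).sum(), (y30 == -1).sum())
print(np.unique(y30[:30]))  # which digit classes the labeled pool covers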
## Label Spreading
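For reference (my summary of Zhou et al., 2004, the algorithm behind sklearn's `LabelSpreading`): starting from the initial label matrix $Y$, the method iterates

$$F^{(t+1)} = \alpha\, S\, F^{(t)} + (1 - \alpha)\, Y, \qquad S = D^{-1/2} W D^{-1/2},$$

where $W$ holds the RBF affinities $W_{ij} = \exp(-\gamma \lVert x_i - x_j \rVert^2)$ and $D$ is its degree matrix. So `alpha` is the clamping factor (how much a point trusts its neighbors over its initial label) and `gamma` sets the kernel width, which is why the grid below sweeps both.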
train_X = pca_X
gammas = [0.0005, 0.001, 0.05, 0.1, 0.5]
#gammas = [1, 5, 10, 15, 20, 25, 30, 35, 40]
# NB: newer sklearn releases require LabelSpreading's alpha to lie strictly
# inside the open interval (0, 1), so 1.0 may need to be dropped from this grid
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
param_grid = ParameterGrid(dict(gamma=gammas, alpha=alphas))
ls = LabelSpreading()
scores = {}
for param in param_grid:
    alpha, gamma = param['alpha'], param['gamma']
    ls.set_params(alpha=alpha, gamma=gamma)
    ls.fit(train_X, y30)  # fit on 30 labeled + 1767 unlabeled points
    scores[(alpha, gamma)] = ls.score(train_X, y)  # score against the full ground truth
    print((alpha, gamma), scores[(alpha, gamma)])
(0.1, 0.0005) 0.195882025598
(0.1, 0.001) 0.200890372844
(0.1, 0.05) 0.831942125765
(0.1, 0.1) 0.844741235392
(0.1, 0.5) 0.841958820256
(0.2, 0.0005) 0.195882025598
(0.2, 0.001) 0.198664440735
(0.2, 0.05) 0.838619922092
(0.2, 0.1) 0.844741235392
(0.2, 0.5) 0.843071786311
(0.3, 0.0005) 0.195882025598
(0.3, 0.001) 0.196438508625
(0.3, 0.05) 0.843071786311
(0.3, 0.1) 0.842515303283
(0.3, 0.5) 0.843628269338
(0.4, 0.0005) 0.194769059544
(0.4, 0.001) 0.196438508625
(0.4, 0.05) 0.844184752365
(0.4, 0.1) 0.842515303283
(0.4, 0.5) 0.843628269338
(0.5, 0.0005) 0.194212576516
(0.5, 0.001) 0.196438508625
(0.5, 0.05) 0.844184752365
(0.5, 0.1) 0.841958820256
(0.5, 0.5) 0.844184752365
(0.6, 0.0005) 0.192543127435
(0.6, 0.001) 0.196438508625
(0.6, 0.05) 0.840845854201
(0.6, 0.1) 0.843071786311
(0.6, 0.5) 0.844184752365
(0.7, 0.0005) 0.183639398998
(0.7, 0.001) 0.196438508625
(0.7, 0.05) 0.842515303283
(0.7, 0.1) 0.842515303283
(0.7, 0.5) 0.844184752365
(0.8, 0.0005) 0.157484696717
(0.8, 0.001) 0.196438508625
(0.8, 0.05) 0.846410684474
(0.8, 0.1) 0.844741235392
(0.8, 0.5) 0.84529771842
(0.9, 0.0005) 0.0968280467446
(0.9, 0.001) 0.195882025598
(0.9, 0.05) 0.846967167501
(0.9, 0.1) 0.847523650529
(0.9, 0.5) 0.84529771842
(1.0, 0.0005) 0.0968280467446
(1.0, 0.001) 0.0968280467446
(1.0, 0.05) 0.834168057874
(1.0, 0.1) 0.840845854201
(1.0, 0.5) 0.841402337229
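To pull the best setting out of the grid programmatically rather than eyeballing the heatmap below, a small sketch (my addition):

# added: argmax over the scores dict; per the printed scores above this
# selects (alpha, gamma) = (0.9, 0.1) with accuracy ~0.8475
best_alpha, best_gamma = max(scores, key=scores.get)
print(best_alpha, best_gamma, scores[(best_alpha, best_gamma)])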
score_mat = np.asarray([[scores[(alpha, gamma)] for gamma in gammas]
                        for alpha in alphas])
pl.matshow(score_mat)
pl.xticks(np.arange(len(gammas)), gammas, rotation = 45)
pl.yticks(np.arange(len(alphas)), alphas)
pl.xlabel('gamma')
pl.ylabel('alpha')
pl.colorbar()
## Find the Most Uncertain Targets by Entropy
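The uncertainty measure here is the Shannon entropy of each sample's predicted class distribution $p$ (my note, matching what `stats.entropy` computes):

$$H(p) = -\sum_{k} p_k \log p_k,$$

which is maximal when the spreader splits its probability mass evenly across the digits and zero when it is certain.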
ls.set_params(alpha=0.8, gamma=0.5)
ls.fit(train_X, y30)
pred = ls.predict_proba(train_X)  # class-membership distribution for every sample
ent = stats.entropy(pred.T)       # entropy is taken column-wise, hence the transpose
print(ent.shape)
(1797,)
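The original run emitted scipy RuntimeWarnings here (divide by zero in `log`) because `stats.entropy` hits $0 \cdot \log 0$ terms whenever a class probability is exactly zero; recent SciPy versions compute this silently. An equivalent warning-free NumPy version, as a sketch (my addition):

# added: row-wise entropy without touching log(0)
with np.errstate(divide='ignore', invalid='ignore'):
    terms = np.where(pred > 0, pred * np.log(pred), 0.0)
ent_manual = -terms.sum(axis=1)
print(np.allclose(ent, ent_manual))  # should be True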
uncertain_index = ent.argsort()[:-20:-1]  # indices of the 19 highest-entropy (most uncertain) predictions
print('prediction on uncertain cases', np.mean(ls.predict(train_X[uncertain_index])
                                               == y[uncertain_index]))
print('prediction overall', np.mean(ls.predict(train_X) == y))
prediction on uncertain cases 0.105263157895
prediction overall 0.84529771842
yhat = ls.predict(train_X)
# show the digit images the model is most unsure about
for i in uncertain_index:
    pl.figure(figsize=(4, 4))
    pl.imshow(X[i].reshape((8, 8)), cmap=pl.cm.gray)
    pl.title("predict=%i, true=%i" % (yhat[i], y[i]))
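`confusion_matrix` was imported at the top but never used; as an optional closing check (my addition), it summarizes which digits the spreader confuses across the whole set:

# added: rows are true digits, columns are predicted digits
print(confusion_matrix(y, yhat))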