import numpy as np
import matplotlib.pyplot as pl  # pylab is deprecated; keep the pl alias used below
from scipy import stats
from sklearn.datasets import load_digits
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import ParameterGrid  # sklearn.grid_search was removed in sklearn 0.20
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
%matplotlib inline
digits = load_digits()
X = digits.data
y = digits.target
X, y = shuffle(X, y, random_state = 0)
print(X.shape, y.shape)
(1797, 64) (1797,)
y30 = y.copy()
y30[30:] = -1  # keep labels for the first 30 shuffled samples only; -1 marks "unlabeled"
norm_X = normalize(X)  # row-normalized copy of the features (not used below)
# n_components was 150 in the original run; with only 64 input features this keeps
# all 64 components anyway, and newer sklearn rejects n_components > n_features
pca_X = PCA(n_components=64, whiten=False).fit_transform(X)
print(pca_X.shape)
(1797, 64)
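As a quick sanity check on the semi-supervised setup (this cell is my addition, not part of the original run), we can confirm that only the first 30 shuffled samples keep their labels:

# added sanity check: 30 labeled points, 1767 marked unlabeled (-1)
print((y30 != -1).sum(), (y30 == -1).sum())
print(np.unique(y30[:30]))  # which digit classes the labeled pool covers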
## Label Spreading
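For reference (my summary of Zhou et al., 2004, the algorithm behind sklearn's `LabelSpreading`): starting from the initial label matrix $Y$, the method iterates

$$F^{(t+1)} = \alpha\, S\, F^{(t)} + (1 - \alpha)\, Y, \qquad S = D^{-1/2} W D^{-1/2},$$

where $W$ holds the RBF affinities $W_{ij} = \exp(-\gamma \lVert x_i - x_j \rVert^2)$ and $D$ is its degree matrix. So `alpha` is the clamping factor (how much a point trusts its neighbors over its initial label) and `gamma` sets the kernel width, which is why the grid below sweeps both.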
train_X = pca_X
gammas = [0.0005, 0.001, 0.05, 0.1, 0.5]
#gammas = [1, 5, 10, 15, 20, 25, 30, 35, 40]
# NB: newer sklearn releases require LabelSpreading's alpha to lie strictly
# inside the open interval (0, 1), so 1.0 may need to be dropped from this grid
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
param_grid = ParameterGrid(dict(gamma=gammas, alpha=alphas))
ls = LabelSpreading()
scores = {}
for param in param_grid:
    alpha, gamma = param['alpha'], param['gamma']
    ls.set_params(alpha=alpha, gamma=gamma)
    ls.fit(train_X, y30)  # fit on 30 labeled + 1767 unlabeled points
    scores[(alpha, gamma)] = ls.score(train_X, y)  # score against the full ground truth
    print((alpha, gamma), scores[(alpha, gamma)])
(0.1, 0.0005) 0.195882025598
(0.1, 0.001) 0.200890372844
(0.1, 0.05) 0.831942125765
(0.1, 0.1) 0.844741235392
(0.1, 0.5) 0.841958820256
(0.2, 0.0005) 0.195882025598
(0.2, 0.001) 0.198664440735
(0.2, 0.05) 0.838619922092
(0.2, 0.1) 0.844741235392
(0.2, 0.5) 0.843071786311
(0.3, 0.0005) 0.195882025598
(0.3, 0.001) 0.196438508625
(0.3, 0.05) 0.843071786311
(0.3, 0.1) 0.842515303283
(0.3, 0.5) 0.843628269338
(0.4, 0.0005) 0.194769059544
(0.4, 0.001) 0.196438508625
(0.4, 0.05) 0.844184752365
(0.4, 0.1) 0.842515303283
(0.4, 0.5) 0.843628269338
(0.5, 0.0005) 0.194212576516
(0.5, 0.001) 0.196438508625
(0.5, 0.05) 0.844184752365
(0.5, 0.1) 0.841958820256
(0.5, 0.5) 0.844184752365
(0.6, 0.0005) 0.192543127435
(0.6, 0.001) 0.196438508625
(0.6, 0.05) 0.840845854201
(0.6, 0.1) 0.843071786311
(0.6, 0.5) 0.844184752365
(0.7, 0.0005) 0.183639398998
(0.7, 0.001) 0.196438508625
(0.7, 0.05) 0.842515303283
(0.7, 0.1) 0.842515303283
(0.7, 0.5) 0.844184752365
(0.8, 0.0005) 0.157484696717
(0.8, 0.001) 0.196438508625
(0.8, 0.05) 0.846410684474
(0.8, 0.1) 0.844741235392
(0.8, 0.5) 0.84529771842
(0.9, 0.0005) 0.0968280467446
(0.9, 0.001) 0.195882025598
(0.9, 0.05) 0.846967167501
(0.9, 0.1) 0.847523650529
(0.9, 0.5) 0.84529771842
(1.0, 0.0005) 0.0968280467446
(1.0, 0.001) 0.0968280467446
(1.0, 0.05) 0.834168057874
(1.0, 0.1) 0.840845854201
(1.0, 0.5) 0.841402337229
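To pull the best setting out of the grid programmatically rather than eyeballing the heatmap below, a small sketch (my addition):

# added: argmax over the scores dict; per the printed scores above this
# selects (alpha, gamma) = (0.9, 0.1) with accuracy ~0.8475
best_alpha, best_gamma = max(scores, key=scores.get)
print(best_alpha, best_gamma, scores[(best_alpha, best_gamma)])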
score_mat = np.asarray([[scores[(alpha, gamma)] for gamma in gammas]
                        for alpha in alphas])
pl.matshow(score_mat)
pl.xticks(np.arange(len(gammas)), gammas, rotation = 45)
pl.yticks(np.arange(len(alphas)), alphas)
pl.xlabel('gamma')
pl.ylabel('alpha')
pl.colorbar()
## Find the Most Uncertain Targets by Entropy
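The uncertainty measure here is the Shannon entropy of each sample's predicted class distribution $p$ (my note, matching what `stats.entropy` computes):

$$H(p) = -\sum_{k} p_k \log p_k,$$

which is maximal when the spreader splits its probability mass evenly across the digits and zero when it is certain.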
ls.set_params(alpha=0.8, gamma=0.5)
ls.fit(train_X, y30)
pred = ls.predict_proba(train_X)  # class-membership distribution for every sample
ent = stats.entropy(pred.T)       # entropy is taken column-wise, hence the transpose
print(ent.shape)
(1797,)
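The original run emitted scipy RuntimeWarnings here (divide by zero in `log`) because `stats.entropy` hits $0 \cdot \log 0$ terms whenever a class probability is exactly zero; recent SciPy versions compute this silently. An equivalent warning-free NumPy version, as a sketch (my addition):

# added: row-wise entropy without touching log(0)
with np.errstate(divide='ignore', invalid='ignore'):
    terms = np.where(pred > 0, pred * np.log(pred), 0.0)
ent_manual = -terms.sum(axis=1)
print(np.allclose(ent, ent_manual))  # should be True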
uncertain_index = ent.argsort()[:-20:-1]  # indices of the 19 highest-entropy (most uncertain) predictions
print('prediction on uncertain cases', np.mean(ls.predict(train_X[uncertain_index])
                                               == y[uncertain_index]))
print('prediction overall', np.mean(ls.predict(train_X) == y))
prediction on uncertain cases 0.105263157895
prediction overall 0.84529771842
yhat = ls.predict(train_X)
# show the digit images the model is most unsure about
for i in uncertain_index:
    pl.figure(figsize=(4, 4))
    pl.imshow(X[i].reshape((8, 8)), cmap=pl.cm.gray)
    pl.title("predict=%i, true=%i" % (yhat[i], y[i]))
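`confusion_matrix` was imported at the top but never used; as an optional closing check (my addition), it summarizes which digits the spreader confuses across the whole set:

# added: rows are true digits, columns are predicted digits
print(confusion_matrix(y, yhat))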