# Chapter 4 - Evaluation and Optimization¶

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


We generate two inputs:

• features – a matrix of input features
• target – an array of target variables corresponding to those features
In [2]:
features = rand(100,5)
target = rand(100) > 0.5


### The holdout method¶

We divide into a randomized training and test set:

In [3]:
N = features.shape[0]
N_train = floor(0.7 * N)

# Randomize index
# Note: sometimes you want to retain the order in the dataset and skip this step
# E.g. in the case of time-based datasets where you want to test on 'later' instances
idx = random.permutation(N)

# Split index
idx_train = idx[:N_train]
idx_test = idx[N_train:]

# Break your data into training and testing subsets
features_train = features[idx_train,:]
target_train = target[idx_train]
features_test = features[idx_test,:]
target_test = target[idx_test]

# Build, predict, evaluate (to be filled out)
# model = train(features_train, target_train)
# preds_test = predict(model, features_test)
# accuracy = evaluate_acc(preds_test, target_test)

/Users/henrik/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/Users/henrik/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:11: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [4]:
print features_train.shape
print features_test.shape
print target_train.shape
print target_test.shape

(70, 5)
(30, 5)
(70,)
(30,)


### K-fold cross-validation¶

In [5]:
N = features.shape[0]
K = 10 # number of folds

preds_kfold = np.empty(N)
folds = np.random.randint(0, K, size=N)

for idx in np.arange(K):

# For each fold, break your data into training and testing subsets
features_train = features[folds != idx,:]
target_train = target[folds != idx]
features_test = features[folds == idx,:]

# Print the indices in each fold, for inspection
print nonzero(folds == idx)[0]

# Build and predict for CV fold (to be filled out)
# model = train(features_train, target_train)
# preds_kfold[folds == idx] = predict(model, features_test)

# accuracy = evaluate_acc(preds_kfold, target)

[16 23 33 34 35 39 44 48 99]
[ 3 10 29 37 45 52 54 62 68 83 98]
[ 4 12 20 22 28 50 51 53 55 56 59 60 64 77 84 86 92 97]
[ 6  7  9 11 18 27 41 49 57 61 70 73 80]
[42 67 96]
[30 63 66 78 90 91]
[ 2  5 26 40 46 58 72 74 87 95]
[ 0  8 14 24 32 36 38 43 85 88 89]
[ 1 13 17 19 21 47 71 79 81 82 93 94]
[15 25 31 65 69 75 76]


### The ROC curve¶

In [6]:
def roc_curve(true_labels, predicted_probs, n_points=100, pos_class=1):
thr = linspace(0,1,n_points)
tpr = zeros(n_points)
fpr = zeros(n_points)

pos = true_labels == pos_class
neg = logical_not(pos)
n_pos = count_nonzero(pos)
n_neg = count_nonzero(neg)

for i,t in enumerate(thr):
tpr[i] = count_nonzero(logical_and(predicted_probs >= t, pos)) / n_pos
fpr[i] = count_nonzero(logical_and(predicted_probs >= t, neg)) / n_neg

return fpr, tpr, thr

In [7]:
# Randomly generated predictions should give us a diagonal ROC curve
preds = rand(len(target))
fpr, tpr, thr = roc_curve(target, preds, pos_class=True)
plot(fpr, tpr)

Out[7]:
[<matplotlib.lines.Line2D at 0x10595f150>]

### The area under the ROC curve¶

In [8]:
def auc(true_labels, predicted_labels, pos_class=1):
fpr, tpr, thr = roc_curve(true_labels, predicted_labels,
pos_class=pos_class)
area = -trapz(tpr, x=fpr)
return area

In [9]:
auc(target, preds, pos_class=True)

Out[9]:
0.5

### Multi-class classification¶

In [28]:
d = pandas.read_csv("data/mnist_small.csv")
d_train = d[:int(0.8*len(d))]
d_test = d[int(0.8*len(d)):]

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(d_train.drop('label', axis=1), d_train['label'])

Out[29]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
In [30]:
from sklearn.metrics import confusion_matrix
preds = rf.predict(d_test.drop('label', axis=1))
cm = confusion_matrix(d_test['label'], preds)

In [39]:
matshow(cm, cmap='Greys')
colorbar()
savefig("figures/figure-4.19.eps", format='eps')


### The root-mean-square error¶

In [10]:
def rmse(true_values, predicted_values):
n = len(true_values)
residuals = 0
for i in range(n):
residuals += (true_values[i] - predicted_values[i])**2.
return np.sqrt(residuals/n)

In [11]:
rmse(rand(10), rand(10))

Out[11]:
0.26547176572681108

### The R-squared error¶

In [12]:
def r2(true_values, predicted_values):
n = len(true_values)
mean = np.mean(true_values)
residuals = 0
total = 0
for i in range(n):
residuals += (true_values[i] - predicted_values[i])**2.
total += (true_values[i] - mean)**2.
return 1.0 - residuals/total

In [13]:
r2(arange(10)+rand(), arange(10)+rand(10))

Out[13]:
0.98002618717502987

### Grid search with kernel-SVM model¶

Importing modules:

In [14]:
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC


In [15]:
d = pandas.read_csv("data/titanic.csv")

# Target
y = d["Survived"]

# Features
X = d.drop(["Survived", "PassengerId", "Cabin","Ticket","Name", "Fare"], axis=1)
X['Sex'] = map(lambda x: 1 if x=="male" else 0, X['Sex'])
X['Embarked-Q'] = map(lambda x: 1 if x=="Q" else 0, X['Embarked'])
X['Embarked-C'] = map(lambda x: 1 if x=="C" else 0, X['Embarked'])
X['Embarked-S'] = map(lambda x: 1 if x=="S" else 0, X['Embarked'])
X = X.drop(["Embarked", "Sex"], axis=1)
X = X.fillna(-1)


Performing grid-search to find the optimal hyper-parameters:

In [16]:
# grid of (gamma, C) values to try
gam_vec, cost_vec = np.meshgrid(np.linspace(0.01, 10, 11),
np.linspace(0.01, 10, 11))

AUC_all = [] # initialize empty array to store AUC results

# set up cross-validation folds
N = len(y)
K = 10 # number of cross-validation folds
folds = np.random.randint(0, K, size=N)

# search over every value of the grid
for param_ind in np.arange(len(gam_vec.ravel())):

# initialize cross-validation predictions
y_cv_pred = np.empty(N)

# loop through the cross-validation folds
for ii in np.arange(K):
# break your data into training and testing subsets
X_train = X.ix[folds != ii,:]
y_train = y.ix[folds != ii]
X_test = X.ix[folds == ii,:]

# build a model on the training set
model = SVC(gamma=gam_vec.ravel()[param_ind], C=cost_vec.ravel()[param_ind])
model.fit(X_train, y_train)

# generate and store model predictions on the testing set
y_cv_pred[folds == ii] = model.predict(X_test)

# evaluate the AUC of the predictions
AUC_all.append(roc_auc_score(y, y_cv_pred))

indmax = np.argmax(AUC_all)
print "Maximum = %.3f" % (np.max(AUC_all))
print "Tuning Parameters: (gamma = %.2f, C = %.2f)" % (gam_vec.ravel()[indmax], cost_vec.ravel()[indmax])

Maximum = 0.674
Tuning Parameters: (gamma = 0.01, C = 4.01)


Plotting the contours of the parameter performance:

In [24]:
AUC_grid = np.array(AUC_all).reshape(gam_vec.shape)

contourf(gam_vec, cost_vec, AUC_grid, 20, cmap='Greys')
xlabel("kernel coefficient, gamma")
ylabel("penalty parameter, C")
colorbar()
savefig("figures/figure-4.25.eps", format='eps')

In [ ]: