import numpy as np
import os
import sys
import cntk
from cntk.layers import Embedding, LSTM, GRU, Dense, Recurrence
from cntk import sequence
from common.params_lstm import *
from common.utils import *
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)
print("CNTK: ", cntk.__version__)
print("GPU: ", get_gpu_name())
OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Numpy:  1.13.3
CNTK:  2.2
GPU:  ['Tesla K80']
def create_symbol(CUDNN=True):
    # Weight initialiser from uniform distribution
    # Activation (unless stated otherwise) is None
    with cntk.layers.default_options(init=cntk.glorot_uniform()):
        x = Embedding(EMBEDSIZE)(features)  # output: list of len=BATCHSIZE of arrays with shape=(MAXLEN, EMBEDSIZE)
        # Since this is a vanilla RNN, instead of the more flexible Recurrence(GRU) block (which allows,
        # for example, layer normalisation to be added to the network), we can use optimized_rnnstack,
        # which drops straight down to the cuDNN level. This is another reason not to read too much into
        # the speed comparison, since it becomes a measure of which framework has the fastest path to cuDNN.
        if not CUDNN:
            x = Recurrence(GRU(NUMHIDDEN))(x)  # output: list of len=BATCHSIZE of arrays with shape=(MAXLEN, NUMHIDDEN)
        else:
            # Single parameter blob holding all the cuDNN RNN weights (shape inferred on first use)
            W = cntk.parameter((cntk.InferredDimension, 4))
            x = cntk.ops.optimized_rnnstack(x, W, NUMHIDDEN, num_layers=1, bidirectional=False, recurrent_op='gru')
        x = sequence.last(x)  # output: array with shape=(BATCHSIZE, NUMHIDDEN)
        x = Dense(2)(x)  # output: array with shape=(BATCHSIZE, 2)
        return x
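# A sketch of the flexibility mentioned above. Assumption (not in the original):
# Sequential can compose the GRU step with LayerNormalization inside Recurrence,
# applying normalisation to the hidden state at every step, which
# optimized_rnnstack cannot do since it maps directly onto cuDNN kernels.
from cntk.layers import LayerNormalization, Sequential
def create_symbol_with_ln():
    with cntk.layers.default_options(init=cntk.glorot_uniform()):
        x = Embedding(EMBEDSIZE)(features)
        # Layer-normalised GRU step; only possible on the Recurrence path
        x = Recurrence(Sequential([GRU(NUMHIDDEN), LayerNormalization()]))(x)
        x = sequence.last(x)
        return Dense(2)(x)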
def init_model(m):
    # Loss (dense labels); check whether sparse labels are also supported
    loss = cntk.cross_entropy_with_softmax(m, labels)
    # ADAM, with unit_gain set to False to match the other frameworks
    learner = cntk.adam(m.parameters,
                        lr=cntk.learning_rate_schedule(LR, cntk.UnitType.minibatch),
                        momentum=cntk.momentum_schedule(BETA_1),
                        variance_momentum=cntk.momentum_schedule(BETA_2),
                        epsilon=EPS,
                        unit_gain=False)
    trainer = cntk.Trainer(m, (loss, cntk.classification_error(m, labels)), [learner])
    return trainer
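# A hedged alternative (sketch, not used below): attach a ProgressPrinter
# progress writer so the Trainer aggregates and logs per-epoch averages itself,
# rather than us reading only the last minibatch's metric in the training loop.
def init_model_with_logging(m):
    loss = cntk.cross_entropy_with_softmax(m, labels)
    learner = cntk.adam(m.parameters,
                        lr=cntk.learning_rate_schedule(LR, cntk.UnitType.minibatch),
                        momentum=cntk.momentum_schedule(BETA_1),
                        variance_momentum=cntk.momentum_schedule(BETA_2),
                        epsilon=EPS,
                        unit_gain=False)
    progress = cntk.logging.ProgressPrinter(tag='Training', num_epochs=EPOCHS)
    return cntk.Trainer(m, (loss, cntk.classification_error(m, labels)),
                        [learner], [progress])
# Calling trainer.summarize_training_progress() at the end of each epoch
# then prints the averaged loss/metric for that epoch.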
%%time
# Data into format for library
x_train, x_test, y_train, y_test = imdb_for_library(seq_len=MAXLEN, max_features=MAXFEATURES, one_hot=True)  # CNTK format
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(x_train.dtype, x_test.dtype, y_train.dtype, y_test.dtype)
Preparing train set...
Preparing test set...
Trimming to 30000 max-features
Padding to length 150
(25000, 150) (25000, 150) (25000, 2) (25000, 2)
int32 int32 float32 float32
CPU times: user 5.58 s, sys: 283 ms, total: 5.86 s
Wall time: 5.87 s
%%time
# Placeholders
features = sequence.input_variable(shape=MAXFEATURES, is_sparse=True)
labels = cntk.input_variable(2)
# Load symbol
sym = create_symbol()
CPU times: user 9.42 ms, sys: 28 ms, total: 37.4 ms
Wall time: 66.6 ms
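# Toy illustration (hypothetical indices, not part of the benchmark): the
# sparse sequence input above consumes batches built with Value.one_hot,
# e.g. two sequences of lengths 3 and 2 over the MAXFEATURES vocabulary.
toy_batch = cntk.Value.one_hot([[1, 5, 3], [2, 7]], MAXFEATURES)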
%%time
trainer = init_model(sym)
CPU times: user 88.8 ms, sys: 203 ms, total: 291 ms
Wall time: 299 ms
%%time
# 32s
# Train model
for j in range(EPOCHS):
    for data, label in yield_mb(x_train, y_train, BATCHSIZE, shuffle=True):
        data_1hot = cntk.Value.one_hot(data, MAXFEATURES)  # TODO: do this externally and generate batches of 1-hot
        trainer.train_minibatch({features: data_1hot, labels: label})
    # Log (this is just the last batch in the epoch, not the average over batches)
    eval_error = trainer.previous_minibatch_evaluation_average
    print("Epoch %d | Accuracy: %.6f" % (j+1, (1-eval_error)))
Epoch 1 | Accuracy: 0.765625
Epoch 2 | Accuracy: 0.937500
Epoch 3 | Accuracy: 0.937500
CPU times: user 28 s, sys: 4.66 s, total: 32.7 s
Wall time: 32.3 s
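# Since the epoch log above reflects only the final minibatch, a sketch of the
# same loop reporting the average metric over all minibatches in the epoch
# (uses only the functions already defined in this notebook):
for j in range(EPOCHS):
    err_sum, n_batches = 0.0, 0
    for data, label in yield_mb(x_train, y_train, BATCHSIZE, shuffle=True):
        data_1hot = cntk.Value.one_hot(data, MAXFEATURES)
        trainer.train_minibatch({features: data_1hot, labels: label})
        err_sum += trainer.previous_minibatch_evaluation_average
        n_batches += 1
    print("Epoch %d | Avg accuracy: %.6f" % (j+1, 1 - err_sum/n_batches))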
%%time
# Predict and then score accuracy
# Apply softmax: during training it is folded into the loss
# (cross_entropy_with_softmax), so the model itself outputs logits
z = cntk.softmax(sym)
n_samples = (y_test.shape[0]//BATCHSIZE)*BATCHSIZE
y_guess = np.zeros(n_samples, dtype=np.int64)
y_truth = np.argmax(y_test[:n_samples], axis=-1)
c = 0
for data, label in yield_mb(x_test, y_test, BATCHSIZE):
    data = cntk.Value.one_hot(data, MAXFEATURES)
    predicted_label_probs = z.eval({features: data})
    y_guess[c*BATCHSIZE:(c+1)*BATCHSIZE] = np.argmax(predicted_label_probs, axis=-1)
    c += 1
CPU times: user 3.58 s, sys: 396 ms, total: 3.98 s
Wall time: 3.98 s
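# Sketch for the TODO in the training cell: pre-convert minibatches to sparse
# one-hot Values once, outside the loop, then reuse them when scoring
# (assumes the converted batches fit in memory).
onehot_test = [cntk.Value.one_hot(data, MAXFEATURES)
               for data, _ in yield_mb(x_test, y_test, BATCHSIZE)]
for c, batch in enumerate(onehot_test):
    probs = z.eval({features: batch})
    y_guess[c*BATCHSIZE:(c+1)*BATCHSIZE] = np.argmax(probs, axis=-1)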
print("Accuracy: ", sum(y_guess == y_truth)/len(y_guess))
Accuracy: 0.853405448718