!wget -q https://raw.githubusercontent.com/Mmiglio/SpeechRecognition/Attention-Mechanism-and-Memory-Networks/src/getData.py
!wget -q https://raw.githubusercontent.com/Mmiglio/SpeechRecognition/Attention-Mechanism-and-Memory-Networks/src/constants.py
!wget -q https://raw.githubusercontent.com/Mmiglio/SpeechRecognition/Attention-Mechanism-and-Memory-Networks/src/trainingUtils.py
!wget -q https://raw.githubusercontent.com/Mmiglio/SpeechRecognition/Attention-Mechanism-and-Memory-Networks/src/models.py
!wget -q https://raw.githubusercontent.com/Mmiglio/SpeechRecognition/Attention-Mechanism-and-Memory-Networks/src/analysisFunctions.py
from getData import downloadSpeechData, getDataDict
# Download data
downloadSpeechData(data_path='speechData/')
# Get dict with files and labels
dataDict = getDataDict(data_path='speechData/')
Downloading http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz into /content/speechData/train.tar.gz
Extracting /content/speechData/train.tar.gz into /content/speechData/train
Downloading http://download.tensorflow.org/data/speech_commands_test_set_v0.01.tar.gz into /content/speechData/test.tar.gz
Extracting /content/speechData/test.tar.gz into /content/speechData/test
from getData import getDataframe
trainDF = getDataframe(dataDict['train'])
valDF = getDataframe(dataDict['val'])
testDF = getDataframe(dataDict['test'])
print("Train files: {}".format(trainDF.shape[0]))
print("Validation files: {}".format(valDF.shape[0]))
print("Test files: {}".format(testDF.shape[0]))
Train files: 51088
Validation files: 6798
Test files: 6835
trainDF.head()
|   | files                                        | labels | category |
|---|----------------------------------------------|--------|----------|
| 0 | speechData/train/dog/33f60c62_nohash_1.wav   | 8      | dog      |
| 1 | speechData/train/bird/30065f33_nohash_0.wav  | 7      | bird     |
| 2 | speechData/train/down/f5c3de1b_nohash_0.wav  | 21     | down     |
| 3 | speechData/train/go/2bd2cad5_nohash_0.wav    | 27     | go       |
| 4 | speechData/train/seven/aac5b7c1_nohash_1.wav | 11     | seven    |
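Before building the input pipeline it is worth checking how the clips are spread over the 30 keywords. A quick pandas count over the category column does the job (a small sanity check, not part of the original pipeline):
# Number of training clips per keyword; the Speech Commands classes are roughly balanced
print(trainDF['category'].value_counts())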
!pip install -q tensorflow-gpu==2.0.0-beta1
import tensorflow as tf
print("GPU Available: ", tf.test.is_gpu_available())
print("Version: ", tf.__version__)
GPU Available: True
Version: 2.0.0-beta1
!pip install -q python_speech_features
Building wheel for python-speech-features (setup.py) ... done
from trainingUtils import getDataset
BATCH_SIZE = 32
# Use a reduced training set (first NUM_EXAMPLES files of trainDF)
NUM_EXAMPLES = 30000
train_data, train_steps = getDataset(
    df=trainDF[:NUM_EXAMPLES],
    batch_size=BATCH_SIZE,
    cache_file='train_cache',
    shuffle=True
)
val_data, val_steps = getDataset(
    df=valDF,
    batch_size=BATCH_SIZE,
    cache_file='val_cache',
    shuffle=False
)
test_data, test_steps = getDataset(
    df=testDF,
    batch_size=BATCH_SIZE,
    cache_file='test_cache',
    shuffle=False
)
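getDataset lives in trainingUtils.py and is not reproduced in this post. As a rough sketch of what such a pipeline can look like, assuming 16 kHz, 1-second clips turned into 99x40 log-Mel filterbank features with python_speech_features (function and argument names below are illustrative, not the actual implementation):
import numpy as np
import tensorflow as tf
from scipy.io import wavfile
from python_speech_features import logfbank

def _log_mel_features(path, label):
    # Read a wav file, pad/crop it to exactly 1 second, compute 40 log-Mel filterbanks
    rate, wave = wavfile.read(path.numpy().decode())
    wave = np.pad(wave, (0, max(0, rate - len(wave))), mode='constant')[:rate]
    feats = logfbank(wave, samplerate=rate, winlen=0.025, winstep=0.01, nfilt=40)
    return feats.astype(np.float32), np.int32(label.numpy())

def make_dataset(df, batch_size, cache_file=None, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((df['files'].values, df['labels'].values))
    ds = ds.map(lambda f, l: tf.py_function(_log_mel_features, [f, l], [tf.float32, tf.int32]),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Fix the static shapes so Keras sees inputs of shape (99, 40)
    ds = ds.map(lambda x, y: (tf.reshape(x, [99, 40]), tf.reshape(y, [])))
    if cache_file:
        ds = ds.cache(cache_file)
    if shuffle:
        ds = ds.shuffle(buffer_size=1000)
    ds = ds.batch(batch_size).prefetch(1)
    return ds, int(np.ceil(len(df) / batch_size))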
from models import rnn_att_model
model_lstm = rnn_att_model()
model_gru = rnn_att_model(rnn='GRU')
model_lstm.compile(loss='sparse_categorical_crossentropy',
                   optimizer=tf.keras.optimizers.Adam(),
                   metrics=["sparse_categorical_accuracy"])
model_lstm.summary()
model_gru.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=["sparse_categorical_accuracy"])
model_gru.summary()
Model: "model"  (model_lstm)
Layer (type)                        Output Shape                      Param #   Connected to
=============================================================================================
input_1 (InputLayer)                [(None, 99, 40)]                  0
reshape (Reshape)                   (None, 99, 40, 1)                 0         input_1[0][0]
batch_normalization (BatchNorm)     (None, 99, 40, 1)                 4         reshape[0][0]
conv2d (Conv2D)                     (None, 99, 40, 30)                300       batch_normalization[0][0]
batch_normalization_1 (BatchNorm)   (None, 99, 40, 30)                120       conv2d[0][0]
conv2d_1 (Conv2D)                   (None, 99, 40, 1)                 271       batch_normalization_1[0][0]
batch_normalization_2 (BatchNorm)   (None, 99, 40, 1)                 4         conv2d_1[0][0]
squeeze_dim (Lambda)                (None, 99, 40)                    0         batch_normalization_2[0][0]
bidirectional (Bidirectional)       (None, 99, 120)                   48480     squeeze_dim[0][0]
bidirectional_1 (Bidirectional)     (None, 99, 120)                   86880     bidirectional[0][0]
lambda (Lambda)                     [(None, 99, 60), (None, 99, 60)]  0         bidirectional_1[0][0]
Attention (Attention)               (None, 99, 60)                    0         lambda[0][0], lambda[0][1]
flatten (Flatten)                   (None, 5940)                      0         Attention[0][0]
dense (Dense)                       (None, 512)                       3041792   flatten[0][0]
dense_1 (Dense)                     (None, 30)                        15390     dense[0][0]
=============================================================================================
Total params: 3,193,241
Trainable params: 3,193,177
Non-trainable params: 64

Model: "model_1"  (model_gru)
Layer (type)                        Output Shape                      Param #   Connected to
=============================================================================================
input_2 (InputLayer)                [(None, 99, 40)]                  0
reshape_1 (Reshape)                 (None, 99, 40, 1)                 0         input_2[0][0]
batch_normalization_3 (BatchNorm)   (None, 99, 40, 1)                 4         reshape_1[0][0]
conv2d_2 (Conv2D)                   (None, 99, 40, 30)                300       batch_normalization_3[0][0]
batch_normalization_4 (BatchNorm)   (None, 99, 40, 30)                120       conv2d_2[0][0]
conv2d_3 (Conv2D)                   (None, 99, 40, 1)                 271       batch_normalization_4[0][0]
batch_normalization_5 (BatchNorm)   (None, 99, 40, 1)                 4         conv2d_3[0][0]
squeeze_dim (Lambda)                (None, 99, 40)                    0         batch_normalization_5[0][0]
bidirectional_2 (Bidirectional)     (None, 99, 120)                   36720     squeeze_dim[0][0]
bidirectional_3 (Bidirectional)     (None, 99, 120)                   65520     bidirectional_2[0][0]
lambda_1 (Lambda)                   [(None, 99, 60), (None, 99, 60)]  0         bidirectional_3[0][0]
Attention (Attention)               (None, 99, 60)                    0         lambda_1[0][0], lambda_1[0][1]
flatten_1 (Flatten)                 (None, 5940)                      0         Attention[0][0]
dense_2 (Dense)                     (None, 512)                       3041792   flatten_1[0][0]
dense_3 (Dense)                     (None, 30)                        15390     dense_2[0][0]
=============================================================================================
Total params: 3,160,121
Trainable params: 3,160,057
Non-trainable params: 64
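The Attention layer itself is defined in models.py and is not reproduced here. Reading the summaries above, it takes the two (None, 99, 60) halves produced by the Lambda split of the bidirectional output and returns a (None, 99, 60) tensor. Purely as an illustration of a layer with that interface (an assumption about the mechanism, not the actual code), a scaled dot-product attention could be written as:
import tensorflow as tf

class ScaledDotProductAttention(tf.keras.layers.Layer):
    # weights = softmax(Q K^T / sqrt(d)); output = weights V, with K = V here
    def call(self, inputs):
        query, value = inputs                                   # both (batch, 99, 60)
        d = tf.cast(tf.shape(value)[-1], tf.float32)
        scores = tf.matmul(query, value, transpose_b=True) / tf.math.sqrt(d)
        weights = tf.nn.softmax(scores, axis=-1)                # (batch, 99, 99)
        return tf.matmul(weights, value)                        # (batch, 99, 60)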
EPOCHS = 25
# Stop if the validation loss doesn't improve for 5 epochs
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)
# Reduce the learning rate when the validation loss plateaus for 3 epochs
reduceLR = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, verbose=1)
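Both callback objects are reused for the two fit() calls below; tf.keras resets their internal state at the start of each run, so that is fine. If you prefer to end up with the weights from the best validation epoch instead of the last one, EarlyStopping also accepts a restore_best_weights flag (an optional variant, not used in this run):
# Optional: roll back to the weights of the epoch with the lowest validation loss when stopping
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                                 verbose=1, restore_best_weights=True)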
history_lstm = model_lstm.fit(train_data.repeat(),
                              steps_per_epoch=train_steps,
                              validation_data=val_data.repeat(),
                              validation_steps=val_steps,
                              epochs=EPOCHS,
                              callbacks=[earlyStopping, reduceLR])
history_gru = model_gru.fit(train_data.repeat(),
                            steps_per_epoch=train_steps,
                            validation_data=val_data.repeat(),
                            validation_steps=val_steps,
                            epochs=EPOCHS,
                            callbacks=[earlyStopping, reduceLR])
LSTM model:
Epoch 1/25: 937/937 - 202s 216ms/step - loss: 1.3229 - sparse_categorical_accuracy: 0.6127 - val_loss: 0.6924 - val_sparse_categorical_accuracy: 0.7988
Epoch 2/25: 937/937 - 35s 37ms/step - loss: 0.4067 - sparse_categorical_accuracy: 0.8774 - val_loss: 0.3573 - val_sparse_categorical_accuracy: 0.8962
Epoch 3/25: 937/937 - 33s 35ms/step - loss: 0.2919 - sparse_categorical_accuracy: 0.9128 - val_loss: 0.3980 - val_sparse_categorical_accuracy: 0.8850
Epoch 4/25: 937/937 - 33s 35ms/step - loss: 0.2420 - sparse_categorical_accuracy: 0.9264 - val_loss: 0.3517 - val_sparse_categorical_accuracy: 0.9008
Epoch 5/25: 937/937 - 33s 35ms/step - loss: 0.2094 - sparse_categorical_accuracy: 0.9372 - val_loss: 0.3107 - val_sparse_categorical_accuracy: 0.9204
Epoch 6/25: 937/937 - 33s 35ms/step - loss: 0.1851 - sparse_categorical_accuracy: 0.9439 - val_loss: 0.4418 - val_sparse_categorical_accuracy: 0.8861
Epoch 7/25: 937/937 - 33s 35ms/step - loss: 0.1745 - sparse_categorical_accuracy: 0.9480 - val_loss: 0.2865 - val_sparse_categorical_accuracy: 0.9248
Epoch 8/25: 937/937 - 33s 35ms/step - loss: 0.1601 - sparse_categorical_accuracy: 0.9510 - val_loss: 0.3374 - val_sparse_categorical_accuracy: 0.9161
Epoch 9/25: 937/937 - 33s 35ms/step - loss: 0.1451 - sparse_categorical_accuracy: 0.9566 - val_loss: 0.3629 - val_sparse_categorical_accuracy: 0.9192
Epoch 10/25: 937/937 - 33s 35ms/step - loss: 0.1379 - sparse_categorical_accuracy: 0.9595 - val_loss: 0.2808 - val_sparse_categorical_accuracy: 0.9316
Epoch 11/25: 937/937 - 34s 36ms/step - loss: 0.1279 - sparse_categorical_accuracy: 0.9610 - val_loss: 0.4144 - val_sparse_categorical_accuracy: 0.9085
Epoch 12/25: 937/937 - 34s 36ms/step - loss: 0.1192 - sparse_categorical_accuracy: 0.9640 - val_loss: 0.3341 - val_sparse_categorical_accuracy: 0.9285
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 13/25: 937/937 - 34s 36ms/step - loss: 0.1192 - sparse_categorical_accuracy: 0.9640 - val_loss: 0.3319 - val_sparse_categorical_accuracy: 0.9310
Epoch 14/25: 937/937 - 33s 36ms/step - loss: 0.0689 - sparse_categorical_accuracy: 0.9790 - val_loss: 0.2813 - val_sparse_categorical_accuracy: 0.9421
Epoch 15/25: 937/937 - 34s 36ms/step - loss: 0.0550 - sparse_categorical_accuracy: 0.9832 - val_loss: 0.2903 - val_sparse_categorical_accuracy: 0.9431
Epoch 00015: early stopping

GRU model:
Epoch 1/25: 937/937 - 40s 42ms/step - loss: 1.2130 - sparse_categorical_accuracy: 0.6491 - val_loss: 0.7879 - val_sparse_categorical_accuracy: 0.8065
Epoch 2/25: 937/937 - 34s 37ms/step - loss: 0.4272 - sparse_categorical_accuracy: 0.8720 - val_loss: 0.5981 - val_sparse_categorical_accuracy: 0.8361
Epoch 3/25: 937/937 - 33s 36ms/step - loss: 0.3232 - sparse_categorical_accuracy: 0.9043 - val_loss: 0.4231 - val_sparse_categorical_accuracy: 0.8889
Epoch 4/25: 937/937 - 33s 35ms/step - loss: 0.2641 - sparse_categorical_accuracy: 0.9226 - val_loss: 0.3812 - val_sparse_categorical_accuracy: 0.8968
Epoch 5/25: 937/937 - 33s 36ms/step - loss: 0.2397 - sparse_categorical_accuracy: 0.9306 - val_loss: 0.3607 - val_sparse_categorical_accuracy: 0.9107
Epoch 6/25: 937/937 - 33s 35ms/step - loss: 0.2184 - sparse_categorical_accuracy: 0.9372 - val_loss: 0.3218 - val_sparse_categorical_accuracy: 0.9170
Epoch 7/25: 937/937 - 33s 36ms/step - loss: 0.1946 - sparse_categorical_accuracy: 0.9432 - val_loss: 0.3593 - val_sparse_categorical_accuracy: 0.9067
Epoch 8/25: 937/937 - 33s 35ms/step - loss: 0.1791 - sparse_categorical_accuracy: 0.9485 - val_loss: 0.3151 - val_sparse_categorical_accuracy: 0.9276
Epoch 9/25: 937/937 - 33s 35ms/step - loss: 0.1778 - sparse_categorical_accuracy: 0.9501 - val_loss: 0.3913 - val_sparse_categorical_accuracy: 0.9117
Epoch 10/25: 937/937 - 33s 35ms/step - loss: 0.1722 - sparse_categorical_accuracy: 0.9507 - val_loss: 0.3450 - val_sparse_categorical_accuracy: 0.9250
Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 11/25: 937/937 - 33s 35ms/step - loss: 0.1518 - sparse_categorical_accuracy: 0.9558 - val_loss: 0.3421 - val_sparse_categorical_accuracy: 0.9198
Epoch 12/25: 937/937 - 33s 35ms/step - loss: 0.0823 - sparse_categorical_accuracy: 0.9758 - val_loss: 0.2716 - val_sparse_categorical_accuracy: 0.9394
Epoch 13/25: 937/937 - 33s 35ms/step - loss: 0.0638 - sparse_categorical_accuracy: 0.9815 - val_loss: 0.2723 - val_sparse_categorical_accuracy: 0.9402
Epoch 14/25: 937/937 - 34s 36ms/step - loss: 0.0565 - sparse_categorical_accuracy: 0.9829 - val_loss: 0.2747 - val_sparse_categorical_accuracy: 0.9412
Epoch 00015: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 15/25: 937/937 - 33s 35ms/step - loss: 0.0529 - sparse_categorical_accuracy: 0.9838 - val_loss: 0.2766 - val_sparse_categorical_accuracy: 0.9415
Epoch 16/25: 937/937 - 33s 35ms/step - loss: 0.0470 - sparse_categorical_accuracy: 0.9869 - val_loss: 0.2801 - val_sparse_categorical_accuracy: 0.9418
Epoch 17/25: 937/937 - 33s 35ms/step - loss: 0.0465 - sparse_categorical_accuracy: 0.9869 - val_loss: 0.2817 - val_sparse_categorical_accuracy: 0.9416
Epoch 00017: early stopping
from google.colab import drive
drive.mount('/content/gdrive')
Mounted at /content/gdrive
!mkdir /content/gdrive/My\ Drive/Colab\ Notebooks/rnnModel
mkdir: cannot create directory ‘/content/gdrive/My Drive/Colab Notebooks/rnnModel’: File exists
GDRIVE_PATH = "/content/gdrive/My Drive/Colab Notebooks/rnnModel/"
## Save models to Google Drive
model_lstm_json = model_lstm.to_json()
with open(GDRIVE_PATH + "model_lstm.json", "w") as file:
    file.write(model_lstm_json)
# serialize weights to HDF5
model_lstm.save_weights(GDRIVE_PATH + "model_lstm_weights.h5")
print("Saved lstm model to drive")
model_gru_json = model_gru.to_json()
with open(GDRIVE_PATH + "model_gru.json", "w") as file:
    file.write(model_gru_json)
# serialize weights to HDF5
model_gru.save_weights(GDRIVE_PATH + "model_gru_weights.h5")
print("Saved gru model to drive")
Saved lstm model to drive
Saved gru model to drive
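For later sessions the models can be rebuilt from these files; because the architecture contains the custom Attention layer, it has to be supplied through custom_objects when deserialising. A sketch, assuming the Attention class can be imported from models.py:
import tensorflow as tf
from models import Attention  # custom layer used inside rnn_att_model (assumed importable)

with open(GDRIVE_PATH + "model_lstm.json") as json_file:
    restored_lstm = tf.keras.models.model_from_json(json_file.read(),
                                                    custom_objects={'Attention': Attention})
restored_lstm.load_weights(GDRIVE_PATH + "model_lstm_weights.h5")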
import pickle
## Save history data
with open(GDRIVE_PATH + "lstm_train_results.pickle", "wb") as handle:
    pickle.dump(history_lstm.history, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("Saved lstm training history to drive")
with open(GDRIVE_PATH + "gru_train_results.pickle", "wb") as handle:
    pickle.dump(history_gru.history, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("Saved gru training history to drive")
Saved lstm training history to drive
Saved gru training history to drive
## test load
with open(GDRIVE_PATH + "lstm_train_results.pickle", 'rb') as handle:
    lstm_hist = pickle.load(handle)
with open(GDRIVE_PATH + "gru_train_results.pickle", 'rb') as handle:
    gru_hist = pickle.load(handle)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
lstm_acc = lstm_hist['sparse_categorical_accuracy']
lstm_val_acc = lstm_hist['val_sparse_categorical_accuracy']
lstm_loss = lstm_hist['loss']
lstm_val_loss = lstm_hist['val_loss']
gru_acc = gru_hist['sparse_categorical_accuracy']
gru_val_acc = gru_hist['val_sparse_categorical_accuracy']
gru_loss = gru_hist['loss']
gru_val_loss = gru_hist['val_loss']
fig, (ax1,ax2) = plt.subplots(2,2, figsize=(16,8))
ax1[0].plot(lstm_loss, label='lstm train')
ax1[0].plot(lstm_val_loss, label='lstm validation')
ax1[0].set_title('Model loss')
ax1[0].set_xlabel('Epoch')
ax1[0].set_ylabel('Loss')
ax1[0].legend()
ax1[1].plot(gru_loss, label='gru train')
ax1[1].plot(gru_val_loss, label='gru validation')
ax1[1].set_title('Model loss')
ax1[1].set_xlabel('Epoch')
ax1[1].set_ylabel('Loss')
ax1[1].legend()
ax2[0].plot(lstm_acc, label='lstm train')
ax2[0].plot(lstm_val_acc, label='lstm validation')
ax2[0].set_title('Model accuracy')
ax2[0].set_xlabel('Epoch')
ax2[0].set_ylabel('Accuracy')
ax2[0].legend()
ax2[1].plot(gru_acc, label='gru train')
ax2[1].plot(gru_val_acc, label='gru validation')
ax2[1].set_title('Model accuracy')
ax2[1].set_xlabel('Epoch')
ax2[1].set_ylabel('Accuracy')
ax2[1].legend()
plt.tight_layout()
fig.show()
lstm_out = model_lstm.predict(test_data, verbose=1)
gru_out = model_gru.predict(test_data, verbose=1)
214/214 [==============================] - 40s 186ms/step
214/214 [==============================] - 4s 19ms/step
import numpy as np
lstm_y_pred = np.argmax(lstm_out, axis=1)
gru_y_pred = np.argmax(gru_out, axis=1)
y_true = testDF['labels'].tolist()
from sklearn.metrics import confusion_matrix, accuracy_score
print('LSTM Accuracy: {:.4f}'.format(accuracy_score(y_true, lstm_y_pred)))
print('GRU Accuracy: {:.4f}'.format(accuracy_score(y_true, gru_y_pred)))
# sklearn's convention is (y_true, y_pred): rows of the matrix are true labels, columns are predictions
lstm_cm = confusion_matrix(y_true, lstm_y_pred)
gru_cm = confusion_matrix(y_true, gru_y_pred)
LSTM Accuracy: 0.9469
GRU Accuracy: 0.9462
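A per-keyword breakdown makes it easier to see which words each network struggles with; sklearn's classification_report prints precision and recall per class. The snippet below assumes inv_categories maps the label index to the keyword name, as its use for the confusion-matrix axes below suggests:
from sklearn.metrics import classification_report
from constants import inv_categories

# Precision/recall per keyword for the LSTM model, restricted to labels present in the test set
labels_present = sorted(set(y_true))
print(classification_report(y_true, lstm_y_pred, labels=labels_present,
                            target_names=[inv_categories[i] for i in labels_present]))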
from analysisFunctions import plot_confusion_matrix
from constants import inv_categories
plot_confusion_matrix(lstm_cm, target_names=list(inv_categories.values())[:-1], normalize=True)
plot_confusion_matrix(gru_cm, target_names=list(inv_categories.values())[:-1], normalize=True)
layer_out = tf.keras.Model(inputs=model_lstm.input, outputs=model_lstm.get_layer('Attention').output)
final_out = tf.keras.Model(inputs=model_lstm.input, outputs=model_lstm.get_layer('dense_1').output)
attention = layer_out.predict(test_data, verbose=1)
out = final_out.predict(test_data, verbose=1)
214/214 [==============================] - 4s 20ms/step
214/214 [==============================] - 4s 19ms/step
print(out.shape)
y_p = np.argmax(out,axis=1)
y_t = testDF['labels'].tolist()
print(accuracy_score(y_p,y_t))
(6835, 30)
0.9468910021945867
from scipy.io import wavfile
# Collapse the attention output over its 60 features: keep the maximum activation per time frame
attention_avg = np.apply_along_axis(np.max, 2, attention)
print(attention_avg.shape)
fig, (ax1, ax2) = plt.subplots(2,4,figsize=(24,8))
for index, i in enumerate(np.random.randint(0, 6835, 4)):
    f = testDF['files'][i]
    _, wave = wavfile.read(f)
    ax1[index].plot(attention_avg[i])
    ax2[index].plot(range(len(wave)), wave)
    ax2[index].set_title('Raw wave of ' + f)
    ax2[index].set_ylabel('Amplitude')
    ax2[index].set_xlabel('Sample')
fig.show()
(6835, 99)
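Because the attention profile has 99 time steps while the raw clip has up to 16,000 samples, overlaying the two on a shared time axis makes the comparison easier to read. Assuming the features were computed with a 25 ms window and a 10 ms hop at 16 kHz (as in the pipeline sketch earlier), frame i is centred near sample 160*i + 200:
# Overlay the per-frame attention maximum on the waveform of one random test clip
frame_centres = np.arange(attention_avg.shape[1]) * 160 + 200  # approximate frame centres in samples

i = np.random.randint(0, len(testDF))
_, wave = wavfile.read(testDF['files'][i])

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(wave, color='lightgray', label='waveform')
ax_att = ax.twinx()
ax_att.plot(frame_centres, attention_avg[i], color='red', label='max attention per frame')
ax.set_xlabel('Sample')
ax.set_title('Attention profile over ' + testDF['files'][i])
plt.show()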