#!/usr/bin/env python
# coding: utf-8

# # Example 1 - a pitch detection network
# 
# In this notebook, we will train a single-layer network on a super easy task. The goal is to show how a Dense layer works.
# 
# We will use synthesized data here. It consists of pure sinusoids at 12 different pitches. We then compute a CQT and feed one of the CQT frames into the network.

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import librosa
import keras
from future.utils import implements_iterator  # for Python 2 compatibility for __next__()
from matplotlib import pyplot as plt
import warnings

warnings.filterwarnings('ignore')
plt.rc('figure', titlesize=20)
plt.rc('font', size=20)
plt.rc('xtick', labelsize=12)


# ### Functions to generate data

# In[2]:


def sin_wave(secs, freq, sr, gain):
    '''Generates a sine wave of frequency `freq` (Hz) with a duration of `secs` seconds.'''
    t = np.arange(sr * secs)
    return gain * np.sin(2 * np.pi * freq * t / sr)


def whitenoise(gain, shape):
    '''Generates white noise with the given gain and shape.'''
    return gain * np.random.uniform(-1., 1., shape)


# ### A class to generate data batches

# In[3]:


class DataGen:
    def __init__(self, sr=16000, batch_size=128):
        np.random.seed(1209)
        self.pitches = [440., 466.2, 493.8, 523.3, 554.4, 587.3,
                        622.3, 659.3, 698.5, 740., 784.0, 830.6]
        self.sr = sr
        self.n_class = len(self.pitches)  # 12 pitches
        self.secs = 1.
        self.batch_size = batch_size
        self.sins = []
        self.labels = np.eye(self.n_class)[range(0, self.n_class)]  # one-hot vectors
        for freq in self.pitches:
            cqt = librosa.cqt(sin_wave(self.secs, freq, self.sr, gain=0.5),
                              sr=sr, fmin=220, n_bins=36,
                              filter_scale=2)[:, 1]  # use only one frame!
            cqt = librosa.amplitude_to_db(np.abs(cqt), ref=np.min)  # take the magnitude; CQT values are complex
            cqt = cqt / np.max(cqt)
            self.sins.append(cqt)
            self.cqt_shape = cqt.shape  # (36, )

    def __next__(self):
        choice = np.random.choice(12, size=self.batch_size,  # pick pitches for this batch
                                  replace=True)
        noise_gain = 0.1 * np.random.random_sample(1)  # a random noise gain
        noise = whitenoise(noise_gain, self.cqt_shape)  # generate white noise
        xs = [noise + self.sins[i] for i in choice]  # compose a batch with additive noise
        ys = [self.labels[i] for i in choice]  # corresponding labels
        return np.array(xs, dtype=np.float32), np.array(ys, dtype=np.float32)

    next = __next__  # Python 2 compatibility


# ### A quick look at the data we're generating -- and will feed as input

# In[4]:


datagen = DataGen()
print("Input: A frame of CQT in a shape of: {}".format(datagen.cqt_shape))
x, y = next(datagen)
print("Input batch: CQT frames, {}".format(x.shape))
print("Number of classes (pitches): {}".format(datagen.n_class))

plt.figure(figsize=(20, 6))
for i in range(2):
    x, y = next(datagen)
    plt.subplot(2, 2, i + 1)
    plt.imshow(x.transpose(), cmap=plt.get_cmap('Blues'))
    plt.xlabel('data sample index')
    plt.ylabel('pitch index')
    plt.title('Batch {} (x, input)'.format(i + 1))
    plt.subplot(2, 2, i + 3)
    plt.imshow(y.transpose(), cmap=plt.get_cmap('Blues'))
    plt.title('Batch {} (y, label)'.format(i + 1))
print('')


# Each subplot visualises one batch. The pitches range only over [440 Hz, 830.6 Hz] (A4 - G#5), but the CQT covers a wider frequency range (36 bins spanning 3 octaves upward from 220 Hz).

# ### Build a model!

# In[5]:


val_datagen = DataGen()  # this is a generator for the validation set


# The model is very simple: no bias, only a single dense layer, which connects a 36-dim input to a 12-dim output.
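# As a quick sketch of what that will compute (a made-up NumPy illustration, not the Keras model itself), the layer is just $\mathbf{y} = \mathrm{softmax}(\mathbf{x}\mathbf{W})$ with a $36 \times 12$ weight matrix; the names `W_sketch` and `x_batch` below are hypothetical.

# In[ ]:


# A minimal NumPy sketch of Dense(12, use_bias=False) followed by softmax,
# using a random (untrained) weight matrix. `W_sketch` and `x_batch` are
# made-up names for this illustration only.
W_sketch = 0.01 * np.random.randn(36, 12)  # same shape as the matrix Keras will learn
x_batch, _ = next(datagen)                 # one batch of CQT frames, shape (128, 36)
logits = np.dot(x_batch, W_sketch)         # the whole Dense layer: one matrix multiplication
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # row-wise softmax
print(probs.shape)            # (128, 12)
print(probs.sum(axis=1)[:3])  # each row sums to 1.0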
# In[6]:


model = keras.models.Sequential()
model.add(keras.layers.Dense(datagen.n_class, use_bias=False,
                             input_shape=datagen.cqt_shape))  # a dense layer (36 input nodes --> 12 output nodes)
model.add(keras.layers.Activation('softmax'))  # softmax because it's single-label classification
model.compile(optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.9,  # a pretty standard optimizer
                                             decay=1e-6, nesterov=True),
              loss='categorical_crossentropy',  # categorical crossentropy makes sense with softmax
              metrics=['accuracy'])  # we also measure accuracy, but it's NOT the loss function


# The number of parameters is $432 = 36 \times 12$. If there were a bias, it would be $36 \times 12 + 12 = 444$.

# In[7]:


model.summary()


# Let's see the network.
# 
# ### Train it!
# 
# Alright, let's train it!

# In[8]:


history = model.fit_generator(datagen, steps_per_epoch=200, epochs=25, verbose=1,
                              validation_data=val_datagen, validation_steps=4)


# ### What happened?

# In[9]:


plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['acc'], label='training')
plt.plot(history.history['val_acc'], label='validation', alpha=0.7)
plt.title('Accuracy')
plt.xlabel('epoch')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='training')
plt.plot(history.history['val_loss'], label='validation', alpha=0.7)
plt.title('Loss')
plt.xlabel('epoch')
plt.legend()


# The validation loss stays close to the training loss, i.e., there is no overfitting.
# 
# ### How about on a test set?

# In[10]:


loss = model.evaluate_generator(datagen, steps=10)  # the generator synthesizes fresh batches, so these are effectively unseen samples
print("loss: {}, accuracy: {}".format(loss[0], loss[1]))


# ### How does the network work after training?

# In[11]:


weights = model.get_weights()[0]  # (36, 12)
print(weights.shape)
# weights = weights / np.sum(np.abs(weights), axis=1, keepdims=True)


# In[12]:


plt.figure(figsize=(20, 5))
plt.imshow(weights.transpose(), cmap=plt.get_cmap('Blues'))
plt.colorbar()
plt.title('Visualisation of the trained weights ($W$)')
pitch_names = 'A4 A#4 B4 C5 C#5 D5 D#5 E5 F5 F#5 G5 G#5'.split(' ')
plt.yticks(range(0, 12), ['{} ({})'.format(p, str(i)) for p, i in zip(pitch_names, range(1, 13))])
plt.xticks(range(0, 36), [str(i) for i in range(1, 37)])
plt.xlabel('Frequency bands of CQT')
plt.ylabel('output')

# an example input
plt.figure(figsize=(20, 1))
plt.imshow(x[0:1], cmap=plt.get_cmap('Blues'))
plt.xticks(range(0, 36), [str(i) for i in range(1, 37)])
plt.colorbar()
plt.title('What would the output be if this CQT frame were multiplied by these weights?')
plt.yticks([])
print('')


# This is a visualisation of the weights $\textbf{W}$.
# 
# Each row corresponds to one output node. For example, the 1st row (A4) is connected to the 1st output node, which will be activated (have a large value) if the network thinks the input is A4.

# In[13]:


def softmax(x):
    """A softmax function that is not numerically stable (it might overflow for large inputs)."""
    return np.exp(x) / np.exp(x).sum(axis=0)


output_for_x = softmax(np.dot(weights.transpose(), x[0]))
plt.bar(range(len(output_for_x)), output_for_x)
plt.xticks(range(len(output_for_x)), pitch_names)
plt.title('Output node values for x[0]')
print('Estimated pitch: {}'.format(pitch_names[np.argmax(output_for_x)]))
print('Groundtruth: {}'.format(pitch_names[np.argmax(y[0])]))
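# As the docstring above warns, the naive softmax can overflow when the logits are large. A common numerically stable variant (a side note here, with the made-up name `stable_softmax`) subtracts the maximum before exponentiating; the result is unchanged because softmax is invariant to adding a constant to all of its inputs.

# In[ ]:


def stable_softmax(x):
    """A numerically stable softmax: np.exp(x - x.max()) cannot overflow."""
    z = x - np.max(x, axis=0)  # shifting by the max does not change the output
    e = np.exp(z)
    return e / e.sum(axis=0)


# It matches the naive version on well-behaved inputs...
print(np.allclose(stable_softmax(np.dot(weights.transpose(), x[0])), output_for_x))
# ...and still works on large logits, where the naive version returns NaNs.
print(stable_softmax(np.array([1000., 1001., 1002.])))


# In[ ]: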