We're going to use synthesized data and CQT as the input representation.
%matplotlib inline
import numpy as np
import librosa
import keras
import keras.backend as K
from matplotlib import pyplot as plt
from future.utils import implements_iterator # for python 2 compatibility for __next__()
import warnings
warnings.filterwarnings('ignore')
Using TensorFlow backend.
def sin_wave(secs, freq, sr, gain):
'''
Generates a sine wave of frequency given by freq, with duration of secs.
'''
t = np.arange(sr * secs)
return gain * np.sin(2 * np.pi * freq * t / sr)
def whitenoise(gain, shape):
'''
Generates uniform white noise with the given gain and shape.
'''
return gain * np.random.uniform(-1., 1., shape)
def chord_wave(secs, f0, sr, gain, major):
"""major: bool"""
t = np.arange(sr * secs)
sine_f0 = gain * np.sin(2 * np.pi * f0 * t / sr)
if major:
sine_third = gain * np.sin(2 * np.pi * f0 * 2. ** (4./12.) * t / sr)
else:
sine_third = gain * np.sin(2 * np.pi * f0 * 2. ** (3./12.) * t / sr)
return sine_f0 + sine_third
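For example, with f0 = 440 Hz (A4), major=True adds a sine at 440 * 2**(4/12) ≈ 554.4 Hz (a major third up) and major=False adds one at 440 * 2**(3/12) ≈ 523.3 Hz (a minor third up) -- both of which also appear in the pitch list used below. A quick check of that arithmetic:
# Quick check of the third frequencies above an A4 (440 Hz) root:
print(440. * 2. ** (4. / 12.))  # ~554.4 Hz, major third
print(440. * 2. ** (3. / 12.))  # ~523.3 Hz, minor third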
def add_channel_axis(cqt):
if K.image_data_format() == 'channels_first':
return cqt[np.newaxis, :, :]
else:
return cqt[:, :, np.newaxis]
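And a tiny shape check for add_channel_axis (just an illustration; dummy_cqt is not part of the pipeline):
dummy_cqt = np.zeros((36, 3))             # 36 frequency bins x 3 frames
print(add_channel_axis(dummy_cqt).shape)  # (36, 3, 1) for channels_last, (1, 36, 3) for channels_first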
class DataGen:
def __init__(self, sr=16000, batch_size=128):
np.random.seed(1209)
self.pitches = [440., 466.2, 493.8, 523.3, 554.4, 587.3,
622.3, 659.3, 698.5, 740., 784.0, 830.6]
self.sr = sr
self.n_class = 2 # major or minor
self.secs = 1.
self.batch_size = batch_size
self.labels = np.eye(self.n_class)[range(0, self.n_class)] # 1-hot-vectors
self.major_cqts = []
self.minor_cqts = []
for freq in self.pitches:
cqt = librosa.cqt(chord_wave(self.secs, freq, self.sr, gain=0.5, major=True), sr=sr,
fmin=220, n_bins=36, filter_scale=2)[:, 2:5] # use three frames!
cqt = librosa.amplitude_to_db(cqt, ref=np.min)
cqt = cqt / np.max(cqt) # cqt in 2d
self.major_cqts.append(add_channel_axis(cqt))
cqt = librosa.cqt(chord_wave(self.secs, freq, self.sr, gain=0.5, major=False), sr=sr,
fmin=220, n_bins=36, filter_scale=2)[:, 2:5] # use three frames!
cqt = librosa.amplitude_to_db(cqt, ref=np.min)
cqt = cqt / np.max(cqt)
self.minor_cqts.append(add_channel_axis(cqt))
self.cqt_shape = add_channel_axis(cqt).shape # (1, 36, 3) or (36, 3, 1)
def __next__(self):
"""Yielding half Major, half minor"""
choice = np.random.choice(12, size=self.batch_size // 2, # pick pitches for this batch
replace=True)
noise_gain = 0.1 * np.random.random_sample(1) # a random noise gain
noise = whitenoise(noise_gain, self.cqt_shape) # generate white noise
xs = [noise + self.major_cqts[i] for i in choice] # compose a batch with additive noise (Major)
xs += [noise + self.minor_cqts[i] for i in choice] # compose a batch with additive noise (minor)
ys = np.eye(2)[np.hstack((np.zeros(self.batch_size // 2, dtype=np.int),
np.ones(self.batch_size // 2, dtype=np.int)))] # corresponding labels
return np.array(xs, dtype=np.float32), np.array(ys, dtype=np.float32)
next = __next__
So, Major is labeled as [1, 0], and minor as [0, 1].
datagen = DataGen()
x, y = next(datagen)
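A quick sanity check on the batch we just drew -- the first half should be Major and the second half minor:
print(y[0])   # Major label: [1, 0]
print(y[-1])  # minor label: [0, 1]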
plt.figure(figsize=(14, 7))
if K.image_data_format() == 'channels_first':
for i in range(4):
plt.subplot(1, 10, i+1)
plt.imshow(x[i, 0], cmap=plt.get_cmap('Blues'))
plt.subplot(1, 10, 6+i)
plt.imshow(x[64 + i, 0], cmap=plt.get_cmap('Blues'))
else:
for i in range(4):
plt.subplot(1, 10, i+1)
plt.imshow(x[i, :, :, 0], cmap=plt.get_cmap('Blues'))
plt.subplot(1, 10, 6+i)
plt.imshow(x[64 + i, :, :, 0], cmap=plt.get_cmap('Blues'))
(Major inputs on left, minor inputs on right.)
val_datagen = DataGen() # this is a generator for validation set
Each CQT input is (36, 3) -- 36 frequency bins by 3 frames -- with a single channel.
# Even a simpler network -- one conv layer and that's it!
# If you wanna try it:
# model = keras.models.Sequential()
# model.add(keras.layers.convolutional.Conv2D(datagen.n_class, (5, 3), use_bias=False, padding='same',
# input_shape=datagen.cqt_shape)) # a Conv2D layer with n_class (= 2) kernels of size (5, 3)
# model.add(keras.layers.pooling.GlobalMaxPooling2D())
# model.add(keras.layers.Activation('softmax')) # Softmax because it's single-label classification
# model.compile(optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.9, # a pretty standard optimizer
# decay=1e-6, nesterov=True),
# loss='categorical_crossentropy', # categorical crossentropy makes sense with Softmax
# metrics=['accuracy']) # we'll also measure the performance but it's NOT a loss function
np.random.seed(12345)
model = keras.models.Sequential()
model.add(keras.layers.convolutional.Conv2D(4, (5, 3), use_bias=False, padding='valid',
input_shape=datagen.cqt_shape)) # a Conv2D layer with 4 kernels of size (5, 3)
model.add(keras.layers.pooling.GlobalMaxPooling2D())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(datagen.n_class, use_bias=False))
model.add(keras.layers.Activation('softmax'))
model.compile(optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.9, # a pretty standard optimizer
decay=1e-6, nesterov=True),
loss='categorical_crossentropy', # categorical crossentropy makes sense with Softmax
metrics=['accuracy']) # we'll also measure the performance but it's NOT a loss function
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_1 (Conv2D)            (None, 32, 1, 4)          60        
_________________________________________________________________
global_max_pooling2d_1 (Glob (None, 4)                 0         
_________________________________________________________________
activation_1 (Activation)    (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 8         
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
=================================================================
Total params: 68
Trainable params: 68
Non-trainable params: 0
_________________________________________________________________
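The parameter count checks out: the conv layer has 5 * 3 * 1 * 4 = 60 weights (5x3 kernels, 1 input channel, 4 output channels, no bias) and the dense layer has 4 * 2 = 8 (4 pooled features to 2 classes, no bias), 68 in total. The same arithmetic in code:
n_conv = 5 * 3 * 1 * 4   # kernel height * kernel width * input channels * output channels
n_dense = 4 * 2          # pooled features * classes
print(n_conv, n_dense, n_conv + n_dense)  # 60 8 68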
history = model.fit_generator(datagen, steps_per_epoch=200, epochs=15, verbose=1,
validation_data=val_datagen, validation_steps=4)
Epoch 1/15
200/200 [==============================] - 1s - loss: 0.6179 - acc: 0.8118 - val_loss: 0.4627 - val_acc: 1.0000
Epoch 2/15
200/200 [==============================] - 0s - loss: 0.2446 - acc: 1.0000 - val_loss: 0.1072 - val_acc: 1.0000
Epoch 3/15
200/200 [==============================] - 1s - loss: 0.0611 - acc: 1.0000 - val_loss: 0.0396 - val_acc: 1.0000
Epoch 4/15
200/200 [==============================] - 0s - loss: 0.0279 - acc: 1.0000 - val_loss: 0.0262 - val_acc: 1.0000
Epoch 5/15
200/200 [==============================] - 0s - loss: 0.0170 - acc: 1.0000 - val_loss: 0.0130 - val_acc: 1.0000
Epoch 6/15
200/200 [==============================] - 0s - loss: 0.0122 - acc: 1.0000 - val_loss: 0.0090 - val_acc: 1.0000
Epoch 7/15
200/200 [==============================] - 1s - loss: 0.0093 - acc: 1.0000 - val_loss: 0.0114 - val_acc: 1.0000
Epoch 8/15
200/200 [==============================] - 1s - loss: 0.0075 - acc: 1.0000 - val_loss: 0.0072 - val_acc: 1.0000
Epoch 9/15
200/200 [==============================] - 0s - loss: 0.0063 - acc: 1.0000 - val_loss: 0.0080 - val_acc: 1.0000
Epoch 10/15
200/200 [==============================] - 0s - loss: 0.0053 - acc: 1.0000 - val_loss: 0.0049 - val_acc: 1.0000
Epoch 11/15
200/200 [==============================] - 0s - loss: 0.0049 - acc: 1.0000 - val_loss: 0.0046 - val_acc: 1.0000
Epoch 12/15
200/200 [==============================] - 0s - loss: 0.0039 - acc: 1.0000 - val_loss: 0.0042 - val_acc: 1.0000
Epoch 13/15
200/200 [==============================] - 1s - loss: 0.0037 - acc: 1.0000 - val_loss: 0.0035 - val_acc: 1.0000
Epoch 14/15
200/200 [==============================] - 1s - loss: 0.0033 - acc: 1.0000 - val_loss: 0.0044 - val_acc: 1.0000
Epoch 15/15
200/200 [==============================] - 1s - loss: 0.0029 - acc: 1.0000 - val_loss: 0.0022 - val_acc: 1.0000
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['acc'], label='training')
plt.plot(history.history['val_acc'], label='validation', alpha=0.7)
plt.title('Accuracy')
plt.xlabel('epoch')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='training')
plt.plot(history.history['val_loss'], label='validation', alpha=0.7)
plt.title('Loss')
plt.xlabel('epoch')
plt.legend()
loss = model.evaluate_generator(datagen, steps=10)
print("loss: {}, accuracy: {}".format(loss[0], loss[1]))
loss: 0.00349949840456, accuracy: 1.0
weights = model.get_weights()[0]
print("The Convolution2D layer weights shape is: {}".format(weights.shape))
The Convolution2D layer weights shape is: (5, 3, 1, 4)
That is, (kernel_height, kernel_width, n_input_channels, n_output_channels).
plt.figure(figsize=(20, 4))
titles = ['minor chord positive', 'minor chord negative?', 'wth??', 'minor chord positive']
for i in range(4):
plt.subplot(1, 4, i+1)
plt.imshow(weights[:, :, 0, i], cmap=plt.get_cmap('Blues'))
plt.colorbar()
plt.title(titles[i])
Okay, so out of four kernels, the 1st and 4th are definitely minor chord positive. Not sure for the other two.
Let's plot the feature maps, which are conv(input, kernel).
x_major = np.squeeze(x[0])
x_minor = np.squeeze(x[-1])
from scipy import signal
def plot_convolution(x, kernels):
"""kernels: 3d, (height, width, channel) and four channels."""
n_kernels = kernels.shape[2]
plt.figure(figsize=(9, 5))
plt.subplot(1, n_kernels+1, 1)
plt.imshow(x)
plt.title('Input x')
for idx in range(n_kernels):
plt.subplot(1, n_kernels+1, idx+2)
conved = signal.convolve2d(x, kernels[:, :, idx], mode='same')
plt.imshow(conved)
plt.title('max: {:4.2f}'.format(np.max(conved)))
plot_convolution(x_major, weights[:, :, 0, :])
print("Here's the feature map activation for a Major chord input")
Here's the feature map activation for a Major chord input
plot_convolution(x_minor, weights[:, :, 0, :])
print("Here's for a minor chord input")
Here's for a minor chord input
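One caveat: signal.convolve2d flips the kernel, while Keras's Conv2D actually computes a cross-correlation, so the maxima printed above won't exactly match the pooled values the network computes. If you want the real pooled activations the dense layer sees, you can read them out of the trained model directly -- a sketch (layer index 1 is the GlobalMaxPooling2D layer):
pool_model = keras.models.Model(inputs=model.input, outputs=model.layers[1].output)
pooled_major = pool_model.predict(x[:1])   # x[0] is a Major chord input
pooled_minor = pool_model.predict(x[-1:])  # x[-1] is a minor chord input
print(pooled_major)  # four values, one per kernel
print(pooled_minor)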
The 1st and 4th are probably the most distinctive kernels. Now let's move on to the dense layer weights.
dense_weights = model.get_weights()[1]
print(dense_weights)
[[-0.50099331  2.33464241]
 [ 2.9126389  -3.8759408 ]
 [-0.6134612  -0.3230224 ]
 [-3.05488276  3.24755979]]
Look at the dense layer weights! Each row corresponds to an input node (a pooled feature map) and each column to an output node (Major, minor).
The 1st kernel gets a weight of about -0.5 on the Major output node and 2.33 on the minor output node, so its activation pushes the prediction towards minor. The 2nd feature map, with [2.91, -3.88], contributes a large negative activation to the minor output node. Not sure what the 3rd kernel is doing, and its small weights [-0.61, -0.32] also say it's not too crucial. The 4th is minor chord positive, and [-3.05, 3.25] supports it again.
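To wrap up, here's a sketch of the rest of the forward pass done by hand, reusing pooled_minor from the sketch above: apply the ReLU, multiply by the dense weights, and softmax the result. For a minor chord input, the minor node should come out on top.
z = np.maximum(pooled_minor[0], 0.)                # ReLU after global max-pooling
logits = z.dot(dense_weights)                      # (4,) . (4, 2) -> (2,), no bias
probs = np.exp(logits) / np.sum(np.exp(logits))    # softmax
print(probs)  # [P(Major), P(minor)] -- should favour minor here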