#!/usr/bin/env python
# coding: utf-8

# # Example 2 - A very simple chord recognition convnet
# 
# We're gonna use synthesize data and use CQT as representation. 

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import librosa
import keras
import keras.backend as K
from matplotlib import pyplot as plt
from future.utils import implements_iterator  # for python 2 compatibility for __next__()

import warnings
warnings.filterwarnings('ignore')


# ### Utility function to generate data

# In[2]:


def sin_wave(secs, freq, sr, gain):
    '''
    Generates a sine wave of frequency given by freq, with duration of secs.
    '''
    t = np.arange(sr * secs)
    return gain * np.sin(2 * np.pi * freq * t / sr)


def whitenoise(gain, shape):
    '''
    Generates white noise of duration given by secs
    '''
    return gain * np.random.uniform(-1., 1., shape)

def chord_wave(secs, f0, sr, gain, major):
    """major: bool"""
    t = np.arange(sr * secs)
    sine_f0 = gain * np.sin(2 * np.pi * f0 * t / sr)
    if major:
        sine_third = gain * np.sin(2 * np.pi * f0 * 2. ** (4./12.) * t / sr)
    else:
        sine_third = gain * np.sin(2 * np.pi * f0 * 2. ** (3./12.) * t / sr)
    return sine_f0 + sine_third


# In[3]:


def add_channel_axis(cqt):
    if K.image_data_format == 'channels_first':
        return cqt[np.newaxis, :, :]
    else:
        return cqt[:, :, np.newaxis]


# In[4]:


class DataGen:
    def __init__(self, sr=16000, batch_size=128):
        np.random.seed(1209)
        self.pitches = [440., 466.2, 493.8, 523.3, 554.4, 587.3,
                        622.3, 659.3, 698.5, 740., 784.0, 830.6]

        self.sr = sr
        self.n_class = 2 # major or minor
        self.secs = 1.
        self.batch_size = batch_size
        self.labels = np.eye(self.n_class)[range(0, self.n_class)]  # 1-hot-vectors
    
        self.major_cqts = []
        self.minor_cqts = []
        
        for freq in self.pitches:
            cqt = librosa.cqt(chord_wave(self.secs, freq, self.sr, gain=0.5, major=True), sr=sr,
                              fmin=220, n_bins=36, filter_scale=2)[:, 2:5]  # use three frames!
            cqt = librosa.amplitude_to_db(cqt, ref=np.min)
            cqt = cqt / np.max(cqt) # cqt in 2d
            
            self.major_cqts.append(add_channel_axis(cqt))
            
            cqt = librosa.cqt(chord_wave(self.secs, freq, self.sr, gain=0.5, major=False), sr=sr,
                              fmin=220, n_bins=36, filter_scale=2)[:, 2:5]  # use three frame!
            cqt = librosa.amplitude_to_db(cqt, ref=np.min)
            cqt = cqt / np.max(cqt)
            self.minor_cqts.append(add_channel_axis(cqt))

        self.cqt_shape = add_channel_axis(cqt).shape  # (1, 36, 3) or (36, 3, 1)

    def __next__(self): 
        """Yielding half Major, half minor"""
        choice = np.random.choice(12, size=self.batch_size // 2, # pick pitches for this batch
                                  replace=True)
        noise_gain = 0.1 * np.random.random_sample(1)  # a random noise gain 
        noise = whitenoise(noise_gain, self.cqt_shape)  # generate white noise
        xs = [noise + self.major_cqts[i] for i in choice]  # compose a batch with additive noise (Major)
        xs += [noise + self.minor_cqts[i] for i in choice]  # compose a batch with additive noise (minor)
        
        ys = np.eye(2)[np.hstack((np.zeros(self.batch_size // 2, dtype=np.int), 
                                  np.ones(self.batch_size // 2, dtype=np.int)))] # corresponding labels

        return np.array(xs, dtype=np.float32), np.array(ys, dtype=np.float32)
    
    next = __next__


# So, Major is labeled as [1, 0], and minor as [0, 1].

# ## Experiment

# In[6]:


datagen = DataGen()
x, y = next(datagen)


# ### How does input data look like?

# In[7]:


plt.figure(figsize=(14, 7))
if K.image_data_format == 'channels_first':
    for i in range(4):
        plt.subplot(1, 10, i+1)
        plt.imshow(x[i, 0], cmap=plt.get_cmap('Blues'))
        plt.subplot(1, 10, 6+i)
        plt.imshow(x[64 + i, 0], cmap=plt.get_cmap('Blues'))
else:
    for i in range(4):
        plt.subplot(1, 10, i+1)
        plt.imshow(x[i, :, :, 0], cmap=plt.get_cmap('Blues'))
        plt.subplot(1, 10, 6+i)
        plt.imshow(x[64 + i, :, :, 0], cmap=plt.get_cmap('Blues'))


# (Major inputs on left, minor inputs on right.)

# ### Buld a convnet model!

# In[8]:


val_datagen = DataGen() # this is a generator for validation set


# * This convlayer takes `(36, 3)` with single channel input.
# * The kernel size is (5, 3) so that it can cover the spectral patterns of major and minor chords.

# In[9]:


# even a simplet network -- one conv layer and that's it!
# If you wanna try it. 


# model = keras.models.Sequential()
# model.add(keras.layers.convolutional.Conv2D(datagen.n_class, (5, 3), use_bias=False, padding='same',
#                              input_shape=datagen.cqt_shape)) # A conv2d layer (36 input nodes --> 8 output nodes)
# model.add(keras.layers.pooling.GlobalMaxPooling2D())
# model.add(keras.layers.Activation('softmax'))  # Softmax because it's single-label classification

# model.compile(optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.9,  # a pretty standard optimizer
#                                              decay=1e-6, nesterov=True),
#               loss='categorical_crossentropy',  # categorical crossentropy makes sense with Softmax
#               metrics=['accuracy'])  # we'll also measure the performance but it's NOT a loss function  


# In[9]:


np.random.seed(12345)
model = keras.models.Sequential()
model.add(keras.layers.convolutional.Conv2D(4, (5, 3), use_bias=False, padding='valid',
                             input_shape=datagen.cqt_shape)) # A conv2d layer (36 input nodes --> 8 output nodes)
model.add(keras.layers.pooling.GlobalMaxPooling2D())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(datagen.n_class, use_bias=False))
model.add(keras.layers.Activation('softmax'))

model.compile(optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.9,  # a pretty standard optimizer
                                             decay=1e-6, nesterov=True),
              loss='categorical_crossentropy',  # categorical crossentropy makes sense with Softmax
              metrics=['accuracy'])  # we'll also measure the performance but it's NOT a loss function  


# In[10]:


model.summary()


# ### Train it!

# In[11]:


history = model.fit_generator(datagen, steps_per_epoch=200, epochs=15, verbose=1,
                             validation_data=val_datagen, validation_steps=4)


# In[12]:


plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['acc'], label='training')
plt.plot(history.history['val_acc'], label='validation', alpha=0.7)
plt.title('Accuracy')
plt.xlabel('epoch')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='training')
plt.plot(history.history['val_loss'], label='validation', alpha=0.7)
plt.title('Loss')
plt.xlabel('epoch')
plt.legend()


# In[13]:


loss = model.evaluate_generator(datagen, steps=10)
print("loss: {}, accuracy: {}".format(loss[0], loss[1]))


# In[14]:


weights = model.get_weights()[0]
print("The Convolution2D layer weights shape is: {}".format(weights.shape))


# , which is (height_kernel, width_kernel, input_channel_number, output_channel_number)

# # Understanding by visualising

# ## Visualising the kernels

# In[16]:


plt.figure(figsize=(20, 4))
titles = ['minor chord positive', 'minor chord negative?', 'wth??', 'minor chord positive']
for i in range(4):
    plt.subplot(1, 4, i+1)
    plt.imshow(weights[:, :, 0, i], cmap=plt.get_cmap('Blues'))
    plt.colorbar()
    plt.title(titles[i])


# Okay, so out of four kernels, the 1st and 4th are definitely minor chord positive. Not sure for the other two.
# 
# ## Visualising the feature maps
# 
# which is `conv(input, kernel)`.

# In[18]:


x_major = np.squeeze(x[0])
x_minor = np.squeeze(x[-1])
from scipy import signal
def plot_convolution(x, kernels):
    """kernels: 3d, (height, width, channel) and four channels."""
    n_kernels = kernels.shape[2]
    plt.figure(figsize=(9, 5))
    plt.subplot(1, n_kernels+1, 1)
    plt.imshow(x)
    plt.title('Input x')
    for idx in range(n_kernels):
        plt.subplot(1, n_kernels+1, idx+2)
        conved = signal.convolve2d(x, kernels[:, :, idx], mode='same')
        plt.imshow(conved)
        plt.title('max: {:4.2f}'.format(np.max(conved)))


# In[19]:


plot_convolution(x_major, weights[:, :, 0, :])
print("Here's the feature map activation for a Major chord input")


# In[20]:


plot_convolution(x_minor, weights[:, :, 0, :])
print("Here's for a minor chord input")


# The 1st and 4th probably are most distinctive kernels. Now Let's move to the dense layer weights.
# 
# ## Dense layer weights

# In[21]:


dense_weights = model.get_weights()[1]
print(dense_weights)


# Look at the dense layer weights! Each row-->input, each column-->output. 
# 
# 1. The 1st feature map will have an high activation on minor chords. 
#   - After global maxpooling, this feature map will have large value on minor chord input, and small value on Major chord input. 
#   - The value is multiplied to `-0.496` on the Major output node, and `2.33` on the minor output node. 
#   - Seems alright, huh?
# 
# 2. The 2nd feature map will have a large negative activation on minor chord.
# 
# 3. Not sure what this kernel is doing. `[-0.6, -0.3]` weights also say this kernel is not too crucial.
# 
# 4. The 4th is minor chord positive. `[-3.05, 3.25` supports it again.

# In[ ]: