#!/usr/bin/env python
# coding: utf-8
# # The Very Basics of Musical Instruments Classification using Machine Learning
# ## Short-Time Fourier Transform (STFT) and Convolutional Neural Networks (CNN)
#
# ## Imports
# In[1]:
get_ipython().system('wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb')
get_ipython().system('dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb')
get_ipython().system('apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub')
get_ipython().system('apt-get update')
get_ipython().system('apt-get install cuda=9.0.176-1')
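# An optional check (sketch): confirm the CUDA toolkit is visible after installation.
# In[ ]:
get_ipython().system('nvcc --version')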
# In[2]:
get_ipython().system('pip install tensorflow==1.12.0')
# In[3]:
get_ipython().system('pip install tensorflow-gpu==1.12.0')
# In[4]:
get_ipython().system('pip install keras==2.2.4')
# In[5]:
get_ipython().system('pip install tensorboardcolab')
# In[6]:
import warnings
warnings.filterwarnings('ignore')
# In[7]:
import tensorflow as tf
print(tf.__version__)
# In[8]:
# Imports
#General
import numpy as np
import itertools
# System
import os, fnmatch
# Data
import pandas as pd
# Visualization
import seaborn
import matplotlib.pyplot as plt
from IPython.core.display import HTML, display, Image
# Machine Learning
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.metrics import confusion_matrix, f1_score, classification_report
# Deep Learning
import tensorflow as tf
from tensorflow.python.client import device_lib
from keras.backend.tensorflow_backend import set_session
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, merge
from keras.layers.normalization import BatchNormalization
from keras.callbacks import History, EarlyStopping, ModelCheckpoint
from keras.models import load_model
# Random Seed
from tensorflow import set_random_seed
from numpy.random import seed
seed(0)
set_random_seed(0)
# Audio
import librosa.display, librosa
from librosa.util import normalize as normalize
import IPython.display as ipd
# Configurations
path='./audio/london_phill_dataset_multi/'
# Display CPUs and GPUs
print(device_lib.list_local_devices())
# ## Configurations for Google Colab
# In[9]:
# Only for Google Colab
try:
    import google.colab
    if "GPU:0" in tf.test.gpu_device_name():
        get_ipython().system('nvidia-smi')
        # device_count must be 1 to expose the GPU (a value of 0 would hide it)
        config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True, device_count={'GPU': 1})
        config.gpu_options.allow_growth = True
        session = tf.Session(config=config)
        set_session(session)
    else:
        print("No GPU detected. Configure the runtime to use one.")
    get_ipython().system('git clone https://github.com/GuitarsAI/BasicsMusicalInstrumClassifi')
    get_ipython().system('unzip ./BasicsMusicalInstrumClassifi/audio/*.zip -d ./BasicsMusicalInstrumClassifi/audio')
    path = "./BasicsMusicalInstrumClassifi/audio/london_phill_dataset_multi/"
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)
    from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
except Exception as e:
    print("Not inside Google Colab: %s. Using standard configurations." % (e))
    get_ipython().system('cat /proc/cpuinfo')
# ## Parameters
# In[10]:
# Signal Processing Parameters
fs = 44100 # Sampling Frequency
n_fft = 2048 # length of the FFT window
hop_length = 512 # Number of samples between successive frames
# Machine Learning Parameters
testset_size = 0.25  # Fraction of the data held out for testing
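# As a sanity check (sketch): with these settings, the STFT of a 1 s clip has
# n_fft/2 + 1 = 1025 frequency bins and 1 + floor(fs/hop_length) = 87 frames,
# which matches the CNN input shape (1025, 87, 1) used below.
# In[ ]:
print("STFT shape for 1 s of audio:", (n_fft // 2 + 1, 1 + int(fs) // hop_length))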
# ## Aux Functions
# In[11]:
# Function to Display a Website
display(HTML(""))
def show_web(url):
html_code='' \
% (url)
display(HTML(html_code))
# ## Find Audio Files, Generate Labels and Get Duration
# In[12]:
#Find Audio Files
files = []
labels =[]
duration = []
classes=['flute','sax','oboe', 'cello','trumpet','viola']
for root, dirnames, filenames in os.walk(path):
    for i, filename in enumerate(fnmatch.filter(filenames, '*.mp3')):
        filepath = os.path.join(root, filename)
        files.append(filepath)
        for name in classes:
            if fnmatch.fnmatchcase(filename, '*' + name + '*'):
                labels.append(name)
                break
        else:
            labels.append('other')
        print("Get %d = %s" % (i + 1, filename))
        try:
            # Load the file just found (not files[i], whose index resets per directory)
            y, sr = librosa.load(filepath, sr=fs)
            if len(y) < 2:
                print("Error loading %s" % filename)
                continue
            #y /= y.max() # Normalize
            yt, index = librosa.effects.trim(y, top_db=60)  # Trim leading/trailing silence
            duration.append(librosa.get_duration(yt, sr=fs))
        except Exception as e:
            print("Error loading %s. Error: %s" % (filename, e))
print("Found %d audio files in %s" % (len(files), path))
# In[13]:
print("Max. Duration:", max(duration))
print("Min. Duration:", min(duration))
print("Average Duration:", np.mean(duration))
# ## Trim Silence and Recalculate Duration
# In[14]:
# Load audio files, trim silence and calculate duration
duration = []
for i, f in enumerate(files):
    print("Get %d %s" % (i + 1, f))
    try:
        y, sr = librosa.load(f, sr=fs)
        if len(y) < 2:
            print("Error loading %s" % f)
            continue
        #y /= y.max() # Normalize
        yt, index = librosa.effects.trim(y, top_db=60)  # Trim leading/trailing silence
        duration.append(librosa.get_duration(yt, sr=fs))
    except Exception as e:
        print("Error loading %s. Error: %s" % (f, e))
print("Calculated %d Durations"%len(duration))
# ## Duration Distribution
# In[15]:
durationDist = pd.Series(np.array(duration))
plt.figure()
durationDist.plot.hist(grid=True, bins=40, rwidth=0.8, color='#607c8e')
plt.title('Duration Distribution')
plt.xlabel('Duration [s]')
plt.ylabel('Counts')
plt.grid(axis='y', alpha=0.75)
print("Duration average:",np.mean(duration))
# ## Short-Time Fourier Transform
# In[16]:
show_web("https://en.wikipedia.org/wiki/Short-time_Fourier_transform")
# In[17]:
# STFT Example
y, sr = librosa.load(files[10], sr=fs, duration=1)
y/=y.max() #Normalize
duration_in_samples=librosa.time_to_samples(1, sr=fs)
y_pad = librosa.util.fix_length(y, duration_in_samples) #Pad to 1s if smaller
y_stft = librosa.core.stft(y_pad, n_fft=n_fft, hop_length=hop_length)
y_spec = librosa.amplitude_to_db(np.abs(y_stft), ref=np.max)
plt.figure(figsize=(14,8))
plt.title("Short-Time Fourier Transform Spectogram \n %s"%files[0])
librosa.display.specshow(y_spec,sr=fs,y_axis='log', x_axis='time')
plt.colorbar(format='%+2.0f dB');
print("Spectogram Array Shape:",y_spec.shape)
ipd.Audio(y, rate=fs)
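# A round-trip check (sketch): because the complex STFT matrix was kept,
# inverting it with `librosa.istft` should closely reconstruct the padded waveform.
# In[ ]:
y_inv = librosa.core.istft(y_stft, hop_length=hop_length)
print("Max reconstruction error:", np.max(np.abs(y_pad[:len(y_inv)] - y_inv)))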
# ## Encode Labels
# In[18]:
# Encode Labels
labelencoder = LabelEncoder()
labelencoder.fit(labels)
print(len(labelencoder.classes_), "classes:", ", ".join(list(labelencoder.classes_)))
classes_num = labelencoder.transform(labels)
#OneHotEncoding
encoder=OneHotEncoder(sparse=False, categories="auto")
onehot_labels=encoder.fit_transform(classes_num.reshape(len(classes_num),1))
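# A quick sketch of the two-step encoding: `LabelEncoder` maps class names to
# integers (in alphabetical order), and `OneHotEncoder` turns each integer into
# a one-hot row.
# In[ ]:
print(labelencoder.transform(['cello', 'flute']))  # expected [0 1], given alphabetical class order
print(onehot_labels[0], "->", labels[0])           # one-hot row for the first file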
# ## Train and Test Sets
# In[19]:
# Create Train and Test Sets
splitter = StratifiedShuffleSplit(n_splits=1, test_size=testset_size, random_state=0)
splits = splitter.split(files, onehot_labels)
files_arr=np.array(files)
for train_index, test_index in splits:
    train_set_files = files_arr[train_index]
    test_set_files = files_arr[test_index]
    train_classes = onehot_labels[train_index]
    test_classes = onehot_labels[test_index]
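# Because the split is stratified, each class should appear in the test set in
# roughly the same proportion as in the full set; a quick check (sketch):
# In[ ]:
print("Train samples per class:", train_classes.sum(axis=0))
print("Test samples per class: ", test_classes.sum(axis=0))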
# ## Convolutional Neural Networks
# In[20]:
show_web("https://en.wikipedia.org/wiki/Convolutional_neural_network")
# ## Create Model
# In[21]:
# CNN Model
model = Sequential()
conv_filters = 16 # number of convolution filters
# Layer 1
model.add(Convolution2D(conv_filters, 3, input_shape=(1025, 87, 1)))  # (n_fft/2+1 bins, 87 frames, 1 channel)
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.40))
# Layer 2
model.add(Convolution2D(conv_filters, 3))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.40))
# Flatten
model.add(Flatten())
# Full layer
model.add(Dense(16, activation='sigmoid'))
# Output layer
model.add(Dense(6,activation='softmax'))
# In[22]:
model.summary()
# In[23]:
# Loss Function
loss = 'categorical_crossentropy'
# Optimizer = Gradient Descent
optimizer = 'sgd'
# Compile
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
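# For a one-hot target y and softmax output y_hat, the categorical cross-entropy
# minimized here is L = -sum_i y_i * log(y_hat_i), i.e. the negative log-probability
# assigned to the true class. A tiny numeric illustration with hypothetical values (sketch):
# In[ ]:
y_true = np.array([0., 1., 0., 0., 0., 0.])          # one-hot target (class 1)
y_hat = np.array([0.1, 0.6, 0.1, 0.1, 0.05, 0.05])   # hypothetical softmax output
print("Cross-entropy:", -np.sum(y_true * np.log(y_hat)))  # = -log(0.6), about 0.51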
# ## Train Model
# In[24]:
def featureGenerator(files, labels):
    # Yields one (spectrogram, one-hot label) pair per file, looping forever
    # as required by Keras' fit_generator.
    while True:
        for i, f in enumerate(files):
            try:
                feature_vectors = []
                label = []
                y, sr = librosa.load(f, sr=fs)
                if len(y) < 2:
                    print("Error loading %s" % f)
                    continue
                y, index = librosa.effects.trim(y, top_db=60)  # Trim silence
                y = normalize(y)
                duration_in_samples = librosa.time_to_samples(1, sr=fs)
                y_pad = librosa.util.fix_length(y, duration_in_samples)  # Pad/trim to 1 s
                y_stft = librosa.core.stft(y_pad, n_fft=n_fft, hop_length=hop_length)
                y_spec = librosa.amplitude_to_db(np.abs(y_stft), ref=np.min)
                scaler = StandardScaler()
                dtype = K.floatx()
                data = scaler.fit_transform(y_spec).astype(dtype)
                data = np.expand_dims(data, axis=0)  # add batch dimension
                data = np.expand_dims(data, axis=3)  # add channel dimension
                feature_vectors.append(data)
                label.append([labels[i]])
                yield feature_vectors, label
            except Exception as e:
                print("Error loading %s. Error: %s" % (f, e))
                raise
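# A quick smoke test (sketch): pull one sample from the generator and check that
# its shapes match the model's input (1, 1025, 87, 1) and output (1, 6).
# In[ ]:
x_check, y_check = next(featureGenerator(train_set_files, train_classes))
print(np.shape(x_check[0]), np.shape(y_check[0]))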
# In[27]:
get_ipython().run_cell_magic('time', '', 'hist = History();\nes = EarlyStopping(monitor=\'val_acc\', min_delta=0.01, restore_best_weights=True, patience= 10, verbose=1 )\nmc = ModelCheckpoint(\'best_model.h5\', monitor=\'val_acc\',save_best_only=True, verbose=1)\n\ncallbacksKeras=[hist,es,mc]\n\ntry:\n import google.colab\n tbc=TensorBoardColab()\n callbacksKeras=[hist,es,mc,TensorBoardColabCallback(tbc)]\n\nexcept Exception as e:\n callbacksKeras=[hist,es,mc]\n print("Not inside Google Colab: %s. Using standard configurations." % (e))\n\n\nmodel.fit_generator(featureGenerator(train_set_files, train_classes), \n validation_data=(featureGenerator(test_set_files, test_classes)), \n validation_steps=150, \n steps_per_epoch=450,epochs=3,callbacks=callbacksKeras, verbose=1)\n')
# In[28]:
def plot_history(history):
    loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' not in s]
    val_loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' in s]
    acc_list = [s for s in history.history.keys() if 'acc' in s and 'val' not in s]
    val_acc_list = [s for s in history.history.keys() if 'acc' in s and 'val' in s]
    if len(loss_list) == 0:
        print('Loss is missing in history')
        return
    ## As loss always exists
    epochs = range(1, len(history.history[loss_list[0]]) + 1)
    ## Loss
    plt.figure(1)
    for l in loss_list:
        plt.plot(epochs, history.history[l], 'b', label='Training loss')
    for l in val_loss_list:
        plt.plot(epochs, history.history[l], 'g', label='Validation loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    ## Accuracy
    plt.figure(2)
    for l in acc_list:
        plt.plot(epochs, history.history[l], 'b', label='Training accuracy')
    for l in val_acc_list:
        plt.plot(epochs, history.history[l], 'g', label='Validation accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
# In[29]:
plot_history(hist)
# ## Evaluate Model
# In[30]:
saved_model = load_model('best_model.h5')
test_pred = saved_model.predict_generator(featureGenerator(test_set_files, test_classes), steps=150,verbose=1)
# In[31]:
# Take the class with the highest softmax probability
# (rounding first can mis-assign samples where no probability reaches 0.5)
predictions_int = np.argmax(test_pred, axis=1)
predictions_labels = labelencoder.inverse_transform(np.ravel(predictions_int))
# In[32]:
# Recall - the ability of the classifier to find all the positive samples
print("Recall: ", recall_score(classes_num[test_index], predictions_int,average=None))
# Precision - The precision is intuitively the ability of the classifier not to
#label as positive a sample that is negative
print("Precision: ", precision_score(classes_num[test_index], predictions_int,average=None))
# F1-Score - The F1 score can be interpreted as a weighted average of the precision
#and recall
print("F1-Score: ", f1_score(classes_num[test_index], predictions_int, average=None))
# Accuracy - the fraction (and absolute number) of correctly classified samples
print("Accuracy: %.2f ," % accuracy_score(classes_num[test_index], predictions_int, normalize=True), accuracy_score(classes_num[test_index], predictions_int, normalize=False))
print("Number of samples:",classes_num[test_index].shape[0])
print(classification_report(classes_num[test_index], predictions_int))
# In[33]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(classes_num[test_index], predictions_int)
np.set_printoptions(precision=2)
# In[34]:
# Function to Plot Confusion Matrix
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # The normalization code below must live outside the docstring, or it never runs
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    #print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# In[35]:
# Plot non-normalized confusion matrix
plt.figure(figsize=(16,12))
plot_confusion_matrix(cnf_matrix, classes=labelencoder.classes_,
                      title='Confusion matrix, without normalization')
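# For comparison (sketch): the row-normalized matrix shows per-class recall directly.
# In[ ]:
plt.figure(figsize=(16,12))
plot_confusion_matrix(cnf_matrix, classes=labelencoder.classes_, normalize=True,
                      title='Normalized confusion matrix')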
# In[36]:
# Find wrong predicted samples indexes
wrong_predictions = [i for i, (e1, e2) in enumerate(zip(classes_num[test_index], predictions_int)) if e1 != e2]
# In[37]:
# Find wrong predicted audio files
print(np.array(labels)[test_index[wrong_predictions]])
print(predictions_labels[wrong_predictions].T)
print(np.array(files)[test_index[wrong_predictions]])
# In[ ]: