#!/usr/bin/env python
# coding: utf-8

# # The Very Basics of Musical Instrument Classification using Machine Learning
# ## Short-Time Fourier Transform (STFT) and Convolutional Neural Networks (CNN)

# Business Card (image)
# ## Imports

# In[1]:

# Install CUDA 9.0 (required by the TensorFlow 1.12 GPU build)
get_ipython().system('wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb')
get_ipython().system('dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb')
get_ipython().system('apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub')
get_ipython().system('apt-get update')
get_ipython().system('apt-get install cuda=9.0.176-1')

# In[2]:

get_ipython().system('pip install tensorflow==1.12.0')

# In[3]:

get_ipython().system('pip install tensorflow-gpu==1.12.0')

# In[4]:

get_ipython().system('pip install keras==2.2.4')

# In[5]:

get_ipython().system('pip install tensorboardcolab')

# In[6]:

import warnings
warnings.filterwarnings('ignore')

# In[7]:

import tensorflow as tf
print(tf.__version__)

# In[8]:

# Imports

# General
import numpy as np
import itertools

# System
import os, fnmatch

# Data
import pandas as pd

# Visualization
import seaborn
import matplotlib.pyplot as plt
from IPython.core.display import HTML, display, Image

# Machine Learning
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.metrics import confusion_matrix, f1_score, classification_report

# Deep Learning
import tensorflow as tf
from tensorflow.python.client import device_lib
from keras.backend.tensorflow_backend import set_session
from keras import backend as K
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.callbacks import History, EarlyStopping, ModelCheckpoint

# Random Seed
from tensorflow import set_random_seed
from numpy.random import seed
seed(0)
set_random_seed(0)

# Audio
import librosa.display, librosa
from librosa.util import normalize as normalize
import IPython.display as ipd

# Configurations
path = './audio/london_phill_dataset_multi/'

# Display CPUs and GPUs
print(device_lib.list_local_devices())

# ## Configurations for Google Colab

# In[9]:

# Only for Google Colab
try:
    import google.colab
    if "GPU:0" in tf.test.gpu_device_name():
        get_ipython().system('nvidia-smi')
        config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True,
                                device_count={'GPU': 1})  # expose one GPU to the session
        config.gpu_options.allow_growth = True
        session = tf.Session(config=config)
        set_session(session)
    else:
        print("No GPU Detected. Configure the Runtime.")
    get_ipython().system('git clone https://github.com/GuitarsAI/BasicsMusicalInstrumClassifi')
    get_ipython().system('unzip ./BasicsMusicalInstrumClassifi/audio/*.zip -d ./BasicsMusicalInstrumClassifi/audio')
    path = "./BasicsMusicalInstrumClassifi/audio/london_phill_dataset_multi/"
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)
    from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
except Exception as e:
    print("Not inside Google Colab: %s. Using standard configurations." % (e))
    get_ipython().system('cat /proc/cpuinfo')

# ## Parameters

# In[10]:

# Signal Processing Parameters
fs = 44100           # Sampling frequency
n_fft = 2048         # Length of the FFT window
hop_length = 512     # Number of samples between successive frames

# Machine Learning Parameters
testset_size = 0.25  # Fraction of the data held out for testing

# ## Aux Functions

# In[11]:

# Function to display a website inside the notebook.
# Note: the iframe markup was lost in the export; this is a minimal reconstruction.
def show_web(url):
    html_code = '<iframe src="%s" width="1000" height="550"></iframe>' % (url)
    display(HTML(html_code))
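# A quick sanity check (not part of the original notebook): with these
# parameters, a 1 s clip at fs = 44100 Hz yields an STFT of shape
# (n_fft/2 + 1, 1 + floor(fs/hop_length)) = (1025, 87), assuming librosa's
# default centered framing. This is where the CNN input shape below comes from.
n_bins = n_fft // 2 + 1           # 1025 frequency bins
n_frames = 1 + fs // hop_length   # 87 time frames for 1 s of audio
print("Expected spectrogram shape: (%d, %d)" % (n_bins, n_frames))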
# ## Find Audio Files, Generate Labels and Get Duration

# In[12]:

# Find audio files, derive a label from each filename and measure durations
files = []
labels = []
duration = []
classes = ['flute', 'sax', 'oboe', 'cello', 'trumpet', 'viola']
for root, dirnames, filenames in os.walk(path):
    for i, filename in enumerate(fnmatch.filter(filenames, '*.mp3')):
        files.append(os.path.join(root, filename))
        for name in classes:
            if fnmatch.fnmatchcase(filename, '*' + name + '*'):
                labels.append(name)
                break
        else:
            labels.append('other')
        print("Get %d = %s" % (i + 1, filename))
        try:
            y, sr = librosa.load(files[-1], sr=fs)  # load the file just appended
            if len(y) < 2:
                print("Error loading %s" % filename)
                continue
            #y /= y.max()  # Normalize
            yt, index = librosa.effects.trim(y, top_db=60)  # Trim leading/trailing silence
            duration.append(librosa.get_duration(yt, sr=fs))
        except Exception as e:
            print("Error loading %s. Error: %s" % (filename, e))
print("Found %d audio files in %s" % (len(files), path))

# In[13]:

print("Max. Duration:", max(duration))
print("Min. Duration:", min(duration))
print("Average Duration:", np.mean(duration))

# ## Trim Silence and Recalculate Duration

# In[14]:

# Load audio files, trim silence and recalculate durations
duration = []
for i, f in enumerate(files):
    print("Get %d %s" % (i + 1, f))
    try:
        y, sr = librosa.load(f, sr=fs)
        if len(y) < 2:
            print("Error loading %s" % f)
            continue
        #y /= y.max()  # Normalize
        yt, index = librosa.effects.trim(y, top_db=60)  # Trim
        duration.append(librosa.get_duration(yt, sr=fs))
    except Exception as e:
        print("Error loading %s. Error: %s" % (f, e))
print("Calculated %d Durations" % len(duration))

# ## Duration Distribution

# In[15]:

durationDist = pd.Series(np.array(duration))
plt.figure()
durationDist.plot.hist(grid=True, bins=40, rwidth=0.8, color='#607c8e')
plt.title('Duration Distribution')
plt.xlabel('Duration [s]')
plt.ylabel('Counts')
plt.grid(axis='y', alpha=0.75)
print("Duration average:", np.mean(duration))

# ## Short-Time Fourier Transform

# In[16]:

show_web("https://en.wikipedia.org/wiki/Short-time_Fourier_transform")

# In[17]:

# STFT Example
y, sr = librosa.load(files[10], sr=fs, duration=1)
y /= y.max()  # Normalize
duration_in_samples = librosa.time_to_samples(1, sr=fs)
y_pad = librosa.util.fix_length(y, duration_in_samples)  # Pad to 1 s if shorter
y_stft = librosa.core.stft(y_pad, n_fft=n_fft, hop_length=hop_length)
y_spec = librosa.amplitude_to_db(abs(y_stft), np.max)
plt.figure(figsize=(14, 8))
plt.title("Short-Time Fourier Transform Spectrogram \n %s" % files[10])
librosa.display.specshow(y_spec, sr=fs, y_axis='log', x_axis='time')
plt.colorbar(format='%+2.0f dB')
print("Spectrogram Array Shape:", y_spec.shape)
ipd.Audio(y, rate=fs)

# ## Encode Labels

# In[18]:

# Encode Labels
labelencoder = LabelEncoder()
labelencoder.fit(labels)
print(len(labelencoder.classes_), "classes:", ", ".join(list(labelencoder.classes_)))
classes_num = labelencoder.transform(labels)

# One-Hot Encoding
encoder = OneHotEncoder(sparse=False, categories="auto")
onehot_labels = encoder.fit_transform(classes_num.reshape(len(classes_num), 1))

# ## Train and Test Sets

# In[19]:

# Create stratified train and test sets
splitter = StratifiedShuffleSplit(n_splits=1, test_size=testset_size, random_state=0)
splits = splitter.split(files, onehot_labels)
files_arr = np.array(files)
for train_index, test_index in splits:
    train_set_files = files_arr[train_index]
    test_set_files = files_arr[test_index]
    train_classes = onehot_labels[train_index]
    test_classes = onehot_labels[test_index]
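# Optional sanity check (not part of the original notebook): a stratified
# split should keep the class proportions roughly equal in both sets.
print("Train:", pd.Series(labelencoder.inverse_transform(
    np.argmax(train_classes, axis=1))).value_counts().to_dict())
print("Test: ", pd.Series(labelencoder.inverse_transform(
    np.argmax(test_classes, axis=1))).value_counts().to_dict())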
# ## Convolutional Neural Networks

# In[20]:

show_web("https://en.wikipedia.org/wiki/Convolutional_neural_network")

# ## Create Model

# In[21]:

# CNN Model
model = Sequential()
conv_filters = 16  # Number of convolution filters

# Layer 1
model.add(Convolution2D(conv_filters, 3, input_shape=(1025, 87, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.40))

# Layer 2
model.add(Convolution2D(conv_filters, 3))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.40))

# Flatten
model.add(Flatten())

# Fully connected layer
model.add(Dense(16, activation='sigmoid'))

# Output layer
model.add(Dense(6, activation='softmax'))

# In[22]:

model.summary()

# In[23]:

# Loss Function
loss = 'categorical_crossentropy'

# Optimizer = Gradient Descent
optimizer = 'sgd'

# Compile
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

# ## Train Model

# In[24]:

# Generator that yields one (spectrogram, one-hot label) pair at a time,
# looping over the file list indefinitely as Keras generators must.
def featureGenerator(files, labels):
    while True:
        for i, f in enumerate(files):
            try:
                feature_vectors = []
                label = []
                y, sr = librosa.load(f, sr=fs)
                if len(y) < 2:
                    print("Error loading %s" % f)
                    continue
                y, index = librosa.effects.trim(y, top_db=60)  # Trim silence
                y = normalize(y)
                duration_in_samples = librosa.time_to_samples(1, sr=fs)
                y_pad = librosa.util.fix_length(y, duration_in_samples)  # Pad/trim to the same duration
                y_stft = librosa.core.stft(y_pad, n_fft=n_fft, hop_length=hop_length)
                y_spec = librosa.amplitude_to_db(abs(y_stft), np.min)
                scaler = StandardScaler()
                dtype = K.floatx()
                data = scaler.fit_transform(y_spec).astype(dtype)
                data = np.expand_dims(data, axis=0)  # Batch dimension
                data = np.expand_dims(data, axis=3)  # Channel dimension
                feature_vectors.append(data)
                label.append([labels[i]])
                yield feature_vectors, label
            except Exception as e:
                print("Error loading %s. Error: %s" % (f, e))
                raise

# In[27]:

# Train the model (timed with %%time in the notebook)
hist = History()
es = EarlyStopping(monitor='val_acc', min_delta=0.01, restore_best_weights=True,
                   patience=10, verbose=1)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', save_best_only=True, verbose=1)

callbacksKeras = [hist, es, mc]

try:
    import google.colab
    tbc = TensorBoardColab()
    callbacksKeras = [hist, es, mc, TensorBoardColabCallback(tbc)]
except Exception as e:
    callbacksKeras = [hist, es, mc]
    print("Not inside Google Colab: %s. Using standard configurations." % (e))

model.fit_generator(featureGenerator(train_set_files, train_classes),
                    validation_data=featureGenerator(test_set_files, test_classes),
                    validation_steps=150,
                    steps_per_epoch=450, epochs=3, callbacks=callbacksKeras, verbose=1)
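# Note (not in the original notebook): featureGenerator yields one sample per
# step, so steps_per_epoch and validation_steps above should match the number
# of training and test files. Verify against the actual split:
print("Train files: %d, Test files: %d" % (len(train_set_files), len(test_set_files)))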
# In[28]:

# Plot training/validation loss and accuracy over epochs
def plot_history(history):
    loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' not in s]
    val_loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' in s]
    acc_list = [s for s in history.history.keys() if 'acc' in s and 'val' not in s]
    val_acc_list = [s for s in history.history.keys() if 'acc' in s and 'val' in s]

    if len(loss_list) == 0:
        print('Loss is missing in history')
        return

    # As loss always exists
    epochs = range(1, len(history.history[loss_list[0]]) + 1)

    # Loss
    plt.figure(1)
    for l in loss_list:
        plt.plot(epochs, history.history[l], 'b', label='Training loss')
    for l in val_loss_list:
        plt.plot(epochs, history.history[l], 'g', label='Validation loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Accuracy
    plt.figure(2)
    for l in acc_list:
        plt.plot(epochs, history.history[l], 'b', label='Training accuracy')
    for l in val_acc_list:
        plt.plot(epochs, history.history[l], 'g', label='Validation accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

# In[29]:

plot_history(hist)

# ## Evaluate Model

# In[30]:

saved_model = load_model('best_model.h5')
test_pred = saved_model.predict_generator(featureGenerator(test_set_files, test_classes),
                                          steps=150, verbose=1)

# In[31]:

predictions_round = np.around(test_pred).astype('int')
predictions_int = np.argmax(predictions_round, axis=1)
predictions_labels = labelencoder.inverse_transform(np.ravel(predictions_int))

# In[32]:

# Recall - the ability of the classifier to find all the positive samples
print("Recall: ", recall_score(classes_num[test_index], predictions_int, average=None))

# Precision - the ability of the classifier not to label as positive
# a sample that is negative
print("Precision: ", precision_score(classes_num[test_index], predictions_int, average=None))

# F1-Score - can be interpreted as a weighted average of precision and recall
print("F1-Score: ", f1_score(classes_num[test_index], predictions_int, average=None))

# Accuracy - the fraction (and number) of correctly classified samples
print("Accuracy: %.2f ," % accuracy_score(classes_num[test_index], predictions_int, normalize=True),
      accuracy_score(classes_num[test_index], predictions_int, normalize=False))
print("Number of samples:", classes_num[test_index].shape[0])
print(classification_report(classes_num[test_index], predictions_int))

# In[33]:

# Compute confusion matrix
cnf_matrix = confusion_matrix(classes_num[test_index], predictions_int)
np.set_printoptions(precision=2)
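# Optional (not part of the original notebook): view the confusion matrix as a
# labeled table before plotting it.
print(pd.DataFrame(cnf_matrix, index=labelencoder.classes_, columns=labelencoder.classes_))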
# In[34]:

# Function to Plot Confusion Matrix
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    #print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# In[35]:

# Plot non-normalized confusion matrix
plt.figure(figsize=(16, 12))
plot_confusion_matrix(cnf_matrix, classes=labelencoder.classes_,
                      title='Confusion matrix, without normalization')

# In[36]:

# Find indexes of wrongly predicted samples
wrong_predictions = [i for i, (e1, e2) in enumerate(zip(classes_num[test_index], predictions_int))
                     if e1 != e2]

# In[37]:

# List the wrongly predicted audio files
print(np.array(labels)[test_index[wrong_predictions]])
print(predictions_labels[wrong_predictions].T)
print(np.array(files)[test_index[wrong_predictions]])

# In[ ]:
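# Optional sketch (not part of the original notebook): classify a single audio
# file with the trained model, reusing the same preprocessing chain as
# featureGenerator. `predict_file` is a hypothetical helper added here for
# illustration only.
def predict_file(f, trained_model):
    y, sr = librosa.load(f, sr=fs)
    y, index = librosa.effects.trim(y, top_db=60)       # Trim silence
    y = normalize(y)
    y_pad = librosa.util.fix_length(y, librosa.time_to_samples(1, sr=fs))
    y_stft = librosa.core.stft(y_pad, n_fft=n_fft, hop_length=hop_length)
    y_spec = librosa.amplitude_to_db(abs(y_stft), np.min)
    data = StandardScaler().fit_transform(y_spec).astype(K.floatx())
    data = data[np.newaxis, :, :, np.newaxis]           # Add batch and channel dimensions
    pred = trained_model.predict(data)
    return labelencoder.inverse_transform([np.argmax(pred)])[0]

print(predict_file(test_set_files[0], saved_model))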