FMA: A Dataset For Music Analysis

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

Baselines

  • This notebook evaluates standard classifiers from scikit-learn on the provided features.
  • It also evaluates deep learning models on raw audio and on extracted audio features (e.g. MFCCs).
In [1]:
import time
import os

import IPython.display as ipd
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier

import utils
Using TensorFlow backend.
In [2]:
AUDIO_DIR = os.environ.get('AUDIO_DIR')

tracks = utils.load('data/fma_metadata/tracks.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, features.shape, echonest.shape
Out[2]:
((106574, 52), (106574, 518), (14511, 249))

Subset

In [3]:
subset = tracks.index[tracks['set', 'subset'] <= 'medium']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape
Not enough Echonest features: (13554, 767)
Out[3]:
((25000, 52), (25000, 518))
In [4]:
train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)
#genres = list(tracks['track', 'genre_top'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))
19922 training examples, 2505 validation examples, 2573 testing examples
Top genres (16): ['Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken']
All genres (151): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 30, 31, 32, 33, 36, 37, 38, 41, 42, 43, 45, 46, 47, 49, 53, 58, 63, 64, 65, 66, 70, 71, 74, 76, 77, 79, 81, 83, 85, 86, 88, 89, 90, 92, 94, 97, 98, 100, 101, 102, 103, 107, 109, 111, 113, 117, 118, 125, 130, 137, 138, 166, 167, 169, 171, 172, 174, 177, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 214, 224, 232, 236, 240, 247, 250, 267, 286, 296, 297, 311, 314, 322, 337, 359, 360, 361, 362, 374, 378, 400, 401, 404, 428, 439, 440, 441, 442, 443, 456, 468, 491, 495, 502, 504, 514, 524, 538, 539, 542, 580, 602, 619, 651, 659, 695, 741, 763, 808, 810, 811, 906, 1032, 1060, 1193, 1235]

1 Multiple classifiers and feature sets

Todo:

  • Cross-validation for hyper-parameters (a sketch follows the pre-processing cell below).
  • Dimensionality reduction?

1.1 Pre-processing

In [5]:
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']
        #y = enc.fit_transform(tracks['track', 'genre_top'])
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']
        #labels = tracks['track', 'genres']

    # Split in training, validation and testing sets.
    y_train = enc.fit_transform(labels[train])
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])
    X_train = features.loc[train, columns].values
    X_val = features.loc[val, columns].values
    X_test = features.loc[test, columns].values
    
    X_train, y_train = shuffle(X_train, y_train, random_state=42)
    
    # Standardize features by removing the mean and scaling to unit variance.
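    # With copy=False the arrays are scaled in place, so the return values can be ignored.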
    scaler = StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_val)
    scaler.transform(X_test)
    
    return y_train, y_val, y_test, X_train, X_val, X_test
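
The cross-validation mentioned in the Todo above could look like the following minimal sketch (not part of the original notebook): a scikit-learn pipeline that tunes the number of PCA components and the logistic-regression regularization on the training split of the MFCC features. The parameter grid is illustrative only.

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Tune PCA dimensionality and regularization strength on the training split only.
y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, 'mfcc')
pipe = Pipeline([('pca', PCA()), ('lr', LogisticRegression())])
param_grid = {'pca__n_components': [50, 100, 140], 'lr__C': [0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)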

1.2 Single genre

In [6]:
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    columns = ['dim'] + list(classifiers.keys())
    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())
    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
    return scores, times

def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])
In [7]:
classifiers = {
    'LR': LogisticRegression(),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'SVCrbf': SVC(kernel='rbf'),
    'SVCpoly1': SVC(kernel='poly', degree=1),
    'linSVC1': SVC(kernel="linear"),
    'linSVC2': LinearSVC(),
    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'AdaBoost': AdaBoostClassifier(n_estimators=10),
    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    'NB': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
}

feature_sets = {
#    'echonest_audio': ('echonest', 'audio_features'),
#    'echonest_social': ('echonest', 'social_features'),
#    'echonest_temporal': ('echonest', 'temporal_features'),
#    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
#    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
}
for name in features.columns.levels[0]:
    feature_sets[name] = name
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})

scores, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))
/home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear
  warnings.warn("Variables are collinear")

feature set dim LR kNN SVCrbf SVCpoly1 linSVC1 linSVC2 DT RF AdaBoost MLP1 MLP2 NB QDA
chroma_cens 84 39.25% 37.50% 42.29% 38.63% 39.29% 39.29% 35.68% 33.77% 30.86% 40.19% 34.55% 9.99% 24.64%
chroma_cqt 84 40.07% 40.03% 44.27% 39.99% 41.39% 40.58% 35.45% 36.46% 35.72% 44.81% 39.60% 1.55% 3.42%
chroma_stft 84 43.61% 43.92% 48.31% 43.65% 44.35% 43.10% 39.88% 37.31% 35.25% 48.50% 44.77% 4.20% 5.91%
mfcc 140 57.83% 54.99% 60.98% 59.66% 59.19% 56.98% 45.86% 44.77% 41.31% 53.17% 53.21% 41.86% 48.39%
rmse 7 37.31% 38.52% 38.90% 37.70% 37.54% 37.35% 38.63% 36.65% 34.67% 39.06% 38.75% 11.78% 15.04%
spectral_bandwidth 7 40.54% 45.39% 44.46% 40.38% 40.42% 40.61% 42.91% 43.65% 37.47% 44.97% 44.66% 36.18% 34.16%
spectral_centroid 7 42.40% 45.36% 45.71% 42.09% 42.09% 42.21% 42.67% 43.41% 42.60% 47.84% 47.53% 33.31% 36.11%
spectral_contrast 49 50.91% 49.55% 54.45% 49.59% 51.81% 49.24% 43.53% 44.38% 39.53% 52.90% 49.16% 39.41% 41.78%
spectral_rolloff 7 41.74% 46.25% 47.53% 41.43% 41.62% 41.47% 45.36% 45.47% 41.66% 48.08% 48.54% 28.49% 28.53%
tonnetz 42 40.11% 37.31% 42.25% 40.23% 40.15% 39.56% 35.91% 36.96% 34.16% 40.85% 37.16% 22.31% 23.05%
zcr 7 42.29% 44.73% 45.43% 42.95% 42.71% 42.13% 43.61% 44.27% 40.89% 46.44% 46.25% 30.39% 32.10%
mfcc/contrast 189 59.77% 55.31% 63.04% 61.02% 59.58% 58.10% 47.61% 44.77% 41.62% 53.75% 55.65% 44.03% 51.85%
mfcc/contrast/chroma 273 60.20% 53.13% 62.92% 61.48% 59.11% 59.19% 47.57% 43.22% 41.62% 54.64% 56.98% 39.02% 51.34%
mfcc/contrast/centroid 196 59.81% 55.23% 63.39% 61.48% 60.28% 59.35% 47.61% 43.57% 41.62% 52.62% 56.12% 43.76% 51.69%
mfcc/contrast/chroma/centroid 280 60.44% 53.01% 63.08% 61.29% 60.12% 59.42% 47.57% 43.61% 41.62% 54.33% 55.23% 38.87% 51.34%
mfcc/contrast/chroma/centroid/tonnetz 322 60.36% 52.62% 63.12% 62.50% 60.20% 59.15% 47.57% 43.61% 41.62% 56.32% 57.25% 39.06% 50.72%
mfcc/contrast/chroma/centroid/zcr 287 60.94% 53.01% 62.81% 61.48% 59.77% 59.58% 47.69% 43.37% 41.62% 55.65% 54.41% 38.90% 51.42%
all_non-echonest 518 61.10% 51.77% 62.88% 61.95% 59.08% 58.65% 47.30% 43.65% 41.62% 58.14% 57.95% 9.91% 20.25%
Training and scoring time in seconds (process time):
feature set LR kNN SVCrbf SVCpoly1 linSVC1 linSVC2 DT RF AdaBoost MLP1 MLP2 NB QDA
chroma_cens 18.6645 9.4855 69.7959 53.8920 189.2902 97.0045 0.7902 0.1034 1.8475 281.4669 502.8007 0.4817 1.7261
chroma_cqt 25.5985 9.0593 64.3567 53.6505 244.0052 102.6698 0.7459 0.0910 1.7464 244.7545 408.5998 0.4746 1.6424
chroma_stft 32.5938 7.5791 57.7469 54.2342 170.7127 94.5633 0.7266 0.0884 1.7054 247.2694 351.9716 0.4740 1.6017
mfcc 38.7095 18.5090 64.0735 50.0887 173.5436 96.6151 1.5194 0.1084 3.3614 395.0526 269.8539 0.7230 3.4761
rmse 1.3585 0.3469 28.6978 13.7399 20.5466 17.5076 0.0698 0.0954 0.2715 126.3215 159.3460 0.1219 0.1632
spectral_bandwidth 1.0436 0.2820 29.3073 14.5891 23.1915 18.1089 0.0739 0.0953 0.2725 107.5405 216.2522 0.1261 0.1633
spectral_centroid 1.0393 0.2639 25.3846 15.5176 26.4575 17.7886 0.0703 0.0994 0.2774 147.5693 229.9029 0.1215 0.1624
spectral_contrast 11.8101 4.6273 34.9987 27.6479 69.5350 47.8968 0.5047 0.1000 1.2169 253.3954 483.1398 0.3123 0.9229
spectral_rolloff 1.3367 0.2738 26.9192 15.3378 23.9110 17.8559 0.0543 0.0799 0.2349 110.4304 242.6179 0.1227 0.1638
tonnetz 6.2082 3.9319 46.9757 29.3071 73.9021 49.4196 0.4315 0.0999 1.0565 274.3477 443.6555 0.2756 0.9004
zcr 1.1362 0.2366 25.2766 15.6943 25.4284 17.9923 0.0543 0.0838 0.2379 141.2034 151.4443 0.1202 0.1636
mfcc/contrast 54.6594 23.5665 81.2173 63.0799 232.5177 109.4360 2.1096 0.1172 4.6625 392.4555 203.1467 0.9384 5.6662
mfcc/contrast/chroma 82.9940 24.4655 111.2726 85.5987 366.9468 135.2644 3.0404 0.1178 6.6595 354.3567 176.9491 0.9664 10.5037
mfcc/contrast/centroid 57.2624 23.9886 83.3180 64.4825 234.9260 110.7972 2.1913 0.1162 4.8540 455.7037 181.1979 0.9364 6.5866
mfcc/contrast/chroma/centroid 85.3098 25.1057 115.0022 88.4598 386.0993 138.8553 3.1158 0.1430 6.7839 346.2436 169.2297 0.9710 10.3880
mfcc/contrast/chroma/centroid/tonnetz 104.7722 33.7060 133.2679 101.1152 443.0719 154.5318 3.6442 0.1206 7.9675 272.0756 188.3379 0.9944 13.5507
mfcc/contrast/chroma/centroid/zcr 89.7276 30.2974 119.1835 91.9383 391.2368 140.8637 3.2338 0.1205 6.9843 296.5767 187.6073 0.9748 11.5799
all_non-echonest 234.5713 41.1457 192.4517 152.3811 654.4032 198.2208 5.8524 0.1311 12.6438 286.9855 171.3685 1.0756 30.3718

1.3 Multiple genres

Todo:

  • Ignore rare genres? Count them higher up in the genre tree? On the other hand, they do not account for many tracks (a filtering sketch follows below).
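
To gauge the first option, here is a hedged sketch (not the notebook's code) that drops genres labelling fewer than an arbitrary number of training tracks before building the indicator matrix:

from collections import Counter

# Sketch: keep only genres labelling at least `min_count` training tracks (threshold is arbitrary).
min_count = 50
counts = Counter(g for gs in tracks.loc[train, ('track', 'genres_all')] for g in gs)
frequent = {g for g, c in counts.items() if c >= min_count}

labels = tracks['track', 'genres_all'].map(lambda gs: [g for g in gs if g in frequent])
enc = MultiLabelBinarizer()
y_train = enc.fit_transform(labels[train])
y_val = enc.transform(labels[val])
y_test = enc.transform(labels[test])
print('{} genres kept out of {}'.format(len(enc.classes_), len(counts)))
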
In [8]:
classifiers = {
    #LogisticRegression(),
    'LR': OneVsRestClassifier(LogisticRegression()),
    'SVC': OneVsRestClassifier(SVC()),
    'MLP': MLPClassifier(max_iter=700),
}

feature_sets = {
#    'echonest_audio': ('echonest', 'audio_features'),
#    'echonest_temporal': ('echonest', 'temporal_features'),
    'mfcc': 'mfcc',
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}

scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

feature set dim LR SVC MLP
mfcc 140 11.39% 12.13% 12.40%
mfcc/contrast/chroma/centroid/tonnetz 322 13.45% 13.41% 10.53%
mfcc/contrast/chroma/centroid/zcr 287 13.06% 13.64% 10.92%
Training and scoring time in seconds (process time):
feature set LR SVC MLP
mfcc 214.9928 1095.3422 1178.3974
mfcc/contrast/chroma/centroid/tonnetz 646.5655 2513.0338 1881.1462
mfcc/contrast/chroma/centroid/zcr 553.7129 2110.5772 1750.6880

2 Deep learning on raw audio

Other architectures:

In [ ]:
labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)

Load audio samples in parallel using multiprocessing to maximize CPU usage while decoding MP3s and applying optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:

  • librosa uses audioread as a backend, which can rely on several native libraries, e.g. ffmpeg
    • resampling is very slow --> use kaiser_fast
    • does not work with multiprocessing, as needed by Keras' fit_generator()
  • pydub is a high-level interface for audio manipulation and uses ffmpeg to load
    • stores a temporary .wav file
  • directly pipe the ffmpeg output
    • fastest method (sketched below)
  • PyAV may be a fast alternative since it binds directly to the ffmpeg libraries
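
As an illustration of the piping approach, a minimal sketch (independent of utils.FfmpegLoader, whose implementation may differ) that decodes an MP3 to a mono numpy array of 16-bit samples:

import subprocess

def ffmpeg_to_array(filename, sampling_rate=44100):
    """Decode an audio file to mono 16-bit PCM by piping ffmpeg's stdout (sketch only)."""
    command = ['ffmpeg', '-i', filename,
               '-f', 's16le', '-acodec', 'pcm_s16le',  # raw signed 16-bit little-endian samples
               '-ac', '1', '-ar', str(sampling_rate),  # mono, resampled
               '-']                                    # write to stdout
    proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, check=True)
    return np.frombuffer(proc.stdout, dtype=np.int16)

# Example: x = ffmpeg_to_array(utils.get_audio_path(AUDIO_DIR, 2))
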
In [ ]:
# Just be sure that everything is fine. Multiprocessing is tricky to debug.
utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())
SampleLoader(train, batch_size=2).__next__()[0].shape
In [ ]:
# Keras parameters.
NB_WORKER = len(os.sched_getaffinity(0))  # number of usable CPUs
params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}

2.1 Fully connected neural network

  • Two layers with 10 hidden units each perform no better than random, ~11%.

Optimize data loading so that training is CPU / GPU bound, not IO bound. Larger batches reduce training time, so increase the batch size until memory is exhausted. The number of workers and the queue size have no influence on speed; a rough throughput check is sketched below.
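
One way to verify where the bottleneck lies (a sketch, assuming SampleLoader behaves like the generator instantiated above) is to time a few batches at several batch sizes and compare wall-clock durations:

# Rough throughput check (sketch): wall-clock time of a few batches per batch size.
for batch_size in [16, 64, 256]:
    generator = SampleLoader(train, batch_size=batch_size)
    start = time.time()
    for _ in range(10):
        next(generator)
    print('batch_size={}: {:.1f}s for 10 batches'.format(batch_size, time.time() - start))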

In [ ]:
loader = utils.FfmpegLoader(sampling_rate=2000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Dense(output_dim=1000, input_shape=loader.shape))
model.add(Activation("relu"))
model.add(Dense(output_dim=100))
model.add(Activation("relu"))
model.add(Dense(output_dim=labels_onehot.shape[1]))
model.add(Activation("softmax"))

optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=64), train.size, nb_epoch=2, **params)
loss_val = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)
loss_test = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);

loss_val, loss_test

2.2 Convolutional neural network

In [ ]:
loader = utils.FfmpegLoader(sampling_rate=16000)
#loader = utils.LibrosaLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Reshape((-1, 1), input_shape=loader.shape))
print(model.output_shape)

model.add(Conv1D(128, 512, subsample_length=512))
print(model.output_shape)
model.add(Activation("relu"))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

print(model.output_shape)
#model.add(Dropout(0.25))
model.add(Flatten())
print(model.output_shape)
model.add(Dense(100))
model.add(Activation("relu"))
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=20, **params)
loss_val = model.evaluate_generator(SampleLoader(val, batch_size=10), val.size, **params)
loss_test = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)

loss_val, loss_test

2.3 Recurrent neural network

3 Deep learning on extracted audio features

Look at:

3.1 ConvNet on MFCC

In [ ]:
class MfccLoader(utils.Loader):
    raw_loader = utils.FfmpegLoader(sampling_rate=22050)
    #shape = (13, 190)  # For segmented tracks.
    shape = (13, 2582)
    def load(self, filename):
        import librosa
        x = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.
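        # (n_fft=512 samples at 22050 Hz ≈ 23.2 ms per frame; hop_length=256 = n_fft/2 gives the 50% overlap.)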
        mfcc = librosa.feature.mfcc(x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)
        return mfcc

loader = MfccLoader()
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape
In [ ]:
keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Reshape((*loader.shape, 1),  input_shape=loader.shape))
print(model.output_shape)

model.add(Conv2D(3, 13, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(15, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Flatten())
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)
loss_val = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)
loss_test = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)
#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)

loss_val, loss_test