Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.
import time
import os
import IPython.display as ipd
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
import utils
Using TensorFlow backend.
# Directory holding the raw audio files (used by the deep-learning sections below).
AUDIO_DIR = os.environ.get('AUDIO_DIR')

# Load the FMA metadata tables (utils.load handles the multi-index CSV parsing).
tracks = utils.load('data/fma_metadata/tracks.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

# Sanity checks: librosa features cover every track; Echonest covers a subset only.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, features.shape, echonest.shape
((106574, 52), (106574, 518), (14511, 249))
# Keep only the small+medium subsets; 'subset' is an ordered categorical, so
# <= 'medium' selects both 'small' and 'medium'.
subset = tracks.index[tracks['set', 'subset'] <= 'medium']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

# An inner join with the Echonest features shrinks the dataset too much
# (13554 tracks, see the printed shape) ...
features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

# ... so fall back to the librosa features only, restricted to the subset.
# NOTE: this deliberately overwrites the Echonest join computed above.
tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape
Not enough Echonest features: (13554, 767)
((25000, 52), (25000, 518))
# Pre-defined train/validation/test split shipped with the dataset metadata.
train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

# Enumerate the label sets: the 16 top-level genre names, then the full set of
# genre IDs (151 of them, per the printed output).
genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)
#genres = list(tracks['track', 'genre_top'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))
19922 training examples, 2505 validation examples, 2573 testing examples Top genres (16): ['Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken'] All genres (151): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 30, 31, 32, 33, 36, 37, 38, 41, 42, 43, 45, 46, 47, 49, 53, 58, 63, 64, 65, 66, 70, 71, 74, 76, 77, 79, 81, 83, 85, 86, 88, 89, 90, 92, 94, 97, 98, 100, 101, 102, 103, 107, 109, 111, 113, 117, 118, 125, 130, 137, 138, 166, 167, 169, 171, 172, 174, 177, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 214, 224, 232, 236, 240, 247, 250, 267, 286, 296, 297, 311, 314, 322, 337, 359, 360, 361, 362, 374, 378, 400, 401, 404, 428, 439, 440, 441, 442, 443, 456, 468, 491, 495, 502, 504, 514, 524, 538, 539, 542, 580, 602, 619, 651, 659, 695, 741, 763, 808, 810, 811, 906, 1032, 1060, 1193, 1235]
Todo:
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    """Encode genre labels and standardize features for the three splits.

    Parameters
    ----------
    tracks : pd.DataFrame
        Track metadata; labels are read from ('track', 'genre_top') or,
        when multi_label is True, ('track', 'genres_all').
    features : pd.DataFrame
        Feature matrix indexed like `tracks`.
    columns :
        Column selector passed to `features.loc[split, columns]`.
    multi_label : bool
        If True, targets are a binary indicator matrix (one column per genre);
        otherwise a single integer-encoded genre per track.
    verbose : bool
        Unused; kept for interface compatibility.

    Returns
    -------
    y_train, y_val, y_test, X_train, X_val, X_test

    Notes
    -----
    Relies on the module-level index arrays `train`, `val` and `test`.
    """
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']

    # Split in training, validation and testing sets. The encoder is fitted on
    # the training labels only and re-used for the other splits.
    y_train = enc.fit_transform(labels[train])
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])
    # DataFrame.as_matrix() was deprecated and removed in pandas 1.0; .values
    # is the drop-in replacement.
    X_train = features.loc[train, columns].values
    X_val = features.loc[val, columns].values
    X_test = features.loc[test, columns].values

    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    # Fit on the training set only, then apply the same transform to val/test.
    # Assign the results explicitly instead of relying on copy=False in-place
    # semantics, which silently fall back to copying for non-float inputs.
    scaler = StandardScaler(copy=False)
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return y_train, y_val, y_test, X_train, X_val, X_test
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    """Fit every classifier on every feature set; return score and timing tables.

    Parameters
    ----------
    classifiers : dict
        Maps a display name to a scikit-learn estimator.
    feature_sets : dict
        Maps a display name to a column selector for `features_all`.
    multi_label : bool
        Forwarded to pre_process() to pick single- vs multi-label targets.

    Returns
    -------
    (scores, times) : pd.DataFrame pair indexed by feature-set name, one
        column per classifier; `scores` has a leading 'dim' column with the
        feature dimensionality.

    Notes
    -----
    Uses the module-level `tracks` and `features_all` DataFrames.
    """
    # BUG FIX: the original `list(classifiers.keys()).insert(0, 'dim')` left
    # columns = None, because list.insert() mutates in place and returns None.
    columns = ['dim'] + list(classifiers.keys())
    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())
    times = pd.DataFrame(columns=list(classifiers.keys()), index=feature_sets.keys())
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            # time.process_time() measures CPU time, not wall-clock time.
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
    return scores, times
def format_scores(scores):
def highlight(s):
is_max = s == max(s[1:])
return ['background-color: yellow' if v else '' for v in is_max]
scores = scores.style.apply(highlight, axis=1)
return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])
# Baseline classifiers to compare. The dict keys become the column names of the
# score/time tables built by test_classifiers_features(), so insertion order
# determines column order.
classifiers = {
    'LR': LogisticRegression(),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'SVCrbf': SVC(kernel='rbf'),
    'SVCpoly1': SVC(kernel='poly', degree=1),
    'linSVC1': SVC(kernel="linear"),
    'linSVC2': LinearSVC(),
    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'AdaBoost': AdaBoostClassifier(n_estimators=10),
    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    'NB': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
}
# Feature sets to evaluate: each value is a column selector into features_all.
feature_sets = {
    #  'echonest_audio': ('echonest', 'audio_features'),
    #  'echonest_social': ('echonest', 'social_features'),
    #  'echonest_temporal': ('echonest', 'temporal_features'),
    #  'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
    #  'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
}
# One feature set per top-level librosa feature family (mfcc, chroma_*, ...).
for name in features.columns.levels[0]:
    feature_sets[name] = name
# Hand-picked combinations of complementary feature families.
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})

# Run the full classifier x feature-set grid, then display the accuracy table
# (row best highlighted) and the per-fit CPU-time table.
scores, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))
/home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: 
UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear") /home/ubuntu/.pyenv/versions/3.6.0/envs/fma/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:695: UserWarning: Variables are collinear warnings.warn("Variables are collinear")
dim | LR | kNN | SVCrbf | SVCpoly1 | linSVC1 | linSVC2 | DT | RF | AdaBoost | MLP1 | MLP2 | NB | QDA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
chroma_cens | 84 | 39.25% | 37.50% | 42.29% | 38.63% | 39.29% | 39.29% | 35.68% | 33.77% | 30.86% | 40.19% | 34.55% | 9.99% | 24.64% |
chroma_cqt | 84 | 40.07% | 40.03% | 44.27% | 39.99% | 41.39% | 40.58% | 35.45% | 36.46% | 35.72% | 44.81% | 39.60% | 1.55% | 3.42% |
chroma_stft | 84 | 43.61% | 43.92% | 48.31% | 43.65% | 44.35% | 43.10% | 39.88% | 37.31% | 35.25% | 48.50% | 44.77% | 4.20% | 5.91% |
mfcc | 140 | 57.83% | 54.99% | 60.98% | 59.66% | 59.19% | 56.98% | 45.86% | 44.77% | 41.31% | 53.17% | 53.21% | 41.86% | 48.39% |
rmse | 7 | 37.31% | 38.52% | 38.90% | 37.70% | 37.54% | 37.35% | 38.63% | 36.65% | 34.67% | 39.06% | 38.75% | 11.78% | 15.04% |
spectral_bandwidth | 7 | 40.54% | 45.39% | 44.46% | 40.38% | 40.42% | 40.61% | 42.91% | 43.65% | 37.47% | 44.97% | 44.66% | 36.18% | 34.16% |
spectral_centroid | 7 | 42.40% | 45.36% | 45.71% | 42.09% | 42.09% | 42.21% | 42.67% | 43.41% | 42.60% | 47.84% | 47.53% | 33.31% | 36.11% |
spectral_contrast | 49 | 50.91% | 49.55% | 54.45% | 49.59% | 51.81% | 49.24% | 43.53% | 44.38% | 39.53% | 52.90% | 49.16% | 39.41% | 41.78% |
spectral_rolloff | 7 | 41.74% | 46.25% | 47.53% | 41.43% | 41.62% | 41.47% | 45.36% | 45.47% | 41.66% | 48.08% | 48.54% | 28.49% | 28.53% |
tonnetz | 42 | 40.11% | 37.31% | 42.25% | 40.23% | 40.15% | 39.56% | 35.91% | 36.96% | 34.16% | 40.85% | 37.16% | 22.31% | 23.05% |
zcr | 7 | 42.29% | 44.73% | 45.43% | 42.95% | 42.71% | 42.13% | 43.61% | 44.27% | 40.89% | 46.44% | 46.25% | 30.39% | 32.10% |
mfcc/contrast | 189 | 59.77% | 55.31% | 63.04% | 61.02% | 59.58% | 58.10% | 47.61% | 44.77% | 41.62% | 53.75% | 55.65% | 44.03% | 51.85% |
mfcc/contrast/chroma | 273 | 60.20% | 53.13% | 62.92% | 61.48% | 59.11% | 59.19% | 47.57% | 43.22% | 41.62% | 54.64% | 56.98% | 39.02% | 51.34% |
mfcc/contrast/centroid | 196 | 59.81% | 55.23% | 63.39% | 61.48% | 60.28% | 59.35% | 47.61% | 43.57% | 41.62% | 52.62% | 56.12% | 43.76% | 51.69% |
mfcc/contrast/chroma/centroid | 280 | 60.44% | 53.01% | 63.08% | 61.29% | 60.12% | 59.42% | 47.57% | 43.61% | 41.62% | 54.33% | 55.23% | 38.87% | 51.34% |
mfcc/contrast/chroma/centroid/tonnetz | 322 | 60.36% | 52.62% | 63.12% | 62.50% | 60.20% | 59.15% | 47.57% | 43.61% | 41.62% | 56.32% | 57.25% | 39.06% | 50.72% |
mfcc/contrast/chroma/centroid/zcr | 287 | 60.94% | 53.01% | 62.81% | 61.48% | 59.77% | 59.58% | 47.69% | 43.37% | 41.62% | 55.65% | 54.41% | 38.90% | 51.42% |
all_non-echonest | 518 | 61.10% | 51.77% | 62.88% | 61.95% | 59.08% | 58.65% | 47.30% | 43.65% | 41.62% | 58.14% | 57.95% | 9.91% | 20.25% |
LR | kNN | SVCrbf | SVCpoly1 | linSVC1 | linSVC2 | DT | RF | AdaBoost | MLP1 | MLP2 | NB | QDA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
chroma_cens | 18.6645 | 9.4855 | 69.7959 | 53.8920 | 189.2902 | 97.0045 | 0.7902 | 0.1034 | 1.8475 | 281.4669 | 502.8007 | 0.4817 | 1.7261 |
chroma_cqt | 25.5985 | 9.0593 | 64.3567 | 53.6505 | 244.0052 | 102.6698 | 0.7459 | 0.0910 | 1.7464 | 244.7545 | 408.5998 | 0.4746 | 1.6424 |
chroma_stft | 32.5938 | 7.5791 | 57.7469 | 54.2342 | 170.7127 | 94.5633 | 0.7266 | 0.0884 | 1.7054 | 247.2694 | 351.9716 | 0.4740 | 1.6017 |
mfcc | 38.7095 | 18.5090 | 64.0735 | 50.0887 | 173.5436 | 96.6151 | 1.5194 | 0.1084 | 3.3614 | 395.0526 | 269.8539 | 0.7230 | 3.4761 |
rmse | 1.3585 | 0.3469 | 28.6978 | 13.7399 | 20.5466 | 17.5076 | 0.0698 | 0.0954 | 0.2715 | 126.3215 | 159.3460 | 0.1219 | 0.1632 |
spectral_bandwidth | 1.0436 | 0.2820 | 29.3073 | 14.5891 | 23.1915 | 18.1089 | 0.0739 | 0.0953 | 0.2725 | 107.5405 | 216.2522 | 0.1261 | 0.1633 |
spectral_centroid | 1.0393 | 0.2639 | 25.3846 | 15.5176 | 26.4575 | 17.7886 | 0.0703 | 0.0994 | 0.2774 | 147.5693 | 229.9029 | 0.1215 | 0.1624 |
spectral_contrast | 11.8101 | 4.6273 | 34.9987 | 27.6479 | 69.5350 | 47.8968 | 0.5047 | 0.1000 | 1.2169 | 253.3954 | 483.1398 | 0.3123 | 0.9229 |
spectral_rolloff | 1.3367 | 0.2738 | 26.9192 | 15.3378 | 23.9110 | 17.8559 | 0.0543 | 0.0799 | 0.2349 | 110.4304 | 242.6179 | 0.1227 | 0.1638 |
tonnetz | 6.2082 | 3.9319 | 46.9757 | 29.3071 | 73.9021 | 49.4196 | 0.4315 | 0.0999 | 1.0565 | 274.3477 | 443.6555 | 0.2756 | 0.9004 |
zcr | 1.1362 | 0.2366 | 25.2766 | 15.6943 | 25.4284 | 17.9923 | 0.0543 | 0.0838 | 0.2379 | 141.2034 | 151.4443 | 0.1202 | 0.1636 |
mfcc/contrast | 54.6594 | 23.5665 | 81.2173 | 63.0799 | 232.5177 | 109.4360 | 2.1096 | 0.1172 | 4.6625 | 392.4555 | 203.1467 | 0.9384 | 5.6662 |
mfcc/contrast/chroma | 82.9940 | 24.4655 | 111.2726 | 85.5987 | 366.9468 | 135.2644 | 3.0404 | 0.1178 | 6.6595 | 354.3567 | 176.9491 | 0.9664 | 10.5037 |
mfcc/contrast/centroid | 57.2624 | 23.9886 | 83.3180 | 64.4825 | 234.9260 | 110.7972 | 2.1913 | 0.1162 | 4.8540 | 455.7037 | 181.1979 | 0.9364 | 6.5866 |
mfcc/contrast/chroma/centroid | 85.3098 | 25.1057 | 115.0022 | 88.4598 | 386.0993 | 138.8553 | 3.1158 | 0.1430 | 6.7839 | 346.2436 | 169.2297 | 0.9710 | 10.3880 |
mfcc/contrast/chroma/centroid/tonnetz | 104.7722 | 33.7060 | 133.2679 | 101.1152 | 443.0719 | 154.5318 | 3.6442 | 0.1206 | 7.9675 | 272.0756 | 188.3379 | 0.9944 | 13.5507 |
mfcc/contrast/chroma/centroid/zcr | 89.7276 | 30.2974 | 119.1835 | 91.9383 | 391.2368 | 140.8637 | 3.2338 | 0.1205 | 6.9843 | 296.5767 | 187.6073 | 0.9748 | 11.5799 |
all_non-echonest | 234.5713 | 41.1457 | 192.4517 | 152.3811 | 654.4032 | 198.2208 | 5.8524 | 0.1311 | 12.6438 | 286.9855 | 171.3685 | 1.0756 | 30.3718 |
Todo:
# Multi-label setup: predict every genre of a track (genres_all) using
# one-vs-rest wrappers around binary classifiers; MLP handles multi-label
# targets natively.
classifiers = {
    #LogisticRegression(),
    'LR': OneVsRestClassifier(LogisticRegression()),
    'SVC': OneVsRestClassifier(SVC()),
    'MLP': MLPClassifier(max_iter=700),
}

# Evaluate on MFCCs alone and on the two best combinations from the
# single-label experiment above.
feature_sets = {
    #  'echonest_audio': ('echonest', 'audio_features'),
    #  'echonest_temporal': ('echonest', 'temporal_features'),
    'mfcc': 'mfcc',
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}

scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))
dim | LR | SVC | MLP | |
---|---|---|---|---|
mfcc | 140 | 11.39% | 12.13% | 12.40% |
mfcc/contrast/chroma/centroid/tonnetz | 322 | 13.45% | 13.41% | 10.53% |
mfcc/contrast/chroma/centroid/zcr | 287 | 13.06% | 13.64% | 10.92% |
LR | SVC | MLP | |
---|---|---|---|
mfcc | 214.9928 | 1095.3422 | 1178.3974 |
mfcc/contrast/chroma/centroid/tonnetz | 646.5655 | 2513.0338 | 1881.1462 |
mfcc/contrast/chroma/centroid/zcr | 553.7129 | 2110.5772 | 1750.6880 |
Other architectures:
# One-hot encode the top genre of each track for the Keras models below;
# keep it as a DataFrame so rows can be looked up by track ID.
labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)
Load audio samples in parallel using multiprocessing so as to maximize CPU usage
when decoding MP3s and applying some optional pre-processing. There are multiple
ways to load a waveform from a compressed MP3:
* with librosa (e.g. using the `kaiser_fast` resampling mode),
* with ffmpeg, streaming decoded samples into Keras' `fit_generator()`,
* by first converting the tracks to uncompressed `.wav` files.
# Just be sure that everything is fine. Multiprocessing is tricky to debug.
utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())
# Pull one batch of two samples and inspect its shape.
SampleLoader(train, batch_size=2).__next__()[0].shape

# Keras parameters.
NB_WORKER = len(os.sched_getaffinity(0))  # number of usable CPUs
# NOTE(review): pickle_safe / nb_worker / max_q_size are Keras 1 argument
# names; Keras 2 renamed them to use_multiprocessing / workers /
# max_queue_size — confirm against the installed Keras version.
params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}
Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches means reduced training time, so increase batch time until memory exhaustion. Number of workers and queue size have no influence on speed.
# Fully connected network on raw audio downsampled to 2 kHz.
loader = utils.FfmpegLoader(sampling_rate=2000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

keras.backend.clear_session()

model = keras.models.Sequential()
# NOTE(review): output_dim= is the Keras 1 spelling (units= in Keras 2).
model.add(Dense(output_dim=1000, input_shape=loader.shape))
model.add(Activation("relu"))
model.add(Dense(output_dim=100))
model.add(Activation("relu"))
# One output unit per genre, softmax for single-label classification.
model.add(Dense(output_dim=labels_onehot.shape[1]))
model.add(Activation("softmax"))

optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Keras 1 fit_generator signature: second positional arg is samples_per_epoch.
model.fit_generator(SampleLoader(train, batch_size=64), train.size, nb_epoch=2, **params)
# NOTE(review): the validation loss is immediately overwritten by the test
# loss on the next line; only the test result survives.
loss = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);

loss
# 1D convolutional network on raw audio at 16 kHz.
loader = utils.FfmpegLoader(sampling_rate=16000)
#loader = utils.LibrosaLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)

keras.backend.clear_session()

model = keras.models.Sequential()
# Reshape the flat waveform into (time, channels=1) for Conv1D.
model.add(Reshape((-1, 1), input_shape=loader.shape))
print(model.output_shape)

# NOTE(review): subsample_length= is the Keras 1 name for strides=.
model.add(Conv1D(128, 512, subsample_length=512))
print(model.output_shape)
model.add(Activation("relu"))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))
print(model.output_shape)

#model.add(Dropout(0.25))
model.add(Flatten())
print(model.output_shape)
model.add(Dense(100))
model.add(Activation("relu"))
print(model.output_shape)
# One output unit per genre, softmax for single-label classification.
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Keras 1 fit_generator signature: second positional arg is samples_per_epoch.
model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=20, **params)
# NOTE(review): the validation loss is immediately overwritten by the test loss.
loss = model.evaluate_generator(SampleLoader(val, batch_size=10), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)

loss
Look at:
class MfccLoader(utils.Loader):
    """Loader that decodes a track with ffmpeg and returns its MFCC matrix."""
    raw_loader = utils.FfmpegLoader(sampling_rate=22050)
    #shape = (13, 190)  # For segmented tracks.
    shape = (13, 2582)

    def load(self, filename):
        import librosa
        waveform = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.
        return librosa.feature.mfcc(waveform, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)
loader = MfccLoader()
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
# Smoke test: compute the MFCCs of one track and inspect the shape.
loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape
# 2D convolutional network on MFCC "images" of shape (13, 2582, 1).
keras.backend.clear_session()

model = keras.models.Sequential()
# Add a trailing channel dimension for Conv2D.
model.add(Reshape((*loader.shape, 1), input_shape=loader.shape))
print(model.output_shape)

# NOTE(review): Conv2D(filters, rows, cols, subsample=...) is the Keras 1
# signature; Keras 2 takes a kernel_size tuple and strides=.
model.add(Conv2D(3, 13, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(15, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Flatten())
print(model.output_shape)
# One output unit per genre, softmax for single-label classification.
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Keras 1 fit_generator signature: second positional arg is samples_per_epoch.
model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)
# NOTE(review): the validation loss is immediately overwritten by the test loss.
loss = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)
#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)

loss