import time
import os

import IPython.display as ipd
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier

import utils

AUDIO_DIR = os.environ.get('AUDIO_DIR')

tracks = utils.load('data/fma_metadata/tracks.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, features.shape, echonest.shape

subset = tracks.index[tracks['set', 'subset'] <= 'medium']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

# The Echonest features cover too few tracks, so they are only inspected
# here and the library-computed features are used below instead.
features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape

train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)
#genres = list(tracks['track', 'genre_top'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))

genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))

def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']
        #y = enc.fit_transform(tracks['track', 'genre_top'])
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']
        #labels = tracks['track', 'genres']

    # Split in training, validation and testing sets.
    y_train = enc.fit_transform(labels[train])
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])
    X_train = features.loc[train, columns].values
    X_val = features.loc[val, columns].values
    X_test = features.loc[test, columns].values

    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
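    # Fit the scaler on the training set only, then apply that same
    # transformation to the validation and test sets; fitting on the held-out
    # splits would leak their statistics into training. With copy=False the
    # arrays are standardized in place, which is why the return values of
    # fit_transform/transform are not reassigned below.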
    scaler = StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_val)
    scaler.transform(X_test)

    return y_train, y_val, y_test, X_train, X_val, X_test

def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    # 'dim' column first, then one column per classifier.
    columns = ['dim'] + list(classifiers.keys())
    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())
    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
    return scores, times

def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

classifiers = {
    'LR': LogisticRegression(),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'SVCrbf': SVC(kernel='rbf'),
    'SVCpoly1': SVC(kernel='poly', degree=1),
    'linSVC1': SVC(kernel="linear"),
    'linSVC2': LinearSVC(),
    #'GP': GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'AdaBoost': AdaBoostClassifier(n_estimators=10),
    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    'NB': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
}

feature_sets = {
    #'echonest_audio': ('echonest', 'audio_features'),
    #'echonest_social': ('echonest', 'social_features'),
    #'echonest_temporal': ('echonest', 'temporal_features'),
    #'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
    #'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
}
for name in features.columns.levels[0]:
    feature_sets[name] = name
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0]),
})

scores, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

classifiers = {
    #'LR': LogisticRegression(),
    'LR': OneVsRestClassifier(LogisticRegression()),
    'SVC': OneVsRestClassifier(SVC()),
    'MLP': MLPClassifier(max_iter=700),
}

feature_sets = {
    #'echonest_audio': ('echonest', 'audio_features'),
    #'echonest_temporal': ('echonest', 'temporal_features'),
    'mfcc': 'mfcc',
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}
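# In the multi-label run below, clf.score() on the indicator matrix returned
# by MultiLabelBinarizer computes subset accuracy: a track counts as correct
# only if every one of its genres is predicted exactly, which is a harsh
# metric. A minimal sketch of a more forgiving per-label metric, assuming a
# classifier fitted as in test_classifiers_features (clf, X_test, y_test are
# illustrative names):
#from sklearn.metrics import f1_score
#f1_score(y_test, clf.predict(X_test), average='micro')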
scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)

# Just be sure that everything is fine. Multiprocessing is tricky to debug.
utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())
SampleLoader(train, batch_size=2).__next__()[0].shape

# Keras parameters.
NB_WORKER = len(os.sched_getaffinity(0))  # number of usable CPUs
params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}

loader = utils.FfmpegLoader(sampling_rate=2000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Dense(output_dim=1000, input_shape=loader.shape))
model.add(Activation("relu"))
model.add(Dense(output_dim=100))
model.add(Activation("relu"))
model.add(Dense(output_dim=labels_onehot.shape[1]))
model.add(Activation("softmax"))

optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=64), train.size, nb_epoch=2, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params)
loss

loader = utils.FfmpegLoader(sampling_rate=16000)
#loader = utils.LibrosaLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Reshape((-1, 1), input_shape=loader.shape))
print(model.output_shape)

model.add(Conv1D(128, 512, subsample_length=512))
print(model.output_shape)
model.add(Activation("relu"))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))
print(model.output_shape)

#model.add(Dropout(0.25))

model.add(Flatten())
print(model.output_shape)
model.add(Dense(100))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=20, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=10), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)
loss

class MfccLoader(utils.Loader):
    raw_loader = utils.FfmpegLoader(sampling_rate=22050)
    #shape = (13, 190)  # For segmented tracks.
    shape = (13, 2582)
    def load(self, filename):
        import librosa
        x = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms of the audio signal with 50% overlap with the adjacent frames.
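        # The numbers check out: n_fft=512 at sr=22050 Hz gives 512/22050 ≈ 23.2 ms
        # per frame, and hop_length=256 (half the window) yields the 50% overlap.
        # A 30 s clip thus produces roughly 30*22050/256 ≈ 2584 frames, consistent
        # with the declared shape of (13, 2582).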
        mfcc = librosa.feature.mfcc(x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)
        return mfcc

loader = MfccLoader()
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Reshape((*loader.shape, 1), input_shape=loader.shape))
print(model.output_shape)

model.add(Conv2D(3, 13, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(15, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Flatten())
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)  #lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)
#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)
loss
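# A minimal sketch of single-track inference with the Conv2D model trained
# above. The track id (2) is an arbitrary example, and refitting a
# LabelBinarizer here is an assumption: the binarizer used to build
# labels_onehot was not kept, but fitting on the same column reproduces
# the same (sorted) class order.
lb = LabelBinarizer().fit(tracks['track', 'genre_top'])
x = loader.load(utils.get_audio_path(AUDIO_DIR, 2))
y = model.predict(x[np.newaxis, ...])  # add a batch dimension
print('Predicted genre: {}'.format(lb.classes_[np.argmax(y)]))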