Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.
unzip fma_small.zip
%matplotlib inline
import os
import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import librosa
import librosa.display
import utils
plt.rcParams['figure.figsize'] = (17, 5)  # wide figures suit waveforms and spectrograms
# Directory where mp3 are stored.
AUDIO_DIR = os.environ.get('AUDIO_DIR')  # None if the environment variable is unset
# Load metadata and features.
tracks = utils.load('data/fma_metadata/tracks.csv')  # per-track metadata (multi-level columns)
genres = utils.load('data/fma_metadata/genres.csv')  # genre hierarchy table
features = utils.load('data/fma_metadata/features.csv')  # pre-computed audio features
echonest = utils.load('data/fma_metadata/echonest.csv')  # Echonest features (subset of tracks)
# Sanity checks: features cover exactly the same tracks; echonest covers a subset.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()
tracks.shape, genres.shape, features.shape, echonest.shape  # displayed by the notebook cell
The metadata table, a CSV file in the fma_metadata.zip
archive, is composed of many columns:
# Preview each top-level column group of the metadata table, in turn.
for group in ('track', 'album', 'artist', 'set'):
    ipd.display(tracks[group].head())
The small and medium subsets can be selected with the below code.
# The subset column is ordered, so '<= small' / '<= medium' select the
# nested small and medium subsets respectively.
subset = tracks['set', 'subset']
small = tracks[subset <= 'small']
small.shape
medium = tracks[subset <= 'medium']
medium.shape
The genre hierarchy is stored in genres.csv
and distributed in fma_metadata.zip.
# Summarize the genre hierarchy: roots first, then the rarest genres.
top_level_ids = genres['top_level'].unique()
print(f'{len(top_level_ids)} top-level genres')
genres.loc[top_level_ids].sort_values('#tracks', ascending=False)
genres.sort_values('#tracks').head(10)
# features.shape is (n_tracks, n_features).
print('{1} features for {0} tracks'.format(*features.shape))
# Feature groups to preview. A nested list selects several related
# top-level groups at once, so they are shown in a single table.
feature_groups = [
    'mfcc',
    'chroma_cens',
    'tonnetz',
    'spectral_contrast',
    ['spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff'],
    ['rmse', 'zcr'],
]
for group in feature_groups:
    ipd.display(features[group].head().style.format('{:.2f}'))
# echonest.shape is (n_tracks, n_features).
print('{1} features for {0} tracks'.format(*echonest.shape))
# Preview each Echonest column group, in the same order as the table.
for group in ('metadata', 'audio_features', 'social_features', 'ranks',
              'temporal_features'):
    ipd.display(echonest['echonest', group].head())
# Plot the temporal features of one track (track_id 2) as a raw curve.
x = echonest.loc[2, ('echonest', 'temporal_features')]
plt.plot(x);
# Boolean mask over all tracks: members of the small subset.
small = tracks['set', 'subset'] <= 'small'

def select(genre):
    # Mask of tracks whose ('track', 'genres_top') entry contains `genre`.
    # NOTE(review): assumes each entry is a container of genre ids that
    # supports `in` — verify against the tracks.csv schema.
    return tracks['track', 'genres_top'].map(lambda track_genres: genre in track_genres)

genre1 = select(1235)  # Instrumental.
genre2 = select(21)  # Hip-Hop.

# Project the MFCC features of the two genres onto 2 principal components.
mask = small & (genre1 | genre2)
X = features.loc[mask, 'mfcc']
X = skl.decomposition.PCA(n_components=2).fit_transform(X)
y = tracks.loc[mask, ('track', 'genres_top')]
y = skl.preprocessing.LabelEncoder().fit_transform(y)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdBu', alpha=0.5)
X.shape, y.shape
You can load the waveform and listen to audio in the notebook itself.
filename = utils.get_audio_path(AUDIO_DIR, 2)  # path of track_id 2 under AUDIO_DIR
print('File: {}'.format(filename))
# sr=None keeps the file's native sampling rate; mono=True downmixes to one channel.
x, sr = librosa.load(filename, sr=None, mono=True)
print('Duration: {:.2f}s, {} samples'.format(x.shape[-1] / sr, x.size))
start, end = 7, 17  # excerpt boundaries in seconds
ipd.Audio(data=x[start*sr:end*sr], rate=sr)  # embeddable audio player for the excerpt
And use librosa to compute spectrograms and audio features.
# NOTE(review): librosa.display.waveplot was deprecated and later removed
# (replaced by waveshow) — confirm the installed librosa version supports it.
librosa.display.waveplot(x, sr, alpha=0.5);
plt.vlines([start, end], -1, 1)  # mark the excerpt played above (in seconds)
start = len(x) // 2  # NB: rebinds `start` to a sample index (it was seconds before)
plt.figure()
plt.plot(x[start:start+2000])  # zoom in on 2000 samples from the middle of the track
plt.ylim((-1, 1));
# Magnitudes of the short-time Fourier transform.
stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
# Mel power spectrogram computed from the squared STFT magnitudes.
mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)
# FIX: librosa.logamplitude was deprecated and removed from librosa;
# power_to_db is its modern equivalent (same default ref=1.0) and is
# already used for the MFCCs below, so this is also more consistent.
log_mel = librosa.power_to_db(mel)
librosa.display.specshow(log_mel, sr=sr, hop_length=512, x_axis='time', y_axis='mel');
# 20 MFCCs from the log-mel spectrogram, standardized before display.
mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)
mfcc = skl.preprocessing.StandardScaler().fit_transform(mfcc)
librosa.display.specshow(mfcc, sr=sr, x_axis='time');
# Boolean masks over all tracks: the small subset, and the official splits.
small = tracks['set', 'subset'] <= 'small'
train = tracks['set', 'split'] == 'training'
val = tracks['set', 'split'] == 'validation'
test = tracks['set', 'split'] == 'test'
# Target: top genre of each small-subset track, label-encoded.
y_train = tracks.loc[small & train, ('track', 'genres_top')]
y_test = tracks.loc[small & test, ('track', 'genres_top')]
enc = skl.preprocessing.LabelEncoder()
y_train = enc.fit_transform(y_train)
y_test = enc.transform(y_test)  # raises if a test label was unseen during training
# Input: pre-computed MFCC statistics.
X_train = features.loc[small & train, 'mfcc']
X_test = features.loc[small & test, 'mfcc']
print('{} training examples, {} testing examples'.format(y_train.size, y_test.size))
print('{} features, {} classes'.format(X_train.shape[1], np.unique(y_train).size))
# Be sure training samples are shuffled.
X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)
# Standardize features by removing the mean and scaling to unit variance.
# FIX: the transformed data was previously discarded — only the fragile
# copy=False in-place mutation was relied upon, which sklearn does not
# guarantee (e.g. a dtype conversion allocates a new array). Assign the
# returned arrays so standardization is guaranteed to take effect.
scaler = skl.preprocessing.StandardScaler(copy=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Support vector classification.
clf = skl.svm.SVC()
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print('Accuracy: {:.2%}'.format(score))