Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.
From raw_*.csv and mp3_metadata.csv, this notebook generates:
tracks.csv: per-track / album / artist metadata.
genres.csv: genre hierarchy.
echonest.csv: cleaned Echonest features.
Before executing this notebook, run the long-running jobs below from the companion script, creation.py:
./creation.py metadata to query the API and store metadata in raw_tracks.csv, raw_albums.csv, raw_artists.csv and raw_genres.csv.
./creation.py data /path/to/fma to download the audio for each track.
./creation.py mp3_metadata /path/to/fma to extract technical metadata from the audio, e.g. duration.
./creation.py clips /path/to/fma to extract 30s clips from the downloaded full-length audio.
Compute the checksums (only once):
sha1sum *.pickle *.csv > checksums
cd /path/to/fma/fma_full; sha1sum ./**/*.mp3 > checksums
cd /path/to/fma/fma_large; sha1sum ./**/*.mp3 > checksums
After executing the notebook, run:
./creation.py normalize /path/to/fma to normalize the permissions and modification / access times.
./creation.py zips /path/to/fma to create the .zip archives.
import os
import ast
import pickle
import IPython.display as ipd
import numpy as np
import pandas as pd
import utils
import creation
# Locate the FMA storage from the AUDIO_DIR environment variable: the base
# directory is the absolute form of os.path.dirname(AUDIO_DIR).
AUDIO_DIR = os.environ.get('AUDIO_DIR')
BASE_DIR = os.path.abspath(os.path.dirname(AUDIO_DIR))

# fma_full holds all the full-length audio (the FMA full dataset);
# fma_large holds all the 30s clips (the FMA large dataset).
FMA_FULL, FMA_LARGE = (
    os.path.join(BASE_DIR, folder) for folder in ('fma_full', 'fma_large')
)
.mp3
by HTTPS for each track id (only if we don't have it already).Todo:
track_image_file
, album_image_file
, artist_image_file
). Beware the quality.Dataset update:
# Load the raw tables; the first CSV column becomes the index of each table.
# converters={'genres': ast.literal_eval}
tracks = pd.read_csv('data/fma_metadata/raw_tracks.csv', index_col=0)
albums = pd.read_csv('data/fma_metadata/raw_albums.csv', index_col=0)
artists = pd.read_csv('data/fma_metadata/raw_artists.csv', index_col=0)
genres = pd.read_csv('data/fma_metadata/raw_genres.csv', index_col=0)
mp3_metadata = pd.read_csv('data/fma_metadata/mp3_metadata.csv', index_col=0)

# Close the file deterministically instead of leaking the handle.
# NOTE(review): unpickling can execute arbitrary code; only load a
# not_found.pickle that comes from a trusted source.
with open('not_found.pickle', 'rb') as f:
    not_found = pickle.load(f)

# Track ids for which audio files exist on disk.
audio_tids = utils.get_tids_from_directory(FMA_FULL)
clips_tids = utils.get_tids_from_directory(FMA_LARGE)

# Summary of what was collected versus what is missing.
print('tracks: {} collected ({} not found, {} max id)'.format(
    len(tracks), len(not_found['tracks']), tracks.index.max()))
print('albums: {} collected ({} not found, {} in tracks)'.format(
    len(albums), len(not_found['albums']), len(tracks['album_id'].unique())))
print('artists: {} collected ({} not found, {} in tracks)'.format(
    len(artists), len(not_found['artists']), len(tracks['artist_id'].unique())))
print('genres: {} collected'.format(len(genres)))
print('audio: {} collected ({} not found, {} not in tracks)'.format(
    len(audio_tids), len(not_found['audio']), len(set(audio_tids).difference(tracks.index))))
print('mp3 metadata: {} collected ({} not found, {} not in tracks)'.format(
    len(mp3_metadata), len(not_found['mp3_metadata']), len(mp3_metadata.index.difference(tracks.index))))
print('clips: {} collected ({} not found, {} not in tracks)'.format(
    len(clips_tids), len(not_found['clips']), len(set(clips_tids).difference(tracks.index))))

# Consistency checks: each processing stage accounts for every item of the
# previous stage, either as collected or as recorded in not_found.
assert sum(tracks.index.isin(audio_tids)) + len(not_found['audio']) == len(tracks)
assert sum(tracks.index.isin(clips_tids)) + len(not_found['clips']) == sum(tracks.index.isin(audio_tids))
assert len(mp3_metadata) + len(not_found['mp3_metadata']) == len(audio_tids)
assert len(clips_tids) + len(not_found['clips']) == len(mp3_metadata)
# Show the first N rows of every raw table.
N = 5
for table in (tracks, albums, artists, genres, mp3_metadata):
    ipd.display(table.head(N))
Todo: sanitize values, e.g. list of words for tags, valid links in artist_wikipedia_page, remove HTML markup in free-form text. Clean tags, e.g. some tags are just artist names.
# Inspect one column: count missing values and list its 10 most frequent values.
df, column = tracks, 'tags'
null = df[column].isnull().sum()
print('{} null, {} non-null'.format(null, len(df) - null))
df[column].value_counts().head(10)
# Track columns that are not carried over into the dataset.
drop = [
    # keep title only
    'license_image_file', 'license_image_file_large', 'license_parent_id', 'license_url',
    # used to download only
    'track_file', 'track_image_file',
    # only relevant on website
    'track_url', 'album_url', 'artist_url',
    # present for ~1000 tracks only
    'track_copyright_c', 'track_copyright_p',
    # 'track_composer', 'track_lyricist', 'track_publisher',  # present for ~4000, <1000 and <2000 tracks
    # different from 1 for <1000 tracks
    'track_disc_number',
    # present for <4000 tracks
    'track_explicit', 'track_explicit_notes',
    # ~6000 tracks have a 1, there is an instrumental genre
    'track_instrumental',
]
renamed = {'license_title': 'track_license', 'tags': 'track_tags'}
tracks = tracks.drop(drop, axis=1).rename(columns=renamed)
def convert_datetime(df, column, format=None):
    """Parse ``df[column]`` into datetimes, modifying ``df`` in place.

    Args:
        df: DataFrame holding the column to convert.
        column: label of the column to convert.
        format: optional strftime-style format forwarded to
            ``pd.to_datetime``; when None, pandas determines the format.

    Returns:
        None. The parsed values replace ``df[column]``.
    """
    # `infer_datetime_format` is deprecated in pandas and was only a parsing
    # speed hint, so it is no longer passed.
    df[column] = pd.to_datetime(df[column], format=format)
convert_datetime(tracks, 'track_date_created')
convert_datetime(tracks, 'track_date_recorded')

# Use -1 as the "missing" marker so that both columns can be cast to int.
# Assign the filled column back: an in-place fillna on the `tracks[col]`
# selection is chained assignment, which does not update `tracks` when pandas
# Copy-on-Write is active.
tracks['album_id'] = tracks['album_id'].fillna(-1)
tracks['track_bit_rate'] = tracks['track_bit_rate'].fillna(-1)
tracks = tracks.astype({'album_id': int, 'track_bit_rate': int})
def convert_genres(genres):
    """Turn the textual genre list of a track into a list of integer genre ids.

    Args:
        genres: string holding a Python literal, a list of dicts that each
            have a 'genre_id' entry.

    Returns:
        The 'genre_id' of every entry, converted to int, in the original order.
    """
    genre_ids = []
    for entry in ast.literal_eval(genres):
        genre_ids.append(int(entry['genre_id']))
    return genre_ids
# Tracks without genre information get an empty list. The filled column is
# assigned back (no chained in-place fillna, which does not update `tracks`
# under pandas Copy-on-Write) before each string is parsed into genre ids.
tracks['track_genres'] = tracks['track_genres'].fillna('[]').map(convert_genres)
tracks.columns
# Album columns that are not carried over into the dataset.
drop = [
    # in tracks already (though it can be different)
    'artist_name', 'album_url', 'artist_url',
    'album_handle',
    # todo: shall be downloaded
    'album_image_file', 'album_images',
    #'album_producer', 'album_engineer',  # present for ~2400 albums only
]
albums = albums.drop(drop, axis=1).rename(columns={'tags': 'album_tags'})
for date_column in ('album_date_created', 'album_date_released'):
    convert_datetime(albums, date_column)
albums.columns
# Artist columns that are not carried over into the dataset.
drop = [
    'artist_website', 'artist_url',  # in tracks already (though it can be different)
    'artist_handle',
    'artist_image_file', 'artist_images',  # todo: shall be downloaded
    'artist_donation_url', 'artist_paypal_name', 'artist_flattr_name',  # ~1600 & ~400 & ~70, not relevant
    'artist_contact',  # ~1500, not very useful data
    # 'artist_active_year_begin', 'artist_active_year_end',  # ~1400, ~500 only
    # 'artist_associated_labels',  # ~1000
    # 'artist_related_projects',  # only ~800, but can be combined with bio
]
artists.drop(drop, axis=1, inplace=True)
artists.rename(columns={'tags': 'artist_tags'}, inplace=True)
convert_datetime(artists, 'artist_date_created')
for column in ['artist_active_year_begin', 'artist_active_year_end']:
    # A year of 0.0 is treated as missing. Assign the result back: an in-place
    # replace on the `artists[column]` selection is chained assignment, which
    # does not update `artists` under pandas Copy-on-Write.
    artists[column] = artists[column].replace(0.0, np.nan)
    convert_datetime(artists, column, format='%Y.0')
artists.columns
Problem: the bitrate and duration returned by the FMA API are sometimes wrong (look for example at the duration reported for track ID 2624). These errors cause us to then wrongly extract the 30s excerpts (see this GitHub issue).
Solution: get these metadata (along others) from the mp3 headers. The most accurate measure of duration is to decode the mp3 and count the number of frames.
Limitation: three files (track IDs 23430, 153189, 155249) could not be opened by mutagen. As such, the bit_rate was set to the value given by ffmpeg and the mode to UNKNOWN.
# Example of faulty duration: compare the API value with the one obtained
# from the decoded audio (number of samples divided by the sample rate).
tid = 2624
duration = tracks.at[tid, 'track_duration']
print('API reported duration: {}'.format(duration))
n_samples = mp3_metadata.loc[tid, 'samples']
duration = n_samples / mp3_metadata.loc[tid, 'sample_rate']
print('Real duration after decoding: {:.0f}s'.format(duration))
# Take the technical track attributes from the mp3 metadata table.
# Each pair is (column in tracks, column in mp3_metadata).
mp3_columns = [
    ('track_channels', 'channels'),
    ('track_bit_rate_mode', 'mode'),
    ('track_bit_rate', 'bit_rate'),
    ('track_sample_rate', 'sample_rate'),
    ('track_samples', 'samples'),
]
for target, source in mp3_columns:
    tracks[target] = mp3_metadata[source]

# Duration in seconds: number of samples over sample rate, rounded to an integer.
duration = mp3_metadata['samples'] / mp3_metadata['sample_rate']
tracks['track_duration'] = duration.round().astype(int)
# Normalize the ids of missing albums and artists to integers.
# A None album id is dropped and -1 is recorded instead. Unlike
# list.remove(None), this does not raise when None is absent, and it does not
# append -1 twice, so the step can be re-run safely.
missing_albums = [int(i) for i in not_found['albums'] if i is not None]
if -1 not in missing_albums:
    missing_albums.append(-1)
not_found['albums'] = missing_albums
not_found['artists'] = [int(i) for i in not_found['artists']]
# Left-join the album table onto the tracks. Album columns whose name already
# exists in tracks receive the '_dup' suffix.
tracks = tracks.merge(
    albums, how='left', left_on='album_id', right_index=True,
    sort=False, suffixes=('', '_dup'))

n = tracks['album_title_dup'].isnull().sum()
n_without_album = (tracks['album_id'] == -1).sum()
print('{} tracks without extended album information ({} tracks without album_id)'.format(
    n, n_without_album))

# The unmatched tracks are exactly those whose album is listed as not found,
# and they are the only rows where the two title columns differ.
assert tracks['album_id'].isin(not_found['albums']).sum() == n
assert (tracks['album_title'] != tracks['album_title_dup']).sum() == n

tracks = tracks.drop('album_title_dup', axis=1)
assert not any('dup' in col for col in tracks.columns)
# Album artist can be different than track artist. Keep track artist.
#tracks[tracks['artist_name'] != tracks['artist_name_dup']].select(lambda x: 'artist_name' in x, axis=1)

# Left-join the artist table onto the tracks. Artist columns whose name already
# exists in tracks receive the '_dup' suffix.
tracks = tracks.merge(
    artists, how='left', left_on='artist_id', right_index=True,
    sort=False, suffixes=('', '_dup'))

n = tracks['artist_name_dup'].isnull().sum()
print('{} tracks without extended artist information'.format(n))

# The unmatched tracks are exactly those whose artist is listed as not found,
# and they are the only rows where the two name columns differ.
assert tracks['artist_id'].isin(not_found['artists']).sum() == n
assert (tracks['artist_name'] != tracks['artist_name_dup']).sum() == n

tracks = tracks.drop('artist_name_dup', axis=1)
assert not any('dup' in col for col in tracks.columns)
# Split every flat column name at its first underscore into a
# (prefix, remainder) pair and use the pairs as two-level column labels.
columns = [
    (prefix, remainder)
    for prefix, _, remainder in (name.partition('_') for name in tracks.columns)
]
tracks.columns = pd.MultiIndex.from_tuples(columns)

# Only the three expected prefixes may appear at the first level.
top_level_labels = tracks.columns.get_level_values(0)
assert all(label in ['track', 'album', 'artist'] for label in top_level_labels)
# Todo: fill other columns ?
# Every fill below assigns the result back to the column: an in-place fillna
# on a `tracks[...]` selection is chained assignment, which does not update
# `tracks` when pandas Copy-on-Write is active.

# Missing tags become the textual empty list.
for entity in ('album', 'artist'):
    tracks[entity, 'tags'] = tracks[entity, 'tags'].fillna('[]')

# Missing counters become -1 so that the columns can be cast to int.
columns = [('album', 'favorites'), ('album', 'comments'), ('album', 'listens'), ('album', 'tracks'),
           ('artist', 'favorites'), ('artist', 'comments')]
for column in columns:
    tracks[column] = tracks[column].fillna(-1)
columns = {column: int for column in columns}
tracks = tracks.astype(columns)
Todo: duplicates (metadata and audio)
def keep(index, df):
    """Select rows of ``df`` with ``df.loc[index]`` and report how many were lost.

    Args:
        index: any selector accepted by ``df.loc`` (labels, boolean mask, ...).
        df: DataFrame to select from; it is not modified.

    Returns:
        The selected rows. A line '<lost> lost, <left> left' is printed.
    """
    selected = df.loc[index]
    n_before, n_after = len(df), len(selected)
    print('{} lost, {} left'.format(n_before - n_after, n_after))
    return selected
tracks = keep(tracks.index, tracks)

# Audio not found or could not be read or trimmed: discard the track ids
# recorded in not_found for each of these stages, one stage at a time.
for stage in ('audio', 'mp3_metadata', 'clips'):
    remaining = tracks.index.difference(not_found[stage])
    tracks = keep(remaining, tracks)
Errors from the features.py script.
# Feature extraction failed for these track ids; they are removed from tracks.
FAILED = [
    1440, 26436, 28106, 29166, 29167, 29168, 29169, 29170, 29171, 29172,
    29173, 29179, 38903, 43903, 56757, 57603, 59361, 62095, 62954, 62956,
    62957, 62959, 62971, 75461, 80015, 86079, 92345, 92346, 92347, 92348,
    92349, 92350, 92351, 92352, 92353, 92354, 92355, 92356, 92357, 92358,
    92359, 92360, 92361, 96426, 104623, 106719, 109714, 114448, 114501, 114528,
    115235, 117759, 118003, 118004, 127827, 130296, 130298, 131076, 135804, 136486,
    144769, 144770, 144771, 144773, 144774, 144775, 144776, 144777, 144778, 152204,
    154923,
]
with_features = tracks.index.difference(FAILED)
tracks = keep(with_features, tracks)
# License forbids redistribution: discard tracks under the download-only license.
redistributable = tracks['track', 'license'] != 'FMA-Limited: Download Only'
tracks = keep(redistributable, tracks)
n_licenses = len(tracks['track', 'license'].unique())
print('{} licenses'.format(n_licenses))
#sum(tracks['track', 'title'].duplicated())
genres.drop(['genre_handle', 'genre_color'], axis=1, inplace=True)
genres.rename(columns={'genre_parent_id': 'parent', 'genre_title': 'title'}, inplace=True)

# A missing parent is encoded as 0. Assign the filled column back: an in-place
# fillna on the `genres['parent']` selection is chained assignment, which does
# not update `genres` when pandas Copy-on-Write is active.
genres['parent'] = genres['parent'].fillna(0)
genres = genres.astype({'parent': int})

# Manual corrections of the hierarchy.
# 13 (Easy Listening) has parent 126 which is missing
# --> a root genre on the website, although not in the genre menu
genres.at[13, 'parent'] = 0
# 580 (Abstract Hip-Hop) has parent 1172 which is missing
# --> listed as child of Hip-Hop on the website
genres.at[580, 'parent'] = 21
# 810 (Nu-Jazz) has parent 51 which is missing
# --> listed as child of Easy Listening on website
genres.at[810, 'parent'] = 13
# 763 (Holiday) has parent 763 which is itself
# --> listed as child of Sound Effects on website
genres.at[763, 'parent'] = 16
# Todo: should novelty be under Experimental? It is alone on website.
# Genre 806 (hiphop) should not exist. Replace it by 21 (Hip-Hop).
# First report how many tracks reference it.
has_806 = tracks['track', 'genres'].map(lambda genre_ids: 806 in genre_ids)
print('{} tracks have genre 806'.format(sum(has_806)))
def change_genre(genres):
    """Return a copy of ``genres`` in which every 806 is replaced by 21."""
    replaced = []
    for genre in genres:
        replaced.append(21 if genre == 806 else genre)
    return replaced
# Rewrite the genre lists of all tracks, then remove genre 806 from the table.
fixed_genres = tracks['track', 'genres'].map(change_genre)
tracks['track', 'genres'] = fixed_genres
genres.drop(806, inplace=True)
def get_parent(genre, track_all_genres=None):
    """Follow the 'parent' column of the global ``genres`` table up to a root.

    Args:
        genre: id of the genre to start from.
        track_all_genres: optional list; when given, every genre visited
            (the starting one included) is appended to it.

    Returns:
        The id of the first genre on the path whose parent is 0.
    """
    parent = genres.at[genre, 'parent']
    if track_all_genres is not None:
        track_all_genres.append(genre)
    if parent == 0:
        return genre
    return get_parent(parent, track_all_genres)
# Get all genres, i.e. all genres encountered when walking from leafs to roots.
def get_all_genres(track_genres):
    """Return the de-duplicated genres met when walking each genre up to its root.

    Args:
        track_genres: iterable of genre ids attached to one track.

    Returns:
        A list without duplicates of the given genres and all their ancestors.
    """
    visited = []
    for genre in track_genres:
        # get_parent appends every genre on the path to `visited`.
        get_parent(genre, visited)
    return list(set(visited))


tracks['track', 'genres_all'] = tracks['track', 'genres'].map(get_all_genres)
# Number of tracks per genre.
def count_genres(subset=tracks.index):
    """Count, for every genre of the ``genres`` table, the tracks that have it.

    Args:
        subset: row selector into ``tracks``; the default is the value of
            ``tracks.index`` at the time this function is defined.

    Returns:
        A Series indexed like ``genres`` with the number of selected tracks
        whose ('track', 'genres_all') list contains each genre.
    """
    totals = pd.Series(0, index=genres.index)
    genre_lists = tracks.loc[subset, ('track', 'genres_all')]
    for genre_ids in genre_lists:
        for genre_id in genre_ids:
            totals[genre_id] = totals[genre_id] + 1
    return totals


genres['#tracks'] = count_genres()
# Remove unused genres, i.e. genres who are never referenced.
# So that the length of the table is the number of useful genres.
# The shape is printed before and after to show the effect.
print(genres.shape)
unused_genres = genres['#tracks'].eq(0)
ipd.display(genres[unused_genres])
unused_ids = genres.index[unused_genres]
genres.drop(unused_ids, inplace=True)
print(genres.shape)
def get_top_genres(track_genres):
    """Return the distinct 'top_level' values of the given genres.

    Args:
        track_genres: iterable of genre ids present in the ``genres`` table.

    Returns:
        A list without duplicates of the 'top_level' entry of each genre.
    """
    roots = {genres.at[genre, 'top_level'] for genre in track_genres}
    return list(roots)


# Top-level genre: the root reached by get_parent from each genre.
genres['top_level'] = genres.index.map(get_parent)
tracks['track', 'genres_top'] = tracks['track', 'genres'].map(get_top_genres)
Main characteristic: the full set with clips trimmed to a manageable size.
Main characteristic: clean metadata (includes 1 top-level genre) and quality audio.
# Start from the full table; each keep() below narrows fma_medium further and
# prints how many tracks the criterion removed.
fma_medium = pd.DataFrame(tracks)

# Missing meta-information.
# Missing extended album and artist information.
for entity in ('album', 'artist'):
    known = ~fma_medium[entity, 'id'].isin(not_found[entity + 's'])
    fma_medium = keep(known, fma_medium)

# Untitled track or album.
fma_medium = keep(fma_medium['track', 'title'].notnull(), fma_medium)
for entity in ('track', 'album'):
    untitled = fma_medium[entity, 'title'].map(lambda title: 'untitled' in title.lower())
    fma_medium = keep(untitled == False, fma_medium)

# One tag is often just the artist name. Tags too scarce for tracks and albums.
#keep(fma_medium['artist', 'tags'].map(len) >= 2, fma_medium)

# Too scarce.
#fma_medium = keep(~fma_medium['album', 'information'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'bio'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'website'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'wikipedia_page'].isnull(), fma_medium)

# Too scarce.
#fma_medium = keep(~fma_medium['artist', 'location'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'latitude'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'longitude'].isnull(), fma_medium)

# Technical quality.
# Todo: sample rate
fma_medium = keep(fma_medium['track', 'bit_rate'].gt(100000), fma_medium)

# Choosing standard bit rates discards all VBR.
#fma_medium = keep(fma_medium['track', 'bit_rate'].isin([320000, 256000, 192000, 160000, 128000]), fma_medium)

# Track duration between 60 and 600, album size between 1 and 50 tracks.
fma_medium = keep(fma_medium['track', 'duration'].ge(60), fma_medium)
fma_medium = keep(fma_medium['track', 'duration'].le(600), fma_medium)
fma_medium = keep(fma_medium['album', 'tracks'].ge(1), fma_medium)
fma_medium = keep(fma_medium['album', 'tracks'].le(50), fma_medium)

# Lower popularity bound.
popularity_bounds = [
    (('track', 'listens'), 100),
    (('track', 'interest'), 200),
    (('album', 'listens'), 1000),
]
for key, minimum in popularity_bounds:
    fma_medium = keep(fma_medium[key].ge(minimum), fma_medium)

# Favorites and comments are very scarce.
#fma_medium = keep(fma_medium['artist', 'favorites'] >= 1, fma_medium)

# Targeted genre classification: exactly one top-level genre.
fma_medium = keep(fma_medium['track', 'genres_top'].map(len).eq(1), fma_medium)
#keep(fma_medium['track', 'genres'].map(len) == 1, fma_medium);

# For convenience: the single top-level genre as a scalar column.
fma_medium['track', 'genre_top'] = fma_medium['track', 'genres_top'].map(lambda tops: tops[0])
# Adjust size with popularity measure. Should be of better quality.
N_TRACKS = 25000

# Observations
# * More albums killed than artists --> be sure not to kill diversity
# * Favorites and preterites genres differently --> do it per genre?
# Normalization
# * mean, median, std, max
# * tracks per album or artist
# Test
# * 4/5 of same tracks were selected with various set of measures
# * <5% diff with max and mean

popularity_measures = [('track', 'listens'), ('track', 'interest')]  # ('album', 'listens')
# ('track', 'favorites'), ('track', 'comments'),
# ('album', 'favorites'), ('album', 'comments'),
# ('artist', 'favorites'), ('artist', 'comments'),

# Each measure is divided by its maximum over fma_medium.
normalization = {measure: fma_medium[measure].max() for measure in popularity_measures}


def popularity_measure(track):
    """Return the sum of the normalized popularity measures of one track.

    Args:
        track: a row of the tracks table, indexable by the entries of
            ``popularity_measures``.

    Returns:
        The sum over ``popularity_measures`` of the track's value divided by
        the matching entry of ``normalization``.
    """
    score = 0
    for measure in popularity_measures:
        score = score + track[measure] / normalization[measure]
    return score


fma_medium['popularity_measure'] = fma_medium.apply(popularity_measure, axis=1)

# Keep the N_TRACKS tracks with the highest score.
ranked = fma_medium.sort_values('popularity_measure', ascending=False)
fma_medium = keep(ranked.index[:N_TRACKS], fma_medium)
# Root genres (parent == 0) with their number of tracks in fma_medium,
# shown from the most to the least represented.
is_root = genres['parent'] == 0
tmp = genres[is_root].copy()
tmp['#tracks_medium'] = fma_medium['track', 'genre_top'].value_counts()
tmp.sort_values('#tracks_medium', ascending=False)
Main characteristic: genre balanced (and echonest features).
Choices:
Todo:
N_GENRES = 8
N_TRACKS = 1000

# The N_GENRES root genres with the most tracks in fma_medium.
top_genres = tmp.sort_values('#tracks_medium', ascending=False).iloc[:N_GENRES].index

fma_small = pd.DataFrame(fma_medium)
fma_small = keep(fma_small['track', 'genre_top'].isin(top_genres), fma_small)

to_keep = []  # not used below
for genre in top_genres:
    # Within each genre, drop everything but the N_TRACKS highest scores.
    subset = fma_small[fma_small['track', 'genre_top'] == genre]
    by_popularity = subset.sort_values('popularity_measure')
    drop = by_popularity.index[:-N_TRACKS]
    fma_small.drop(drop, inplace=True)

assert len(fma_small) == N_GENRES * N_TRACKS
# The subsets are ordered small < medium < large, so that a comparison such as
# `tracks['set', 'subset'] <= 'medium'` selects a subset and the smaller ones.
SUBSETS = ('small', 'medium', 'large')

# Start with an all-missing ordered categorical column. The
# `astype('category', categories=..., ordered=...)` form is no longer accepted
# by pandas, so the column is built with pd.Categorical instead.
tracks['set', 'subset'] = pd.Categorical(
    [None] * len(tracks), categories=SUBSETS, ordered=True)

# Every track is in large; the later assignments overwrite the label for the
# tracks that are also in a smaller subset.
tracks.loc[tracks.index, ('set', 'subset')] = 'large'
tracks.loc[fma_medium.index, ('set', 'subset')] = 'medium'
tracks.loc[fma_small.index, ('set', 'subset')] = 'small'
# The raw Echonest table has three header rows, read as a three-level column index.
echonest = pd.read_csv('data/fma_metadata/raw_echonest.csv', index_col=0, header=[0, 1, 2])

# Keep only the rows that have no missing value in each feature group.
for group in ('temporal_features', 'audio_features', 'social_features'):
    complete = echonest['echonest', group].notnull().all(axis=1)
    echonest = keep(complete, echonest)

# Keep only the rows that belong to a track of the dataset.
echonest = keep(echonest.index.isin(tracks.index), echonest)

# Only report the overlap with the smaller subsets; the results are discarded.
for subset_table in (fma_medium, fma_small):
    keep(echonest.index.isin(subset_table.index), echonest)
Take into account:
# Add one boolean column ('genre', <title>) per genre, telling whether the
# genre id appears in the ('track', 'genres_all') list of each track.
for genre in genres.index:
    title = genres.at[genre, 'title']
    membership = tracks['track', 'genres_all'].map(lambda genre_ids: genre in genre_ids)
    tracks['genre', title] = membership
# Names of the splits and the target fraction of tracks for each of them.
SPLITS = ('training', 'test', 'validation')
PERCENTAGES = (0.8, 0.1, 0.1)

# Start with an all-missing categorical column. The
# `astype('category', categories=...)` form is no longer accepted by pandas,
# so the column is built with pd.Categorical instead.
tracks['set', 'split'] = pd.Categorical([None] * len(tracks), categories=SPLITS)
# Assign every artist, with all of their tracks, to one split. Subsets are
# processed from the smallest to the largest, and within a subset genres are
# processed one at a time.
for subset in SUBSETS:

    # Tracks of this subset and of the smaller ones (ordered categories).
    tracks_subset = tracks['set', 'subset'] <= subset

    if subset == 'large':
        genre_list = list(genres['title'])
    else:
        # Consider only the top-level genre for small and medium.
        genre_list = tracks.loc[tracks_subset, ('track', 'genres_top')].map(lambda x: x[0]).unique()
        genre_list = list(genres.loc[genre_list, 'title'])

    while genre_list:

        # Choose most constrained genre, i.e. genre with the least unassigned artists.
        tracks_unsplit = tracks['set', 'split'].isnull()
        count = tracks[tracks_subset & tracks_unsplit].set_index(('artist', 'id'), append=True)['genre']
        # `np.bool` was removed from NumPy; the builtin bool is equivalent.
        count = count.groupby(level=1).sum().astype(bool).sum()
        # idxmin returns the genre title (label). np.argmin on a Series
        # returns a position in current pandas, which genre_list.remove and
        # tracks['genre', genre] below cannot use.
        genre = count[genre_list].idxmin()
        genre_list.remove(genre)

        # Given genre, select artists.
        tracks_genre = tracks['genre', genre] == 1
        artist_counts = tracks.loc[tracks_genre & tracks_subset & tracks_unsplit, ('artist', 'id')].value_counts()
        #print('-->', genre, len(artist_counts))

        # Tracks of this genre already assigned to each split.
        current = {split: np.sum(tracks_genre & tracks_subset & (tracks['set', 'split'] == split)) for split in SPLITS}

        # Assign artists with most tracks first.
        for artist, n_artist_tracks in artist_counts.items():
            # Pick the split that is currently the furthest below its target share.
            choice = np.argmin([current[split] / percentage for split, percentage in zip(SPLITS, PERCENTAGES)])
            current[SPLITS[choice]] += n_artist_tracks
            #assert tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')].isnull().all()
            tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')] = SPLITS[choice]
# Tracks without genre can only serve as unlabeled data for training, e.g. for semi-supervised algorithms.
no_genres = tracks['track', 'genres_all'].map(len) == 0
no_split = tracks['set', 'split'].isnull()

# Every track that is still unassigned must be a track without genre.
unassigned_with_genre = no_split & ~no_genres
assert not unassigned_with_genre.any()
tracks.loc[no_split, ('set', 'split')] = 'training'

# Not needed any more: remove all the ('genre', ...) indicator columns.
tracks.drop('genre', axis=1, level=0, inplace=True)
# Sort rows and columns of each table in place, then write it to
# data/fma_metadata/<name>.csv. Only echonest is written with a fixed float format.
tables = {'tracks': tracks, 'genres': genres, 'echonest': echonest}
for dataset, table in tables.items():
    for axis in (0, 1):
        table.sort_index(axis=axis, inplace=True)
    params = dict(float_format='%.10f') if dataset == 'echonest' else dict()
    table.to_csv(f'data/fma_metadata/{dataset}.csv', **params)
# Read the exported table back through utils.load and preview each top-level
# column group.
tracks = utils.load('data/fma_metadata/tracks.csv')
tracks.dtypes
N = 5
for group in ('track', 'album', 'artist'):
    ipd.display(tracks[group].head(N))