import os
import ast
import pickle

import IPython.display as ipd
import numpy as np
import pandas as pd

import utils
import creation

# Base directory where the FMA is stored.
AUDIO_DIR = os.environ.get('AUDIO_DIR')
BASE_DIR = os.path.abspath(os.path.dirname(AUDIO_DIR))
# Directory where all the full-length audio is stored. That is the FMA full dataset.
FMA_FULL = os.path.join(BASE_DIR, 'fma_full')
# Directory where all the 30s clips are stored. That is the FMA large dataset.
FMA_LARGE = os.path.join(BASE_DIR, 'fma_large')

# converters={'genres': ast.literal_eval}
tracks = pd.read_csv('data/fma_metadata/raw_tracks.csv', index_col=0)
albums = pd.read_csv('data/fma_metadata/raw_albums.csv', index_col=0)
artists = pd.read_csv('data/fma_metadata/raw_artists.csv', index_col=0)
genres = pd.read_csv('data/fma_metadata/raw_genres.csv', index_col=0)
mp3_metadata = pd.read_csv('data/fma_metadata/mp3_metadata.csv', index_col=0)

not_found = pickle.load(open('not_found.pickle', 'rb'))

audio_tids = utils.get_tids_from_directory(FMA_FULL)
clips_tids = utils.get_tids_from_directory(FMA_LARGE)

print('tracks: {} collected ({} not found, {} max id)'.format(
    len(tracks), len(not_found['tracks']), tracks.index.max()))
print('albums: {} collected ({} not found, {} in tracks)'.format(
    len(albums), len(not_found['albums']), len(tracks['album_id'].unique())))
print('artists: {} collected ({} not found, {} in tracks)'.format(
    len(artists), len(not_found['artists']), len(tracks['artist_id'].unique())))
print('genres: {} collected'.format(len(genres)))
print('audio: {} collected ({} not found, {} not in tracks)'.format(
    len(audio_tids), len(not_found['audio']), len(set(audio_tids).difference(tracks.index))))
print('mp3 metadata: {} collected ({} not found, {} not in tracks)'.format(
    len(mp3_metadata), len(not_found['mp3_metadata']), len(mp3_metadata.index.difference(tracks.index))))
print('clips: {} collected ({} not found, {} not in tracks)'.format(
    len(clips_tids), len(not_found['clips']), len(set(clips_tids).difference(tracks.index))))

assert sum(tracks.index.isin(audio_tids)) + len(not_found['audio']) == len(tracks)
assert sum(tracks.index.isin(clips_tids)) + len(not_found['clips']) == sum(tracks.index.isin(audio_tids))
assert len(mp3_metadata) + len(not_found['mp3_metadata']) == len(audio_tids)
assert len(clips_tids) + len(not_found['clips']) == len(mp3_metadata)

N = 5
ipd.display(tracks.head(N))
ipd.display(albums.head(N))
ipd.display(artists.head(N))
ipd.display(genres.head(N))
ipd.display(mp3_metadata.head(N))

df, column = tracks, 'tags'
null = sum(df[column].isnull())
print('{} null, {} non-null'.format(null, df.shape[0] - null))
df[column].value_counts().head(10)
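
# The column inspection just above is ad hoc; the same pattern can be wrapped
# in a small helper so any raw column can be profiled the same way (a sketch,
# not part of the original pipeline; `profile_column` is a name introduced
# here for illustration).
def profile_column(df, column, n=10):
    # Report missing values and show the most frequent values of the column.
    null = df[column].isnull().sum()
    print('{}: {} null, {} non-null'.format(column, null, len(df) - null))
    return df[column].value_counts().head(n)

# Example: profile_column(tracks, 'tags')
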
drop = [
    'license_image_file', 'license_image_file_large', 'license_parent_id', 'license_url',  # keep title only
    'track_file', 'track_image_file',  # used to download only
    'track_url', 'album_url', 'artist_url',  # only relevant on the website
    'track_copyright_c', 'track_copyright_p',  # present for ~1000 tracks only
    # 'track_composer', 'track_lyricist', 'track_publisher',  # present for ~4000, <1000 and <2000 tracks
    'track_disc_number',  # different from 1 for <1000 tracks
    'track_explicit', 'track_explicit_notes',  # present for <4000 tracks
    'track_instrumental',  # ~6000 tracks have a 1, there is an instrumental genre
]
tracks.drop(drop, axis=1, inplace=True)
tracks.rename(columns={'license_title': 'track_license', 'tags': 'track_tags'}, inplace=True)

def convert_datetime(df, column, format=None):
    df[column] = pd.to_datetime(df[column], infer_datetime_format=True, format=format)

convert_datetime(tracks, 'track_date_created')
convert_datetime(tracks, 'track_date_recorded')

tracks['album_id'].fillna(-1, inplace=True)
tracks['track_bit_rate'].fillna(-1, inplace=True)
tracks = tracks.astype({'album_id': int, 'track_bit_rate': int})

def convert_genres(genres):
    genres = ast.literal_eval(genres)
    return [int(genre['genre_id']) for genre in genres]

tracks['track_genres'].fillna('[]', inplace=True)
tracks['track_genres'] = tracks['track_genres'].map(convert_genres)

tracks.columns

drop = [
    'artist_name', 'album_url', 'artist_url',  # in tracks already (though it can be different)
    'album_handle', 'album_image_file', 'album_images',  # todo: should be downloaded
    #'album_producer', 'album_engineer',  # present for ~2400 albums only
]
albums.drop(drop, axis=1, inplace=True)
albums.rename(columns={'tags': 'album_tags'}, inplace=True)
convert_datetime(albums, 'album_date_created')
convert_datetime(albums, 'album_date_released')

albums.columns

drop = [
    'artist_website', 'artist_url',  # in tracks already (though it can be different)
    'artist_handle', 'artist_image_file', 'artist_images',  # todo: should be downloaded
    'artist_donation_url', 'artist_paypal_name', 'artist_flattr_name',  # ~1600 & ~400 & ~70, not relevant
    'artist_contact',  # ~1500, not very useful data
    # 'artist_active_year_begin', 'artist_active_year_end',  # ~1400, ~500 only
    # 'artist_associated_labels',  # ~1000
    # 'artist_related_projects',  # only ~800, but can be combined with bio
]
artists.drop(drop, axis=1, inplace=True)
artists.rename(columns={'tags': 'artist_tags'}, inplace=True)
convert_datetime(artists, 'artist_date_created')
for column in ['artist_active_year_begin', 'artist_active_year_end']:
    artists[column].replace(0.0, np.nan, inplace=True)
    convert_datetime(artists, column, format='%Y.0')

artists.columns

# Example of faulty duration.
tid = 2624
duration = tracks.at[tid, 'track_duration']
print('API reported duration: {}'.format(duration))
duration = mp3_metadata.loc[tid, 'samples'] / mp3_metadata.loc[tid, 'sample_rate']
print('Real duration after decoding: {:.0f}s'.format(duration))
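
# Before the decoded metadata is copied into `tracks` below, a quick look at
# its distribution shows how common such discrepancies and unusual encodings
# are. A minimal sketch, not part of the original notebook; it only touches the
# mp3_metadata columns used below.
decoded_duration = mp3_metadata['samples'] / mp3_metadata['sample_rate']
ipd.display(mp3_metadata['sample_rate'].value_counts())
ipd.display(mp3_metadata['channels'].value_counts())
ipd.display(mp3_metadata['mode'].value_counts())
ipd.display(decoded_duration.describe())
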
tracks['track_channels'] = mp3_metadata['channels']
tracks['track_bit_rate_mode'] = mp3_metadata['mode']
tracks['track_bit_rate'] = mp3_metadata['bit_rate']
tracks['track_sample_rate'] = mp3_metadata['sample_rate']
tracks['track_samples'] = mp3_metadata['samples']
duration = mp3_metadata['samples'] / mp3_metadata['sample_rate']
tracks['track_duration'] = duration.round().astype(int)

not_found['albums'].remove(None)
not_found['albums'].append(-1)
not_found['albums'] = [int(i) for i in not_found['albums']]
not_found['artists'] = [int(i) for i in not_found['artists']]

tracks = tracks.merge(albums, left_on='album_id', right_index=True,
                      sort=False, how='left', suffixes=('', '_dup'))

n = sum(tracks['album_title_dup'].isnull())
print('{} tracks without extended album information ({} tracks without album_id)'.format(
    n, sum(tracks['album_id'] == -1)))
assert sum(tracks['album_id'].isin(not_found['albums'])) == n
assert sum(tracks['album_title'] != tracks['album_title_dup']) == n

tracks.drop('album_title_dup', axis=1, inplace=True)
assert not any('dup' in col for col in tracks.columns)

# Album artist can be different from the track artist. Keep the track artist.
#tracks[tracks['artist_name'] != tracks['artist_name_dup']].select(lambda x: 'artist_name' in x, axis=1)

tracks = tracks.merge(artists, left_on='artist_id', right_index=True,
                      sort=False, how='left', suffixes=('', '_dup'))

n = sum(tracks['artist_name_dup'].isnull())
print('{} tracks without extended artist information'.format(n))
assert sum(tracks['artist_id'].isin(not_found['artists'])) == n
assert sum(tracks['artist_name'] != tracks['artist_name_dup']) == n

tracks.drop('artist_name_dup', axis=1, inplace=True)
assert not any('dup' in col for col in tracks.columns)

columns = []
for name in tracks.columns:
    names = name.split('_')
    columns.append((names[0], '_'.join(names[1:])))
tracks.columns = pd.MultiIndex.from_tuples(columns)
assert all(label in ['track', 'album', 'artist'] for label in tracks.columns.get_level_values(0))

# Todo: fill other columns?
tracks['album', 'tags'].fillna('[]', inplace=True)
tracks['artist', 'tags'].fillna('[]', inplace=True)

columns = [('album', 'favorites'), ('album', 'comments'), ('album', 'listens'), ('album', 'tracks'),
           ('artist', 'favorites'), ('artist', 'comments')]
for column in columns:
    tracks[column].fillna(-1, inplace=True)
columns = {column: int for column in columns}
tracks = tracks.astype(columns)

def keep(index, df):
    old = len(df)
    df = df.loc[index]
    new = len(df)
    print('{} lost, {} left'.format(old - new, new))
    return df

tracks = keep(tracks.index, tracks)

# Audio not found or could not be read or trimmed.
tracks = keep(tracks.index.difference(not_found['audio']), tracks)
tracks = keep(tracks.index.difference(not_found['mp3_metadata']), tracks)
tracks = keep(tracks.index.difference(not_found['clips']), tracks)

# Feature extraction failed.
FAILED = [1440, 26436, 28106, 29166, 29167, 29168, 29169, 29170, 29171, 29172, 29173,
          29179, 38903, 43903, 56757, 57603, 59361, 62095, 62954, 62956, 62957, 62959,
          62971, 75461, 80015, 86079, 92345, 92346, 92347, 92348, 92349, 92350, 92351,
          92352, 92353, 92354, 92355, 92356, 92357, 92358, 92359, 92360, 92361, 96426,
          104623, 106719, 109714, 114448, 114501, 114528, 115235, 117759, 118003, 118004,
          127827, 130296, 130298, 131076, 135804, 136486, 144769, 144770, 144771, 144773,
          144774, 144775, 144776, 144777, 144778, 152204, 154923]
tracks = keep(tracks.index.difference(FAILED), tracks)

# License forbids redistribution.
tracks = keep(tracks['track', 'license'] != 'FMA-Limited: Download Only', tracks)

print('{} licenses'.format(len(tracks[('track', 'license')].unique())))

#sum(tracks['track', 'title'].duplicated())

genres.drop(['genre_handle', 'genre_color'], axis=1, inplace=True)
genres.rename(columns={'genre_parent_id': 'parent', 'genre_title': 'title'}, inplace=True)

genres['parent'].fillna(0, inplace=True)
genres = genres.astype({'parent': int})

# 13 (Easy Listening) has parent 126 which is missing
# --> a root genre on the website, although not in the genre menu
genres.at[13, 'parent'] = 0

# 580 (Abstract Hip-Hop) has parent 1172 which is missing
# --> listed as a child of Hip-Hop on the website
genres.at[580, 'parent'] = 21

# 810 (Nu-Jazz) has parent 51 which is missing
# --> listed as a child of Easy Listening on the website
genres.at[810, 'parent'] = 13

# 763 (Holiday) has parent 763, i.e. itself
# --> listed as a child of Sound Effects on the website
genres.at[763, 'parent'] = 16

# Todo: should Novelty be under Experimental? It is alone on the website.
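
# A quick check (a sketch, not part of the original notebook): after the manual
# fixes above, no genre should reference a parent that is neither 0 (root) nor
# a known genre id. An empty table below means no dangling parents remain.
dangling = genres[(genres['parent'] != 0) & ~genres['parent'].isin(genres.index)]
ipd.display(dangling)
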
# Genre 806 (hiphop) should not exist. Replace it with 21 (Hip-Hop).
print('{} tracks have genre 806'.format(
    sum(tracks['track', 'genres'].map(lambda genres: 806 in genres))))

def change_genre(genres):
    return [genre if genre != 806 else 21 for genre in genres]

tracks['track', 'genres'] = tracks['track', 'genres'].map(change_genre)
genres.drop(806, inplace=True)

def get_parent(genre, track_all_genres=None):
    parent = genres.at[genre, 'parent']
    if track_all_genres is not None:
        track_all_genres.append(genre)
    return genre if parent == 0 else get_parent(parent, track_all_genres)

# Get all genres, i.e. all genres encountered when walking from leaves to roots.
def get_all_genres(track_genres):
    track_all_genres = list()
    for genre in track_genres:
        get_parent(genre, track_all_genres)
    return list(set(track_all_genres))

tracks['track', 'genres_all'] = tracks['track', 'genres'].map(get_all_genres)

# Number of tracks per genre.
def count_genres(subset=tracks.index):
    count = pd.Series(0, index=genres.index)
    for _, track_all_genres in tracks.loc[subset, ('track', 'genres_all')].items():
        for genre in track_all_genres:
            count[genre] += 1
    return count

genres['#tracks'] = count_genres()

# Remove unused genres, i.e. genres that are never referenced,
# so that the length of the table is the number of useful genres.
print(genres.shape)
unused_genres = (genres['#tracks'] == 0)
ipd.display(genres[unused_genres])
genres.drop(genres.index[unused_genres], inplace=True)
print(genres.shape)

def get_top_genres(track_genres):
    return list(set(genres.at[genre, 'top_level'] for genre in track_genres))

# Top-level genre.
genres['top_level'] = genres.index.map(get_parent)
tracks['track', 'genres_top'] = tracks['track', 'genres'].map(get_top_genres)
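
# Consistency sketch (not part of the original notebook): every 'top_level'
# value should be a root genre (parent == 0), and every per-track top-level
# genre should therefore be such a root as well.
roots = set(genres.index[genres['parent'] == 0])
assert set(genres['top_level']) <= roots
assert tracks['track', 'genres_top'].map(lambda tg: set(tg) <= roots).all()
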
fma_medium = pd.DataFrame(tracks)

# Missing meta-information.

# Missing extended album and artist information.
fma_medium = keep(~fma_medium['album', 'id'].isin(not_found['albums']), fma_medium)
fma_medium = keep(~fma_medium['artist', 'id'].isin(not_found['artists']), fma_medium)

# Untitled track or album.
fma_medium = keep(~fma_medium['track', 'title'].isnull(), fma_medium)
fma_medium = keep(fma_medium['track', 'title'].map(lambda x: 'untitled' in x.lower()) == False, fma_medium)
fma_medium = keep(fma_medium['album', 'title'].map(lambda x: 'untitled' in x.lower()) == False, fma_medium)

# One tag is often just the artist name. Tags too scarce for tracks and albums.
#keep(fma_medium['artist', 'tags'].map(len) >= 2, fma_medium)

# Too scarce.
#fma_medium = keep(~fma_medium['album', 'information'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'bio'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'website'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'wikipedia_page'].isnull(), fma_medium)

# Too scarce.
#fma_medium = keep(~fma_medium['artist', 'location'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'latitude'].isnull(), fma_medium)
#fma_medium = keep(~fma_medium['artist', 'longitude'].isnull(), fma_medium)

# Technical quality.
# Todo: sample rate
fma_medium = keep(fma_medium['track', 'bit_rate'] > 100000, fma_medium)
# Choosing standard bit rates discards all VBR.
#fma_medium = keep(fma_medium['track', 'bit_rate'].isin([320000, 256000, 192000, 160000, 128000]), fma_medium)
fma_medium = keep(fma_medium['track', 'duration'] >= 60, fma_medium)
fma_medium = keep(fma_medium['track', 'duration'] <= 600, fma_medium)
fma_medium = keep(fma_medium['album', 'tracks'] >= 1, fma_medium)
fma_medium = keep(fma_medium['album', 'tracks'] <= 50, fma_medium)

# Lower popularity bound.
fma_medium = keep(fma_medium['track', 'listens'] >= 100, fma_medium)
fma_medium = keep(fma_medium['track', 'interest'] >= 200, fma_medium)
fma_medium = keep(fma_medium['album', 'listens'] >= 1000, fma_medium);

# Favorites and comments are very scarce.
#fma_medium = keep(fma_medium['artist', 'favorites'] >= 1, fma_medium)

# Targeted genre classification.
fma_medium = keep(fma_medium['track', 'genres_top'].map(len) == 1, fma_medium);
#keep(fma_medium['track', 'genres'].map(len) == 1, fma_medium);

# For convenience.
fma_medium['track', 'genre_top'] = fma_medium['track', 'genres_top'].map(lambda x: x[0])

# Adjust the size with a popularity measure. The retained tracks should be of better quality.
N_TRACKS = 25000

# Observations
# * More albums killed than artists --> be sure not to kill diversity
# * The measure favors some genres and passes over others --> do it per genre?
# Normalization
# * mean, median, std, max
# * tracks per album or artist
# Test
# * 4/5 of the same tracks were selected with various sets of measures
# * <5% diff with max and mean

popularity_measures = [('track', 'listens'), ('track', 'interest')]
# ('album', 'listens')
# ('track', 'favorites'), ('track', 'comments'),
# ('album', 'favorites'), ('album', 'comments'),
# ('artist', 'favorites'), ('artist', 'comments'),

normalization = {measure: fma_medium[measure].max() for measure in popularity_measures}

def popularity_measure(track):
    return sum(track[measure] / normalization[measure] for measure in popularity_measures)

fma_medium['popularity_measure'] = fma_medium.apply(popularity_measure, axis=1)
fma_medium = keep(fma_medium.sort_values('popularity_measure', ascending=False).index[:N_TRACKS], fma_medium)

tmp = genres[genres['parent'] == 0].copy()
tmp['#tracks_medium'] = fma_medium['track', 'genre_top'].value_counts()
tmp.sort_values('#tracks_medium', ascending=False)

N_GENRES = 8
N_TRACKS = 1000

top_genres = tmp.sort_values('#tracks_medium', ascending=False)[:N_GENRES].index

fma_small = pd.DataFrame(fma_medium)
fma_small = keep(fma_small['track', 'genre_top'].isin(top_genres), fma_small)

for genre in top_genres:
    subset = fma_small[fma_small['track', 'genre_top'] == genre]
    drop = subset.sort_values('popularity_measure').index[:-N_TRACKS]
    fma_small.drop(drop, inplace=True)

assert len(fma_small) == N_GENRES * N_TRACKS

SUBSETS = ('small', 'medium', 'large')
tracks['set', 'subset'] = pd.Series().astype('category', categories=SUBSETS, ordered=True)
tracks.loc[tracks.index, ('set', 'subset')] = 'large'
tracks.loc[fma_medium.index, ('set', 'subset')] = 'medium'
tracks.loc[fma_small.index, ('set', 'subset')] = 'small'
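
# Nesting check (a sketch, not part of the original notebook): the small subset
# is drawn from the medium one, which is itself drawn from the cleaned full
# table, so the three subsets should be nested.
assert fma_small.index.isin(fma_medium.index).all()
assert fma_medium.index.isin(tracks.index).all()
print(tracks['set', 'subset'].value_counts())
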
echonest = pd.read_csv('data/fma_metadata/raw_echonest.csv', index_col=0, header=[0, 1, 2])
echonest = keep(~echonest['echonest', 'temporal_features'].isnull().any(axis=1), echonest)
echonest = keep(~echonest['echonest', 'audio_features'].isnull().any(axis=1), echonest)
echonest = keep(~echonest['echonest', 'social_features'].isnull().any(axis=1), echonest)
echonest = keep(echonest.index.isin(tracks.index), echonest);

keep(echonest.index.isin(fma_medium.index), echonest);
keep(echonest.index.isin(fma_small.index), echonest);

for genre in genres.index:
    tracks['genre', genres.at[genre, 'title']] = tracks['track', 'genres_all'].map(lambda genres: genre in genres)

SPLITS = ('training', 'test', 'validation')
PERCENTAGES = (0.8, 0.1, 0.1)
tracks['set', 'split'] = pd.Series().astype('category', categories=SPLITS)

for subset in SUBSETS:
    tracks_subset = tracks['set', 'subset'] <= subset

    if subset == 'large':
        genre_list = list(genres['title'])
    else:
        # Consider only the top-level genre for small and medium.
        genre_list = tracks.loc[tracks_subset, ('track', 'genres_top')].map(lambda x: x[0]).unique()
        genre_list = list(genres.loc[genre_list, 'title'])

    while True:
        if len(genre_list) == 0:
            break

        # Choose most constrained genre, i.e. genre with the least unassigned artists.
        tracks_unsplit = tracks['set', 'split'].isnull()
        count = tracks[tracks_subset & tracks_unsplit].set_index(('artist', 'id'), append=True)['genre']
        count = count.groupby(level=1).sum().astype(np.bool).sum()
        genre = np.argmin(count[genre_list])
        genre_list.remove(genre)

        # Given genre, select artists.
        tracks_genre = tracks['genre', genre] == 1
        artists = tracks.loc[tracks_genre & tracks_subset & tracks_unsplit, ('artist', 'id')].value_counts()
        #print('-->', genre, len(artists))

        current = {split: np.sum(tracks_genre & tracks_subset & (tracks['set', 'split'] == split))
                   for split in SPLITS}

        # Assign artists with most tracks first.
        for artist, count in artists.items():
            choice = np.argmin([current[split] / percentage for split, percentage in zip(SPLITS, PERCENTAGES)])
            current[SPLITS[choice]] += count
            #assert tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')].isnull().all()
            tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')] = SPLITS[choice]

# Tracks without genre can only serve as unlabeled data for training, e.g. for semi-supervised algorithms.
no_genres = tracks['track', 'genres_all'].map(lambda genres: len(genres) == 0)
no_split = tracks['set', 'split'].isnull()
assert not (no_split & ~no_genres).any()
tracks.loc[no_split, ('set', 'split')] = 'training'

# Not needed any more.
tracks.drop('genre', axis=1, level=0, inplace=True)

for dataset in 'tracks', 'genres', 'echonest':
    eval(dataset).sort_index(axis=0, inplace=True)
    eval(dataset).sort_index(axis=1, inplace=True)
    params = dict(float_format='%.10f') if dataset == 'echonest' else dict()
    eval(dataset).to_csv(f'data/fma_metadata/{dataset}.csv', **params)

tracks = utils.load('data/fma_metadata/tracks.csv')
tracks.dtypes

N = 5
ipd.display(tracks['track'].head(N))
ipd.display(tracks['album'].head(N))
ipd.display(tracks['artist'].head(N))
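
# Final verification (a sketch, not part of the original notebook): the split
# was assigned per artist above, so no artist should end up with tracks in more
# than one split in the saved metadata.
splits_per_artist = tracks['set', 'split'].groupby(tracks['artist', 'id']).nunique()
print('{} artists with tracks in more than one split (expected 0)'.format(
    int((splits_per_artist > 1).sum())))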