Data-driven Multimodal Alignment

Author: Ruchit Agrawal

This notebook demonstrates how time-series data pertaining to different modalities can be aligned using deep learning.

In [1]:
import os
import sys
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
In [2]:
# Apply matplotlib's 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')
In [3]:
# Load the beat annotation table for Mazurka M06-1; the path is relative to the notebook's working directory.
df = pd.read_csv('M06-1beat_time.csv')
In [4]:
### Data visualization
# Show the annotation table: one row per beat (measure_number / beat_number) and one
# `pid*` column per recording. Values are presumably beat times in seconds — TODO confirm.
df
Out[4]:
Unnamed: 0 measure_number beat_number pid1263-01 pid52932-01 pid9048-01 pid9050-01 pid9054-01 pid9055-01 pid9058-01 ... pid9137-01 pid9138-01 pid9139-01 pid9150-15 pid9153-01 pid9166c-03 pid9173-06 pid9174-01 pid9186c-01 pid9192-01
0 0 1 2 4.435011 3.041814 3.877732 1.109002 3.181134 2.298027 3.552653 ... 1.904036 0.589002 1.147007 1.149002 0.952018 0.786009 0.224014 0.269002 0.407007 0.529002
1 1 2 0 5.131610 3.970612 4.760091 1.648594 4.040272 3.268594 4.852971 ... 2.623855 1.713129 1.928594 1.908594 1.728594 1.189944 1.188594 1.048594 0.789478 1.248594
2 2 2 1 5.893084 4.913084 5.553084 2.260771 4.513084 3.993084 5.673084 ... 3.053084 2.433084 2.813084 2.533084 2.345215 1.573084 1.813084 1.633084 1.130771 2.153084
3 3 2 2 6.366145 5.446145 6.066145 2.586145 4.906145 4.406145 6.339048 ... 3.406145 2.846145 3.286145 2.966145 2.716735 2.006145 2.206145 2.046145 1.425125 2.546145
4 4 3 0 7.010227 6.110227 6.670227 3.150227 5.410227 4.890227 7.230227 ... 3.784853 3.410227 3.790227 3.490227 3.170227 2.370227 2.670227 2.550227 1.830227 3.170227
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
332 332 112 1 187.953900 176.553900 192.533900 172.453900 154.973900 179.353900 185.433900 ... 125.333900 162.813900 189.851950 151.093900 153.053900 160.273900 188.253900 142.573900 126.693900 189.293900
333 333 112 2 188.894310 177.874310 193.254310 173.414310 156.214310 180.204770 186.294310 ... 125.991470 164.074310 190.714310 151.614310 154.292150 161.086080 189.014310 143.152150 127.166080 190.112430
334 334 113 0 189.974970 179.649820 194.549820 174.569820 157.049820 181.609820 187.549820 ... 126.869820 165.849820 191.803270 152.489820 155.709820 161.989820 190.289820 143.969820 128.133310 191.049820
335 335 113 1 190.924920 180.904920 195.504920 176.084920 158.184920 183.004920 189.144920 ... 127.407890 167.324920 192.684920 153.824920 156.728730 162.764920 191.724920 145.041640 129.248730 192.104920
336 336 113 2 193.001970 182.541970 196.741970 176.192930 161.941970 184.770650 190.541970 ... 130.341970 168.961970 193.961970 154.974260 157.721970 164.510730 194.621970 146.336570 130.561970 193.911530

337 rows × 37 columns

In [5]:
# Draw one line per recording column, taking every column from position 5 onwards.
# NOTE(review): the table shown above has three leading metadata columns
# ('Unnamed: 0', 'measure_number', 'beat_number'), so an offset of 5 also leaves out
# the first two `pid*` recordings — confirm this is intended.
beat_axes = df.iloc[:, 5:].plot(figsize=(20, 12))
beat_axes.set_title('Variations of beats across different versions of M06-1')
Out[5]:
Text(0.5, 1.0, 'Variations of beats across different versions of M06-1')

Data generation


  • gen_data(folder, csv)

    • folder : path containing files corresponding to a specific Mazurka
    • csv : path of the csv file for that Mazurka

    • return : generator that yields, for each pair of recordings, the Euclidean distance matrix and the aligned path values (interpolated/extrapolated at every column index of the distance matrix)

  • get_matrix(file1, file2)
    • file1 : location of the file to be presented on the x-axis of the distance matrix
    • file2 : location of the file to be presented on the y-axis of the distance matrix
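    • return : Euclidean distance matrix, the length ratio `factor` (shorter recording / longer recording), and a `flipped` flag telling whether the two inputs were swapped so that the longer recording comes first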
In [6]:
import numpy as np # linear algebra
import random
import pandas as pd # data processing
import matplotlib.pyplot as plt
from pandas import datetime
import math, time
import itertools
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import datetime
from operator import itemgetter
from sklearn.metrics import mean_squared_error
from math import sqrt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
<ipython-input-6-fd8e901605d8>:5: FutureWarning: The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime module instead.
  from pandas import datetime
In [7]:
import warnings
# Suppress every warning for the rest of the session to keep cell outputs short.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')
In [8]:
import os
import sys
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from scipy.interpolate import interp1d
from tqdm import tqdm

def get_matrix(file1, file2):
    """Compute the Euclidean distance matrix between the chroma features of two recordings.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two recordings *without* the ``.mp4`` extension (it is appended here).

    Returns
    -------
    euc_dists : numpy.ndarray
        Pairwise Euclidean distances between the chroma frames of both recordings.
        The feature sequence with fewer frames lies along the rows, the other one
        along the columns.
    factor : float
        Number of audio samples of the shorter recording divided by that of the
        longer recording.
    flipped : bool
        True when ``file2`` is the longer recording, i.e. the two inputs were
        swapped internally so that the longer one is processed first.
    """
    file1 = file1 + '.mp4'
    file2 = file2 + '.mp4'

    y1, sr1 = librosa.load(file1)
    y2, sr2 = librosa.load(file2)

    # Always keep the longer recording in (y1, sr1); report the swap to the caller.
    flipped = y1.shape[0] < y2.shape[0]
    if flipped:
        y1, y2 = y2, y1
        sr1, sr2 = sr2, sr1

    factor = y2.shape[0] / y1.shape[0]

    N = 4410   # n_fft    (2048 is the default value)
    H = 2205   # hop size (512 is the default value)

    # The hop of the shorter recording is shrunk by `factor`, which gives both
    # recordings a comparable number of chroma frames. The hop is clamped to at
    # least one sample because a hop length of zero is not valid.
    short_hop = max(1, int(H * factor))

    a1 = librosa.feature.chroma_stft(y=y1, sr=sr1, hop_length=H, n_fft=N)
    a2 = librosa.feature.chroma_stft(y=y2, sr=sr2, hop_length=short_hop, n_fft=N)

    # Transpose so that each row is one frame.
    x1 = a1.T
    x2 = a2.T

    # Put the sequence with fewer frames along the rows of the distance matrix.
    if x1.shape[0] > x2.shape[0]:
        euc_dists = cdist(x2, x1, metric='euclidean')
    else:
        euc_dists = cdist(x1, x2, metric='euclidean')

    return euc_dists, factor, flipped


def gen_data(folder,csv):
    """Yield a distance matrix and its aligned path for every pair of recordings.

    Parameters
    ----------
    folder : str
        Directory containing the ``.mp4`` recordings of one Mazurka.
    csv : str
        Path of the beat annotation CSV for that Mazurka. Every column from
        position 5 onwards is treated as one recording, named after its file
        (without the ``.mp4`` extension).

    Yields
    ------
    euc_dists : numpy.ndarray
        Distance matrix returned by ``get_matrix`` for the pair.
    ynew : numpy.ndarray
        Aligned path evaluated at every column index of ``euc_dists`` using
        piecewise-linear interpolation, extrapolated outside the annotated range.

    Only pairs for which BOTH recordings exist in ``folder`` are processed.
    """
    suffix = '.mp4'
    # Recording ids that are actually present on disk.
    names = {n[:-len(suffix)] for n in os.listdir(folder) if n.endswith(suffix)}

    df = pd.read_csv(csv)
    tracks = df.columns[5:]

    for i in range(len(tracks)):
        for j in range(i + 1, len(tracks)):
            first, second = tracks[i], tracks[j]
            # Both recordings must be available; testing `first and second in names`
            # would only check the second one.
            if first not in names or second not in names:
                continue

            x, y = df[first].values, df[second].values
            euc_dists, factor, flipped = get_matrix(f'{folder}/{first}', f'{folder}/{second}')

            # Scale the annotated values to feature-frame indices. The shorter
            # recording was analysed with a hop shrunk by `factor`, so its values
            # are stretched by 1/factor.
            # NOTE(review): the constant 10 presumably is the frame rate (frames per
            # second) implied by the hop size used in get_matrix — TODO confirm.
            if not flipped:
                x = np.round(x * 10, 4)
                y = np.round(y * 10 * (1/factor), 4)
            else:
                x = np.round(x * 10 * (1/factor), 4)
                y = np.round(y * 10, 4)

                # get_matrix swapped the recordings, so swap the annotations too.
                x, y = y, x

            f = interp1d(y, x, kind='slinear', fill_value="extrapolate")
            xnew = np.arange(0, euc_dists.shape[1], 1)
            ynew = f(xnew)

            yield euc_dists, ynew
            

Testing the data loader on its first ten samples

In [9]:
# Where the recordings and their beat annotations live; adjust to the local setup.
MAZURKA_DIR = '/media/shredpub/Elements/documents/datasets/Mazurkas/mazurka06-1'
BEAT_CSV = 'M06-1beat_time.csv'
# Number of (distance matrix, path) pairs to pull from the generator.
N_SAMPLES = 10

el = []  # distance matrices
pl = []  # aligned paths

# islice stops the generator after N_SAMPLES pairs without a manual counter/break,
# and lets tqdm know the total up front.
samples = itertools.islice(gen_data(folder=MAZURKA_DIR, csv=BEAT_CSV), N_SAMPLES)
for i, (e, p) in enumerate(tqdm(samples, total=N_SAMPLES)):
    print('i',i, e.shape, p.shape)
    el.append(e)
    pl.append(p)
1it [00:19, 19.21s/it]
i 0 (2034, 2034) (2034,)
2it [00:32, 15.84s/it]
i 1 (2034, 2034) (2034,)
3it [00:47, 15.37s/it]
i 2 (2034, 2035) (2035,)
4it [01:02, 15.27s/it]
i 3 (2034, 2035) (2035,)
5it [01:16, 14.80s/it]
i 4 (2034, 2035) (2035,)
6it [01:31, 14.87s/it]
i 5 (2100, 2100) (2100,)
7it [01:45, 14.55s/it]
i 6 (2034, 2034) (2034,)
8it [01:59, 14.49s/it]
i 7 (2034, 2034) (2034,)
9it [02:13, 14.12s/it]
i 8 (2034, 2034) (2034,)
9it [02:27, 16.37s/it]
i 9 (2034, 2035) (2035,)

Plotting sample data

In [10]:
fig = plt.figure(figsize=(20, 60))
# Draw only the samples that were actually collected; the 5 x 2 grid holds at most
# ten panels, so indexing a fixed range(10) would fail when fewer pairs are available.
for i, (dist, path) in enumerate(zip(el[:10], pl[:10])):
    ax = fig.add_subplot(5, 2, i+1)
    # Aligned path as a dashed red line on top of the distance matrix.
    ax.plot(path, 'r--')
    ax.imshow(dist, interpolation='nearest', origin='lower', aspect='auto')
plt.title('Euclidean distance matrix and aligned path')
Out[10]:
Text(0.5, 1.0, 'Euclidean distance matrix and aligned path')