This Jupyter notebook shows some informal, exploratory tests for extracting and experimenting with the constant-Q harmonic coefficients (CQHCs).
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Define the parameters for the notes to concatenate
folder_path = r'nsynth\nsynth-train\audio'
instrument_names = ['bass_acoustic_000']
note_number = 24
note_numbers = np.arange(note_number, note_number+12)
velocity_number = 75
sampling_frequency = 16000
# Loop over the instrument names and note numbers to concatenate the notes
audio_signal = np.empty(0)
for instrument_name in instrument_names:
for note_number in note_numbers:
# Get the path to the file
file_name = f'{instrument_name}-{note_number:03d}-{velocity_number:03d}.wav'
file_path = os.path.join(folder_path, file_name)
        # Load the current audio signal and concatenate it with the previous ones
audio_signal1, _ = librosa.load(file_path, sr=sampling_frequency, mono=True)
audio_signal = np.concatenate((audio_signal, audio_signal1))
# Compute the CQT spectrogram of the signal
step_length = int(pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))/2)
minimum_frequency = 32.70
maximum_frequency = sampling_frequency/2
octave_resolution = 12
number_frequencies = round(octave_resolution * np.log2(maximum_frequency / minimum_frequency))
audio_cqt = librosa.cqt(audio_signal, sr=sampling_frequency, hop_length=step_length, fmin=minimum_frequency, \
n_bins=number_frequencies, bins_per_octave=octave_resolution)
cqt_spectrogram = np.abs(audio_cqt)
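As a quick sanity check (the values assume the 16 kHz sampling frequency set above), the ~40 ms window rounds up to 1,024 samples, so the step is 512 samples, and the range from 32.70 Hz (C1) to the 8 kHz Nyquist frequency spans about 95 semitone bins:
# Verify the derived CQT parameters (assuming sampling_frequency = 16000)
print(f'step_length = {step_length}')  # 512
print(f'number_frequencies = {number_frequencies}')  # 95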
# Display the audio signal and the CQT spectrogram
plt.figure(figsize=(14, 2))
librosa.display.waveshow(audio_signal, sr=sampling_frequency)  # waveshow replaces waveplot, which was removed in librosa 0.10
plt.title('Audio signal')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 4))
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, x_axis='time', y_axis='cqt_note', bins_per_octave=octave_resolution)
plt.title('CQT spectrogram')
plt.tight_layout()
plt.show()
# Derive the spectral component (the CQT envelope) and the pitch component (the CQT pitch)
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
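# (Along log-frequency, a harmonic CQT frame is approximately the convolution of a spectral
# envelope with a pitch-dependent impulse train; the magnitude of the FT keeps the
# shift-invariant envelope, while the phase-only FT keeps the shift, i.e., the pitch.)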
# Resynthesize the CQT spectrogram by convolving the spectral component and pitch component
number_times = np.shape(cqt_spectrogram)[1]
cqt_spectrogram2 = np.zeros((number_frequencies, number_times))
for i in range(number_times):
cqt_spectrogram2[:, i] = np.convolve(spectral_component[:, i], pitch_component[:, i])[0:number_frequencies]
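# (The full convolution has 2*number_frequencies-1 samples; keeping the first
# number_frequencies samples matches the size of the original CQT frames.)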
# Display the CQT spectrogram, the spectral component, and pitch component, and the resynthesized CQT spectrogram
j = 10  # index of the time frame shown in the frame-wise plots below
plt.figure(figsize=(14, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('CQT spectrogram')
plt.subplot(1, 3, 2)
librosa.display.specshow(librosa.amplitude_to_db(spectral_component, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, x_axis='time')
plt.title('Spectral component')
plt.subplot(1, 3, 3)
librosa.display.specshow(pitch_component, sr=sampling_frequency, hop_length=step_length, fmin=minimum_frequency, \
bins_per_octave=octave_resolution, x_axis='time', y_axis='cqt_note')
plt.title('Pitch component')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 2))
plt.subplot(1, 3, 1), plt.plot(cqt_spectrogram[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 2), plt.plot(spectral_component[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 3), plt.plot(pitch_component[:, j]), plt.ylim(top=1)
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('CQT spectrogram')
plt.subplot(1, 3, 2)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram2, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('Resynthesized CQT spectrogram')
plt.subplot(1, 3, 3)
librosa.display.specshow(cqt_spectrogram-cqt_spectrogram2, sr=sampling_frequency, hop_length=step_length, \
fmin=minimum_frequency, bins_per_octave=octave_resolution, x_axis='time', y_axis='cqt_note')
rms_value = np.round(np.sqrt(np.mean(np.power(cqt_spectrogram-cqt_spectrogram2, 2))), 3)
plt.title(f'Differences (RMS={rms_value})')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 2))
plt.subplot(1, 3, 1), plt.plot(cqt_spectrogram[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 2), plt.plot(cqt_spectrogram2[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 3), plt.plot(cqt_spectrogram[:, j]-cqt_spectrogram2[:, j]), plt.ylim(top=max(cqt_spectrogram[:, j]))
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.tight_layout()
plt.show()
# Refine the pitch component by zeroing its negative values, and then re-derive the spectral component by deconvolving the CQT spectrogram by the refined pitch component
pitch_component2 = np.copy(pitch_component)
pitch_component2[pitch_component2 < 0] = 0
spectral_component2 = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component2, 2*number_frequencies-1, \
axis=0)+1e-7), axis=0)[0:number_frequencies, :])
# Resynthesize the CQT spectrogram by convolving the refined spectral component and pitch component
cqt_spectrogram2 = np.zeros((number_frequencies, number_times))
for i in range(number_times):
cqt_spectrogram2[:, i] = np.convolve(spectral_component2[:, i], pitch_component2[:, i])[0:number_frequencies]
# Display everything again with the refined versions
plt.figure(figsize=(14, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('CQT spectrogram')
plt.subplot(1, 3, 2)
librosa.display.specshow(librosa.amplitude_to_db(spectral_component2, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, x_axis='time')
plt.title('Refined spectral component')
plt.subplot(1, 3, 3)
librosa.display.specshow(pitch_component2, sr=sampling_frequency, hop_length=step_length, fmin=minimum_frequency, \
bins_per_octave=octave_resolution, x_axis='time', y_axis='cqt_note')
plt.title('Refined pitch component')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 2))
plt.subplot(1, 3, 1), plt.plot(cqt_spectrogram[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 2), plt.plot(spectral_component2[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 3), plt.plot(pitch_component2[:, j]), plt.ylim(top=1)
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('CQT spectrogram')
plt.subplot(1, 3, 2)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram2, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('Resynthesized CQT spectrogram')
plt.subplot(1, 3, 3)
librosa.display.specshow(cqt_spectrogram-cqt_spectrogram2, sr=sampling_frequency, hop_length=step_length, \
fmin=minimum_frequency, bins_per_octave=octave_resolution, x_axis='time', y_axis='cqt_note')
rms_value = np.round(np.sqrt(np.mean(np.power(cqt_spectrogram-cqt_spectrogram2, 2))), 3)
plt.title(f'Differences (RMS={rms_value})')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 2))
plt.subplot(1, 3, 1), plt.plot(cqt_spectrogram[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 2), plt.plot(cqt_spectrogram2[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 3), plt.plot(cqt_spectrogram[:, j]-cqt_spectrogram2[:, j]), plt.ylim(top=max(cqt_spectrogram[:, j]))
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.tight_layout()
plt.show()
# # Resynthesize the signal
# audio_signal2 = librosa.icqt(cqt_spectrogram2*audio_cqt/cqt_spectrogram, sr=sampling_frequency, \
# hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution)
# audio_signal2 = np.max(abs(audio_signal))*audio_signal2/np.max(abs(audio_signal2))
# audio_signal2 = np.pad(audio_signal2, (0, len(audio_signal)-len(audio_signal2)), 'constant', constant_values=0)
# Extract the CQHCs from the spectral component
number_coefficients = 20
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhcs = spectral_component[coefficient_indices, :]
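Since the harmonics of a note are log-spaced, the kth coefficient is taken at the CQT bin closest to the kth harmonic, i.e., round(12 log2(k)) semitones above the fundamental:
# The first few harmonic indices: k = 1...5 map to bins 0, 12, 19, 24, and 28
print(coefficient_indices[:5])  # [ 0 12 19 24 28]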
# Compute the MFCCs using librosa
window_length = pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))
step_length = int(window_length/2)
audio_mfcc = librosa.feature.mfcc(y=audio_signal, sr=sampling_frequency, n_fft=window_length, hop_length=step_length)
# Compute the self-similarity matrices for the CQHCs and the MFCCs
normalized_feature = audio_cqhcs/(np.sqrt(np.sum(np.power(audio_cqhcs, 2), axis=0))+1e-16)
similarity_matrix1 = np.matmul(normalized_feature.T, normalized_feature)
normalized_feature = audio_mfcc/(np.sqrt(np.sum(np.power(audio_mfcc, 2), axis=0))+1e-16)
similarity_matrix2 = np.matmul(normalized_feature.T, normalized_feature)
# Plot the features and their self-similarity matrices
plt.figure(figsize=(14, 3))
plt.subplot(1, 2, 1), plt.imshow(audio_cqhcs, aspect='auto', cmap='jet', origin='lower')
plt.title('CQHCs'), plt.xlabel('Time'), plt.ylabel('Coefficient')
plt.subplot(1, 2, 2), plt.imshow(audio_mfcc, cmap='jet', aspect='auto', origin='lower')
plt.title('MFCCs'), plt.xlabel('Time'), plt.ylabel('Coefficient')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1), plt.imshow(similarity_matrix1, cmap='gray', aspect='auto', origin='lower', vmin=0.9, vmax=1)
plt.title('CQHCs self-similarity'), plt.xlabel('Time'), plt.ylabel('Time')
plt.subplot(1, 2, 2), plt.imshow(similarity_matrix2, cmap='gray', aspect='auto', origin='lower', vmin=0.9, vmax=1)
plt.title('MFCCs self-similarity'), plt.xlabel('Time'), plt.ylabel('Time')
plt.tight_layout()
plt.show()
import os
from shutil import copyfile
# The NSynth dataset can be downloaded from: https://magenta.tensorflow.org/datasets/nsynth
# Define the folders
folder_path = r'nsynth\nsynth-train\audio'
folder_path2 = r'nsynth11'
# Define the instrument names, numbers, and MIDIs
instrument_list = [{'name':'bass_acoustic', 'number': '000', 'midi': 24}, \
{'name':'brass_acoustic', 'number': '000', 'midi': 60}, \
{'name':'flute_acoustic', 'number': '000', 'midi': 60}, \
{'name':'guitar_acoustic', 'number': '000', 'midi': 60}, \
{'name':'keyboard_acoustic', 'number': '000', 'midi': 60}, \
{'name':'mallet_acoustic', 'number': '000', 'midi': 72}, \
{'name':'organ_electronic', 'number': '000', 'midi': 60}, \
{'name':'reed_acoustic', 'number': '000', 'midi': 60}, \
{'name':'string_acoustic', 'number': '000', 'midi': 60}, \
{'name':'synth_lead_synthetic', 'number': '000', 'midi': 60}, \
{'name':'vocal_acoustic', 'number': '002', 'midi': 60}]
# Loop over the list of notes to create the dataset
os.makedirs(folder_path2, exist_ok=True)
number_semitones = 12
for i in instrument_list:
for j in range(i['midi'], i['midi']+number_semitones):
file_name = f"{i['name']}_{i['number']}-{j:03d}-075.wav"
file_path = os.path.join(folder_path, file_name)
file_path2 = os.path.join(folder_path2, file_name)
copyfile(file_path, file_path2)
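After the copies, the new folder should contain 11 instruments times 12 semitones, i.e., 132 files:
# Check the number of copied files (11 instruments x 12 semitones)
print(len(os.listdir(folder_path2)))  # 132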
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
# Define a function to compute the CQHCs
def cqhc(audio_signal, sampling_frequency, number_coefficients=20):
    # Compute the CQT spectrogram from the signal
step_length = int(pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))/2)
octave_resolution = 12
minimum_frequency = 32.70
maximum_frequency = sampling_frequency/2
number_frequencies = round(octave_resolution * np.log2(maximum_frequency / minimum_frequency))
cqt_spectrogram = np.abs(librosa.cqt(audio_signal, sr=sampling_frequency, hop_length=step_length, \
fmin=minimum_frequency, n_bins=number_frequencies, \
bins_per_octave=octave_resolution))
# Compute the FT of the columns in the CQT spectrogram and their magnitude
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and the pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# # Refine the spectral component
# pitch_component[pitch_component<0] = 0
# spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
# axis=0)+1e-16), axis=0)[0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Define a function to compute the MFCCs
def mfcc(audio_signal, sampling_frequency, number_coefficients=20):
# Compute the MFCCs using librosa's function
window_length = pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))
step_length = int(window_length/2)
audio_mfcc = librosa.feature.mfcc(y=audio_signal, sr=sampling_frequency, n_mfcc=number_coefficients,
n_fft=window_length, hop_length=step_length)
return audio_mfcc
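As a minimal smoke test (on a hypothetical synthetic 220 Hz tone, not an NSynth file), both functions return a coefficients-by-frames array:
# Hypothetical smoke test on one second of a synthetic 220 Hz tone
test_signal = np.sin(2*np.pi*220*np.arange(16000)/16000)
print(cqhc(test_signal, 16000).shape)  # (20, number_of_time_frames)
print(mfcc(test_signal, 16000).shape)  # (20, number_of_time_frames)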
# Get the path to the folder and its files
folder_path = r'nsynth11'
folder_listdir = os.listdir(folder_path)
number_files = len(folder_listdir)
# Create an empty list for storing dictionaries
audio_list = []
# Loop over the files
k = 0
for file_name in folder_listdir:
k = k+1
# Display the name of the file
print(f'{k}/{number_files}: {file_name}')
# Get the path to the audio file and load it
file_path = os.path.join(folder_path, file_name)
audio_signal, sampling_frequency = librosa.load(file_path, sr=None, mono=True)
# Compute the CQHCs and the MFCCs
audio_cqhc = cqhc(audio_signal, sampling_frequency)
audio_mfcc = mfcc(audio_signal, sampling_frequency)
# Create a dictionary for the current file and append it to the list
audio_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc, 'mfcc': audio_mfcc}
audio_list.append(audio_dict)
[Cell output: 132 progress lines, from '1/132: bass_acoustic_000-024-075.wav' through '132/132: vocal_acoustic_002-071-075.wav']
# Initialize the note similarity matrices for the CQHCs and the MFCCs
number_files = len(audio_list)
cqhc_similarities = np.zeros((number_files, number_files))
mfcc_similarities = np.zeros((number_files, number_files))
# Loop over the rows of the matrices
for i in range(number_files):
# Get the CQHCs and MFCCs for the current audio and normalize them
audio_cqhc0 = audio_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
audio_mfcc0 = audio_list[i]['mfcc']
audio_mfcc0 = audio_mfcc0/(np.sqrt(np.sum(np.power(audio_mfcc0, 2), axis=None))+1e-16)
# Loop over the columns of the matrices
for j in range(number_files):
        # Get the CQHCs and MFCCs for the current audio and normalize them
audio_cqhc1 = audio_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
audio_mfcc1 = audio_list[j]['mfcc']
audio_mfcc1 = audio_mfcc1/(np.sqrt(np.sum(np.power(audio_mfcc1, 2), axis=None))+1e-16)
        # Compute the note similarity between the CQHCs and between the MFCCs
cqhc_similarities[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
mfcc_similarities[i, j] = np.sum(audio_mfcc0*audio_mfcc1, axis=None)
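The double loop above can also be written without loops, by stacking the flattened and normalized features and taking a Gram matrix (a minimal vectorized sketch, assuming all the files have the same number of frames, which holds for the 4-second NSynth notes):
# Vectorized equivalent of the double loop above (sketch, shown for the CQHCs)
cqhc_stack = np.stack([audio_dict['cqhc'].flatten() for audio_dict in audio_list])
cqhc_stack = cqhc_stack/(np.linalg.norm(cqhc_stack, axis=1, keepdims=True)+1e-16)
assert np.allclose(cqhc_similarities, np.matmul(cqhc_stack, cqhc_stack.T))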
# Display the note similarity matrices for the CQHCs and the MFCCs
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1), plt.imshow(cqhc_similarities, cmap="jet", aspect="auto", vmin=0, vmax=1, origin="lower")
plt.title('CQHC note similarities'), plt.xlabel('Note index'), plt.ylabel('Note index')
plt.subplot(1, 2, 2), plt.imshow(mfcc_similarities, cmap="jet", aspect="auto", vmin=0, vmax=1, origin="lower")
plt.title('MFCC note similarities'), plt.xlabel('Note index'), plt.ylabel('Note index')
plt.tight_layout()
plt.show()
# Initialize the instrument similarity matrices and the final score vectors
number_instruments = 11
cqhc_similarities2 = np.zeros((number_instruments, number_instruments))
mfcc_similarities2 = np.zeros((number_instruments, number_instruments))
cqhc_scores2 = np.zeros(number_instruments)
mfcc_scores2 = np.zeros(number_instruments)
# Compute the similarity averaged over the instruments
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_similarities2[i, j] = np.mean(cqhc_similarities[i*12:(i+1)*12, j*12:(j+1)*12])
mfcc_similarities2[i, j] = np.mean(mfcc_similarities[i*12:(i+1)*12, j*12:(j+1)*12])
# Display the instrument similarity matrices
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1), plt.imshow(cqhc_similarities2, cmap="jet", aspect="auto", vmin=0, vmax=1, origin="lower")
plt.title('CQHC instrument similarities'), plt.xlabel('Instrument index'), plt.ylabel('Instrument index')
plt.subplot(1, 2, 2), plt.imshow(mfcc_similarities2, cmap="jet", aspect="auto", vmin=0, vmax=1, origin="lower")
plt.title('MFCC instrument similarities'), plt.xlabel('Instrument index'), plt.ylabel('Instrument index')
plt.tight_layout()
plt.show()
# Compute the final scores (mean between self-similarity and 1 minus the averaged cross-similarities)
for i in range(number_instruments):
cqhc_scores2[i] = (cqhc_similarities2[i, i] \
+ 1-((np.sum(cqhc_similarities2[i, :])-cqhc_similarities2[i, i])/(number_instruments-1)))/2
mfcc_scores2[i] = (mfcc_similarities2[i, i] \
+ 1-((np.sum(mfcc_similarities2[i, :])-mfcc_similarities2[i, i])/(number_instruments-1)))/2
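Since this self-versus-cross score is recomputed for every feature variant later in the notebook, it could be factored into a small helper (a sketch; the explicit loops are kept below):
# Sketch of a helper computing the per-instrument scores from an instrument similarity matrix
def instrument_scores(similarity_matrix):
    # Mean of the self-similarity and of 1 minus the average cross-similarity, per instrument
    number_instruments = similarity_matrix.shape[0]
    self_similarities = np.diag(similarity_matrix)
    cross_similarities = (np.sum(similarity_matrix, axis=1)-self_similarities)/(number_instruments-1)
    return (self_similarities+1-cross_similarities)/2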
# Display the final scores
plt.figure(figsize=(14, 2))
plt.plot(cqhc_scores2, label='CQHC')
plt.plot(mfcc_scores2, label='MFCC')
plt.title('Instrument scores')
plt.xlabel('Instrument index')
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
# Get the path to the folder and the files
folder_path = r'nsynth11'
folder_listdir = os.listdir(folder_path)
number_files = len(folder_listdir)
number_instruments = 11
# Define a function to compute the CQT spectrogram
def cqtspectrogram(audio_signal, sampling_frequency):
    # Compute the CQT spectrogram from the signal
step_length = int(pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))/2)
octave_resolution = 12
minimum_frequency = 32.70
maximum_frequency = sampling_frequency/2
number_frequencies = round(octave_resolution * np.log2(maximum_frequency / minimum_frequency))
cqt_spectrogram = np.abs(librosa.cqt(audio_signal, sr=sampling_frequency, hop_length=step_length, \
fmin=minimum_frequency, n_bins=number_frequencies, \
bins_per_octave=octave_resolution))
return cqt_spectrogram
# Define a function to compute the MFCCs
def mfcc(audio_signal, sampling_frequency, n_mfcc=20):
# Compute the MFCCs using librosa's function
window_length = pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))
step_length = int(window_length/2)
    audio_mfcc = librosa.feature.mfcc(y=audio_signal, sr=sampling_frequency, n_mfcc=n_mfcc, n_fft=window_length, hop_length=step_length)
return audio_mfcc
# Loop over the files to store the CQT spectrograms and MFCCs
audio_list = []
k = 0
for file_name in folder_listdir:
k = k+1
# Get the path to the audio file and load it
file_path = os.path.join(folder_path, file_name)
audio_signal, sampling_frequency = librosa.load(file_path, sr=None, mono=True)
# Compute the CQT spectrogram and the MFCCs
cqt_spectrogram = cqtspectrogram(audio_signal, sampling_frequency)
audio_mfcc = mfcc(audio_signal, sampling_frequency)
# Create a dictionary for the current file and append it to the list
audio_dict = {'name': file_name[0:-4], 'cqt': cqt_spectrogram, 'mfcc': audio_mfcc}
audio_list.append(audio_dict)
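# Compare several CQHC variants (magnitude vs power CQT, with and without refinement
# and positivity) and the MFCCs, accumulating all the final score vectors on one figure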
plt.figure(figsize=(14, 3))
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# # Refine the spectral component
# pitch_component[pitch_component<0] = 0
    # spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
# axis=0)+1e-16), axis=0)\
# [0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
    # Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (mag)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# Refine the spectral component
pitch_component[pitch_component<0] = 0
spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
axis=0)+1e-16), axis=0)\
[0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (mag, ref)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# Refine the spectral component
pitch_component[pitch_component<0] = 0
spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
axis=0)+1e-16), axis=0)\
[0:number_frequencies, :])
spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (mag, ref, pos)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(np.power(cqt_spectrogram, 2), 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# # Refine the spectral component
# pitch_component[pitch_component<0] = 0
    # spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
# axis=0)+1e-16), axis=0)\
# [0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (pow)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(np.power(cqt_spectrogram, 2), 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# Refine the spectral component
pitch_component[pitch_component<0] = 0
spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
axis=0)+1e-16), axis=0)\
[0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (pow, ref)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(np.power(cqt_spectrogram, 2), 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# Refine the spectral component
pitch_component[pitch_component<0] = 0
spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
axis=0)+1e-16), axis=0)\
[0:number_frequencies, :])
spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (pow, ref, pos)')
# Loop over the rows and columns of the matrix
mfcc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
# Get the MFCCs for the current audio and normalize them
audio_mfcc0 = audio_list[i]['mfcc']
audio_mfcc0 = audio_mfcc0/(np.sqrt(np.sum(np.power(audio_mfcc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the MFCCs for the current audio and normalize them
audio_mfcc1 = audio_list[j]['mfcc']
audio_mfcc1 = audio_mfcc1/(np.sqrt(np.sum(np.power(audio_mfcc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the MFCCs
mfcc_matrix[i, j] = np.sum(audio_mfcc0*audio_mfcc1, axis=None)
# Compute the similarity averaged over the instrument classes
mfcc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
mfcc_matrix2[i, j] = np.mean(mfcc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
mfcc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
mfcc_vector2[i] = (mfcc_matrix2[i, i] \
+ 1-((np.sum(mfcc_matrix2[i, :])-mfcc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(mfcc_vector2, label='MFCCs')
plt.title('Instrument scores')
plt.xlabel('Instrument index')
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()