This Jupyter notebook shows some informal, exploratory tests for extracting and experimenting with the constant-Q harmonic coefficients (CQHCs).
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Define the parameters for the notes to concatenate
folder_path = r'nsynth\nsynth-train\audio'
instrument_names = ['bass_acoustic_000']
note_number = 24
note_numbers = np.arange(note_number, note_number+12)
velocity_number = 75
sampling_frequency = 16000
# Loop over the instrument names and note numbers to concatenate the notes
audio_signal = np.empty(0)
for instrument_name in instrument_names:
for note_number in note_numbers:
# Get the path to the file
file_name = f'{instrument_name}-{note_number:03d}-{velocity_number:03d}.wav'
file_path = os.path.join(folder_path, file_name)
        # Load the current audio signal and concatenate it with the previous ones
audio_signal1, _ = librosa.load(file_path, sr=sampling_frequency, mono=True)
audio_signal = np.concatenate((audio_signal, audio_signal1))
# Compute the CQT spectrogram of the signal
step_length = int(pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))/2)
minimum_frequency = 32.70
maximum_frequency = sampling_frequency/2
octave_resolution = 12
number_frequencies = round(octave_resolution * np.log2(maximum_frequency / minimum_frequency))
audio_cqt = librosa.cqt(audio_signal, sr=sampling_frequency, hop_length=step_length, fmin=minimum_frequency, \
n_bins=number_frequencies, bins_per_octave=octave_resolution)
cqt_spectrogram = np.abs(audio_cqt)
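As a quick sanity check (the values assume the 16 kHz sampling frequency set above), the ~40 ms window rounds up to 1,024 samples, so the step is 512 samples, and the range from 32.70 Hz (C1) to the 8 kHz Nyquist frequency spans about 95 semitone bins:
# Verify the derived CQT parameters (assuming sampling_frequency = 16000)
print(f'step_length = {step_length}')  # 512
print(f'number_frequencies = {number_frequencies}')  # 95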
# Display the audio signal and the CQT spectrogram
plt.figure(figsize=(14, 2))
librosa.display.waveshow(audio_signal, sr=sampling_frequency)  # waveshow replaces waveplot, which was removed in librosa 0.10
plt.title('Audio signal')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 4))
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, x_axis='time', y_axis='cqt_note', bins_per_octave=octave_resolution)
plt.title('CQT spectrogram')
plt.tight_layout()
plt.show()
# Derive the spectral component (the CQT envelope) and the pitch component (the CQT pitch)
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
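# (Along log-frequency, a harmonic CQT frame is approximately the convolution of a spectral
# envelope with a pitch-dependent impulse train; the magnitude of the FT keeps the
# shift-invariant envelope, while the phase-only FT keeps the shift, i.e., the pitch.)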
# Resynthesize the CQT spectrogram by convolving the spectral component and pitch component
number_times = np.shape(cqt_spectrogram)[1]
cqt_spectrogram2 = np.zeros((number_frequencies, number_times))
for i in range(number_times):
cqt_spectrogram2[:, i] = np.convolve(spectral_component[:, i], pitch_component[:, i])[0:number_frequencies]
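# (The full convolution has 2*number_frequencies-1 samples; keeping the first
# number_frequencies samples matches the size of the original CQT frames.)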
# Display the CQT spectrogram, the spectral component, and pitch component, and the resynthesized CQT spectrogram
j = 10  # index of the time frame shown in the frame-wise plots below
plt.figure(figsize=(14, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('CQT spectrogram')
plt.subplot(1, 3, 2)
librosa.display.specshow(librosa.amplitude_to_db(spectral_component, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, x_axis='time')
plt.title('Spectral component')
plt.subplot(1, 3, 3)
librosa.display.specshow(pitch_component, sr=sampling_frequency, hop_length=step_length, fmin=minimum_frequency, \
bins_per_octave=octave_resolution, x_axis='time', y_axis='cqt_note')
plt.title('Pitch component')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 2))
plt.subplot(1, 3, 1), plt.plot(cqt_spectrogram[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 2), plt.plot(spectral_component[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 3), plt.plot(pitch_component[:, j]), plt.ylim(top=1)
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('CQT spectrogram')
plt.subplot(1, 3, 2)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram2, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('Resynthesized CQT spectrogram')
plt.subplot(1, 3, 3)
librosa.display.specshow(cqt_spectrogram-cqt_spectrogram2, sr=sampling_frequency, hop_length=step_length, \
fmin=minimum_frequency, bins_per_octave=octave_resolution, x_axis='time', y_axis='cqt_note')
rms_value = np.round(np.sqrt(np.mean(np.power(cqt_spectrogram-cqt_spectrogram2, 2))), 3)
plt.title(f'Differences (RMS={rms_value})')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 2))
plt.subplot(1, 3, 1), plt.plot(cqt_spectrogram[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 2), plt.plot(cqt_spectrogram2[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 3), plt.plot(cqt_spectrogram[:, j]-cqt_spectrogram2[:, j]), plt.ylim(top=max(cqt_spectrogram[:, j]))
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.tight_layout()
plt.show()
# Refine the pitch component by zeroing its negative values, and then re-derive the spectral component by deconvolving the CQT spectrogram by the refined pitch component
pitch_component2 = np.copy(pitch_component)
pitch_component2[pitch_component2 < 0] = 0
spectral_component2 = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component2, 2*number_frequencies-1, \
axis=0)+1e-7), axis=0)[0:number_frequencies, :])
# Resynthesize the CQT spectrogram by convolving the refined spectral component and pitch component
cqt_spectrogram2 = np.zeros((number_frequencies, number_times))
for i in range(number_times):
cqt_spectrogram2[:, i] = np.convolve(spectral_component2[:, i], pitch_component2[:, i])[0:number_frequencies]
# Display everything again with the refined versions
plt.figure(figsize=(14, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('CQT spectrogram')
plt.subplot(1, 3, 2)
librosa.display.specshow(librosa.amplitude_to_db(spectral_component2, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, x_axis='time')
plt.title('Refined spectral component')
plt.subplot(1, 3, 3)
librosa.display.specshow(pitch_component2, sr=sampling_frequency, hop_length=step_length, fmin=minimum_frequency, \
bins_per_octave=octave_resolution, x_axis='time', y_axis='cqt_note')
plt.title('Refined pitch component')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 2))
plt.subplot(1, 3, 1), plt.plot(cqt_spectrogram[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 2), plt.plot(spectral_component2[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 3), plt.plot(pitch_component2[:, j]), plt.ylim(top=1)
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('CQT spectrogram')
plt.subplot(1, 3, 2)
librosa.display.specshow(librosa.amplitude_to_db(cqt_spectrogram2, ref=np.max), sr=sampling_frequency, \
hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution, \
x_axis='time', y_axis='cqt_note')
plt.title('Resynthesized CQT spectrogram')
plt.subplot(1, 3, 3)
librosa.display.specshow(cqt_spectrogram-cqt_spectrogram2, sr=sampling_frequency, hop_length=step_length, \
fmin=minimum_frequency, bins_per_octave=octave_resolution, x_axis='time', y_axis='cqt_note')
rms_value = np.round(np.sqrt(np.mean(np.power(cqt_spectrogram-cqt_spectrogram2, 2))), 3)
plt.title(f'Differences (RMS={rms_value})')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 2))
plt.subplot(1, 3, 1), plt.plot(cqt_spectrogram[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 2), plt.plot(cqt_spectrogram2[:, j])
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.subplot(1, 3, 3), plt.plot(cqt_spectrogram[:, j]-cqt_spectrogram2[:, j]), plt.ylim(top=max(cqt_spectrogram[:, j]))
plt.title('One time frame'), plt.xlabel('Frequency index'), plt.ylabel('Energy')
plt.tight_layout()
plt.show()
# # Resynthesize the signal
# audio_signal2 = librosa.icqt(cqt_spectrogram2*audio_cqt/cqt_spectrogram, sr=sampling_frequency, \
# hop_length=step_length, fmin=minimum_frequency, bins_per_octave=octave_resolution)
# audio_signal2 = np.max(abs(audio_signal))*audio_signal2/np.max(abs(audio_signal2))
# audio_signal2 = np.pad(audio_signal2, (0, len(audio_signal)-len(audio_signal2)), 'constant', constant_values=0)
# Extract the CQHCs from the spectral component
number_coefficients = 20
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhcs = spectral_component[coefficient_indices, :]
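Since the harmonics of a note are log-spaced, the kth coefficient is taken at the CQT bin closest to the kth harmonic, i.e., round(12 log2(k)) semitones above the fundamental:
# The first few harmonic indices: k = 1...5 map to bins 0, 12, 19, 24, and 28
print(coefficient_indices[:5])  # [ 0 12 19 24 28]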
# Compute the MFCCs using librosa
window_length = pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))
step_length = int(window_length/2)
audio_mfcc = librosa.feature.mfcc(y=audio_signal, sr=sampling_frequency, n_fft=window_length, hop_length=step_length)
# Compute the self-similarity matrices for the CQHCs and the MFCCs
normalized_feature = audio_cqhcs/(np.sqrt(np.sum(np.power(audio_cqhcs, 2), axis=0))+1e-16)
similarity_matrix1 = np.matmul(normalized_feature.T, normalized_feature)
normalized_feature = audio_mfcc/(np.sqrt(np.sum(np.power(audio_mfcc, 2), axis=0))+1e-16)
similarity_matrix2 = np.matmul(normalized_feature.T, normalized_feature)
# Plot the features and their self-similarity matrices
plt.figure(figsize=(14, 3))
plt.subplot(1, 2, 1), plt.imshow(audio_cqhcs, aspect='auto', cmap='jet', origin='lower')
plt.title('CQHCs'), plt.xlabel('Time'), plt.ylabel('Coefficient')
plt.subplot(1, 2, 2), plt.imshow(audio_mfcc, cmap='jet', aspect='auto', origin='lower')
plt.title('MFCCs'), plt.xlabel('Time'), plt.ylabel('Coefficient')
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1), plt.imshow(similarity_matrix1, cmap='gray', aspect='auto', origin='lower', vmin=0.9, vmax=1)
plt.title('CQHCs self-similarity'), plt.xlabel('Time'), plt.ylabel('Time')
plt.subplot(1, 2, 2), plt.imshow(similarity_matrix2, cmap='gray', aspect='auto', origin='lower', vmin=0.9, vmax=1)
plt.title('MFCCs self-similarity'), plt.xlabel('Time'), plt.ylabel('Time')
plt.tight_layout()
plt.show()
import os
from shutil import copyfile
# The NSynth dataset can be downloaded from: https://magenta.tensorflow.org/datasets/nsynth
# Define the folders
folder_path = r'nsynth\nsynth-train\audio'
folder_path2 = r'nsynth11'
# Define the instrument names, numbers, and MIDIs
instrument_list = [{'name':'bass_acoustic', 'number': '000', 'midi': 24}, \
{'name':'brass_acoustic', 'number': '000', 'midi': 60}, \
{'name':'flute_acoustic', 'number': '000', 'midi': 60}, \
{'name':'guitar_acoustic', 'number': '000', 'midi': 60}, \
{'name':'keyboard_acoustic', 'number': '000', 'midi': 60}, \
{'name':'mallet_acoustic', 'number': '000', 'midi': 72}, \
{'name':'organ_electronic', 'number': '000', 'midi': 60}, \
{'name':'reed_acoustic', 'number': '000', 'midi': 60}, \
{'name':'string_acoustic', 'number': '000', 'midi': 60}, \
{'name':'synth_lead_synthetic', 'number': '000', 'midi': 60}, \
{'name':'vocal_acoustic', 'number': '002', 'midi': 60}]
# Loop over the list of notes to create the dataset
os.makedirs(folder_path2, exist_ok=True)
number_semitones = 12
for i in instrument_list:
for j in range(i['midi'], i['midi']+number_semitones):
file_name = f"{i['name']}_{i['number']}-{j:03d}-075.wav"
file_path = os.path.join(folder_path, file_name)
file_path2 = os.path.join(folder_path2, file_name)
copyfile(file_path, file_path2)
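After the copies, the new folder should contain 11 instruments times 12 semitones, i.e., 132 files:
# Check the number of copied files (11 instruments x 12 semitones)
print(len(os.listdir(folder_path2)))  # 132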
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
# Define a function to compute the CQHCs
def cqhc(audio_signal, sampling_frequency, number_coefficients=20):
    # Compute the CQT spectrogram from the signal
step_length = int(pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))/2)
octave_resolution = 12
minimum_frequency = 32.70
maximum_frequency = sampling_frequency/2
number_frequencies = round(octave_resolution * np.log2(maximum_frequency / minimum_frequency))
cqt_spectrogram = np.abs(librosa.cqt(audio_signal, sr=sampling_frequency, hop_length=step_length, \
fmin=minimum_frequency, n_bins=number_frequencies, \
bins_per_octave=octave_resolution))
# Compute the FT of the columns in the CQT spectrogram and their magnitude
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and the pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# # Refine the spectral component
# pitch_component[pitch_component<0] = 0
# spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
# axis=0)+1e-16), axis=0)[0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Define a function to compute the MFCCs
def mfcc(audio_signal, sampling_frequency, number_coefficients=20):
# Compute the MFCCs using librosa's function
window_length = pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))
step_length = int(window_length/2)
audio_mfcc = librosa.feature.mfcc(y=audio_signal, sr=sampling_frequency, n_mfcc=number_coefficients,
n_fft=window_length, hop_length=step_length)
return audio_mfcc
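As a minimal smoke test (on a hypothetical synthetic 220 Hz tone, not an NSynth file), both functions return a coefficients-by-frames array:
# Hypothetical smoke test on one second of a synthetic 220 Hz tone
test_signal = np.sin(2*np.pi*220*np.arange(16000)/16000)
print(cqhc(test_signal, 16000).shape)  # (20, number_of_time_frames)
print(mfcc(test_signal, 16000).shape)  # (20, number_of_time_frames)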
# Get the path to the folder and its files
folder_path = r'nsynth11'
folder_listdir = os.listdir(folder_path)
number_files = len(folder_listdir)
# Create an empty list for storing dictionaries
audio_list = []
# Loop over the files
k = 0
for file_name in folder_listdir:
k = k+1
# Display the name of the file
print(f'{k}/{number_files}: {file_name}')
# Get the path to the audio file and load it
file_path = os.path.join(folder_path, file_name)
audio_signal, sampling_frequency = librosa.load(file_path, sr=None, mono=True)
# Compute the CQHCs and the MFCCs
audio_cqhc = cqhc(audio_signal, sampling_frequency)
audio_mfcc = mfcc(audio_signal, sampling_frequency)
# Create a dictionary for the current file and append it to the list
audio_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc, 'mfcc': audio_mfcc}
audio_list.append(audio_dict)
[Cell output: 132 progress lines, from '1/132: bass_acoustic_000-024-075.wav' through '132/132: vocal_acoustic_002-071-075.wav']
# Initialize the note similarity matrices for the CQHCs and the MFCCs
number_files = len(audio_list)
cqhc_similarities = np.zeros((number_files, number_files))
mfcc_similarities = np.zeros((number_files, number_files))
# Loop over the rows of the matrices
for i in range(number_files):
# Get the CQHCs and MFCCs for the current audio and normalize them
audio_cqhc0 = audio_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
audio_mfcc0 = audio_list[i]['mfcc']
audio_mfcc0 = audio_mfcc0/(np.sqrt(np.sum(np.power(audio_mfcc0, 2), axis=None))+1e-16)
# Loop over the columns of the matrices
for j in range(number_files):
        # Get the CQHCs and MFCCs for the current audio and normalize them
audio_cqhc1 = audio_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
audio_mfcc1 = audio_list[j]['mfcc']
audio_mfcc1 = audio_mfcc1/(np.sqrt(np.sum(np.power(audio_mfcc1, 2), axis=None))+1e-16)
        # Compute the note similarity between the CQHCs and between the MFCCs
cqhc_similarities[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
mfcc_similarities[i, j] = np.sum(audio_mfcc0*audio_mfcc1, axis=None)
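The double loop above can also be written without loops, by stacking the flattened and normalized features and taking a Gram matrix (a minimal vectorized sketch, assuming all the files have the same number of frames, which holds for the 4-second NSynth notes):
# Vectorized equivalent of the double loop above (sketch, shown for the CQHCs)
cqhc_stack = np.stack([audio_dict['cqhc'].flatten() for audio_dict in audio_list])
cqhc_stack = cqhc_stack/(np.linalg.norm(cqhc_stack, axis=1, keepdims=True)+1e-16)
assert np.allclose(cqhc_similarities, np.matmul(cqhc_stack, cqhc_stack.T))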
# Display the note similarity matrices for the CQHCs and the MFCCs
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1), plt.imshow(cqhc_similarities, cmap="jet", aspect="auto", vmin=0, vmax=1, origin="lower")
plt.title('CQHC note similarities'), plt.xlabel('Note index'), plt.ylabel('Note index')
plt.subplot(1, 2, 2), plt.imshow(mfcc_similarities, cmap="jet", aspect="auto", vmin=0, vmax=1, origin="lower")
plt.title('MFCC note similarities'), plt.xlabel('Note index'), plt.ylabel('Note index')
plt.tight_layout()
plt.show()
# Initialize the instrument similarity matrices and the final score vectors
number_instruments = 11
cqhc_similarities2 = np.zeros((number_instruments, number_instruments))
mfcc_similarities2 = np.zeros((number_instruments, number_instruments))
cqhc_scores2 = np.zeros(number_instruments)
mfcc_scores2 = np.zeros(number_instruments)
# Compute the similarity averaged over the instruments
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_similarities2[i, j] = np.mean(cqhc_similarities[i*12:(i+1)*12, j*12:(j+1)*12])
mfcc_similarities2[i, j] = np.mean(mfcc_similarities[i*12:(i+1)*12, j*12:(j+1)*12])
# Display the instrument similarity matrices
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1), plt.imshow(cqhc_similarities2, cmap="jet", aspect="auto", vmin=0, vmax=1, origin="lower")
plt.title('CQHC instrument similarities'), plt.xlabel('Instrument index'), plt.ylabel('Instrument index')
plt.subplot(1, 2, 2), plt.imshow(mfcc_similarities2, cmap="jet", aspect="auto", vmin=0, vmax=1, origin="lower")
plt.title('MFCC instrument similarities'), plt.xlabel('Instrument index'), plt.ylabel('Instrument index')
plt.tight_layout()
plt.show()
# Compute the final scores (mean between self-similarity and 1 minus the averaged cross-similarities)
for i in range(number_instruments):
cqhc_scores2[i] = (cqhc_similarities2[i, i] \
+ 1-((np.sum(cqhc_similarities2[i, :])-cqhc_similarities2[i, i])/(number_instruments-1)))/2
mfcc_scores2[i] = (mfcc_similarities2[i, i] \
+ 1-((np.sum(mfcc_similarities2[i, :])-mfcc_similarities2[i, i])/(number_instruments-1)))/2
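Since this self-versus-cross score is recomputed for every feature variant later in the notebook, it could be factored into a small helper (a sketch; the explicit loops are kept below):
# Sketch of a helper computing the per-instrument scores from an instrument similarity matrix
def instrument_scores(similarity_matrix):
    # Mean of the self-similarity and of 1 minus the average cross-similarity, per instrument
    number_instruments = similarity_matrix.shape[0]
    self_similarities = np.diag(similarity_matrix)
    cross_similarities = (np.sum(similarity_matrix, axis=1)-self_similarities)/(number_instruments-1)
    return (self_similarities+1-cross_similarities)/2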
# Display the final scores
plt.figure(figsize=(14, 2))
plt.plot(cqhc_scores2, label='CQHC')
plt.plot(mfcc_scores2, label='MFCC')
plt.title('Instrument scores')
plt.xlabel('Instrument index')
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
# Get the path to the folder and the files
folder_path = r'nsynth11'
folder_listdir = os.listdir(folder_path)
number_files = len(folder_listdir)
number_instruments = 11
# Define a function to compute the CQT spectrogram
def cqtspectrogram(audio_signal, sampling_frequency):
    # Compute the CQT spectrogram from the signal
step_length = int(pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))/2)
octave_resolution = 12
minimum_frequency = 32.70
maximum_frequency = sampling_frequency/2
number_frequencies = round(octave_resolution * np.log2(maximum_frequency / minimum_frequency))
cqt_spectrogram = np.abs(librosa.cqt(audio_signal, sr=sampling_frequency, hop_length=step_length, \
fmin=minimum_frequency, n_bins=number_frequencies, \
bins_per_octave=octave_resolution))
return cqt_spectrogram
# Define a function to compute the MFCCs
def mfcc(audio_signal, sampling_frequency, n_mfcc=20):
# Compute the MFCCs using librosa's function
window_length = pow(2, int(np.ceil(np.log2(0.04*sampling_frequency))))
step_length = int(window_length/2)
    audio_mfcc = librosa.feature.mfcc(y=audio_signal, sr=sampling_frequency, n_mfcc=n_mfcc, n_fft=window_length, hop_length=step_length)
return audio_mfcc
# Loop over the files to store the CQT spectrograms and MFCCs
audio_list = []
k = 0
for file_name in folder_listdir:
k = k+1
# Get the path to the audio file and load it
file_path = os.path.join(folder_path, file_name)
audio_signal, sampling_frequency = librosa.load(file_path, sr=None, mono=True)
# Compute the CQT spectrogram and the MFCCs
cqt_spectrogram = cqtspectrogram(audio_signal, sampling_frequency)
audio_mfcc = mfcc(audio_signal, sampling_frequency)
# Create a dictionary for the current file and append it to the list
audio_dict = {'name': file_name[0:-4], 'cqt': cqt_spectrogram, 'mfcc': audio_mfcc}
audio_list.append(audio_dict)
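# Compare several CQHC variants (magnitude vs power CQT, with and without refinement
# and positivity) and the MFCCs, accumulating all the final score vectors on one figure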
plt.figure(figsize=(14, 3))
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# # Refine the spectral component
# pitch_component[pitch_component<0] = 0
    # spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
# axis=0)+1e-16), axis=0)\
# [0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
    # Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (mag)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# Refine the spectral component
pitch_component[pitch_component<0] = 0
spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
axis=0)+1e-16), axis=0)\
[0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (mag, ref)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(cqt_spectrogram, 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# Refine the spectral component
pitch_component[pitch_component<0] = 0
spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
axis=0)+1e-16), axis=0)\
[0:number_frequencies, :])
spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (mag, ref, pos)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(np.power(cqt_spectrogram, 2), 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# # Refine the spectral component
# pitch_component[pitch_component<0] = 0
    # spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
# axis=0)+1e-16), axis=0)\
# [0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (pow)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(np.power(cqt_spectrogram, 2), 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# Refine the spectral component
pitch_component[pitch_component<0] = 0
spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
axis=0)+1e-16), axis=0)\
[0:number_frequencies, :])
# spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (pow, ref)')
# Define a function to compute the CQHCs from the CQT spectrogram
def cqhc(cqt_spectrogram, number_coefficients=20):
# Compute the FT of the columns in the CQT spectrogram and their magnitude
number_frequencies = np.shape(cqt_spectrogram)[0]
ftcqt_spectrogram = np.fft.fft(np.power(cqt_spectrogram, 2), 2*number_frequencies-1, axis=0)
absftcqt_spectrogram = abs(ftcqt_spectrogram)
# Derive the spectral component and pitch component
spectral_component = np.real(np.fft.ifft(absftcqt_spectrogram, axis=0)[0:number_frequencies, :])
pitch_component = np.real(np.fft.ifft(ftcqt_spectrogram/(absftcqt_spectrogram+1e-16), axis=0)[0:number_frequencies, :])
# Refine the spectral component
pitch_component[pitch_component<0] = 0
spectral_component = np.real(np.fft.ifft(ftcqt_spectrogram/(np.fft.fft(pitch_component, 2*number_frequencies-1, \
axis=0)+1e-16), axis=0)\
[0:number_frequencies, :])
spectral_component[spectral_component<0] = 0
# Get the indices of the CQHCs and extract them
octave_resolution = 12
coefficient_indices = np.round(octave_resolution*np.log2(np.arange(1, number_coefficients+1))).astype(int)
audio_cqhc = spectral_component[coefficient_indices, :]
return audio_cqhc
# Loop over the files to extract the CQHCs
cqt_list = []
k = 0
for file_name in folder_listdir:
# Get the CQT spectrogram and extract the CQHCs
cqt_spectrogram = audio_list[k]['cqt']
audio_cqhc = cqhc(cqt_spectrogram)
# Create a dictionary for the current file and append it to the list
cqt_dict = {'name': file_name[0:-4], 'cqhc': audio_cqhc}
cqt_list.append(cqt_dict)
k = k+1
# Loop over the files twice to compute the cosine similarity matrix
cqhc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
    # Get the CQHCs for the current audio and normalize them
audio_cqhc0 = cqt_list[i]['cqhc']
audio_cqhc0 = audio_cqhc0/(np.sqrt(np.sum(np.power(audio_cqhc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the CQHCs for the current audio and normalize them
audio_cqhc1 = cqt_list[j]['cqhc']
audio_cqhc1 = audio_cqhc1/(np.sqrt(np.sum(np.power(audio_cqhc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the CQHCs
cqhc_matrix[i, j] = np.sum(audio_cqhc0*audio_cqhc1, axis=None)
# Compute the similarity averaged over the instrument classes
cqhc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
cqhc_matrix2[i, j] = np.mean(cqhc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
cqhc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
cqhc_vector2[i] = (cqhc_matrix2[i, i] \
+ 1-((np.sum(cqhc_matrix2[i, :])-cqhc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(cqhc_vector2, label='CQHCs (pow, ref, pos)')
# Loop over the rows and columns of the matrix
mfcc_matrix = np.zeros((number_files, number_files))
for i in range(number_files):
# Get the MFCCs for the current audio and normalize them
audio_mfcc0 = audio_list[i]['mfcc']
audio_mfcc0 = audio_mfcc0/(np.sqrt(np.sum(np.power(audio_mfcc0, 2), axis=None))+1e-16)
for j in range(number_files):
# Get the MFCCs for the current audio and normalize them
audio_mfcc1 = audio_list[j]['mfcc']
audio_mfcc1 = audio_mfcc1/(np.sqrt(np.sum(np.power(audio_mfcc1, 2), axis=None))+1e-16)
# Compute the cosine similarity between the MFCCs
mfcc_matrix[i, j] = np.sum(audio_mfcc0*audio_mfcc1, axis=None)
# Compute the similarity averaged over the instrument classes
mfcc_matrix2 = np.zeros((number_instruments, number_instruments))
for i in range(number_instruments):
for j in range(number_instruments):
mfcc_matrix2[i, j] = np.mean(mfcc_matrix[i*12:(i+1)*12, j*12:(j+1)*12])
# Compute the final score vectors (mean between self-similarity and 1 minus the averaged cross-similarities)
mfcc_vector2 = np.zeros(number_instruments)
for i in range(number_instruments):
mfcc_vector2[i] = (mfcc_matrix2[i, i] \
+ 1-((np.sum(mfcc_matrix2[i, :])-mfcc_matrix2[i, i])/(number_instruments-1)))/2
# Display the final score vectors
plt.plot(mfcc_vector2, label='MFCCs')
plt.title('Instrument scores')
plt.xlabel('Instrument index')
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()