#!/usr/bin/env python
# coding: utf-8

# ## Comparison of spectral envelope parametrization methods
#
# This notebook investigates the difference between two spectral envelope parametrization / recovering methods below:
#
# 1. WORLD's `code_spectral_envelope` and `decode_spectral_envelope`
# 2. [pysptk.sp2mc and pysptk.mc2sp](https://github.com/r9y9/pysptk/blob/023c71b7bc5e45a9eb0a41b3bb23d85d6b8dac85/pysptk/conversion.py#L64-L143), which resembles [SPTK's mcep function](https://github.com/r9y9/SPTK/blob/440f4e82da63319277484394b63fa7a97c5652e7/bin/mcep/_mcep.c#L49-L77) when the input type is 4 (i.e. periodogram) and its inverse transform.
#
# I measured [normalized mean square error (NMSE)](https://math.stackexchange.com/questions/488964/the-definition-of-nmse-normalized-mean-square-error) between spectral envelope and reconstructed spectral envelope.
# Synthesized audio examples are available in the notebook.
#
# ### Requirements
#
# - [pyworld](https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder) (needs https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/pull/8)
# - [pysptk](https://github.com/r9y9/pysptk)
# - [librosa](https://github.com/librosa/librosa)
# - matplotlib
# - seaborn
# - numpy

# In[1]:


# The pylab magic supplies the unqualified plotting/numeric names used below
# (figure, subplot, plot, title, log, rcParams, np, ...).
get_ipython().run_line_magic('pylab', 'inline')


# In[2]:


import matplotlib
import seaborn
seaborn.set(style="dark")
rcParams['figure.figsize'] = (16, 4)


# In[3]:


import pysptk
import pyworld
import librosa
import librosa.display
from IPython.display import Audio


# ## 0. Preparation
#
# ### Utils

# In[4]:


def decompose(x, fs, period=5.0):
    """Decompose speech signal into f0, spectral envelope and aperiodicity using WORLD.

    Args:
        x: Speech waveform handed to the WORLD analysis functions.
        fs: Sampling frequency of ``x``.
        period: Frame period passed to ``pyworld.harvest`` as ``frame_period``
            (milliseconds, matching the hop-length computation in this script).

    Returns:
        Tuple ``(f0, timeaxis, sp, ap)``: the Harvest F0 contour and its time
        axis, the CheapTrick spectral envelope and the D4C aperiodicity.
    """
    f0, timeaxis = pyworld.harvest(
        x, fs, frame_period=period, f0_floor=71.0, f0_ceil=800.0)
    sp = pyworld.cheaptrick(x, f0, timeaxis, fs)
    ap = pyworld.d4c(x, f0, timeaxis, fs)
    return f0, timeaxis, sp, ap


def vis(sp_param, approx_sp, x, y, top_title):
    """Visualize compressed spectral parameter, recovered spectral envelope
    and reconstructed speech waveform.

    Draws three stacked panels: the parameter matrix, the log of the
    recovered envelope, and both waveforms overlaid.

    Note: reads the module-level ``fs`` and ``hop_length`` at call time, so
    both must be defined before this function is called.

    Args:
        sp_param: Compressed spectral parameters (shown transposed).
        approx_sp: Spectral envelope recovered from ``sp_param``.
        x: Original speech signal.
        y: Reconstructed speech signal.
        top_title: Title of the first panel.
    """
    figure(figsize=(16, 14))
    subplots_adjust(hspace=0.4)

    subplot(3, 1, 1)
    librosa.display.specshow(sp_param.T, sr=fs, hop_length=hop_length,
                             x_axis="time", y_axis="linear")
    title(top_title)

    subplot(3, 1, 2)
    # NOTE(review): `log` is the name injected by %pylab (natural log), so the
    # scale is proportional to, not equal to, the dB value the title suggests.
    librosa.display.specshow(10*log(approx_sp).T, sr=fs, hop_length=hop_length,
                             x_axis="time", y_axis="linear")
    title("20log|H(w)|")

    subplot(3, 1, 3)
    # The reconstruction is drawn first so the original is overlaid on top.
    plot(y, "r-+", label="reconstructed speech signal")
    plot(x, label="original speech signal")
    xlabel("sample")
    legend(fontsize=14)


# Frequency warping parameter for each supported sampling frequency.
_MCEP_ALPHA_BY_FS = {
    16000: 0.41,
    22050: 0.455,
    44100: 0.544,
    48000: 0.554,
}


def mcepalpha(fs):
    """Determine frequency warping parameter from fs (simplest implementation)

    See https://bitbucket.org/happyalu/mcep_alpha_calc/ for details.

    Args:
        fs: Sampling frequency; one of 16000, 22050, 44100 or 48000.

    Returns:
        The tabulated warping parameter for ``fs``.

    Raises:
        NotImplementedError: If ``fs`` is not one of the tabulated rates.
    """
    alpha = _MCEP_ALPHA_BY_FS.get(fs)
    if alpha is None:
        raise NotImplementedError(
            "no frequency warping parameter tabulated for fs={}".format(fs))
    return alpha


def log_spectral_nmse(sp, approx_sp):
    """Return the normalized error between two spectral envelopes in the log domain.

    Computed as ``norm(log(sp) - log(approx_sp)) / norm(log(sp))``.

    NOTE(review): this is a ratio of norms, not of squared norms; kept as the
    notebook originally computed it so reported numbers stay comparable.

    Args:
        sp: Reference spectral envelope.
        approx_sp: Reconstructed spectral envelope with the same shape.
    """
    log_sp = np.log(sp)
    return np.linalg.norm(log_sp - np.log(approx_sp)) / np.linalg.norm(log_sp)


# ### Input data

# In[5]:


path = pysptk.util.example_audio_file()  # from cmu_arctic
x, fs = librosa.load(path, sr=None)
x = x.astype(np.float64)
# NOTE(review): newer librosa releases provide `waveshow` in place of
# `waveplot` -- confirm against the installed version.
librosa.display.waveplot(x, sr=fs)
Audio(x, rate=fs)


# In[6]:


frame_period = 5.0
# frame_period is in milliseconds; convert to a hop size in samples.
hop_length = int(fs * frame_period * 0.001)
fftlen = pyworld.get_cheaptrick_fft_size(fs)
f0, timeaxis, sp, ap = decompose(x, fs, frame_period)
print(hop_length)
print(sp.shape)
print(fftlen)


# ### F0

# In[7]:


plot(timeaxis, f0, linewidth=2, label="F0 contour estimated by Harvest")
xlabel("Time [sec]")
ylabel("Frequency [Hz]")
legend(fontsize=18);


# ### Spectral envelope

# In[8]:


librosa.display.specshow(10*log(sp).T, sr=fs, hop_length=hop_length,
                         x_axis="time", y_axis="linear")
colorbar()


# ### Aperiodicity

# In[9]:


librosa.display.specshow(20*log(ap).T, sr=fs, hop_length=hop_length,
                         x_axis="time", y_axis="linear")
colorbar()


# ## 1. Synthesis from coded spectral envelope by WORLD

# In[10]:


dim = 60
coded_sp = pyworld.code_spectral_envelope(sp, fs, dim)
decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)
y = pyworld.synthesize(f0, decoded_sp, ap, fs, frame_period)
nmse = log_spectral_nmse(sp, decoded_sp)
print("NMSE: {}".format(nmse))

# Title fixed: this panel shows WORLD's coded envelope, not the mel-cepstrum.
vis(coded_sp, decoded_sp, x, y, "Coded spectral envelope by WORLD")
Audio(y, rate=fs)


# ## 2. Synthesis from mel-cepstrum using pysptk

# In[11]:


dim = 60
alpha = mcepalpha(fs)
# sp2mc takes the cepstrum order, so dim - 1 yields `dim` coefficients per
# frame, matching the dimensionality used for WORLD's coding above.
mc = np.apply_along_axis(pysptk.sp2mc, 1, sp, dim-1, alpha)
approximate_sp = np.apply_along_axis(pysptk.mc2sp, 1, mc, alpha, fftlen)
y = pyworld.synthesize(f0, approximate_sp, ap, fs, frame_period)
nmse = log_spectral_nmse(sp, approximate_sp)
print("NMSE: {}".format(nmse))

# Title fixed: this panel shows the pysptk mel-cepstrum.
vis(mc, approximate_sp, x, y, "Mel-cepstrum")
Audio(y, rate=fs)