#!/usr/bin/env python
# coding: utf-8

# 🐙
#
# Tacotron: Towards End-to-End Speech Synthesis: https://arxiv.org/abs/1703.10135

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')
rcParams["figure.figsize"] = (16, 5)

# Use text & audio modules from the existing Tacotron implementation.
import sys
sys.path.append("../lib/tacotron")
from text import text_to_sequence, symbols
from util import audio

# In[2]:

import torch
from torch.autograd import Variable
import numpy as np

from tacotron_pytorch import Tacotron
from hparams import hparams

import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio

# In[3]:

use_cuda = torch.cuda.is_available()
fs = 20000
hop_length = 250

# In[4]:

def visualize(alignment, spectrogram):
    figure(figsize=(16, 16))

    subplot(2, 1, 1)
    # Attention alignment between encoder and decoder timesteps.
    # interpolation="none" (not None) actually disables interpolation.
    imshow(alignment.T, aspect="auto", origin="lower", interpolation="none")
    xlabel("Decoder timestep")
    ylabel("Encoder timestep")
    colorbar()

    subplot(2, 1, 2)
    # Predicted linear spectrogram.
    librosa.display.specshow(spectrogram.T, sr=fs, hop_length=hop_length,
                             x_axis="time", y_axis="linear")
    colorbar()

# In[5]:

def tts(model, text):
    if use_cuda:
        model = model.cuda()
    # TODO: Turning off dropout of decoder's prenet causes serious performance
    # regression, not sure why.
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text_to_sequence(text, ["english_cleaners"]))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal, reconstructed from the linear spectrogram.
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram

# In[6]:

def test_one_text(model, text, figures=True):
    waveform, alignment, spectrogram = tts(model, text)
    if figures:
        visualize(alignment, spectrogram)
    IPython.display.display(Audio(waveform, rate=fs))

# ## Model

# In[7]:

model = Tacotron(n_vocab=len(symbols))

# Choose your favorite model checkpoint.
checkpoint_path = "../checkpoints/checkpoint_step772000.pth"
# map_location keeps GPU-trained checkpoints loadable on CPU-only machines.
checkpoint = torch.load(checkpoint_path,
                        map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint["state_dict"])

# ## Try the same input multiple times

# In[11]:

# Translated from Japanese: https://twitter.com/vmpmember/status/911137213189984257
text = "It seems the speech changes every time. The results were inconsistent on each run, as if generated by voice conversion using adversarial learning."

for idx in range(10):
    test_one_text(model, text, figures=False)
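# Beyond listening in the notebook, we can dump each run's waveform to disk and
# compare the variation between runs offline. This is a minimal sketch, assuming
# scipy is installed; the "generated" output directory name is a hypothetical
# choice, not part of the original repo layout.

# In[ ]:

from scipy.io import wavfile

out_dir = "generated"  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)

for idx in range(3):
    waveform, _, _ = tts(model, text)
    # inv_spectrogram returns float samples; normalize and scale to 16-bit PCM.
    pcm = (waveform / max(1e-8, np.abs(waveform).max()) * 32767).astype(np.int16)
    wavfile.write(os.path.join(out_dir, "sample_{:02d}.wav".format(idx)), fs, pcm)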