#!/usr/bin/env python
# coding: utf-8
"""Fish Speech 1.4 inference walkthrough (script exported from a notebook).

Steps, in order:

1. Configure the console / locale for UTF-8.
2. Download the ``fishaudio/fish-speech-1.4`` checkpoint.
3. Launch the WebUI.
4. Run the break-down CLI pipeline: encode a reference audio file,
   generate semantic tokens from text, then decode them to speech.

The script calls ``get_ipython()`` without importing it, so it has to be
executed by IPython (for example ``ipython this_file.py``), not by the plain
``python`` interpreter.
"""

import locale
import sys

from IPython.display import Audio, display

# # Fish Speech

# ### Console encoding
#
# The notebook had one cell for Windows users and one for Linux users. In
# script form both would run, so only the branch matching the host platform
# is executed.

# In[ ]:

if sys.platform == "win32":
    # For Windows users: switch the console code page to 65001.
    get_ipython().system('chcp 65001')
else:
    # For Linux users: select a UTF-8 locale.
    try:
        locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    except locale.Error as exc:
        # The requested locale may not be installed on this machine; keep
        # going with the current locale instead of aborting the whole script.
        print(f"Warning: could not set locale 'en_US.UTF-8' ({exc}); "
              "keeping the current locale.")

# ### Prepare Model

# In[ ]:

# For Chinese users, you probably want to use a mirror to accelerate
# downloading:
#   Windows: !set HF_ENDPOINT=https://hf-mirror.com
#   Linux:   !export HF_ENDPOINT=https://hf-mirror.com
# NOTE(review): each shell command presumably runs in its own subshell, so a
# variable set that way may not reach the download below; setting
# os.environ["HF_ENDPOINT"] in Python looks like the safer equivalent —
# confirm before relying on it.
get_ipython().system(
    'huggingface-cli download fishaudio/fish-speech-1.4 '
    '--local-dir checkpoints/fish-speech-1.4/'
)

# ## WebUI Inference
#
# > You can use --compile to fuse CUDA kernels for faster inference (10x).

# In[ ]:

# To enable it, append ' --compile' to the command string below. The hint is
# deliberately kept out of the command itself: cmd.exe does not treat '#' as
# a comment marker and would forward it to the script as extra arguments.
get_ipython().system(
    'python tools/webui.py '
    '--llama-checkpoint-path checkpoints/fish-speech-1.4 '
    '--decoder-checkpoint-path '
    'checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth'
)

# ## Break-down CLI Inference

# ### 1. Encode reference audio (generate a prompt from speech):
#
# You should get a `fake.npy` file.

# In[ ]:

# Enter the path to the audio file here.
src_audio = r"D:\PythonProject\vo_hutao_draw_appear.wav"

# `{src_audio}` is substituted by IPython when the command is run. The double
# quotes keep a path containing spaces (or backslashes, on a POSIX shell)
# intact as a single argument.
get_ipython().system(
    'python tools/vqgan/inference.py -i "{src_audio}" '
    '--checkpoint-path '
    '"checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"'
)

# Play back the `fake.wav` file.
audio = Audio(filename="fake.wav")
display(audio)

# ### 2. Generate semantic tokens from text:
#
# > This command will create a codes_N file in the working directory, where
# > N is an integer starting from 0.
#
# > You may want to use `--compile` to fuse CUDA kernels for faster inference
# > (~30 tokens/second -> ~300 tokens/second).

# In[ ]:

# Append ' --compile' to the command string to enable kernel fusion.
get_ipython().system(
    'python tools/llama/generate.py '
    '--text "hello world" '
    '--prompt-text "The text corresponding to reference audio" '
    '--prompt-tokens "fake.npy" '
    '--checkpoint-path "checkpoints/fish-speech-1.4" '
    '--num-samples 2'
)

# ### 3. Generate speech from semantic tokens:

# In[ ]:

get_ipython().system(
    'python tools/vqgan/inference.py -i "codes_0.npy" '
    '--checkpoint-path '
    '"checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"'
)

# Play back the `fake.wav` file.
audio = Audio(filename="fake.wav")
display(audio)