mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-04 09:29:25 +08:00
add cosyvoice3 inference code
This commit is contained in:
94
README.md
94
README.md
@@ -2,7 +2,7 @@
|
||||
|
||||
## 👉🏻 CosyVoice 👈🏻
|
||||
|
||||
**CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
|
||||
**CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice3-0.5B); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
|
||||
|
||||
**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
|
||||
|
||||
@@ -29,6 +29,11 @@
|
||||
|
||||
## Roadmap
|
||||
|
||||
- [x] 2025/12
|
||||
|
||||
- [x] release cosyvoice3-0.5B base model and its training/inference script
|
||||
- [x] release cosyvoice3-0.5B modelscope gradio space
|
||||
|
||||
- [x] 2025/08
|
||||
|
||||
- [x] Thanks to the contribution from NVIDIA Yuekai Zhang, add triton trtllm runtime support and cosyvoice2 grpo training support
|
||||
@@ -96,6 +101,7 @@ We strongly recommend that you download our pretrained `CosyVoice2-0.5B` `CosyVo
|
||||
``` python
|
||||
# SDK模型下载
|
||||
from modelscope import snapshot_download
|
||||
snapshot_download('iic/CosyVoice3-0.5B', local_dir='pretrained_models/CosyVoice3-0.5B')
|
||||
snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
|
||||
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
|
||||
snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
|
||||
@@ -103,16 +109,6 @@ snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/Co
|
||||
snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
|
||||
```
|
||||
|
||||
``` sh
|
||||
# git模型下载,请确保已安装git lfs
|
||||
mkdir -p pretrained_models
|
||||
git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B
|
||||
git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
|
||||
git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
|
||||
git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
|
||||
git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
|
||||
```
|
||||
|
||||
Optionally, you can unzip `ttsfrd` resource and install `ttsfrd` package for better text normalization performance.
|
||||
|
||||
Notice that this step is not necessary. If you do not install `ttsfrd` package, we will use wetext by default.
|
||||
@@ -127,49 +123,9 @@ pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
|
||||
### Basic Usage
|
||||
|
||||
We strongly recommend using `CosyVoice2-0.5B` for better performance.
|
||||
Follow the code below for detailed usage of each model.
|
||||
|
||||
``` python
|
||||
import sys
|
||||
sys.path.append('third_party/Matcha-TTS')
|
||||
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
|
||||
from cosyvoice.utils.file_utils import load_wav
|
||||
import torchaudio
|
||||
```
|
||||
|
||||
#### CosyVoice2 Usage
|
||||
```python
|
||||
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=False)
|
||||
|
||||
# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
|
||||
# zero_shot usage
|
||||
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
|
||||
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
||||
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
|
||||
# save zero_shot spk for future usage
|
||||
assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
|
||||
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
|
||||
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
cosyvoice.save_spkinfo()
|
||||
|
||||
# fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
|
||||
for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
|
||||
torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
|
||||
# instruct usage
|
||||
for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
|
||||
torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
|
||||
# bistream usage, you can use generator as input, this is useful when using text llm model as input
|
||||
# NOTE you should still have some basic sentence split logic because llm can not handle arbitrary sentence length
|
||||
def text_generator():
|
||||
yield '收到好友从远方寄来的生日礼物,'
|
||||
yield '那份意外的惊喜与深深的祝福'
|
||||
yield '让我心中充满了甜蜜的快乐,'
|
||||
yield '笑容如花儿般绽放。'
|
||||
for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
||||
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
Follow the code in `example.py` for detailed usage of each model.
|
||||
```sh
|
||||
python example.py
|
||||
```
|
||||
|
||||
#### CosyVoice2 vllm Usage
|
||||
@@ -184,36 +140,6 @@ pip install vllm==v0.9.0 transformers==4.51.3 -i https://mirrors.aliyun.com/pypi
|
||||
python vllm_example.py
|
||||
```
|
||||
|
||||
#### CosyVoice Usage
|
||||
```python
|
||||
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
|
||||
# sft usage
|
||||
print(cosyvoice.list_available_spks())
|
||||
# change stream=True for chunk stream inference
|
||||
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
|
||||
torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
|
||||
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
|
||||
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
|
||||
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
|
||||
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
||||
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
# cross_lingual usage
|
||||
prompt_speech_16k = load_wav('./asset/cross_lingual_prompt.wav', 16000)
|
||||
for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
|
||||
torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
# vc usage
|
||||
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
|
||||
source_speech_16k = load_wav('./asset/cross_lingual_prompt.wav', 16000)
|
||||
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
|
||||
torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
|
||||
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
||||
# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
|
||||
for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
|
||||
torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||
```
|
||||
|
||||
#### Start web demo
|
||||
|
||||
You can use our web demo page to get familiar with CosyVoice quickly.
|
||||
|
||||
Reference in New Issue
Block a user