90 Commits

Author SHA1 Message Date
lyuxiang.lx
8555549e88 fix lint 2025-07-07 14:52:12 +08:00
lyuxiang.lx
3047591fad fix lint 2025-07-07 14:49:51 +08:00
lyuxiang.lx
5a00aefa20 update 2025-07-07 14:27:03 +08:00
lyuxiang.lx
35abb1f3b5 Merge branch 'main' of github.com:FunAudioLLM/CosyVoice into main 2025-07-07 10:56:06 +08:00
lyuxiang.lx
21a5efd8ae fix lint 2025-06-29 13:21:44 +08:00
Xiang Lyu
44316c3475 Merge pull request #1408 from xxnuo/wetext
Replace WeTextProcessing with wetext to enhance compatibility on all platforms
2025-06-29 12:56:57 +08:00
xxnuo
116c99bf39 Fix: remove overwrite_cache 2025-06-27 15:57:17 +08:00
xxnuo
6eaef42126 Fix: remove full_to_half 2025-06-27 15:55:00 +08:00
xxnuo
525531d8a3 Update README.md 2025-06-27 14:52:58 +08:00
xxnuo
c788bca1a6 Update requirements.txt 2025-06-27 14:50:35 +08:00
xxnuo
ff9694dc2c Fix: Use wetext cache 2025-06-27 14:45:07 +08:00
xxnuo
4505924608 Fix: Use wetext replace WeTextProcessing 2025-06-27 14:43:57 +08:00
lyuxiang.lx
46dfe0439b fix possible bug 2025-06-25 14:31:05 +08:00
lyuxiang.lx
63856565f3 update dpo 2025-06-24 11:49:20 +08:00
Xiang Lyu
cc234bd322 Merge pull request #1369 from UncleLLD/Update-README.md
Update README.md CosyVoice2 vllm Usage
2025-06-12 16:21:43 +08:00
UncleLLD
98ef35b1c0 Update README.md CosyVoice2 vllm Usage
First change env and then pip libs
2025-06-12 15:12:58 +08:00
Xiang Lyu
fca1df3ea7 Merge pull request #1340 from Soooda/main
Fix README Typos
2025-06-02 09:53:05 +08:00
lyuxiang.lx
c939c80480 update 2025-06-01 08:46:47 +00:00
Hilbert Kong
5bf6befd70 Fix README Typos 2025-06-01 17:19:50 +09:00
lyuxiang.lx
1c7976779b update 2025-05-30 13:07:56 +00:00
lyuxiang.lx
a8e1774e82 add vllm example 2025-05-30 10:06:14 +00:00
lyuxiang.lx
1f50ae259b update 2025-05-30 09:25:26 +00:00
Xiang Lyu
793a7fe6ad Merge pull request #1337 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
2025-05-30 17:16:30 +08:00
Xiang Lyu
79b2ea818b Merge branch 'main' into dev/lyuxiang.lx 2025-05-30 17:16:22 +08:00
lyuxiang.lx
7d9d84d32d update 2025-05-30 09:12:03 +00:00
lyuxiang.lx
9b052a94c4 fix lint 2025-05-30 07:51:49 +00:00
lyuxiang.lx
6dd68b9d5e add vllm inference 2025-05-30 07:22:35 +00:00
root
9f55c5af8f add vllm export 2025-05-28 08:35:48 +00:00
Xiang Lyu
b6c5f9dfd2 Merge pull request #1331 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
2025-05-27 14:07:56 +08:00
lyuxiang.lx
cbfed4a9ee send streaming as args 2025-05-27 13:47:12 +08:00
lyuxiang.lx
54d21b40f0 update 2025-05-27 10:40:45 +08:00
lyuxiang.lx
c3250c222f add trt_concurrent arg 2025-05-27 10:34:59 +08:00
lyuxiang.lx
82219cdd27 update 2025-05-27 10:23:06 +08:00
Xiang Lyu
4159a18469 Merge pull request #1325 from FunAudioLLM/dev/lyuxiang.lx
fix trt wrapper bug
2025-05-26 18:04:02 +08:00
lyuxiang.lx
3e12bb86bd fix trt wrapper bug 2025-05-26 18:03:15 +08:00
Xiang Lyu
cbfbe2bc33 Merge pull request #1317 from Lsnsh/patch-1
docs(README): set Markdown headings for paragraphs to support quick anchor points
2025-05-26 11:28:28 +08:00
Lsnsh Xin
3660da4a19 docs(README): set Markdown headings for paragraphs to support quick anchor points 2025-05-24 04:17:28 +08:00
Xiang Lyu
3c921daede Merge pull request #1315 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
2025-05-23 16:11:40 +08:00
lyuxiang.lx
68100c267a remove flow_cache 2025-05-23 16:07:27 +08:00
burkliu
fbab274b6a [feature] modify pad to trim
Conflicts:
	cosyvoice/dataset/processor.py
2025-05-06 10:53:37 +08:00
lyuxiang.lx
97f0bc61cd remove unnecessary file 2025-05-06 10:41:55 +08:00
lyuxiang.lx
afb1a70f7a Merge branch 'main' of github.com:FunAudioLLM/CosyVoice into main 2025-05-06 10:37:48 +08:00
Xiang Lyu
5c77e40304 Merge pull request #1256 from hwangsihu/main
Reorder requirements.txt
2025-05-04 16:27:47 +08:00
hwangsihu
b4c4d848ca Reorder requirements.txt 2025-05-01 13:28:15 +09:00
Xiang Lyu
88f467a8ac Merge pull request #1232 from boji123/bj_dev_feat_len_pad
a better solution for mismatch of speech feat len and speech token len when training
2025-04-30 09:41:50 +08:00
burkliu
038ff9f353 [feature] modify pad to trim 2025-04-25 10:36:25 +08:00
burkliu
65ad448714 [debug] a better solution for mismatch of speech feat len and speech token len, refer to https://github.com/FunAudioLLM/CosyVoice/issues/1051 2025-04-25 10:36:08 +08:00
lyuxiang.lx
a96ae13616 fix instruct2 bug 2025-04-23 15:40:59 +08:00
bearlu
587604b2b4 fix inference_instruct2 speaker ID bug 2025-04-21 09:26:34 -07:00
lyuxiang.lx
e97cd1b655 fix cross_lingual bug 2025-04-19 09:08:47 +08:00
lyuxiang.lx
8d67d17f73 update 2025-04-16 20:18:49 +08:00
lyuxiang.lx
a442317d17 add flow trt wrapper 2025-04-16 17:57:02 +08:00
lyuxiang.lx
7f8bea2669 Merge branch 'dev/lyuxiang.lx' of github.com:FunAudioLLM/CosyVoice into dev/lyuxiang.lx 2025-04-16 15:00:24 +08:00
ShengqiangLi
6d876f573c feat: Support DPO 2025-04-16 14:57:35 +08:00
lyuxiang.lx
3770c1c8b1 fix bug 2025-04-16 13:48:13 +08:00
lyuxiang.lx
2c193781cc fix export_onnx.py 2025-04-15 17:29:42 +08:00
lyuxiang.lx
efe1d15960 fix export_onnx.py 2025-04-15 16:51:25 +08:00
lyuxiang.lx
9ebcf7b1ad fix bug 2025-04-15 16:36:10 +08:00
lyuxiang.lx
37e48dd318 optimize vc code 2025-04-15 16:15:20 +08:00
lyuxiang.lx
c07cd3d730 fix lint 2025-04-15 15:00:29 +08:00
lyuxiang.lx
36aec2c0f7 optimize flow cache code 2025-04-15 13:58:52 +08:00
lyuxiang.lx
d71d790f55 fix flow cache bug 2025-04-15 13:12:25 +08:00
lyuxiang.lx
e1ffb1e978 fix lint 2025-04-15 11:46:53 +08:00
lyuxiang.lx
9fea0f0836 add_zero_shot_spk 2025-04-15 11:45:00 +08:00
lyuxiang.lx
9dc559fc2a force set use_flow_cache 2025-04-08 12:23:26 +08:00
Xiang Lyu
634edfadf0 Merge pull request #983 from Shengqiang-Li/main
feat: Support DPO
2025-04-08 12:14:51 +08:00
Xiang Lyu
b56dfa223d Merge pull request #1140 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
2025-04-07 23:04:37 +08:00
lyuxiang.lx
f0b8e892f6 fix lint 2025-04-07 23:04:01 +08:00
lyuxiang.lx
cfc68f379c only keep online export 2025-04-07 22:54:13 +08:00
lyuxiang.lx
4951d2ad1a update 2025-04-07 22:45:47 +08:00
lyuxiang.lx
d9ffd592f6 use static_chunk_size in flow training 2025-04-07 22:34:45 +08:00
lyuxiang.lx
7902d1c17f set use_flow_cache=True when export 2025-04-07 22:27:17 +08:00
lyuxiang.lx
39ffc50dec add flow cache inference code 2025-04-07 21:23:09 +08:00
Xiang Lyu
08312f4c46 Merge pull request #1100 from jingfelix/fix/dockerfile-dependency
fix: missing dependency in `runtime/python/Dockerfile`
2025-03-25 08:30:26 +08:00
jingfelix
c6d8737336 fix: missing dependency in runtime/python/Dockerfile
Signed-off-by: jingfelix <jingfelix@outlook.com>
2025-03-22 23:34:59 +08:00
ShengqiangLi
a22873e360 feat: Support DPO 2025-03-16 18:33:42 +08:00
Xiang Lyu
c97b445df4 Merge pull request #1036 from hwangsihu/main
Fixed an issue where onnxruntime would not install on Windows
2025-03-10 15:48:04 +08:00
hwangsihu
265507f213 Fixed an issue where onnxruntime would not install on Windows 2025-03-10 16:07:01 +09:00
lyuxiang.lx
a69b7e275d fix vocoder train 2025-03-07 17:34:13 +08:00
lyuxiang.lx
fcc054f64e fix hifigan bug 2025-02-18 14:45:43 +08:00
lyuxiang.lx
890300513c fix bug 2025-02-11 15:56:20 +08:00
lyuxiang.lx
f77c6a85aa fix bug 2025-02-11 00:07:12 +08:00
lyuxiang.lx
b6d66ce2e3 update 2025-02-08 13:58:41 +08:00
lyuxiang.lx
8e4f252d32 add llm bistream 2025-02-08 12:15:37 +08:00
lyuxiang.lx
79b7dff8d2 add llm train 2025-02-07 17:17:12 +08:00
lyuxiang.lx
2a3e033ee1 fix lint 2025-02-07 16:23:03 +08:00
lyuxiang.lx
24f796a2b1 Merge branch 'main' into dev/lyuxiang.lx 2025-01-26 16:56:18 +08:00
lyuxiang.lx
fd1a951a6c add flow unified training 2025-01-26 16:56:06 +08:00
lyuxiang.lx
aea75207dd fix cache bug 2025-01-24 11:07:26 +08:00
lyuxiang.lx
1c062ab381 add flow decoder cache 2025-01-23 16:48:13 +08:00
54 changed files with 1970 additions and 824 deletions


@@ -52,5 +52,5 @@ jobs:
set -eux
pip install flake8==3.8.2 flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0
flake8 --version
flake8 --max-line-length 180 --ignore B006,B008,B905,C408,E402,E731,E741,W503,W504 --exclude ./third_party/,./runtime/python/grpc/cosyvoice_pb2*py
flake8 --max-line-length 180 --ignore B006,B008,B905,C408,E402,E731,E741,W503,W504,F401,F403,F405,F841 --exclude ./third_party/,./runtime/python/grpc/cosyvoice_pb2*py
if [ $? != 0 ]; then exit 1; fi

README.md

@@ -1,6 +1,9 @@
[![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners)
## 👉🏻 CosyVoice 👈🏻
**CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
**CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M)
@@ -26,6 +29,14 @@
## Roadmap
- [x] 2025/07
- [x] release cosyvoice 3.0 eval set
- [x] 2025/05
- [x] add cosyvoice 2.0 vllm support
- [x] 2024/12
- [x] 25hz cosyvoice 2.0 released
@@ -49,34 +60,32 @@
## Install
**Clone and install**
### Clone and install
- Clone the repo
``` sh
git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
# If you failed to clone submodule due to network failures, please run following command until success
cd CosyVoice
git submodule update --init --recursive
```
``` sh
git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
# If you failed to clone the submodule due to network failures, please run the following command until success
cd CosyVoice
git submodule update --init --recursive
```
- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
- Create Conda env:
``` sh
conda create -n cosyvoice -y python=3.10
conda activate cosyvoice
# pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform.
conda install -y -c conda-forge pynini==2.1.5
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
``` sh
conda create -n cosyvoice -y python=3.10
conda activate cosyvoice
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
# If you encounter sox compatibility issues
# ubuntu
sudo apt-get install sox libsox-dev
# centos
sudo yum install sox sox-devel
```
# If you encounter sox compatibility issues
# ubuntu
sudo apt-get install sox libsox-dev
# centos
sudo yum install sox sox-devel
```
**Model download**
### Model download
We strongly recommend that you download our pretrained `CosyVoice2-0.5B` `CosyVoice-300M` `CosyVoice-300M-SFT` `CosyVoice-300M-Instruct` model and `CosyVoice-ttsfrd` resource.
@@ -85,7 +94,6 @@ We strongly recommend that you download our pretrained `CosyVoice2-0.5B` `CosyVo
from modelscope import snapshot_download
snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
@@ -96,15 +104,14 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice
mkdir -p pretrained_models
git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B
git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
```
Optionally, you can unzip `ttsfrd` resouce and install `ttsfrd` package for better text normalization performance.
Optionally, you can unzip `ttsfrd` resource and install `ttsfrd` package for better text normalization performance.
Notice that this step is not necessary. If you do not install `ttsfrd` package, we will use WeTextProcessing by default.
Notice that this step is not necessary. If you do not install `ttsfrd` package, we will use wetext by default.
``` sh
cd pretrained_models/CosyVoice-ttsfrd/
@@ -113,10 +120,10 @@ pip install ttsfrd_dependency-0.1-py3-none-any.whl
pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
```
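For a quick check of the wetext fallback mentioned above, here is a minimal sketch. The import and constructor arguments mirror the `cosyvoice/cli/frontend.py` changes further down this page; the `normalize` call and the sample sentences are assumptions added purely for illustration.
``` python
# Minimal sketch of the wetext fallback used when ttsfrd is not installed.
# Import and constructor arguments mirror cosyvoice/cli/frontend.py in this merge;
# the normalize() call is an assumed part of the wetext API, shown for illustration only.
from wetext import Normalizer as ZhNormalizer
from wetext import Normalizer as EnNormalizer

zh_tn_model = ZhNormalizer(remove_erhua=False)  # Chinese text normalization
en_tn_model = EnNormalizer()                    # English text normalization

print(zh_tn_model.normalize('今天是2025年7月7日'))   # e.g. expands the date into Chinese words
print(en_tn_model.normalize('It costs $3.5'))        # e.g. spells out the currency amount
```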
**Basic Usage**
### Basic Usage
We strongly recommend using `CosyVoice2-0.5B` for better performance.
Follow code below for detailed usage of each model.
Follow the code below for detailed usage of each model.
``` python
import sys
@@ -126,9 +133,9 @@ from cosyvoice.utils.file_utils import load_wav
import torchaudio
```
**CosyVoice2 Usage**
#### CosyVoice2 Usage
```python
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=False)
# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
# zero_shot usage
@@ -136,6 +143,12 @@ prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
# save zero_shot spk for future usage
assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
cosyvoice.save_spkinfo()
# fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
@@ -155,7 +168,19 @@ for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```
**CosyVoice Usage**
#### CosyVoice2 vllm Usage
If you want to use vllm for inference, please install `vllm==v0.9.0`. Older vllm versions do not support CosyVoice2 inference.
Notice that `vllm==v0.9.0` has many specific requirements, for example `torch==2.7.0`. You can create a new env in case your hardware does not support vllm, so that your existing env is not corrupted.
``` sh
conda create -n cosyvoice_vllm --clone cosyvoice
conda activate cosyvoice_vllm
pip install vllm==v0.9.0 -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
python vllm_example.py
```
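If you prefer to call vllm from your own script rather than `vllm_example.py`, a minimal sketch is shown below. It simply reuses the `load_vllm` flag from the CosyVoice2 constructor shown above together with the zero-shot example; the `sys.path` line assumes the Matcha-TTS submodule location used elsewhere in this repo. Treat it as an illustration, not as the contents of `vllm_example.py`.
``` python
import sys
sys.path.append('third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Run inside the cosyvoice_vllm env. load_vllm=True loads the vllm engine
# from the model dir, as added to cosyvoice/cli/cosyvoice.py in this merge.
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=True, fp16=False)

prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
    torchaudio.save('vllm_zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```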
#### CosyVoice Usage
```python
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
# sft usage
@@ -164,7 +189,7 @@ print(cosyvoice.list_available_spks())
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M') # or change to pretrained_models/CosyVoice-300M-25Hz for 25Hz inference
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
@@ -185,7 +210,7 @@ for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展
torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```
**Start web demo**
#### Start web demo
You can use our web demo page to get familiar with CosyVoice quickly.
@@ -196,14 +221,14 @@ Please see the demo website for details.
python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M
```
**Advanced Usage**
#### Advanced Usage
For advanced user, we have provided train and inference scripts in `examples/libritts/cosyvoice/run.sh`.
For advanced users, we have provided training and inference scripts in `examples/libritts/cosyvoice/run.sh`.
**Build for deployment**
#### Build for deployment
Optionally, if you want service deployment,
you can run following steps.
You can run the following steps.
``` sh
cd runtime/python
@@ -233,5 +258,39 @@ You can also scan the QR code to join our official Dingding chat group.
4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec).
5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
## Citations
``` bibtex
@article{du2024cosyvoice,
title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens},
author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others},
journal={arXiv preprint arXiv:2407.05407},
year={2024}
}
@article{du2024cosyvoice,
title={Cosyvoice 2: Scalable streaming speech synthesis with large language models},
author={Du, Zhihao and Wang, Yuxuan and Chen, Qian and Shi, Xian and Lv, Xiang and Zhao, Tianyu and Gao, Zhifu and Yang, Yexin and Gao, Changfeng and Wang, Hui and others},
journal={arXiv preprint arXiv:2412.10117},
year={2024}
}
@article{du2025cosyvoice,
title={CosyVoice 3: Towards In-the-wild Speech Generation via Scaling-up and Post-training},
author={Du, Zhihao and Gao, Changfeng and Wang, Yuxuan and Yu, Fan and Zhao, Tianyu and Wang, Hao and Lv, Xiang and Wang, Hui and Shi, Xian and An, Keyu and others},
journal={arXiv preprint arXiv:2505.17589},
year={2025}
}
@inproceedings{lyu2025build,
title={Build LLM-Based Zero-Shot Streaming TTS System with Cosyvoice},
author={Lyu, Xiang and Wang, Yuxuan and Zhao, Tianyu and Wang, Hao and Liu, Huadai and Du, Zhihao},
booktitle={ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={1--2},
year={2025},
organization={IEEE}
}
```
## Disclaimer
The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.

Binary file changed (not shown; 94 KiB before and after)


@@ -75,10 +75,11 @@ def main():
print('Processing {}'.format(path))
states = torch.load(path, map_location=torch.device('cpu'))
for k in states.keys():
if k not in avg.keys():
avg[k] = states[k].clone()
else:
avg[k] += states[k]
if k not in ['step', 'epoch']:
if k not in avg.keys():
avg[k] = states[k].clone()
else:
avg[k] += states[k]
# average
for k in avg.keys():
if avg[k] is not None:


@@ -24,6 +24,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/../..'.format(ROOT_DIR))
sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import logging
def get_args():
@@ -71,6 +72,7 @@ def main():
script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
script = get_optimized_script(llm_text_encoder.half())
script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
logging.info('successfully export llm_text_encoder')
# 2. export llm llm
llm_llm = model.model.llm.llm
@@ -78,13 +80,23 @@ def main():
script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
logging.info('successfully export llm_llm')
# 3. export flow encoder
flow_encoder = model.model.flow.encoder
script = get_optimized_script(flow_encoder)
script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
script = get_optimized_script(flow_encoder.half())
script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
# 3. export flow encoder
flow_encoder = model.model.flow.encoder
script = get_optimized_script(flow_encoder)
script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
script = get_optimized_script(flow_encoder.half())
script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
logging.info('successfully export flow_encoder')
else:
# 3. export flow encoder
flow_encoder = model.model.flow.encoder
script = get_optimized_script(flow_encoder)
script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
script = get_optimized_script(flow_encoder.half())
script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
logging.info('successfully export flow_encoder')
if __name__ == '__main__':


@@ -28,6 +28,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/../..'.format(ROOT_DIR))
sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import logging
def get_dummy_input(batch_size, seq_len, out_channels, device):
@@ -51,6 +52,7 @@ def get_args():
return args
@torch.no_grad()
def main():
args = get_args()
logging.basicConfig(level=logging.DEBUG,
@@ -66,6 +68,7 @@ def main():
# 1. export flow decoder estimator
estimator = model.model.flow.decoder.estimator
estimator.eval()
device = model.model.device
batch_size, seq_len = 2, 256
@@ -110,6 +113,7 @@ def main():
}
output_onnx = estimator_onnx.run(None, ort_inputs)[0]
torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
logging.info('successfully export estimator')
if __name__ == "__main__":


@@ -1,10 +0,0 @@
#!/bin/bash
# Copyright 2024 Alibaba Inc. All Rights Reserved.
# download tensorrt from https://developer.nvidia.com/tensorrt/download/10x, check your system and cuda for compatibability
# for example for linux + cuda12.4, you can download https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
TRT_DIR=<YOUR_TRT_DIR>
MODEL_DIR=<COSYVOICE2_MODEL_DIR>
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_DIR/lib:/usr/local/cuda/lib64
$TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp32.mygpu.plan --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw --outputIOFormats=fp32:chw
$TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.mygpu.plan --fp16 --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw --outputIOFormats=fp16:chw


@@ -23,7 +23,7 @@ from torch.utils.data import DataLoader
import torchaudio
from hyperpyyaml import load_hyperpyyaml
from tqdm import tqdm
from cosyvoice.cli.model import CosyVoiceModel
from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
from cosyvoice.dataset.dataset import Dataset
@@ -33,6 +33,7 @@ def get_args():
parser.add_argument('--prompt_data', required=True, help='prompt data file')
parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
parser.add_argument('--tts_text', required=True, help='tts input file')
parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
parser.add_argument('--llm_model', required=True, help='llm model file')
parser.add_argument('--flow_model', required=True, help='flow model file')
parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
@@ -59,16 +60,25 @@ def main():
# Init cosyvoice models from configs
use_cuda = args.gpu >= 0 and torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
with open(args.config, 'r') as f:
configs = load_hyperpyyaml(f)
try:
with open(args.config, 'r') as f:
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': args.qwen_pretrain_path})
model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'])
except Exception:
try:
with open(args.config, 'r') as f:
configs = load_hyperpyyaml(f)
model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
except Exception:
raise TypeError('no valid model_type!')
model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
model.load(args.llm_model, args.flow_model, args.hifigan_model)
test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
sample_rate = configs['sample_rate']
del configs
os.makedirs(args.result_dir, exist_ok=True)
fn = os.path.join(args.result_dir, 'wav.scp')
@@ -104,7 +114,7 @@ def main():
tts_speeches = torch.concat(tts_speeches, dim=1)
tts_key = '{}_{}'.format(utts[0], tts_index[0])
tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
torchaudio.save(tts_fn, tts_speeches, sample_rate=22050)
torchaudio.save(tts_fn, tts_speeches, sample_rate=sample_rate, backend='soundfile')
f.write('{} {}\n'.format(tts_key, tts_fn))
f.flush()
f.close()
@@ -112,4 +122,5 @@ def main():
if __name__ == '__main__':
logging.warning('this code has been deprecated, please refer to README for CosyVoice inference usage!')
main()


@@ -27,6 +27,7 @@ from hyperpyyaml import load_hyperpyyaml
from torch.distributed.elastic.multiprocessing.errors import record
from cosyvoice.utils.losses import DPOLoss
from cosyvoice.utils.executor import Executor
from cosyvoice.utils.train_utils import (
init_distributed,
@@ -43,9 +44,11 @@ def get_args():
choices=['torch_ddp', 'deepspeed'],
help='Engine for paralleled training')
parser.add_argument('--model', required=True, help='model which will be trained')
parser.add_argument('--ref_model', required=False, help='ref model used in dpo')
parser.add_argument('--config', required=True, help='config file')
parser.add_argument('--train_data', required=True, help='train data file')
parser.add_argument('--cv_data', required=True, help='cv data file')
parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
parser.add_argument('--checkpoint', help='checkpoint model')
parser.add_argument('--model_dir', required=True, help='save model dir')
parser.add_argument('--tensorboard_dir',
@@ -72,6 +75,10 @@ def get_args():
action='store_true',
default=False,
help='Use automatic mixed precision training')
parser.add_argument('--dpo',
action='store_true',
default=False,
help='Use Direct Preference Optimization')
parser.add_argument('--deepspeed.save_states',
dest='save_states',
default='model_only',
@@ -97,8 +104,12 @@ def main():
override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
if gan is True:
override_dict.pop('hift')
with open(args.config, 'r') as f:
configs = load_hyperpyyaml(f, overrides=override_dict)
try:
with open(args.config, 'r') as f:
configs = load_hyperpyyaml(f, overrides={**override_dict, 'qwen_pretrain_path': args.qwen_pretrain_path})
except Exception:
with open(args.config, 'r') as f:
configs = load_hyperpyyaml(f, overrides=override_dict)
if gan is True:
configs['train_conf'] = configs['train_conf_gan']
configs['train_conf'].update(vars(args))
@@ -108,7 +119,7 @@ def main():
# Get dataset & dataloader
train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
init_dataset_and_dataloader(args, configs, gan)
init_dataset_and_dataloader(args, configs, gan, args.dpo)
# Do some sanity checks and save config to arsg.model_dir
configs = check_modify_and_save_config(args, configs)
@@ -117,6 +128,8 @@ def main():
writer = init_summarywriter(args)
# load checkpoint
if args.dpo is True:
configs[args.model].forward = configs[args.model].forward_dpo
model = configs[args.model]
start_step, start_epoch = 0, -1
if args.checkpoint is not None:
@@ -145,13 +158,25 @@ def main():
info_dict['epoch'] = start_epoch
save_model(model, 'init', info_dict)
# DPO related
if args.dpo is True:
ref_model = deepcopy(configs[args.model])
state_dict = torch.load(args.ref_model, map_location='cpu')
ref_model.load_state_dict(state_dict, strict=False)
dpo_loss = DPOLoss(beta=0.01, label_smoothing=0.0, ipo=False)
# NOTE maybe it is not needed to wrap ref_model as ddp because its parameter is not updated
ref_model = wrap_cuda_model(args, ref_model)
else:
ref_model, dpo_loss = None, None
# Get executor
executor = Executor(gan=gan)
executor = Executor(gan=gan, ref_model=ref_model, dpo_loss=dpo_loss)
executor.step = start_step
# Init scaler, used for pytorch amp mixed precision training
scaler = torch.cuda.amp.GradScaler() if args.use_amp else None
print('start step {} start epoch {}'.format(start_step, start_epoch))
# Start training loop
for epoch in range(start_epoch + 1, info_dict['max_epoch']):
executor.epoch = epoch
@@ -162,7 +187,7 @@ def main():
executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
writer, info_dict, scaler, group_join)
else:
executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join)
executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=ref_model)
dist.destroy_process_group(group_join)


@@ -26,13 +26,16 @@ from cosyvoice.utils.class_utils import get_model_type
class CosyVoice:
def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1):
self.instruct = True if '-Instruct' in model_dir else False
self.model_dir = model_dir
self.fp16 = fp16
if not os.path.exists(model_dir):
model_dir = snapshot_download(model_dir)
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
hyper_yaml_path = '{}/cosyvoice.yaml'.format(model_dir)
if not os.path.exists(hyper_yaml_path):
raise ValueError('{} not found!'.format(hyper_yaml_path))
with open(hyper_yaml_path, 'r') as f:
configs = load_hyperpyyaml(f)
assert get_model_type(configs) != CosyVoice2Model, 'do not use {} for CosyVoice initialization!'.format(model_dir)
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
@@ -49,7 +52,6 @@ class CosyVoice:
self.model.load('{}/llm.pt'.format(model_dir),
'{}/flow.pt'.format(model_dir),
'{}/hift.pt'.format(model_dir))
self.vllm_codec_engine = None
if load_jit:
self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
@@ -57,6 +59,7 @@ class CosyVoice:
if load_trt:
self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
trt_concurrent,
self.fp16)
del configs
@@ -64,6 +67,17 @@ class CosyVoice:
spks = list(self.frontend.spk2info.keys())
return spks
def add_zero_shot_spk(self, prompt_text, prompt_speech_16k, zero_shot_spk_id):
assert zero_shot_spk_id != '', 'do not use empty zero_shot_spk_id'
model_input = self.frontend.frontend_zero_shot('', prompt_text, prompt_speech_16k, self.sample_rate, '')
del model_input['text']
del model_input['text_len']
self.frontend.spk2info[zero_shot_spk_id] = model_input
return True
def save_spkinfo(self):
torch.save(self.frontend.spk2info, '{}/spk2info.pt'.format(self.model_dir))
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0, text_frontend=True):
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_sft(i, spk_id)
@@ -75,12 +89,12 @@ class CosyVoice:
yield model_output
start_time = time.time()
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True):
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
prompt_text = self.frontend.text_normalize(prompt_text, split=False, text_frontend=text_frontend)
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
if (not isinstance(i, Generator)) and len(i) < 0.5 * len(prompt_text):
logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, self.sample_rate)
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
@@ -89,9 +103,9 @@ class CosyVoice:
yield model_output
start_time = time.time()
def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True):
def inference_cross_lingual(self, tts_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate)
model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
@@ -118,7 +132,7 @@ class CosyVoice:
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
start_time = time.time()
for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
@@ -127,13 +141,16 @@ class CosyVoice:
class CosyVoice2(CosyVoice):
def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, use_vllm=False):
def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
self.instruct = True if '-Instruct' in model_dir else False
self.model_dir = model_dir
self.fp16 = fp16
if not os.path.exists(model_dir):
model_dir = snapshot_download(model_dir)
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
hyper_yaml_path = '{}/cosyvoice2.yaml'.format(model_dir)
if not os.path.exists(hyper_yaml_path):
raise ValueError('{} not found!'.format(hyper_yaml_path))
with open(hyper_yaml_path, 'r') as f:
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
assert get_model_type(configs) == CosyVoice2Model, 'do not use {} for CosyVoice2 initialization!'.format(model_dir)
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
@@ -150,31 +167,24 @@ class CosyVoice2(CosyVoice):
self.model.load('{}/llm.pt'.format(model_dir),
'{}/flow.pt'.format(model_dir),
'{}/hift.pt'.format(model_dir))
self.vllm_codec_engine = None
if use_vllm:
from vllm import EngineArgs, LLMEngine
self.model.export_codec_vllm(''.join([model_dir, '/codec_vllm_model']))
engine_args = EngineArgs(model=''.join([model_dir, '/codec_vllm_model']),
skip_tokenizer_init=True,
gpu_memory_utilization=0.2)
self.vllm_codec_engine = LLMEngine.from_engine_args(engine_args)
self.model.vllm_codec_engine = self.vllm_codec_engine
if load_vllm:
self.model.load_vllm('{}/vllm'.format(model_dir))
if load_jit:
self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
if load_trt:
self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
trt_concurrent,
self.fp16)
del configs
def inference_instruct(self, *args, **kwargs):
raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!')
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True):
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
assert isinstance(self.model, CosyVoice2Model), 'inference_instruct2 is only implemented for CosyVoice2!'
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):


@@ -28,9 +28,9 @@ try:
import ttsfrd
use_ttsfrd = True
except ImportError:
print("failed to import ttsfrd, use WeTextProcessing instead")
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer
print("failed to import ttsfrd, use wetext instead")
from wetext import Normalizer as ZhNormalizer
from wetext import Normalizer as EnNormalizer
use_ttsfrd = False
from cosyvoice.utils.file_utils import logging
from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation
@@ -68,7 +68,7 @@ class CosyVoiceFrontEnd:
'failed to initialize ttsfrd resource'
self.frd.set_lang_type('pinyinvg')
else:
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
self.zh_tn_model = ZhNormalizer(remove_erhua=False)
self.en_tn_model = EnNormalizer()
self.inflect_parser = inflect.engine()
@@ -122,7 +122,7 @@ class CosyVoiceFrontEnd:
if isinstance(text, Generator):
logging.info('get tts_text generator, will skip text_normalize!')
return [text]
if text_frontend is False:
if text_frontend is False or text == '':
return [text] if split is True else text
text = text.strip()
if self.use_ttsfrd:
@@ -154,28 +154,32 @@ class CosyVoiceFrontEnd:
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
return model_input
def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate):
def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
if resample_rate == 24000:
# cosyvoice2, force speech_feat % speech_token = 2
token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
embedding = self._extract_spk_embedding(prompt_speech_16k)
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
'llm_embedding': embedding, 'flow_embedding': embedding}
if zero_shot_spk_id == '':
prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
if resample_rate == 24000:
# cosyvoice2, force speech_feat % speech_token = 2
token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
embedding = self._extract_spk_embedding(prompt_speech_16k)
model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
'llm_embedding': embedding, 'flow_embedding': embedding}
else:
model_input = self.spk2info[zero_shot_spk_id]
model_input['text'] = tts_text_token
model_input['text_len'] = tts_text_token_len
return model_input
def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate):
model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate)
def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
# in cross lingual mode, we remove prompt in llm
del model_input['prompt_text']
del model_input['prompt_text_len']
@@ -192,8 +196,8 @@ class CosyVoiceFrontEnd:
model_input['prompt_text_len'] = instruct_text_token_len
return model_input
def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate):
model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate)
def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
del model_input['llm_prompt_speech_token']
del model_input['llm_prompt_speech_token_len']
return model_input


@@ -1,4 +1,5 @@
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
# 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,12 +18,12 @@ import torch
import numpy as np
import threading
import time
from torch import nn
from torch.nn import functional as F
from contextlib import nullcontext
import uuid
from cosyvoice.utils.common import fade_in_out
from cosyvoice.utils.file_utils import convert_onnx_to_trt
from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm
from cosyvoice.utils.common import TrtContextWrapper
class CosyVoiceModel:
@@ -31,22 +32,18 @@ class CosyVoiceModel:
llm: torch.nn.Module,
flow: torch.nn.Module,
hift: torch.nn.Module,
fp16: bool):
fp16: bool = False):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.llm = llm
self.flow = flow
self.hift = hift
self.fp16 = fp16
self.llm.fp16 = fp16
self.flow.fp16 = fp16
if self.fp16 is True:
self.llm.half()
self.flow.half()
self.token_min_hop_len = 2 * self.flow.input_frame_rate
self.token_max_hop_len = 4 * self.flow.input_frame_rate
self.token_overlap_len = 20
# here we fix set flow.decoder.estimator.static_chunk_size = 0 for compatibability
self.flow.decoder.estimator.static_chunk_size = 0
# mel fade in out
self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
self.mel_window = np.hamming(2 * self.mel_overlap_len)
@@ -66,7 +63,6 @@ class CosyVoiceModel:
self.mel_overlap_dict = {}
self.flow_cache_dict = {}
self.hift_cache_dict = {}
self.vllm_codec_engine = None
def load(self, llm_model, flow_model, hift_model):
self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
@@ -86,41 +82,35 @@ class CosyVoiceModel:
flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
self.flow.encoder = flow_encoder
def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, trt_concurrent, fp16):
assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
if not os.path.exists(flow_decoder_estimator_model):
convert_onnx_to_trt(flow_decoder_estimator_model, flow_decoder_onnx_model, fp16)
if os.path.getsize(flow_decoder_estimator_model) == 0:
raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
if not os.path.exists(flow_decoder_estimator_model) or os.path.getsize(flow_decoder_estimator_model) == 0:
convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
del self.flow.decoder.estimator
import tensorrt as trt
with open(flow_decoder_estimator_model, 'rb') as f:
self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
if self.flow.decoder.estimator_engine is None:
raise ValueError('failed to load trt {}'.format(flow_decoder_estimator_model))
self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
assert estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
self.flow.decoder.estimator = TrtContextWrapper(estimator_engine, trt_concurrent=trt_concurrent, device=self.device)
def get_trt_kwargs(self):
min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
opt_shape = [(2, 80, 500), (2, 1, 500), (2, 80, 500), (2, 80, 500)]
max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
input_names = ["x", "mask", "mu", "cond"]
return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
with self.llm_context:
with self.llm_context, torch.cuda.amp.autocast(self.fp16 is True and hasattr(self.llm, 'vllm') is False):
if isinstance(text, Generator):
assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
if self.vllm_codec_engine is None:
for i in self.llm.inference_bistream(text=text,
prompt_text=prompt_text.to(self.device),
prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
prompt_speech_token=llm_prompt_speech_token.to(self.device),
prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
embedding=llm_embedding.to(self.device)):
self.tts_speech_token_dict[uuid].append(i)
else:
for i in self.llm.inference_bistream_vllm(text=text,
prompt_text=prompt_text.to(self.device),
prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
prompt_speech_token=llm_prompt_speech_token.to(self.device),
prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
embedding=llm_embedding.to(self.device),
vllm_codec_engine=self.vllm_codec_engine):
self.tts_speech_token_dict[uuid].append(i)
assert isinstance(self, CosyVoice2Model) and not hasattr(self.llm, 'vllm'), 'streaming input text is only implemented for CosyVoice2 and do not support vllm!'
for i in self.llm.inference_bistream(text=text,
prompt_text=prompt_text.to(self.device),
prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
prompt_speech_token=llm_prompt_speech_token.to(self.device),
prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
embedding=llm_embedding.to(self.device)):
self.tts_speech_token_dict[uuid].append(i)
else:
for i in self.llm.inference(text=text.to(self.device),
text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
@@ -129,20 +119,24 @@ class CosyVoiceModel:
prompt_speech_token=llm_prompt_speech_token.to(self.device),
prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
embedding=llm_embedding.to(self.device),
vllm_codec_engine=self.vllm_codec_engine):
uuid=uuid):
self.tts_speech_token_dict[uuid].append(i)
self.llm_end_dict[uuid] = True
def vc_job(self, source_speech_token, uuid):
self.tts_speech_token_dict[uuid] = source_speech_token.flatten().tolist()
self.llm_end_dict[uuid] = True
def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
tts_mel, flow_cache = self.flow.inference(token=token.to(self.device),
token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
prompt_token=prompt_token.to(self.device),
prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
prompt_feat=prompt_feat.to(self.device),
prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
embedding=embedding.to(self.device),
flow_cache=self.flow_cache_dict[uuid])
self.flow_cache_dict[uuid] = flow_cache
with torch.cuda.amp.autocast(self.fp16):
tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
prompt_token=prompt_token.to(self.device),
prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
prompt_feat=prompt_feat.to(self.device),
prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
embedding=embedding.to(self.device),
flow_cache=self.flow_cache_dict[uuid])
# mel overlap fade in out
if self.mel_overlap_dict[uuid].shape[2] != 0:
@@ -173,11 +167,11 @@ class CosyVoiceModel:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
return tts_speech
def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
prompt_text=torch.zeros(1, 0, dtype=torch.int32),
llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
# this_uuid is used to track variables related to this inference thread
this_uuid = str(uuid.uuid1())
with self.lock:
@@ -185,7 +179,10 @@ class CosyVoiceModel:
self.hift_cache_dict[this_uuid] = None
self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
if source_speech_token.shape[1] == 0:
p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
else:
p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
p.start()
if stream is True:
token_hop_len = self.token_min_hop_len
@@ -235,61 +232,9 @@ class CosyVoiceModel:
self.mel_overlap_dict.pop(this_uuid)
self.hift_cache_dict.pop(this_uuid)
self.flow_cache_dict.pop(this_uuid)
torch.cuda.empty_cache()
def vc(self, source_speech_token, flow_prompt_speech_token, prompt_speech_feat, flow_embedding, stream=False, speed=1.0, **kwargs):
# this_uuid is used to track variables related to this inference thread
this_uuid = str(uuid.uuid1())
with self.lock:
self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = source_speech_token.flatten().tolist(), True
self.hift_cache_dict[this_uuid] = None
self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
if stream is True:
token_hop_len = self.token_min_hop_len
while True:
if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
.unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=False)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
# increase token_hop_len for better speech quality
token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
break
# deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True)
yield {'tts_speech': this_tts_speech.cpu()}
else:
# deal with all tokens
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True,
speed=speed)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict.pop(this_uuid)
self.llm_end_dict.pop(this_uuid)
self.mel_overlap_dict.pop(this_uuid)
self.hift_cache_dict.pop(this_uuid)
torch.cuda.empty_cache()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.current_stream().synchronize()
class CosyVoice2Model(CosyVoiceModel):
@@ -298,88 +243,56 @@ class CosyVoice2Model(CosyVoiceModel):
llm: torch.nn.Module,
flow: torch.nn.Module,
hift: torch.nn.Module,
fp16: bool):
fp16: bool = False):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.llm = llm
self.flow = flow
self.hift = hift
self.fp16 = fp16
self.llm.fp16 = fp16
self.flow.fp16 = fp16
if self.fp16 is True:
self.llm.half()
self.flow.half()
self.token_hop_len = 2 * self.flow.input_frame_rate
# here we fix flow encoder/decoder decoding_chunk_size, in the future we will send it as arguments, or use cache
self.flow.encoder.static_chunk_size = 2 * self.flow.input_frame_rate
self.flow.decoder.estimator.static_chunk_size = 2 * self.flow.input_frame_rate * self.flow.token_mel_ratio
# NOTE must matching training static_chunk_size
self.token_hop_len = 25
# hift cache
self.mel_cache_len = 8
self.source_cache_len = int(self.mel_cache_len * 480)
# speech fade in out
self.speech_window = np.hamming(2 * self.source_cache_len)
# rtf and decoding related
self.stream_scale_factor = 1
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
self.lock = threading.Lock()
# dict used to store session related variable
self.tts_speech_token_dict = {}
self.llm_end_dict = {}
self.hift_cache_dict = {}
self.vllm_codec_engine = None
def load_jit(self, flow_encoder_model):
flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
self.flow.encoder = flow_encoder
def export_codec_vllm(self, model_path):
if os.path.exists(model_path):
return
pad_to = DEFAULT_VOCAB_PADDING_SIZE = 64
vocab_size = self.llm.speech_embedding.num_embeddings
feature_size = self.llm.speech_embedding.embedding_dim
pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
dtype = torch.bfloat16
# lm_head
new_lm_head = nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=True)
with torch.no_grad():
new_lm_head.weight[:vocab_size] = self.llm.llm_decoder.weight
new_lm_head.bias[:vocab_size] = self.llm.llm_decoder.bias
new_lm_head.weight[vocab_size:] = 0
new_lm_head.bias[vocab_size:] = 0
self.llm.llm.model.lm_head = new_lm_head
new_codec_embed = nn.Linear(in_features=feature_size, out_features=pad_vocab_size)
# embed_tokens
embed_tokens = self.llm.llm.model.model.embed_tokens
with torch.no_grad():
new_codec_embed.weight[:vocab_size] = self.llm.speech_embedding.weight
new_codec_embed.weight[vocab_size:] = 0
self.llm.llm.model.set_input_embeddings(new_codec_embed)
self.llm.llm.model.to(self.device)
self.llm.llm.model.to(dtype)
tmp_vocab_size = self.llm.llm.model.config.vocab_size
tmp_tie_embedding = self.llm.llm.model.config.tie_word_embeddings
del self.llm.llm.model.generation_config.eos_token_id
del self.llm.llm.model.config.bos_token_id
del self.llm.llm.model.config.eos_token_id
self.llm.llm.model.config.vocab_size = pad_vocab_size
self.llm.llm.model.config.tie_word_embeddings = False
self.llm.llm.model.config.use_bias = True
self.llm.llm.model.save_pretrained(model_path)
self.llm.llm.model.config.vocab_size = tmp_vocab_size
self.llm.llm.model.config.tie_word_embeddings = tmp_tie_embedding
self.llm.llm.model.set_input_embeddings(embed_tokens)
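The export above rounds the speech-token vocabulary up to a multiple of pad_to so the rebuilt lm_head and input embedding match vLLM's padded-vocab layout. A quick standalone check of that rounding (the vocabulary sizes below are illustrative, not read from a checkpoint):

pad_to = 64
for vocab_size in (6561, 6564, 6592):
    pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
    print(vocab_size, '->', pad_vocab_size)   # 6561 -> 6592, 6564 -> 6592, 6592 -> 6592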
def load_vllm(self, model_dir):
export_cosyvoice2_vllm(self.llm, model_dir, self.device)
from vllm import EngineArgs, LLMEngine
engine_args = EngineArgs(model=model_dir,
skip_tokenizer_init=True,
enable_prompt_embeds=True,
gpu_memory_utilization=0.2)
self.llm.vllm = LLMEngine.from_engine_args(engine_args)
self.llm.lock = threading.Lock()
del self.llm.llm.model.model.layers
def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0):
tts_mel, _ = self.flow.inference(token=token.to(self.device),
token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
prompt_token=prompt_token.to(self.device),
prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
prompt_feat=prompt_feat.to(self.device),
prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
embedding=embedding.to(self.device),
finalize=finalize)
def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
with torch.cuda.amp.autocast(self.fp16):
tts_mel, _ = self.flow.inference(token=token.to(self.device),
token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
prompt_token=prompt_token.to(self.device),
prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
prompt_feat=prompt_feat.to(self.device),
prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
embedding=embedding.to(self.device),
streaming=stream,
finalize=finalize)
tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
# append hift cache
if self.hift_cache_dict[uuid] is not None:
@@ -405,34 +318,40 @@ class CosyVoice2Model(CosyVoiceModel):
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
return tts_speech
def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
prompt_text=torch.zeros(1, 0, dtype=torch.int32),
llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
# this_uuid is used to track variables related to this inference thread
this_uuid = str(uuid.uuid1())
with self.lock:
self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
self.hift_cache_dict[this_uuid] = None
p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
if source_speech_token.shape[1] == 0:
p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
else:
p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
p.start()
if stream is True:
token_offset = 0
prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) * self.token_hop_len - flow_prompt_speech_token.shape[1])
while True:
time.sleep(0.1)
if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len:
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
this_token_hop_len = self.token_hop_len + prompt_token_pad if token_offset == 0 else self.token_hop_len
if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= this_token_hop_len + self.flow.pre_lookahead_len:
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + this_token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
token_offset=token_offset,
uuid=this_uuid,
stream=stream,
finalize=False)
token_offset += self.token_hop_len
token_offset += this_token_hop_len
yield {'tts_speech': this_tts_speech.cpu()}
if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < self.token_hop_len + self.flow.pre_lookahead_len:
if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
break
p.join()
# deal with remaining tokens; make sure the remaining token length at inference equals token_hop_len when cache_speech is not None
@@ -441,8 +360,8 @@ class CosyVoice2Model(CosyVoiceModel):
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
token_offset=token_offset,
uuid=this_uuid,
finalize=True)
yield {'tts_speech': this_tts_speech.cpu()}
else:
@@ -453,12 +372,15 @@ class CosyVoice2Model(CosyVoiceModel):
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
token_offset=0,
uuid=this_uuid,
finalize=True,
speed=speed)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict.pop(this_uuid)
self.llm_end_dict.pop(this_uuid)
torch.cuda.empty_cache()
self.hift_cache_dict.pop(this_uuid)
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.current_stream().synchronize()
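A minimal sketch of consuming the streaming generator above; model, text_ids and spk_emb are placeholders for an already constructed CosyVoice2Model and tokenized/extracted inputs, not values defined in this diff:

import torch

def synthesize_streaming(model, text_ids, spk_emb):
    chunks = []
    for out in model.tts(text=text_ids,
                         flow_embedding=spk_emb,
                         llm_embedding=spk_emb,
                         stream=True):
        # each yielded dict carries one incremental waveform chunk already moved to cpu
        chunks.append(out['tts_speech'])
    return torch.concat(chunks, dim=1)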


@@ -14,14 +14,13 @@
# limitations under the License.
import random
import json
import math
from functools import partial
import torch
import torch.distributed as dist
from torch.utils.data import IterableDataset
from cosyvoice.utils.file_utils import read_lists, read_json_lists
from cosyvoice.utils.file_utils import read_lists
class Processor(IterableDataset):
@@ -127,10 +126,9 @@ def Dataset(data_list_file,
data_pipeline,
mode='train',
gan=False,
dpo=False,
shuffle=True,
partition=True,
tts_file='',
prompt_utt2data=''):
partition=True):
""" Construct dataset from arguments
We have two shuffle stage in the Dataset. The first is global
@@ -142,23 +140,12 @@ def Dataset(data_list_file,
tokenizer (BaseTokenizer): tokenizer to tokenize
partition(bool): whether to do data partition in terms of rank
"""
assert mode in ['train', 'inference']
lists = read_lists(data_list_file)
if mode == 'inference':
with open(tts_file) as f:
tts_data = json.load(f)
utt2lists = read_json_lists(prompt_utt2data)
# filter unnecessary file in inference mode
lists = list({utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists})
dataset = DataList(lists,
shuffle=shuffle,
partition=partition)
if mode == 'inference':
# map partial arg to parquet_opener func in inference mode
data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
if gan is True:
# map partial arg to padding func in gan mode
data_pipeline[-1] = partial(data_pipeline[-1], gan=gan)
# map partial arg to padding func
data_pipeline[-1] = partial(data_pipeline[-1], gan=gan, dpo=dpo)
for func in data_pipeline:
dataset = Processor(dataset, func, mode=mode)
return dataset
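The last pipeline stage is re-bound with functools.partial so that padding later receives the gan/dpo flags; a tiny self-contained illustration of that pattern (stage and pipeline are stand-in names, not part of this diff):

from functools import partial

def stage(data, gan=False, dpo=False):
    # stand-in for the real padding() stage
    return {'gan': gan, 'dpo': dpo, 'n': len(data)}

pipeline = [stage]
pipeline[-1] = partial(pipeline[-1], gan=False, dpo=True)
print(pipeline[-1]([1, 2, 3]))   # {'gan': False, 'dpo': True, 'n': 3}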


@@ -43,8 +43,6 @@ def parquet_opener(data, mode='train', tts_data={}):
for df in pq.ParquetFile(url).iter_batches(batch_size=64):
df = df.to_pandas()
for i in range(len(df)):
if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
continue
sample.update(dict(df.loc[i]))
if mode == 'train':
# NOTE do not return sample directly, must initialize a new dict
@@ -100,6 +98,8 @@ def filter(data,
continue
if len(sample['speech_token']) == 0:
continue
if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
continue
if num_frames != 0:
if len(sample['text_token']) / num_frames < min_output_input_ratio:
continue
@@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'):
def compute_fbank(data,
feat_extractor,
token_mel_ratio=0,
mode='train'):
""" Extract fbank
@@ -174,8 +175,13 @@ def compute_fbank(data,
assert 'utt' in sample
assert 'text_token' in sample
waveform = sample['speech']
mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
sample['speech_feat'] = mat
feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
if token_mel_ratio != 0:
# trim to align speech_token and speech_feat
token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
feat = feat[:token_mel_ratio * token_len]
sample["speech_token"] = sample["speech_token"][:token_len]
sample['speech_feat'] = feat
yield sample
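The trim above keeps speech_feat an exact token_mel_ratio multiple of speech_token so the two stay frame-aligned; a small numeric check of the same logic outside the generator (tensor sizes are made up):

import torch

token_mel_ratio = 2
feat = torch.randn(101, 80)             # 101 mel frames
speech_token = torch.zeros(52)          # 52 speech tokens

token_len = int(min(feat.shape[0] / token_mel_ratio, speech_token.shape[0]))  # -> 50
feat = feat[:token_mel_ratio * token_len]                                     # -> (100, 80)
speech_token = speech_token[:token_len]                                       # -> (50,)
print(feat.shape, speech_token.shape)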
@@ -196,8 +202,8 @@ def compute_f0(data, sample_rate, hop_size, mode='train'):
assert 'text_token' in sample
waveform = sample['speech']
_f0, t = pw.harvest(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)
if sum(_f0 != 0) < 5: # this happens when the algorithm fails
_f0, t = pw.dio(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period) # if harvest fails, try dio
f0 = pw.stonemask(waveform.squeeze(dim=0).numpy().astype('double'), _f0, t, sample_rate)
f0 = F.interpolate(torch.from_numpy(f0).view(1, 1, -1), size=sample['speech_feat'].shape[0], mode='linear').view(-1)
sample['pitch_feat'] = f0
@@ -236,8 +242,6 @@ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
for sample in data:
assert 'text' in sample
sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
if mode == 'inference':
sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
yield sample
@@ -345,18 +349,15 @@ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
""" Wrapper for static/dynamic batch
"""
if mode == 'inference':
return static_batch(data, 1)
if batch_type == 'static':
return static_batch(data, batch_size)
elif batch_type == 'dynamic':
return dynamic_batch(data, max_frames_in_batch)
else:
if batch_type == 'static':
return static_batch(data, batch_size)
elif batch_type == 'dynamic':
return dynamic_batch(data, max_frames_in_batch)
else:
logging.fatal('Unsupported batch type {}'.format(batch_type))
logging.fatal('Unsupported batch type {}'.format(batch_type))
def padding(data, use_spk_embedding, mode='train', gan=False):
def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False):
""" Padding the data into training data
Args:
@@ -418,16 +419,14 @@ def padding(data, use_spk_embedding, mode='train', gan=False):
# only gan train needs speech, delete it to save memory
del batch["speech"]
del batch["speech_len"]
if mode == 'inference':
tts_text = [sample[i]['tts_text'] for i in order]
tts_index = [sample[i]['tts_index'] for i in order]
tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
batch.update({'tts_text': tts_text,
'tts_index': tts_index,
'tts_text_token': tts_text_token,
'tts_text_token_len': tts_text_token_len})
if dpo is True:
reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)
reject_speech_token = pad_sequence(reject_speech_token,
batch_first=True,
padding_value=0)
batch['reject_speech_token'] = reject_speech_token
batch['reject_speech_token_len'] = reject_speech_token_len
if use_spk_embedding is True:
batch["embedding"] = batch["spk_embedding"]
else:


@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -27,34 +28,11 @@ class Transpose(torch.nn.Module):
self.dim0 = dim0
self.dim1 = dim1
def forward(self, x: torch.Tensor):
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = torch.transpose(x, self.dim0, self.dim1)
return x
class CausalBlock1D(Block1D):
def __init__(self, dim: int, dim_out: int):
super(CausalBlock1D, self).__init__(dim, dim_out)
self.block = torch.nn.Sequential(
CausalConv1d(dim, dim_out, 3),
Transpose(1, 2),
nn.LayerNorm(dim_out),
Transpose(1, 2),
nn.Mish(),
)
def forward(self, x: torch.Tensor, mask: torch.Tensor):
output = self.block(x * mask)
return output * mask
class CausalResnetBlock1D(ResnetBlock1D):
def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
self.block1 = CausalBlock1D(dim, dim_out)
self.block2 = CausalBlock1D(dim_out, dim_out)
class CausalConv1d(torch.nn.Conv1d):
def __init__(
self,
@@ -76,20 +54,42 @@ class CausalConv1d(torch.nn.Conv1d):
padding_mode=padding_mode,
device=device, dtype=dtype)
assert stride == 1
self.causal_padding = (kernel_size - 1, 0)
self.causal_padding = kernel_size - 1
def forward(self, x: torch.Tensor):
x = F.pad(x, self.causal_padding)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = F.pad(x, (self.causal_padding, 0), value=0.0)
x = super(CausalConv1d, self).forward(x)
return x
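CausalConv1d left-pads by kernel_size - 1, so the output keeps the input length and frame t only depends on frames <= t; a quick shape check with a plain Conv1d standing in for the subclass (weights are random, numbers are illustrative):

import torch
import torch.nn.functional as F

x = torch.randn(1, 80, 10)              # (batch, channels, time)
kernel_size = 3
conv = torch.nn.Conv1d(80, 80, kernel_size)
y = conv(F.pad(x, (kernel_size - 1, 0), value=0.0))
print(y.shape)                          # torch.Size([1, 80, 10]), same length as the input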
class CausalBlock1D(Block1D):
def __init__(self, dim: int, dim_out: int):
super(CausalBlock1D, self).__init__(dim, dim_out)
self.block = torch.nn.Sequential(
CausalConv1d(dim, dim_out, 3),
Transpose(1, 2),
nn.LayerNorm(dim_out),
Transpose(1, 2),
nn.Mish(),
)
def forward(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
output = self.block(x * mask)
return output * mask
class CausalResnetBlock1D(ResnetBlock1D):
def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
self.block1 = CausalBlock1D(dim, dim_out)
self.block2 = CausalBlock1D(dim_out, dim_out)
class ConditionalDecoder(nn.Module):
def __init__(
self,
in_channels,
out_channels,
causal=False,
channels=(256, 256),
dropout=0.05,
attention_head_dim=64,
@@ -106,7 +106,7 @@ class ConditionalDecoder(nn.Module):
channels = tuple(channels)
self.in_channels = in_channels
self.out_channels = out_channels
self.causal = causal
self.time_embeddings = SinusoidalPosEmb(in_channels)
time_embed_dim = channels[0] * 4
self.time_mlp = TimestepEmbedding(
@@ -123,8 +123,7 @@ class ConditionalDecoder(nn.Module):
input_channel = output_channel
output_channel = channels[i]
is_last = i == len(channels) - 1
resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
transformer_blocks = nn.ModuleList(
[
BasicTransformerBlock(
@@ -138,16 +137,14 @@ class ConditionalDecoder(nn.Module):
]
)
downsample = (
Downsample1D(output_channel) if not is_last else
CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
)
self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
for _ in range(num_mid_blocks):
input_channel = channels[-1]
out_channels = channels[-1]
resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
transformer_blocks = nn.ModuleList(
[
@@ -169,11 +166,7 @@ class ConditionalDecoder(nn.Module):
input_channel = channels[i] * 2
output_channel = channels[i + 1]
is_last = i == len(channels) - 2
resnet = CausalResnetBlock1D(
dim=input_channel,
dim_out=output_channel,
time_emb_dim=time_embed_dim,
) if self.causal else ResnetBlock1D(
resnet = ResnetBlock1D(
dim=input_channel,
dim_out=output_channel,
time_emb_dim=time_embed_dim,
@@ -193,10 +186,10 @@ class ConditionalDecoder(nn.Module):
upsample = (
Upsample1D(output_channel, use_conv_transpose=True)
if not is_last
else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
else nn.Conv1d(output_channel, output_channel, 3, padding=1)
)
self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
self.final_block = Block1D(channels[-1], channels[-1])
self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
self.initialize_weights()
@@ -214,7 +207,7 @@ class ConditionalDecoder(nn.Module):
if m.bias is not None:
nn.init.constant_(m.bias, 0)
def forward(self, x, mask, mu, t, spks=None, cond=None):
def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
"""Forward pass of the UNet1DConditional model.
Args:
@@ -249,9 +242,8 @@ class ConditionalDecoder(nn.Module):
mask_down = masks[-1]
x = resnet(x, mask_down, t)
x = rearrange(x, "b c t -> b t c").contiguous()
# attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
attn_mask = mask_to_bias(attn_mask, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,
@@ -268,9 +260,8 @@ class ConditionalDecoder(nn.Module):
for resnet, transformer_blocks in self.mid_blocks:
x = resnet(x, mask_mid, t)
x = rearrange(x, "b c t -> b t c").contiguous()
# attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
attn_mask = mask_to_bias(attn_mask, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,
@@ -285,9 +276,211 @@ class ConditionalDecoder(nn.Module):
x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
x = resnet(x, mask_up, t)
x = rearrange(x, "b c t -> b t c").contiguous()
# attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
attn_mask = mask_to_bias(attn_mask, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,
attention_mask=attn_mask,
timestep=t,
)
x = rearrange(x, "b t c -> b c t").contiguous()
x = upsample(x * mask_up)
x = self.final_block(x, mask_up)
output = self.final_proj(x * mask_up)
return output * mask
class CausalConditionalDecoder(ConditionalDecoder):
def __init__(
self,
in_channels,
out_channels,
channels=(256, 256),
dropout=0.05,
attention_head_dim=64,
n_blocks=1,
num_mid_blocks=2,
num_heads=4,
act_fn="snake",
static_chunk_size=50,
num_decoding_left_chunks=2,
):
"""
This decoder requires an input with the same shape as the target. So, if your text content
is shorter or longer than the output, please re-sample it before feeding it to the decoder.
"""
torch.nn.Module.__init__(self)
channels = tuple(channels)
self.in_channels = in_channels
self.out_channels = out_channels
self.time_embeddings = SinusoidalPosEmb(in_channels)
time_embed_dim = channels[0] * 4
self.time_mlp = TimestepEmbedding(
in_channels=in_channels,
time_embed_dim=time_embed_dim,
act_fn="silu",
)
self.static_chunk_size = static_chunk_size
self.num_decoding_left_chunks = num_decoding_left_chunks
self.down_blocks = nn.ModuleList([])
self.mid_blocks = nn.ModuleList([])
self.up_blocks = nn.ModuleList([])
output_channel = in_channels
for i in range(len(channels)): # pylint: disable=consider-using-enumerate
input_channel = output_channel
output_channel = channels[i]
is_last = i == len(channels) - 1
resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
transformer_blocks = nn.ModuleList(
[
BasicTransformerBlock(
dim=output_channel,
num_attention_heads=num_heads,
attention_head_dim=attention_head_dim,
dropout=dropout,
activation_fn=act_fn,
)
for _ in range(n_blocks)
]
)
downsample = (
Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3)
)
self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
for _ in range(num_mid_blocks):
input_channel = channels[-1]
out_channels = channels[-1]
resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
transformer_blocks = nn.ModuleList(
[
BasicTransformerBlock(
dim=output_channel,
num_attention_heads=num_heads,
attention_head_dim=attention_head_dim,
dropout=dropout,
activation_fn=act_fn,
)
for _ in range(n_blocks)
]
)
self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
channels = channels[::-1] + (channels[0],)
for i in range(len(channels) - 1):
input_channel = channels[i] * 2
output_channel = channels[i + 1]
is_last = i == len(channels) - 2
resnet = CausalResnetBlock1D(
dim=input_channel,
dim_out=output_channel,
time_emb_dim=time_embed_dim,
)
transformer_blocks = nn.ModuleList(
[
BasicTransformerBlock(
dim=output_channel,
num_attention_heads=num_heads,
attention_head_dim=attention_head_dim,
dropout=dropout,
activation_fn=act_fn,
)
for _ in range(n_blocks)
]
)
upsample = (
Upsample1D(output_channel, use_conv_transpose=True)
if not is_last
else CausalConv1d(output_channel, output_channel, 3)
)
self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
self.final_block = CausalBlock1D(channels[-1], channels[-1])
self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
self.initialize_weights()
def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
"""Forward pass of the UNet1DConditional model.
Args:
x (torch.Tensor): shape (batch_size, in_channels, time)
mask (_type_): shape (batch_size, 1, time)
t (_type_): shape (batch_size)
spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
cond (_type_, optional): placeholder for future use. Defaults to None.
Raises:
ValueError: _description_
ValueError: _description_
Returns:
_type_: _description_
"""
t = self.time_embeddings(t).to(t.dtype)
t = self.time_mlp(t)
x = pack([x, mu], "b * t")[0]
if spks is not None:
spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
x = pack([x, spks], "b * t")[0]
if cond is not None:
x = pack([x, cond], "b * t")[0]
hiddens = []
masks = [mask]
for resnet, transformer_blocks, downsample in self.down_blocks:
mask_down = masks[-1]
x = resnet(x, mask_down, t)
x = rearrange(x, "b c t -> b t c").contiguous()
if streaming is True:
attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
else:
attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
attn_mask = mask_to_bias(attn_mask, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,
attention_mask=attn_mask,
timestep=t,
)
x = rearrange(x, "b t c -> b c t").contiguous()
hiddens.append(x) # Save hidden states for skip connections
x = downsample(x * mask_down)
masks.append(mask_down[:, :, ::2])
masks = masks[:-1]
mask_mid = masks[-1]
for resnet, transformer_blocks in self.mid_blocks:
x = resnet(x, mask_mid, t)
x = rearrange(x, "b c t -> b t c").contiguous()
if streaming is True:
attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
else:
attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
attn_mask = mask_to_bias(attn_mask, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,
attention_mask=attn_mask,
timestep=t,
)
x = rearrange(x, "b t c -> b c t").contiguous()
for resnet, transformer_blocks, upsample in self.up_blocks:
mask_up = masks.pop()
skip = hiddens.pop()
x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
x = resnet(x, mask_up, t)
x = rearrange(x, "b c t -> b t c").contiguous()
if streaming is True:
attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
else:
attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
attn_mask = mask_to_bias(attn_mask, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,


@@ -91,7 +91,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
conds = conds.transpose(1, 2)
mask = (~make_pad_mask(feat_len)).to(h)
feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
# NOTE this is unnecessary, feat and h already have the same shape
loss, _ = self.decoder.compute_loss(
feat.transpose(1, 2).contiguous(),
mask.unsqueeze(1),
@@ -111,16 +111,12 @@ class MaskedDiffWithXvec(torch.nn.Module):
prompt_feat_len,
embedding,
flow_cache):
if self.fp16 is True:
prompt_feat = prompt_feat.half()
embedding = embedding.half()
assert token.shape[0] == 1
# xvec projection
embedding = F.normalize(embedding, dim=1)
embedding = self.spk_embed_affine_layer(embedding)
# concat text and prompt_text
# concat speech token and prompt speech token
token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
@@ -145,7 +141,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
cond=conds,
n_timesteps=10,
prompt_len=mel_len1,
flow_cache=flow_cache
cache=flow_cache
)
feat = feat[:, :, mel_len1:]
assert feat.shape[2] == mel_len2
@@ -190,6 +186,52 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
self.token_mel_ratio = token_mel_ratio
self.pre_lookahead_len = pre_lookahead_len
def forward(
self,
batch: dict,
device: torch.device,
) -> Dict[str, Optional[torch.Tensor]]:
token = batch['speech_token'].to(device)
token_len = batch['speech_token_len'].to(device)
feat = batch['speech_feat'].to(device)
feat_len = batch['speech_feat_len'].to(device)
embedding = batch['embedding'].to(device)
# NOTE unified training, static_chunk_size > 0 or = 0
streaming = True if random.random() < 0.5 else False
# xvec projection
embedding = F.normalize(embedding, dim=1)
embedding = self.spk_embed_affine_layer(embedding)
# concat text and prompt_text
mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
token = self.input_embedding(torch.clamp(token, min=0)) * mask
# text encode
h, h_lengths = self.encoder(token, token_len, streaming=streaming)
h = self.encoder_proj(h)
# get conditions
conds = torch.zeros(feat.shape, device=token.device)
for i, j in enumerate(feat_len):
if random.random() < 0.5:
continue
index = random.randint(0, int(0.3 * j))
conds[i, :index] = feat[i, :index]
conds = conds.transpose(1, 2)
mask = (~make_pad_mask(h_lengths.sum(dim=-1).squeeze(dim=1))).to(h)
loss, _ = self.decoder.compute_loss(
feat.transpose(1, 2).contiguous(),
mask.unsqueeze(1),
h.transpose(1, 2).contiguous(),
embedding,
cond=conds,
streaming=streaming,
)
return {'loss': loss}
@torch.inference_mode()
def inference(self,
token,
@@ -199,11 +241,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
prompt_feat,
prompt_feat_len,
embedding,
streaming,
finalize):
if self.fp16 is True:
prompt_feat = prompt_feat.half()
embedding = embedding.half()
assert token.shape[0] == 1
# xvec projection
embedding = F.normalize(embedding, dim=1)
@@ -215,9 +254,11 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
token = self.input_embedding(torch.clamp(token, min=0)) * mask
# text encode
h, h_lengths = self.encoder(token, token_len)
if finalize is False:
h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
if finalize is True:
h, h_lengths = self.encoder(token, token_len, streaming=streaming)
else:
token, context = token[:, :-self.pre_lookahead_len], token[:, -self.pre_lookahead_len:]
h, h_lengths = self.encoder(token, token_len, context=context, streaming=streaming)
mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
h = self.encoder_proj(h)
@@ -232,7 +273,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
mask=mask.unsqueeze(1),
spks=embedding,
cond=conds,
n_timesteps=10
n_timesteps=10,
streaming=streaming
)
feat = feat[:, :, mel_len1:]
assert feat.shape[2] == mel_len2


@@ -1,4 +1,5 @@
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
# 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,10 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import threading
import torch
import torch.nn.functional as F
from matcha.models.components.flow_matching import BASECFM
from cosyvoice.utils.common import set_all_random_seed
class ConditionalCFM(BASECFM):
@@ -31,10 +32,9 @@ class ConditionalCFM(BASECFM):
in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
# Just change the architecture of the estimator here
self.estimator = estimator
self.lock = threading.Lock()
@torch.inference_mode()
def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)):
def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, cache=torch.zeros(1, 80, 0, 2)):
"""Forward diffusion
Args:
@@ -54,21 +54,21 @@ class ConditionalCFM(BASECFM):
"""
z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
cache_size = flow_cache.shape[2]
cache_size = cache.shape[2]
# fix prompt and overlap part mu and z
if cache_size != 0:
z[:, :, :cache_size] = flow_cache[:, :, :, 0]
mu[:, :, :cache_size] = flow_cache[:, :, :, 1]
z[:, :, :cache_size] = cache[:, :, :, 0]
mu[:, :, :cache_size] = cache[:, :, :, 1]
z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
flow_cache = torch.stack([z_cache, mu_cache], dim=-1)
cache = torch.stack([z_cache, mu_cache], dim=-1)
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
if self.t_scheduler == 'cosine':
t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache
return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), cache
def solve_euler(self, x, t_span, mu, mask, spks, cond):
def solve_euler(self, x, t_span, mu, mask, spks, cond, streaming=False):
"""
Fixed euler solver for ODEs.
Args:
@@ -109,7 +109,8 @@ class ConditionalCFM(BASECFM):
x_in, mask_in,
mu_in, t_in,
spks_in,
cond_in
cond_in,
streaming
)
dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
@@ -121,28 +122,36 @@ class ConditionalCFM(BASECFM):
return sol[-1].float()
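solve_euler evaluates the estimator on a batched conditional/unconditional pair and extrapolates the two with inference_cfg_rate; the same classifier-free-guidance arithmetic in isolation (the rate and tensors below are illustrative values only):

import torch

inference_cfg_rate = 0.7                      # illustrative value
dphi_dt = torch.tensor([1.0, 2.0])            # conditional estimator output
cfg_dphi_dt = torch.tensor([0.5, 1.0])        # unconditional (CFG) output
guided = (1.0 + inference_cfg_rate) * dphi_dt - inference_cfg_rate * cfg_dphi_dt
print(guided)                                 # tensor([1.3500, 2.7000])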
def forward_estimator(self, x, mask, mu, t, spks, cond):
def forward_estimator(self, x, mask, mu, t, spks, cond, streaming=False):
if isinstance(self.estimator, torch.nn.Module):
return self.estimator.forward(x, mask, mu, t, spks, cond)
return self.estimator(x, mask, mu, t, spks, cond, streaming=streaming)
else:
with self.lock:
self.estimator.set_input_shape('x', (2, 80, x.size(2)))
self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
self.estimator.set_input_shape('t', (2,))
self.estimator.set_input_shape('spks', (2, 80))
self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
[estimator, stream], trt_engine = self.estimator.acquire_estimator()
# NOTE need to synchronize when switching stream
torch.cuda.current_stream().synchronize()
with stream:
estimator.set_input_shape('x', (2, 80, x.size(2)))
estimator.set_input_shape('mask', (2, 1, x.size(2)))
estimator.set_input_shape('mu', (2, 80, x.size(2)))
estimator.set_input_shape('t', (2,))
estimator.set_input_shape('spks', (2, 80))
estimator.set_input_shape('cond', (2, 80, x.size(2)))
data_ptrs = [x.contiguous().data_ptr(),
mask.contiguous().data_ptr(),
mu.contiguous().data_ptr(),
t.contiguous().data_ptr(),
spks.contiguous().data_ptr(),
cond.contiguous().data_ptr(),
x.data_ptr()]
for i, j in enumerate(data_ptrs):
estimator.set_tensor_address(trt_engine.get_tensor_name(i), j)
# run trt engine
self.estimator.execute_v2([x.contiguous().data_ptr(),
mask.contiguous().data_ptr(),
mu.contiguous().data_ptr(),
t.contiguous().data_ptr(),
spks.contiguous().data_ptr(),
cond.contiguous().data_ptr(),
x.data_ptr()])
assert estimator.execute_async_v3(torch.cuda.current_stream().cuda_stream) is True
torch.cuda.current_stream().synchronize()
self.estimator.release_estimator(estimator, stream)
return x
def compute_loss(self, x1, mask, mu, spks=None, cond=None):
def compute_loss(self, x1, mask, mu, spks=None, cond=None, streaming=False):
"""Computes diffusion loss
Args:
@@ -179,7 +188,7 @@ class ConditionalCFM(BASECFM):
spks = spks * cfg_mask.view(-1, 1)
cond = cond * cfg_mask.view(-1, 1, 1)
pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
return loss, y
@@ -187,10 +196,11 @@ class ConditionalCFM(BASECFM):
class CausalConditionalCFM(ConditionalCFM):
def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator)
set_all_random_seed(0)
self.rand_noise = torch.randn([1, 80, 50 * 300])
@torch.inference_mode()
def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, streaming=False):
"""Forward diffusion
Args:
@@ -214,4 +224,4 @@ class CausalConditionalCFM(ConditionalCFM):
t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
if self.t_scheduler == 'cosine':
t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond, streaming=streaming), None


@@ -51,6 +51,7 @@ class InterpolateRegulator(nn.Module):
def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
# in inference mode, interpolate the prompt token and token (head/mid/tail) separately, so we can get a clear separation point in the mel
# NOTE 20 corresponds to token_overlap_len in cosyvoice/cli/model.py
# x in (B, T, D)
if x2.shape[1] > 40:
x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')


@@ -1,10 +1,16 @@
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import weight_norm
import torch.nn.functional as F
try:
from torch.nn.utils.parametrizations import weight_norm, spectral_norm
except ImportError:
from torch.nn.utils import weight_norm, spectral_norm
from typing import List, Optional, Tuple
from einops import rearrange
from torchaudio.transforms import Spectrogram
LRELU_SLOPE = 0.1
class MultipleDiscriminator(nn.Module):
def __init__(
@@ -138,3 +144,87 @@ class DiscriminatorR(nn.Module):
x += h
return x, fmap
class MultiResSpecDiscriminator(torch.nn.Module):
def __init__(self,
fft_sizes=[1024, 2048, 512],
hop_sizes=[120, 240, 50],
win_lengths=[600, 1200, 240],
window="hann_window"):
super(MultiResSpecDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window),
SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window),
SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window)])
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for _, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
def stft(x, fft_size, hop_size, win_length, window):
"""Perform STFT and convert to magnitude spectrogram.
Args:
x (Tensor): Input signal tensor (B, T).
fft_size (int): FFT size.
hop_size (int): Hop size.
win_length (int): Window length.
window (str): Window function type.
Returns:
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
"""
x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=True)
# NOTE(kan-bayashi): clamp is needed to avoid nan or inf
return torch.abs(x_stft).transpose(2, 1)
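The helper above is only called by SpecDiscriminator below, but it is easy to exercise on its own; the signal length here is arbitrary:

import torch

signal = torch.randn(2, 16000)                     # (B, T) batch of raw audio
window = torch.hann_window(600)
mag = stft(signal, fft_size=1024, hop_size=120, win_length=600, window=window)
print(mag.shape)                                   # (2, #frames, 513)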
class SpecDiscriminator(nn.Module):
"""docstring for Discriminator."""
def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", use_spectral_norm=False):
super(SpecDiscriminator, self).__init__()
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.fft_size = fft_size
self.shift_size = shift_size
self.win_length = win_length
self.window = getattr(torch, window)(win_length)
self.discriminators = nn.ModuleList([
norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
])
self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))
def forward(self, y):
fmap = []
y = y.squeeze(1)
y = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(y.device))
y = y.unsqueeze(1)
for _, d in enumerate(self.discriminators):
y = d(y)
y = F.leaky_relu(y, LRELU_SLOPE)
fmap.append(y)
y = self.out(y)
fmap.append(y)
return torch.flatten(y, 1, -1), fmap
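A hedged sketch of driving the multi-resolution discriminator on a real/generated pair; the audio tensors are random stand-ins and the shapes are illustrative:

import torch

disc = MultiResSpecDiscriminator()
real = torch.randn(2, 1, 16000)         # (B, 1, T) real speech
fake = torch.randn(2, 1, 16000)         # (B, 1, T) generated speech
y_d_rs, y_d_gs, fmap_rs, fmap_gs = disc(real, fake)
print(len(y_d_rs), y_d_rs[0].shape)     # 3 resolutions, flattened logits per sample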


@@ -13,7 +13,10 @@
# limitations under the License.
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import weight_norm
try:
from torch.nn.utils.parametrizations import weight_norm
except ImportError:
from torch.nn.utils import weight_norm
class ConvRNNF0Predictor(nn.Module):


@@ -23,7 +23,10 @@ import torch.nn.functional as F
from torch.nn import Conv1d
from torch.nn import ConvTranspose1d
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm
try:
from torch.nn.utils.parametrizations import weight_norm
except ImportError:
from torch.nn.utils import weight_norm
from torch.distributions.uniform import Uniform
from cosyvoice.transformer.activation import Snake
@@ -220,6 +223,172 @@ class SourceModuleHnNSF(torch.nn.Module):
return sine_merge, noise, uv
class SineGen2(torch.nn.Module):
""" Definition of sine generator
SineGen(samp_rate, harmonic_num = 0,
sine_amp = 0.1, noise_std = 0.003,
voiced_threshold = 0,
flag_for_pulse=False)
samp_rate: sampling rate in Hz
harmonic_num: number of harmonic overtones (default 0)
sine_amp: amplitude of sine-waveform (default 0.1)
noise_std: std of Gaussian noise (default 0.003)
voiced_threshold: F0 threshold for U/V classification (default 0)
flag_for_pulse: this SineGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced
segment is always sin(np.pi) or cos(0)
"""
def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
sine_amp=0.1, noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False):
super(SineGen2, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
self.harmonic_num = harmonic_num
self.dim = self.harmonic_num + 1
self.sampling_rate = samp_rate
self.voiced_threshold = voiced_threshold
self.flag_for_pulse = flag_for_pulse
self.upsample_scale = upsample_scale
def _f02uv(self, f0):
# generate uv signal
uv = (f0 > self.voiced_threshold).type(torch.float32)
return uv
def _f02sine(self, f0_values):
""" f0_values: (batchsize, length, dim)
where dim indicates fundamental tone and overtones
"""
# convert to F0 in rad. The integer part n can be ignored
# because 2 * np.pi * n doesn't affect phase
rad_values = (f0_values / self.sampling_rate) % 1
# initial phase noise (no noise for fundamental component)
rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
# instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
if not self.flag_for_pulse:
rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
scale_factor=1 / self.upsample_scale,
mode="linear").transpose(1, 2)
phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
sines = torch.sin(phase)
else:
# If necessary, make sure that the first time step of every
# voiced segments is sin(pi) or cos(0)
# This is used for pulse-train generation
# identify the last time step in unvoiced segments
uv = self._f02uv(f0_values)
uv_1 = torch.roll(uv, shifts=-1, dims=1)
uv_1[:, -1, :] = 1
u_loc = (uv < 1) * (uv_1 > 0)
# get the instantaneous phase
tmp_cumsum = torch.cumsum(rad_values, dim=1)
# different batch needs to be processed differently
for idx in range(f0_values.shape[0]):
temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
# stores the accumulation of i.phase within
# each voiced segments
tmp_cumsum[idx, :, :] = 0
tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
# rad_values - tmp_cumsum: remove the accumulation of i.phase
# within the previous voiced segment.
i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
# get the sines
sines = torch.cos(i_phase * 2 * np.pi)
return sines
def forward(self, f0):
""" sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
output uv: tensor(batchsize=1, length, 1)
"""
# fundamental component
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
# generate sine waveforms
sine_waves = self._f02sine(fn) * self.sine_amp
# generate uv signal
uv = self._f02uv(f0)
# noise: for unvoiced should be similar to sine_amp
# std = self.sine_amp/3 -> max value ~ self.sine_amp
# . for voiced regions is self.noise_std
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
# first: set the unvoiced part to 0 by uv
# then: additive noise
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
class SourceModuleHnNSF2(torch.nn.Module):
""" SourceModule for hn-nsf
SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0)
sampling_rate: sampling_rate in Hz
harmonic_num: number of harmonic above F0 (default: 0)
sine_amp: amplitude of sine source signal (default: 0.1)
add_noise_std: std of additive Gaussian noise (default: 0.003)
note that amplitude of noise in unvoiced is decided
by sine_amp
voiced_threshold: threshold to set U/V given F0 (default: 0)
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
F0_sampled (batchsize, length, 1)
Sine_source (batchsize, length, 1)
noise_source (batchsize, length 1)
uv (batchsize, length, 1)
"""
def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0):
super(SourceModuleHnNSF2, self).__init__()
self.sine_amp = sine_amp
self.noise_std = add_noise_std
# to produce sine waveforms
self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
sine_amp, add_noise_std, voiced_threshod)
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
def forward(self, x):
"""
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
F0_sampled (batchsize, length, 1)
Sine_source (batchsize, length, 1)
noise_source (batchsize, length 1)
"""
# source for harmonic branch
with torch.no_grad():
sine_wavs, uv, _ = self.l_sin_gen(x)
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
# source for noise branch, in the same shape as uv
noise = torch.randn_like(uv) * self.sine_amp / 3
return sine_merge, noise, uv
class HiFTGenerator(nn.Module):
"""
HiFTNet Generator: Neural Source Filter + ISTFTNet
@@ -256,7 +425,9 @@ class HiFTGenerator(nn.Module):
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.m_source = SourceModuleHnNSF(
# NOTE in CosyVoice2, we use the original SourceModuleHnNSF implementation
this_SourceModuleHnNSF = SourceModuleHnNSF if self.sampling_rate == 22050 else SourceModuleHnNSF2
self.m_source = this_SourceModuleHnNSF(
sampling_rate=sampling_rate,
upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
harmonic_num=nb_harmonics,


@@ -41,7 +41,7 @@ class HiFiGan(nn.Module):
loss_fm = feature_loss(fmap_rs, fmap_gs)
loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
if self.tpr_loss_weight != 0:
loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
loss_tpr = tpr_loss(y_d_gs, y_d_rs, self.tpr_loss_tau)
else:
loss_tpr = torch.zeros(1).to(device)
loss_f0 = F.l1_loss(generated_f0, pitch_feat)
@@ -56,7 +56,7 @@ class HiFiGan(nn.Module):
with torch.no_grad():
generated_speech, generated_f0 = self.generator(batch, device)
# 2. calculate discriminator outputs
y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech.detach())
# 3. calculate discriminator losses, tpr losses [Optional]
loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
if self.tpr_loss_weight != 0:


@@ -1,4 +1,5 @@
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
# 2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li, Qihua)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,6 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import queue
import random
import time
import threading
from typing import Dict, Optional, Callable, List, Generator
import torch
from torch import nn
@@ -21,6 +26,7 @@ from cosyvoice.utils.common import IGNORE_ID
from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
from cosyvoice.utils.common import th_accuracy
from cosyvoice.utils.file_utils import logging
from cosyvoice.utils.mask import make_pad_mask
class TransformerLM(torch.nn.Module):
@@ -168,10 +174,8 @@ class TransformerLM(torch.nn.Module):
sampling: int = 25,
max_token_text_ratio: float = 20,
min_token_text_ratio: float = 2,
uuid: str = '',
) -> Generator[torch.Tensor, None, None]:
if self.fp16 is True:
embedding = embedding.half()
device = text.device
text = torch.concat([prompt_text, text], dim=1)
text_len += prompt_text_len
@@ -229,6 +233,17 @@ class Qwen2Encoder(torch.nn.Module):
super().__init__()
self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path)
def forward(self, xs: torch.Tensor, xs_lens: torch.Tensor):
T = xs.size(1)
masks = ~make_pad_mask(xs_lens, T)
outs = self.model(
inputs_embeds=xs,
attention_mask=masks,
output_hidden_states=True,
return_dict=True,
)
return outs.hidden_states[-1], masks.unsqueeze(1)
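forward derives an attention mask from xs_lens before calling the Qwen2 backbone; an equivalent of ~make_pad_mask(xs_lens, T) written with plain torch ops, with made-up lengths (True marks valid frames, False marks padding):

import torch

xs_lens = torch.tensor([3, 5])
T = 5
masks = torch.arange(T).unsqueeze(0) < xs_lens.unsqueeze(1)
print(masks.int())
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1]])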
def forward_one_step(self, xs, masks, cache=None):
input_masks = masks[:, -1, :]
outs = self.model(
@@ -260,7 +275,6 @@ class Qwen2LM(TransformerLM):
self.llm_input_size = llm_input_size
self.llm_output_size = llm_output_size
self.speech_token_size = speech_token_size
# 2. build speech token language model related modules
self.sos_eos = 0
self.task_id = 1
@@ -283,6 +297,133 @@ class Qwen2LM(TransformerLM):
self.sampling = sampling
self.mix_ratio = mix_ratio
# 5. vllm related
self.stop_token_ids = [speech_token_size + i for i in range(3)]
self.vllm_output_queue = {}
def prepare_lm_input_target(self, text_token, text_token_emb, text_token_len, speech_token, speech_token_emb, speech_token_len):
lm_target, lm_input = [], []
text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
text_token_emb = unpad_sequence(text_token_emb, text_token_len.cpu(), batch_first=True)
speech_token_emb = unpad_sequence(speech_token_emb, speech_token_len.cpu(), batch_first=True)
for i in range(len(text_token)):
# bistream sequence
if random.random() < 0.5 and speech_token_len[i] / text_token_len[i] > self.mix_ratio[1] / self.mix_ratio[0]:
this_lm_target, this_lm_input = [], []
this_lm_target.append(IGNORE_ID)
this_lm_input.append(self.llm_embedding.weight[self.sos_eos].reshape(1, -1))
for j in range(((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()):
this_text_token = text_token[i][j * self.mix_ratio[0]: (j + 1) * self.mix_ratio[0]].tolist()
this_speech_token = speech_token[i][j * self.mix_ratio[1]: (j + 1) * self.mix_ratio[1]].tolist()
if len(this_text_token) == self.mix_ratio[0]:
assert len(this_speech_token) == self.mix_ratio[1]
this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
this_lm_target += this_speech_token
this_lm_target.append(self.speech_token_size + 2)
this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0]: (j + 1) * self.mix_ratio[0]])
this_lm_input.append(speech_token_emb[i][j * self.mix_ratio[1]: (j + 1) * self.mix_ratio[1]])
else:
this_lm_target += [-1] * len(this_text_token)
this_lm_target += speech_token[i][j * self.mix_ratio[1]:].tolist()
this_lm_target.append(self.speech_token_size)
this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0]:])
this_lm_input.append(self.llm_embedding.weight[self.task_id].reshape(1, -1))
this_lm_input.append(speech_token_emb[i][j * self.mix_ratio[1]:])
this_lm_target, this_lm_input = torch.tensor(this_lm_target), torch.concat(this_lm_input, dim=0)
# unistream sequence
else:
this_lm_target = torch.tensor([IGNORE_ID] * (1 + text_token_len[i]) + speech_token[i].tolist() + [self.speech_token_size])
this_lm_input = torch.concat([self.llm_embedding.weight[self.sos_eos].reshape(1, -1), text_token_emb[i],
self.llm_embedding.weight[self.task_id].reshape(1, -1), speech_token_emb[i]], dim=0)
lm_target.append(this_lm_target)
lm_input.append(this_lm_input)
lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID)
return lm_target, lm_input, lm_input_len
def forward(
self,
batch: dict,
device: torch.device,
) -> Dict[str, Optional[torch.Tensor]]:
"""
Args:
text: (B, L, D)
text_lengths: (B,)
audio: (B, T, N) or (B, T)
audio_lengths: (B,)
"""
text_token = batch['text_token'].to(device)
text_token_len = batch['text_token_len'].to(device)
speech_token = batch['speech_token'].to(device)
speech_token_len = batch['speech_token_len'].to(device)
# 1. encode text_token
text_token_emb = self.llm.model.model.embed_tokens(text_token)
# 2. encode speech_token
speech_token_emb = self.speech_embedding(speech_token)
# 3. prepare llm_input/target
lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(text_token, text_token_emb, text_token_len, speech_token, speech_token_emb, speech_token_len)
lm_target = lm_target.to(device)
# 4. run lm forward
lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
logits = self.llm_decoder(lm_output)
loss = self.criterion_ce(logits, lm_target.to(device))
acc = th_accuracy(logits.view(-1, self.speech_token_size + 3), lm_target, ignore_label=IGNORE_ID)
return {'loss': loss, 'acc': acc}
def forward_dpo(
self,
batch: dict,
device: torch.device,
) -> Dict[str, Optional[torch.Tensor]]:
text_token = batch['text_token'].to(device)
text_token_len = batch['text_token_len'].to(device)
speech_token = batch['speech_token'].to(device)
speech_token_len = batch['speech_token_len'].to(device)
reject_speech_token = batch['reject_speech_token'].to(device)
reject_speech_token_len = batch['reject_speech_token_len'].to(device)
# 1. encode text_token
text_token_emb = self.llm.model.model.embed_tokens(text_token)
# 2. encode speech_token
speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
reject_speech_token = unpad_sequence(reject_speech_token, reject_speech_token_len.cpu(), batch_first=True)
speech_token_combined = speech_token + reject_speech_token
speech_token_combined = pad_sequence(speech_token_combined, batch_first=True, padding_value=0)
speech_token_combined_len = torch.concat([speech_token_len, reject_speech_token_len], dim=0)
speech_token_combined_emb = self.speech_embedding(speech_token_combined)
# 3. prepare llm_input/target
lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(text_token.repeat(2, 1), text_token_emb.repeat(2, 1, 1), text_token_len.repeat(2),
speech_token_combined, speech_token_combined_emb, speech_token_combined_len)
lm_target = lm_target.to(device)
# 4. run lm forward
lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
logits = self.llm_decoder(lm_output)
chosen_logits = logits[:text_token.shape[0]]
rejected_logits = logits[text_token.shape[0]:]
chosen_lm_target = lm_target[:text_token.shape[0]]
rejected_lm_target = lm_target[text_token.shape[0]:]
loss = self.criterion_ce(chosen_logits, chosen_lm_target.to(device))
acc = th_accuracy(chosen_logits.view(-1, self.speech_token_size + 3), chosen_lm_target, ignore_label=IGNORE_ID)
# 5. calculate dpo logits
chosen_lm_mask = chosen_lm_target == IGNORE_ID
rejected_lm_mask = rejected_lm_target == IGNORE_ID
chosen_logps = torch.gather(chosen_logits.log_softmax(dim=-1), dim=2, index=chosen_lm_target.masked_fill(chosen_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1)
rejected_logps = torch.gather(rejected_logits.log_softmax(dim=-1), dim=2, index=rejected_lm_target.masked_fill(rejected_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1)
# average log-probs over valid (non-ignored) positions; note the rejected branch must use its own mask
chosen_logps = (chosen_logps * ~chosen_lm_mask).mean(dim=-1)
rejected_logps = (rejected_logps * ~rejected_lm_mask).mean(dim=-1)
return {'loss': loss, 'acc': acc, 'chosen_logps': chosen_logps, 'rejected_logps': rejected_logps}
@torch.inference_mode()
def inference(
self,
@@ -296,7 +437,7 @@ class Qwen2LM(TransformerLM):
sampling: int = 25,
max_token_text_ratio: float = 20,
min_token_text_ratio: float = 2,
vllm_codec_engine=None,
uuid: str = '',
) -> Generator[torch.Tensor, None, None]:
device = text.device
text = torch.concat([prompt_text, text], dim=1)
@@ -317,13 +458,47 @@ class Qwen2LM(TransformerLM):
max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
# 5. step by step decode
if vllm_codec_engine is None:
for token in self.inference_wrapper(lm_input, sampling, min_len, max_len, uuid):
yield token
@torch.inference_mode()
def inference_wrapper(self, lm_input, sampling, min_len, max_len, uuid):
if hasattr(self, 'vllm'):
from vllm import SamplingParams, RequestOutput
sampling_params = SamplingParams(top_k=sampling,
stop_token_ids=self.stop_token_ids,
min_tokens=min_len,
max_tokens=max_len)
with self.lock:
self.vllm.add_request(uuid, {"prompt_embeds": lm_input.squeeze(0).to(torch.bfloat16).to(lm_input.device)}, sampling_params)
self.vllm_output_queue[uuid] = queue.Queue()
out_tokens = []
while True:
with self.lock:
if self.vllm_output_queue[uuid].empty() is True:
request_outputs: List[RequestOutput] = self.vllm.step()
for request_output in request_outputs:
top_ids = list(request_output.outputs[0].token_ids)[-1]
self.vllm_output_queue[request_output.request_id].put(top_ids)
if self.vllm_output_queue[uuid].empty() is False:
top_ids = self.vllm_output_queue[uuid].get()
if top_ids in self.stop_token_ids:
break
# in stream mode, yield token one by one
yield top_ids
out_tokens.append(top_ids)
if len(out_tokens) == max_len:
break
time.sleep(0.001)
with self.lock:
self.vllm_output_queue.pop(uuid)
else:
out_tokens = []
cache = None
for i in range(max_len):
y_pred, cache = self.llm.forward_one_step(lm_input,
masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool),
cache=cache)
logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
if top_ids == self.speech_token_size:
@@ -334,34 +509,6 @@ class Qwen2LM(TransformerLM):
yield top_ids
out_tokens.append(top_ids)
lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
else:
from vllm import SamplingParams, RequestOutput
import uuid
sampling_params = SamplingParams(top_k=sampling,
stop_token_ids=[6561, 6563],
min_tokens=min_len,
max_tokens=max_len)
request_id = uuid.uuid4()
vllm_codec_engine.add_request(request_id,
{"prompt_embeds": lm_input.squeeze(0).to(torch.bfloat16).to(device)},
sampling_params)
while True:
speech_token_break = False
request_outputs: List[RequestOutput] = vllm_codec_engine.step()
for request_output in request_outputs:
if str(request_output.request_id) != str(request_id):
continue
# print(f"request output: {request_output}")
top_ids = list(request_output.outputs[0].token_ids)[-1]
if top_ids == self.speech_token_size:
speech_token_break = True
break
if top_ids > self.speech_token_size:
continue
yield top_ids
if not vllm_codec_engine.has_unfinished_requests() or speech_token_break:
break
@torch.inference_mode()
def inference_bistream(
@@ -423,8 +570,8 @@ class Qwen2LM(TransformerLM):
while True:
seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
y_pred, cache = self.llm.forward_one_step(lm_input,
masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
cache=cache)
logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
if next_fill_index != -1 and len(out_tokens) == next_fill_index:
top_ids = self.speech_token_size + 2
@@ -462,129 +609,3 @@ class Qwen2LM(TransformerLM):
# in stream mode, yield token one by one
yield top_ids
lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
@torch.inference_mode()
def inference_bistream_vllm(
self,
text: Generator,
prompt_text: torch.Tensor,
prompt_text_len: torch.Tensor,
prompt_speech_token: torch.Tensor,
prompt_speech_token_len: torch.Tensor,
embedding: torch.Tensor,
sampling: int = 25,
max_token_text_ratio: float = 20,
min_token_text_ratio: float = 2,
vllm_codec_engine=None,
) -> Generator[torch.Tensor, None, None]:
device = prompt_text.device
# 1. prepare input
sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
if prompt_speech_token_len != 0:
prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
else:
prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=prompt_text.dtype).to(device)
lm_input = torch.concat([sos_eos_emb], dim=1)
# 2. iterate text
out_tokens = []
cache = None
# NOTE: initialize text_cache with prompt_text, since the prompt_speech_token/prompt_text ratio is essentially never below 15/5
text_cache = self.llm.model.model.embed_tokens(prompt_text)
next_fill_index = -1
from vllm import SamplingParams, RequestOutput
import uuid
sampling_params = SamplingParams(top_k=sampling,
stop_token_ids=[6561, 6563],
max_tokens=10000)
for this_text in text:
text_cache = torch.concat([text_cache, self.llm.model.model.embed_tokens(this_text)], dim=1)
# prompt_speech_token_emb not empty, try append to lm_input
while prompt_speech_token_emb.size(1) != 0:
if text_cache.size(1) >= self.mix_ratio[0]:
lm_input_text, lm_input_speech = text_cache[:, :self.mix_ratio[0]], prompt_speech_token_emb[:, :self.mix_ratio[1]]
logging.info('append {} text token {} speech token'.format(lm_input_text.size(1), lm_input_speech.size(1)))
lm_input = torch.concat([lm_input, lm_input_text, lm_input_speech], dim=1)
text_cache, prompt_speech_token_emb = text_cache[:, self.mix_ratio[0]:], prompt_speech_token_emb[:, self.mix_ratio[1]:]
else:
logging.info('not enough text token to decode, wait for more')
break
# no prompt_speech_token_emb remain, can decode some speech token
if prompt_speech_token_emb.size(1) == 0:
if (len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2) or (len(out_tokens) == 0 and lm_input.size(1) == 1):
logging.info('get fill token, need to append more text token')
if text_cache.size(1) >= self.mix_ratio[0]:
lm_input_text = text_cache[:, :self.mix_ratio[0]]
logging.info('append {} text token'.format(lm_input_text.size(1)))
if vllm_codec_engine is None and len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2:
lm_input = lm_input_text
else:
lm_input = torch.concat([lm_input, lm_input_text], dim=1)
text_cache = text_cache[:, self.mix_ratio[0]:]
else:
logging.info('not enough text token to decode, wait for more')
continue
request_id = uuid.uuid4()
vllm_codec_engine.add_request(request_id,
{"prompt_embeds": lm_input.squeeze(0).to(torch.bfloat16).to(device)},
sampling_params)
## generator
while True:
speech_token_break = False
request_outputs: List[RequestOutput] = vllm_codec_engine.step()
for request_output in request_outputs:
if str(request_output.request_id) != str(request_id):
continue
# print(f"request output: {request_output}")
out_token = list(request_output.outputs[0].token_ids)[-1]
if next_fill_index != -1 and len(out_tokens) == next_fill_index:
top_ids = self.speech_token_size + 2
next_fill_index += (self.mix_ratio[1] + 1)
else:
top_ids = out_token
if top_ids == self.speech_token_size + 2:
next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1
logging.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
out_tokens.append(top_ids)
if top_ids >= self.speech_token_size:
if top_ids == self.speech_token_size + 2:
speech_token_break = True
break
else:
raise ValueError('should not get token {}'.format(top_ids))
yield top_ids
token_embedding = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
lm_input = torch.concat([lm_input, token_embedding], dim=1)
if not vllm_codec_engine.has_unfinished_requests() or speech_token_break:
break
# 3. final decode
lm_input = torch.concat([lm_input, text_cache, task_id_emb], dim=1)
logging.info('no more text token, decode until met eos')
request_id = uuid.uuid4()
vllm_codec_engine.add_request(request_id,
{"prompt_embeds": lm_input.squeeze(0).to(torch.bfloat16).to(device)},
sampling_params)
## generator
while True:
speech_token_break = False
request_outputs: List[RequestOutput] = vllm_codec_engine.step()
for request_output in request_outputs:
if str(request_output.request_id) != str(request_id):
continue
# print(f"request output: {request_output}")
top_ids = list(request_output.outputs[0].token_ids)[-1]
out_tokens.append(top_ids)
if top_ids >= self.speech_token_size:
if top_ids == self.speech_token_size:
speech_token_break = True
break
else:
raise ValueError('should not get token {}'.format(top_ids))
# in stream mode, yield token one by one
yield top_ids
if not vllm_codec_engine.has_unfinished_requests() or speech_token_break:
break

View File

@@ -287,8 +287,16 @@ class EspnetRelPositionalEncoding(torch.nn.Module):
Returns:
torch.Tensor: Corresponding encoding
"""
pos_emb = self.pe[
:,
self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
]
# How to subscript a Union type:
# https://github.com/pytorch/pytorch/issues/69434
if isinstance(offset, int):
pos_emb = self.pe[
:,
self.pe.size(1) // 2 - size - offset + 1: self.pe.size(1) // 2 + size + offset,
]
elif isinstance(offset, torch.Tensor):
pos_emb = self.pe[
:,
self.pe.size(1) // 2 - size - offset + 1: self.pe.size(1) // 2 + size + offset,
]
return pos_emb
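As a quick sanity check on the slice above (a sketch with assumed numbers): for an integer offset, the returned relative positional encoding spans 2 * (size + offset) - 1 positions, symmetric around the centre of self.pe.
# Assumed numbers for illustration: pe has length 2 * max_len - 1 with max_len = 5000
center = (2 * 5000 - 1) // 2                   # index of relative position 0 -> 4999
size, offset = 3, 2
start = center - size - offset + 1             # 4995
end = center + size + offset                   # 5004
assert end - start == 2 * (size + offset) - 1  # 9 relative positions centred on 0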

View File

@@ -56,7 +56,7 @@ class Upsample1D(nn.Module):
# In this mode, first repeat-interpolate, then conv with stride=1
self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
outputs = self.conv(outputs)
@@ -78,16 +78,22 @@ class PreLookaheadLayer(nn.Module):
kernel_size=3, stride=1, padding=0,
)
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
"""
inputs: (batch_size, seq_len, channels)
"""
outputs = inputs.transpose(1, 2).contiguous()
context = context.transpose(1, 2).contiguous()
# look ahead
outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
if context.size(2) == 0:
outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
else:
assert self.training is False, 'you have passed context, make sure you are running in inference mode'
assert context.size(2) == self.pre_lookahead_len
outputs = F.pad(torch.concat([outputs, context], dim=2), (0, self.pre_lookahead_len - context.size(2)), mode='constant', value=0.0)
outputs = F.leaky_relu(self.conv1(outputs))
# outputs
outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0)
outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
outputs = self.conv2(outputs)
outputs = outputs.transpose(1, 2).contiguous()
@@ -238,8 +244,10 @@ class UpsampleConformerEncoder(torch.nn.Module):
self,
xs: torch.Tensor,
xs_lens: torch.Tensor,
context: torch.Tensor = torch.zeros(0, 0, 0),
decoding_chunk_size: int = 0,
num_decoding_left_chunks: int = -1,
streaming: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Embed positions in tensor.
@@ -269,15 +277,14 @@ class UpsampleConformerEncoder(torch.nn.Module):
if self.global_cmvn is not None:
xs = self.global_cmvn(xs)
xs, pos_emb, masks = self.embed(xs, masks)
if context.size(1) != 0:
assert self.training is False, 'you have passed context, make sure you are running in inference mode'
context_masks = torch.ones(1, 1, context.size(1)).to(masks)
context, _, _ = self.embed(context, context_masks, offset=xs.size(1))
mask_pad = masks # (B, 1, T/subsample_rate)
chunk_masks = add_optional_chunk_mask(xs, masks,
self.use_dynamic_chunk,
self.use_dynamic_left_chunk,
decoding_chunk_size,
self.static_chunk_size,
num_decoding_left_chunks)
chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size if streaming is True else 0, -1)
# lookahead + conformer encoder
xs = self.pre_lookahead_layer(xs)
xs = self.pre_lookahead_layer(xs, context=context)
xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
# upsample + conformer encoder
@@ -288,12 +295,7 @@ class UpsampleConformerEncoder(torch.nn.Module):
masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
xs, pos_emb, masks = self.up_embed(xs, masks)
mask_pad = masks # (B, 1, T/subsample_rate)
chunk_masks = add_optional_chunk_mask(xs, masks,
self.use_dynamic_chunk,
self.use_dynamic_left_chunk,
decoding_chunk_size,
self.static_chunk_size * self.up_layer.stride,
num_decoding_left_chunks)
chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size * self.up_layer.stride if streaming is True else 0, -1)
xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad)
if self.normalize_before:

View File

@@ -1,5 +1,6 @@
# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
# 2024 Alibaba Inc (authors: Xiang Lyu)
# 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,6 +16,7 @@
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Unility functions for Transformer."""
import queue
import random
from typing import List
@@ -164,3 +166,21 @@ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
# chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
mask = (1.0 - mask) * -1.0e+10
return mask
class TrtContextWrapper:
def __init__(self, trt_engine, trt_concurrent=1, device='cuda:0'):
self.trt_context_pool = queue.Queue(maxsize=trt_concurrent)
self.trt_engine = trt_engine
for _ in range(trt_concurrent):
trt_context = trt_engine.create_execution_context()
trt_stream = torch.cuda.stream(torch.cuda.Stream(device))
assert trt_context is not None, 'failed to create trt context, maybe there is not enough CUDA memory; try reducing trt_concurrent (currently {})'.format(trt_concurrent)
self.trt_context_pool.put([trt_context, trt_stream])
assert self.trt_context_pool.empty() is False, 'no available estimator context'
def acquire_estimator(self):
return self.trt_context_pool.get(), self.trt_engine
def release_estimator(self, context, stream):
self.trt_context_pool.put([context, stream])
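A minimal usage sketch for the context pool above (assuming `engine` is an already deserialized tensorrt.ICudaEngine; the actual inference call is elided):
wrapper = TrtContextWrapper(engine, trt_concurrent=2, device='cuda:0')
[context, stream], trt_engine = wrapper.acquire_estimator()
try:
    with stream:
        ...  # bind tensor addresses on `context` and enqueue inference on this CUDA stream
finally:
    # always return the context/stream pair so concurrent callers are not starved
    wrapper.release_estimator(context, stream)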

View File

@@ -25,14 +25,16 @@ from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, l
class Executor:
def __init__(self, gan: bool = False):
def __init__(self, gan: bool = False, ref_model: torch.nn.Module = None, dpo_loss: torch.nn.Module = None):
self.gan = gan
self.ref_model = ref_model
self.dpo_loss = dpo_loss
self.step = 0
self.epoch = 0
self.rank = int(os.environ.get('RANK', 0))
self.device = torch.device('cuda:{}'.format(self.rank))
def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join):
def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=None):
''' Train one epoch
'''
@@ -44,6 +46,8 @@ class Executor:
# torch.nn.parallel.DistributedDataParallel to be able to train
# with uneven inputs across participating processes.
model.train()
if self.ref_model is not None:
self.ref_model.eval()
model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
with model_context():
for batch_idx, batch_dict in enumerate(train_data_loader):
@@ -65,7 +69,7 @@ class Executor:
context = nullcontext
with context():
info_dict = batch_forward(model, batch_dict, scaler, info_dict)
info_dict = batch_forward(model, batch_dict, scaler, info_dict, ref_model=self.ref_model, dpo_loss=self.dpo_loss)
info_dict = batch_backward(model, scaler, info_dict)
info_dict = update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict)

View File

@@ -1,5 +1,6 @@
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
# 2024 Alibaba Inc (authors: Xiang Lyu, Zetao Hu)
# 2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,7 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import torch
import torchaudio
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
@@ -47,13 +50,8 @@ def load_wav(wav, target_sr):
return speech
def convert_onnx_to_trt(trt_model, onnx_model, fp16):
def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
import tensorrt as trt
_min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2,), (2, 80), (2, 80, 4)]
_opt_shape = [(2, 80, 193), (2, 1, 193), (2, 80, 193), (2,), (2, 80), (2, 80, 193)]
_max_shape = [(2, 80, 6800), (2, 1, 6800), (2, 80, 6800), (2,), (2, 80), (2, 80, 6800)]
input_names = ["x", "mask", "mu", "t", "spks", "cond"]
logging.info("Converting onnx to trt...")
network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
logger = trt.Logger(trt.Logger.INFO)
@@ -61,7 +59,7 @@ def convert_onnx_to_trt(trt_model, onnx_model, fp16):
network = builder.create_network(network_flags)
parser = trt.OnnxParser(network, logger)
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 33) # 8GB
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 32) # 4GB
if fp16:
config.set_flag(trt.BuilderFlag.FP16)
profile = builder.create_optimization_profile()
@@ -72,8 +70,8 @@ def convert_onnx_to_trt(trt_model, onnx_model, fp16):
print(parser.get_error(error))
raise ValueError('failed to parse {}'.format(onnx_model))
# set input shapes
for i in range(len(input_names)):
profile.set_shape(input_names[i], _min_shape[i], _opt_shape[i], _max_shape[i])
for i in range(len(trt_kwargs['input_names'])):
profile.set_shape(trt_kwargs['input_names'][i], trt_kwargs['min_shape'][i], trt_kwargs['opt_shape'][i], trt_kwargs['max_shape'][i])
tensor_dtype = trt.DataType.HALF if fp16 else trt.DataType.FLOAT
# set input and output data type
for i in range(network.num_inputs):
@@ -87,3 +85,45 @@ def convert_onnx_to_trt(trt_model, onnx_model, fp16):
# save trt engine
with open(trt_model, "wb") as f:
f.write(engine_bytes)
logging.info("Succesfully convert onnx to trt...")
def export_cosyvoice2_vllm(model, model_path, device):
if os.path.exists(model_path):
return
pad_to = DEFAULT_VOCAB_PADDING_SIZE = 64
vocab_size = model.speech_embedding.num_embeddings
feature_size = model.speech_embedding.embedding_dim
pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
dtype = torch.bfloat16
# lm_head
new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=True)
with torch.no_grad():
new_lm_head.weight[:vocab_size] = model.llm_decoder.weight
new_lm_head.bias[:vocab_size] = model.llm_decoder.bias
new_lm_head.weight[vocab_size:] = 0
new_lm_head.bias[vocab_size:] = 0
model.llm.model.lm_head = new_lm_head
new_codec_embed = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size)
# embed_tokens
embed_tokens = model.llm.model.model.embed_tokens
with torch.no_grad():
new_codec_embed.weight[:vocab_size] = model.speech_embedding.weight
new_codec_embed.weight[vocab_size:] = 0
model.llm.model.set_input_embeddings(new_codec_embed)
model.llm.model.to(device)
model.llm.model.to(dtype)
tmp_vocab_size = model.llm.model.config.vocab_size
tmp_tie_embedding = model.llm.model.config.tie_word_embeddings
del model.llm.model.generation_config.eos_token_id
del model.llm.model.config.bos_token_id
del model.llm.model.config.eos_token_id
model.llm.model.config.vocab_size = pad_vocab_size
model.llm.model.config.tie_word_embeddings = False
model.llm.model.config.use_bias = True
model.llm.model.save_pretrained(model_path)
os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
model.llm.model.config.vocab_size = tmp_vocab_size
model.llm.model.config.tie_word_embeddings = tmp_tie_embedding
model.llm.model.set_input_embeddings(embed_tokens)
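The padding above mirrors vLLM's convention of rounding the vocab size up to a multiple of DEFAULT_VOCAB_PADDING_SIZE (64). A small sketch of the arithmetic, with an assumed vocab size:
# Assumed example: 6561 speech tokens plus a few special ids -> num_embeddings = 6564
vocab_size, pad_to = 6564, 64
pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
assert pad_vocab_size == 6592  # rows [vocab_size:pad_vocab_size) stay zero-filled above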

View File

@@ -1,5 +1,6 @@
import torch
import torch.nn.functional as F
from typing import Tuple
def tpr_loss(disc_real_outputs, disc_generated_outputs, tau):
@@ -18,3 +19,39 @@ def mel_loss(real_speech, generated_speech, mel_transforms):
mel_g = transform(generated_speech)
loss += F.l1_loss(mel_g, mel_r)
return loss
class DPOLoss(torch.nn.Module):
"""
DPO Loss
"""
def __init__(self, beta: float, label_smoothing: float = 0.0, ipo: bool = False) -> None:
super().__init__()
self.beta = beta
self.label_smoothing = label_smoothing
self.ipo = ipo
def forward(
self,
policy_chosen_logps: torch.Tensor,
policy_rejected_logps: torch.Tensor,
reference_chosen_logps: torch.Tensor,
reference_rejected_logps: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
pi_logratios = policy_chosen_logps - policy_rejected_logps
ref_logratios = reference_chosen_logps - reference_rejected_logps
logits = pi_logratios - ref_logratios
if self.ipo:
losses = (logits - 1 / (2 * self.beta)) ** 2 # Eq. 17 of https://arxiv.org/pdf/2310.12036v2.pdf
else:
# Eq. 3 https://ericmitchell.ai/cdpo.pdf; label_smoothing=0 gives original DPO (Eq. 7 of https://arxiv.org/pdf/2305.18290.pdf)
losses = (
-F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
- F.logsigmoid(-self.beta * logits) * self.label_smoothing
)
loss = losses.mean()
chosen_rewards = self.beta * (policy_chosen_logps - reference_chosen_logps).detach()
rejected_rewards = self.beta * (policy_rejected_logps - reference_rejected_logps).detach()
return loss, chosen_rewards, rejected_rewards
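A toy usage sketch of this loss with made-up per-sample sequence log-probabilities (the tensors below are illustrative, not taken from the recipe):
import torch

dpo_loss = DPOLoss(beta=0.1)
policy_chosen = torch.tensor([-10.0, -12.0])     # log p_theta(chosen)
policy_rejected = torch.tensor([-14.0, -13.0])   # log p_theta(rejected)
ref_chosen = torch.tensor([-11.0, -12.5])        # log p_ref(chosen)
ref_rejected = torch.tensor([-13.0, -12.8])      # log p_ref(rejected)
loss, chosen_reward, rejected_reward = dpo_loss(policy_chosen, policy_rejected, ref_chosen, ref_rejected)
# chosen_reward > rejected_reward where the policy prefers the chosen continuation
# more strongly than the reference does; this is what dpo_acc measures in batch_forward.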

View File

@@ -15,7 +15,6 @@
# limitations under the License.
import torch
from cosyvoice.utils.file_utils import logging
'''
def subsequent_mask(
size: int,
@@ -153,7 +152,6 @@ def subsequent_chunk_mask(
[1, 1, 1, 1]]
"""
# NOTE this modified implementation meets onnx export requirements, but it doesn't support num_left_chunks
# actually this is not needed after we have inference cache implemented, will remove it later
pos_idx = torch.arange(size, device=device)
block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
@@ -233,8 +231,8 @@ def add_optional_chunk_mask(xs: torch.Tensor,
chunk_masks = masks
assert chunk_masks.dtype == torch.bool
if (chunk_masks.sum(dim=-1) == 0).sum().item() != 0:
logging.warning('get chunk_masks all false at some timestep, force set to true, make sure they are masked in future computation!')
chunk_masks[chunk_masks.sum(dim=-1)==0] = True
print('get chunk_masks all false at some timestep, force set to true, make sure they are masked in future computation!')
chunk_masks[chunk_masks.sum(dim=-1) == 0] = True
return chunk_masks

View File

@@ -50,10 +50,10 @@ def init_distributed(args):
return world_size, local_rank, rank
def init_dataset_and_dataloader(args, configs, gan):
def init_dataset_and_dataloader(args, configs, gan, dpo):
data_pipeline = configs['data_pipeline_gan'] if gan is True else configs['data_pipeline']
train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=True, partition=True)
cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=False, partition=False)
train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=True, partition=True)
cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, dpo=dpo, shuffle=False, partition=False)
# do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts
train_data_loader = DataLoader(train_dataset,
@@ -235,7 +235,7 @@ def cosyvoice_join(group_join, info_dict):
return False
def batch_forward(model, batch, scaler, info_dict):
def batch_forward(model, batch, scaler, info_dict, ref_model=None, dpo_loss=None):
device = int(os.environ.get('LOCAL_RANK', 0))
dtype = info_dict["dtype"]
@@ -253,6 +253,24 @@ def batch_forward(model, batch, scaler, info_dict):
with autocast:
info_dict['loss_dict'] = model(batch, device)
if ref_model is not None and dpo_loss is not None:
chosen_logps = info_dict['loss_dict']["chosen_logps"]
rejected_logps = info_dict['loss_dict']["rejected_logps"]
sft_loss = info_dict['loss_dict']['loss']
with torch.no_grad():
ref_loss_dict = ref_model(batch, device)
reference_chosen_logps = ref_loss_dict["chosen_logps"]
reference_rejected_logps = ref_loss_dict["rejected_logps"]
preference_loss, chosen_reward, reject_reward = dpo_loss(
chosen_logps, rejected_logps, reference_chosen_logps, reference_rejected_logps
)
dpo_acc = (chosen_reward > reject_reward).float().mean()
info_dict['loss_dict']["loss"] = preference_loss + sft_loss
info_dict['loss_dict']["sft_loss"] = sft_loss
info_dict['loss_dict']["dpo_loss"] = preference_loss
info_dict['loss_dict']["dpo_acc"] = dpo_acc
info_dict['loss_dict']["chosen_reward"] = chosen_reward.mean()
info_dict['loss_dict']["reject_reward"] = reject_reward.mean()
return info_dict
@@ -286,11 +304,15 @@ def update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict):
# optimizer.step().
if torch.isfinite(grad_norm):
scaler.step(optimizer)
else:
logging.warning('got non-finite grad_norm, check your code/data if this appears frequently')
scaler.update()
else:
grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
if torch.isfinite(grad_norm):
optimizer.step()
else:
logging.warning('got non-finite grad_norm, check your code/data if this appears frequently')
optimizer.zero_grad()
scheduler.step()
info_dict["lr"] = optimizer.param_groups[0]['lr']
@@ -336,7 +358,7 @@ def log_per_save(writer, info_dict):
rank = int(os.environ.get('RANK', 0))
logging.info(
'Epoch {} Step {} CV info lr {} {} rank {}'.format(
epoch, step + 1, lr, rank, ' '.join(['{}_{}'.format(k, v) for k, v in loss_dict.items()])))
epoch, step + 1, lr, rank, ' '.join(['{} {}'.format(k, v) for k, v in loss_dict.items()])))
if writer is not None:
for k in ['epoch', 'lr']:

View File

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen2 model compatible with HuggingFace weights."""
from vllm.model_executor.models.qwen2 import *
class CosyVoice2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
"k_proj",
"v_proj",
],
"gate_up_proj": [
"gate_proj",
"up_proj",
],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config
self.lora_config = lora_config
self.quant_config = quant_config
self.model = Qwen2Model(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
if get_pp_group().is_last_rank:
if config.tie_word_embeddings:
self.lm_head = self.model.embed_tokens
else:
self.lm_head = ParallelLMHead(config.vocab_size,
config.hidden_size,
True,
quant_config=quant_config,
prefix=maybe_prefix(
prefix, "lm_head"))
else:
self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor(config.vocab_size)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.get_input_embeddings(input_ids)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
hidden_states = self.model(input_ids, positions, intermediate_tensors,
inputs_embeds)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata, self.lm_head.bias)
return logits
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(
self,
skip_prefixes=(["lm_head."]
if self.config.tie_word_embeddings else None),
)
return loader.load_weights(weights)
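A hedged sketch of how such an out-of-tree architecture is usually registered with vLLM before the engine is built (the exact call site is not shown in this diff):
# Assumes vLLM's model-registration API; register the class name that
# export_cosyvoice2_vllm writes into config.json ("CosyVoice2ForCausalLM") so the
# engine can resolve it when loading the exported model directory.
from vllm import ModelRegistry
ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)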

View File

@@ -147,7 +147,7 @@ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
generator: !ref <hift>
discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
mel_spec_transform: [
!ref <mel_spec_transform1>
]

View File

@@ -147,7 +147,7 @@ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
generator: !ref <hift>
discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
mel_spec_transform: [
!ref <mel_spec_transform1>
]

View File

@@ -5,70 +5,40 @@ __set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
# fixed params
sample_rate: 22050
text_encoder_input_size: 512
llm_input_size: 1024
llm_output_size: 1024
sample_rate: 24000 # 16000 for llm, 24000 for cfm
llm_input_size: 896
llm_output_size: 896
spk_embed_dim: 192
qwen_pretrain_path: 'CosyVoice2-0.5B/CosyVoice-BlankEN'
# model params
# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from one single yaml.
# for system/third_party class/function, we do not require this.
llm: !new:cosyvoice.llm.llm.TransformerLM
text_encoder_input_size: !ref <text_encoder_input_size>
llm: !new:cosyvoice.llm.llm_dpo.Qwen2LM
llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size>
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
speech_token_size: 4096
speech_token_size: 6561
length_normalized_loss: True
lsm_weight: 0
spk_embed_dim: !ref <spk_embed_dim>
text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
input_size: !ref <text_encoder_input_size>
output_size: 1024
attention_heads: 16
linear_units: 4096
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
normalize_before: True
input_layer: 'linear'
pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn'
use_cnn_module: False
macaron_style: False
use_dynamic_chunk: False
use_dynamic_left_chunk: False
static_chunk_size: 1
llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
input_size: !ref <llm_input_size>
output_size: !ref <llm_output_size>
attention_heads: 16
linear_units: 4096
num_blocks: 14
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: 'linear_legacy'
pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn'
static_chunk_size: 1
dpo: True
llm: !new:cosyvoice.llm.llm.Qwen2Encoder
pretrain_path: !ref <qwen_pretrain_path>
sampling: !name:cosyvoice.utils.common.ras_sampling
top_p: 0.8
top_k: 25
win_size: 10
tau_r: 0.1
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
input_size: 512
output_size: 80
spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel'
vocab_size: 4096
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
vocab_size: 6561
input_frame_rate: 25
only_mask_loss: True
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
token_mel_ratio: 2
pre_lookahead_len: 3
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
output_size: 512
attention_heads: 8
linear_units: 2048
@@ -83,10 +53,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
input_size: 512
use_cnn_module: False
macaron_style: False
length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
channels: 80
sampling_ratios: [1, 1, 1, 1]
decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
in_channels: 240
n_spks: 1
spk_emb_dim: 80
@@ -101,7 +68,8 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
in_channels: 320
out_channels: 80
channels: [256, 256]
causal: True
channels: [256]
dropout: 0.0
attention_head_dim: 64
n_blocks: 4
@@ -117,15 +85,15 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
nsf_alpha: 0.1
nsf_sigma: 0.003
nsf_voiced_threshold: 10
upsample_rates: [8, 8]
upsample_kernel_sizes: [16, 16]
upsample_rates: [8, 5, 3]
upsample_kernel_sizes: [16, 11, 7]
istft_params:
n_fft: 16
hop_len: 4
resblock_kernel_sizes: [3, 7, 11]
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
source_resblock_kernel_sizes: [7, 11]
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
source_resblock_kernel_sizes: [7, 7, 11]
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
lrelu_slope: 0.1
audio_limit: 0.99
f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
@@ -133,6 +101,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
in_channels: 80
cond_channels: 512
# gan related module
mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
n_fft: 1024
num_mels: 80
sampling_rate: !ref <sample_rate>
hop_size: 256
win_size: 1024
fmin: 0
fmax: null
center: False
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
generator: !ref <hift>
discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
mel_spec_transform: [
!ref <mel_spec_transform1>
]
# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
@@ -151,6 +138,8 @@ filter: !name:cosyvoice.dataset.processor.filter
token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
resample_rate: !ref <sample_rate>
truncate: !name:cosyvoice.dataset.processor.truncate
truncate_length: 24576 # must be a multiple of hop_size
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
n_fft: 1024
num_mels: 80
@@ -162,6 +151,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
feat_extractor: !ref <feat_extractor>
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
sample_rate: !ref <sample_rate>
hop_size: 256
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
@@ -170,9 +162,10 @@ sort: !name:cosyvoice.dataset.processor.sort
sort_size: 500 # sort_size should be less than shuffle_size
batch: !name:cosyvoice.dataset.processor.batch
batch_type: 'dynamic'
max_frames_in_batch: 2000
max_frames_in_batch: 2000 # change to 1400 in gan train on v100 16g
padding: !name:cosyvoice.dataset.processor.padding
use_spk_embedding: False # change to True during sft
use_spk_embedding: True # change to True during sft
dpo: True
# dataset processor pipeline
data_pipeline: [
@@ -187,17 +180,47 @@ data_pipeline: [
!ref <batch>,
!ref <padding>,
]
data_pipeline_gan: [
!ref <parquet_opener>,
!ref <tokenize>,
!ref <filter>,
!ref <resample>,
!ref <truncate>,
!ref <compute_fbank>,
!ref <compute_f0>,
!ref <parse_embedding>,
!ref <shuffle>,
!ref <sort>,
!ref <batch>,
!ref <padding>,
]
# train conf
# llm flow train conf
train_conf:
optim: adam
optim_conf:
lr: 0.001 # change to 1e-5 during sft
lr: 0.00001 # change to 1e-5 during sft
scheduler: warmuplr # change to constantlr during sft
scheduler_conf:
warmup_steps: 2500
warmup_steps: 25000
max_epoch: 200
grad_clip: 5
accum_grad: 2
log_interval: 100
save_per_step: -1
# gan train conf
train_conf_gan:
optim: adam
optim_conf:
lr: 0.0002 # use small lr for gan training
scheduler: constantlr
optim_d: adam
optim_conf_d:
lr: 0.0002 # use small lr for gan training
scheduler_d: constantlr
max_epoch: 200
grad_clip: 5
accum_grad: 1 # in gan training, accum_grad must be 1
log_interval: 100
save_per_step: -1

View File

@@ -49,5 +49,7 @@ if __name__ == "__main__":
type=str)
parser.add_argument('--des_dir',
type=str)
parser.add_argument('--ref_model',
type=str)
args = parser.parse_args()
main()

View File

@@ -0,0 +1,50 @@
import argparse
import logging
import os
from tqdm import tqdm
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
logger = logging.getLogger()
def main():
cosyvoice = CosyVoice2(args.ref_model)
utt2wav, utt2text = {}, {}
with open('{}/wav.scp'.format(args.src_dir)) as f:
for l in f:
l = l.split('\n')[0].split()
utt2wav[l[0]] = l[1]
with open('{}/text'.format(args.src_dir)) as f:
for l in f:
l = l.split('\n')[0].split()
utt2text[l[0]] = ' '.join(l[1:])
os.makedirs('{}/wav'.format(args.des_dir), exist_ok=True)
with open('{}/wav.scp'.format(args.des_dir), 'w') as f:
for utt, wav in tqdm(utt2wav.items()):
prompt_speech_16k = load_wav(wav, 16000)
if prompt_speech_16k.shape[1] >= 30 * 16000:
continue
speech_list = []
for _, j in enumerate(cosyvoice.inference_zero_shot(utt2text[utt], utt2text[utt], prompt_speech_16k, stream=False, text_frontend=False)):
speech_list.append(j['tts_speech'])
negative_wav = os.path.abspath('{}/wav/{}'.format(args.des_dir, os.path.basename(wav)))
torchaudio.save(negative_wav, torch.concat(speech_list, dim=1), cosyvoice.sample_rate, backend='soundfile')
f.write('{} {}\n'.format(utt, negative_wav))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--src_dir',
type=str)
parser.add_argument('--des_dir',
type=str)
parser.add_argument('--ref_model',
type=str)
args = parser.parse_args()
main()

View File

@@ -51,23 +51,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
done
fi
# inference
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "Run inference. Please make sure utt in tts_text is in prompt_data"
for mode in sft zero_shot; do
python cosyvoice/bin/inference.py --mode $mode \
--gpu 0 \
--config conf/cosyvoice.yaml \
--prompt_data data/test-clean/parquet/data.list \
--prompt_utt2data data/test-clean/parquet/utt2data.list \
--tts_text `pwd`/tts_text.json \
--llm_model $pretrained_model_dir/llm.pt \
--flow_model $pretrained_model_dir/flow.pt \
--hifigan_model $pretrained_model_dir/hift.pt \
--result_dir `pwd`/exp/cosyvoice/test-clean/$mode
done
fi
# train llm
export CUDA_VISIBLE_DEVICES="0,1,2,3"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')

View File

@@ -5,74 +5,51 @@ __set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
# fixed params
sample_rate: 22050
text_encoder_input_size: 512
llm_input_size: 1024
llm_output_size: 1024
sample_rate: 24000
llm_input_size: 896
llm_output_size: 896
spk_embed_dim: 192
qwen_pretrain_path: ''
token_frame_rate: 25
token_mel_ratio: 2
# stream related params
chunk_size: 25 # streaming inference chunk size, in token
num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
# model params
# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from one single yaml.
# for system/third_party class/function, we do not require this.
llm: !new:cosyvoice.llm.llm.TransformerLM
text_encoder_input_size: !ref <text_encoder_input_size>
llm: !new:cosyvoice.llm.llm.Qwen2LM
llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size>
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
speech_token_size: 4096
speech_token_size: 6561
length_normalized_loss: True
lsm_weight: 0
spk_embed_dim: !ref <spk_embed_dim>
text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
input_size: !ref <text_encoder_input_size>
output_size: 1024
attention_heads: 8
linear_units: 2048
num_blocks: 3
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
normalize_before: True
input_layer: 'linear'
pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn'
use_cnn_module: False
macaron_style: False
use_dynamic_chunk: False
use_dynamic_left_chunk: False
static_chunk_size: 1
llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
input_size: !ref <llm_input_size>
output_size: !ref <llm_output_size>
attention_heads: 8
linear_units: 2048
num_blocks: 7
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: 'linear_legacy'
pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn'
static_chunk_size: 1
mix_ratio: [5, 15]
llm: !new:cosyvoice.llm.llm.Qwen2Encoder
pretrain_path: !ref <qwen_pretrain_path>
sampling: !name:cosyvoice.utils.common.ras_sampling
top_p: 0.8
top_k: 25
win_size: 10
tau_r: 0.1
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
input_size: 512
output_size: 80
spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel'
vocab_size: 4096
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
vocab_size: 6561
input_frame_rate: !ref <token_frame_rate>
only_mask_loss: True
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
token_mel_ratio: !ref <token_mel_ratio>
pre_lookahead_len: 3
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
output_size: 512
attention_heads: 4
linear_units: 1024
num_blocks: 3
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
@@ -83,10 +60,8 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
input_size: 512
use_cnn_module: False
macaron_style: False
length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
channels: 80
sampling_ratios: [1, 1, 1, 1]
decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
static_chunk_size: !ref <chunk_size>
decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
in_channels: 240
n_spks: 1
spk_emb_dim: 80
@@ -98,16 +73,18 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
training_cfg_rate: 0.2
inference_cfg_rate: 0.7
reg_loss_type: 'l1'
estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
in_channels: 320
out_channels: 80
channels: [256, 256]
channels: [256]
dropout: 0.0
attention_head_dim: 64
n_blocks: 4
num_mid_blocks: 8
num_mid_blocks: 12
num_heads: 8
act_fn: 'gelu'
static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
num_decoding_left_chunks: !ref <num_decoding_left_chunks>
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
in_channels: 80
@@ -117,15 +94,15 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
nsf_alpha: 0.1
nsf_sigma: 0.003
nsf_voiced_threshold: 10
upsample_rates: [8, 8]
upsample_kernel_sizes: [16, 16]
upsample_rates: [8, 5, 3]
upsample_kernel_sizes: [16, 11, 7]
istft_params:
n_fft: 16
hop_len: 4
resblock_kernel_sizes: [3, 7, 11]
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
source_resblock_kernel_sizes: [7, 11]
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
source_resblock_kernel_sizes: [7, 7, 11]
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
lrelu_slope: 0.1
audio_limit: 0.99
f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
@@ -133,35 +110,58 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
in_channels: 80
cond_channels: 512
# gan related module
mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
n_fft: 1920
num_mels: 80
sampling_rate: !ref <sample_rate>
hop_size: 480
win_size: 1920
fmin: 0
fmax: null
center: False
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
generator: !ref <hift>
discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
mel_spec_transform: [
!ref <mel_spec_transform1>
]
# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
multilingual: True
num_languages: 100
language: 'en'
task: 'transcribe'
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
token_path: !ref <qwen_pretrain_path>
skip_special_tokens: True
allowed_special: 'all'
tokenize: !name:cosyvoice.dataset.processor.tokenize
get_tokenizer: !ref <get_tokenizer>
allowed_special: !ref <allowed_special>
filter: !name:cosyvoice.dataset.processor.filter
max_length: 40960
min_length: 0
min_length: 100
token_max_length: 200
token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
resample_rate: !ref <sample_rate>
truncate: !name:cosyvoice.dataset.processor.truncate
truncate_length: 24480 # must be a multiple of hop_size
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
n_fft: 1024
n_fft: 1920
num_mels: 80
sampling_rate: !ref <sample_rate>
hop_size: 256
win_size: 1024
hop_size: 480
win_size: 1920
fmin: 0
fmax: 8000
center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
feat_extractor: !ref <feat_extractor>
token_mel_ratio: 2
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
sample_rate: !ref <sample_rate>
hop_size: 480
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
@@ -170,10 +170,11 @@ sort: !name:cosyvoice.dataset.processor.sort
sort_size: 500 # sort_size should be less than shuffle_size
batch: !name:cosyvoice.dataset.processor.batch
batch_type: 'dynamic'
max_frames_in_batch: 12000
max_frames_in_batch: 2000
padding: !name:cosyvoice.dataset.processor.padding
use_spk_embedding: False # change to True during sft
# dataset processor pipeline
data_pipeline: [
!ref <parquet_opener>,
@@ -187,17 +188,47 @@ data_pipeline: [
!ref <batch>,
!ref <padding>,
]
data_pipeline_gan: [
!ref <parquet_opener>,
!ref <tokenize>,
!ref <filter>,
!ref <resample>,
!ref <truncate>,
!ref <compute_fbank>,
!ref <compute_f0>,
!ref <parse_embedding>,
!ref <shuffle>,
!ref <sort>,
!ref <batch>,
!ref <padding>,
]
# train conf
# llm flow train conf
train_conf:
optim: adam
optim_conf:
lr: 0.002 # change to 0.001 if you want to train flow from scratch
scheduler: warmuplr
lr: 1e-5 # change to 1e-5 during sft
scheduler: constantlr # change to constantlr during sft
scheduler_conf:
warmup_steps: 25000
warmup_steps: 2500
max_epoch: 200
grad_clip: 5
accum_grad: 2
log_interval: 100
save_per_step: -1
# gan train conf
train_conf_gan:
optim: adam
optim_conf:
lr: 0.0002 # use small lr for gan training
scheduler: constantlr
optim_d: adam
optim_conf_d:
lr: 0.0002 # use small lr for gan training
scheduler_d: constantlr
max_epoch: 200
grad_clip: 5
accum_grad: 1 # in gan training, accum_grad must be 1
log_interval: 100
save_per_step: -1

View File

@@ -0,0 +1 @@
../cosyvoice/local

View File

@@ -0,0 +1 @@
../cosyvoice/path.sh

View File

@@ -0,0 +1,111 @@
#!/bin/bash
# Copyright 2024 Alibaba Inc. All Rights Reserved.
. ./path.sh || exit 1;
stage=-1
stop_stage=3
data_url=www.openslr.org/resources/60
data_dir=/mnt/lyuxiang.lx/data/tts/openslr/libritts
pretrained_model_dir=../../../pretrained_models/CosyVoice2-0.5B
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Data Download"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
local/download_and_untar.sh ${data_dir} ${data_url} ${part}
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Data preparation, prepare wav.scp/text/utt2spk/spk2utt"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mkdir -p data/$x
python local/prepare_data.py --src_dir $data_dir/LibriTTS/$x --des_dir data/$x
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
tools/extract_embedding.py --dir data/$x \
--onnx_path $pretrained_model_dir/campplus.onnx
done
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
tools/extract_speech_token.py --dir data/$x \
--onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mkdir -p data/$x/parquet
tools/make_parquet_list.py --num_utts_per_parquet 1000 \
--num_processes 10 \
--src_dir data/$x \
--des_dir data/$x/parquet
done
fi
# train llm
export CUDA_VISIBLE_DEVICES="0,1,2,3"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
job_id=1986
dist_backend="nccl"
num_workers=2
prefetch=100
train_engine=torch_ddp
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
if [ $train_engine == 'deepspeed' ]; then
echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
fi
cat data/{train-clean-100,train-clean-360,train-other-500}/parquet/data.list > data/train.data.list
cat data/{dev-clean,dev-other}/parquet/data.list > data/dev.data.list
# NOTE will update llm/hift training later
for model in llm flow hifigan; do
torchrun --nnodes=1 --nproc_per_node=$num_gpus \
--rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
cosyvoice/bin/train.py \
--train_engine $train_engine \
--config conf/cosyvoice2.yaml \
--train_data data/train.data.list \
--cv_data data/dev.data.list \
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
--model $model \
--checkpoint $pretrained_model_dir/$model.pt \
--model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
--tensorboard_dir `pwd`/tensorboard/cosyvoice2/$model/$train_engine \
--ddp.dist_backend $dist_backend \
--num_workers ${num_workers} \
--prefetch ${prefetch} \
--pin_memory \
--use_amp \
--deepspeed_config ./conf/ds_stage2.json \
--deepspeed.save_states model+optimizer
done
fi
# average model
average_num=5
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
for model in llm flow hifigan; do
decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python cosyvoice/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path `pwd`/exp/cosyvoice/$model/$train_engine \
--num ${average_num} \
--val_best
done
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir"
python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir
python cosyvoice/bin/export_onnx.py --model_dir $pretrained_model_dir
fi

View File

@@ -0,0 +1,123 @@
#!/bin/bash
# Copyright 2024 Alibaba Inc. All Rights Reserved.
. ./path.sh || exit 1;
stage=-1
stop_stage=3
data_url=www.openslr.org/resources/60
data_dir=/mnt/lyuxiang.lx/data/tts/openslr/libritts
pretrained_model_dir=../../../pretrained_models/CosyVoice2-0.5B
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Data Download"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
local/download_and_untar.sh ${data_dir} ${data_url} ${part}
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Data preparation, prepare wav.scp/text/utt2spk/spk2utt"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mkdir -p data/$x
python local/prepare_data.py --src_dir $data_dir/LibriTTS/$x --des_dir data/$x
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Prepare negative samples using CosyVoice2-0.5B, this is also our reference model.
Here we use CosyVoice2-0.5B generated audio as reject sample for simplicity, you can use metric like wer/similarity."
for x in train-clean-100 train-clean-360 train-other-500; do
mkdir -p data/${x}_reject
python local/prepare_reject_sample.py --src_dir data/$x --des_dir data/${x}_reject --ref_model $pretrained_model_dir
done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
tools/extract_embedding.py --dir data/$x \
--onnx_path $pretrained_model_dir/campplus.onnx
done
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
for x in train-clean-100 train-clean-360 train-other-500 train-clean-100_reject train-clean-360_reject dev-clean dev-other test-clean test-other; do
tools/extract_speech_token.py --dir data/$x \
--onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mkdir -p data/$x/parquet
tools/make_parquet_list.py --num_utts_per_parquet 1000 \
--num_processes 10 \
--dpo \
--src_dir data/$x \
--des_dir data/$x/parquet
done
fi
# train llm
export CUDA_VISIBLE_DEVICES="0,1,2,3"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
job_id=1986
dist_backend="nccl"
num_workers=2
prefetch=100
train_engine=torch_ddp
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
if [ $train_engine == 'deepspeed' ]; then
echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
fi
cat data/{train-clean-100,train-clean-360,train-other-500}/parquet/data.list > data/train.data.list
cat data/{dev-clean,dev-other}/parquet/data.list > data/dev.data.list
# NOTE only llm supports dpo
for model in llm; do
torchrun --nnodes=1 --nproc_per_node=$num_gpus \
--rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
cosyvoice/bin/train.py \
--train_engine $train_engine \
--config conf/cosyvoice2.yaml \
--train_data data/train.data.list \
--cv_data data/dev.data.list \
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
--model $model \
--checkpoint $pretrained_model_dir/$model.pt \
--ref_model $pretrained_model_dir/llm.pt \
--model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
--tensorboard_dir `pwd`/tensorboard/cosyvoice2/$model/$train_engine \
--ddp.dist_backend $dist_backend \
--num_workers ${num_workers} \
--prefetch ${prefetch} \
--pin_memory \
--use_amp \
--dpo \
--deepspeed_config ./conf/ds_stage2.json \
--deepspeed.save_states model+optimizer
done
fi
# average model
average_num=5
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
for model in llm; do
decode_checkpoint=`pwd`/exp/cosyvoice2/$model/$train_engine/${model}.pt
echo "do model averaging; the final checkpoint is $decode_checkpoint"
python cosyvoice/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path `pwd`/exp/cosyvoice2/$model/$train_engine \
--num ${average_num} \
--val_best
done
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir"
python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir
python cosyvoice/bin/export_onnx.py --model_dir $pretrained_model_dir
fi
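
In the recipe above, --dpo together with --ref_model turns llm training into preference optimization: the ground-truth speech tokens act as the chosen sequence and the tokens prepared in data/${x}_reject as the reject sequence, scored by both the trained policy and the frozen reference llm. A minimal sketch of the standard DPO objective on summed sequence log-probabilities; beta and the toy numbers are placeholders, and CosyVoice's actual loss code may differ in detail:

import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logp, policy_reject_logp, ref_chosen_logp, ref_reject_logp, beta=0.1):
    # Standard DPO objective on summed log-probabilities of chosen/reject token sequences.
    chosen_reward = beta * (policy_chosen_logp - ref_chosen_logp)
    reject_reward = beta * (policy_reject_logp - ref_reject_logp)
    return -F.logsigmoid(chosen_reward - reject_reward).mean()

# Toy per-utterance log-probabilities (placeholders, not real model outputs).
policy_chosen = torch.tensor([-120.0, -95.0])
policy_reject = torch.tensor([-140.0, -110.0])
ref_chosen = torch.tensor([-125.0, -100.0])
ref_reject = torch.tensor([-138.0, -108.0])
print(dpo_loss(policy_chosen, policy_reject, ref_chosen, ref_reject))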

View File

@@ -0,0 +1 @@
../cosyvoice/tts_text.json

View File

@@ -0,0 +1 @@
../../libritts/cosyvoice/conf

View File

@@ -1,3 +0,0 @@
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH

View File

@@ -0,0 +1 @@
../../libritts/cosyvoice/path.sh

View File

@@ -51,23 +51,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
done
fi
# inference
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "Run inference. Please make sure utt in tts_text is in prompt_data"
for mode in sft zero_shot; do
python cosyvoice/bin/inference.py --mode $mode \
--gpu 0 \
--config conf/cosyvoice.yaml \
--prompt_data data/test/parquet/data.list \
--prompt_utt2data data/test/parquet/utt2data.list \
--tts_text `pwd`/tts_text.json \
--llm_model $pretrained_model_dir/llm.pt \
--flow_model $pretrained_model_dir/flow.pt \
--hifigan_model $pretrained_model_dir/hift.pt \
--result_dir `pwd`/exp/cosyvoice/test/$mode
done
fi
# train llm
export CUDA_VISIBLE_DEVICES="0,1,2,3"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
@@ -83,7 +66,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
fi
cp data/train/parquet/data.list data/train.data.list
cp data/dev/parquet/data.list data/dev.data.list
for model in llm flow; do
for model in llm flow hifigan; do
torchrun --nnodes=1 --nproc_per_node=$num_gpus \
--rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
cosyvoice/bin/train.py \
@@ -99,11 +82,26 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--num_workers ${num_workers} \
--prefetch ${prefetch} \
--pin_memory \
--use_amp \
--deepspeed_config ./conf/ds_stage2.json \
--deepspeed.save_states model+optimizer
done
fi
# average model
average_num=5
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
for model in llm flow hifigan; do
decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python cosyvoice/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path `pwd`/exp/cosyvoice/$model/$train_engine \
--num ${average_num} \
--val_best
done
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir"
python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir
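
The offline inference stage removed above can also be reproduced from Python with the CosyVoice API; the zero-shot call signature matches the one used in vllm_example.py later in this commit. A rough sketch with placeholder model dir and texts, assuming the output dicts carry a 'tts_speech' waveform tensor and the model exposes sample_rate (both are assumptions based on common CosyVoice usage, not guaranteed by this diff):

import sys
sys.path.append('third_party/Matcha-TTS')
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

# Placeholder model dir and texts; prompt_text should transcribe the prompt audio.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
tts_text = 'The quick brown fox jumps over the lazy dog.'
prompt_text = 'A transcript of the prompt audio goes here.'
for i, out in enumerate(cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=False)):
    # 'tts_speech' and sample_rate are assumed output conventions; check cosyvoice/cli for the exact schema.
    torchaudio.save('zero_shot_{}.wav'.format(i), out['tts_speech'], cosyvoice.sample_rate)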

View File

@@ -1,8 +1,10 @@
--extra-index-url https://download.pytorch.org/whl/cu121
--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684
conformer==0.3.2
deepspeed==0.14.2; sys_platform == 'linux'
deepspeed==0.15.1; sys_platform == 'linux'
diffusers==0.29.0
fastapi==0.115.6
fastapi-cli==0.0.4
gdown==5.1.0
gradio==5.4.0
grpcio==1.57.0
@@ -13,14 +15,15 @@ inflect==7.3.1
librosa==0.10.2
lightning==2.2.4
matplotlib==3.7.5
modelscope==1.15.0
modelscope==1.20.0
networkx==3.1
omegaconf==2.3.0
onnx==1.16.0
onnxruntime-gpu==1.18.0; sys_platform == 'linux'
onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'windows'
onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32'
openai-whisper==20231117
protobuf==4.25
pyarrow==18.1.0
pydantic==2.7.0
pyworld==0.3.4
rich==13.7.1
@@ -29,11 +32,9 @@ tensorboard==2.14.0
tensorrt-cu12==10.0.1; sys_platform == 'linux'
tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
torch==2.4.0
torchaudio==2.4.0
torch==2.3.1
torchaudio==2.3.1
transformers==4.40.1
uvicorn==0.30.0
wetext==0.0.4
wget==3.2
fastapi==0.115.6
fastapi-cli==0.0.4
WeTextProcessing==1.0.3
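
The marker fix above matters because pip evaluates sys_platform against sys.platform, which is 'win32' on Windows (never 'windows'), so the old marker silently skipped onnxruntime there. A quick way to check a marker expression, sketched with the packaging library (a pip/setuptools dependency, also installable on its own):

from packaging.markers import Marker

for expr in ("sys_platform == 'windows'", "sys_platform == 'win32'"):
    # Evaluate against a simulated Windows environment instead of the current interpreter.
    print(expr, '->', Marker(expr).evaluate({'sys_platform': 'win32'}))
    # prints False for 'windows' and True for 'win32'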

View File

@@ -5,7 +5,7 @@ WORKDIR /opt/CosyVoice
RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
RUN apt-get update -y
RUN apt-get -y install git unzip git-lfs
RUN apt-get -y install git unzip git-lfs g++
RUN git lfs install
RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
# here we use python==3.10 because we cannot find an image that has both python3.8 and torch2.0.1-cu118 installed

View File

@@ -72,6 +72,7 @@ async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instr
model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_instruct2")
@app.post("/inference_instruct2")
async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(), prompt_wav: UploadFile = File()):
@@ -80,7 +81,6 @@ async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(
return StreamingResponse(generate_data(model_output))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--port',
@@ -98,4 +98,4 @@ if __name__ == '__main__':
cosyvoice = CosyVoice2(args.model_dir)
except Exception:
raise TypeError('no valid model_type!')
uvicorn.run(app, host="0.0.0.0", port=args.port)
uvicorn.run(app, host="0.0.0.0", port=args.port)
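
With the POST route added above, a client can stream audio from /inference_instruct2 by sending the two form fields plus the prompt wav as multipart data. A minimal sketch with requests; the host/port is an assumed default, the field names come from the handler signature above, and the streamed chunks are assumed to be raw audio bytes as produced by generate_data:

import requests

url = 'http://127.0.0.1:50000/inference_instruct2'  # assumed host/port of the fastapi server
payload = {
    'tts_text': 'Hello, this is a short streaming test.',
    'instruct_text': 'Speak slowly and calmly.',
}
with open('./asset/zero_shot_prompt.wav', 'rb') as wav:
    files = {'prompt_wav': ('prompt.wav', wav, 'audio/wav')}
    with requests.post(url, data=payload, files=files, stream=True) as resp:
        resp.raise_for_status()
        with open('instruct2_output.pcm', 'wb') as out:
            # Chunks are whatever generate_data yields; assumed here to be raw audio bytes.
            for chunk in resp.iter_content(chunk_size=16000):
                out.write(chunk)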

View File

@@ -34,10 +34,10 @@ logging.basicConfig(level=logging.DEBUG,
class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
def __init__(self, args):
try:
self.cosyvoice = CosyVoice(args.model_dir)
self.cosyvoice = CosyVoice(args.model_dir, trt_concurrent=args.max_conc)
except Exception:
try:
self.cosyvoice = CosyVoice2(args.model_dir)
self.cosyvoice = CosyVoice2(args.model_dir, trt_concurrent=args.max_conc)
except Exception:
raise TypeError('no valid model_type!')
logging.info('grpc service initialized')

View File

@@ -34,7 +34,9 @@ def job(utt_list, parquet_file, utt2parquet_file, spk2parquet_file):
spk_list = [utt2spk[utt] for utt in utt_list]
uttembedding_list = [utt2embedding[utt] for utt in utt_list]
spkembedding_list = [spk2embedding[utt2spk[utt]] for utt in utt_list]
speech_token_list = [utt2speech_token[utt] for utt in utt_list]
speech_token_list = [utt2speech_token.get(utt, []) for utt in utt_list]
if args.dpo:
reject_speech_token_list = [utt2reject_speech_token[utt] for utt in utt_list]
# Save to parquet, utt2parquet_file, spk2parquet_file
df = pd.DataFrame()
@@ -46,6 +48,8 @@ def job(utt_list, parquet_file, utt2parquet_file, spk2parquet_file):
df['utt_embedding'] = uttembedding_list
df['spk_embedding'] = spkembedding_list
df['speech_token'] = speech_token_list
if args.dpo:
df['reject_speech_token'] = reject_speech_token_list
df.to_parquet(parquet_file)
with open(utt2parquet_file, 'w') as f:
json.dump({k: parquet_file for k in utt_list}, f, ensure_ascii=False, indent=2)
@@ -68,6 +72,10 @@ if __name__ == "__main__":
type=str)
parser.add_argument('--des_dir',
type=str)
parser.add_argument('--dpo',
action='store_true',
default=False,
help='Use Direct Preference Optimization')
args = parser.parse_args()
utt2wav, utt2text, utt2spk = {}, {}, {}
@@ -86,6 +94,8 @@ if __name__ == "__main__":
utt2embedding = torch.load('{}/utt2embedding.pt'.format(args.src_dir))
spk2embedding = torch.load('{}/spk2embedding.pt'.format(args.src_dir))
utt2speech_token = torch.load('{}/utt2speech_token.pt'.format(args.src_dir))
if args.dpo:
utt2reject_speech_token = torch.load('{}_reject/utt2speech_token.pt'.format(args.src_dir))
utts = list(utt2wav.keys())
# Using process pool to speedup
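
With the change above, every parquet shard written under --dpo carries a reject_speech_token column next to speech_token, giving the loader a preference pair per utterance. A quick sanity check with pandas, assuming each line of data.list is one parquet shard path (the same lists are later concatenated and passed as --train_data):

import pandas as pd

# Take any shard path from the parquet data.list (assumed one path per line).
with open('data/train-clean-100/parquet/data.list') as f:
    shard = f.readline().strip()
df = pd.read_parquet(shard)
print(df.columns.tolist())  # should include speech_token and, with --dpo, reject_speech_token
row = df.iloc[0]
print(len(row['speech_token']), len(row['reject_speech_token']))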

vllm_example.py Normal file
View File

@@ -0,0 +1,23 @@
import sys
sys.path.append('third_party/Matcha-TTS')
from vllm import ModelRegistry
from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM
ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed
from tqdm import tqdm
def main():
    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True)
    prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
    for i in tqdm(range(100)):
        set_all_random_seed(i)
        for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
            continue

if __name__ == '__main__':
    main()
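
Because the loop above mostly serves as a repeated-synthesis benchmark, a small variation that reports wall-clock seconds per utterance makes the effect of load_vllm/load_trt easier to see. A sketch reusing the same setup; the iteration count and the printed metric are illustrative only:

import sys
import time
sys.path.append('third_party/Matcha-TTS')
from vllm import ModelRegistry
from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM
ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

def benchmark(n=10):
    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True)
    prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
    tts_text = '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。'
    prompt_text = '希望你以后能够做的比我还好呦。'
    times = []
    for _ in range(n):
        start = time.time()
        for _ in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=False):
            pass
        times.append(time.time() - start)
    print('mean seconds per utterance: {:.2f}'.format(sum(times) / len(times)))

if __name__ == '__main__':
    benchmark()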