diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 5dc2204..884011d 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -52,5 +52,5 @@ jobs:
set -eux
pip install flake8==3.8.2 flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0
flake8 --version
- flake8 --max-line-length 150 --ignore B006,B008,B905,C408,E402,E731,E741,W503,W504 --exclude ./third_party/,./runtime/python/grpc/cosyvoice_pb2*py
+ flake8 --max-line-length 180 --ignore B006,B008,B905,C408,E402,E731,E741,W503,W504 --exclude ./third_party/,./runtime/python/grpc/cosyvoice_pb2*py
if [ $? != 0 ]; then exit 1; fi
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 12b53ef..4a9f23f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,4 +48,5 @@ compile_commands.json
*.pt
pretrained_models/*
*_pb2_grpc.py
-*_pb2.py
\ No newline at end of file
+*_pb2.py
+*.tar
\ No newline at end of file
diff --git a/README.md b/README.md
index 0e01098..2c3b59f 100644
--- a/README.md
+++ b/README.md
@@ -1,34 +1,51 @@
-# CosyVoice
-## 👉🏻 [CosyVoice Demos](https://fun-audio-llm.github.io/) 👈🏻
-[[CosyVoice Paper](https://fun-audio-llm.github.io/pdf/CosyVoice_v1.pdf)][[CosyVoice Studio](https://www.modelscope.cn/studios/iic/CosyVoice-300M)][[CosyVoice Code](https://github.com/FunAudioLLM/CosyVoice)]
+[](https://github.com/Akshay090/svg-banners)
-For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice) and [SenseVoice space](https://www.modelscope.cn/studios/iic/SenseVoice).
+## 👉🏻 CosyVoice 👈🏻
+**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_2.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B)
+
+**CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M)
+
+## Highlight🔥
+
+**CosyVoice 2.0** has been released! Compared to version 1.0, the new version offers more accurate, more stable, faster, and better speech generation capabilities.
+### Multilingual
+- **Supported Languages**: Chinese, English, Japanese, Korean, Chinese dialects (Cantonese, Sichuanese, Shanghainese, Tianjinese, Wuhanese, etc.)
+- **Crosslingual & Mixlingual**: Supports zero-shot voice cloning for cross-lingual and code-switching scenarios.
+### Ultra-Low Latency
+- **Bidirectional Streaming Support**: CosyVoice 2.0 integrates offline and streaming modeling technologies.
+- **Rapid First Packet Synthesis**: Achieves latency as low as 150ms while maintaining high-quality audio output.
+### High Accuracy
+- **Improved Pronunciation**: Reduces pronunciation errors by 30% to 50% compared to CosyVoice 1.0.
+- **Benchmark Achievements**: Attains the lowest character error rate on the hard test set of the Seed-TTS evaluation set.
+### Strong Stability
+- **Consistency in Timbre**: Ensures reliable voice consistency for zero-shot and cross-language speech synthesis.
+- **Cross-language Synthesis**: Marked improvements compared to version 1.0.
+### Natural Experience
+- **Enhanced Prosody and Sound Quality**: Improved alignment of synthesized audio, raising MOS evaluation scores from 5.4 to 5.53.
+- **Emotional and Dialectal Flexibility**: Now supports more granular emotional controls and accent adjustments.
## Roadmap
+- [x] 2024/12
+
+ - [x] 25hz cosyvoice 2.0 released
+
+- [x] 2024/09
+
+ - [x] 25hz cosyvoice base model
+ - [x] 25hz cosyvoice voice conversion model
+
+- [x] 2024/08
+
+ - [x] Repetition Aware Sampling(RAS) inference for llm stability
+ - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization
+
- [x] 2024/07
- [x] Flow matching training support
- [x] WeTextProcessing support when ttsfrd is not avaliable
- [x] Fastapi server and client
-- [x] 2024/08
-
- - [x] Repetition Aware Sampling(RAS) inference for llm stability
- - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization
-
-- [x] 2024/09
-
- - [x] 25hz cosyvoice base model
- - [x] 25hz cosyvoice voice conversion model
-
-- [ ] TBD
-
- - [ ] 25hz llama based llm model which supports lora finetune
- - [ ] Support more instruction mode
- - [ ] Music generation
- - [ ] CosyVoice-500M trained with more multi-lingual data
- - [ ] More...
## Install
@@ -68,6 +85,7 @@ If you are expert in this field, and you are only interested in training your ow
``` python
# SDK模型下载
from modelscope import snapshot_download
+snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
@@ -78,6 +96,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice
``` sh
# git模型下载,请确保已安装git lfs
mkdir -p pretrained_models
+git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B
git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
@@ -92,11 +111,13 @@ Notice that this step is not necessary. If you do not install `ttsfrd` package,
``` sh
cd pretrained_models/CosyVoice-ttsfrd/
unzip resource.zip -d .
-pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
+pip install ttsfrd_dependency-0.1-py3-none-any.whl
+pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
```
**Basic Usage**
+We strongly recommend using `CosyVoice2-0.5B` for better performance.
For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model.
For sft inference, please use `CosyVoice-300M-SFT` model.
For instruct inference, please use `CosyVoice-300M-Instruct` model.
@@ -107,36 +128,48 @@ export PYTHONPATH=third_party/Matcha-TTS
```
``` python
-from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio
+# cosyvoice2
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
+
+# zero_shot usage
+prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
+for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
+ torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+# instruct usage
+for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
+ torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+
+# cosyvoice
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
# sft usage
print(cosyvoice.list_avaliable_spks())
# change stream=True for chunk stream inference
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
- torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
+ torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
- torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], 22050)
+ torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
# cross_lingual usage
prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
- torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
+ torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
# vc usage
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
- torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
+ torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
# instruct usage, support [laughter][breath]
for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气与智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
- torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
+ torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```
**Start web demo**
@@ -190,4 +223,4 @@ You can also scan the QR code to join our official Dingding chat group.
5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
## Disclaimer
-The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
+The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
\ No newline at end of file
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index bcade73..c7e3b4e 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -18,7 +18,7 @@ from hyperpyyaml import load_hyperpyyaml
from modelscope import snapshot_download
import torch
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
-from cosyvoice.cli.model import CosyVoiceModel
+from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
from cosyvoice.utils.file_utils import logging
@@ -38,6 +38,7 @@ class CosyVoice:
'{}/spk2info.pt'.format(model_dir),
instruct,
configs['allowed_special'])
+ self.sample_rate = configs['sample_rate']
if torch.cuda.is_available() is False and (fp16 is True or load_jit is True):
load_jit = False
fp16 = False
@@ -64,7 +65,7 @@ class CosyVoice:
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
- speech_len = model_output['tts_speech'].shape[1] / 22050
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
@@ -74,11 +75,11 @@ class CosyVoice:
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
if len(i) < 0.5 * len(prompt_text):
logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
- model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
+ model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, self.sample_rate)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
- speech_len = model_output['tts_speech'].shape[1] / 22050
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
@@ -87,16 +88,17 @@ class CosyVoice:
if self.frontend.instruct is True:
raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
- model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
+ model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
- speech_len = model_output['tts_speech'].shape[1] / 22050
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
+ assert isinstance(self.model, CosyVoiceModel)
if self.frontend.instruct is False:
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
instruct_text = self.frontend.text_normalize(instruct_text, split=False)
@@ -105,16 +107,64 @@ class CosyVoice:
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
- speech_len = model_output['tts_speech'].shape[1] / 22050
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
+ yield model_output
+ start_time = time.time()
+
+ def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
+ assert isinstance(self.model, CosyVoice2Model)
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
+ model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
+ start_time = time.time()
+ logging.info('synthesis text {}'.format(i))
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
- model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k)
+ model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
start_time = time.time()
for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
- speech_len = model_output['tts_speech'].shape[1] / 22050
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
+
+
+class CosyVoice2(CosyVoice):
+
+ def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
+ instruct = True if '-Instruct' in model_dir else False
+ self.model_dir = model_dir
+ if not os.path.exists(model_dir):
+ model_dir = snapshot_download(model_dir)
+ with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
+ configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'Qwen2-0.5B-CosyVoice-BlankEN')})
+ self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
+ configs['feat_extractor'],
+ '{}/campplus.onnx'.format(model_dir),
+ '{}/speech_tokenizer_v2.onnx'.format(model_dir),
+ '{}/spk2info.pt'.format(model_dir),
+ instruct,
+ configs['allowed_special'])
+ self.sample_rate = configs['sample_rate']
+ if torch.cuda.is_available() is False and load_jit is True:
+ load_jit = False
+ logging.warning('cpu do not support jit, force set to False')
+ self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'])
+ self.model.load('{}/llm.pt'.format(model_dir),
+ '{}/flow.pt'.format(model_dir),
+ '{}/hift.pt'.format(model_dir))
+ if load_jit:
+ self.model.load_jit('{}/flow.encoder.fp32.zip'.format(model_dir))
+ if load_trt is True and load_onnx is True:
+ load_onnx = False
+ logging.warning('can not set both load_trt and load_onnx to True, force set load_onnx to False')
+ if load_onnx:
+ self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
+ if load_trt:
+ self.model.load_trt('{}/flow.decoder.estimator.fp16.Volta.plan'.format(model_dir))
+ del configs
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index ac138b1..9885a0f 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
+import json
import onnxruntime
import torch
import numpy as np
@@ -66,9 +67,7 @@ class CosyVoiceFrontEnd:
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \
'failed to initialize ttsfrd resource'
- self.frd.set_lang_type('pinyin')
- self.frd.enable_pinyin_mix(True)
- self.frd.set_breakmodel_index(1)
+ self.frd.set_lang_type('pinyinvg')
else:
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
self.en_tn_model = EnNormalizer()
@@ -112,26 +111,28 @@ class CosyVoiceFrontEnd:
text = text.strip()
if contains_chinese(text):
if self.use_ttsfrd:
- text = self.frd.get_frd_extra_info(text, 'input')
+ texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
+ text = ''.join(texts)
else:
text = self.zh_tn_model.normalize(text)
- text = text.replace("\n", "")
- text = replace_blank(text)
- text = replace_corner_mark(text)
- text = text.replace(".", "。")
- text = text.replace(" - ", ",")
- text = remove_bracket(text)
- text = re.sub(r'[,,、]+$', '。', text)
- texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
- token_min_n=60, merge_len=20, comma_split=False))
+ text = text.replace("\n", "")
+ text = replace_blank(text)
+ text = replace_corner_mark(text)
+ text = text.replace(".", "。")
+ text = text.replace(" - ", ",")
+ text = remove_bracket(text)
+ text = re.sub(r'[,,、]+$', '。', text)
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
+ token_min_n=60, merge_len=20, comma_split=False))
else:
if self.use_ttsfrd:
- text = self.frd.get_frd_extra_info(text, 'input')
+ texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
+ text = ''.join(texts)
else:
text = self.en_tn_model.normalize(text)
- text = spell_out_number(text, self.inflect_parser)
- texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
- token_min_n=60, merge_len=20, comma_split=False))
+ text = spell_out_number(text, self.inflect_parser)
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
+ token_min_n=60, merge_len=20, comma_split=False))
if split is False:
return text
return texts
@@ -142,12 +143,17 @@ class CosyVoiceFrontEnd:
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
return model_input
- def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
+ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate):
tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
- prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
- speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
+ if resample_rate == 24000:
+ # cosyvoice2: trim both so that speech_feat length is exactly 2x speech_token length
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
embedding = self._extract_spk_embedding(prompt_speech_16k)
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
@@ -157,8 +163,8 @@ class CosyVoiceFrontEnd:
'llm_embedding': embedding, 'flow_embedding': embedding}
return model_input
- def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
- model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k)
+ def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate):
+ model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate)
# in cross lingual mode, we remove prompt in llm
del model_input['prompt_text']
del model_input['prompt_text_len']
@@ -175,10 +181,29 @@ class CosyVoiceFrontEnd:
model_input['prompt_text_len'] = instruct_text_token_len
return model_input
- def frontend_vc(self, source_speech_16k, prompt_speech_16k):
+ def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate):
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(instruct_text + '<|endofprompt|>')
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
+ if resample_rate == 24000:
+ # cosyvoice2: trim both so that speech_feat length is exactly 2x speech_token length
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+ 'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
+ return model_input
+
+ def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
- prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
- prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
+ prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
embedding = self._extract_spk_embedding(prompt_speech_16k)
source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 850e5db..5314e8b 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -58,15 +58,15 @@ class CosyVoiceModel:
self.hift_cache_dict = {}
def load(self, llm_model, flow_model, hift_model):
- self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=False)
+ self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
self.llm.to(self.device).eval()
if self.fp16 is True:
self.llm.half()
- self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=False)
+ self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
self.flow.to(self.device).eval()
# in case hift_model is a hifigan model
hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
- self.hift.load_state_dict(hift_state_dict, strict=False)
+ self.hift.load_state_dict(hift_state_dict, strict=True)
self.hift.to(self.device).eval()
def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
@@ -260,3 +260,167 @@ class CosyVoiceModel:
self.llm_end_dict.pop(this_uuid)
self.mel_overlap_dict.pop(this_uuid)
self.hift_cache_dict.pop(this_uuid)
+
+
+class CosyVoice2Model:
+
+ def __init__(self,
+ llm: torch.nn.Module,
+ flow: torch.nn.Module,
+ hift: torch.nn.Module):
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ self.llm = llm
+ self.flow = flow
+ self.hift = hift
+ self.token_hop_len = 2 * self.flow.input_frame_rate
+ # here we fix flow encoder/decoder decoding_chunk_size, in the future we will send it as arguments, or use cache
+ self.flow.encoder.static_chunk_size = 2 * self.flow.input_frame_rate
+ self.flow.decoder.estimator.static_chunk_size = 2 * self.flow.input_frame_rate * self.flow.token_mel_ratio
+ # hift cache
+ self.mel_cache_len = 8
+ self.source_cache_len = int(self.mel_cache_len * 480)
+ # speech fade in out
+ self.speech_window = np.hamming(2 * self.source_cache_len)
+ # rtf and decoding related
+ self.stream_scale_factor = 1
+ self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+ self.lock = threading.Lock()
+ # dict used to store session related variable
+ self.tts_speech_token_dict = {}
+ self.llm_end_dict = {}
+ self.hift_cache_dict = {}
+
+ def load(self, llm_model, flow_model, hift_model):
+ self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
+ self.llm.to(self.device).eval()
+ self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
+ self.flow.to(self.device).eval()
+ self.flow.decoder.fp16 = False
+ # in case hift_model is a hifigan model
+ hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
+ self.hift.load_state_dict(hift_state_dict, strict=True)
+ self.hift.to(self.device).eval()
+
+ def load_jit(self, flow_encoder_model):
+ flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
+ self.flow.encoder = flow_encoder
+
+ def load_onnx(self, flow_decoder_estimator_model):
+ import onnxruntime
+ option = onnxruntime.SessionOptions()
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+ option.intra_op_num_threads = 1
+ providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+ del self.flow.decoder.estimator
+ self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers)
+
+ def load_trt(self, flow_decoder_estimator_model):
+ del self.flow.decoder.estimator
+ import tensorrt as trt
+ with open(flow_decoder_estimator_model, 'rb') as f:
+ self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
+ self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
+ self.flow.decoder.fp16 = True
+
+ def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
+ with self.llm_context:
+ for i in self.llm.inference(text=text.to(self.device),
+ text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
+ prompt_text=prompt_text.to(self.device),
+ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+ prompt_speech_token=llm_prompt_speech_token.to(self.device),
+ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
+ embedding=llm_embedding.to(self.device)):
+ self.tts_speech_token_dict[uuid].append(i)
+ self.llm_end_dict[uuid] = True
+
+ def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0):
+ tts_mel, _ = self.flow.inference(token=token.to(self.device),
+ token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+ prompt_token=prompt_token.to(self.device),
+ prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+ prompt_feat=prompt_feat.to(self.device),
+ prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+ embedding=embedding.to(self.device),
+ finalize=finalize)
+ tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
+ # append hift cache
+ if self.hift_cache_dict[uuid] is not None:
+ hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
+ tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
+ else:
+ hift_cache_source = torch.zeros(1, 1, 0)
+ # keep overlap mel and hift cache
+ if finalize is False:
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
+ if self.hift_cache_dict[uuid] is not None:
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
+ self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
+ 'source': tts_source[:, :, -self.source_cache_len:],
+ 'speech': tts_speech[:, -self.source_cache_len:]}
+ tts_speech = tts_speech[:, :-self.source_cache_len]
+ else:
+ if speed != 1.0:
+ assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
+ tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
+ if self.hift_cache_dict[uuid] is not None:
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
+ return tts_speech
+
+ def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32),
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
+ prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
+ # this_uuid is used to track variables related to this inference thread
+ this_uuid = str(uuid.uuid1())
+ with self.lock:
+ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
+ self.hift_cache_dict[this_uuid] = None
+ p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+ p.start()
+ if stream is True:
+ token_offset = 0
+ while True:
+ time.sleep(0.1)
+ if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len:
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
+ prompt_token=flow_prompt_speech_token,
+ prompt_feat=prompt_speech_feat,
+ embedding=flow_embedding,
+ uuid=this_uuid,
+ token_offset=token_offset,
+ finalize=False)
+ token_offset += self.token_hop_len
+ yield {'tts_speech': this_tts_speech.cpu()}
+ if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < self.token_hop_len + self.flow.pre_lookahead_len:
+ break
+ p.join()
+ # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
+ prompt_token=flow_prompt_speech_token,
+ prompt_feat=prompt_speech_feat,
+ embedding=flow_embedding,
+ uuid=this_uuid,
+ token_offset=token_offset,
+ finalize=True)
+ yield {'tts_speech': this_tts_speech.cpu()}
+ else:
+ # deal with all tokens
+ p.join()
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
+ prompt_token=flow_prompt_speech_token,
+ prompt_feat=prompt_speech_feat,
+ embedding=flow_embedding,
+ uuid=this_uuid,
+ token_offset=0,
+ finalize=True,
+ speed=speed)
+ yield {'tts_speech': this_tts_speech.cpu()}
+ with self.lock:
+ self.tts_speech_token_dict.pop(this_uuid)
+ self.llm_end_dict.pop(this_uuid)
diff --git a/cosyvoice/flow/decoder.py b/cosyvoice/flow/decoder.py
index b101d72..420a1bf 100644
--- a/cosyvoice/flow/decoder.py
+++ b/cosyvoice/flow/decoder.py
@@ -13,16 +13,83 @@
# limitations under the License.
import torch
import torch.nn as nn
+import torch.nn.functional as F
from einops import pack, rearrange, repeat
+from cosyvoice.utils.common import mask_to_bias
+from cosyvoice.utils.mask import add_optional_chunk_mask
from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
from matcha.models.components.transformer import BasicTransformerBlock
+class Transpose(torch.nn.Module):
+    """Module form of ``torch.transpose`` so a dimension swap can be placed inside ``nn.Sequential``."""
+
+    def __init__(self, dim0: int, dim1: int):
+        super().__init__()
+        self.dim0, self.dim1 = dim0, dim1
+
+    def forward(self, x: torch.Tensor):
+        return x.transpose(self.dim0, self.dim1)
+
+
+class CausalBlock1D(Block1D):
+    """``Block1D`` whose layer stack is replaced by CausalConv1d -> LayerNorm over channels -> Mish."""
+    def __init__(self, dim: int, dim_out: int):
+        super().__init__(dim, dim_out)
+        self.block = torch.nn.Sequential(
+            CausalConv1d(dim, dim_out, 3),
+            Transpose(1, 2),  # LayerNorm normalizes the last dim, so move channels there
+            nn.LayerNorm(dim_out),
+            Transpose(1, 2),  # and back to the layout the conv produced
+            nn.Mish(),
+        )
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor):
+        return self.block(x * mask) * mask
+
+
+class CausalResnetBlock1D(ResnetBlock1D):
+    def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
+        super().__init__(dim, dim_out, time_emb_dim, groups)
+        # replace the two blocks built by the parent with their causal counterparts
+        self.block1, self.block2 = CausalBlock1D(dim, dim_out), CausalBlock1D(dim_out, dim_out)
+
+
+class CausalConv1d(torch.nn.Conv1d):
+    """Stride-1 ``Conv1d`` that zero-pads on the left only, so output[t] depends on input[<=t] and length is kept."""
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = 'zeros',
+        device=None,
+        dtype=None
+    ) -> None:
+        super(CausalConv1d, self).__init__(in_channels, out_channels,
+                                           kernel_size, stride,
+                                           padding=0, dilation=dilation,
+                                           groups=groups, bias=bias,
+                                           padding_mode=padding_mode,
+                                           device=device, dtype=dtype)
+        assert stride == 1
+        # the kernel reaches dilation * (kernel_size - 1) frames back; pad exactly that much on the left
+        self.causal_padding = (dilation * (kernel_size - 1), 0)
+
+    def forward(self, x: torch.Tensor):
+        return super(CausalConv1d, self).forward(F.pad(x, self.causal_padding))
+
+
class ConditionalDecoder(nn.Module):
def __init__(
self,
in_channels,
out_channels,
+ causal=False,
channels=(256, 256),
dropout=0.05,
attention_head_dim=64,
@@ -39,7 +106,7 @@ class ConditionalDecoder(nn.Module):
channels = tuple(channels)
self.in_channels = in_channels
self.out_channels = out_channels
-
+ self.causal = causal
self.time_embeddings = SinusoidalPosEmb(in_channels)
time_embed_dim = channels[0] * 4
self.time_mlp = TimestepEmbedding(
@@ -56,7 +123,8 @@ class ConditionalDecoder(nn.Module):
input_channel = output_channel
output_channel = channels[i]
is_last = i == len(channels) - 1
- resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
transformer_blocks = nn.ModuleList(
[
BasicTransformerBlock(
@@ -70,14 +138,16 @@ class ConditionalDecoder(nn.Module):
]
)
downsample = (
- Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+ Downsample1D(output_channel) if not is_last else
+ CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
)
self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
for _ in range(num_mid_blocks):
input_channel = channels[-1]
out_channels = channels[-1]
- resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
transformer_blocks = nn.ModuleList(
[
@@ -99,7 +169,11 @@ class ConditionalDecoder(nn.Module):
input_channel = channels[i] * 2
output_channel = channels[i + 1]
is_last = i == len(channels) - 2
- resnet = ResnetBlock1D(
+ resnet = CausalResnetBlock1D(
+ dim=input_channel,
+ dim_out=output_channel,
+ time_emb_dim=time_embed_dim,
+ ) if self.causal else ResnetBlock1D(
dim=input_channel,
dim_out=output_channel,
time_emb_dim=time_embed_dim,
@@ -119,10 +193,10 @@ class ConditionalDecoder(nn.Module):
upsample = (
Upsample1D(output_channel, use_conv_transpose=True)
if not is_last
- else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+ else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
)
self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
- self.final_block = Block1D(channels[-1], channels[-1])
+ self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
self.initialize_weights()
@@ -175,7 +249,9 @@ class ConditionalDecoder(nn.Module):
mask_down = masks[-1]
x = resnet(x, mask_down, t)
x = rearrange(x, "b c t -> b t c").contiguous()
- attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
+ # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,
@@ -192,7 +268,9 @@ class ConditionalDecoder(nn.Module):
for resnet, transformer_blocks in self.mid_blocks:
x = resnet(x, mask_mid, t)
x = rearrange(x, "b c t -> b t c").contiguous()
- attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
+ # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,
@@ -207,7 +285,9 @@ class ConditionalDecoder(nn.Module):
x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
x = resnet(x, mask_up, t)
x = rearrange(x, "b c t -> b t c").contiguous()
- attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
+ # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
for transformer_block in transformer_blocks:
x = transformer_block(
hidden_states=x,
diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py
index eea705b..c69ded6 100644
--- a/cosyvoice/flow/flow.py
+++ b/cosyvoice/flow/flow.py
@@ -146,3 +146,86 @@ class MaskedDiffWithXvec(torch.nn.Module):
feat = feat[:, :, mel_len1:]
assert feat.shape[2] == mel_len2
return feat, flow_cache
+
+
+class CausalMaskedDiffWithXvec(torch.nn.Module):
+    """Speech-token to mel model: token encoder plus a decoder conditioned on prompt mel and a speaker embedding."""
+    def __init__(self,
+                 input_size: int = 512,
+                 output_size: int = 80,
+                 spk_embed_dim: int = 192,
+                 output_type: str = "mel",
+                 vocab_size: int = 4096,
+                 input_frame_rate: int = 50,
+                 only_mask_loss: bool = True,
+                 token_mel_ratio: int = 2,
+                 pre_lookahead_len: int = 3,
+                 encoder: torch.nn.Module = None,
+                 decoder: torch.nn.Module = None,
+                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
+                                       'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
+                                                                 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
+                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
+                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
+                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
+                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.decoder_conf = decoder_conf
+        self.mel_feat_conf = mel_feat_conf
+        self.vocab_size = vocab_size
+        self.output_type = output_type
+        self.input_frame_rate = input_frame_rate
+        logging.info(f"input frame rate={self.input_frame_rate}")
+        self.input_embedding = nn.Embedding(vocab_size, input_size)
+        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
+        self.encoder = encoder
+        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
+        self.decoder = decoder
+        self.only_mask_loss = only_mask_loss
+        self.token_mel_ratio = token_mel_ratio
+        self.pre_lookahead_len = pre_lookahead_len
+
+    @torch.inference_mode()
+    def inference(self,
+                  token, token_len,
+                  prompt_token, prompt_token_len,
+                  prompt_feat, prompt_feat_len,
+                  embedding, finalize):
+        """Generate mel frames for ``token`` (batch size 1 only); returns ``(feat, None)`` with the prompt frames removed."""
+        assert token.shape[0] == 1
+        # xvec projection
+        embedding = F.normalize(embedding, dim=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+
+        # concat text and prompt_text
+        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
+        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
+        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+
+        # text encode
+        h, h_lengths = self.encoder(token, token_len)
+        if finalize is False:
+            # non-final chunk: drop the trailing pre_lookahead_len * token_mel_ratio frames.
+            # An explicit end index is used because ``h[:, :-0]`` would be empty when the lookahead is 0.
+            h = h[:, :max(h.shape[1] - self.pre_lookahead_len * self.token_mel_ratio, 0)]
+        mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
+        h = self.encoder_proj(h)
+
+        # get conditions: prompt mel fills the first mel_len1 frames, the rest stays zero
+        conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device)
+        conds[:, :mel_len1] = prompt_feat
+        conds = conds.transpose(1, 2)
+
+        mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
+        feat, _ = self.decoder(
+            mu=h.transpose(1, 2).contiguous(),
+            mask=mask.unsqueeze(1),
+            spks=embedding,
+            cond=conds,
+            n_timesteps=10
+        )
+        feat = feat[:, :, mel_len1:]
+        assert feat.shape[2] == mel_len2
+        return feat, None
diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py
index d011304..05ec37a 100644
--- a/cosyvoice/flow/flow_matching.py
+++ b/cosyvoice/flow/flow_matching.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import onnxruntime
import torch
import torch.nn.functional as F
from matcha.models.components.flow_matching import BASECFM
@@ -88,30 +89,48 @@ class ConditionalCFM(BASECFM):
# Or in future might add like a return_all_steps flag
sol = []
+ if self.inference_cfg_rate > 0:
+ # Do not use concat, it may cause memory format changed and trt infer with wrong results!
+ x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+ mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
+ mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+ t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
+ spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
+ cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+ else:
+ x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond
for step in range(1, len(t_span)):
- dphi_dt = self.forward_estimator(x, mask, mu, t, spks, cond)
# Classifier-Free Guidance inference introduced in VoiceBox
if self.inference_cfg_rate > 0:
- cfg_dphi_dt = self.forward_estimator(
- x, mask,
- torch.zeros_like(mu), t,
- torch.zeros_like(spks) if spks is not None else None,
- torch.zeros_like(cond)
- )
- dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
- self.inference_cfg_rate * cfg_dphi_dt)
+ x_in[:] = x
+ mask_in[:] = mask
+ mu_in[0] = mu
+ t_in[:] = t.unsqueeze(0)
+ spks_in[0] = spks
+ cond_in[0] = cond
+ else:
+ x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond
+ dphi_dt = self.forward_estimator(
+ x_in, mask_in,
+ mu_in, t_in,
+ spks_in,
+ cond_in
+ )
+ if self.inference_cfg_rate > 0:
+ dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
x = x + dt * dphi_dt
t = t + dt
sol.append(x)
if step < len(t_span) - 1:
dt = t_span[step + 1] - t
- return sol[-1]
+ return sol[-1].float()
def forward_estimator(self, x, mask, mu, t, spks, cond):
if isinstance(self.estimator, torch.nn.Module):
return self.estimator.forward(x, mask, mu, t, spks, cond)
- else:
+ elif isinstance(self.estimator, onnxruntime.InferenceSession):
ort_inputs = {
'x': x.cpu().numpy(),
'mask': mask.cpu().numpy(),
@@ -122,6 +141,22 @@ class ConditionalCFM(BASECFM):
}
output = self.estimator.run(None, ort_inputs)[0]
return torch.tensor(output, dtype=x.dtype, device=x.device)
+ else:
+ self.estimator.set_input_shape('x', (2, 80, x.size(2)))
+ self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
+ self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
+ self.estimator.set_input_shape('t', (2,))
+ self.estimator.set_input_shape('spks', (2, 80))
+ self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
+ # run trt engine
+ self.estimator.execute_v2([x.contiguous().data_ptr(),
+ mask.contiguous().data_ptr(),
+ mu.contiguous().data_ptr(),
+ t.contiguous().data_ptr(),
+ spks.contiguous().data_ptr(),
+ cond.contiguous().data_ptr(),
+ x.data_ptr()])
+ return x
def compute_loss(self, x1, mask, mu, spks=None, cond=None):
"""Computes diffusion loss
@@ -163,3 +198,38 @@ class ConditionalCFM(BASECFM):
pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
return loss, y
+
+
+class CausalConditionalCFM(ConditionalCFM):
+    def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
+        super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator)
+        # sampled once here and sliced in forward(), so every call starts from the same noise prefix
+        self.rand_noise = torch.randn([1, 80, 50 * 300])
+
+    @torch.inference_mode()
+    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
+        """Integrate the flow with Euler steps, starting from the stored noise.
+
+        Args:
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            n_timesteps (int): number of Euler steps
+            temperature (float, optional): scale applied to the noise. Defaults to 1.0.
+            spks (torch.Tensor, optional): speaker conditioning. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond: conditioning tensor, passed through to ``solve_euler``
+
+        Returns:
+            tuple: (sample, None), where sample is the ``solve_euler`` result
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        noise = self.rand_noise[:, :, :mu.size(2)].to(mu.device) * temperature
+        if self.fp16 is True:
+            noise = noise.half()
+        # time grid on [0, 1]; the cosine scheduler warps it below
+        steps = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
+        if self.t_scheduler == 'cosine':
+            steps = 1 - torch.cos(steps * 0.5 * torch.pi)
+        return self.solve_euler(noise, t_span=steps, mu=mu, mask=mask, spks=spks, cond=cond), None
diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py
index cf9c231..814e4d1 100644
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -15,6 +15,7 @@ from typing import Dict, Optional, Callable, List, Generator
import torch
from torch import nn
import torch.nn.functional as F
+from transformers import Qwen2ForCausalLM
from torch.nn.utils.rnn import pad_sequence, unpad_sequence
from cosyvoice.utils.common import IGNORE_ID
from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
@@ -213,3 +214,127 @@ class TransformerLM(torch.nn.Module):
out_tokens.append(top_ids)
offset += lm_input.size(1)
lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
+
+
+class Qwen2Encoder(torch.nn.Module):
+    """Wraps a pretrained ``Qwen2ForCausalLM`` and exposes cached decoding steps on input embeddings."""
+    def __init__(self, pretrain_path):
+        super().__init__()
+        self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path)
+
+    def forward_one_step(self, xs, masks, cache=None):
+        """Run the model on embeddings ``xs``; return ``(last-layer hidden states, updated KV cache)``."""
+        # only the last row of ``masks`` is handed to the model as its attention mask
+        result = self.model(
+            inputs_embeds=xs,
+            attention_mask=masks[:, -1, :],
+            output_hidden_states=True,
+            return_dict=True,
+            use_cache=True,
+            past_key_values=cache,
+        )
+        return result.hidden_states[-1], result.past_key_values
+
+
+class Qwen2LM(torch.nn.Module):
+    """Speech-token language model: an ``llm`` backbone with its own speech-token embedding and output head."""
+    def __init__(
+        self,
+        llm_input_size: int,
+        llm_output_size: int,
+        speech_token_size: int,
+        llm: torch.nn.Module,
+        sampling: Callable,
+        length_normalized_loss: bool = True,
+        lsm_weight: float = 0.0,
+    ):
+        super().__init__()
+        self.llm_input_size = llm_input_size
+        self.llm_output_size = llm_output_size
+        self.speech_token_size = speech_token_size
+
+        # 2. build speech token language model related modules
+        self.sos_eos = 0
+        self.task_id = 1
+        self.fill_token = 2
+
+        self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
+        self.llm = llm
+        self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 3)
+        self.criterion_ce = LabelSmoothingLoss(
+            size=speech_token_size + 3,
+            padding_idx=IGNORE_ID,
+            smoothing=lsm_weight,
+            normalize_length=length_normalized_loss,
+        )
+
+        # 3. [Optional] build speech token related modules
+        self.speech_embedding = torch.nn.Embedding(speech_token_size + 3, llm_input_size)
+
+        # 4. sampling method
+        self.sampling = sampling
+
+    def sampling_ids(
+        self, weighted_scores: torch.Tensor, decoded_tokens: List,
+        sampling: int, ignore_eos: bool = True,
+    ):
+        """Draw with ``self.sampling``; while ``ignore_eos`` is set, redraw until the EOS id (``speech_token_size``) is absent."""
+        while True:
+            top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
+            if (not ignore_eos) or (self.speech_token_size not in top_ids):
+                break
+        return top_ids
+
+    @torch.inference_mode()
+    def inference(
+        self,
+        text: torch.Tensor,
+        text_len: torch.Tensor,
+        prompt_text: torch.Tensor,
+        prompt_text_len: torch.Tensor,
+        prompt_speech_token: torch.Tensor,
+        prompt_speech_token_len: torch.Tensor,
+        embedding: torch.Tensor,
+        sampling: int = 25,
+        max_token_text_ratio: float = 20,
+        min_token_text_ratio: float = 2,
+    ) -> Generator[torch.Tensor, None, None]:
+        """Decode speech tokens for ``prompt_text + text`` one by one, yielding each id; ``embedding`` is not used."""
+        device = text.device
+        text = torch.concat([prompt_text, text], dim=1)
+        text_len = text_len + prompt_text_len  # out-of-place on purpose: ``+=`` would overwrite the caller's tensor
+        text = self.llm.model.model.embed_tokens(text)
+
+        # 2. encode embedding (the speaker embedding argument is replaced by an empty tensor)
+        embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+
+        # 3. concat llm_input
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+        if prompt_speech_token_len != 0:
+            prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
+        else:
+            prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+        lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)
+
+        # 4. cal min/max_length from the length of ``text`` alone (prompt excluded)
+        min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
+        max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
+
+        # 5. step by step decode
+        out_tokens = []
+        cache = None
+        for i in range(max_len):
+            y_pred, cache = self.llm.forward_one_step(lm_input,
+                                                      masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool),
+                                                      cache=cache)
+            logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
+            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
+            if top_ids == self.speech_token_size:
+                break
+            if top_ids > self.speech_token_size:
+                continue
+            # in stream mode, yield token one by one
+            yield top_ids
+            out_tokens.append(top_ids)
+            lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py
index caecf26..00c97c9 100644
--- a/cosyvoice/tokenizer/tokenizer.py
+++ b/cosyvoice/tokenizer/tokenizer.py
@@ -2,6 +2,8 @@ import base64
import os
from functools import lru_cache
from typing import Optional
+import torch
+from transformers import AutoTokenizer
from whisper.tokenizer import Tokenizer
import tiktoken
@@ -234,3 +236,43 @@ def get_tokenizer(
return Tokenizer(
encoding=encoding, num_languages=num_languages, language=language, task=task
)
+
+
+class QwenTokenizer():
+    """Hugging Face tokenizer loaded from ``token_path`` and extended with the special tokens listed below."""
+    def __init__(self, token_path, skip_special_tokens=True):
+        super().__init__()
+        # NOTE: non-chat model, all these special tokens keep randomly initialized.
+        special_tokens = {
+            'eos_token': '<|endoftext|>',
+            'pad_token': '<|endoftext|>',
+            'additional_special_tokens': [
+                '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
+                '[breath]', '<strong>', '</strong>', '[noise]',
+                '[laughter]', '[cough]', '[clucking]', '[accent]',
+                '[quick_breath]',
+                "<laughter>", "</laughter>",
+                "[hissing]", "[sigh]", "[vocalized-noise]",
+                "[lipsmack]", "[mn]"
+            ]
+        }
+        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
+        self.tokenizer.add_special_tokens(special_tokens)
+        self.skip_special_tokens = skip_special_tokens
+
+    def encode(self, text, **kwargs):
+        tokens = self.tokenizer([text], return_tensors="pt")  # extra ``kwargs`` are accepted but not used
+        return tokens["input_ids"][0].cpu().tolist()
+
+    def decode(self, tokens):
+        tokens = torch.tensor(tokens, dtype=torch.int64)
+        text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0]
+        return text
+
+
+@lru_cache(maxsize=None)
+def get_qwen_tokenizer(token_path: str, skip_special_tokens: bool) -> QwenTokenizer:
+    """Return a ``QwenTokenizer`` for ``token_path``.
+
+    Results are memoized by ``lru_cache``, so repeated calls with equal arguments reuse one instance."""
+    return QwenTokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
diff --git a/cosyvoice/transformer/encoder_layer.py b/cosyvoice/transformer/encoder_layer.py
index dfd758b..efbb12d 100644
--- a/cosyvoice/transformer/encoder_layer.py
+++ b/cosyvoice/transformer/encoder_layer.py
@@ -49,8 +49,8 @@ class TransformerEncoderLayer(nn.Module):
super().__init__()
self.self_attn = self_attn
self.feed_forward = feed_forward
- self.norm1 = nn.LayerNorm(size, eps=1e-5)
- self.norm2 = nn.LayerNorm(size, eps=1e-5)
+ self.norm1 = nn.LayerNorm(size, eps=1e-12)
+ self.norm2 = nn.LayerNorm(size, eps=1e-12)
self.dropout = nn.Dropout(dropout_rate)
self.size = size
self.normalize_before = normalize_before
@@ -142,17 +142,17 @@ class ConformerEncoderLayer(nn.Module):
self.feed_forward = feed_forward
self.feed_forward_macaron = feed_forward_macaron
self.conv_module = conv_module
- self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module
- self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module
+ self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module
+ self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module
if feed_forward_macaron is not None:
- self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5)
+ self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12)
self.ff_scale = 0.5
else:
self.ff_scale = 1.0
if self.conv_module is not None:
- self.norm_conv = nn.LayerNorm(size, eps=1e-5) # for the CNN module
+ self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module
self.norm_final = nn.LayerNorm(
- size, eps=1e-5) # for the final output of the block
+ size, eps=1e-12) # for the final output of the block
self.dropout = nn.Dropout(dropout_rate)
self.size = size
self.normalize_before = normalize_before
diff --git a/cosyvoice/transformer/upsample_encoder.py b/cosyvoice/transformer/upsample_encoder.py
new file mode 100644
index 0000000..f67fb98
--- /dev/null
+++ b/cosyvoice/transformer/upsample_encoder.py
@@ -0,0 +1,318 @@
+# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
+# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
+# 2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from ESPnet(https://github.com/espnet/espnet)
+"""Encoder definition."""
+from typing import Tuple
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from cosyvoice.transformer.convolution import ConvolutionModule
+from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer
+from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
+from cosyvoice.utils.class_utils import (
+ COSYVOICE_EMB_CLASSES,
+ COSYVOICE_SUBSAMPLE_CLASSES,
+ COSYVOICE_ATTENTION_CLASSES,
+ COSYVOICE_ACTIVATION_CLASSES,
+)
+from cosyvoice.utils.mask import make_pad_mask
+from cosyvoice.utils.mask import add_optional_chunk_mask
+
+
+class Upsample1D(nn.Module):
+    """Nearest-neighbour upsampling along time followed by a left-padded (causal) convolution.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs.
+        out_channels (`int`):
+            number of channels produced by the convolution.
+        stride (`int`, default `2`):
+            upsampling factor; the convolution kernel size is `stride * 2 + 1`.
+    """
+
+    def __init__(self, channels: int, out_channels: int, stride: int = 2):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels
+        self.stride = stride
+        # In this mode, first repeat interpolate, then conv with stride=1
+        kernel_size = stride * 2 + 1
+        self.conv = nn.Conv1d(channels, out_channels, kernel_size, stride=1, padding=0)
+
+    def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
+        """Upsample ``inputs`` (batch, channels, time); returns ``(outputs, input_lengths * stride)``."""
+        upsampled = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
+        # kernel_size - 1 zeros on the left only, so the conv keeps the upsampled length
+        upsampled = F.pad(upsampled, (self.stride * 2, 0), value=0.0)
+        return self.conv(upsampled), input_lengths * self.stride
+
+
+class PreLookaheadLayer(nn.Module):
+    """Residual two-conv block: conv1 sees ``pre_lookahead_len`` frames ahead, conv2 sees 2 frames back."""
+    def __init__(self, channels: int, pre_lookahead_len: int = 1):
+        super().__init__()
+        self.channels = channels
+        self.pre_lookahead_len = pre_lookahead_len
+        # both convs are unpadded; forward() pads explicitly to choose which side is visible
+        self.conv1 = nn.Conv1d(
+            channels, channels,
+            kernel_size=pre_lookahead_len + 1,
+            stride=1, padding=0,
+        )
+        self.conv2 = nn.Conv1d(
+            channels, channels, kernel_size=3, stride=1, padding=0,
+        )
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        """
+        inputs: (batch_size, seq_len, channels)
+        returns: tensor of the same shape (convolution branch added to ``inputs``)
+        """
+        hidden = inputs.transpose(1, 2).contiguous()
+        # look ahead: zeros on the right let conv1 cover the next pre_lookahead_len frames
+        hidden = F.pad(hidden, (0, self.pre_lookahead_len), mode='constant', value=0.0)
+        hidden = F.leaky_relu(self.conv1(hidden))
+        # zeros on the left make the kernel-3 conv2 use the current and two previous frames
+        hidden = F.pad(hidden, (2, 0), mode='constant', value=0.0)
+        hidden = self.conv2(hidden).transpose(1, 2).contiguous()
+
+        # residual connection
+        return hidden + inputs
+
+
+class UpsampleConformerEncoder(torch.nn.Module):
+    """Conformer stack with a pre-lookahead conv, then time upsampling and a second conformer stack."""
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int = 256,
+        attention_heads: int = 4,
+        linear_units: int = 2048,
+        num_blocks: int = 6,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        input_layer: str = "conv2d",
+        pos_enc_layer_type: str = "rel_pos",
+        normalize_before: bool = True,
+        static_chunk_size: int = 0,
+        use_dynamic_chunk: bool = False,
+        global_cmvn: torch.nn.Module = None,
+        use_dynamic_left_chunk: bool = False,
+        positionwise_conv_kernel_size: int = 1,
+        macaron_style: bool = True,
+        selfattention_layer_type: str = "rel_selfattn",
+        activation_type: str = "swish",
+        use_cnn_module: bool = True,
+        cnn_module_kernel: int = 15,
+        causal: bool = False,
+        cnn_module_norm: str = "batch_norm",
+        key_bias: bool = True,
+        gradient_checkpointing: bool = False,
+    ):
+        """
+        Args:
+            input_size (int): input dim
+            output_size (int): dimension of attention
+            attention_heads (int): the number of heads of multi head attention
+            linear_units (int): the hidden units number of position-wise feed
+                forward
+            num_blocks (int): the number of conformer blocks before upsampling
+            dropout_rate (float): dropout rate
+            attention_dropout_rate (float): dropout rate in attention
+            positional_dropout_rate (float): dropout rate after adding
+                positional encoding
+            input_layer (str): input layer type.
+                optional [linear, conv2d, conv2d6, conv2d8]
+            pos_enc_layer_type (str): Encoder positional encoding layer type.
+                optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
+            normalize_before (bool):
+                True: use layer_norm before each sub-block of a layer.
+                False: use layer_norm after each sub-block of a layer.
+            static_chunk_size (int): chunk size for static chunk training and
+                decoding
+            use_dynamic_chunk (bool): whether use dynamic chunk size for
+                training or not, You can only use fixed chunk(chunk_size > 0)
+                or dynamic chunk size(use_dynamic_chunk = True)
+            global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
+            use_dynamic_left_chunk (bool): whether use dynamic left chunk in
+                dynamic chunk training
+            key_bias: whether use bias in attention.linear_k, False for whisper models.
+            gradient_checkpointing: rerunning a forward-pass segment for each
+                checkpointed segment during backward.
+        """
+        super().__init__()
+        self._output_size = output_size
+
+        self.global_cmvn = global_cmvn
+        self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
+            input_size,
+            output_size,
+            dropout_rate,
+            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
+                                                      positional_dropout_rate),
+        )
+
+        self.normalize_before = normalize_before
+        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
+        self.static_chunk_size = static_chunk_size
+        self.use_dynamic_chunk = use_dynamic_chunk
+        self.use_dynamic_left_chunk = use_dynamic_left_chunk
+        self.gradient_checkpointing = gradient_checkpointing
+        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
+        # self-attention module definition
+        encoder_selfattn_layer_args = (
+            attention_heads,
+            output_size,
+            attention_dropout_rate,
+            key_bias,
+        )
+        # feed-forward module definition
+        positionwise_layer_args = (
+            output_size,
+            linear_units,
+            dropout_rate,
+            activation,
+        )
+        # convolution module definition
+        convolution_layer_args = (output_size, cnn_module_kernel, activation,
+                                  cnn_module_norm, causal)
+        self.pre_lookahead_layer = PreLookaheadLayer(channels=output_size, pre_lookahead_len=3)  # runs on self.embed's output
+        self.encoders = torch.nn.ModuleList([
+            ConformerEncoderLayer(
+                output_size,
+                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
+                    *encoder_selfattn_layer_args),
+                PositionwiseFeedForward(*positionwise_layer_args),
+                PositionwiseFeedForward(
+                    *positionwise_layer_args) if macaron_style else None,
+                ConvolutionModule(
+                    *convolution_layer_args) if use_cnn_module else None,
+                dropout_rate,
+                normalize_before,
+            ) for _ in range(num_blocks)
+        ])
+        self.up_layer = Upsample1D(channels=output_size, out_channels=output_size, stride=2)
+        self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
+            output_size,  # fed with the first stack's output, whose width is output_size rather than input_size
+            output_size,
+            dropout_rate,
+            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
+                                                      positional_dropout_rate),
+        )
+        self.up_encoders = torch.nn.ModuleList([
+            ConformerEncoderLayer(
+                output_size,
+                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
+                    *encoder_selfattn_layer_args),
+                PositionwiseFeedForward(*positionwise_layer_args),
+                PositionwiseFeedForward(
+                    *positionwise_layer_args) if macaron_style else None,
+                ConvolutionModule(
+                    *convolution_layer_args) if use_cnn_module else None,
+                dropout_rate,
+                normalize_before,
+            ) for _ in range(4)
+        ])
+
+    def output_size(self) -> int:
+        return self._output_size
+
+    def forward(
+        self,
+        xs: torch.Tensor,
+        xs_lens: torch.Tensor,
+        decoding_chunk_size: int = 0,
+        num_decoding_left_chunks: int = -1,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Embed positions in tensor.
+
+        Args:
+            xs: padded input tensor (B, T, D)
+            xs_lens: input length (B)
+            decoding_chunk_size: decoding chunk size for dynamic chunk
+                0: default for training, use random dynamic chunk.
+                <0: for decoding, use full chunk.
+                >0: for decoding, use fixed chunk size as set.
+            num_decoding_left_chunks: number of left chunks, this is for decoding,
+            the chunk size is decoding_chunk_size.
+                >=0: use num_decoding_left_chunks
+                <0: use all left chunks
+        Returns:
+            encoder output tensor xs, and the masks of the upsampled sequence
+            xs: padded output tensor, upsampled in time by ``self.up_layer``
+            masks: torch.Tensor batch padding mask returned by ``self.up_embed``
+                for the upsampled sequence
+        NOTE(xcsong):
+            We pass the `__call__` method of the modules instead of `forward` to the
+            checkpointing API because `__call__` attaches all the hooks of the module.
+            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
+        """
+        T = xs.size(1)
+        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
+        if self.global_cmvn is not None:
+            xs = self.global_cmvn(xs)
+        xs, pos_emb, masks = self.embed(xs, masks)
+        mask_pad = masks  # (B, 1, T/subsample_rate)
+        chunk_masks = add_optional_chunk_mask(xs, masks,
+                                              self.use_dynamic_chunk,
+                                              self.use_dynamic_left_chunk,
+                                              decoding_chunk_size,
+                                              self.static_chunk_size,
+                                              num_decoding_left_chunks)
+        # lookahead + conformer encoder
+        xs = self.pre_lookahead_layer(xs)
+        xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
+
+        # upsample + conformer encoder
+        xs = xs.transpose(1, 2).contiguous()
+        xs, xs_lens = self.up_layer(xs, xs_lens)
+        xs = xs.transpose(1, 2).contiguous()
+        T = xs.size(1)
+        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
+        xs, pos_emb, masks = self.up_embed(xs, masks)
+        mask_pad = masks  # (B, 1, T/subsample_rate)
+        chunk_masks = add_optional_chunk_mask(xs, masks,
+                                              self.use_dynamic_chunk,
+                                              self.use_dynamic_left_chunk,
+                                              decoding_chunk_size,
+                                              self.static_chunk_size * self.up_layer.stride,
+                                              num_decoding_left_chunks)
+        xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad)
+
+        if self.normalize_before:
+            xs = self.after_norm(xs)
+        # Here we assume the mask is not changed in encoder layers, so just
+        # return the masks before encoder layers, and the masks will be used
+        # for cross attention with decoder later
+        return xs, masks
+
+    def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
+                       pos_emb: torch.Tensor,
+                       mask_pad: torch.Tensor) -> torch.Tensor:
+        for layer in self.encoders:
+            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
+        return xs
+
+    def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
+                          pos_emb: torch.Tensor,
+                          mask_pad: torch.Tensor) -> torch.Tensor:
+        for layer in self.up_encoders:
+            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
+        return xs
diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py
index f926424..22b944d 100644
--- a/cosyvoice/utils/common.py
+++ b/cosyvoice/utils/common.py
@@ -160,3 +160,15 @@ def is_only_punctuation(text):
# Regular expression: Match strings that consist only of punctuation marks or are empty.
punctuation_pattern = r'^[\p{P}\p{S}]*$'
return bool(regex.fullmatch(punctuation_pattern, text))
+
+
+def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+    """Convert a boolean ``mask`` into an additive bias of ``dtype``: zero where True, ``torch.finfo(dtype).min`` where False."""
+    assert mask.dtype == torch.bool
+    assert dtype in (torch.float32, torch.bfloat16, torch.float16)
+    # NOTE(review): an earlier note by Mddct reports jit issues with torch.finfo, yet the
+    # live expression below relies on it -- confirm scripting works before exporting.
+    lowest = torch.finfo(dtype).min
+    keep = mask.to(dtype)
+    return (1.0 - keep) * lowest
+
diff --git a/requirements.txt b/requirements.txt
index d6b3ca2..7fd5596 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
---extra-index-url https://download.pytorch.org/whl/cu118
+--extra-index-url https://download.pytorch.org/whl/cu121
+--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684
conformer==0.3.2
deepspeed==0.14.2; sys_platform == 'linux'
diffusers==0.27.2
@@ -17,16 +18,20 @@ modelscope==1.15.0
networkx==3.1
omegaconf==2.3.0
onnx==1.16.0
-onnxruntime-gpu==1.16.0; sys_platform == 'linux'
-onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows'
+onnxruntime-gpu==1.18.0; sys_platform == 'linux'
+onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32'  # sys_platform mirrors sys.platform, which is 'win32' (never 'windows') on Windows
openai-whisper==20231117
protobuf==4.25
pydantic==2.7.0
rich==13.7.1
soundfile==0.12.1
tensorboard==2.14.0
-torch==2.0.1
-torchaudio==2.0.2
+tensorrt-cu12==10.0.1
+tensorrt-cu12-bindings==10.0.1
+tensorrt-cu12-libs==10.0.1
+torch==2.3.1
+torchaudio==2.3.1
+transformers==4.40.1
uvicorn==0.30.0
wget==3.2
fastapi==0.111.0
diff --git a/webui.py b/webui.py
index 0233a94..196718d 100644
--- a/webui.py
+++ b/webui.py
@@ -22,7 +22,7 @@ import random
import librosa
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav, logging
from cosyvoice.utils.common import set_all_random_seed
@@ -51,7 +51,7 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440):
)
if speech.abs().max() > max_val:
speech = speech / speech.abs().max() * max_val
- speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
+ speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
return speech
@@ -71,31 +71,31 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
if mode_checkbox_group in ['自然语言控制']:
if cosyvoice.frontend.instruct is False:
gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir))
- yield (target_sr, default_data)
+ yield (cosyvoice.sample_rate, default_data)
if instruct_text == '':
gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
- yield (target_sr, default_data)
+ yield (cosyvoice.sample_rate, default_data)
if prompt_wav is not None or prompt_text != '':
gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
if mode_checkbox_group in ['跨语种复刻']:
if cosyvoice.frontend.instruct is True:
gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir))
- yield (target_sr, default_data)
+ yield (cosyvoice.sample_rate, default_data)
if instruct_text != '':
gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
if prompt_wav is None:
gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
- yield (target_sr, default_data)
+ yield (cosyvoice.sample_rate, default_data)
gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言')
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
if prompt_wav is None:
gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
- yield (target_sr, default_data)
+ yield (cosyvoice.sample_rate, default_data)
if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
- yield (target_sr, default_data)
+ yield (cosyvoice.sample_rate, default_data)
# sft mode only use sft_dropdown
if mode_checkbox_group in ['预训练音色']:
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
@@ -104,7 +104,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
if mode_checkbox_group in ['3s极速复刻']:
if prompt_text == '':
gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
- yield (target_sr, default_data)
+ yield (cosyvoice.sample_rate, default_data)
if instruct_text != '':
gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!')
@@ -112,24 +112,24 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
logging.info('get sft inference request')
set_all_random_seed(seed)
for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
- yield (target_sr, i['tts_speech'].numpy().flatten())
+ yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
elif mode_checkbox_group == '3s极速复刻':
logging.info('get zero_shot inference request')
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
set_all_random_seed(seed)
for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
- yield (target_sr, i['tts_speech'].numpy().flatten())
+ yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
elif mode_checkbox_group == '跨语种复刻':
logging.info('get cross_lingual inference request')
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
set_all_random_seed(seed)
for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
- yield (target_sr, i['tts_speech'].numpy().flatten())
+ yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
else:
logging.info('get instruct inference request')
set_all_random_seed(seed)
for i in cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream, speed=speed):
- yield (target_sr, i['tts_speech'].numpy().flatten())
+ yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
def main():
@@ -144,7 +144,7 @@ def main():
with gr.Row():
mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
- sft_dropdown = gr.Dropdown(choices=sft_spk, label='选择预训练音色', value=sft_spk[0], scale=0.25)
+ sft_dropdown = gr.Dropdown(choices=sft_spk, label='选择预训练音色', value=sft_spk[0] if len(sft_spk) != 0 else '', scale=0.25)
stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1])
speed = gr.Number(value=1, label="速度调节(仅支持非流式推理)", minimum=0.5, maximum=2.0, step=0.1)
with gr.Column(scale=0.25):
@@ -178,11 +178,11 @@ if __name__ == '__main__':
default=8000)
parser.add_argument('--model_dir',
type=str,
- default='pretrained_models/CosyVoice-300M',
+ default='pretrained_models/CosyVoice2-0.5B',
help='local path or modelscope repo id')
args = parser.parse_args()
- cosyvoice = CosyVoice(args.model_dir)
+ cosyvoice = CosyVoice2(args.model_dir) if 'CosyVoice2' in args.model_dir else CosyVoice(args.model_dir)
sft_spk = cosyvoice.list_avaliable_spks()
- prompt_sr, target_sr = 16000, 22050
- default_data = np.zeros(target_sr)
+ prompt_sr = 16000
+ default_data = np.zeros(cosyvoice.sample_rate)
main()