From 2345ce6be2548f134c6366d24c2fd69d045ee33a Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 12 Dec 2024 15:43:17 +0800 Subject: [PATCH] update --- README.md | 10 +++++----- cosyvoice/flow/flow.py | 6 +++++- cosyvoice/tokenizer/tokenizer.py | 5 +++++ requirements.txt | 7 ++++--- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0e01098..9a717cd 100644 --- a/README.md +++ b/README.md @@ -116,27 +116,27 @@ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, loa print(cosyvoice.list_avaliable_spks()) # change stream=True for chunk stream inference for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)): - torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): - torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) # cross_lingual usage prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)): - torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) # vc usage prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000) source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000) for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)): - torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct') # instruct usage, support [laughter][breath] for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)): - torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050) + torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) ``` **Start web demo** diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index 459e3fc..d99c495 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -157,6 +157,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): vocab_size: int = 4096, input_frame_rate: int = 50, only_mask_loss: bool = True, + token_mel_ratio: int = 2, + pre_lookahead_len: int = 3, encoder: torch.nn.Module = None, decoder: torch.nn.Module = None, decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, @@ -181,6 +183,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) self.decoder = decoder self.only_mask_loss = only_mask_loss + self.token_mel_ratio = token_mel_ratio + self.pre_lookahead_len = pre_lookahead_len @torch.inference_mode() def inference(self, @@ -206,7 +210,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module): # text encode h, h_lengths = self.encoder(token, token_len) if finalize is False: - h = h[:, :-self.encoder.pre_lookahead_layer.pre_lookahead_len * self.encoder.up_layer.stride] + h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio] mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1] h = self.encoder_proj(h) diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py index 3cbe8b5..fbe78ff 100644 --- a/cosyvoice/tokenizer/tokenizer.py +++ b/cosyvoice/tokenizer/tokenizer.py @@ -240,6 +240,8 @@ def get_tokenizer( class QwenTokenizer(): def __init__(self, token_path, skip_special_tokens=True): + super().__init__() + # NOTE: non-chat model, all these special tokens keep randomly initialized. special_tokens = { 'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', @@ -248,6 +250,9 @@ class QwenTokenizer(): '[breath]', '', '', '[noise]', '[laughter]', '[cough]', '[clucking]', '[accent]', '[quick_breath]', + "", "", + "[hissing]", "[sigh]", "[vocalized-noise]", + "[lipsmack]", "[mn]" ] } self.tokenizer = AutoTokenizer.from_pretrained(token_path) diff --git a/requirements.txt b/requirements.txt index d6b3ca2..da758a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 +--extra-index-url https://download.pytorch.org/whl/torch_stable.html conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.27.2 @@ -25,8 +25,9 @@ pydantic==2.7.0 rich==13.7.1 soundfile==0.12.1 tensorboard==2.14.0 -torch==2.0.1 -torchaudio==2.0.2 +tensorrt-cu12==10.0.1 +torch==2.3.1+cu121 +torchaudio==2.3.1+cu121 uvicorn==0.30.0 wget==3.2 fastapi==0.111.0