From 2345ce6be2548f134c6366d24c2fd69d045ee33a Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx" <lyuxiang.lx@alibaba-inc.com>
Date: Thu, 12 Dec 2024 15:43:17 +0800
Subject: [PATCH] update

---
 README.md                        | 10 +++++-----
 cosyvoice/flow/flow.py           |  6 +++++-
 cosyvoice/tokenizer/tokenizer.py |  5 +++++
 requirements.txt                 |  7 ++++---
 4 files changed, 19 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index 0e01098..9a717cd 100644
--- a/README.md
+++ b/README.md
@@ -116,27 +116,27 @@ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, loa
 print(cosyvoice.list_avaliable_spks())
 # change stream=True for chunk stream inference
 for i, j in enumerate(cosyvoice.inference_sft('你好，我是通义生成式语音大模型，请问有什么可以帮您的吗？', '中文女', stream=False)):
-    torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
+    torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
 cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
 # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], 22050)
+    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 # cross_lingual usage
 prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
-    torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
+    torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 # vc usage
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
-    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
+    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
 cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
 # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
 for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时，他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
-    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
+    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 ```
 
 **Start web demo**
diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py
index 459e3fc..d99c495 100644
--- a/cosyvoice/flow/flow.py
+++ b/cosyvoice/flow/flow.py
@@ -157,6 +157,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
                  vocab_size: int = 4096,
                  input_frame_rate: int = 50,
                  only_mask_loss: bool = True,
+                 token_mel_ratio: int = 2,
+                 pre_lookahead_len: int = 3,
                  encoder: torch.nn.Module = None,
                  decoder: torch.nn.Module = None,
                  decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
@@ -181,6 +183,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
         self.decoder = decoder
         self.only_mask_loss = only_mask_loss
+        self.token_mel_ratio = token_mel_ratio
+        self.pre_lookahead_len = pre_lookahead_len
 
     @torch.inference_mode()
     def inference(self,
@@ -206,7 +210,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         # text encode
         h, h_lengths = self.encoder(token, token_len)
         if finalize is False:
-            h = h[:, :-self.encoder.pre_lookahead_layer.pre_lookahead_len * self.encoder.up_layer.stride]
+            h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
         mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] -  prompt_feat.shape[1]
         h = self.encoder_proj(h)
 
diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py
index 3cbe8b5..fbe78ff 100644
--- a/cosyvoice/tokenizer/tokenizer.py
+++ b/cosyvoice/tokenizer/tokenizer.py
@@ -240,6 +240,8 @@ def get_tokenizer(
 
 class QwenTokenizer():
     def __init__(self, token_path, skip_special_tokens=True):
+        super().__init__()
+        # NOTE: non-chat model, so all of these special tokens remain randomly initialized.
         special_tokens = {
             'eos_token': '<|endoftext|>',
             'pad_token': '<|endoftext|>',
@@ -248,6 +250,9 @@ class QwenTokenizer():
                 '[breath]', '<strong>', '</strong>', '[noise]',
                 '[laughter]', '[cough]', '[clucking]', '[accent]',
                 '[quick_breath]',
+                "<laughter>", "</laughter>",
+                "[hissing]", "[sigh]", "[vocalized-noise]",
+                "[lipsmack]", "[mn]"
             ]
         }
         self.tokenizer = AutoTokenizer.from_pretrained(token_path)
diff --git a/requirements.txt b/requirements.txt
index d6b3ca2..da758a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
---extra-index-url https://download.pytorch.org/whl/cu118
+--extra-index-url https://download.pytorch.org/whl/cu121
 conformer==0.3.2
 deepspeed==0.14.2; sys_platform == 'linux'
 diffusers==0.27.2
@@ -25,8 +25,9 @@ pydantic==2.7.0
 rich==13.7.1
 soundfile==0.12.1
 tensorboard==2.14.0
-torch==2.0.1
-torchaudio==2.0.2
+tensorrt-cu12==10.0.1
+torch==2.3.1+cu121
+torchaudio==2.3.1+cu121
 uvicorn==0.30.0
 wget==3.2
 fastapi==0.111.0