From 1298d90e4850b4595f4ac45048a10fea18ca213c Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Mon, 16 Dec 2024 14:05:00 +0800
Subject: [PATCH] update readme

---
 README.md                        | 16 +++++++++++++---
 cosyvoice/cli/cosyvoice.py       |  2 +-
 cosyvoice/cli/frontend.py        |  4 ++++
 cosyvoice/tokenizer/tokenizer.py |  1 +
 4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2c3b59f..df25829 100644
--- a/README.md
+++ b/README.md
@@ -131,19 +131,29 @@ export PYTHONPATH=third_party/Matcha-TTS
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
 from cosyvoice.utils.file_utils import load_wav
 import torchaudio
+```
 
-# cosyvoice2
+**CosyVoice2 Usage**
+```python
 cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
 
 # zero_shot usage
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
     torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+
+# fine grained control
+prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
+for i, j in enumerate(cosyvoice.inference_cross_lingual('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', prompt_speech_16k, stream=False)):
+    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+
 # instruct usage
 for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+```
 
-# cosyvoice
+**CosyVoice Usage**
+```python
 cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
 # sft usage
 print(cosyvoice.list_avaliable_spks())
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index c7e3b4e..d512de5 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -85,7 +85,7 @@ class CosyVoice:
             start_time = time.time()
 
     def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
-        if self.frontend.instruct is True:
+        if self.frontend.instruct is True and isinstance(self.model, CosyVoiceModel):
             raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate)
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index 9885a0f..228ec41 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -109,6 +109,10 @@ class CosyVoiceFrontEnd:
 
     def text_normalize(self, text, split=True):
         text = text.strip()
+        # NOTE(lyuxiang.lx) move this judgement into ttsfrd in the future
+        for token in self.tokenizer.special_tokens['additional_special_tokens']:
+            if token in text:
+                return text if split is False else [text]
         if contains_chinese(text):
             if self.use_ttsfrd:
                 texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py
index 00c97c9..43fb39a 100644
--- a/cosyvoice/tokenizer/tokenizer.py
+++ b/cosyvoice/tokenizer/tokenizer.py
@@ -255,6 +255,7 @@ class QwenTokenizer():
                 "[lipsmack]", "[mn]"
             ]
         }
+        self.special_tokens = special_tokens
         self.tokenizer = AutoTokenizer.from_pretrained(token_path)
         self.tokenizer.add_special_tokens(special_tokens)
         self.skip_special_tokens = skip_special_tokens
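
Note on the frontend.py change: once a text contains one of the tokenizer's additional special tokens, `text_normalize()` now returns it as-is (no ttsfrd/wetext normalization, no sentence splitting), which is what makes fine-grained control tags survive to the model. The sketch below is not part of the patch; it assumes `[laughter]` is among `additional_special_tokens` (only `[lipsmack]` and `[mn]` are visible in the hunk above) and otherwise reuses the README's fine-grained control call.

```python
# Minimal sketch (assumption: '[laughter]' is a registered additional special token).
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)

# Because the text carries a special token, the patched text_normalize()
# short-circuits and hands it to the model unsplit and unnormalized.
text = 'He told the story, then paused [laughter] before finishing it.'
for i, j in enumerate(cosyvoice.inference_cross_lingual(text, prompt_speech_16k, stream=False)):
    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```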