From 1298d90e4850b4595f4ac45048a10fea18ca213c Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Mon, 16 Dec 2024 14:05:00 +0800
Subject: [PATCH] update readme

---
 README.md                        | 16 +++++++++++++---
 cosyvoice/cli/cosyvoice.py       |  2 +-
 cosyvoice/cli/frontend.py        |  4 ++++
 cosyvoice/tokenizer/tokenizer.py |  1 +
 4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2c3b59f..df25829 100644
--- a/README.md
+++ b/README.md
@@ -131,19 +131,29 @@ export PYTHONPATH=third_party/Matcha-TTS
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
 from cosyvoice.utils.file_utils import load_wav
 import torchaudio
+```
 
-# cosyvoice2
+**CosyVoice2 Usage**
+```python
 cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
 
 # zero_shot usage
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
     torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+
+# fine grained control
+prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
+for i, j in enumerate(cosyvoice.inference_cross_lingual('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', prompt_speech_16k, stream=False)):
+    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+
 # instruct usage
 for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+```
 
-# cosyvoice
+**CosyVoice Usage**
+```python
 cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
 # sft usage
 print(cosyvoice.list_avaliable_spks())
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index c7e3b4e..d512de5 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -85,7 +85,7 @@ class CosyVoice:
             start_time = time.time()
 
     def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
-        if self.frontend.instruct is True:
+        if self.frontend.instruct is True and isinstance(self.model, CosyVoiceModel):
             raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate)
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index 9885a0f..228ec41 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -109,6 +109,10 @@ class CosyVoiceFrontEnd:
 
     def text_normalize(self, text, split=True):
         text = text.strip()
+        # NOTE(lyuxiang.lx) move this judgement into ttsfrd in the future
+        for token in self.tokenizer.special_tokens['additional_special_tokens']:
+            if token in text:
+                return text if split is False else [text]
         if contains_chinese(text):
             if self.use_ttsfrd:
                 texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
diff --git a/cosyvoice/tokenizer/tokenizer.py b/cosyvoice/tokenizer/tokenizer.py
index 00c97c9..43fb39a 100644
--- a/cosyvoice/tokenizer/tokenizer.py
+++ b/cosyvoice/tokenizer/tokenizer.py
@@ -255,6 +255,7 @@ class QwenTokenizer():
                 "[lipsmack]", "[mn]"
             ]
         }
+        self.special_tokens = special_tokens
         self.tokenizer = AutoTokenizer.from_pretrained(token_path)
         self.tokenizer.add_special_tokens(special_tokens)
         self.skip_special_tokens = skip_special_tokens
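
Note on the frontend.py change: once a text contains one of the tokenizer's additional special tokens, `text_normalize()` now returns it as-is (no ttsfrd/wetext normalization, no sentence splitting), which is what makes fine-grained control tags survive to the model. The sketch below is not part of the patch; it assumes `[laughter]` is among `additional_special_tokens` (only `[lipsmack]` and `[mn]` are visible in the hunk above) and otherwise reuses the README's fine-grained control call.

```python
# Minimal sketch (assumption: '[laughter]' is a registered additional special token).
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)

# Because the text carries a special token, the patched text_normalize()
# short-circuits and hands it to the model unsplit and unnormalized.
text = 'He told the story, then paused [laughter] before finishing it.'
for i, j in enumerate(cosyvoice.inference_cross_lingual(text, prompt_speech_16k, stream=False)):
    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
```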