From 587604b2b433bc350c344b4b181b47249b54faf2 Mon Sep 17 00:00:00 2001 From: bearlu Date: Mon, 21 Apr 2025 09:26:34 -0700 Subject: [PATCH 1/2] fix inference_instruct2 speaker ID bug --- cosyvoice/cli/cosyvoice.py | 4 ++-- cosyvoice/cli/frontend.py | 4 ++-- test1.py | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 test1.py diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index fc1ea90..d82f66e 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -177,10 +177,10 @@ class CosyVoice2(CosyVoice): def inference_instruct(self, *args, **kwargs): raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!') - def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True): + def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True): assert isinstance(self.model, CosyVoice2Model), 'inference_instruct2 is only implemented for CosyVoice2!' for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)): - model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate) + model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id) start_time = time.time() logging.info('synthesis text {}'.format(i)) for model_output in self.model.tts(**model_input, stream=stream, speed=speed): diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 99cdb18..36dcd18 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -196,8 +196,8 @@ class CosyVoiceFrontEnd: model_input['prompt_text_len'] = instruct_text_token_len return model_input - def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate): - model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate) + def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id): + model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id) del model_input['llm_prompt_speech_token'] del model_input['llm_prompt_speech_token_len'] return model_input diff --git a/test1.py b/test1.py new file mode 100644 index 0000000..a1243e4 --- /dev/null +++ b/test1.py @@ -0,0 +1,37 @@ +import sys +sys.path.append('third_party/Matcha-TTS') +from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 +from cosyvoice.utils.file_utils import load_wav +import torchaudio # type: ignore + +cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False, use_flow_cache=False) + +# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference +# zero_shot usage +prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000) +for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) + +# save zero_shot spk for future usage +assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True +for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)): + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) +cosyvoice.save_spkinfo() + +# fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248 +for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)): + torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) + +# instruct usage +for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)): + torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) + +# bistream usage, you can use generator as input, this is useful when using text llm model as input +# NOTE you should still have some basic sentence split logic because llm can not handle arbitrary sentence length +def text_generator(): + yield '收到好友从远方寄来的生日礼物,' + yield '那份意外的惊喜与深深的祝福' + yield '让我心中充满了甜蜜的快乐,' + yield '笑容如花儿般绽放。' +for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): + torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate) \ No newline at end of file From b4c4d848ca6a6645cb7d35a51266b2e053781369 Mon Sep 17 00:00:00 2001 From: hwangsihu <129564966+hwangsihu@users.noreply.github.com> Date: Thu, 1 May 2025 13:28:15 +0900 Subject: [PATCH 2/2] Reorder requirements.txt --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4166dac..e482020 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,8 @@ conformer==0.3.2 deepspeed==0.14.2; sys_platform == 'linux' diffusers==0.29.0 +fastapi==0.115.6 +fastapi-cli==0.0.4 gdown==5.1.0 gradio==5.4.0 grpcio==1.57.0 @@ -34,7 +36,5 @@ torch==2.3.1 torchaudio==2.3.1 transformers==4.40.1 uvicorn==0.30.0 -wget==3.2 -fastapi==0.115.6 -fastapi-cli==0.0.4 WeTextProcessing==1.0.3 +wget==3.2