From 16d66dc6a687a06b068600bf7c583d94016be2c9 Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Tue, 22 Oct 2024 12:56:29 +0800
Subject: [PATCH 1/3] fix short tts_text bug

---
 cosyvoice/cli/cosyvoice.py | 2 ++
 cosyvoice/llm/llm.py       | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index 48babf3..be82f0c 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -67,6 +67,8 @@ class CosyVoice:
     def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
         prompt_text = self.frontend.text_normalize(prompt_text, split=False)
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
+            if len(i) < 0.5 * len(prompt_text):
+                logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
             model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py
index 00e4af0..cf9c231 100644
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -202,6 +202,9 @@ class TransformerLM(torch.nn.Module):
                                                                   att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
                                                                                                  device=lm_input.device)).to(torch.bool))
             logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
+            # force continue decode first token
+            if i == 0:
+                logp[:, self.speech_token_size] = -float('inf')
             top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
             if top_ids == self.speech_token_size:
                 break

From a2ece33477510ddf8277af8efd73d79a20fdc1b4 Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Fri, 1 Nov 2024 16:17:01 +0800
Subject: [PATCH 2/3] fix hifigan yaml bug

---
 examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml | 2 +-
 examples/libritts/cosyvoice/conf/cosyvoice.yaml             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
index cd63d9d..2247007 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
@@ -141,7 +141,7 @@ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
     hop_size: 256
     win_size: 1024
     fmin: 0
-    fmax: 8000
+    fmax: null
     center: False
 hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
     generator: !ref
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
index 53c4118..5ce5caf 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
@@ -141,7 +141,7 @@ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
     hop_size: 256
     win_size: 1024
     fmin: 0
-    fmax: 8000
+    fmax: null
     center: False
 hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
     generator: !ref

From 487701c98ce03554d519836b3ad7b7c5fb196d31 Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Tue, 5 Nov 2024 09:34:51 +0800
Subject: [PATCH 3/3] fix huggingface-hub version

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 4189c5f..d6b3ca2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ gdown==5.1.0
 gradio==4.32.2
 grpcio==1.57.0
 grpcio-tools==1.57.0
+huggingface-hub==0.23.5
 hydra-core==1.3.2
 HyperPyYAML==1.2.2
 inflect==7.3.1
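
Note on the decoding change in PATCH 1/3: masking the end-of-speech logit on the first
autoregressive step guarantees that at least one speech token is emitted before EOS can be
sampled, which is what makes very short tts_text inputs usable. A minimal standalone sketch of
that idea, assuming a generic (batch, vocab) log-probability tensor and an eos_id index; the
function and parameter names are illustrative, not the CosyVoice API:

    import torch

    def suppress_eos_on_first_step(logp: torch.Tensor, step: int, eos_id: int) -> torch.Tensor:
        # logp: (batch, vocab_size) log-probabilities from the decoder head
        # step: current decoding step (0-based); eos_id: index of the end-of-speech token
        if step == 0:
            logp = logp.clone()              # avoid mutating the caller's tensor
            logp[:, eos_id] = -float('inf')  # EOS can never win sampling on step 0
        return logp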
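
Note on the fmax change in PATCH 2/3: with fmax set to null, a librosa-style mel filterbank
(which matcha.utils.audio.mel_spectrogram builds internally) extends its top filter to the
Nyquist frequency instead of stopping at 8 kHz. A small illustration of that behavior; the
22050 Hz value below is only an example sample rate, not taken from the patch:

    import librosa

    sr = 22050  # example sample rate; fmax=None falls back to sr / 2
    capped = librosa.filters.mel(sr=sr, n_fft=1024, n_mels=80, fmin=0, fmax=8000)
    full = librosa.filters.mel(sr=sr, n_fft=1024, n_mels=80, fmin=0, fmax=None)
    print(capped.shape, full.shape)  # both (80, 513); the filters just cover different ranges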