From df653f1e98cdc4516ba025a25403d08da7986824 Mon Sep 17 00:00:00 2001 From: liubaiji Date: Wed, 11 Sep 2024 10:36:32 +0800 Subject: [PATCH 1/2] [refactor] modify fade_in_out func to a common form --- cosyvoice/utils/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py index a5435eb..62ab916 100644 --- a/cosyvoice/utils/common.py +++ b/cosyvoice/utils/common.py @@ -139,6 +139,7 @@ def fade_in_out(fade_in_mel, fade_out_mel, window): device = fade_in_mel.device fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu() mel_overlap_len = int(window.shape[0] / 2) - fade_in_mel[:, :, :mel_overlap_len] = fade_in_mel[:, :, :mel_overlap_len] * window[:mel_overlap_len] + \ - fade_out_mel[:, :, -mel_overlap_len:] * window[mel_overlap_len:] + + fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \ + fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:] return fade_in_mel.to(device) From 9e0b99e48e67c3a874b7d0bbdc1a6a15c35f422e Mon Sep 17 00:00:00 2001 From: liubaiji Date: Wed, 11 Sep 2024 10:41:50 +0800 Subject: [PATCH 2/2] [feature] fix bad case, add fade on speech output --- cosyvoice/cli/model.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 5efd30c..34279a4 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -49,6 +49,7 @@ class CosyVoiceModel: self.llm_end_dict = {} self.mel_overlap_dict = {} self.hift_cache_dict = {} + self.speech_window = np.hamming(2 * self.source_cache_len) def load(self, llm_model, flow_model, hift_model): self.llm.load_state_dict(torch.load(llm_model, map_location=self.device)) @@ -113,10 +114,17 @@ class CosyVoiceModel: self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:] tts_mel = tts_mel[:, :, :-self.mel_overlap_len] tts_speech, tts_source = self.hift.inference(mel=tts_mel, 
cache_source=hift_cache_source) - self.hift_cache_dict[uuid] = {'source': tts_source[:, :, -self.source_cache_len:], 'mel': tts_mel[:, :, -self.mel_cache_len:]} + if self.hift_cache_dict[uuid] is not None: + tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window) + self.hift_cache_dict[uuid] = { + 'mel': tts_mel[:, :, -self.mel_cache_len:], + 'source': tts_source[:, :, -self.source_cache_len:], + 'speech': tts_speech[:, -self.source_cache_len:]} tts_speech = tts_speech[:, :-self.source_cache_len] else: tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source) + if self.hift_cache_dict[uuid] is not None: + tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window) return tts_speech def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),