fix bistream bug

2026-02-05 01:49:25 +08:00 · 2025-12-12 10:41:25 +00:00
parent b02d7e61f7
commit ca3b054a52
6 changed files with 37 additions and 34 deletions
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -413,18 +413,18 @@ class CosyVoice3Model(CosyVoice2Model):
                                             embedding=embedding.to(self.device),
                                             streaming=stream,
                                             finalize=finalize)
-        tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
-        # append mel cache
-        if self.hift_cache_dict[uuid] is not None:
-            hift_cache_mel = self.hift_cache_dict[uuid]['mel']
-            tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
-            self.hift_cache_dict[uuid]['mel'] = tts_mel
-        else:
-            self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
-        if speed != 1.0:
-            assert token_offset == 0 and finalize is True, 'speed change only support non-stream inference mode'
-            tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
-        tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
-        tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
-        self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
+            tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
+            # append mel cache
+            if self.hift_cache_dict[uuid] is not None:
+                hift_cache_mel = self.hift_cache_dict[uuid]['mel']
+                tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
+                self.hift_cache_dict[uuid]['mel'] = tts_mel
+            else:
+                self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
+            if speed != 1.0:
+                assert token_offset == 0 and finalize is True, 'speed change only support non-stream inference mode'
+                tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
+            tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
+            tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
+            self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
        return tts_speech