fix bistream bug

2026-02-05 18:09:24 +08:00 · 2025-12-12 10:41:25 +00:00
parent b02d7e61f7
commit ca3b054a52
6 changed files with 37 additions and 34 deletions
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -122,12 +122,12 @@ class CosyVoiceFrontEnd:
        return speech_feat, speech_feat_len

    def text_normalize(self, text, split=True, text_frontend=True):
-        # NOTE skip text_frontend when ssml symbol in text
-        if '<|' in text and '|>' in text:
-            text_frontend = False
        if isinstance(text, Generator):
            logging.info('get tts_text generator, will skip text_normalize!')
            return [text]
+        # NOTE skip text_frontend when ssml symbol in text
+        if '<|' in text and '|>' in text:
+            text_frontend = False
        if text_frontend is False or text == '':
            return [text] if split is True else text
        text = text.strip()
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -413,18 +413,18 @@ class CosyVoice3Model(CosyVoice2Model):
                                             embedding=embedding.to(self.device),
                                             streaming=stream,
                                             finalize=finalize)
-        tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
-        # append mel cache
-        if self.hift_cache_dict[uuid] is not None:
-            hift_cache_mel = self.hift_cache_dict[uuid]['mel']
-            tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
-            self.hift_cache_dict[uuid]['mel'] = tts_mel
-        else:
-            self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
-        if speed != 1.0:
-            assert token_offset == 0 and finalize is True, 'speed change only support non-stream inference mode'
-            tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
-        tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
-        tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
-        self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
+            tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
+            # append mel cache
+            if self.hift_cache_dict[uuid] is not None:
+                hift_cache_mel = self.hift_cache_dict[uuid]['mel']
+                tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
+                self.hift_cache_dict[uuid]['mel'] = tts_mel
+            else:
+                self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
+            if speed != 1.0:
+                assert token_offset == 0 and finalize is True, 'speed change only support non-stream inference mode'
+                tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
+            tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
+            tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
+            self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
        return tts_speech
--- a/cosyvoice/hifigan/generator.py
+++ b/cosyvoice/hifigan/generator.py
@@ -155,11 +155,13 @@ class SineGen(torch.nn.Module):

    @torch.no_grad()
    def forward(self, f0):
+        """ sine_tensor, uv, noise = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1), transposed to (batchsize=1, dim=1, length) below
+                  f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1); noise is returned without the final transpose
        """
-        :param f0: [B, 1, sample_len], Hz
-        :return: [B, 1, sample_len]
-        """
-
+        f0 = f0.transpose(1, 2)
        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
        for i in range(self.harmonic_num + 1):
            F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
@@ -184,7 +186,7 @@ class SineGen(torch.nn.Module):
        # first: set the unvoiced part to 0 by uv
        # then: additive noise
        sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
+        return sine_waves.transpose(1, 2), uv.transpose(1, 2), noise


 class SineGen2(torch.nn.Module):
@@ -221,7 +223,7 @@ class SineGen2(torch.nn.Module):
        if causal is True:
            self.rand_ini = torch.rand(1, 9)
            self.rand_ini[:, 0] = 0
-            self.sine_waves = torch.rand(1, 60 * 16000, 9)
+            self.sine_waves = torch.rand(1, 300 * 24000, 9)

    def _f02uv(self, f0):
        # generate uv signal
@@ -351,7 +353,7 @@ class SourceModuleHnNSF(torch.nn.Module):
        self.l_tanh = torch.nn.Tanh()
        self.causal = causal
        if causal is True:
-            self.uv = torch.rand(1, 60 * 24000, 1)
+            self.uv = torch.rand(1, 300 * 24000, 1)

    def forward(self, x):
        """
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -17,6 +17,7 @@ import random
 import time
 import threading
 from typing import Dict, Optional, Callable, List, Generator
+import numpy as np
 import torch
 from torch import nn
 import torch.nn.functional as F
@@ -216,7 +217,7 @@ class TransformerLM(torch.nn.Module):
                                                                  att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
                                                                                                 device=lm_input.device)).to(torch.bool))
            logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
-            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
+            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False)
            if top_ids == self.eos_token:
                break
            # in stream mode, yield token one by one
@@ -544,7 +545,7 @@ class Qwen2LM(TransformerLM):
        cache = None
        # NOTE init prompt_text as text_cache as it is basically impossible prompt_speech_token/prompt_text < 15/5
        text_cache = self.llm.model.model.embed_tokens(prompt_text)
-        next_fill_index = -1
+        next_fill_index = (int(prompt_speech_token.shape[1] / self.mix_ratio[1]) + 1) * self.mix_ratio[1] - prompt_speech_token.shape[1]
        for this_text in text:
            text_cache = torch.concat([text_cache, self.llm.model.model.embed_tokens(this_text)], dim=1)
            # prompt_speech_token_emb not empty, try append to lm_input
@@ -582,7 +583,7 @@ class Qwen2LM(TransformerLM):
                        top_ids = self.fill_token
                        next_fill_index += (self.mix_ratio[1] + 1)
                    else:
-                        top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
+                        top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True)
                    if top_ids == self.fill_token:
                        next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1
                        logging.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
--- a/example.py
+++ b/example.py
@@ -15,15 +15,15 @@ def cosyvoice_example():
        torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)

    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M')
-    # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
+    # zero_shot usage
    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav')):
        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-    # cross_lingual usage
+    # cross_lingual usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
    for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.',
                                                            './asset/cross_lingual_prompt.wav')):
        torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
    # vc usage
-    for i, j in enumerate(cosyvoice.inference_vc('./asset/zero_shot_prompt.wav', './asset/cross_lingual_prompt.wav')):
+    for i, j in enumerate(cosyvoice.inference_vc('./asset/cross_lingual_prompt.wav', './asset/zero_shot_prompt.wav')):
        torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)

    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-Instruct')
@@ -65,7 +65,7 @@ def cosyvoice2_example():
        yield '让我心中充满了甜蜜的快乐，'
        yield '笑容如花儿般绽放。'
    for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
-        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+        torchaudio.save('zero_shot_bistream_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)


 def cosyvoice3_example():
@@ -97,8 +97,8 @@ def cosyvoice3_example():


 def main():
-    cosyvoice_example()
-    cosyvoice2_example()
+    # cosyvoice_example()
+    # cosyvoice2_example()
    cosyvoice3_example()


--- a/vllm_example.py
+++ b/vllm_example.py
@@ -31,7 +31,7 @@ def cosyvoice3_example():


 def main():
-    cosyvoice2_example()
+    # cosyvoice2_example()
    cosyvoice3_example()