Add function inference_bistream_vllm

2026-02-04 17:39:25 +08:00 · 2025-03-01 18:50:19 +08:00
parent 54e9384fb1
commit 9a4aebb0ea
2 changed files with 144 additions and 7 deletions
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -104,13 +104,23 @@ class CosyVoiceModel:
        with self.llm_context:
            if isinstance(text, Generator):
                assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
-                for i in self.llm.inference_bistream(text=text,
-                                                     prompt_text=prompt_text.to(self.device),
-                                                     prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
-                                                     prompt_speech_token=llm_prompt_speech_token.to(self.device),
-                                                     prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
-                                                     embedding=llm_embedding.to(self.device)):
-                    self.tts_speech_token_dict[uuid].append(i)
+                if self.vllm_codec_engine is None:
+                    for i in self.llm.inference_bistream(text=text,
+                                                        prompt_text=prompt_text.to(self.device),
+                                                        prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+                                                        prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                                                        prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
+                                                        embedding=llm_embedding.to(self.device)):
+                        self.tts_speech_token_dict[uuid].append(i)
+                else:
+                    for i in self.llm.inference_bistream_vllm(text=text,
+                                                        prompt_text=prompt_text.to(self.device),
+                                                        prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+                                                        prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                                                        prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
+                                                        embedding=llm_embedding.to(self.device),
+                                                        vllm_codec_engine=self.vllm_codec_engine):
+                        self.tts_speech_token_dict[uuid].append(i)
            else:
                for i in self.llm.inference(text=text.to(self.device),
                                            text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),