From 2fbeba50ae077cc0082eb9e9ebd8dc66eebb5df9 Mon Sep 17 00:00:00 2001
From: qihua
Date: Sat, 8 Mar 2025 00:04:01 +0800
Subject: [PATCH] refactor(llm): remove unused async inference method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove the async_llm_inference method from the LLM class
- The method is unused, and running it outside loop_thread crashes vllm, so it is removed
---
 cosyvoice/llm/llm_vllm.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/cosyvoice/llm/llm_vllm.py b/cosyvoice/llm/llm_vllm.py
index 61b1090..3fd7152 100644
--- a/cosyvoice/llm/llm_vllm.py
+++ b/cosyvoice/llm/llm_vllm.py
@@ -120,21 +120,6 @@ class VllmQwen2LM(Qwen2LM):
             except Exception as e:
                 logging.error(f"Error in inference_processor: {e}")
 
-    async def async_llm_inference(self, prompt_token_ids: List[int], request_id: str=None, stop_token_ids=None, max_tokens=None)\
-            -> AsyncGenerator[CompletionOutput, None]:
-        sampling_params = SamplingParams(**SAMPLING_PARAMS)
-        sampling_params.stop_token_ids = stop_token_ids or [6561]
-        if max_tokens:
-            sampling_params.max_tokens = max_tokens
-        async for output in self.llm_engine.generate(
-            {
-                "prompt_token_ids": prompt_token_ids,
-            },
-            sampling_params=sampling_params,
-            request_id=request_id or f"{time.time()}",
-        ):
-            yield output.outputs[0]
-
     def llm_inference(self, prompt_token_ids: List[int], request_id: str=None, stop_token_ids=None, max_tokens=None):
         # Converting sync-to-async here crashes vllm; for now we submit tasks to a queue and run inference in a background thread
         # Submit the inference task to the queue
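
For reference (not part of the patch): a minimal sketch of the queue / background-event-loop pattern that the retained llm_inference comment describes, i.e. keeping every vLLM generate call on one dedicated loop thread so the engine is never driven from a foreign thread. The class name BackgroundVllmRunner, the result queue handling, and the max_tokens default are illustrative assumptions; only the token-IDs prompt, the stop token 6561, and the "run all async generation on the loop thread" idea come from the code above, and this is not the repository's actual implementation.

    import asyncio
    import queue
    import threading
    import time

    from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams


    class BackgroundVllmRunner:
        """Drive all vLLM async generation from one dedicated event-loop thread."""

        def __init__(self, model: str):
            self.engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model=model))
            self.loop = asyncio.new_event_loop()
            # loop_thread owns the event loop; callers never await on it directly
            self.loop_thread = threading.Thread(target=self._run_loop, daemon=True)
            self.loop_thread.start()

        def _run_loop(self):
            asyncio.set_event_loop(self.loop)
            self.loop.run_forever()

        async def _generate(self, prompt_token_ids, out_q, stop_token_ids=None, max_tokens=None):
            # runs on the loop thread, so vLLM's background loop stays bound to it
            sampling_params = SamplingParams(
                stop_token_ids=stop_token_ids or [6561],
                max_tokens=max_tokens or 1024,  # assumed default, not from the patch
            )
            async for output in self.engine.generate(
                {"prompt_token_ids": prompt_token_ids},
                sampling_params=sampling_params,
                request_id=f"{time.time()}",
            ):
                out_q.put(output.outputs[0])  # stream partial results back to the caller
            out_q.put(None)  # sentinel: generation finished

        def llm_inference(self, prompt_token_ids, stop_token_ids=None, max_tokens=None):
            """Blocking generator: submit the coroutine to the loop thread, read results from a queue."""
            out_q: queue.Queue = queue.Queue()
            asyncio.run_coroutine_threadsafe(
                self._generate(prompt_token_ids, out_q, stop_token_ids, max_tokens),
                self.loop,
            )
            while True:
                item = out_q.get()
                if item is None:
                    break
                yield item

With this shape, a synchronous caller simply iterates llm_inference(...) while the asyncio work stays confined to loop_thread, which is the behavior the removed async_llm_inference could not guarantee.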