From 4df0683a37e3bfd6071fac874c252902ce125f70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=BE=E8=81=AA?=
Date: Tue, 25 Feb 2025 17:43:33 +0800
Subject: [PATCH] add vllm_codec_engine

---
 cosyvoice/cli/cosyvoice.py | 9 +++++++++
 cosyvoice/llm/llm.py       | 1 +
 2 files changed, 10 insertions(+)

diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index b8fe756..f67c6d7 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -49,6 +49,7 @@ class CosyVoice:
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
+        self.vllm_codec_engine = None
         if load_jit:
             self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
                                 '{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
@@ -149,8 +150,16 @@ class CosyVoice2(CosyVoice):
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
+        self.vllm_codec_engine = None
         if use_vllm:
+            from vllm import EngineArgs, LLMEngine
             self.model.export_codec_vllm(''.join([model_dir, '/codec_vllm_model']))
+            engine_args = EngineArgs(model=''.join([model_dir, '/codec_vllm_model']),
+                                     skip_tokenizer_init=True,
+                                     gpu_memory_utilization=0.1)
+            self.vllm_codec_engine = LLMEngine.from_engine_args(engine_args)
+            self.model.llm.vllm_codec_engine = self.vllm_codec_engine
+
         if load_jit:
             self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
         if load_trt:
diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py
index bbd3305..a7f12a5 100644
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -282,6 +282,7 @@ class Qwen2LM(TransformerLM):
         # 4. sampling method
         self.sampling = sampling
         self.mix_ratio = mix_ratio
+        self.vllm_codec_engine = None

     @torch.inference_mode()
     def inference(
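
Note: the patch wires a vLLM LLMEngine into Qwen2LM through the new
vllm_codec_engine attribute, but it does not show how that engine is
consumed at inference time. Below is a minimal sketch of one plausible
request/step loop, assuming vLLM's public LLMEngine API (add_request,
step, has_unfinished_requests) and its TokensPrompt and SamplingParams
types; the helper name generate_speech_tokens, the request id, and all
sampling values are hypothetical and not taken from this patch.

    from vllm import SamplingParams, TokensPrompt

    def generate_speech_tokens(engine, prompt_token_ids, max_tokens=512):
        # The engine above was built with skip_tokenizer_init=True, so
        # prompts must be raw token ids and detokenization stays disabled.
        params = SamplingParams(max_tokens=max_tokens,
                                temperature=1.0,
                                top_p=0.8,
                                detokenize=False)
        engine.add_request(request_id='codec-0',
                           prompt=TokensPrompt(prompt_token_ids=prompt_token_ids),
                           params=params)
        speech_token_ids = []
        # Drive the engine manually: each step() advances all pending
        # requests by one scheduler iteration and returns their outputs.
        while engine.has_unfinished_requests():
            for request_output in engine.step():
                if request_output.finished:
                    speech_token_ids = list(request_output.outputs[0].token_ids)
        return speech_token_ids

A caller holding a CosyVoice2 instance could then run
generate_speech_tokens(cosyvoice.vllm_codec_engine, prompt_ids). The
patch's gpu_memory_utilization=0.1 caps the engine's KV-cache
allocation, presumably so the codec LLM can share the GPU with the flow
and HiFT models that CosyVoice2 loads alongside it.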