Mirror of https://github.com/FunAudioLLM/CosyVoice.git
Revert "mv AsyncLLMEngine init to CosyVoice2"
This reverts commit 9b3f351496, moving the vLLM AsyncLLMEngine initialization out of CosyVoice2.__init__ and back into the vLLM wrapper module (module-level ENGINE_ARGS plus eager construction in VllmQwen2LM.__init__).
@@ -166,29 +166,7 @@ class CosyVoice2(CosyVoice):
             logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
         if use_vllm:
             try:
-                os.environ["VLLM_USE_V1"] = '1'
-                from vllm import AsyncLLMEngine
-                from vllm.engine.arg_utils import AsyncEngineArgs
-                # EngineArgs
-                ENGINE_ARGS = {
-                    "block_size": 16,
-                    "swap_space": 0,
-                    # "enforce_eager": True,
-                    "gpu_memory_utilization": 0.4,
-                    "max_num_batched_tokens": 1024,
-                    "max_model_len": 1024,
-                    "max_num_seqs": 256,
-                    "disable_log_requests": True,
-                    "disable_log_stats": True,
-                    "dtype": "bfloat16"
-                }
                 self.model = VllmCosyVoice2Model(model_dir, configs['flow'], configs['hift'], fp16)
-                engine_args = AsyncEngineArgs(
-                    model=model_dir,
-                    **ENGINE_ARGS,
-                )
-                self.llm_engine: AsyncLLMEngine = AsyncLLMEngine.from_engine_args(engine_args)
-                self.model.llm_engine = self.llm_engine
             except Exception as e:
                 logging.warning(f'use vllm inference failed. \n{e}')
                 raise e
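For context, a minimal sketch of how an engine built this way is typically consumed. Only AsyncEngineArgs, AsyncLLMEngine.from_engine_args, and the generate() async-generator API come from the diff and vLLM itself; the model path, prompt, and sampling values below are assumptions, not code from this repo.

import asyncio
import uuid

from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs

async def demo(model_dir: str, prompt: str) -> None:
    # Same construction the reverted hunk performed inside CosyVoice2.__init__.
    engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(
        model=model_dir,
        max_model_len=1024,
        gpu_memory_utilization=0.4,
        dtype="bfloat16",
    ))
    # generate() is an async generator; each yielded RequestOutput reflects
    # the tokens decoded so far for this request_id.
    async for output in engine.generate(prompt, SamplingParams(max_tokens=512),
                                        request_id=str(uuid.uuid4())):
        print(len(output.outputs[0].token_ids), 'tokens decoded')

# asyncio.run(demo('pretrained_models/CosyVoice2-0.5B', '<prompt>'))  # paths assumed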
@@ -31,6 +31,20 @@ from vllm.sampling_params import SamplingParams
 from cosyvoice.llm.vllm_use_cosyvoice2_model import CosyVoice2Model as CosyVoice2LLM
 ModelRegistry.register_model("CosyVoice2Model", CosyVoice2LLM)
+
+# EngineArgs
+ENGINE_ARGS = {
+    "block_size": 16,
+    "swap_space": 0,
+    # "enforce_eager": True,
+    "gpu_memory_utilization": 0.4,
+    "max_num_batched_tokens": 1024,
+    "max_model_len": 1024,
+    "max_num_seqs": 256,
+    "disable_log_requests": True,
+    "disable_log_stats": True,
+    "dtype": "float16"
+}
 
 from vllm.sampling_params import RequestOutputKind
 # SamplingParams
 SAMPLING_PARAMS = {
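Note that this restored module-level ENGINE_ARGS uses "float16", while the copy removed from CosyVoice2.__init__ above used "bfloat16". The hunk's context also shows a SAMPLING_PARAMS dict next to a RequestOutputKind import, but the diff view truncates its body. A hypothetical expansion of that pattern follows; every value is illustrative, not the repo's.

from vllm import SamplingParams
from vllm.sampling_params import RequestOutputKind

# Hypothetical values: the real SAMPLING_PARAMS body is cut off by the diff.
SAMPLING_PARAMS = {
    "temperature": 1.0,
    "top_p": 0.8,
    "max_tokens": 1024,
    # DELTA makes each yielded RequestOutput carry only newly generated
    # tokens, which suits incremental speech-token streaming.
    "output_kind": RequestOutputKind.DELTA,
}

sampling_params = SamplingParams(**SAMPLING_PARAMS)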
@@ -58,7 +72,13 @@ class VllmQwen2LM(Qwen2LM):
         self.fp16 = False
         self.half = lambda: None
         self.mix_ratio = mix_ratio
-        self.llm_engine = None
+        # ---------------------------------------------
+        # vllm engine parameter configuration
+        engine_args = AsyncEngineArgs(
+            model=model_dir,
+            **ENGINE_ARGS,
+        )
+        self.llm_engine: AsyncLLMEngine = AsyncLLMEngine.from_engine_args(engine_args)
 
         self.speech_token_size = 6564 # 6561 + 3
         self.llm_token_size = 151936 # llm vocab_size
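The surrounding context explains the stubs: `self.half = lambda: None` turns the torch-style `.half()` call into a no-op, since vLLM owns the weights and their dtype, and the revert replaces lazy injection (`self.llm_engine = None`, later filled by CosyVoice2) with eager construction. A minimal sketch of the restored pattern; the class name and engine kwargs here are assumed, not the repo's exact code.

from vllm import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs

class EngineBackedLM:
    """Sketch of the wrapper pattern in this hunk (names assumed)."""

    def __init__(self, model_dir: str):
        # CosyVoice's generic loading path may check .fp16 or call .half();
        # vLLM manages weight dtype itself, so both become no-ops here.
        self.fp16 = False
        self.half = lambda: None
        # Reverted-to design: build the engine at wrapper construction time,
        # instead of leaving self.llm_engine = None for the caller to inject.
        self.llm_engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model=model_dir, max_model_len=1024))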