From b4dd67a8afcf71d49a07ad1ec22d0abf2628eb00 Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Mon, 8 Dec 2025 10:55:53 +0000
Subject: [PATCH] add cosyvoice3 vllm example

cosyvoice/utils/file_utils.py: export_cosyvoice2_vllm() now gives the
exported lm_head a bias only when model.llm_decoder has one, stores that
flag in config.use_bias, and rewrites the architecture name in
config.json to CosyVoice2ForCausalLM only in the bias case.

example.py: main() runs cosyvoice_example() again.

vllm_example.py: split main() into cosyvoice2_example() and
cosyvoice3_example(), pass the prompt wav by path, and drop the
load_wav() call together with its import.
---
Review notes (text between "---" and the first "diff --git" is not part
of the commit):
 * Line breaks and indentation were reconstructed from a
   whitespace-collapsed copy of this patch. Check the context lines
   against the tree, or apply with "git apply --ignore-whitespace".
 * Changed relative to the original submission:
   - vllm_example.py kept "prompt_speech_16k = load_wav(...)" while
     removing the load_wav import (NameError in cosyvoice3_example);
     the unused line is now deleted.
   - the else-branch "sed s@Qwen2ForCausalLM@Qwen2ForCausalLM@g" replaced
     a string with itself and has been dropped.
   - "True if x is not None else False" / "if use_bias is True" simplified.
 * The post-image blob ids on the "index" lines and the commit id on the
   first line are carried over from the original submission.

 cosyvoice/utils/file_utils.py | 15 ++++++++++-----
 example.py                    |  2 +-
 vllm_example.py               | 22 +++++++++++++++++-----
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py
index 374a90e..358a9f6 100644
--- a/cosyvoice/utils/file_utils.py
+++ b/cosyvoice/utils/file_utils.py
@@ -88,6 +88,7 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
     logging.info("Succesfully convert onnx to trt...")
 
 
+# NOTE: bistream inference is not supported, as only the speech token embedding/head is kept
 def export_cosyvoice2_vllm(model, model_path, device):
     if os.path.exists(model_path):
         return
@@ -98,12 +99,14 @@ def export_cosyvoice2_vllm(model, model_path, device):
 
     dtype = torch.bfloat16
     # lm_head
-    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=True)
+    use_bias = model.llm_decoder.bias is not None
+    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=use_bias)
     with torch.no_grad():
         new_lm_head.weight[:vocab_size] = model.llm_decoder.weight
-        new_lm_head.bias[:vocab_size] = model.llm_decoder.bias
         new_lm_head.weight[vocab_size:] = 0
-        new_lm_head.bias[vocab_size:] = 0
+        if use_bias:
+            new_lm_head.bias[:vocab_size] = model.llm_decoder.bias
+            new_lm_head.bias[vocab_size:] = 0
     model.llm.model.lm_head = new_lm_head
     new_codec_embed = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size)
     # embed_tokens
@@ -121,9 +124,11 @@ def export_cosyvoice2_vllm(model, model_path, device):
     del model.llm.model.config.eos_token_id
     model.llm.model.config.vocab_size = pad_vocab_size
     model.llm.model.config.tie_word_embeddings = False
-    model.llm.model.config.use_bias = True
+    model.llm.model.config.use_bias = use_bias
     model.llm.model.save_pretrained(model_path)
-    os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
+    # without a bias, the architecture name written by save_pretrained is left unchanged
+    if use_bias:
+        os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
     model.llm.model.config.vocab_size = tmp_vocab_size
     model.llm.model.config.tie_word_embeddings = tmp_tie_embedding
     model.llm.model.set_input_embeddings(embed_tokens)
diff --git a/example.py b/example.py
index 70d0d8c..ba7c5a2 100644
--- a/example.py
+++ b/example.py
@@ -88,7 +88,7 @@ def cosyvoice3_example():
         torchaudio.save('hotfix_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
 def main():
-    # cosyvoice_example()
+    cosyvoice_example()
     cosyvoice2_example()
     cosyvoice3_example()
 
diff --git a/vllm_example.py b/vllm_example.py
index e613033..8b177a3 100644
--- a/vllm_example.py
+++ b/vllm_example.py
@@ -4,20 +4,32 @@ from vllm import ModelRegistry
 from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM
 ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)
 
-from cosyvoice.cli.cosyvoice import CosyVoice2
-from cosyvoice.utils.file_utils import load_wav
+from cosyvoice.cli.cosyvoice import CosyVoice2, CosyVoice3
 from cosyvoice.utils.common import set_all_random_seed
 from tqdm import tqdm
 
 
-def main():
+def cosyvoice2_example():
+    """ CosyVoice2 vllm usage
+    """
     cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True)
-    prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
+    for i in tqdm(range(100)):
+        set_all_random_seed(i)
+        for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+            continue
+
+def cosyvoice3_example():
+    """ CosyVoice3 vllm usage
+    """
+    cosyvoice = CosyVoice3('pretrained_models/CosyVoice3-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True)
     for i in tqdm(range(100)):
         set_all_random_seed(i)
-        for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
+        for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
             continue
 
+def main():
+    cosyvoice2_example()
+    cosyvoice3_example()
 
 if __name__ == '__main__':
     main()