From 0c65d3c7ab2e85cccad00dc3a9e94e3d5ff5961e Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Tue, 9 Dec 2025 15:15:05 +0000
Subject: [PATCH] use automodel

---
 cosyvoice/bin/export_jit.py   | 30 ++++++++++++++----------------
 cosyvoice/bin/export_onnx.py  | 13 ++-----------
 cosyvoice/cli/cosyvoice.py    | 23 +++++++++++++++++++----
 cosyvoice/cli/frontend.py     |  3 +++
 cosyvoice/utils/file_utils.py | 23 +++--------------------
 example.py                    | 12 ++++++------
 vllm_example.py               |  6 +++---
 webui.py                      | 34 ++++++----------------------------
 8 files changed, 56 insertions(+), 88 deletions(-)

diff --git a/cosyvoice/bin/export_jit.py b/cosyvoice/bin/export_jit.py
index 4eedc1a..0013d64 100644
--- a/cosyvoice/bin/export_jit.py
+++ b/cosyvoice/bin/export_jit.py
@@ -23,8 +23,10 @@ import torch
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.cli.cosyvoice import AutoModel
+from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, CosyVoice3Model
 from cosyvoice.utils.file_utils import logging
+from cosyvoice.utils.class_utils import get_model_type
 
 
 def get_args():
@@ -57,15 +59,17 @@ def main():
     torch._C._jit_set_profiling_mode(False)
     torch._C._jit_set_profiling_executor(False)
 
-    try:
-        model = CosyVoice(args.model_dir)
-    except Exception:
-        try:
-            model = CosyVoice2(args.model_dir)
-        except Exception:
-            raise TypeError('no valid model_type!')
+    model = AutoModel(model_dir=args.model_dir)
 
-    if not isinstance(model, CosyVoice2):
+    if get_model_type(model.model) == CosyVoiceModel:
+        # 1. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder)
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half())
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
+    elif get_model_type(model.model) == CosyVoice2Model:
         # 1. export llm text_encoder
         llm_text_encoder = model.model.llm.text_encoder
         script = get_optimized_script(llm_text_encoder)
@@ -90,13 +94,7 @@ def main():
         script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
         logging.info('successfully export flow_encoder')
     else:
-        # 3. export flow encoder
-        flow_encoder = model.model.flow.encoder
-        script = get_optimized_script(flow_encoder)
-        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
-        script = get_optimized_script(flow_encoder.half())
-        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
-        logging.info('successfully export flow_encoder')
+        raise ValueError('unsupported model type')
 
 
 if __name__ == '__main__':
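Note: export_jit.py now resolves the model once via AutoModel and branches on get_model_type() instead of probing constructors with nested try/except. A minimal sketch of the new dispatch, assuming a locally available model directory (the path is illustrative):

    from cosyvoice.cli.cosyvoice import AutoModel
    from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
    from cosyvoice.utils.class_utils import get_model_type

    # AutoModel picks the wrapper class from the yaml shipped in model_dir
    model = AutoModel(model_dir='pretrained_models/CosyVoice-300M')  # assumed path
    if get_model_type(model.model) == CosyVoiceModel:
        print('CosyVoice: export the flow encoder only')
    elif get_model_type(model.model) == CosyVoice2Model:
        print('CosyVoice2: export the llm text_encoder and flow encoder')
    else:
        # CosyVoice3 dropped load_jit, so there is nothing to JIT-export
        raise ValueError('unsupported model type')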
diff --git a/cosyvoice/bin/export_onnx.py b/cosyvoice/bin/export_onnx.py
index e4857da..58e7708 100644
--- a/cosyvoice/bin/export_onnx.py
+++ b/cosyvoice/bin/export_onnx.py
@@ -27,7 +27,7 @@ from tqdm import tqdm
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2, CosyVoice3
+from cosyvoice.cli.cosyvoice import AutoModel
 from cosyvoice.utils.file_utils import logging
 
 
@@ -58,16 +58,7 @@ def main():
     logging.basicConfig(level=logging.DEBUG,
                         format='%(asctime)s %(levelname)s %(message)s')
 
-    try:
-        model = CosyVoice(args.model_dir)
-    except Exception:
-        try:
-            model = CosyVoice2(args.model_dir)
-        except Exception:
-            try:
-                model = CosyVoice3(args.model_dir)
-            except Exception:
-                raise TypeError('no valid model_type!')
+    model = AutoModel(model_dir=args.model_dir)
 
     # 1. export flow decoder estimator
     estimator = model.model.flow.decoder.estimator
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index ea63fc0..6395d60 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -196,7 +196,7 @@ class CosyVoice2(CosyVoice):
 
 
 class CosyVoice3(CosyVoice2):
-    def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
+    def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -215,9 +215,9 @@ class CosyVoice3(CosyVoice2):
                                                           '{}/spk2info.pt'.format(model_dir),
                                                           configs['allowed_special'])
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
-            load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        if torch.cuda.is_available() is False and (load_trt is True or fp16 is True):
+            load_trt, fp16 = False, False
+            logging.warning('no cuda device, set load_trt/fp16 to False')
         self.model = CosyVoice3Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
@@ -225,8 +225,23 @@ class CosyVoice3(CosyVoice2):
         if load_vllm:
             self.model.load_vllm('{}/vllm'.format(model_dir))
         if load_trt:
+            if self.fp16 is True:
+                logging.warning('DiT tensorRT fp16 engine has some performance issues, use with caution!')
             self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
                                 '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
                                 trt_concurrent,
                                 self.fp16)
         del configs
+
+
+def AutoModel(**kwargs):
+    if not os.path.exists(kwargs['model_dir']):
+        kwargs['model_dir'] = snapshot_download(kwargs['model_dir'])
+    if os.path.exists('{}/cosyvoice.yaml'.format(kwargs['model_dir'])):
+        return CosyVoice(**kwargs)
+    elif os.path.exists('{}/cosyvoice2.yaml'.format(kwargs['model_dir'])):
+        return CosyVoice2(**kwargs)
+    elif os.path.exists('{}/cosyvoice3.yaml'.format(kwargs['model_dir'])):
+        return CosyVoice3(**kwargs)
+    else:
+        raise TypeError('No valid model type found!')
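Note: AutoModel is a factory function rather than a class; it downloads model_dir from modelscope when the local path is missing, dispatches on which yaml config it finds, and forwards all keyword arguments to the matching constructor (so CosyVoice3 callers must not pass load_jit). A minimal usage sketch with an illustrative repo id:

    from cosyvoice.cli.cosyvoice import AutoModel

    # cosyvoice.yaml -> CosyVoice, cosyvoice2.yaml -> CosyVoice2, cosyvoice3.yaml -> CosyVoice3
    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B', fp16=False)
    print(type(cosyvoice).__name__)  # -> 'CosyVoice2'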
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index ae2b485..4292931 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -122,6 +122,9 @@ class CosyVoiceFrontEnd:
         return speech_feat, speech_feat_len
 
     def text_normalize(self, text, split=True, text_frontend=True):
+        # NOTE skip text_frontend when ssml symbol in text (str check avoids consuming a generator)
+        if isinstance(text, str) and '<|' in text and '|>' in text:
+            text_frontend = False
         if isinstance(text, Generator):
             logging.info('get tts_text generator, will skip text_normalize!')
             return [text]
diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py
index 358a9f6..b173ef2 100644
--- a/cosyvoice/utils/file_utils.py
+++ b/cosyvoice/utils/file_utils.py
@@ -92,29 +92,14 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
 def export_cosyvoice2_vllm(model, model_path, device):
     if os.path.exists(model_path):
         return
-    pad_to = DEFAULT_VOCAB_PADDING_SIZE = 64
-    vocab_size = model.speech_embedding.num_embeddings
-    feature_size = model.speech_embedding.embedding_dim
-    pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
     dtype = torch.bfloat16
 
     # lm_head
     use_bias = True if model.llm_decoder.bias is not None else False
-    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=use_bias)
-    with torch.no_grad():
-        new_lm_head.weight[:vocab_size] = model.llm_decoder.weight
-        new_lm_head.weight[vocab_size:] = 0
-        if use_bias is True:
-            new_lm_head.bias[:vocab_size] = model.llm_decoder.bias
-            new_lm_head.bias[vocab_size:] = 0
-    model.llm.model.lm_head = new_lm_head
-    new_codec_embed = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size)
+    model.llm.model.lm_head = model.llm_decoder
     # embed_tokens
     embed_tokens = model.llm.model.model.embed_tokens
-    with torch.no_grad():
-        new_codec_embed.weight[:vocab_size] = model.speech_embedding.weight
-        new_codec_embed.weight[vocab_size:] = 0
-    model.llm.model.set_input_embeddings(new_codec_embed)
+    model.llm.model.set_input_embeddings(model.speech_embedding)
     model.llm.model.to(device)
     model.llm.model.to(dtype)
     tmp_vocab_size = model.llm.model.config.vocab_size
@@ -122,14 +107,12 @@ def export_cosyvoice2_vllm(model, model_path, device):
     del model.llm.model.generation_config.eos_token_id
     del model.llm.model.config.bos_token_id
     del model.llm.model.config.eos_token_id
-    model.llm.model.config.vocab_size = pad_vocab_size
+    model.llm.model.config.vocab_size = model.speech_embedding.num_embeddings
     model.llm.model.config.tie_word_embeddings = False
    model.llm.model.config.use_bias = use_bias
     model.llm.model.save_pretrained(model_path)
     if use_bias is True:
         os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
-    else:
-        os.system('sed -i s@Qwen2ForCausalLM@Qwen2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
     model.llm.model.config.vocab_size = tmp_vocab_size
     model.llm.model.config.tie_word_embeddings = tmp_tie_embedding
     model.llm.model.set_input_embeddings(embed_tokens)
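Note: export_cosyvoice2_vllm no longer pads the vocabulary up to a multiple of 64; the original llm_decoder and speech_embedding modules are handed to the HF model as-is, and config.vocab_size is set to the unpadded embedding count before save_pretrained (then restored afterwards). A sketch of the invariants the simplified export relies on, assuming model is the loaded CosyVoice2 llm wrapper and the check runs before vocab_size is restored:

    def check_export_state(model):
        # lm_head and the input embeddings are the original, unpadded modules
        assert model.llm.model.lm_head is model.llm_decoder
        assert model.llm.model.get_input_embeddings() is model.speech_embedding
        # so the exported config advertises exactly the speech vocab size
        assert model.llm.model.config.vocab_size == model.speech_embedding.num_embeddings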
diff --git a/example.py b/example.py
index 6216cf5..164acf6 100644
--- a/example.py
+++ b/example.py
@@ -1,6 +1,6 @@
 import sys
 sys.path.append('third_party/Matcha-TTS')
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2, CosyVoice3
+from cosyvoice.cli.cosyvoice import AutoModel
 from cosyvoice.utils.file_utils import load_wav
 import torchaudio
 
@@ -8,14 +8,14 @@ import torchaudio
 
 def cosyvoice_example():
     """ CosyVoice Usage, check https://fun-audio-llm.github.io/ for more details """
-    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-SFT')
     # sft usage
     print(cosyvoice.list_available_spks())
     # change stream=True for chunk stream inference
     for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
         torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
-    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M')
     # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
     for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
@@ -26,7 +26,7 @@ def cosyvoice_example():
     for i, j in enumerate(cosyvoice.inference_vc('./asset/zero_shot_prompt.wav', './asset/cross_lingual_prompt.wav', stream=False)):
         torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
-    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-Instruct')
     # instruct usage, support [laughter][breath]
     for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.<|endofprompt|>', stream=False)):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
@@ -34,7 +34,7 @@ def cosyvoice_example():
 
 def cosyvoice2_example():
     """ CosyVoice2 Usage, check https://funaudiollm.github.io/cosyvoice2/ for more details """
-    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=False)
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B')
     # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
 
     # zero_shot usage
@@ -68,7 +68,7 @@ def cosyvoice2_example():
 
 def cosyvoice3_example():
     """ CosyVoice3 Usage, check https://funaudiollm.github.io/cosyvoice3/ for more details """
-    cosyvoice = CosyVoice3('pretrained_models/CosyVoice3-0.5B', load_jit=False, load_trt=False, fp16=False)
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice3-0.5B')
     # zero_shot usage
     for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
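Note: every example above runs with stream=False; per the '# change stream=True for chunk stream inference' comment, streaming yields incremental chunks that can be concatenated. A sketch reusing the cosyvoice instance and prompt assets from the examples (chunk shape assumed to be [1, T], as in the non-streaming case):

    import torch
    import torchaudio

    chunks = []
    for _, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物。', '希望你以后能够做的比我还好呦。',
                                                        './asset/zero_shot_prompt.wav', stream=True)):
        chunks.append(j['tts_speech'])  # each chunk is a [1, T] waveform tensor
    torchaudio.save('zero_shot_stream.wav', torch.concat(chunks, dim=1), cosyvoice.sample_rate)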
CosyVoice2 vllm usage """ - cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True) + cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True) for i in tqdm(range(100)): set_all_random_seed(i) for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)): @@ -21,7 +21,7 @@ def cosyvoice2_example(): def cosyvoice3_example(): """ CosyVoice3 vllm usage """ - cosyvoice = CosyVoice3('pretrained_models/CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=True) + cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=False) for i in tqdm(range(100)): set_all_random_seed(i) for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)): diff --git a/webui.py b/webui.py index 3552cd9..ee5a962 100644 --- a/webui.py +++ b/webui.py @@ -22,8 +22,8 @@ import random import librosa ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) -from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 -from cosyvoice.utils.file_utils import load_wav, logging +from cosyvoice.cli.cosyvoice import AutoModel +from cosyvoice.utils.file_utils import logging from cosyvoice.utils.common import set_all_random_seed inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制'] @@ -42,23 +42,9 @@ def generate_seed(): "value": seed } - -def postprocess(speech, top_db=60, hop_length=220, win_length=440): - speech, _ = librosa.effects.trim( - speech, top_db=top_db, - frame_length=win_length, - hop_length=hop_length - ) - if speech.abs().max() > max_val: - speech = speech / speech.abs().max() * max_val - speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1) - return speech - - def change_instruction(mode_checkbox_group): return instruct_dict[mode_checkbox_group] - def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, stream, speed): if prompt_wav_upload is not None: @@ -118,15 +104,13 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '3s极速复刻': logging.info('get zero_shot inference request') - prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) - for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed): + for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_wav, stream=stream, speed=speed): yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '跨语种复刻': logging.info('get cross_lingual inference request') - prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) - for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed): + for i in cosyvoice.inference_cross_lingual(tts_text, prompt_wav, stream=stream, speed=speed): yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) else: logging.info('get instruct inference request') @@ -181,16 +165,10 @@ if __name__ == '__main__': 
diff --git a/webui.py b/webui.py
index 3552cd9..ee5a962 100644
--- a/webui.py
+++ b/webui.py
@@ -22,8 +22,8 @@ import random
 import librosa
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
-from cosyvoice.utils.file_utils import load_wav, logging
+from cosyvoice.cli.cosyvoice import AutoModel
+from cosyvoice.utils.file_utils import logging
 from cosyvoice.utils.common import set_all_random_seed
 
 inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
@@ -42,23 +42,9 @@ def generate_seed():
         "value": seed
     }
 
-
-def postprocess(speech, top_db=60, hop_length=220, win_length=440):
-    speech, _ = librosa.effects.trim(
-        speech, top_db=top_db,
-        frame_length=win_length,
-        hop_length=hop_length
-    )
-    if speech.abs().max() > max_val:
-        speech = speech / speech.abs().max() * max_val
-    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
-    return speech
-
-
 def change_instruction(mode_checkbox_group):
     return instruct_dict[mode_checkbox_group]
 
-
 def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text,
                    prompt_wav_upload, prompt_wav_record, instruct_text,
                    seed, stream, speed):
@@ -118,15 +104,13 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
             yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '3s极速复刻':
         logging.info('get zero_shot inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
+        for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_wav, stream=stream, speed=speed):
             yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '跨语种复刻':
         logging.info('get cross_lingual inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
+        for i in cosyvoice.inference_cross_lingual(tts_text, prompt_wav, stream=stream, speed=speed):
             yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
     else:
         logging.info('get instruct inference request')
@@ -181,16 +165,10 @@ if __name__ == '__main__':
                         default=8000)
     parser.add_argument('--model_dir',
                         type=str,
-                        default='pretrained_models/CosyVoice2-0.5B',
+                        default='pretrained_models/CosyVoice3-0.5B',
                         help='local path or modelscope repo id')
     args = parser.parse_args()
-    try:
-        cosyvoice = CosyVoice(args.model_dir)
-    except Exception:
-        try:
-            cosyvoice = CosyVoice2(args.model_dir)
-        except Exception:
-            raise TypeError('no valid model_type!')
+    cosyvoice = AutoModel(model_dir=args.model_dir)
     sft_spk = cosyvoice.list_available_spks()
     if len(sft_spk) == 0: