From b6a1116d1505be1476e5582a4caf09d073ccdc6f Mon Sep 17 00:00:00 2001
From: "huzetao.hzt"
Date: Tue, 7 Jan 2025 17:20:06 +0800
Subject: [PATCH 1/2] support online onnx to trt conversion

---
 cosyvoice/cli/cosyvoice.py      |   2 +-
 cosyvoice/cli/model.py          |  10 +--
 cosyvoice/flow/flow_matching.py |  19 +----
 cosyvoice/trt/estimator_trt.py  | 141 ++++++++++++++++++++++++++++++++
 4 files changed, 146 insertions(+), 26 deletions(-)
 create mode 100644 cosyvoice/trt/estimator_trt.py

diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index 68379a4..56c0bfc 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -149,7 +149,7 @@ class CosyVoice2(CosyVoice):
         if load_jit:
             self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
         if load_trt:
-            self.model.load_trt('{}/flow.decoder.estimator.{}.v100.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
+            self.model.load_trt('{}/flow.decoder.estimator'.format(model_dir), self.fp16)
         del configs

     def inference_instruct(self, *args, **kwargs):
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 0840e46..8de3fc5 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -19,6 +19,7 @@ from torch.nn import functional as F
 from contextlib import nullcontext
 import uuid
 from cosyvoice.utils.common import fade_in_out
+from cosyvoice.trt.estimator_trt import EstimatorTRT


 class CosyVoiceModel:
@@ -81,14 +82,9 @@ class CosyVoiceModel:
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder

-    def load_trt(self, flow_decoder_estimator_model):
+    def load_trt(self, flow_decoder_estimator_model, fp16):
         del self.flow.decoder.estimator
-        import tensorrt as trt
-        with open(flow_decoder_estimator_model, 'rb') as f:
-            self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
-        if self.flow.decoder.estimator_engine is None:
-            raise ValueError('failed to load trt {}'.format(flow_decoder_estimator_model))
-        self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
+        self.flow.decoder.estimator = EstimatorTRT(flow_decoder_estimator_model, self.device, fp16)

     def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
         with self.llm_context:
diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py
index 40062b3..96014b8 100644
--- a/cosyvoice/flow/flow_matching.py
+++ b/cosyvoice/flow/flow_matching.py
@@ -120,24 +120,7 @@ class ConditionalCFM(BASECFM):
         return sol[-1].float()

     def forward_estimator(self, x, mask, mu, t, spks, cond):
-        if isinstance(self.estimator, torch.nn.Module):
-            return self.estimator.forward(x, mask, mu, t, spks, cond)
-        else:
-            self.estimator.set_input_shape('x', (2, 80, x.size(2)))
-            self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
-            self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
-            self.estimator.set_input_shape('t', (2,))
-            self.estimator.set_input_shape('spks', (2, 80))
-            self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
-            # run trt engine
-            self.estimator.execute_v2([x.contiguous().data_ptr(),
-                                       mask.contiguous().data_ptr(),
-                                       mu.contiguous().data_ptr(),
-                                       t.contiguous().data_ptr(),
-                                       spks.contiguous().data_ptr(),
-                                       cond.contiguous().data_ptr(),
-                                       x.data_ptr()])
-            return x
+        return self.estimator.forward(x, mask, mu, t, spks, cond)

     def compute_loss(self, x1, mask, mu, spks=None, cond=None):
         """Computes diffusion loss
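The `forward_estimator` simplification above works because the new `EstimatorTRT` class (introduced in the next file diff) exposes the same `forward(x, mask, mu, t, spks, cond)` signature as the PyTorch estimator, so the call site no longer branches on the backend. A minimal sketch of that pattern, not part of the patch, with a placeholder module standing in for the real estimator:

```python
import torch

class TorchEstimator(torch.nn.Module):
    """Placeholder for the PyTorch flow-matching estimator (illustrative only)."""
    def forward(self, x, mask, mu, t, spks, cond):
        return x * mask  # stand-in computation

def forward_estimator(estimator, x, mask, mu, t, spks, cond):
    # TorchEstimator and EstimatorTRT share one forward() signature,
    # so no isinstance check is needed at the call site.
    return estimator.forward(x, mask, mu, t, spks, cond)

out = forward_estimator(TorchEstimator(),
                        torch.randn(2, 80, 16), torch.ones(2, 1, 16),
                        torch.randn(2, 80, 16), torch.rand(2),
                        torch.randn(2, 80), torch.randn(2, 80, 16))
```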
diff --git a/cosyvoice/trt/estimator_trt.py b/cosyvoice/trt/estimator_trt.py
new file mode 100644
index 0000000..b17bbab
--- /dev/null
+++ b/cosyvoice/trt/estimator_trt.py
@@ -0,0 +1,141 @@
+import os
+import torch
+import tensorrt as trt
+import logging
+import threading
+
+
+_min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2,), (2, 80), (2, 80, 4)]
+
+_opt_shape = [(2, 80, 193), (2, 1, 193), (2, 80, 193), (2,), (2, 80), (2, 80, 193)]
+
+_max_shape = [(2, 80, 6800), (2, 1, 6800), (2, 80, 6800), (2,), (2, 80), (2, 80, 6800)]
+
+
+class EstimatorTRT:
+    def __init__(self, path_prefix: str, device: torch.device, fp16: bool = True):
+        self.lock = threading.Lock()
+        self.device = device
+        with torch.cuda.device(device):
+            self.input_names = ["x", "mask", "mu", "t", "spks", "cond"]
+            self.output_name = "estimator_out"
+
+            onnx_path = path_prefix + ".fp32.onnx"
+            precision = ".fp16" if fp16 else ".fp32"
+            trt_path = path_prefix + precision + ".plan"
+
+            self.fp16 = fp16
+            self.logger = trt.Logger(trt.Logger.INFO)
+            self.trt_runtime = trt.Runtime(self.logger)
+
+            save_trt = os.environ.get("NOT_SAVE_TRT", "0") != "1"
+
+            if os.path.exists(trt_path):
+                self.engine = self._load_trt(trt_path)
+            else:
+                self.engine = self._convert_onnx_to_trt(onnx_path, trt_path, save_trt)
+
+            self.context = self.engine.create_execution_context()
+
+    def _convert_onnx_to_trt(
+        self, onnx_path: str, trt_path: str, save_trt: bool = True
+    ):
+        logging.info("Converting onnx to trt...")
+
+        network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+        builder = trt.Builder(self.logger)
+        network = builder.create_network(network_flags)
+        parser = trt.OnnxParser(network, self.logger)
+        config = builder.create_builder_config()
+
+        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 33)  # 8 GiB
+        if self.fp16:
+            config.set_flag(trt.BuilderFlag.FP16)
+
+        profile = builder.create_optimization_profile()
+
+        # load onnx model
+        with open(onnx_path, "rb") as f:
+            if not parser.parse(f.read()):
+                for error in range(parser.num_errors):
+                    print(parser.get_error(error))
+                exit(1)
+
+        # set input shapes
+        for i in range(len(self.input_names)):
+            profile.set_shape(
+                self.input_names[i], _min_shape[i], _opt_shape[i], _max_shape[i]
+            )
+
+        tensor_dtype = trt.DataType.HALF if self.fp16 else trt.DataType.FLOAT
+
+        # set input and output data type
+        for i in range(network.num_inputs):
+            input_tensor = network.get_input(i)
+            input_tensor.dtype = tensor_dtype
+
+        for i in range(network.num_outputs):
+            output_tensor = network.get_output(i)
+            output_tensor.dtype = tensor_dtype
+
+        config.add_optimization_profile(profile)
+        engine_bytes = builder.build_serialized_network(network, config)
+
+        # save trt engine
+        if save_trt:
+            with open(trt_path, "wb") as f:
+                f.write(engine_bytes)
+            print("trt engine saved to {}".format(trt_path))
+
+        engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
+        return engine
+
+    def _load_trt(self, trt_path: str):
+        logging.info("Found trt engine, loading...")
+
+        with open(trt_path, "rb") as f:
+            engine_bytes = f.read()
+        engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
+        return engine
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor,
+        mu: torch.Tensor,
+        t: torch.Tensor,
+        spks: torch.Tensor,
+        cond: torch.Tensor,
+    ):
+        with self.lock:
+            with torch.cuda.device(self.device):
+                self.context.set_input_shape("x", (2, 80, x.size(2)))
+                self.context.set_input_shape("mask", (2, 1, x.size(2)))
+                self.context.set_input_shape("mu", (2, 80, x.size(2)))
+                self.context.set_input_shape("t", (2,))
+                self.context.set_input_shape("spks", (2, 80))
+                self.context.set_input_shape("cond", (2, 80, x.size(2)))
+                # run trt engine
+                self.context.execute_v2(
+                    [
+                        x.contiguous().data_ptr(),
+                        mask.contiguous().data_ptr(),
+                        mu.contiguous().data_ptr(),
+                        t.contiguous().data_ptr(),
+                        spks.contiguous().data_ptr(),
+                        cond.contiguous().data_ptr(),
+                        x.data_ptr(),
+                    ]
+                )
+        return x
+
+    def __call__(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor,
+        mu: torch.Tensor,
+        t: torch.Tensor,
+        spks: torch.Tensor,
+        cond: torch.Tensor,
+    ):
+        return self.forward(x, mask, mu, t, spks, cond)
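A hedged smoke test for the new class, not part of the patch, assuming a CUDA device and an exported `flow.decoder.estimator.fp32.onnx` under the model directory (the path prefix below is illustrative). The first run performs the online ONNX -> TensorRT conversion and caches the `.plan` next to the ONNX file unless `NOT_SAVE_TRT=1`; since the engine I/O is built as HALF in fp16 mode, the inputs here are float16:

```python
import torch
from cosyvoice.trt.estimator_trt import EstimatorTRT

device = torch.device("cuda:0")
# path prefix is illustrative; point it at your local model directory
est = EstimatorTRT("pretrained_models/CosyVoice2-0.5B/flow.decoder.estimator", device, fp16=True)

T = 193                 # any mel length inside the [4, 6800] optimization profile
dt = torch.float16      # matches the HALF engine I/O built above
x    = torch.randn(2, 80, T, device=device, dtype=dt)
mask = torch.ones(2, 1, T, device=device, dtype=dt)
mu   = torch.randn(2, 80, T, device=device, dtype=dt)
t    = torch.rand(2, device=device, dtype=dt)
spks = torch.randn(2, 80, device=device, dtype=dt)
cond = torch.randn(2, 80, T, device=device, dtype=dt)

out = est(x, mask, mu, t, spks, cond)  # __call__ delegates to forward()
```

Note that `forward` binds `x.data_ptr()` as the output buffer, so the result is written in place and the returned tensor aliases the input.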
From 2a0dd5447adac3a917516dfb9320a4a3494c77e7 Mon Sep 17 00:00:00 2001
From: "huzetao.hzt"
Date: Tue, 7 Jan 2025 17:29:20 +0800
Subject: [PATCH 2/2] add usage

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index d5fc9ea..8ccca02 100644
--- a/README.md
+++ b/README.md
@@ -128,6 +128,8 @@ import torchaudio
 **CosyVoice2 Usage**
 ```python
+# NOTE if you want to use TensorRT to accelerate the flow matching inference, please set load_trt=True.
+# If you don't want to save the TensorRT engine on disk, please set the environment variable `NOT_SAVE_TRT=1`.
 cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
 # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
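For completeness, the TensorRT-enabled variant of the README snippet above would look like the following sketch (not part of the patch; the environment variable must be set before the constructor runs, because `EstimatorTRT.__init__` reads it):

```python
import os
os.environ["NOT_SAVE_TRT"] = "1"  # optional: build the engine in memory only
from cosyvoice.cli.cosyvoice import CosyVoice2

# load_trt=True triggers the online ONNX -> TensorRT conversion on first use;
# fp16=True selects the .fp16.plan engine built by PATCH 1/2.
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=True, fp16=True)
```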