add flow decoder cache

2026-02-05 18:09:24 +08:00 · 2025-01-23 16:48:13 +08:00
parent 190840b8dc
commit 1c062ab381
21 changed files with 1601 additions and 214 deletions
--- a/cosyvoice/bin/export_jit.py
+++ b/cosyvoice/bin/export_jit.py
@@ -24,6 +24,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.utils.file_utils import logging


 def get_args():
@@ -71,6 +72,7 @@ def main():
        script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
        script = get_optimized_script(llm_text_encoder.half())
        script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export llm_text_encoder')

        # 2. export llm llm
        llm_llm = model.model.llm.llm
@@ -78,14 +80,23 @@ def main():
        script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
        script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
        script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export llm_llm')

-    # 3. export flow encoder
-    flow_encoder = model.model.flow.encoder
-    script = get_optimized_script(flow_encoder)
-    script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
-    script = get_optimized_script(flow_encoder.half())
-    script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
-
+        # 3. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder)
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half())
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
+    else:
+        # 3. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder, ['forward_chunk'])
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half(), ['forward_chunk'])
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')

 if __name__ == '__main__':
    main()
--- a/cosyvoice/bin/export_onnx.py
+++ b/cosyvoice/bin/export_onnx.py
@@ -28,6 +28,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.utils.file_utils import logging


 def get_dummy_input(batch_size, seq_len, out_channels, device):
@@ -51,6 +52,7 @@ def get_args():
    return args


+@torch.no_grad()
 def main():
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
@@ -64,52 +66,125 @@ def main():
        except Exception:
            raise TypeError('no valid model_type!')

-    # 1. export flow decoder estimator
-    estimator = model.model.flow.decoder.estimator
+    if not isinstance(model, CosyVoice2):
+        # 1. export flow decoder estimator
+        estimator = model.model.flow.decoder.estimator
+        estimator.eval()

-    device = model.model.device
-    batch_size, seq_len = 2, 256
-    out_channels = model.model.flow.decoder.estimator.out_channels
-    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
-    torch.onnx.export(
-        estimator,
-        (x, mask, mu, t, spks, cond),
-        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-        export_params=True,
-        opset_version=18,
-        do_constant_folding=True,
-        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
-        output_names=['estimator_out'],
-        dynamic_axes={
-            'x': {2: 'seq_len'},
-            'mask': {2: 'seq_len'},
-            'mu': {2: 'seq_len'},
-            'cond': {2: 'seq_len'},
-            'estimator_out': {2: 'seq_len'},
-        }
-    )
+        device = model.model.device
+        batch_size, seq_len = 2, 256
+        out_channels = model.model.flow.decoder.estimator.out_channels
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+        torch.onnx.export(
+            estimator,
+            (x, mask, mu, t, spks, cond),
+            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+            export_params=True,
+            opset_version=18,
+            do_constant_folding=True,
+            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+            output_names=['estimator_out'],
+            dynamic_axes={
+                'x': {2: 'seq_len'},
+                'mask': {2: 'seq_len'},
+                'mu': {2: 'seq_len'},
+                'cond': {2: 'seq_len'},
+                'estimator_out': {2: 'seq_len'},
+            }
+        )

-    # 2. test computation consistency
-    option = onnxruntime.SessionOptions()
-    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-    option.intra_op_num_threads = 1
-    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
-    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-                                                  sess_options=option, providers=providers)
+        # 2. test computation consistency
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                    sess_options=option, providers=providers)

-    for _ in tqdm(range(10)):
-        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
-        output_pytorch = estimator(x, mask, mu, t, spks, cond)
-        ort_inputs = {
-            'x': x.cpu().numpy(),
-            'mask': mask.cpu().numpy(),
-            'mu': mu.cpu().numpy(),
-            't': t.cpu().numpy(),
-            'spks': spks.cpu().numpy(),
-            'cond': cond.cpu().numpy()
-        }
-        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
-        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+        for _ in tqdm(range(10)):
+            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+            output_pytorch = estimator(x, mask, mu, t, spks, cond)
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy()
+            }
+            output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+            torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+        logging.info('successfully export estimator')
+    else:
+        # 1. export flow decoder estimator
+        estimator = model.model.flow.decoder.estimator
+        estimator.forward = estimator.forward_chunk
+        estimator.eval()
+
+        device = model.model.device
+        batch_size, seq_len = 2, 256
+        out_channels = model.model.flow.decoder.estimator.out_channels
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+        cache = model.model.init_flow_cache()['decoder_cache']
+        cache.pop('offset')
+        cache = {k: v[0] for k, v in cache.items()}
+        torch.onnx.export(
+            estimator,
+            (x, mask, mu, t, spks, cond,
+            cache['down_blocks_conv_cache'],
+            cache['down_blocks_kv_cache'],
+            cache['mid_blocks_conv_cache'],
+            cache['mid_blocks_kv_cache'],
+            cache['up_blocks_conv_cache'],
+            cache['up_blocks_kv_cache'],
+            cache['final_blocks_conv_cache']),
+            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+            export_params=True,
+            opset_version=18,
+            do_constant_folding=True,
+            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond', 'down_blocks_conv_cache', 'down_blocks_kv_cache', 'mid_blocks_conv_cache', 'mid_blocks_kv_cache', 'up_blocks_conv_cache', 'up_blocks_kv_cache', 'final_blocks_conv_cache'],
+            output_names=['estimator_out', 'down_blocks_conv_cache_out', 'down_blocks_kv_cache_out', 'mid_blocks_conv_cache_out', 'mid_blocks_kv_cache_out', 'up_blocks_conv_cache_out', 'up_blocks_kv_cache_out', 'final_blocks_conv_cache_out'],
+            dynamic_axes={
+                'x': {2: 'seq_len'},
+                'mask': {2: 'seq_len'},
+                'mu': {2: 'seq_len'},
+                'cond': {2: 'seq_len'},
+                'down_blocks_kv_cache': {3: 'seq_len'},
+                'mid_blocks_kv_cache': {3: 'seq_len'},
+                'up_blocks_kv_cache': {3: 'seq_len'},
+                'estimator_out': {2: 'seq_len'},
+                'down_blocks_kv_cache_out': {3: 'seq_len'},
+                'mid_blocks_kv_cache_out': {3: 'seq_len'},
+                'up_blocks_kv_cache_out': {3: 'seq_len'},
+            }
+        )
+
+        # 2. test computation consistency
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                    sess_options=option, providers=providers)
+
+        for _ in tqdm(range(10)):
+            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+            cache = model.model.init_flow_cache()['decoder_cache']
+            cache.pop('offset')
+            cache = {k: v[0] for k, v in cache.items()}
+            output_pytorch = estimator(x, mask, mu, t, spks, cond, **{k: v.clone() for k, v in cache.items()})
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy(),
+            }
+            output_onnx = estimator_onnx.run(None, {**ort_inputs, **{k: v.clone().cpu().numpy() for k, v in cache.items()}})
+            for i, j in zip(output_pytorch, output_onnx):
+                torch.testing.assert_allclose(i, torch.from_numpy(j).to(device), rtol=1e-2, atol=1e-4)
+        logging.info('successfully export estimator')


 if __name__ == "__main__":
--- a/cosyvoice/bin/export_trt.sh
+++ b/cosyvoice/bin/export_trt.sh
@@ -3,8 +3,23 @@
 # download tensorrt from https://developer.nvidia.com/tensorrt/download/10x, check your system and cuda for compatibility
 # for example for linux + cuda12.4, you can download https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
 TRT_DIR=<YOUR_TRT_DIR>
-MODEL_DIR=<COSYVOICE2_MODEL_DIR>
-
+MODEL_DIR=<YOUR_MODEL_DIR>
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_DIR/lib:/usr/local/cuda/lib64
+
+# cosyvoice export
 $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp32.mygpu.plan --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw --outputIOFormats=fp32:chw
 $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.mygpu.plan --fp16 --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw --outputIOFormats=fp16:chw
+
+# cosyvoice2 export with cache
+$TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp32.mygpu.plan \
+    --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4,down_blocks_kv_cache:1x4x2x0x512x2,mid_blocks_kv_cache:12x4x2x0x512x2,up_blocks_kv_cache:1x4x2x0x512x2 \
+    --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193,down_blocks_kv_cache:1x4x2x193x512x2,mid_blocks_kv_cache:12x4x2x193x512x2,up_blocks_kv_cache:1x4x2x193x512x2 \
+    --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800,down_blocks_kv_cache:1x4x2x200x512x2,mid_blocks_kv_cache:12x4x2x200x512x2,up_blocks_kv_cache:1x4x2x200x512x2 \
+    --inputIOFormats=fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw \
+    --outputIOFormats=fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw
+$TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.mygpu.plan --fp16 \
+    --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4,down_blocks_kv_cache:1x4x2x0x512x2,mid_blocks_kv_cache:12x4x2x0x512x2,up_blocks_kv_cache:1x4x2x0x512x2 \
+    --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193,down_blocks_kv_cache:1x4x2x193x512x2,mid_blocks_kv_cache:12x4x2x193x512x2,up_blocks_kv_cache:1x4x2x193x512x2 \
+    --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800,down_blocks_kv_cache:1x4x2x200x512x2,mid_blocks_kv_cache:12x4x2x200x512x2,up_blocks_kv_cache:1x4x2x200x512x2 \
+    --inputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw \
+    --outputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw
--- a/cosyvoice/bin/inference.py
+++ b/cosyvoice/bin/inference.py
@@ -23,7 +23,7 @@ from torch.utils.data import DataLoader
 import torchaudio
 from hyperpyyaml import load_hyperpyyaml
 from tqdm import tqdm
-from cosyvoice.cli.model import CosyVoiceModel
+from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
 from cosyvoice.dataset.dataset import Dataset


@@ -33,6 +33,7 @@ def get_args():
    parser.add_argument('--prompt_data', required=True, help='prompt data file')
    parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
    parser.add_argument('--tts_text', required=True, help='tts input file')
+    parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
    parser.add_argument('--llm_model', required=True, help='llm model file')
    parser.add_argument('--flow_model', required=True, help='flow model file')
    parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
@@ -59,10 +60,18 @@ def main():
    # Init cosyvoice models from configs
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
-    with open(args.config, 'r') as f:
-        configs = load_hyperpyyaml(f)
+    try:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': args.qwen_pretrain_path})
+        model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16=False)
+    except Exception:
+        try:
+            with open(args.config, 'r') as f:
+                configs = load_hyperpyyaml(f)
+            model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16=False)
+        except Exception:
+            raise TypeError('no valid model_type!')

-    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
    model.load(args.llm_model, args.flow_model, args.hifigan_model)

    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
@@ -104,7 +113,7 @@ def main():
            tts_speeches = torch.concat(tts_speeches, dim=1)
            tts_key = '{}_{}'.format(utts[0], tts_index[0])
            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
-            torchaudio.save(tts_fn, tts_speeches, sample_rate=22050)
+            torchaudio.save(tts_fn, tts_speeches, sample_rate=configs['sample_rate'], backend='soundfile')
            f.write('{} {}\n'.format(tts_key, tts_fn))
            f.flush()
    f.close()
--- a/cosyvoice/bin/train.py
+++ b/cosyvoice/bin/train.py
@@ -46,6 +46,7 @@ def get_args():
    parser.add_argument('--config', required=True, help='config file')
    parser.add_argument('--train_data', required=True, help='train data file')
    parser.add_argument('--cv_data', required=True, help='cv data file')
+    parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
    parser.add_argument('--checkpoint', help='checkpoint model')
    parser.add_argument('--model_dir', required=True, help='save model dir')
    parser.add_argument('--tensorboard_dir',
@@ -97,8 +98,12 @@ def main():
    override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
    if gan is True:
        override_dict.pop('hift')
-    with open(args.config, 'r') as f:
-        configs = load_hyperpyyaml(f, overrides=override_dict)
+    try:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides={**override_dict, 'qwen_pretrain_path': args.qwen_pretrain_path})
+    except Exception:
+        with open(args.config, 'r') as f:
+            configs = load_hyperpyyaml(f, overrides=override_dict)
    if gan is True:
        configs['train_conf'] = configs['train_conf_gan']
    configs['train_conf'].update(vars(args))