update libritts cosyvoice3.yaml

2026-02-05 18:09:24 +08:00 · 2025-12-17 17:14:17 +08:00
parent f5816b4e51
commit 7baefaf0f2
1 changed files with 19 additions and 30 deletions
--- a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
+++ b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
@@ -20,7 +20,7 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size,
 # model params
 # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that users can find all corresponding classes/functions from one single yaml.
 # for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm.Qwen2LM
+llm: !new:cosyvoice.llm.llm.CosyVoice3LM
    llm_input_size: !ref <llm_input_size>
    llm_output_size: !ref <llm_output_size>
    speech_token_size: 6561
@@ -35,8 +35,8 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
        win_size: 10
        tau_r: 0.1
-flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
+flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT
-    input_size: 512
+    input_size: 80
    output_size: 80
    spk_embed_dim: !ref <spk_embed_dim>
    output_type: 'mel'
@@ -45,22 +45,10 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
    only_mask_loss: True
    token_mel_ratio: !ref <token_mel_ratio>
    pre_lookahead_len: 3
-    encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
+    pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer
-        output_size: 512
+        in_channels: 80
-        attention_heads: 8
+        channels: 1024
-        linear_units: 2048
+        pre_lookahead_len: 3
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        normalize_before: True
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        input_size: 512
        use_cnn_module: False
        macaron_style: False
        static_chunk_size: !ref <chunk_size>
    decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
        in_channels: 240
        n_spks: 1
@@ -73,20 +61,20 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
+        estimator: !new:cosyvoice.flow.DiT.dit.DiT
-            in_channels: 320
+            dim: 1024
            depth: 22
            heads: 16
            dim_head: 64
            ff_mult: 2
            mel_dim: 80
            mu_dim: 80
            spk_dim: 80
            out_channels: 80
            channels: [256]
            dropout: 0.0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'
            static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
            num_decoding_left_chunks: !ref <num_decoding_left_chunks>
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
+hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
@@ -105,7 +93,8 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
+    conv_pre_look_right: 4
    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512