diff --git a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml index df36109..43e457f 100644 --- a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml +++ b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml @@ -20,7 +20,7 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, # model params # for all class/function included in this repo, we use ! or ! for intialization, so that user may find all corresponding class/function according to one single yaml. # for system/third_party class/function, we do not require this. -llm: !new:cosyvoice.llm.llm.Qwen2LM +llm: !new:cosyvoice.llm.llm.CosyVoice3LM llm_input_size: !ref llm_output_size: !ref speech_token_size: 6561 @@ -35,8 +35,8 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM win_size: 10 tau_r: 0.1 -flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec - input_size: 512 +flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT + input_size: 80 output_size: 80 spk_embed_dim: !ref output_type: 'mel' @@ -45,22 +45,10 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec only_mask_loss: True token_mel_ratio: !ref pre_lookahead_len: 3 - encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder - output_size: 512 - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - normalize_before: True - input_layer: 'linear' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - input_size: 512 - use_cnn_module: False - macaron_style: False - static_chunk_size: !ref + pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer + in_channels: 80 + channels: 1024 + pre_lookahead_len: 3 decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM in_channels: 240 n_spks: 1 @@ -73,20 +61,20 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec training_cfg_rate: 0.2 inference_cfg_rate: 0.7 reg_loss_type: 'l1' - estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder - in_channels: 320 + estimator: !new:cosyvoice.flow.DiT.dit.DiT + dim: 1024 + depth: 22 + heads: 16 + dim_head: 64 + ff_mult: 2 + mel_dim: 80 + mu_dim: 80 + spk_dim: 80 out_channels: 80 - channels: [256] - dropout: 0.0 - attention_head_dim: 64 - n_blocks: 4 - num_mid_blocks: 12 - num_heads: 8 - act_fn: 'gelu' static_chunk_size: !ref * num_decoding_left_chunks: !ref -hift: !new:cosyvoice.hifigan.generator.HiFTGenerator +hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator in_channels: 80 base_channels: 512 nb_harmonics: 8 @@ -105,7 +93,8 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] lrelu_slope: 0.1 audio_limit: 0.99 - f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor + conv_pre_look_right: 4 + f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor num_class: 1 in_channels: 80 cond_channels: 512