update libritts cosyvoice3.yaml

This commit is contained in:
lyuxiang.lx
2025-12-17 17:14:17 +08:00
parent f5816b4e51
commit 7baefaf0f2

View File

@@ -20,7 +20,7 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size,
# model params
# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users may find all corresponding classes/functions from one single yaml.
# for system/third_party class/function, we do not require this.
llm: !new:cosyvoice.llm.llm.Qwen2LM
llm: !new:cosyvoice.llm.llm.CosyVoice3LM
llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size>
speech_token_size: 6561
@@ -35,8 +35,8 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
win_size: 10
tau_r: 0.1
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
input_size: 512
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT
input_size: 80
output_size: 80
spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel'
@@ -45,22 +45,10 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
only_mask_loss: True
token_mel_ratio: !ref <token_mel_ratio>
pre_lookahead_len: 3
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
output_size: 512
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
normalize_before: True
input_layer: 'linear'
pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn'
input_size: 512
use_cnn_module: False
macaron_style: False
static_chunk_size: !ref <chunk_size>
pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer
in_channels: 80
channels: 1024
pre_lookahead_len: 3
decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
in_channels: 240
n_spks: 1
@@ -73,20 +61,20 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
training_cfg_rate: 0.2
inference_cfg_rate: 0.7
reg_loss_type: 'l1'
estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
in_channels: 320
estimator: !new:cosyvoice.flow.DiT.dit.DiT
dim: 1024
depth: 22
heads: 16
dim_head: 64
ff_mult: 2
mel_dim: 80
mu_dim: 80
spk_dim: 80
out_channels: 80
channels: [256]
dropout: 0.0
attention_head_dim: 64
n_blocks: 4
num_mid_blocks: 12
num_heads: 8
act_fn: 'gelu'
static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
num_decoding_left_chunks: !ref <num_decoding_left_chunks>
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator
in_channels: 80
base_channels: 512
nb_harmonics: 8
@@ -105,7 +93,8 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
lrelu_slope: 0.1
audio_limit: 0.99
f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
conv_pre_look_right: 4
f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor
num_class: 1
in_channels: 80
cond_channels: 512