This commit is contained in:
lyuxiang.lx
2025-02-06 16:07:13 +08:00
parent 24f796a2b1
commit 2a3e033ee1
17 changed files with 187 additions and 135 deletions

View File

@@ -13,6 +13,10 @@ qwen_pretrain_path: ''
token_frame_rate: 25
token_mel_ratio: 2
# stream related params
chunk_size: 1 # streaming inference chunk size, in second
num_decoding_left_chunks: 2 # streaming inference flow decoder left chunk size, in second
# model params
# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
# for system/third_party class/function, we do not require this.
@@ -56,7 +60,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
input_size: 512
use_cnn_module: False
macaron_style: False
static_chunk_size: !ref <token_frame_rate> # 试试UpsampleConformerEncoder也是static
static_chunk_size: !ref <chunk_size> * <token_frame_rate>
decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
in_channels: 240
n_spks: 1
@@ -79,8 +83,8 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
num_mid_blocks: 12
num_heads: 8
act_fn: 'gelu'
static_chunk_size: !ref <token_frame_rate> * <token_mel_ratio> # here we use static_chunk_size because we want to fix kv cache size during inference
num_decoding_left_chunks: 2
static_chunk_size: !ref <chunk_size> * <token_frame_rate> * <token_mel_ratio> # here we use static_chunk_size because we want to fix kv cache size during inference
num_decoding_left_chunks: !ref <num_decoding_left_chunks>
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
in_channels: 80