From abc6f70ace1d63be8929d06faf4de7573f2129bc Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Sun, 29 Sep 2024 10:42:57 +0800 Subject: [PATCH] update 25hz yaml --- .../cosyvoice/conf/cosyvoice.fromscratch.yaml | 6 +++--- examples/libritts/cosyvoice/conf/cosyvoice.yaml | 6 +++--- .../cosyvoice/conf/cosyvoice.fromscratch.yaml | 11 ++++++++--- examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml | 11 ++++++++--- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml index 25d7269..0420d02 100644 --- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml +++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml index bca3898..b2ff51c 100644 --- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml +++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml index 0f3495f..0420d02 100644 --- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml +++ b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM pos_enc_layer_type: 'rel_pos_espnet' selfattention_layer_type: 'rel_selfattn' static_chunk_size: 1 + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec input_size: 512 @@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml index 6b084f2..b2ff51c 100644 --- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml +++ b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM pos_enc_layer_type: 'rel_pos_espnet' selfattention_layer_type: 'rel_selfattn' static_chunk_size: 1 + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec input_size: 512 @@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en'