Update 25Hz YAML

This commit is contained in:
lyuxiang.lx
2024-09-29 10:42:57 +08:00
parent ffa28e3bbd
commit abc6f70ace
4 changed files with 22 additions and 12 deletions

View File

@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
text_encoder_input_size: !ref <text_encoder_input_size> text_encoder_input_size: !ref <text_encoder_input_size>
llm_input_size: !ref <llm_input_size> llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size> llm_output_size: !ref <llm_output_size>
text_token_size: 51866 text_token_size: 51866 # change to 60515 if you want to train with the CosyVoice-300M-25Hz recipe
speech_token_size: 4096 speech_token_size: 4096
length_normalized_loss: True length_normalized_loss: True
lsm_weight: 0 lsm_weight: 0
@@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
spk_embed_dim: !ref <spk_embed_dim> spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel' output_type: 'mel'
vocab_size: 4096 vocab_size: 4096
input_frame_rate: 50 input_frame_rate: 50 # change to 25 if you want to train with the CosyVoice-300M-25Hz recipe
only_mask_loss: True only_mask_loss: True
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
output_size: 512 output_size: 512
@@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
# processor functions # processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
multilingual: True multilingual: True
num_languages: 100 num_languages: 100
language: 'en' language: 'en'

View File

@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
text_encoder_input_size: !ref <text_encoder_input_size> text_encoder_input_size: !ref <text_encoder_input_size>
llm_input_size: !ref <llm_input_size> llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size> llm_output_size: !ref <llm_output_size>
text_token_size: 51866 text_token_size: 51866 # change to 60515 if you want to train with the CosyVoice-300M-25Hz recipe
speech_token_size: 4096 speech_token_size: 4096
length_normalized_loss: True length_normalized_loss: True
lsm_weight: 0 lsm_weight: 0
@@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
spk_embed_dim: !ref <spk_embed_dim> spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel' output_type: 'mel'
vocab_size: 4096 vocab_size: 4096
input_frame_rate: 50 input_frame_rate: 50 # change to 25 if you want to train with the CosyVoice-300M-25Hz recipe
only_mask_loss: True only_mask_loss: True
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
output_size: 512 output_size: 512
@@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
# processor functions # processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
multilingual: True multilingual: True
num_languages: 100 num_languages: 100
language: 'en' language: 'en'

View File

@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
text_encoder_input_size: !ref <text_encoder_input_size> text_encoder_input_size: !ref <text_encoder_input_size>
llm_input_size: !ref <llm_input_size> llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size> llm_output_size: !ref <llm_output_size>
text_token_size: 51866 text_token_size: 51866 # change to 60515 if you want to train with the CosyVoice-300M-25Hz recipe
speech_token_size: 4096 speech_token_size: 4096
length_normalized_loss: True length_normalized_loss: True
lsm_weight: 0 lsm_weight: 0
@@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
pos_enc_layer_type: 'rel_pos_espnet' pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn' selfattention_layer_type: 'rel_selfattn'
static_chunk_size: 1 static_chunk_size: 1
sampling: !name:cosyvoice.utils.common.ras_sampling
top_p: 0.8
top_k: 25
win_size: 10
tau_r: 0.1
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
input_size: 512 input_size: 512
@@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
spk_embed_dim: !ref <spk_embed_dim> spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel' output_type: 'mel'
vocab_size: 4096 vocab_size: 4096
input_frame_rate: 50 input_frame_rate: 50 # change to 25 if you want to train with the CosyVoice-300M-25Hz recipe
only_mask_loss: True only_mask_loss: True
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
output_size: 512 output_size: 512
@@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
# processor functions # processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
multilingual: True multilingual: True
num_languages: 100 num_languages: 100
language: 'en' language: 'en'

View File

@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
text_encoder_input_size: !ref <text_encoder_input_size> text_encoder_input_size: !ref <text_encoder_input_size>
llm_input_size: !ref <llm_input_size> llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size> llm_output_size: !ref <llm_output_size>
text_token_size: 51866 text_token_size: 51866 # change to 60515 if you want to train with the CosyVoice-300M-25Hz recipe
speech_token_size: 4096 speech_token_size: 4096
length_normalized_loss: True length_normalized_loss: True
lsm_weight: 0 lsm_weight: 0
@@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
pos_enc_layer_type: 'rel_pos_espnet' pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn' selfattention_layer_type: 'rel_selfattn'
static_chunk_size: 1 static_chunk_size: 1
sampling: !name:cosyvoice.utils.common.ras_sampling
top_p: 0.8
top_k: 25
win_size: 10
tau_r: 0.1
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
input_size: 512 input_size: 512
@@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
spk_embed_dim: !ref <spk_embed_dim> spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel' output_type: 'mel'
vocab_size: 4096 vocab_size: 4096
input_frame_rate: 50 input_frame_rate: 50 # change to 25 if you want to train with the CosyVoice-300M-25Hz recipe
only_mask_loss: True only_mask_loss: True
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
output_size: 512 output_size: 512
@@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
# processor functions # processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
multilingual: True multilingual: True
num_languages: 100 num_languages: 100
language: 'en' language: 'en'