mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-04 17:39:25 +08:00
update 25hz yaml
This commit is contained in:
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|||||||
text_encoder_input_size: !ref <text_encoder_input_size>
|
text_encoder_input_size: !ref <text_encoder_input_size>
|
||||||
llm_input_size: !ref <llm_input_size>
|
llm_input_size: !ref <llm_input_size>
|
||||||
llm_output_size: !ref <llm_output_size>
|
llm_output_size: !ref <llm_output_size>
|
||||||
text_token_size: 51866
|
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
speech_token_size: 4096
|
speech_token_size: 4096
|
||||||
length_normalized_loss: True
|
length_normalized_loss: True
|
||||||
lsm_weight: 0
|
lsm_weight: 0
|
||||||
@@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|||||||
spk_embed_dim: !ref <spk_embed_dim>
|
spk_embed_dim: !ref <spk_embed_dim>
|
||||||
output_type: 'mel'
|
output_type: 'mel'
|
||||||
vocab_size: 4096
|
vocab_size: 4096
|
||||||
input_frame_rate: 50
|
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
only_mask_loss: True
|
only_mask_loss: True
|
||||||
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
||||||
output_size: 512
|
output_size: 512
|
||||||
@@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|||||||
|
|
||||||
# processor functions
|
# processor functions
|
||||||
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
||||||
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
multilingual: True
|
multilingual: True
|
||||||
num_languages: 100
|
num_languages: 100
|
||||||
language: 'en'
|
language: 'en'
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|||||||
text_encoder_input_size: !ref <text_encoder_input_size>
|
text_encoder_input_size: !ref <text_encoder_input_size>
|
||||||
llm_input_size: !ref <llm_input_size>
|
llm_input_size: !ref <llm_input_size>
|
||||||
llm_output_size: !ref <llm_output_size>
|
llm_output_size: !ref <llm_output_size>
|
||||||
text_token_size: 51866
|
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
speech_token_size: 4096
|
speech_token_size: 4096
|
||||||
length_normalized_loss: True
|
length_normalized_loss: True
|
||||||
lsm_weight: 0
|
lsm_weight: 0
|
||||||
@@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|||||||
spk_embed_dim: !ref <spk_embed_dim>
|
spk_embed_dim: !ref <spk_embed_dim>
|
||||||
output_type: 'mel'
|
output_type: 'mel'
|
||||||
vocab_size: 4096
|
vocab_size: 4096
|
||||||
input_frame_rate: 50
|
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
only_mask_loss: True
|
only_mask_loss: True
|
||||||
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
||||||
output_size: 512
|
output_size: 512
|
||||||
@@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|||||||
|
|
||||||
# processor functions
|
# processor functions
|
||||||
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
||||||
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
multilingual: True
|
multilingual: True
|
||||||
num_languages: 100
|
num_languages: 100
|
||||||
language: 'en'
|
language: 'en'
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|||||||
text_encoder_input_size: !ref <text_encoder_input_size>
|
text_encoder_input_size: !ref <text_encoder_input_size>
|
||||||
llm_input_size: !ref <llm_input_size>
|
llm_input_size: !ref <llm_input_size>
|
||||||
llm_output_size: !ref <llm_output_size>
|
llm_output_size: !ref <llm_output_size>
|
||||||
text_token_size: 51866
|
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
speech_token_size: 4096
|
speech_token_size: 4096
|
||||||
length_normalized_loss: True
|
length_normalized_loss: True
|
||||||
lsm_weight: 0
|
lsm_weight: 0
|
||||||
@@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|||||||
pos_enc_layer_type: 'rel_pos_espnet'
|
pos_enc_layer_type: 'rel_pos_espnet'
|
||||||
selfattention_layer_type: 'rel_selfattn'
|
selfattention_layer_type: 'rel_selfattn'
|
||||||
static_chunk_size: 1
|
static_chunk_size: 1
|
||||||
|
sampling: !name:cosyvoice.utils.common.ras_sampling
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 25
|
||||||
|
win_size: 10
|
||||||
|
tau_r: 0.1
|
||||||
|
|
||||||
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
||||||
input_size: 512
|
input_size: 512
|
||||||
@@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|||||||
spk_embed_dim: !ref <spk_embed_dim>
|
spk_embed_dim: !ref <spk_embed_dim>
|
||||||
output_type: 'mel'
|
output_type: 'mel'
|
||||||
vocab_size: 4096
|
vocab_size: 4096
|
||||||
input_frame_rate: 50
|
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
only_mask_loss: True
|
only_mask_loss: True
|
||||||
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
||||||
output_size: 512
|
output_size: 512
|
||||||
@@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|||||||
|
|
||||||
# processor functions
|
# processor functions
|
||||||
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
||||||
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
multilingual: True
|
multilingual: True
|
||||||
num_languages: 100
|
num_languages: 100
|
||||||
language: 'en'
|
language: 'en'
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|||||||
text_encoder_input_size: !ref <text_encoder_input_size>
|
text_encoder_input_size: !ref <text_encoder_input_size>
|
||||||
llm_input_size: !ref <llm_input_size>
|
llm_input_size: !ref <llm_input_size>
|
||||||
llm_output_size: !ref <llm_output_size>
|
llm_output_size: !ref <llm_output_size>
|
||||||
text_token_size: 51866
|
text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
speech_token_size: 4096
|
speech_token_size: 4096
|
||||||
length_normalized_loss: True
|
length_normalized_loss: True
|
||||||
lsm_weight: 0
|
lsm_weight: 0
|
||||||
@@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|||||||
pos_enc_layer_type: 'rel_pos_espnet'
|
pos_enc_layer_type: 'rel_pos_espnet'
|
||||||
selfattention_layer_type: 'rel_selfattn'
|
selfattention_layer_type: 'rel_selfattn'
|
||||||
static_chunk_size: 1
|
static_chunk_size: 1
|
||||||
|
sampling: !name:cosyvoice.utils.common.ras_sampling
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 25
|
||||||
|
win_size: 10
|
||||||
|
tau_r: 0.1
|
||||||
|
|
||||||
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
||||||
input_size: 512
|
input_size: 512
|
||||||
@@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|||||||
spk_embed_dim: !ref <spk_embed_dim>
|
spk_embed_dim: !ref <spk_embed_dim>
|
||||||
output_type: 'mel'
|
output_type: 'mel'
|
||||||
vocab_size: 4096
|
vocab_size: 4096
|
||||||
input_frame_rate: 50
|
input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
only_mask_loss: True
|
only_mask_loss: True
|
||||||
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
||||||
output_size: 512
|
output_size: 512
|
||||||
@@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|||||||
|
|
||||||
# processor functions
|
# processor functions
|
||||||
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
||||||
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
multilingual: True
|
multilingual: True
|
||||||
num_languages: 100
|
num_languages: 100
|
||||||
language: 'en'
|
language: 'en'
|
||||||
|
|||||||
Reference in New Issue
Block a user