From c3250c222f2489dc29f5e61e4a097d924c6d19d6 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Tue, 27 May 2025 10:34:59 +0800 Subject: [PATCH] add trt_concurrent arg --- examples/magicdata-read/cosyvoice/conf | 1 + .../cosyvoice/conf/cosyvoice.fromscratch.yaml | 203 ------------------ .../cosyvoice/conf/cosyvoice.yaml | 203 ------------------ .../cosyvoice/conf/ds_stage2.json | 42 ---- examples/magicdata-read/cosyvoice/run.sh | 17 +- 5 files changed, 17 insertions(+), 449 deletions(-) create mode 120000 examples/magicdata-read/cosyvoice/conf delete mode 100644 examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml delete mode 100644 examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml delete mode 100644 examples/magicdata-read/cosyvoice/conf/ds_stage2.json diff --git a/examples/magicdata-read/cosyvoice/conf b/examples/magicdata-read/cosyvoice/conf new file mode 120000 index 0000000..e1368df --- /dev/null +++ b/examples/magicdata-read/cosyvoice/conf @@ -0,0 +1 @@ +../../libritts/cosyvoice/conf \ No newline at end of file diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml deleted file mode 100644 index 0420d02..0000000 --- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml +++ /dev/null @@ -1,203 +0,0 @@ -# set random seed, so that you may reproduce your result. -__set_seed1: !apply:random.seed [1986] -__set_seed2: !apply:numpy.random.seed [1986] -__set_seed3: !apply:torch.manual_seed [1986] -__set_seed4: !apply:torch.cuda.manual_seed_all [1986] - -# fixed params -sample_rate: 22050 -text_encoder_input_size: 512 -llm_input_size: 1024 -llm_output_size: 1024 -spk_embed_dim: 192 - -# model params -# for all class/function included in this repo, we use ! or ! for intialization, so that user may find all corresponding class/function according to one single yaml. -# for system/third_party class/function, we do not require this. -llm: !new:cosyvoice.llm.llm.TransformerLM - text_encoder_input_size: !ref - llm_input_size: !ref - llm_output_size: !ref - text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe - speech_token_size: 4096 - length_normalized_loss: True - lsm_weight: 0 - spk_embed_dim: !ref - text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder - input_size: !ref - output_size: 1024 - attention_heads: 8 - linear_units: 2048 - num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - normalize_before: True - input_layer: 'linear' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - use_cnn_module: False - macaron_style: False - use_dynamic_chunk: False - use_dynamic_left_chunk: False - static_chunk_size: 1 - llm: !new:cosyvoice.transformer.encoder.TransformerEncoder - input_size: !ref - output_size: !ref - attention_heads: 8 - linear_units: 2048 - num_blocks: 7 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: 'linear_legacy' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - static_chunk_size: 1 - sampling: !name:cosyvoice.utils.common.ras_sampling - top_p: 0.8 - top_k: 25 - win_size: 10 - tau_r: 0.1 - -flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec - input_size: 512 - output_size: 80 - spk_embed_dim: !ref - output_type: 'mel' - vocab_size: 4096 - input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe - only_mask_loss: True - encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder - output_size: 512 - attention_heads: 4 - linear_units: 1024 - num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - normalize_before: True - input_layer: 'linear' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - input_size: 512 - use_cnn_module: False - macaron_style: False - length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator - channels: 80 - sampling_ratios: [1, 1, 1, 1] - decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM - in_channels: 240 - n_spks: 1 - spk_emb_dim: 80 - cfm_params: !new:omegaconf.DictConfig - content: - sigma_min: 1e-06 - solver: 'euler' - t_scheduler: 'cosine' - training_cfg_rate: 0.2 - inference_cfg_rate: 0.7 - reg_loss_type: 'l1' - estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder - in_channels: 320 - out_channels: 80 - channels: [256, 256] - dropout: 0.0 - attention_head_dim: 64 - n_blocks: 4 - num_mid_blocks: 8 - num_heads: 8 - act_fn: 'gelu' - -hift: !new:cosyvoice.hifigan.generator.HiFTGenerator - in_channels: 80 - base_channels: 512 - nb_harmonics: 8 - sampling_rate: !ref - nsf_alpha: 0.1 - nsf_sigma: 0.003 - nsf_voiced_threshold: 10 - upsample_rates: [8, 8] - upsample_kernel_sizes: [16, 16] - istft_params: - n_fft: 16 - hop_len: 4 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - source_resblock_kernel_sizes: [7, 11] - source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] - lrelu_slope: 0.1 - audio_limit: 0.99 - f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor - num_class: 1 - in_channels: 80 - cond_channels: 512 - -# processor functions -parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe - multilingual: True - num_languages: 100 - language: 'en' - task: 'transcribe' -allowed_special: 'all' -tokenize: !name:cosyvoice.dataset.processor.tokenize - get_tokenizer: !ref - allowed_special: !ref -filter: !name:cosyvoice.dataset.processor.filter - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 -resample: !name:cosyvoice.dataset.processor.resample - resample_rate: !ref -feat_extractor: !name:matcha.utils.audio.mel_spectrogram - n_fft: 1024 - num_mels: 80 - sampling_rate: !ref - hop_size: 256 - win_size: 1024 - fmin: 0 - fmax: 8000 - center: False -compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank - feat_extractor: !ref -parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding - normalize: True -shuffle: !name:cosyvoice.dataset.processor.shuffle - shuffle_size: 1000 -sort: !name:cosyvoice.dataset.processor.sort - sort_size: 500 # sort_size should be less than shuffle_size -batch: !name:cosyvoice.dataset.processor.batch - batch_type: 'dynamic' - max_frames_in_batch: 12000 -padding: !name:cosyvoice.dataset.processor.padding - use_spk_embedding: False # change to True during sft - -# dataset processor pipeline -data_pipeline: [ - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , -] - -# train conf -train_conf: - optim: adam - optim_conf: - lr: 0.002 # change to 0.001 if you want to train flow from scratch - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - max_epoch: 200 - grad_clip: 5 - accum_grad: 2 - log_interval: 100 - save_per_step: -1 \ No newline at end of file diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml deleted file mode 100644 index b2ff51c..0000000 --- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml +++ /dev/null @@ -1,203 +0,0 @@ -# set random seed, so that you may reproduce your result. -__set_seed1: !apply:random.seed [1986] -__set_seed2: !apply:numpy.random.seed [1986] -__set_seed3: !apply:torch.manual_seed [1986] -__set_seed4: !apply:torch.cuda.manual_seed_all [1986] - -# fixed params -sample_rate: 22050 -text_encoder_input_size: 512 -llm_input_size: 1024 -llm_output_size: 1024 -spk_embed_dim: 192 - -# model params -# for all class/function included in this repo, we use ! or ! for intialization, so that user may find all corresponding class/function according to one single yaml. -# for system/third_party class/function, we do not require this. -llm: !new:cosyvoice.llm.llm.TransformerLM - text_encoder_input_size: !ref - llm_input_size: !ref - llm_output_size: !ref - text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe - speech_token_size: 4096 - length_normalized_loss: True - lsm_weight: 0 - spk_embed_dim: !ref - text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder - input_size: !ref - output_size: 1024 - attention_heads: 16 - linear_units: 4096 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - normalize_before: True - input_layer: 'linear' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - use_cnn_module: False - macaron_style: False - use_dynamic_chunk: False - use_dynamic_left_chunk: False - static_chunk_size: 1 - llm: !new:cosyvoice.transformer.encoder.TransformerEncoder - input_size: !ref - output_size: !ref - attention_heads: 16 - linear_units: 4096 - num_blocks: 14 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: 'linear_legacy' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - static_chunk_size: 1 - sampling: !name:cosyvoice.utils.common.ras_sampling - top_p: 0.8 - top_k: 25 - win_size: 10 - tau_r: 0.1 - -flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec - input_size: 512 - output_size: 80 - spk_embed_dim: !ref - output_type: 'mel' - vocab_size: 4096 - input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe - only_mask_loss: True - encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder - output_size: 512 - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - normalize_before: True - input_layer: 'linear' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - input_size: 512 - use_cnn_module: False - macaron_style: False - length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator - channels: 80 - sampling_ratios: [1, 1, 1, 1] - decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM - in_channels: 240 - n_spks: 1 - spk_emb_dim: 80 - cfm_params: !new:omegaconf.DictConfig - content: - sigma_min: 1e-06 - solver: 'euler' - t_scheduler: 'cosine' - training_cfg_rate: 0.2 - inference_cfg_rate: 0.7 - reg_loss_type: 'l1' - estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder - in_channels: 320 - out_channels: 80 - channels: [256, 256] - dropout: 0.0 - attention_head_dim: 64 - n_blocks: 4 - num_mid_blocks: 12 - num_heads: 8 - act_fn: 'gelu' - -hift: !new:cosyvoice.hifigan.generator.HiFTGenerator - in_channels: 80 - base_channels: 512 - nb_harmonics: 8 - sampling_rate: !ref - nsf_alpha: 0.1 - nsf_sigma: 0.003 - nsf_voiced_threshold: 10 - upsample_rates: [8, 8] - upsample_kernel_sizes: [16, 16] - istft_params: - n_fft: 16 - hop_len: 4 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - source_resblock_kernel_sizes: [7, 11] - source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] - lrelu_slope: 0.1 - audio_limit: 0.99 - f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor - num_class: 1 - in_channels: 80 - cond_channels: 512 - -# processor functions -parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe - multilingual: True - num_languages: 100 - language: 'en' - task: 'transcribe' -allowed_special: 'all' -tokenize: !name:cosyvoice.dataset.processor.tokenize - get_tokenizer: !ref - allowed_special: !ref -filter: !name:cosyvoice.dataset.processor.filter - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 -resample: !name:cosyvoice.dataset.processor.resample - resample_rate: !ref -feat_extractor: !name:matcha.utils.audio.mel_spectrogram - n_fft: 1024 - num_mels: 80 - sampling_rate: !ref - hop_size: 256 - win_size: 1024 - fmin: 0 - fmax: 8000 - center: False -compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank - feat_extractor: !ref -parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding - normalize: True -shuffle: !name:cosyvoice.dataset.processor.shuffle - shuffle_size: 1000 -sort: !name:cosyvoice.dataset.processor.sort - sort_size: 500 # sort_size should be less than shuffle_size -batch: !name:cosyvoice.dataset.processor.batch - batch_type: 'dynamic' - max_frames_in_batch: 2000 -padding: !name:cosyvoice.dataset.processor.padding - use_spk_embedding: False # change to True during sft - -# dataset processor pipeline -data_pipeline: [ - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , -] - -# train conf -train_conf: - optim: adam - optim_conf: - lr: 0.001 # change to 1e-5 during sft - scheduler: warmuplr # change to constantlr during sft - scheduler_conf: - warmup_steps: 2500 - max_epoch: 200 - grad_clip: 5 - accum_grad: 2 - log_interval: 100 - save_per_step: -1 \ No newline at end of file diff --git a/examples/magicdata-read/cosyvoice/conf/ds_stage2.json b/examples/magicdata-read/cosyvoice/conf/ds_stage2.json deleted file mode 100644 index 2b2de3d..0000000 --- a/examples/magicdata-read/cosyvoice/conf/ds_stage2.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "train_micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": 1, - "steps_per_print": 100, - "gradient_clipping": 5, - "fp16": { - "enabled": false, - "auto_cast": false, - "loss_scale": 0, - "initial_scale_power": 16, - "loss_scale_window": 256, - "hysteresis": 2, - "consecutive_hysteresis": false, - "min_loss_scale": 1 - }, - "bf16": { - "enabled": false - }, - "zero_force_ds_cpu_optimizer": false, - "zero_optimization": { - "stage": 2, - "offload_optimizer": { - "device": "none", - "pin_memory": true - }, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": false, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients" : true - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 0.001, - "weight_decay": 0.0001, - "torch_adam": true, - "adam_w_mode": true - } - } -} \ No newline at end of file diff --git a/examples/magicdata-read/cosyvoice/run.sh b/examples/magicdata-read/cosyvoice/run.sh index 1af1a28..888ac7b 100644 --- a/examples/magicdata-read/cosyvoice/run.sh +++ b/examples/magicdata-read/cosyvoice/run.sh @@ -83,7 +83,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then fi cp data/train/parquet/data.list data/train.data.list cp data/dev/parquet/data.list data/dev.data.list - for model in llm flow; do + for model in llm flow hifigan; do torchrun --nnodes=1 --nproc_per_node=$num_gpus \ --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \ cosyvoice/bin/train.py \ @@ -99,11 +99,26 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --num_workers ${num_workers} \ --prefetch ${prefetch} \ --pin_memory \ + --use_amp \ --deepspeed_config ./conf/ds_stage2.json \ --deepspeed.save_states model+optimizer done fi +# average model +average_num=5 +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + for model in llm flow hifigan; do + decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt + echo "do model average and final checkpoint is $decode_checkpoint" + python cosyvoice/bin/average_model.py \ + --dst_model $decode_checkpoint \ + --src_path `pwd`/exp/cosyvoice/$model/$train_engine \ + --num ${average_num} \ + --val_best + done +fi + if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir" python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir