diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py index 59ebd48..6891b33 100644 --- a/cosyvoice/llm/llm.py +++ b/cosyvoice/llm/llm.py @@ -1,5 +1,5 @@ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) -# 2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li, Qihua) +# 2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li, Qihua, Shengqiang Li) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -420,8 +420,8 @@ class Qwen2LM(TransformerLM): rejected_lm_mask = rejected_lm_target == IGNORE_ID chosen_logps = torch.gather(chosen_logits.log_softmax(dim=-1), dim=2, index=chosen_lm_target.masked_fill(chosen_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1) rejected_logps = torch.gather(rejected_logits.log_softmax(dim=-1), dim=2, index=rejected_lm_target.masked_fill(rejected_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1) - chosen_logps = (chosen_logps * chosen_lm_mask).mean(dim=-1) - rejected_logps = (rejected_logps * chosen_lm_mask).mean(dim=-1) + chosen_logps = (chosen_logps * chosen_lm_mask).sum(dim=-1) / chosen_lm_mask.sum(dim=-1) + rejected_logps = (rejected_logps * rejected_lm_mask).sum(dim=-1) / rejected_lm_mask.sum(dim=-1) return {'loss': loss, 'acc': acc, 'chosen_logps': chosen_logps, 'rejected_logps': rejected_logps} @torch.inference_mode() diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml deleted file mode 100644 index 4feb14c..0000000 --- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml +++ /dev/null @@ -1,257 +0,0 @@ -# set random seed, so that you may reproduce your result. -__set_seed1: !apply:random.seed [1986] -__set_seed2: !apply:numpy.random.seed [1986] -__set_seed3: !apply:torch.manual_seed [1986] -__set_seed4: !apply:torch.cuda.manual_seed_all [1986] - -# fixed params -sample_rate: 22050 -text_encoder_input_size: 512 -llm_input_size: 1024 -llm_output_size: 1024 -spk_embed_dim: 192 - -# model params -# for all class/function included in this repo, we use ! or ! for intialization, so that user may find all corresponding class/function according to one single yaml. -# for system/third_party class/function, we do not require this. -llm: !new:cosyvoice.llm.llm.TransformerLM - text_encoder_input_size: !ref - llm_input_size: !ref - llm_output_size: !ref - text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe - speech_token_size: 4096 - length_normalized_loss: True - lsm_weight: 0 - spk_embed_dim: !ref - text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder - input_size: !ref - output_size: 1024 - attention_heads: 8 - linear_units: 2048 - num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - normalize_before: True - input_layer: 'linear' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - use_cnn_module: False - macaron_style: False - use_dynamic_chunk: False - use_dynamic_left_chunk: False - static_chunk_size: 1 - llm: !new:cosyvoice.transformer.encoder.TransformerEncoder - input_size: !ref - output_size: !ref - attention_heads: 8 - linear_units: 2048 - num_blocks: 7 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: 'linear_legacy' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - static_chunk_size: 1 - sampling: !name:cosyvoice.utils.common.ras_sampling - top_p: 0.8 - top_k: 25 - win_size: 10 - tau_r: 0.1 - -flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec - input_size: 512 - output_size: 80 - spk_embed_dim: !ref - output_type: 'mel' - vocab_size: 4096 - input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe - only_mask_loss: True - encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder - output_size: 512 - attention_heads: 4 - linear_units: 1024 - num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - normalize_before: True - input_layer: 'linear' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - input_size: 512 - use_cnn_module: False - macaron_style: False - length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator - channels: 80 - sampling_ratios: [1, 1, 1, 1] - decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM - in_channels: 240 - n_spks: 1 - spk_emb_dim: 80 - cfm_params: !new:omegaconf.DictConfig - content: - sigma_min: 1e-06 - solver: 'euler' - t_scheduler: 'cosine' - training_cfg_rate: 0.2 - inference_cfg_rate: 0.7 - reg_loss_type: 'l1' - estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder - in_channels: 320 - out_channels: 80 - channels: [256, 256] - dropout: 0.0 - attention_head_dim: 64 - n_blocks: 4 - num_mid_blocks: 8 - num_heads: 8 - act_fn: 'gelu' - -hift: !new:cosyvoice.hifigan.generator.HiFTGenerator - in_channels: 80 - base_channels: 512 - nb_harmonics: 8 - sampling_rate: !ref - nsf_alpha: 0.1 - nsf_sigma: 0.003 - nsf_voiced_threshold: 10 - upsample_rates: [8, 8] - upsample_kernel_sizes: [16, 16] - istft_params: - n_fft: 16 - hop_len: 4 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - source_resblock_kernel_sizes: [7, 11] - source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] - lrelu_slope: 0.1 - audio_limit: 0.99 - f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor - num_class: 1 - in_channels: 80 - cond_channels: 512 - -# gan related module -mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram - n_fft: 1024 - num_mels: 80 - sampling_rate: !ref - hop_size: 256 - win_size: 1024 - fmin: 0 - fmax: null - center: False -hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan - generator: !ref - discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator - mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator - mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator - mel_spec_transform: [ - !ref - ] - -# processor functions -parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe - multilingual: True - num_languages: 100 - language: 'en' - task: 'transcribe' -allowed_special: 'all' -tokenize: !name:cosyvoice.dataset.processor.tokenize - get_tokenizer: !ref - allowed_special: !ref -filter: !name:cosyvoice.dataset.processor.filter - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 -resample: !name:cosyvoice.dataset.processor.resample - resample_rate: !ref -truncate: !name:cosyvoice.dataset.processor.truncate - truncate_length: 24576 # must be a multiplier of hop_size -feat_extractor: !name:matcha.utils.audio.mel_spectrogram - n_fft: 1024 - num_mels: 80 - sampling_rate: !ref - hop_size: 256 - win_size: 1024 - fmin: 0 - fmax: 8000 - center: False -compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank - feat_extractor: !ref -compute_f0: !name:cosyvoice.dataset.processor.compute_f0 - sample_rate: !ref - hop_size: 256 -parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding - normalize: True -shuffle: !name:cosyvoice.dataset.processor.shuffle - shuffle_size: 1000 -sort: !name:cosyvoice.dataset.processor.sort - sort_size: 500 # sort_size should be less than shuffle_size -batch: !name:cosyvoice.dataset.processor.batch - batch_type: 'dynamic' - max_frames_in_batch: 12000 -padding: !name:cosyvoice.dataset.processor.padding - use_spk_embedding: False # change to True during sft - -# dataset processor pipeline -data_pipeline: [ - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , -] -data_pipeline_gan: [ - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , -] - -# llm flow train conf -train_conf: - optim: adam - optim_conf: - lr: 0.002 # change to 0.001 if you want to train flow from scratch - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - max_epoch: 200 - grad_clip: 5 - accum_grad: 2 - log_interval: 100 - save_per_step: -1 - -# gan train conf -train_conf_gan: - optim: adam - optim_conf: - lr: 0.0002 # use small lr for gan training - scheduler: constantlr - optim_d: adam - optim_conf_d: - lr: 0.0002 # use small lr for gan training - scheduler_d: constantlr - max_epoch: 200 - grad_clip: 5 - accum_grad: 1 # in gan training, accum_grad must be 1 - log_interval: 100 - save_per_step: -1 \ No newline at end of file diff --git a/examples/libritts/cosyvoice/conf/cosyvoice_dpo.yaml b/examples/libritts/cosyvoice/conf/cosyvoice_dpo.yaml deleted file mode 100644 index d811026..0000000 --- a/examples/libritts/cosyvoice/conf/cosyvoice_dpo.yaml +++ /dev/null @@ -1,226 +0,0 @@ -# set random seed, so that you may reproduce your result. -__set_seed1: !apply:random.seed [1986] -__set_seed2: !apply:numpy.random.seed [1986] -__set_seed3: !apply:torch.manual_seed [1986] -__set_seed4: !apply:torch.cuda.manual_seed_all [1986] - -# fixed params -sample_rate: 24000 # 16000 for llm, 24000 for cfm -llm_input_size: 896 -llm_output_size: 896 -spk_embed_dim: 192 -qwen_pretrain_path: 'CosyVoice2-0.5B/CosyVoice-BlankEN' - -# model params -# for all class/function included in this repo, we use ! or ! for intialization, so that user may find all corresponding class/function according to one single yaml. -# for system/third_party class/function, we do not require this. -llm: !new:cosyvoice.llm.llm_dpo.Qwen2LM - llm_input_size: !ref - llm_output_size: !ref - speech_token_size: 6561 - length_normalized_loss: True - lsm_weight: 0 - dpo: True - llm: !new:cosyvoice.llm.llm.Qwen2Encoder - pretrain_path: !ref - sampling: !name:cosyvoice.utils.common.ras_sampling - top_p: 0.8 - top_k: 25 - win_size: 10 - tau_r: 0.1 -flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec - input_size: 512 - output_size: 80 - spk_embed_dim: !ref - output_type: 'mel' - vocab_size: 6561 - input_frame_rate: 25 - only_mask_loss: True - token_mel_ratio: 2 - pre_lookahead_len: 3 - encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder - output_size: 512 - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - normalize_before: True - input_layer: 'linear' - pos_enc_layer_type: 'rel_pos_espnet' - selfattention_layer_type: 'rel_selfattn' - input_size: 512 - use_cnn_module: False - macaron_style: False - decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM - in_channels: 240 - n_spks: 1 - spk_emb_dim: 80 - cfm_params: !new:omegaconf.DictConfig - content: - sigma_min: 1e-06 - solver: 'euler' - t_scheduler: 'cosine' - training_cfg_rate: 0.2 - inference_cfg_rate: 0.7 - reg_loss_type: 'l1' - estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder - in_channels: 320 - out_channels: 80 - causal: True - channels: [256] - dropout: 0.0 - attention_head_dim: 64 - n_blocks: 4 - num_mid_blocks: 12 - num_heads: 8 - act_fn: 'gelu' - -hift: !new:cosyvoice.hifigan.generator.HiFTGenerator - in_channels: 80 - base_channels: 512 - nb_harmonics: 8 - sampling_rate: !ref - nsf_alpha: 0.1 - nsf_sigma: 0.003 - nsf_voiced_threshold: 10 - upsample_rates: [8, 5, 3] - upsample_kernel_sizes: [16, 11, 7] - istft_params: - n_fft: 16 - hop_len: 4 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - source_resblock_kernel_sizes: [7, 7, 11] - source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - lrelu_slope: 0.1 - audio_limit: 0.99 - f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor - num_class: 1 - in_channels: 80 - cond_channels: 512 - -# gan related module -mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram - n_fft: 1024 - num_mels: 80 - sampling_rate: !ref - hop_size: 256 - win_size: 1024 - fmin: 0 - fmax: null - center: False -hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan - generator: !ref - discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator - mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator - mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator - mel_spec_transform: [ - !ref - ] - -# processor functions -parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe - multilingual: True - num_languages: 100 - language: 'en' - task: 'transcribe' -allowed_special: 'all' -tokenize: !name:cosyvoice.dataset.processor.tokenize - get_tokenizer: !ref - allowed_special: !ref -filter: !name:cosyvoice.dataset.processor.filter - max_length: 40960 - min_length: 0 - token_max_length: 200 - token_min_length: 1 -resample: !name:cosyvoice.dataset.processor.resample - resample_rate: !ref -truncate: !name:cosyvoice.dataset.processor.truncate - truncate_length: 24576 # must be a multiplier of hop_size -feat_extractor: !name:matcha.utils.audio.mel_spectrogram - n_fft: 1024 - num_mels: 80 - sampling_rate: !ref - hop_size: 256 - win_size: 1024 - fmin: 0 - fmax: 8000 - center: False -compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank - feat_extractor: !ref -compute_f0: !name:cosyvoice.dataset.processor.compute_f0 - sample_rate: !ref - hop_size: 256 -parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding - normalize: True -shuffle: !name:cosyvoice.dataset.processor.shuffle - shuffle_size: 1000 -sort: !name:cosyvoice.dataset.processor.sort - sort_size: 500 # sort_size should be less than shuffle_size -batch: !name:cosyvoice.dataset.processor.batch - batch_type: 'dynamic' - max_frames_in_batch: 2000 # change to 1400 in gan train on v100 16g -padding: !name:cosyvoice.dataset.processor.padding - use_spk_embedding: True # change to True during sft - dpo: True - -# dataset processor pipeline -data_pipeline: [ - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , -] -data_pipeline_gan: [ - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , -] - -# llm flow train conf -train_conf: - optim: adam - optim_conf: - lr: 0.00001 # change to 1e-5 during sft - scheduler: warmuplr # change to constantlr during sft - scheduler_conf: - warmup_steps: 25000 - max_epoch: 200 - grad_clip: 5 - accum_grad: 2 - log_interval: 100 - save_per_step: -1 - -# gan train conf -train_conf_gan: - optim: adam - optim_conf: - lr: 0.0002 # use small lr for gan training - scheduler: constantlr - optim_d: adam - optim_conf_d: - lr: 0.0002 # use small lr for gan training - scheduler_d: constantlr - max_epoch: 200 - grad_clip: 5 - accum_grad: 1 # in gan training, accum_grad must be 1 - log_interval: 100 - save_per_step: -1 \ No newline at end of file diff --git a/examples/libritts/cosyvoice/run.sh b/examples/libritts/cosyvoice/run.sh index 1508410..b95a294 100644 --- a/examples/libritts/cosyvoice/run.sh +++ b/examples/libritts/cosyvoice/run.sh @@ -60,7 +60,7 @@ num_workers=2 prefetch=100 train_engine=torch_ddp if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml" + echo "Run train. We only support llm traning for now" if [ $train_engine == 'deepspeed' ]; then echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary" fi diff --git a/examples/libritts/cosyvoice2/run.sh b/examples/libritts/cosyvoice2/run.sh index 48c0289..ad59c0a 100644 --- a/examples/libritts/cosyvoice2/run.sh +++ b/examples/libritts/cosyvoice2/run.sh @@ -60,7 +60,7 @@ num_workers=2 prefetch=100 train_engine=torch_ddp if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml" + echo "Run train. We only support llm traning for now" if [ $train_engine == 'deepspeed' ]; then echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary" fi diff --git a/examples/libritts/cosyvoice2/run_dpo.sh b/examples/libritts/cosyvoice2/run_dpo.sh index 6b46929..1367e45 100644 --- a/examples/libritts/cosyvoice2/run_dpo.sh +++ b/examples/libritts/cosyvoice2/run_dpo.sh @@ -70,7 +70,7 @@ num_workers=2 prefetch=100 train_engine=torch_ddp if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml" + echo "Run train. We only support llm traning for now" if [ $train_engine == 'deepspeed' ]; then echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary" fi