fix dpo bug

2026-02-04 09:29:25 +08:00 · 2025-07-16 20:09:10 +08:00
parent 8555549e88
commit b048a2d6db
6 changed files with 6 additions and 489 deletions
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -1,5 +1,5 @@
 # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#               2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li, Qihua)
+#               2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li, Qihua, Shengqiang Li)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -420,8 +420,8 @@ class Qwen2LM(TransformerLM):
        rejected_lm_mask = rejected_lm_target == IGNORE_ID
        chosen_logps = torch.gather(chosen_logits.log_softmax(dim=-1), dim=2, index=chosen_lm_target.masked_fill(chosen_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1)
        rejected_logps = torch.gather(rejected_logits.log_softmax(dim=-1), dim=2, index=rejected_lm_target.masked_fill(rejected_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1)
-        chosen_logps = (chosen_logps * chosen_lm_mask).mean(dim=-1)
-        rejected_logps = (rejected_logps * chosen_lm_mask).mean(dim=-1)
+        chosen_logps = (chosen_logps * chosen_lm_mask).sum(dim=-1) / chosen_lm_mask.sum(dim=-1)
+        rejected_logps = (rejected_logps * rejected_lm_mask).sum(dim=-1) / rejected_lm_mask.sum(dim=-1)
        return {'loss': loss, 'acc': acc, 'chosen_logps': chosen_logps, 'rejected_logps': rejected_logps}

    @torch.inference_mode()
--- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
@@ -1,257 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-
-# fixed params
-sample_rate: 22050
-text_encoder_input_size: 512
-llm_input_size: 1024
-llm_output_size: 1024
-spk_embed_dim: 192
-
-# model params
-# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
-# for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm.TransformerLM
-    text_encoder_input_size: !ref <text_encoder_input_size>
-    llm_input_size: !ref <llm_input_size>
-    llm_output_size: !ref <llm_output_size>
-    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
-    speech_token_size: 4096
-    length_normalized_loss: True
-    lsm_weight: 0
-    spk_embed_dim: !ref <spk_embed_dim>
-    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        input_size: !ref <text_encoder_input_size>
-        output_size: 1024
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 3
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        use_cnn_module: False
-        macaron_style: False
-        use_dynamic_chunk: False
-        use_dynamic_left_chunk: False
-        static_chunk_size: 1
-    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
-        input_size: !ref <llm_input_size>
-        output_size: !ref <llm_output_size>
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 7
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: 'linear_legacy'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        static_chunk_size: 1
-    sampling: !name:cosyvoice.utils.common.ras_sampling
-        top_p: 0.8
-        top_k: 25
-        win_size: 10
-        tau_r: 0.1
-
-flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
-    input_size: 512
-    output_size: 80
-    spk_embed_dim: !ref <spk_embed_dim>
-    output_type: 'mel'
-    vocab_size: 4096
-    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
-    only_mask_loss: True
-    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
-        output_size: 512
-        attention_heads: 4
-        linear_units: 1024
-        num_blocks: 3
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.1
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        input_size: 512
-        use_cnn_module: False
-        macaron_style: False
-    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
-        channels: 80
-        sampling_ratios: [1, 1, 1, 1]
-    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
-        in_channels: 240
-        n_spks: 1
-        spk_emb_dim: 80
-        cfm_params: !new:omegaconf.DictConfig
-            content:
-                sigma_min: 1e-06
-                solver: 'euler'
-                t_scheduler: 'cosine'
-                training_cfg_rate: 0.2
-                inference_cfg_rate: 0.7
-                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
-            in_channels: 320
-            out_channels: 80
-            channels: [256, 256]
-            dropout: 0.0
-            attention_head_dim: 64
-            n_blocks: 4
-            num_mid_blocks: 8
-            num_heads: 8
-            act_fn: 'gelu'
-
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 8]
-    upsample_kernel_sizes: [16, 16]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-
-# gan related module
-mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: null
-    center: False
-hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
-    generator: !ref <hift>
-    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
-        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
-        mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
-    mel_spec_transform: [
-        !ref <mel_spec_transform1>
-    ]
-
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
-    multilingual: True
-    num_languages: 100
-    language: 'en'
-    task: 'transcribe'
-allowed_special: 'all'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: !ref <allowed_special>
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 0
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-truncate: !name:cosyvoice.dataset.processor.truncate
-    truncate_length: 24576 # must be a multiplier of hop_size
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-compute_f0: !name:cosyvoice.dataset.processor.compute_f0
-    sample_rate: !ref <sample_rate>
-    hop_size: 256
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500  # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 12000
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
-
-# dataset processor pipeline
-data_pipeline: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <compute_fbank>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-data_pipeline_gan: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <truncate>,
-    !ref <compute_fbank>,
-    !ref <compute_f0>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-
-# llm flow train conf
-train_conf:
-    optim: adam
-    optim_conf:
-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
-    scheduler: warmuplr
-    scheduler_conf:
-        warmup_steps: 25000
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 2
-    log_interval: 100
-    save_per_step: -1
-
-# gan train conf
-train_conf_gan:
-    optim: adam
-    optim_conf:
-        lr: 0.0002 # use small lr for gan training
-    scheduler: constantlr
-    optim_d: adam
-    optim_conf_d:
-        lr: 0.0002 # use small lr for gan training
-    scheduler_d: constantlr
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 1 # in gan training, accum_grad must be 1
-    log_interval: 100
-    save_per_step: -1
--- a/examples/libritts/cosyvoice/conf/cosyvoice_dpo.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice_dpo.yaml
@@ -1,226 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-
-# fixed params
-sample_rate: 24000   # 16000 for llm, 24000 for cfm
-llm_input_size: 896
-llm_output_size: 896
-spk_embed_dim: 192
-qwen_pretrain_path: 'CosyVoice2-0.5B/CosyVoice-BlankEN'
-
-# model params
-# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
-# for system/third_party class/function, we do not require this.
-llm: !new:cosyvoice.llm.llm_dpo.Qwen2LM
-    llm_input_size: !ref <llm_input_size>
-    llm_output_size: !ref <llm_output_size>
-    speech_token_size: 6561
-    length_normalized_loss: True
-    lsm_weight: 0
-    dpo: True
-    llm: !new:cosyvoice.llm.llm.Qwen2Encoder
-        pretrain_path: !ref <qwen_pretrain_path>
-    sampling: !name:cosyvoice.utils.common.ras_sampling
-        top_p: 0.8
-        top_k: 25
-        win_size: 10
-        tau_r: 0.1
-flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
-    input_size: 512
-    output_size: 80
-    spk_embed_dim: !ref <spk_embed_dim>
-    output_type: 'mel'
-    vocab_size: 6561
-    input_frame_rate: 25
-    only_mask_loss: True
-    token_mel_ratio: 2
-    pre_lookahead_len: 3
-    encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
-        output_size: 512
-        attention_heads: 8
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.1
-        normalize_before: True
-        input_layer: 'linear'
-        pos_enc_layer_type: 'rel_pos_espnet'
-        selfattention_layer_type: 'rel_selfattn'
-        input_size: 512
-        use_cnn_module: False
-        macaron_style: False
-    decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
-        in_channels: 240
-        n_spks: 1
-        spk_emb_dim: 80
-        cfm_params: !new:omegaconf.DictConfig
-            content:
-                sigma_min: 1e-06
-                solver: 'euler'
-                t_scheduler: 'cosine'
-                training_cfg_rate: 0.2
-                inference_cfg_rate: 0.7
-                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
-            in_channels: 320
-            out_channels: 80
-            causal: True
-            channels: [256]
-            dropout: 0.0
-            attention_head_dim: 64
-            n_blocks: 4
-            num_mid_blocks: 12
-            num_heads: 8
-            act_fn: 'gelu'
-
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 5, 3]
-    upsample_kernel_sizes: [16, 11, 7]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-
-# gan related module
-mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: null
-    center: False
-hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
-    generator: !ref <hift>
-    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
-        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
-        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
-    mel_spec_transform: [
-        !ref <mel_spec_transform1>
-    ]
-
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
-    multilingual: True
-    num_languages: 100
-    language: 'en'
-    task: 'transcribe'
-allowed_special: 'all'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: !ref <allowed_special>
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 0
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-truncate: !name:cosyvoice.dataset.processor.truncate
-    truncate_length: 24576 # must be a multiplier of hop_size
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-compute_f0: !name:cosyvoice.dataset.processor.compute_f0
-    sample_rate: !ref <sample_rate>
-    hop_size: 256
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500  # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 2000 # change to 1400 in gan train on v100 16g
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: True # change to True during sft
-    dpo: True
-
-# dataset processor pipeline
-data_pipeline: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <compute_fbank>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-data_pipeline_gan: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <truncate>,
-    !ref <compute_fbank>,
-    !ref <compute_f0>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-
-# llm flow train conf
-train_conf:
-    optim: adam
-    optim_conf:
-        lr: 0.00001 # change to 1e-5 during sft
-    scheduler: warmuplr # change to constantlr during sft
-    scheduler_conf:
-        warmup_steps: 25000
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 2
-    log_interval: 100
-    save_per_step: -1
-
-# gan train conf
-train_conf_gan:
-    optim: adam
-    optim_conf:
-        lr: 0.0002 # use small lr for gan training
-    scheduler: constantlr
-    optim_d: adam
-    optim_conf_d:
-        lr: 0.0002 # use small lr for gan training
-    scheduler_d: constantlr
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 1 # in gan training, accum_grad must be 1
-    log_interval: 100
-    save_per_step: -1
--- a/examples/libritts/cosyvoice/run.sh
+++ b/examples/libritts/cosyvoice/run.sh
@@ -60,7 +60,7 @@ num_workers=2
 prefetch=100
 train_engine=torch_ddp
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-  echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
+  echo "Run train. We only support llm traning for now"
  if [ $train_engine == 'deepspeed' ]; then
    echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
  fi
--- a/examples/libritts/cosyvoice2/run.sh
+++ b/examples/libritts/cosyvoice2/run.sh
@@ -60,7 +60,7 @@ num_workers=2
 prefetch=100
 train_engine=torch_ddp
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-  echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
+  echo "Run train. We only support llm traning for now"
  if [ $train_engine == 'deepspeed' ]; then
    echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
  fi
--- a/examples/libritts/cosyvoice2/run_dpo.sh
+++ b/examples/libritts/cosyvoice2/run_dpo.sh
@@ -70,7 +70,7 @@ num_workers=2
 prefetch=100
 train_engine=torch_ddp
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-  echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
+  echo "Run train. We only support llm traning for now"
  if [ $train_engine == 'deepspeed' ]; then
    echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
  fi