add hifigan train

2026-02-04 09:29:25 +08:00 · 2024-10-16 11:37:32 +08:00
parent cb200b21c5
commit 789ee9e5e7
13 changed files with 314 additions and 477 deletions
--- a/examples/libritts/cosyvoice/conf/cosyvoice.hifigan.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.hifigan.yaml
@@ -1,141 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-
-# fixed params
-sample_rate: 22050
-text_encoder_input_size: 512
-llm_input_size: 1024
-llm_output_size: 1024
-spk_embed_dim: 192
-
-# model params
-# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
-# for system/third_party class/function, we do not require this.
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 8]
-    upsample_kernel_sizes: [16, 16]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-
-mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
-    generator: !ref <hift>
-    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
-        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
-        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
-    mel_spec_transform: [
-        !ref <mel_spec_transform1>
-    ]
-
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
-    multilingual: True
-    num_languages: 100
-    language: 'en'
-    task: 'transcribe'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: 'all'
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 0
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-truncate: !name:cosyvoice.dataset.processor.truncate
-    truncate_length: 24576 # must be a multiplier of hop_size
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
-    sample_rate: !ref <sample_rate>
-    frame_length: 46.4 # match feat_extractor win_size/sampling_rate
-    frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
-compute_f0: !name:cosyvoice.dataset.processor.compute_f0
-    pitch_extractor: !ref <pitch_extractor>
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500  # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 1200
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
-
-# dataset processor pipeline
-data_pipeline: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <truncate>,
-    !ref <compute_fbank>,
-    !ref <compute_f0>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-
-# train conf
-train_conf:
-    optim: adam
-    optim_conf:
-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
-    scheduler: warmuplr
-    scheduler_conf:
-        warmup_steps: 25000
-    optim_d: adam
-    optim_conf_d:
-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
-    scheduler_d: warmuplr
-    scheduler_conf_d:
-        warmup_steps: 25000
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 2
-    log_interval: 100
-    save_per_step: -1
--- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
@@ -133,6 +133,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
        in_channels: 80
        cond_channels: 512

+# gan related module
+mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 8000
+    center: False
+hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+    generator: !ref <hift>
+    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
+    mel_spec_transform: [
+        !ref <mel_spec_transform1>
+    ]
+
 # processor functions
 parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
@@ -151,6 +170,8 @@ filter: !name:cosyvoice.dataset.processor.filter
    token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
    resample_rate: !ref <sample_rate>
+truncate: !name:cosyvoice.dataset.processor.truncate
+    truncate_length: 24576 # must be a multiple of hop_size
 feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
@@ -162,6 +183,12 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
+pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
+    sample_rate: !ref <sample_rate>
+    frame_length: 46.4 # in ms, matches feat_extractor win_size/sampling_rate
+    frame_shift: 11.6 # in ms, matches feat_extractor hop_size/sampling_rate
+compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+    pitch_extractor: !ref <pitch_extractor>
 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
    normalize: True
 shuffle: !name:cosyvoice.dataset.processor.shuffle
@@ -187,8 +214,22 @@ data_pipeline: [
    !ref <batch>,
    !ref <padding>,
 ]
+data_pipeline_gan: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <truncate>,
+    !ref <compute_fbank>,
+    !ref <compute_f0>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]

-# train conf
+# llm/flow train conf
 train_conf:
    optim: adam
    optim_conf:
@@ -200,4 +241,20 @@ train_conf:
    grad_clip: 5
    accum_grad: 2
    log_interval: 100
+    save_per_step: -1
+
+# gan train conf
+train_conf_gan:
+    optim: adam
+    optim_conf:
+        lr: 0.0002 # use small lr for gan training
+    scheduler: constantlr
+    optim_d: adam
+    optim_conf_d:
+        lr: 0.0002 # use small lr for gan training
+    scheduler_d: constantlr
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 1 # in gan training, accum_grad must be 1
+    log_interval: 100
    save_per_step: -1
--- a/examples/libritts/cosyvoice/run.sh
+++ b/examples/libritts/cosyvoice/run.sh
@@ -83,9 +83,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  fi
  cat data/{train-clean-100,train-clean-360,train-other-500}/parquet/data.list > data/train.data.list
  cat data/{dev-clean,dev-other}/parquet/data.list > data/dev.data.list
-  for model in llm flow; do
+  for model in llm flow hifigan; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
-        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
      cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice.yaml \
@@ -99,12 +99,27 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
      --num_workers ${num_workers} \
      --prefetch ${prefetch} \
      --pin_memory \
+      --timeout 300 \
      --deepspeed_config ./conf/ds_stage2.json \
      --deepspeed.save_states model+optimizer
  done
 fi

+# average model
+average_num=5
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+  for model in llm flow hifigan; do
+    decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt
+    echo "do model average and final checkpoint is $decode_checkpoint"
+    python cosyvoice/bin/average_model.py \
+      --dst_model $decode_checkpoint \
+      --src_path `pwd`/exp/cosyvoice/$model/$train_engine  \
+      --num ${average_num} \
+      --val_best
+  done
+fi
+
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
  echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir"
  python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir
  python cosyvoice/bin/export_onnx.py --model_dir $pretrained_model_dir