mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-05 18:09:24 +08:00
update hifigan
This commit is contained in:
@@ -18,6 +18,7 @@ import datetime
|
|||||||
import logging
|
import logging
|
||||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
import os
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
import deepspeed
|
import deepspeed
|
||||||
@@ -112,7 +113,10 @@ def main():
|
|||||||
# load checkpoint
|
# load checkpoint
|
||||||
model = configs[args.model]
|
model = configs[args.model]
|
||||||
if args.checkpoint is not None:
|
if args.checkpoint is not None:
|
||||||
model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'), strict=False)
|
if os.path.exists(args.checkpoint):
|
||||||
|
model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'), strict=False)
|
||||||
|
else:
|
||||||
|
logging.warning('checkpoint {} do not exsist!'.format(args.checkpoint))
|
||||||
|
|
||||||
# Dispatch model from cpu to gpu
|
# Dispatch model from cpu to gpu
|
||||||
model = wrap_cuda_model(args, model)
|
model = wrap_cuda_model(args, model)
|
||||||
@@ -125,7 +129,7 @@ def main():
|
|||||||
save_model(model, 'init', info_dict)
|
save_model(model, 'init', info_dict)
|
||||||
|
|
||||||
# Get executor
|
# Get executor
|
||||||
executor = Executor()
|
executor = Executor(gan=gan)
|
||||||
|
|
||||||
# Start training loop
|
# Start training loop
|
||||||
for epoch in range(info_dict['max_epoch']):
|
for epoch in range(info_dict['max_epoch']):
|
||||||
|
|||||||
@@ -393,8 +393,6 @@ def padding(data, use_spk_embedding, mode='train', gan=False):
|
|||||||
"speech_token_len": speech_token_len,
|
"speech_token_len": speech_token_len,
|
||||||
"speech_feat": speech_feat,
|
"speech_feat": speech_feat,
|
||||||
"speech_feat_len": speech_feat_len,
|
"speech_feat_len": speech_feat_len,
|
||||||
"pitch_feat": pitch_feat,
|
|
||||||
"pitch_feat_len": pitch_feat_len,
|
|
||||||
"text": text,
|
"text": text,
|
||||||
"text_token": text_token,
|
"text_token": text_token,
|
||||||
"text_token_len": text_token_len,
|
"text_token_len": text_token_len,
|
||||||
|
|||||||
@@ -133,6 +133,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|||||||
in_channels: 80
|
in_channels: 80
|
||||||
cond_channels: 512
|
cond_channels: 512
|
||||||
|
|
||||||
|
# gan related module
|
||||||
|
mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
|
||||||
|
n_fft: 1024
|
||||||
|
num_mels: 80
|
||||||
|
sampling_rate: !ref <sample_rate>
|
||||||
|
hop_size: 256
|
||||||
|
win_size: 1024
|
||||||
|
fmin: 0
|
||||||
|
fmax: 8000
|
||||||
|
center: False
|
||||||
|
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
|
||||||
|
generator: !ref <hift>
|
||||||
|
discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
|
||||||
|
mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
|
||||||
|
mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
|
||||||
|
mel_spec_transform: [
|
||||||
|
!ref <mel_spec_transform1>
|
||||||
|
]
|
||||||
|
|
||||||
# processor functions
|
# processor functions
|
||||||
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
||||||
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
|
||||||
@@ -151,6 +170,8 @@ filter: !name:cosyvoice.dataset.processor.filter
|
|||||||
token_min_length: 1
|
token_min_length: 1
|
||||||
resample: !name:cosyvoice.dataset.processor.resample
|
resample: !name:cosyvoice.dataset.processor.resample
|
||||||
resample_rate: !ref <sample_rate>
|
resample_rate: !ref <sample_rate>
|
||||||
|
truncate: !name:cosyvoice.dataset.processor.truncate
|
||||||
|
truncate_length: 24576 # must be a multiplier of hop_size
|
||||||
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
||||||
n_fft: 1024
|
n_fft: 1024
|
||||||
num_mels: 80
|
num_mels: 80
|
||||||
@@ -162,6 +183,12 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
|||||||
center: False
|
center: False
|
||||||
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
|
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
|
||||||
feat_extractor: !ref <feat_extractor>
|
feat_extractor: !ref <feat_extractor>
|
||||||
|
pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
|
||||||
|
sample_rate: !ref <sample_rate>
|
||||||
|
frame_length: 46.4 # match feat_extractor win_size/sampling_rate
|
||||||
|
frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
|
||||||
|
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
|
||||||
|
pitch_extractor: !ref <pitch_extractor>
|
||||||
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
|
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
|
||||||
normalize: True
|
normalize: True
|
||||||
shuffle: !name:cosyvoice.dataset.processor.shuffle
|
shuffle: !name:cosyvoice.dataset.processor.shuffle
|
||||||
@@ -187,8 +214,22 @@ data_pipeline: [
|
|||||||
!ref <batch>,
|
!ref <batch>,
|
||||||
!ref <padding>,
|
!ref <padding>,
|
||||||
]
|
]
|
||||||
|
data_pipeline_gan: [
|
||||||
|
!ref <parquet_opener>,
|
||||||
|
!ref <tokenize>,
|
||||||
|
!ref <filter>,
|
||||||
|
!ref <resample>,
|
||||||
|
!ref <truncate>,
|
||||||
|
!ref <compute_fbank>,
|
||||||
|
!ref <compute_f0>,
|
||||||
|
!ref <parse_embedding>,
|
||||||
|
!ref <shuffle>,
|
||||||
|
!ref <sort>,
|
||||||
|
!ref <batch>,
|
||||||
|
!ref <padding>,
|
||||||
|
]
|
||||||
|
|
||||||
# train conf
|
# llm flow train conf
|
||||||
train_conf:
|
train_conf:
|
||||||
optim: adam
|
optim: adam
|
||||||
optim_conf:
|
optim_conf:
|
||||||
@@ -201,3 +242,19 @@ train_conf:
|
|||||||
accum_grad: 2
|
accum_grad: 2
|
||||||
log_interval: 100
|
log_interval: 100
|
||||||
save_per_step: -1
|
save_per_step: -1
|
||||||
|
|
||||||
|
# gan train conf
|
||||||
|
train_conf_gan:
|
||||||
|
optim: adam
|
||||||
|
optim_conf:
|
||||||
|
lr: 0.0002 # use small lr for gan training
|
||||||
|
scheduler: constantlr
|
||||||
|
optim_d: adam
|
||||||
|
optim_conf_d:
|
||||||
|
lr: 0.0002 # use small lr for gan training
|
||||||
|
scheduler_d: constantlr
|
||||||
|
max_epoch: 200
|
||||||
|
grad_clip: 5
|
||||||
|
accum_grad: 1 # in gan training, accum_grad must be 1
|
||||||
|
log_interval: 100
|
||||||
|
save_per_step: -1
|
||||||
Reference in New Issue
Block a user