From 789ee9e5e777a6c9c30d84b5150c73c2cf2fe05d Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Wed, 16 Oct 2024 11:37:32 +0800
Subject: [PATCH] add hifigan train

---
 cosyvoice/bin/average_model.py                  |  93 ++++++++++++
 cosyvoice/bin/train.py                          |  17 ++-
 cosyvoice/bin/train_gan.py                      | 137 -----------------
 cosyvoice/cli/cosyvoice.py                      |   4 +-
 cosyvoice/cli/model.py                          |  18 ++-
 cosyvoice/dataset/dataset.py                    |   6 +-
 cosyvoice/dataset/processor.py                  |  20 ++-
 cosyvoice/utils/executor.py                     |  62 +++++++-
 cosyvoice/utils/executor_gan.py                 | 118 ---------------
 cosyvoice/utils/train_utils.py                  |  97 +++++-------
 .../cosyvoice/conf/cosyvoice.hifigan.yaml       | 141 ------------------
 .../libritts/cosyvoice/conf/cosyvoice.yaml      |  59 +++++++-
 examples/libritts/cosyvoice/run.sh              |  19 ++-
 13 files changed, 314 insertions(+), 477 deletions(-)
 create mode 100644 cosyvoice/bin/average_model.py
 delete mode 100644 cosyvoice/bin/train_gan.py
 delete mode 100644 cosyvoice/utils/executor_gan.py
 delete mode 100644 examples/libritts/cosyvoice/conf/cosyvoice.hifigan.yaml

diff --git a/cosyvoice/bin/average_model.py b/cosyvoice/bin/average_model.py
new file mode 100644
index 0000000..3112f29
--- /dev/null
+++ b/cosyvoice/bin/average_model.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2020 Mobvoi Inc (Di Wu)
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
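+#
+# Averages N saved checkpoints into a single model file. With --val_best, the
+# N checkpoints with the lowest validation loss (read from the per-checkpoint
+# yaml logs) are selected. Illustrative usage (paths are examples only):
+#   python cosyvoice/bin/average_model.py --dst_model exp/llm.pt \
+#       --src_path exp --num 5 --val_best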
+
+import os
+import argparse
+import glob
+import sys
+
+import yaml
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='average model')
+    parser.add_argument('--dst_model', required=True, help='averaged model')
+    parser.add_argument('--src_path',
+                        required=True,
+                        help='src model path for average')
+    parser.add_argument('--val_best',
+                        action="store_true",
+                        help='select checkpoints with best validation loss')
+    parser.add_argument('--num',
+                        default=5,
+                        type=int,
+                        help='number of checkpoints to average')
+
+    args = parser.parse_args()
+    print(args)
+    return args
+
+
+def main():
+    args = get_args()
+    val_scores = []
+    if args.val_best:
+        yamls = glob.glob('{}/*.yaml'.format(args.src_path))
+        yamls = [
+            f for f in yamls
+            if not (os.path.basename(f).startswith('train')
+                    or os.path.basename(f).startswith('init'))
+        ]
+        for y in yamls:
+            with open(y, 'r') as f:
+                dic_yaml = yaml.load(f, Loader=yaml.BaseLoader)
+                loss = float(dic_yaml['loss_dict']['loss'])
+                epoch = int(dic_yaml['epoch'])
+                step = int(dic_yaml['step'])
+                tag = dic_yaml['tag']
+                val_scores += [[epoch, step, loss, tag]]
+        sorted_val_scores = sorted(val_scores,
+                                   key=lambda x: x[2],
+                                   reverse=False)
+        print("best val (epoch, step, loss, tag) = " +
+              str(sorted_val_scores[:args.num]))
+        path_list = [
+            args.src_path + '/epoch_{}_whole.pt'.format(score[0])
+            for score in sorted_val_scores[:args.num]
+        ]
+        print(path_list)
+    avg = {}
+    num = args.num
+    assert num == len(path_list)
+    for path in path_list:
+        print('Processing {}'.format(path))
+        states = torch.load(path, map_location=torch.device('cpu'))
+        for k in states.keys():
+            if k not in avg.keys():
+                avg[k] = states[k].clone()
+            else:
+                avg[k] += states[k]
+    # average
+    for k in avg.keys():
+        if avg[k] is not None:
+            # pytorch >= 1.6 uses true_divide instead of /=
+            avg[k] = torch.true_divide(avg[k], num)
+    print('Saving to {}'.format(args.dst_model))
+    torch.save(avg, args.dst_model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cosyvoice/bin/train.py b/cosyvoice/bin/train.py
index 016663f..d125bde 100644
--- a/cosyvoice/bin/train.py
+++ b/cosyvoice/bin/train.py
@@ -86,8 +86,12 @@ def main():
     args = get_args()
     logging.basicConfig(level=logging.DEBUG,
                         format='%(asctime)s %(levelname)s %(message)s')
+    # gan train has some special initialization logic
+    gan = True if args.model == 'hifigan' else False
 
-    override_dict = {k: None for k in ['llm', 'flow', 'hifigan'] if k != args.model}
+    override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
+    if gan is True:
+        override_dict.pop('hift')
     with open(args.config, 'r') as f:
         configs = load_hyperpyyaml(f, overrides=override_dict)
     configs['train_conf'].update(vars(args))
@@ -97,7 +101,7 @@ def main():
 
     # Get dataset & dataloader
     train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
-        init_dataset_and_dataloader(args, configs)
+        init_dataset_and_dataloader(args, configs, gan)
 
     # Do some sanity checks and save config to args.model_dir
     configs = check_modify_and_save_config(args, configs)
@@ -108,13 +112,13 @@ def main():
     # load checkpoint
     model = configs[args.model]
     if args.checkpoint is not None:
-        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
+        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'), strict=False)
 
     # Dispatch model from cpu to gpu
     model = wrap_cuda_model(args, model)
 
     # Get optimizer & scheduler
-    model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
+    model, optimizer, scheduler, optimizer_d, scheduler_d = init_optimizer_and_scheduler(args, configs, model, gan)
 
     # Save init checkpoints
     info_dict = deepcopy(configs['train_conf'])
@@ -129,7 +133,10 @@ def main():
         train_dataset.set_epoch(epoch)
         dist.barrier()
         group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
-        executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
+        if gan is True:
+            executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader, writer, info_dict, group_join)
+        else:
+            executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
         dist.destroy_process_group(group_join)
diff --git a/cosyvoice/bin/train_gan.py b/cosyvoice/bin/train_gan.py
deleted file mode 100644
index 96bf988..0000000
--- a/cosyvoice/bin/train_gan.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import argparse
-import datetime
-import logging
-logging.getLogger('matplotlib').setLevel(logging.WARNING)
-from copy import deepcopy
-import torch
-import torch.distributed as dist
-import deepspeed
-
-from hyperpyyaml import load_hyperpyyaml
-
-from torch.distributed.elastic.multiprocessing.errors import record
-
-from cosyvoice.utils.executor_gan import Executor
-from cosyvoice.utils.train_utils import (
-    init_distributed,
-    init_dataset_and_dataloader,
-    init_optimizer_and_scheduler_gan,
-    init_summarywriter, save_model,
-    wrap_cuda_model, check_modify_and_save_config)
-
-
-def get_args():
-    parser = argparse.ArgumentParser(description='training your network')
-    parser.add_argument('--train_engine',
-                        default='torch_ddp',
-                        choices=['torch_ddp', 'deepspeed'],
-                        help='Engine for paralleled training')
-    parser.add_argument('--model', required=True, help='model which will be trained')
-    parser.add_argument('--config', required=True, help='config file')
-    parser.add_argument('--train_data', required=True, help='train data file')
-    parser.add_argument('--cv_data', required=True, help='cv data file')
-    parser.add_argument('--checkpoint', help='checkpoint model')
-    parser.add_argument('--model_dir', required=True, help='save model dir')
-    parser.add_argument('--tensorboard_dir',
-                        default='tensorboard',
-                        help='tensorboard log dir')
-    parser.add_argument('--ddp.dist_backend',
-                        dest='dist_backend',
-                        default='nccl',
-                        choices=['nccl', 'gloo'],
-                        help='distributed backend')
-    parser.add_argument('--num_workers',
-                        default=0,
-                        type=int,
-                        help='num of subprocess workers for reading')
-    parser.add_argument('--prefetch',
-                        default=100,
-                        type=int,
-                        help='prefetch number')
-    parser.add_argument('--pin_memory',
-                        action='store_true',
-                        default=False,
-                        help='Use pinned memory buffers used for reading')
-    parser.add_argument('--deepspeed.save_states',
-                        dest='save_states',
-                        default='model_only',
-                        choices=['model_only', 'model+optimizer'],
-                        help='save model/optimizer states')
-    parser.add_argument('--timeout',
-                        default=30,
-                        type=int,
-                        help='timeout (in seconds) of cosyvoice_join.')
-    parser = deepspeed.add_config_arguments(parser)
-    args = parser.parse_args()
-    return args
-
-
-@record
-def main():
-    args = get_args()
-    logging.basicConfig(level=logging.DEBUG,
-                        format='%(asctime)s %(levelname)s %(message)s')
-
-    override_dict = {k: None for k in ['llm', 'flow', 'hifigan'] if k != args.model}
-    with open(args.config, 'r') as f:
-        configs = load_hyperpyyaml(f, overrides=override_dict, overrides_must_match=False)
-    configs['train_conf'].update(vars(args))
-
-    # Init env for ddp
-    init_distributed(args)
-
-    # Get dataset & dataloader
-    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
-        init_dataset_and_dataloader(args, configs)
-
-    # Do some sanity checks and save config to arsg.model_dir
-    configs = check_modify_and_save_config(args, configs)
-
-    # Tensorboard summary
-    writer = init_summarywriter(args)
-
-    # load checkpoint
-    model = configs[args.model]
-    if args.checkpoint is not None:
-        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
-
-    # Dispatch model from cpu to gpu
-    model = wrap_cuda_model(args, model)
-
-    # Get optimizer & scheduler
-    model, optimizer, scheduler, optimizer_d, scheduler_d = init_optimizer_and_scheduler_gan(args, configs, model)
-
-    # Save init checkpoints
-    info_dict = deepcopy(configs['train_conf'])
-    save_model(model, 'init', info_dict)
-
-    # Get executor
-    executor = Executor()
-
-    # Start training loop
-    for epoch in range(info_dict['max_epoch']):
-        executor.epoch = epoch
-        train_dataset.set_epoch(epoch)
-        dist.barrier()
-        group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
-        executor.train_one_epoc(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader, writer, info_dict, group_join)
-        dist.destroy_process_group(group_join)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index bedb2ba..48babf3 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -23,7 +23,7 @@ from cosyvoice.utils.file_utils import logging
 
 class CosyVoice:
 
-    def __init__(self, model_dir, load_jit=True, load_onnx=False):
+    def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
@@ -37,7 +37,7 @@ class CosyVoice:
                                           '{}/spk2info.pt'.format(model_dir),
                                           instruct,
                                           configs['allowed_special'])
-        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
+        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 489978d..cf6389d 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -26,11 +26,13 @@ class CosyVoiceModel:
     def __init__(self,
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
-                 hift: torch.nn.Module):
+                 hift: torch.nn.Module,
+                 fp16: bool):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
+        self.fp16 = fp16
         self.token_min_hop_len = 2 * self.flow.input_frame_rate
         self.token_max_hop_len = 4 * self.flow.input_frame_rate
         self.token_overlap_len = 20
@@ -56,13 +58,17 @@ class CosyVoiceModel:
     def load(self, llm_model, flow_model, hift_model):
         self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
         self.llm.to(self.device).eval()
-        self.llm.half()
+        if self.fp16 is True:
+            self.llm.half()
         self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
         self.flow.to(self.device).eval()
-        self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
+        # in case hift_model is a hifigan model
+        hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
+        self.hift.load_state_dict(hift_state_dict, strict=False)
         self.hift.to(self.device).eval()
 
     def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
+        assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model"
         llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
         self.llm.text_encoder = llm_text_encoder
         llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
@@ -80,6 +86,8 @@ class CosyVoiceModel:
         self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers)
 
     def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
+        if self.fp16 is True:
+            llm_embedding = llm_embedding.half()
         with self.llm_context:
             for i in self.llm.inference(text=text.to(self.device),
                                         text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
@@ -87,7 +95,7 @@ class CosyVoiceModel:
                                         prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
                                         prompt_speech_token=llm_prompt_speech_token.to(self.device),
                                         prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
-                                        embedding=llm_embedding.to(self.device).half()):
+                                        embedding=llm_embedding.to(self.device)):
                 self.tts_speech_token_dict[uuid].append(i)
         self.llm_end_dict[uuid] = True
 
@@ -123,7 +131,7 @@ class CosyVoiceModel:
             if speed != 1.0:
                 assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
                 tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
-            tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
+            tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
             if self.hift_cache_dict[uuid] is not None:
                 tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
             return tts_speech
diff --git a/cosyvoice/dataset/dataset.py b/cosyvoice/dataset/dataset.py
index 299e5ec..4a59139 100644
--- a/cosyvoice/dataset/dataset.py
+++ b/cosyvoice/dataset/dataset.py
@@ -126,6 +126,7 @@ class DataList(IterableDataset):
 def Dataset(data_list_file,
             data_pipeline,
             mode='train',
+            gan=False,
             shuffle=True,
             partition=True,
             tts_file='',
@@ -153,8 +154,11 @@ def Dataset(data_list_file,
                        shuffle=shuffle,
                        partition=partition)
     if mode == 'inference':
-        # map partial arg tts_data in inference mode
+        # map partial arg to parquet_opener func in inference mode
         data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
+    if gan is True:
+        # map partial arg to padding func in gan mode
+        data_pipeline[-1] = partial(data_pipeline[-1], gan=gan)
     for func in data_pipeline:
         dataset = Processor(dataset, func, mode=mode)
     return dataset
diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py
index ba92911..759e093 100644
--- a/cosyvoice/dataset/processor.py
+++ b/cosyvoice/dataset/processor.py
@@ -350,7 +350,7 @@ def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, m
         logging.fatal('Unsupported batch type {}'.format(batch_type))
 
 
-def padding(data, use_spk_embedding, mode='train'):
+def padding(data, use_spk_embedding, mode='train', gan=False):
     """ Padding the data into training data
 
         Args:
@@ -379,11 +379,6 @@ def padding(data, use_spk_embedding, mode='train'):
         speech_feat = pad_sequence(speech_feat,
                                    batch_first=True,
                                    padding_value=0)
-        pitch_feat = [sample[i]['pitch_feat'] for i in order]
-        pitch_feat_len = torch.tensor([i.size(0) for i in pitch_feat], dtype=torch.int32)
-        pitch_feat = pad_sequence(pitch_feat,
-                                  batch_first=True,
-                                  padding_value=0)
         text = [sample[i]['text'] for i in order]
         text_token = [torch.tensor(sample[i]['text_token']) for i in order]
         text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
@@ -406,6 +401,19 @@ def padding(data, use_spk_embedding, mode='train'):
             "utt_embedding": utt_embedding,
             "spk_embedding": spk_embedding,
         }
+        if gan is True:
+            # in gan train, we need pitch_feat
+            pitch_feat = [sample[i]['pitch_feat'] for i in order]
+            pitch_feat_len = torch.tensor([i.size(0) for i in pitch_feat], dtype=torch.int32)
+            pitch_feat = pad_sequence(pitch_feat,
+                                      batch_first=True,
+                                      padding_value=0)
+            batch["pitch_feat"] = pitch_feat
+            batch["pitch_feat_len"] = pitch_feat_len
+        else:
+            # only gan train needs speech, delete it to save memory
+            del batch["speech"]
+            del batch["speech_len"]
         if mode == 'inference':
             tts_text = [sample[i]['tts_text'] for i in order]
             tts_index = [sample[i]['tts_index'] for i in order]
diff --git a/cosyvoice/utils/executor.py b/cosyvoice/utils/executor.py
index 98a06db..bb7289c 100644
--- a/cosyvoice/utils/executor.py
+++ b/cosyvoice/utils/executor.py
@@ -25,7 +25,8 @@ from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, l
 
 class Executor:
 
-    def __init__(self):
+    def __init__(self, gan: bool=False):
+        self.gan = gan
         self.step = 0
         self.epoch = 0
         self.rank = int(os.environ.get('RANK', 0))
@@ -80,6 +81,63 @@ class Executor:
             dist.barrier()
             self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)
 
+    def train_one_epoc_gan(self, model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader, writer, info_dict, group_join):
+        ''' Train one epoch
+        '''
+
+        lr = optimizer.param_groups[0]['lr']
+        logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank))
+        logging.info('using accumulate grad, new batch size is {} times'
+                     ' larger than before'.format(info_dict['accum_grad']))
+        # A context manager to be used in conjunction with an instance of
+        # torch.nn.parallel.DistributedDataParallel to be able to train
+        # with uneven inputs across participating processes.
+        model.train()
+        model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
+        with model_context():
+            for batch_idx, batch_dict in enumerate(train_data_loader):
+                info_dict["tag"] = "TRAIN"
+                info_dict["step"] = self.step
+                info_dict["epoch"] = self.epoch
+                info_dict["batch_idx"] = batch_idx
+                if cosyvoice_join(group_join, info_dict):
+                    break
+
+                # Disable gradient synchronizations across DDP processes.
+                # Within this context, gradients will be accumulated on module
+                # variables, which will later be synchronized.
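+                # NOTE each batch below takes two passes under the same sync
+                # context: a discriminator step, then a generator step; the
+                # optimizer not being stepped is zeroed after each pass.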
+                if info_dict['train_engine'] == 'torch_ddp' and (batch_idx + 1) % info_dict["accum_grad"] != 0:
+                    context = model.no_sync
+                # Used for single gpu training and DDP gradient synchronization
+                # processes.
+                else:
+                    context = nullcontext
+
+                with context():
+                    batch_dict['turn'] = 'discriminator'
+                    info_dict = batch_forward(model, batch_dict, info_dict)
+                    info_dict = batch_backward(model, info_dict)
+                    info_dict = update_parameter_and_lr(model, optimizer_d, scheduler_d, info_dict)
+                    optimizer.zero_grad()
+                    log_per_step(writer, info_dict)
+                with context():
+                    batch_dict['turn'] = 'generator'
+                    info_dict = batch_forward(model, batch_dict, info_dict)
+                    info_dict = batch_backward(model, info_dict)
+                    info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict)
+                    optimizer_d.zero_grad()
+                    log_per_step(writer, info_dict)
+                # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
+                if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and \
+                        (batch_idx + 1) % info_dict["accum_grad"] == 0:
+                    dist.barrier()
+                    self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False)
+                    model.train()
+                if (batch_idx + 1) % info_dict["accum_grad"] == 0:
+                    self.step += 1
+        dist.barrier()
+        self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)
+
     @torch.inference_mode()
     def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True):
         ''' Cross validation on
@@ -96,6 +154,8 @@ class Executor:
             num_utts = len(batch_dict["utts"])
             total_num_utts += num_utts
 
+            if self.gan is True:
+                batch_dict['turn'] = 'generator'
             info_dict = batch_forward(model, batch_dict, info_dict)
 
             for k, v in info_dict['loss_dict'].items():
diff --git a/cosyvoice/utils/executor_gan.py b/cosyvoice/utils/executor_gan.py
deleted file mode 100644
index 9fb1b51..0000000
--- a/cosyvoice/utils/executor_gan.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
-#               2024 Alibaba Inc (authors: Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from contextlib import nullcontext
-import os
-
-import torch
-import torch.distributed as dist
-
-from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, log_per_save, batch_forward, batch_backward, save_model, cosyvoice_join
-
-
-class Executor:
-
-    def __init__(self):
-        self.step = 0
-        self.epoch = 0
-        self.rank = int(os.environ.get('RANK', 0))
-        self.device = torch.device('cuda:{}'.format(self.rank))
-
-    def train_one_epoc(self, model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader, writer, info_dict, group_join):
-        ''' Train one epoch
-        '''
-
-        lr = optimizer.param_groups[0]['lr']
-        logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank))
-        logging.info('using accumulate grad, new batch size is {} times'
-                     ' larger than before'.format(info_dict['accum_grad']))
-        # A context manager to be used in conjunction with an instance of
-        # torch.nn.parallel.DistributedDataParallel to be able to train
-        # with uneven inputs across participating processes.
-        model.train()
-        model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
-        with model_context():
-            for batch_idx, batch_dict in enumerate(train_data_loader):
-                info_dict["tag"] = "TRAIN"
-                info_dict["step"] = self.step
-                info_dict["epoch"] = self.epoch
-                info_dict["batch_idx"] = batch_idx
-                if cosyvoice_join(group_join, info_dict):
-                    break
-
-                # Disable gradient synchronizations across DDP processes.
-                # Within this context, gradients will be accumulated on module
-                # variables, which will later be synchronized.
-                if info_dict['train_engine'] == 'torch_ddp' and (batch_idx + 1) % info_dict["accum_grad"] != 0:
-                    context = model.no_sync
-                # Used for single gpu training and DDP gradient synchronization
-                # processes.
-                else:
-                    context = nullcontext
-
-                with context():
-                    batch_dict['turn'] = 'discriminator'
-                    info_dict = batch_forward(model, batch_dict, info_dict)
-                    info_dict = batch_backward(model, info_dict)
-                    info_dict = update_parameter_and_lr(model, optimizer_d, scheduler_d, info_dict)
-                    log_per_step(writer, info_dict)
-                with context():
-                    batch_dict['turn'] = 'generator'
-                    info_dict = batch_forward(model, batch_dict, info_dict)
-                    info_dict = batch_backward(model, info_dict)
-                    info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict)
-                    log_per_step(writer, info_dict)
-                # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
-                if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and \
-                        (batch_idx + 1) % info_dict["accum_grad"] == 0:
-                    dist.barrier()
-                    self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False)
-                    model.train()
-                if (batch_idx + 1) % info_dict["accum_grad"] == 0:
-                    self.step += 1
-        dist.barrier()
-        self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)
-
-    @torch.inference_mode()
-    def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True):
-        ''' Cross validation on
-        '''
-        logging.info('Epoch {} Step {} on_batch_end {} CV rank {}'.format(self.epoch, self.step + 1, on_batch_end, self.rank))
-        model.eval()
-        total_num_utts, total_loss_dict = 0, {}  # avoid division by 0
-        for batch_idx, batch_dict in enumerate(cv_data_loader):
-            info_dict["tag"] = "CV"
-            info_dict["step"] = self.step
-            info_dict["epoch"] = self.epoch
-            info_dict["batch_idx"] = batch_idx
-
-            num_utts = len(batch_dict["utts"])
-            total_num_utts += num_utts
-
-            batch_dict['turn'] = 'generator'
-            info_dict = batch_forward(model, batch_dict, info_dict)
-
-            for k, v in info_dict['loss_dict'].items():
-                if k not in total_loss_dict:
-                    total_loss_dict[k] = []
-                total_loss_dict[k].append(v.item() * num_utts)
-            log_per_step(None, info_dict)
-        for k, v in total_loss_dict.items():
-            total_loss_dict[k] = sum(v) / total_num_utts
-        info_dict['loss_dict'] = total_loss_dict
-        log_per_save(writer, info_dict)
-        model_name = 'epoch_{}_whole'.format(self.epoch) if on_batch_end else 'epoch_{}_step_{}'.format(self.epoch, self.step + 1)
-        save_model(model, model_name, info_dict)
diff --git a/cosyvoice/utils/train_utils.py b/cosyvoice/utils/train_utils.py
index 2fbba78..efe655f 100644
--- a/cosyvoice/utils/train_utils.py
+++ b/cosyvoice/utils/train_utils.py
@@ -51,9 +51,10 @@ def init_distributed(args):
     return world_size, local_rank, rank
 
 
-def init_dataset_and_dataloader(args, configs):
-    train_dataset = Dataset(args.train_data, data_pipeline=configs['data_pipeline'], mode='train', shuffle=True, partition=True)
-    cv_dataset = Dataset(args.cv_data, data_pipeline=configs['data_pipeline'], mode='train', shuffle=False, partition=False)
+def init_dataset_and_dataloader(args, configs, gan):
+    data_pipeline = configs['data_pipeline_gan'] if gan is True else configs['data_pipeline']
+    train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=True, partition=True)
+    cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', gan=gan, shuffle=False, partition=False)
 
     # do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts
     train_data_loader = DataLoader(train_dataset,
@@ -108,30 +109,31 @@ def wrap_cuda_model(args, model):
     return model
 
 
-def init_optimizer_and_scheduler(args, configs, model):
-    if configs['train_conf']['optim'] == 'adam':
-        optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf'])
-    elif configs['train_conf']['optim'] == 'adamw':
-        optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf'])
+def init_optimizer_and_scheduler(args, configs, model, gan):
+    key = 'train_conf_gan' if gan is True else 'train_conf'
+    if configs[key]['optim'] == 'adam':
+        optimizer = optim.Adam(model.parameters(), **configs[key]['optim_conf'])
+    elif configs[key]['optim'] == 'adamw':
+        optimizer = optim.AdamW(model.parameters(), **configs[key]['optim_conf'])
     else:
-        raise ValueError("unknown optimizer: " + configs['train_conf'])
+        raise ValueError("unknown optimizer: " + configs[key])
 
-    if configs['train_conf']['scheduler'] == 'warmuplr':
+    if configs[key]['scheduler'] == 'warmuplr':
         scheduler_type = WarmupLR
-        scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf'])
-    elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
+        scheduler = WarmupLR(optimizer, **configs[key]['scheduler_conf'])
+    elif configs[key]['scheduler'] == 'NoamHoldAnnealing':
         scheduler_type = NoamHoldAnnealing
-        scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
-    elif configs['train_conf']['scheduler'] == 'constantlr':
+        scheduler = NoamHoldAnnealing(optimizer, **configs[key]['scheduler_conf'])
+    elif configs[key]['scheduler'] == 'constantlr':
         scheduler_type = ConstantLR
         scheduler = ConstantLR(optimizer)
     else:
-        raise ValueError("unknown scheduler: " + configs['train_conf'])
+        raise ValueError("unknown scheduler: " + configs[key])
 
     # use deepspeed optimizer for speedup
     if args.train_engine == "deepspeed":
         def scheduler(opt):
-            return scheduler_type(opt, **configs['train_conf']['scheduler_conf'])
+            return scheduler_type(opt, **configs[key]['scheduler_conf'])
         model, optimizer, _, scheduler = deepspeed.initialize(
             args=args,
             model=model,
@@ -139,49 +141,28 @@ def init_optimizer_and_scheduler(args, configs, model):
             lr_scheduler=scheduler,
             model_parameters=model.parameters())
 
-    return model, optimizer, scheduler
-
-
-def init_optimizer_and_scheduler_gan(args, configs, model):
-    if configs['train_conf']['optim'] == 'adam':
-        optimizer = optim.Adam(model.module.generator.parameters(), **configs['train_conf']['optim_conf'])
-    elif configs['train_conf']['optim'] == 'adamw':
-        optimizer = optim.AdamW(model.module.generator.parameters(), **configs['train_conf']['optim_conf'])
-    else:
-        raise ValueError("unknown optimizer: " + configs['train_conf'])
-
-    if configs['train_conf']['scheduler'] == 'warmuplr':
-        scheduler_type = WarmupLR
-        scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf'])
-    elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
-        scheduler_type = NoamHoldAnnealing
-        scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
-    elif configs['train_conf']['scheduler'] == 'constantlr':
-        scheduler_type = ConstantLR
-        scheduler = ConstantLR(optimizer)
-    else:
-        raise ValueError("unknown scheduler: " + configs['train_conf'])
-
-    if configs['train_conf']['optim_d'] == 'adam':
-        optimizer_d = optim.Adam(model.module.discriminator.parameters(), **configs['train_conf']['optim_conf'])
-    elif configs['train_conf']['optim_d'] == 'adamw':
-        optimizer_d = optim.AdamW(model.module.discriminator.parameters(), **configs['train_conf']['optim_conf'])
-    else:
-        raise ValueError("unknown optimizer: " + configs['train_conf'])
-
-    if configs['train_conf']['scheduler_d'] == 'warmuplr':
-        scheduler_type = WarmupLR
-        scheduler_d = WarmupLR(optimizer_d, **configs['train_conf']['scheduler_conf'])
-    elif configs['train_conf']['scheduler_d'] == 'NoamHoldAnnealing':
-        scheduler_type = NoamHoldAnnealing
-        scheduler_d = NoamHoldAnnealing(optimizer_d, **configs['train_conf']['scheduler_conf'])
-    elif configs['train_conf']['scheduler'] == 'constantlr':
-        scheduler_type = ConstantLR
-        scheduler_d = ConstantLR(optimizer_d)
-    else:
-        raise ValueError("unknown scheduler: " + configs['train_conf'])
-    # currently we wrap generator and discriminator in one model, so we cannot use deepspeed
+    if gan is True:
+        if configs[key]['optim_d'] == 'adam':
+            optimizer_d = optim.Adam(model.module.discriminator.parameters(), **configs[key]['optim_conf'])
+        elif configs[key]['optim_d'] == 'adamw':
+            optimizer_d = optim.AdamW(model.module.discriminator.parameters(), **configs[key]['optim_conf'])
+        else:
+            raise ValueError("unknown optimizer: " + configs[key])
+
+        if configs[key]['scheduler_d'] == 'warmuplr':
+            scheduler_type = WarmupLR
+            scheduler_d = WarmupLR(optimizer_d, **configs[key]['scheduler_conf'])
+        elif configs[key]['scheduler_d'] == 'NoamHoldAnnealing':
+            scheduler_type = NoamHoldAnnealing
+            scheduler_d = NoamHoldAnnealing(optimizer_d, **configs[key]['scheduler_conf'])
+        elif configs[key]['scheduler_d'] == 'constantlr':
+            scheduler_type = ConstantLR
+            scheduler_d = ConstantLR(optimizer_d)
+        else:
+            raise ValueError("unknown scheduler: " + configs[key])
+    else:
+        optimizer_d, scheduler_d = None, None
     return model, optimizer, scheduler, optimizer_d, scheduler_d
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.hifigan.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.hifigan.yaml
deleted file mode 100644
index 80b5745..0000000
--- a/examples/libritts/cosyvoice/conf/cosyvoice.hifigan.yaml
+++ /dev/null
@@ -1,141 +0,0 @@
-# set random seed, so that you may reproduce your result.
-__set_seed1: !apply:random.seed [1986]
-__set_seed2: !apply:numpy.random.seed [1986]
-__set_seed3: !apply:torch.manual_seed [1986]
-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
-
-# fixed params
-sample_rate: 22050
-text_encoder_input_size: 512
-llm_input_size: 1024
-llm_output_size: 1024
-spk_embed_dim: 192
-
-# model params
-# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
-# for system/third_party class/function, we do not require this.
-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
-    in_channels: 80
-    base_channels: 512
-    nb_harmonics: 8
-    sampling_rate: !ref <sample_rate>
-    nsf_alpha: 0.1
-    nsf_sigma: 0.003
-    nsf_voiced_threshold: 10
-    upsample_rates: [8, 8]
-    upsample_kernel_sizes: [16, 16]
-    istft_params:
-        n_fft: 16
-        hop_len: 4
-    resblock_kernel_sizes: [3, 7, 11]
-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    source_resblock_kernel_sizes: [7, 11]
-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
-    lrelu_slope: 0.1
-    audio_limit: 0.99
-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
-        num_class: 1
-        in_channels: 80
-        cond_channels: 512
-
-mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
-    generator: !ref <hift>
-    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
-        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
-        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
-    mel_spec_transform: [
-        !ref <mel_spec_transform1>
-    ]
-
-# processor functions
-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
-    multilingual: True
-    num_languages: 100
-    language: 'en'
-    task: 'transcribe'
-tokenize: !name:cosyvoice.dataset.processor.tokenize
-    get_tokenizer: !ref <get_tokenizer>
-    allowed_special: 'all'
-filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
-    min_length: 0
-    token_max_length: 200
-    token_min_length: 1
-resample: !name:cosyvoice.dataset.processor.resample
-    resample_rate: !ref <sample_rate>
-truncate: !name:cosyvoice.dataset.processor.truncate
-    truncate_length: 24576 # must be a multiplier of hop_size
-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
-    n_fft: 1024
-    num_mels: 80
-    sampling_rate: !ref <sample_rate>
-    hop_size: 256
-    win_size: 1024
-    fmin: 0
-    fmax: 8000
-    center: False
-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
-    feat_extractor: !ref <feat_extractor>
-pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
-    sample_rate: !ref <sample_rate>
-    frame_length: 46.4 # match feat_extractor win_size/sampling_rate
-    frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
-compute_f0: !name:cosyvoice.dataset.processor.compute_f0
-    pitch_extractor: !ref <pitch_extractor>
-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
-    normalize: True
-shuffle: !name:cosyvoice.dataset.processor.shuffle
-    shuffle_size: 1000
-sort: !name:cosyvoice.dataset.processor.sort
-    sort_size: 500 # sort_size should be less than shuffle_size
-batch: !name:cosyvoice.dataset.processor.batch
-    batch_type: 'dynamic'
-    max_frames_in_batch: 1200
-padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
-
-# dataset processor pipeline
-data_pipeline: [
-    !ref <parquet_opener>,
-    !ref <tokenize>,
-    !ref <filter>,
-    !ref <resample>,
-    !ref <truncate>,
-    !ref <compute_fbank>,
-    !ref <compute_f0>,
-    !ref <parse_embedding>,
-    !ref <shuffle>,
-    !ref <sort>,
-    !ref <batch>,
-    !ref <padding>,
-]
-
-# train conf
-train_conf:
-    optim: adam
-    optim_conf:
-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
-    scheduler: warmuplr
-    scheduler_conf:
-        warmup_steps: 25000
-    optim_d: adam
-    optim_conf_d:
-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
-    scheduler_d: warmuplr
-    scheduler_conf_d:
-        warmup_steps: 25000
-    max_epoch: 200
-    grad_clip: 5
-    accum_grad: 2
-    log_interval: 100
-    save_per_step: -1
\ No newline at end of file
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
index b2ff51c..95a2e9e 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
@@ -133,6 +133,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
         in_channels: 80
         cond_channels: 512
 
+# gan related module
+mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 8000
+    center: False
+hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+    generator: !ref <hift>
+    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
+    mel_spec_transform: [
+        !ref <mel_spec_transform1>
+    ]
+
 # processor functions
 parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
@@ -151,6 +170,8 @@ filter: !name:cosyvoice.dataset.processor.filter
     token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
     resample_rate: !ref <sample_rate>
+truncate: !name:cosyvoice.dataset.processor.truncate
+    truncate_length: 24576 # must be a multiplier of hop_size
 feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     n_fft: 1024
     num_mels: 80
@@ -162,6 +183,12 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
+pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
+    sample_rate: !ref <sample_rate>
+    frame_length: 46.4 # match feat_extractor win_size/sampling_rate
+    frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
+compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+    pitch_extractor: !ref <pitch_extractor>
 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
     normalize: True
 shuffle: !name:cosyvoice.dataset.processor.shuffle
@@ -187,8 +214,22 @@ data_pipeline: [
     !ref <batch>,
     !ref <padding>,
 ]
+data_pipeline_gan: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <truncate>,
+    !ref <compute_fbank>,
+    !ref <compute_f0>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
 
-# train conf
+# llm flow train conf
 train_conf:
     optim: adam
     optim_conf:
@@ -200,4 +241,20 @@ train_conf:
     grad_clip: 5
     accum_grad: 2
     log_interval: 100
+    save_per_step: -1
+
+# gan train conf
+train_conf_gan:
+    optim: adam
+    optim_conf:
+        lr: 0.0002 # use small lr for gan training
+    scheduler: constantlr
+    optim_d: adam
+    optim_conf_d:
+        lr: 0.0002 # use small lr for gan training
+    scheduler_d: constantlr
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 1 # in gan training, accum_grad must be 1
+    log_interval: 100
     save_per_step: -1
\ No newline at end of file
diff --git a/examples/libritts/cosyvoice/run.sh b/examples/libritts/cosyvoice/run.sh
index 0b20756..5dc79c5 100644
--- a/examples/libritts/cosyvoice/run.sh
+++ b/examples/libritts/cosyvoice/run.sh
@@ -83,9 +83,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   fi
   cat data/{train-clean-100,train-clean-360,train-other-500}/parquet/data.list > data/train.data.list
   cat data/{dev-clean,dev-other}/parquet/data.list > data/dev.data.list
-  for model in llm flow; do
+  for model in llm flow hifigan; do
     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
-        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
       cosyvoice/bin/train.py \
       --train_engine $train_engine \
       --config conf/cosyvoice.yaml \
@@ -99,12 +99,27 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
       --num_workers ${num_workers} \
       --prefetch ${prefetch} \
       --pin_memory \
+      --timeout 300 \
       --deepspeed_config ./conf/ds_stage2.json \
       --deepspeed.save_states model+optimizer
   done
 fi
 
+# average model
+average_num=5
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+  for model in llm flow hifigan; do
+    decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt
+    echo "do model average and final checkpoint is $decode_checkpoint"
+    python cosyvoice/bin/average_model.py \
+      --dst_model $decode_checkpoint \
+      --src_path `pwd`/exp/cosyvoice/$model/$train_engine \
+      --num ${average_num} \
+      --val_best
+  done
+fi
+
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
   echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir"
   python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir
   python cosyvoice/bin/export_onnx.py --model_dir $pretrained_model_dir
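
--
Usage sketch (illustrative; not part of the patch). With this change, run.sh trains
the hifigan model in stage 5 via the shared cosyvoice/bin/train.py entry point
(selecting train_conf_gan and data_pipeline_gan from conf/cosyvoice.yaml) and
averages checkpoints in stage 6. Run standalone on a single GPU with prepared
data lists, the two steps look roughly like:

  torchrun --nnodes=1 --nproc_per_node=1 \
    cosyvoice/bin/train.py \
    --train_engine torch_ddp \
    --model hifigan \
    --config conf/cosyvoice.yaml \
    --train_data data/train.data.list \
    --cv_data data/dev.data.list \
    --model_dir exp/cosyvoice/hifigan/torch_ddp \
    --tensorboard_dir tensorboard/hifigan \
    --num_workers 2 --prefetch 100 --pin_memory --timeout 300

  python cosyvoice/bin/average_model.py \
    --dst_model exp/cosyvoice/hifigan/torch_ddp/hifigan.pt \
    --src_path exp/cosyvoice/hifigan/torch_ddp \
    --num 5 --val_best

The averaged hifigan.pt can then be passed to CosyVoiceModel.load as the hift
checkpoint; the 'generator.' prefix is stripped there when loading.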