From ee9e87b4d3e239acbb9ab0c2e2ed739baea55533 Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Tue, 9 Jul 2024 23:48:23 +0800
Subject: [PATCH 1/5] add empty cache

---
 cosyvoice/cli/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 98f19b2..f4625e3 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -56,4 +56,5 @@ class CosyVoiceModel:
                                       prompt_feat_len=prompt_speech_feat_len.to(self.device),
                                       embedding=flow_embedding.to(self.device))
         tts_speech = self.hift.inference(mel=tts_mel).cpu()
+        torch.cuda.empty_cache()
         return {'tts_speech': tts_speech}

From 6a3e44242ad24f01ba64430d8f6ac5718442b0da Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Wed, 10 Jul 2024 00:21:56 +0800
Subject: [PATCH 2/5] keep only embedding mean as spk embedding

---
 cosyvoice/dataset/processor.py | 2 +-
 tools/extract_embedding.py     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py
index 9477d02..cb34a0c 100644
--- a/cosyvoice/dataset/processor.py
+++ b/cosyvoice/dataset/processor.py
@@ -167,7 +167,7 @@ def parse_embedding(data, normalize, mode='train'):
     """
     for sample in data:
         sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
-        sample['spk_embedding'] = torch.stack([torch.tensor(i, dtype=torch.float32) for i in sample['spk_embedding']], dim=0).mean(dim=0)
+        sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
         if normalize:
             sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
             sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
diff --git a/tools/extract_embedding.py b/tools/extract_embedding.py
index 02fa2f6..9c6f568 100755
--- a/tools/extract_embedding.py
+++ b/tools/extract_embedding.py
@@ -53,6 +53,8 @@ def main(args):
         if spk not in spk2embedding:
             spk2embedding[spk] = []
         spk2embedding[spk].append(embedding)
+    for k, v in spk2embedding.items():
+        spk2embedding[k] = torch.tensor(v).mean(dim=0, keepdim=True).tolist()
     torch.save(utt2embedding, '{}/utt2embedding.pt'.format(args.dir))
     torch.save(spk2embedding, '{}/spk2embedding.pt'.format(args.dir))

From 225b56de052ad53afa6ca84612cd6da2e5d6d298 Mon Sep 17 00:00:00 2001
From: cyz
Date: Wed, 10 Jul 2024 12:02:41 +0800
Subject: [PATCH 3/5] FIX: fix the error raised when generating audio with
 natural language control; the exception was: AttributeError:
 'CosyVoiceFrontEnd' object has no attribute 'en_tn_model'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosyvoice/cli/frontend.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index d2983b7..0fb9ca9 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -114,7 +114,10 @@ class CosyVoiceFrontEnd:
                                                 token_min_n=60, merge_len=20,
                                                 comma_split=False)]
         else:
-            text = self.en_tn_model.normalize(text)
+            if self.use_ttsfrd:
+                text = self.frd.get_frd_extra_info(text, 'input')
+            else:
+                text = self.en_tn_model.normalize(text)
             text = spell_out_number(text, self.inflect_parser)
             texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                                token_min_n=60, merge_len=20,

From 793a24862caed695a375661ae57fa99b5fc7793f Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Wed, 10 Jul 2024 16:37:25 +0800
Subject: [PATCH 4/5] add constant lr scheduler

---
 cosyvoice/utils/scheduler.py                    | 22 +++++++++++++++++++
 cosyvoice/utils/train_utils.py                  |  5 ++++-
 .../libritts/cosyvoice/conf/cosyvoice.yaml      |  4 ++--
 tools/extract_embedding.py                      |  2 +-
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/cosyvoice/utils/scheduler.py b/cosyvoice/utils/scheduler.py
index eed1ea0..fbf4803 100644
--- a/cosyvoice/utils/scheduler.py
+++ b/cosyvoice/utils/scheduler.py
@@ -715,3 +715,25 @@ class NoamHoldAnnealing(WarmupHoldPolicy):

     def set_step(self, step: int):
         self.last_epoch = step
+
+
+class ConstantLR(_LRScheduler):
+    """The ConstantLR scheduler
+
+    This scheduler keeps a constant lr
+
+    """
+
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+    ):
+        # __init__() must be invoked before setting field
+        # because step() is also invoked in __init__()
+        super().__init__(optimizer)
+
+    def get_lr(self):
+        return self.base_lrs
+
+    def set_step(self, step: int):
+        self.last_epoch = step
diff --git a/cosyvoice/utils/train_utils.py b/cosyvoice/utils/train_utils.py
index df3a321..f8d7b45 100644
--- a/cosyvoice/utils/train_utils.py
+++ b/cosyvoice/utils/train_utils.py
@@ -34,7 +34,7 @@ from torch.nn.utils import clip_grad_norm_
 from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live

 from cosyvoice.dataset.dataset import Dataset
-from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing
+from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR


 def init_distributed(args):
@@ -122,6 +122,9 @@ def init_optimizer_and_scheduler(args, configs, model):
     elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
         scheduler_type = NoamHoldAnnealing
         scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
+    elif configs['train_conf']['scheduler'] == 'constantlr':
+        scheduler_type = ConstantLR
+        scheduler = ConstantLR(optimizer)
     else:
         raise ValueError("unknown scheduler: " + configs['train_conf'])
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
index cc5eee0..c791c76 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
@@ -186,8 +186,8 @@ data_pipeline: [
 train_conf:
     optim: adam
     optim_conf:
-        lr: 0.001
-    scheduler: warmuplr
+        lr: 0.001 # change to 1e-5 during sft
+    scheduler: warmuplr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
     max_epoch: 200
diff --git a/tools/extract_embedding.py b/tools/extract_embedding.py
index 9c6f568..96a043c 100755
--- a/tools/extract_embedding.py
+++ b/tools/extract_embedding.py
@@ -54,7 +54,7 @@ def main(args):
             spk2embedding[spk] = []
         spk2embedding[spk].append(embedding)
     for k, v in spk2embedding.items():
-        spk2embedding[k] = torch.tensor(v).mean(dim=0, keepdim=True).tolist()
+        spk2embedding[k] = torch.tensor(v).mean(dim=0).tolist()
     torch.save(utt2embedding, '{}/utt2embedding.pt'.format(args.dir))
     torch.save(spk2embedding, '{}/spk2embedding.pt'.format(args.dir))

From 0fd15bb12b9f79bbdb86a496b920311130f1710c Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Wed, 10 Jul 2024 17:49:32 +0800
Subject: [PATCH 5/5] use spk_embedding when sft

---
 cosyvoice/flow/flow.py                                       | 2 +-
 cosyvoice/llm/llm.py                                         | 2 +-
 cosyvoice/utils/executor.py                                  | 4 ++++
 examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml | 1 +
 examples/libritts/cosyvoice/conf/cosyvoice.yaml              | 1 +
 5 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py
index d0dbcd0..009160a 100644
--- a/cosyvoice/flow/flow.py
+++ b/cosyvoice/flow/flow.py
@@ -60,7 +60,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         token_len = batch['speech_token_len'].to(device)
         feat = batch['speech_feat'].to(device)
         feat_len = batch['speech_feat_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)

         # xvec projection
         embedding = F.normalize(embedding, dim=1)
diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py
index 05c22ef..3b418c5 100644
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -97,7 +97,7 @@ class TransformerLM(torch.nn.Module):
         text_token_len = batch['text_token_len'].to(device)
         speech_token = batch['speech_token'].to(device)
         speech_token_len = batch['speech_token_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)

         # 1. prepare llm_target
         lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
diff --git a/cosyvoice/utils/executor.py b/cosyvoice/utils/executor.py
index c12e52d..f7dfb0e 100644
--- a/cosyvoice/utils/executor.py
+++ b/cosyvoice/utils/executor.py
@@ -52,6 +52,10 @@ class Executor:
                 info_dict["batch_idx"] = batch_idx
                 if cosyvoice_join(group_join, info_dict):
                     break
+                if info_dict["use_spk_embedding"] is True:
+                    batch_dict["embedding"] = batch_dict["spk_embedding"]
+                else:
+                    batch_dict["embedding"] = batch_dict["utt_embedding"]

                 # Disable gradient synchronizations across DDP processes.
                 # Within this context, gradients will be accumulated on module
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
index 10206e6..b67b528 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr
     scheduler_conf:
         warmup_steps: 25000
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
index c791c76..588086c 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
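
Usage sketch (illustrative only, not part of the patch series): with PATCH 4/5
applied, setting "scheduler: constantlr" in train_conf routes
init_optimizer_and_scheduler through the new branch that builds
ConstantLR(optimizer) with no scheduler_conf, so the learning rate stays at
whatever the optimizer was created with. The toy model and the 1e-5 value
below are placeholder assumptions taken from the "change to 1e-5 during sft"
comment; only the ConstantLR import path comes from the patches themselves.

    import torch
    from cosyvoice.utils.scheduler import ConstantLR

    model = torch.nn.Linear(4, 4)                              # stand-in module
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)  # sft-style lr
    scheduler = ConstantLR(optimizer)

    for step in range(3):
        optimizer.step()
        scheduler.step()
        # get_lr() always returns base_lrs, so the lr neither warms up nor decays
        print(step, scheduler.get_last_lr())                   # [1e-05] every step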