From ee9e87b4d3e239acbb9ab0c2e2ed739baea55533 Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Tue, 9 Jul 2024 23:48:23 +0800
Subject: [PATCH 1/5] add empty cache

---
 cosyvoice/cli/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 98f19b2..f4625e3 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -56,4 +56,5 @@ class CosyVoiceModel:
                                       prompt_feat_len=prompt_speech_feat_len.to(self.device),
                                       embedding=flow_embedding.to(self.device))
         tts_speech = self.hift.inference(mel=tts_mel).cpu()
+        torch.cuda.empty_cache()
         return {'tts_speech': tts_speech}

From 6a3e44242ad24f01ba64430d8f6ac5718442b0da Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Wed, 10 Jul 2024 00:21:56 +0800
Subject: [PATCH 2/5] keep only embedding mean as spk embedding

---
 cosyvoice/dataset/processor.py | 2 +-
 tools/extract_embedding.py     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py
index 9477d02..cb34a0c 100644
--- a/cosyvoice/dataset/processor.py
+++ b/cosyvoice/dataset/processor.py
@@ -167,7 +167,7 @@ def parse_embedding(data, normalize, mode='train'):
     """
     for sample in data:
         sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
-        sample['spk_embedding'] = torch.stack([torch.tensor(i, dtype=torch.float32) for i in sample['spk_embedding']], dim=0).mean(dim=0)
+        sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
         if normalize:
             sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
             sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
diff --git a/tools/extract_embedding.py b/tools/extract_embedding.py
index 02fa2f6..9c6f568 100755
--- a/tools/extract_embedding.py
+++ b/tools/extract_embedding.py
@@ -53,6 +53,8 @@ def main(args):
         if spk not in spk2embedding:
             spk2embedding[spk] = []
         spk2embedding[spk].append(embedding)
+    for k, v in spk2embedding.items():
+        spk2embedding[k] = torch.tensor(v).mean(dim=0, keepdim=True).tolist()
     torch.save(utt2embedding, '{}/utt2embedding.pt'.format(args.dir))
     torch.save(spk2embedding, '{}/spk2embedding.pt'.format(args.dir))

From 225b56de052ad53afa6ca84612cd6da2e5d6d298 Mon Sep 17 00:00:00 2001
From: cyz
Date: Wed, 10 Jul 2024 12:02:41 +0800
Subject: [PATCH 3/5] FIX: fix the error raised when generating audio with
 natural language control; the exception was: AttributeError:
 'CosyVoiceFrontEnd' object has no attribute 'en_tn_model'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosyvoice/cli/frontend.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index d2983b7..0fb9ca9 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -114,7 +114,10 @@ class CosyVoiceFrontEnd:
                                                 token_min_n=60, merge_len=20,
                                                 comma_split=False)]
         else:
-            text = self.en_tn_model.normalize(text)
+            if self.use_ttsfrd:
+                text = self.frd.get_frd_extra_info(text, 'input')
+            else:
+                text = self.en_tn_model.normalize(text)
             text = spell_out_number(text, self.inflect_parser)
             texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                                token_min_n=60, merge_len=20,

From 793a24862caed695a375661ae57fa99b5fc7793f Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Wed, 10 Jul 2024 16:37:25 +0800
Subject: [PATCH 4/5] add constant lr scheduler

---
 cosyvoice/utils/scheduler.py                    | 22 +++++++++++++++++++
 cosyvoice/utils/train_utils.py                  |  5 ++++-
 .../libritts/cosyvoice/conf/cosyvoice.yaml      |  4 ++--
 tools/extract_embedding.py                      |  2 +-
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/cosyvoice/utils/scheduler.py b/cosyvoice/utils/scheduler.py
index eed1ea0..fbf4803 100644
--- a/cosyvoice/utils/scheduler.py
+++ b/cosyvoice/utils/scheduler.py
@@ -715,3 +715,25 @@ class NoamHoldAnnealing(WarmupHoldPolicy):

     def set_step(self, step: int):
         self.last_epoch = step
+
+
+class ConstantLR(_LRScheduler):
+    """The ConstantLR scheduler
+
+    This scheduler keeps a constant lr
+
+    """
+
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+    ):
+        # __init__() must be invoked before setting field
+        # because step() is also invoked in __init__()
+        super().__init__(optimizer)
+
+    def get_lr(self):
+        return self.base_lrs
+
+    def set_step(self, step: int):
+        self.last_epoch = step
diff --git a/cosyvoice/utils/train_utils.py b/cosyvoice/utils/train_utils.py
index df3a321..f8d7b45 100644
--- a/cosyvoice/utils/train_utils.py
+++ b/cosyvoice/utils/train_utils.py
@@ -34,7 +34,7 @@ from torch.nn.utils import clip_grad_norm_
 from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live

 from cosyvoice.dataset.dataset import Dataset
-from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing
+from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR


 def init_distributed(args):
@@ -122,6 +122,9 @@ def init_optimizer_and_scheduler(args, configs, model):
     elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
         scheduler_type = NoamHoldAnnealing
         scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
+    elif configs['train_conf']['scheduler'] == 'constantlr':
+        scheduler_type = ConstantLR
+        scheduler = ConstantLR(optimizer)
     else:
         raise ValueError("unknown scheduler: " + configs['train_conf'])
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
index cc5eee0..c791c76 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
@@ -186,8 +186,8 @@ data_pipeline: [
 train_conf:
     optim: adam
     optim_conf:
-        lr: 0.001
-    scheduler: warmuplr
+        lr: 0.001 # change to 1e-5 during sft
+    scheduler: warmuplr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
     max_epoch: 200
diff --git a/tools/extract_embedding.py b/tools/extract_embedding.py
index 9c6f568..96a043c 100755
--- a/tools/extract_embedding.py
+++ b/tools/extract_embedding.py
@@ -54,7 +54,7 @@ def main(args):
             spk2embedding[spk] = []
         spk2embedding[spk].append(embedding)
     for k, v in spk2embedding.items():
-        spk2embedding[k] = torch.tensor(v).mean(dim=0, keepdim=True).tolist()
+        spk2embedding[k] = torch.tensor(v).mean(dim=0).tolist()
     torch.save(utt2embedding, '{}/utt2embedding.pt'.format(args.dir))
     torch.save(spk2embedding, '{}/spk2embedding.pt'.format(args.dir))

From 0fd15bb12b9f79bbdb86a496b920311130f1710c Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Wed, 10 Jul 2024 17:49:32 +0800
Subject: [PATCH 5/5] use spk_embedding when sft

---
 cosyvoice/flow/flow.py                                       | 2 +-
 cosyvoice/llm/llm.py                                         | 2 +-
 cosyvoice/utils/executor.py                                  | 4 ++++
 examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml | 1 +
 examples/libritts/cosyvoice/conf/cosyvoice.yaml              | 1 +
 5 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py
index d0dbcd0..009160a 100644
--- a/cosyvoice/flow/flow.py
+++ b/cosyvoice/flow/flow.py
@@ -60,7 +60,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         token_len = batch['speech_token_len'].to(device)
         feat = batch['speech_feat'].to(device)
         feat_len = batch['speech_feat_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)

         # xvec projection
         embedding = F.normalize(embedding, dim=1)
diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py
index 05c22ef..3b418c5 100644
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -97,7 +97,7 @@ class TransformerLM(torch.nn.Module):
         text_token_len = batch['text_token_len'].to(device)
         speech_token = batch['speech_token'].to(device)
         speech_token_len = batch['speech_token_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)

         # 1. prepare llm_target
         lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
diff --git a/cosyvoice/utils/executor.py b/cosyvoice/utils/executor.py
index c12e52d..f7dfb0e 100644
--- a/cosyvoice/utils/executor.py
+++ b/cosyvoice/utils/executor.py
@@ -52,6 +52,10 @@ class Executor:
                 info_dict["batch_idx"] = batch_idx
                 if cosyvoice_join(group_join, info_dict):
                     break
+                if info_dict["use_spk_embedding"] is True:
+                    batch_dict["embedding"] = batch_dict["spk_embedding"]
+                else:
+                    batch_dict["embedding"] = batch_dict["utt_embedding"]

                 # Disable gradient synchronizations across DDP processes.
                 # Within this context, gradients will be accumulated on module
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
index 10206e6..b67b528 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr
     scheduler_conf:
         warmup_steps: 25000
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
index c791c76..588086c 100644
--- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
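
Usage sketch (illustrative only, not part of the patch series): with PATCH 4/5
applied, setting "scheduler: constantlr" in train_conf routes
init_optimizer_and_scheduler through the new branch that builds
ConstantLR(optimizer) with no scheduler_conf, so the learning rate stays at
whatever the optimizer was created with. The toy model and the 1e-5 value
below are placeholder assumptions taken from the "change to 1e-5 during sft"
comment; only the ConstantLR import path comes from the patches themselves.

    import torch
    from cosyvoice.utils.scheduler import ConstantLR

    model = torch.nn.Linear(4, 4)                              # stand-in module
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)  # sft-style lr
    scheduler = ConstantLR(optimizer)

    for step in range(3):
        optimizer.step()
        scheduler.step()
        # get_lr() always returns base_lrs, so the lr neither warms up nor decays
        print(step, scheduler.get_last_lr())                   # [1e-05] every step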