From 144f1719f1564a1815b44da69836ac08c9d40e11 Mon Sep 17 00:00:00 2001
From: DBin_K <DBinKv1@Gmail.com>
Date: Mon, 8 Jul 2024 17:37:30 +0800
Subject: [PATCH 1/8] Update README.md

Correct spelling of "official"
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index d341d97..69438e3 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,7 @@ python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
 
 You can directly discuss on [Github Issues](https://github.com/FunAudioLLM/CosyVoice/issues).
 
-You can also scan the QR code to join our officla Dingding chat group.
+You can also scan the QR code to join our official Dingding chat group.
 
 <img src="./asset/dingding.png" width="250px">
 

From 2f496104ecc7e79627bb17d5d7ac3c2d4cbff539 Mon Sep 17 00:00:00 2001
From: passerbya <hanghang3103@163.com>
Date: Tue, 9 Jul 2024 08:17:34 +0800
Subject: [PATCH 2/8] =?UTF-8?q?=E5=8D=8A=E8=A7=92=E5=8F=A5=E5=8F=B7?=
 =?UTF-8?q?=E4=BC=9A=E5=AF=BC=E8=87=B4=E5=90=88=E6=88=90=E5=A4=B1=E8=B4=A5?=
 =?UTF-8?q?=EF=BC=9ARuntimeError:=20torch.cat():=20expected=20a=20non-empt?=
 =?UTF-8?q?y=20list=20of=20Tensors?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

text='小明因为感冒,鼻子不通,讲话总带着齉音.'
  File "/usr/local/data/CosyVoice/cosyvoice/cli/cosyvoice.py", line 62, in inference_zero_shot
    return {'tts_speech': torch.concat(tts_speeches, dim=1)}
RuntimeError: torch.cat(): expected a non-empty list of Tensors

原因为self.frontend.text_normalize(tts_text, split=True)返回为空
---
 cosyvoice/utils/frontend_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py
index dee829f..bcd637c 100644
--- a/cosyvoice/utils/frontend_utils.py
+++ b/cosyvoice/utils/frontend_utils.py
@@ -74,7 +74,7 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=
             return len(tokenize(_text)) < merge_len
 
     if lang == "zh":
-        pounc = ['。', '？', '！', '；', '：', '.', '?', '!', ';']
+        pounc = ['。', '？', '！', '；', '：', '、', '.', '?', '!', ';']
     else:
         pounc = ['.', '?', '!', ';', ':']
     if comma_split:

From 88c8bf7b9e545c440dd81e3f161af3d7a04f6bf6 Mon Sep 17 00:00:00 2001
From: passerbya <hanghang3103@163.com>
Date: Tue, 9 Jul 2024 08:22:06 +0800
Subject: [PATCH 3/8] =?UTF-8?q?=E6=9B=B4=E6=8D=A2=E5=89=8D=E7=AB=AF?=
 =?UTF-8?q?=E4=B8=BAWeTextProcessing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosyvoice/cli/frontend.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index 32e5539..f10f655 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -21,12 +21,9 @@ import torchaudio.compliance.kaldi as kaldi
 import torchaudio
 import os
 import inflect
-try:
-    import ttsfrd
-    use_ttsfrd = True
-except:
-    print("failed to import ttsfrd, please normalize input text manually")
-    use_ttsfrd = False
+from tn.chinese.normalizer import Normalizer as ZhNormalizer
+from tn.english.normalizer import Normalizer as EnNormalizer
+
 from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
 
 
@@ -53,14 +50,8 @@ class CosyVoiceFrontEnd:
         self.instruct = instruct
         self.allowed_special = allowed_special
         self.inflect_parser = inflect.engine()
-        self.use_ttsfrd = use_ttsfrd
-        if self.use_ttsfrd:
-            self.frd = ttsfrd.TtsFrontendEngine()
-            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
-            assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, 'failed to initialize ttsfrd resource'
-            self.frd.set_lang_type('pinyin')
-            self.frd.enable_pinyin_mix(True)
-            self.frd.set_breakmodel_index(1)
+        self.zh_tn_model = ZhNormalizer(remove_erhua=False,full_to_half=False)
+        self.en_tn_model = EnNormalizer()
 
     def _extract_text_token(self, text):
         text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
@@ -95,8 +86,7 @@ class CosyVoiceFrontEnd:
     def text_normalize(self, text, split=True):
         text = text.strip()
         if contains_chinese(text):
-            if self.use_ttsfrd:
-                text = self.frd.get_frd_extra_info(text, 'input')
+            text = self.zh_tn_model.normalize(text)
             text = text.replace("\n", "")
             text = replace_blank(text)
             text = replace_corner_mark(text)
@@ -107,6 +97,7 @@ class CosyVoiceFrontEnd:
                                                 token_min_n=60, merge_len=20,
                                                 comma_split=False)]
         else:
+            text = self.en_tn_model.normalize(text)
             text = spell_out_number(text, self.inflect_parser)
             texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                                 token_min_n=60, merge_len=20,

From 39afb98fa1e3dbf4f505ba065d81f5342390eb74 Mon Sep 17 00:00:00 2001
From: passerbya <hanghang3103@163.com>
Date: Tue, 9 Jul 2024 08:22:31 +0800
Subject: [PATCH 4/8] =?UTF-8?q?=E6=9B=B4=E6=8D=A2=E5=89=8D=E7=AB=AF?=
 =?UTF-8?q?=E4=B8=BAWeTextProcessing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 39e1374..24639f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,4 +25,5 @@ soundfile==0.12.1
 tensorboard==2.14.0
 torch==2.0.1
 torchaudio==2.0.2
-wget==3.2
\ No newline at end of file
+wget==3.2
+WeTextProcessing

From 95b8866f3c40e3f94c74cb5c7bb28c839eb341c8 Mon Sep 17 00:00:00 2001
From: passerbya <hanghang3103@163.com>
Date: Tue, 9 Jul 2024 17:25:55 +0800
Subject: [PATCH 5/8] =?UTF-8?q?=E4=BC=98=E5=85=88=E4=BD=BF=E7=94=A8ttsfrd?=
 =?UTF-8?q?=EF=BC=8Cttsfrd=E4=B8=8D=E5=AD=98=E5=9C=A8=E6=97=B6=E4=BD=BF?=
 =?UTF-8?q?=E7=94=A8WeTextProcessing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosyvoice/cli/frontend.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index f10f655..c30135a 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -23,7 +23,12 @@ import os
 import inflect
 from tn.chinese.normalizer import Normalizer as ZhNormalizer
 from tn.english.normalizer import Normalizer as EnNormalizer
-
+try:
+    import ttsfrd
+    use_ttsfrd = True
+except:
+    print("failed to import ttsfrd, please normalize input text manually")
+    use_ttsfrd = False
 from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
 
 
@@ -50,8 +55,17 @@ class CosyVoiceFrontEnd:
         self.instruct = instruct
         self.allowed_special = allowed_special
         self.inflect_parser = inflect.engine()
-        self.zh_tn_model = ZhNormalizer(remove_erhua=False,full_to_half=False)
-        self.en_tn_model = EnNormalizer()
+        self.use_ttsfrd = use_ttsfrd
+        if self.use_ttsfrd:
+            self.frd = ttsfrd.TtsFrontendEngine()
+            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+            assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, 'failed to initialize ttsfrd resource'
+            self.frd.set_lang_type('pinyin')
+            self.frd.enable_pinyin_mix(True)
+            self.frd.set_breakmodel_index(1)
+        else:
+            self.zh_tn_model = ZhNormalizer(remove_erhua=False,full_to_half=False)
+            self.en_tn_model = EnNormalizer()	
 
     def _extract_text_token(self, text):
         text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
@@ -86,7 +100,10 @@ class CosyVoiceFrontEnd:
     def text_normalize(self, text, split=True):
         text = text.strip()
         if contains_chinese(text):
-            text = self.zh_tn_model.normalize(text)
+            if self.use_ttsfrd:
+                text = self.frd.get_frd_extra_info(text, 'input')
+            else:
+                text = self.zh_tn_model.normalize(text)	
             text = text.replace("\n", "")
             text = replace_blank(text)
             text = replace_corner_mark(text)

From f9fe31f2005acecd865c34c2c1cd29cd9052460e Mon Sep 17 00:00:00 2001
From: passerbya <hanghang3103@163.com>
Date: Tue, 9 Jul 2024 17:26:19 +0800
Subject: [PATCH 6/8] =?UTF-8?q?=E6=96=87=E6=9C=AC=E4=B8=AD=E6=B2=A1?=
 =?UTF-8?q?=E6=9C=89=E6=A0=87=E7=82=B9=E6=97=B6=E6=97=A0=E6=B3=95=E5=90=88?=
 =?UTF-8?q?=E6=88=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosyvoice/utils/frontend_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py
index bcd637c..196c82f 100644
--- a/cosyvoice/utils/frontend_utils.py
+++ b/cosyvoice/utils/frontend_utils.py
@@ -91,6 +91,8 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=
                 st = i + 2
             else:
                 st = i + 1
+    if len(utts) == 0:
+        utts.append(text)
     final_utts = []
     cur_utt = ""
     for utt in utts:

From 69026d83bba336e2cb51ecfe696c04953437489d Mon Sep 17 00:00:00 2001
From: passerbya <hanghang3103@163.com>
Date: Tue, 9 Jul 2024 17:42:40 +0800
Subject: [PATCH 7/8] =?UTF-8?q?=E6=B2=A1=E6=9C=89=E6=A0=87=E7=82=B9?=
 =?UTF-8?q?=E7=BB=93=E5=B0=BE=E6=97=B6=E9=BB=98=E8=AE=A4=E5=8A=A0=E4=B8=8A?=
 =?UTF-8?q?=E5=8F=A5=E5=8F=B7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosyvoice/utils/frontend_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py
index 196c82f..59489a7 100644
--- a/cosyvoice/utils/frontend_utils.py
+++ b/cosyvoice/utils/frontend_utils.py
@@ -92,7 +92,10 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=
             else:
                 st = i + 1
     if len(utts) == 0:
-        utts.append(text)
+        if lang == "zh":
+            utts.append(text + '。')
+        else:
+            utts.append(text + '.')
     final_utts = []
     cur_utt = ""
     for utt in utts:

From 798179652357711796aa937e9a4b17a813c60afd Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx" <lyuxiang.lx@alibaba-inc.com>
Date: Tue, 9 Jul 2024 23:37:54 +0800
Subject: [PATCH 8/8] add WeTextProcessing

---
 README.md                      |  6 ++++--
 cosyvoice/cli/frontend.py      | 14 +++++++-------
 cosyvoice/dataset/processor.py |  1 -
 requirements.txt               |  2 +-
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 69438e3..3b4783a 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,8 @@ git submodule update --init --recursive
 ``` sh
 conda create -n cosyvoice python=3.8
 conda activate cosyvoice
+# pynini is required by WeTextProcessing; use conda to install it, as the conda package works on all platforms.
+conda install -y -c conda-forge pynini==2.1.5
 pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
 
 # If you encounter sox compatibility issues
@@ -55,9 +57,9 @@ git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_m
 git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
 ```
 
-Optionaly, you can unzip `ttsfrd` resouce and install `ttsfrd` package.
+Optionally, you can unzip `ttsfrd` resource and install `ttsfrd` package for better text normalization performance.
 
-Notice that this step is not necessary. If you do not install `ttsfrd` package, you need to normalize input text manually.
+Notice that this step is not necessary. If you do not install `ttsfrd` package, we will use WeTextProcessing by default.
 
 ``` sh
 cd pretrained_models/CosyVoice-ttsfrd/
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index c30135a..d2983b7 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -21,13 +21,13 @@ import torchaudio.compliance.kaldi as kaldi
 import torchaudio
 import os
 import inflect
-from tn.chinese.normalizer import Normalizer as ZhNormalizer
-from tn.english.normalizer import Normalizer as EnNormalizer
 try:
     import ttsfrd
     use_ttsfrd = True
-except:
-    print("failed to import ttsfrd, please normalize input text manually")
+except ImportError:
+    print("failed to import ttsfrd, use WeTextProcessing instead")
+    from tn.chinese.normalizer import Normalizer as ZhNormalizer
+    from tn.english.normalizer import Normalizer as EnNormalizer
     use_ttsfrd = False
 from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
 
@@ -64,8 +64,8 @@ class CosyVoiceFrontEnd:
             self.frd.enable_pinyin_mix(True)
             self.frd.set_breakmodel_index(1)
         else:
-            self.zh_tn_model = ZhNormalizer(remove_erhua=False,full_to_half=False)
-            self.en_tn_model = EnNormalizer()	
+            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
+            self.en_tn_model = EnNormalizer()
 
     def _extract_text_token(self, text):
         text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
@@ -103,7 +103,7 @@ class CosyVoiceFrontEnd:
             if self.use_ttsfrd:
                 text = self.frd.get_frd_extra_info(text, 'input')
             else:
-                text = self.zh_tn_model.normalize(text)	
+                text = self.zh_tn_model.normalize(text)
             text = text.replace("\n", "")
             text = replace_blank(text)
             text = replace_corner_mark(text)
diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py
index fa8b339..9477d02 100644
--- a/cosyvoice/dataset/processor.py
+++ b/cosyvoice/dataset/processor.py
@@ -22,7 +22,6 @@ from torch.nn.utils.rnn import pad_sequence
 import torch.nn.functional as F
 
 torchaudio.set_audio_backend('soundfile')
-torchaudio.utils.sox_utils.set_buffer_size(16500)
 
 AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
 
diff --git a/requirements.txt b/requirements.txt
index 24639f2..46df823 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,4 +26,4 @@ tensorboard==2.14.0
 torch==2.0.1
 torchaudio==2.0.2
 wget==3.2
-WeTextProcessing
+WeTextProcessing==1.0.3