From 144f1719f1564a1815b44da69836ac08c9d40e11 Mon Sep 17 00:00:00 2001 From: DBin_K Date: Mon, 8 Jul 2024 17:37:30 +0800 Subject: [PATCH 1/8] Update README.md correct correct spelling --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d341d97..69438e3 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ python3 client.py --port 50000 --mode You can directly discuss on [Github Issues](https://github.com/FunAudioLLM/CosyVoice/issues). -You can also scan the QR code to join our officla Dingding chat group. +You can also scan the QR code to join our official Dingding chat group. From 2f496104ecc7e79627bb17d5d7ac3c2d4cbff539 Mon Sep 17 00:00:00 2001 From: passerbya Date: Tue, 9 Jul 2024 08:17:34 +0800 Subject: [PATCH 2/8] =?UTF-8?q?=E5=8D=8A=E8=A7=92=E5=8F=A5=E5=8F=B7?= =?UTF-8?q?=E4=BC=9A=E5=AF=BC=E8=87=B4=E5=90=88=E6=88=90=E5=A4=B1=E8=B4=A5?= =?UTF-8?q?=EF=BC=9ARuntimeError:=20torch.cat():=20expected=20a=20non-empt?= =?UTF-8?q?y=20list=20of=20Tensors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit text='小明因为感冒,鼻子不通,讲话总带着齉音.' 
File "/usr/local/data/CosyVoice/cosyvoice/cli/cosyvoice.py", line 62, in inference_zero_shot return {'tts_speech': torch.concat(tts_speeches, dim=1)} RuntimeError: torch.cat(): expected a non-empty list of Tensors 原因为self.frontend.text_normalize(tts_text, split=True)返回为空 --- cosyvoice/utils/frontend_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py index dee829f..bcd637c 100644 --- a/cosyvoice/utils/frontend_utils.py +++ b/cosyvoice/utils/frontend_utils.py @@ -74,7 +74,7 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= return len(tokenize(_text)) < merge_len if lang == "zh": - pounc = ['。', '?', '!', ';', ':', '.', '?', '!', ';'] + pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] else: pounc = ['.', '?', '!', ';', ':'] if comma_split: From 88c8bf7b9e545c440dd81e3f161af3d7a04f6bf6 Mon Sep 17 00:00:00 2001 From: passerbya Date: Tue, 9 Jul 2024 08:22:06 +0800 Subject: [PATCH 3/8] =?UTF-8?q?=E6=9B=B4=E6=8D=A2=E5=89=8D=E7=AB=AF?= =?UTF-8?q?=E4=B8=BAWeTextProcessing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cosyvoice/cli/frontend.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 32e5539..f10f655 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -21,12 +21,9 @@ import torchaudio.compliance.kaldi as kaldi import torchaudio import os import inflect -try: - import ttsfrd - use_ttsfrd = True -except: - print("failed to import ttsfrd, please normalize input text manually") - use_ttsfrd = False +from tn.chinese.normalizer import Normalizer as ZhNormalizer +from tn.english.normalizer import Normalizer as EnNormalizer + from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph 
@@ -53,14 +50,8 @@ class CosyVoiceFrontEnd: self.instruct = instruct self.allowed_special = allowed_special self.inflect_parser = inflect.engine() - self.use_ttsfrd = use_ttsfrd - if self.use_ttsfrd: - self.frd = ttsfrd.TtsFrontendEngine() - ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) - assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, 'failed to initialize ttsfrd resource' - self.frd.set_lang_type('pinyin') - self.frd.enable_pinyin_mix(True) - self.frd.set_breakmodel_index(1) + self.zh_tn_model = ZhNormalizer(remove_erhua=False,full_to_half=False) + self.en_tn_model = EnNormalizer() def _extract_text_token(self, text): text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special) @@ -95,8 +86,7 @@ class CosyVoiceFrontEnd: def text_normalize(self, text, split=True): text = text.strip() if contains_chinese(text): - if self.use_ttsfrd: - text = self.frd.get_frd_extra_info(text, 'input') + text = self.zh_tn_model.normalize(text) text = text.replace("\n", "") text = replace_blank(text) text = replace_corner_mark(text) @@ -107,6 +97,7 @@ class CosyVoiceFrontEnd: token_min_n=60, merge_len=20, comma_split=False)] else: + text = self.en_tn_model.normalize(text) text = spell_out_number(text, self.inflect_parser) texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, token_min_n=60, merge_len=20, From 39afb98fa1e3dbf4f505ba065d81f5342390eb74 Mon Sep 17 00:00:00 2001 From: passerbya Date: Tue, 9 Jul 2024 08:22:31 +0800 Subject: [PATCH 4/8] =?UTF-8?q?=E6=9B=B4=E6=8D=A2=E5=89=8D=E7=AB=AF?= =?UTF-8?q?=E4=B8=BAWeTextProcessing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 39e1374..24639f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ 
-25,4 +25,5 @@ soundfile==0.12.1 tensorboard==2.14.0 torch==2.0.1 torchaudio==2.0.2 -wget==3.2 \ No newline at end of file +wget==3.2 +WeTextProcessing From 95b8866f3c40e3f94c74cb5c7bb28c839eb341c8 Mon Sep 17 00:00:00 2001 From: passerbya Date: Tue, 9 Jul 2024 17:25:55 +0800 Subject: [PATCH 5/8] =?UTF-8?q?=E4=BC=98=E5=85=88=E4=BD=BF=E7=94=A8ttsfrd?= =?UTF-8?q?=EF=BC=8Cttsfrd=E4=B8=8D=E5=AD=98=E5=9C=A8=E6=97=B6=E4=BD=BF?= =?UTF-8?q?=E7=94=A8WeTextProcessing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cosyvoice/cli/frontend.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index f10f655..c30135a 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -23,7 +23,12 @@ import os import inflect from tn.chinese.normalizer import Normalizer as ZhNormalizer from tn.english.normalizer import Normalizer as EnNormalizer - +try: + import ttsfrd + use_ttsfrd = True +except: + print("failed to import ttsfrd, please normalize input text manually") + use_ttsfrd = False from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph @@ -50,8 +55,17 @@ class CosyVoiceFrontEnd: self.instruct = instruct self.allowed_special = allowed_special self.inflect_parser = inflect.engine() - self.zh_tn_model = ZhNormalizer(remove_erhua=False,full_to_half=False) - self.en_tn_model = EnNormalizer() + self.use_ttsfrd = use_ttsfrd + if self.use_ttsfrd: + self.frd = ttsfrd.TtsFrontendEngine() + ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, 'failed to initialize ttsfrd resource' + self.frd.set_lang_type('pinyin') + self.frd.enable_pinyin_mix(True) + self.frd.set_breakmodel_index(1) + else: + self.zh_tn_model = 
ZhNormalizer(remove_erhua=False,full_to_half=False) + self.en_tn_model = EnNormalizer() def _extract_text_token(self, text): text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special) @@ -86,7 +100,10 @@ class CosyVoiceFrontEnd: def text_normalize(self, text, split=True): text = text.strip() if contains_chinese(text): - text = self.zh_tn_model.normalize(text) + if self.use_ttsfrd: + text = self.frd.get_frd_extra_info(text, 'input') + else: + text = self.zh_tn_model.normalize(text) text = text.replace("\n", "") text = replace_blank(text) text = replace_corner_mark(text) From f9fe31f2005acecd865c34c2c1cd29cd9052460e Mon Sep 17 00:00:00 2001 From: passerbya Date: Tue, 9 Jul 2024 17:26:19 +0800 Subject: [PATCH 6/8] =?UTF-8?q?=E6=96=87=E6=9C=AC=E4=B8=AD=E6=B2=A1?= =?UTF-8?q?=E6=9C=89=E6=A0=87=E7=82=B9=E6=97=B6=E6=97=A0=E6=B3=95=E5=90=88?= =?UTF-8?q?=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cosyvoice/utils/frontend_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py index bcd637c..196c82f 100644 --- a/cosyvoice/utils/frontend_utils.py +++ b/cosyvoice/utils/frontend_utils.py @@ -91,6 +91,8 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= st = i + 2 else: st = i + 1 + if len(utts) == 0: + utts.append(text) final_utts = [] cur_utt = "" for utt in utts: From 69026d83bba336e2cb51ecfe696c04953437489d Mon Sep 17 00:00:00 2001 From: passerbya Date: Tue, 9 Jul 2024 17:42:40 +0800 Subject: [PATCH 7/8] =?UTF-8?q?=E6=B2=A1=E6=9C=89=E6=A0=87=E7=82=B9?= =?UTF-8?q?=E7=BB=93=E5=B0=BE=E6=97=B6=E9=BB=98=E8=AE=A4=E5=8A=A0=E4=B8=8A?= =?UTF-8?q?=E5=8F=A5=E5=8F=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cosyvoice/utils/frontend_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cosyvoice/utils/frontend_utils.py 
b/cosyvoice/utils/frontend_utils.py index 196c82f..59489a7 100644 --- a/cosyvoice/utils/frontend_utils.py +++ b/cosyvoice/utils/frontend_utils.py @@ -92,7 +92,10 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= else: st = i + 1 if len(utts) == 0: - utts.append(text) + if lang == "zh": + utts.append(text + '。') + else: + utts.append(text + '.') final_utts = [] cur_utt = "" for utt in utts: From 798179652357711796aa937e9a4b17a813c60afd Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Tue, 9 Jul 2024 23:37:54 +0800 Subject: [PATCH 8/8] add WeTextProcessing --- README.md | 6 ++++-- cosyvoice/cli/frontend.py | 14 +++++++------- cosyvoice/dataset/processor.py | 1 - requirements.txt | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 69438e3..3b4783a 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ git submodule update --init --recursive ``` sh conda create -n cosyvoice python=3.8 conda activate cosyvoice +# pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platforms. +conda install -y -c conda-forge pynini==2.1.5 pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com # If you encounter sox compatibility issues @@ -55,9 +57,9 @@ git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_m git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd ``` -Optionaly, you can unzip `ttsfrd` resouce and install `ttsfrd` package. +Optionally, you can unzip `ttsfrd` resource and install `ttsfrd` package for better text normalization performance. -Notice that this step is not necessary. If you do not install `ttsfrd` package, you need to normalize input text manually. +Notice that this step is not necessary. If you do not install `ttsfrd` package, we will use WeTextProcessing by default. 
``` sh cd pretrained_models/CosyVoice-ttsfrd/ diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index c30135a..d2983b7 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -21,13 +21,13 @@ import torchaudio.compliance.kaldi as kaldi import torchaudio import os import inflect -from tn.chinese.normalizer import Normalizer as ZhNormalizer -from tn.english.normalizer import Normalizer as EnNormalizer try: import ttsfrd use_ttsfrd = True -except: - print("failed to import ttsfrd, please normalize input text manually") +except ImportError: + print("failed to import ttsfrd, use WeTextProcessing instead") + from tn.chinese.normalizer import Normalizer as ZhNormalizer + from tn.english.normalizer import Normalizer as EnNormalizer use_ttsfrd = False from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph @@ -64,8 +64,8 @@ class CosyVoiceFrontEnd: self.frd.enable_pinyin_mix(True) self.frd.set_breakmodel_index(1) else: - self.zh_tn_model = ZhNormalizer(remove_erhua=False,full_to_half=False) - self.en_tn_model = EnNormalizer() + self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False) + self.en_tn_model = EnNormalizer() def _extract_text_token(self, text): text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special) @@ -103,7 +103,7 @@ class CosyVoiceFrontEnd: if self.use_ttsfrd: text = self.frd.get_frd_extra_info(text, 'input') else: - text = self.zh_tn_model.normalize(text) + text = self.zh_tn_model.normalize(text) text = text.replace("\n", "") text = replace_blank(text) text = replace_corner_mark(text) diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py index fa8b339..9477d02 100644 --- a/cosyvoice/dataset/processor.py +++ b/cosyvoice/dataset/processor.py @@ -22,7 +22,6 @@ from torch.nn.utils.rnn import pad_sequence import torch.nn.functional as F torchaudio.set_audio_backend('soundfile') 
-torchaudio.utils.sox_utils.set_buffer_size(16500) AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) diff --git a/requirements.txt b/requirements.txt index 24639f2..46df823 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,4 +26,4 @@ tensorboard==2.14.0 torch==2.0.1 torchaudio==2.0.2 wget==3.2 -WeTextProcessing +WeTextProcessing==1.0.3