From ffa28e3bbda47952e758481b154e37173f0bc47d Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Sun, 29 Sep 2024 10:35:10 +0800 Subject: [PATCH 1/6] update token args --- cosyvoice/cli/model.py | 9 +++------ cosyvoice/llm/llm.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index ea0ec4a..489978d 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -31,8 +31,8 @@ class CosyVoiceModel: self.llm = llm self.flow = flow self.hift = hift - self.token_min_hop_len = 100 - self.token_max_hop_len = 200 + self.token_min_hop_len = 2 * self.flow.input_frame_rate + self.token_max_hop_len = 4 * self.flow.input_frame_rate self.token_overlap_len = 20 # mel fade in out self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256) @@ -87,10 +87,7 @@ class CosyVoiceModel: prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device), prompt_speech_token=llm_prompt_speech_token.to(self.device), prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device), - embedding=llm_embedding.to(self.device).half(), - sampling=25, - max_token_text_ratio=30, - min_token_text_ratio=3): + embedding=llm_embedding.to(self.device).half()): self.tts_speech_token_dict[uuid].append(i) self.llm_end_dict[uuid] = True diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py index eb377f1..00e4af0 100644 --- a/cosyvoice/llm/llm.py +++ b/cosyvoice/llm/llm.py @@ -197,7 +197,7 @@ class TransformerLM(torch.nn.Module): offset = 0 att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device) for i in range(max_len): - y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=0, required_cache_size=-1, + y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=offset, required_cache_size=-1, att_cache=att_cache, cnn_cache=cnn_cache, 
att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool)) From abc6f70ace1d63be8929d06faf4de7573f2129bc Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Sun, 29 Sep 2024 10:42:57 +0800 Subject: [PATCH 2/6] update 25hz yaml --- .../cosyvoice/conf/cosyvoice.fromscratch.yaml | 6 +++--- examples/libritts/cosyvoice/conf/cosyvoice.yaml | 6 +++--- .../cosyvoice/conf/cosyvoice.fromscratch.yaml | 11 ++++++++--- examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml | 11 ++++++++--- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml index 25d7269..0420d02 100644 --- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml +++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git 
a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml index bca3898..b2ff51c 100644 --- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml +++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml index 0f3495f..0420d02 100644 --- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml +++ b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -54,6 +54,11 @@ llm: 
!new:cosyvoice.llm.llm.TransformerLM pos_enc_layer_type: 'rel_pos_espnet' selfattention_layer_type: 'rel_selfattn' static_chunk_size: 1 + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec input_size: 512 @@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml index 6b084f2..b2ff51c 100644 --- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml +++ b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM pos_enc_layer_type: 'rel_pos_espnet' selfattention_layer_type: 'rel_selfattn' static_chunk_size: 1 + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec 
input_size: 512 @@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' From 2c1877a5d4c10c094df53c956374af16451ea789 Mon Sep 17 00:00:00 2001 From: zhuyunfeng <42790740+zhuzizyf@users.noreply.github.com> Date: Sun, 29 Sep 2024 13:29:28 +0800 Subject: [PATCH 3/6] Update frontend.py Fix potential issues caused by ending with a Chinese comma --- cosyvoice/cli/frontend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 8e12a1c..ac138b1 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -118,10 +118,10 @@ class CosyVoiceFrontEnd: text = text.replace("\n", "") text = replace_blank(text) text = replace_corner_mark(text) - text = text.replace(".", "、") + text = text.replace(".", "。") text = text.replace(" - ", ",") text = remove_bracket(text) - text = re.sub(r'[,,]+$', '。', text) + text = re.sub(r'[,,、]+$', '。', text) texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False)) else: From 9c0aa1918bef41e4a094af16664ab522b2e6e617 Mon Sep 17 00:00:00 2001 From: zhuyunfeng <42790740+zhuzizyf@users.noreply.github.com> Date: Sun, 29 Sep 2024 14:19:31 +0800 
Subject: [PATCH 4/6] Update frontend_utils.py Fix the bug in `split_paragraph` where the last sentence of multi-paragraph synthesized text loses its trailing punctuation. --- cosyvoice/utils/frontend_utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py index 7c6e19e..215cbe4 100644 --- a/cosyvoice/utils/frontend_utils.py +++ b/cosyvoice/utils/frontend_utils.py @@ -80,6 +80,13 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= pounc = ['.', '?', '!', ';', ':'] if comma_split: pounc.extend([',', ',']) + + if text[-1] not in pounc: + if lang == "zh": + text += "。" + else: + text += "." + st = 0 utts = [] for i, c in enumerate(text): @@ -92,11 +99,7 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= st = i + 2 else: st = i + 1 - if len(utts) == 0: - if lang == "zh": - utts.append(text + '。') - else: - utts.append(text + '.') + final_utts = [] cur_utt = "" for utt in utts: From 74a449ad1f2288595c0a37e2b345e6653002873f Mon Sep 17 00:00:00 2001 From: zhuyunfeng <42790740+zhuzizyf@users.noreply.github.com> Date: Sun, 29 Sep 2024 14:26:33 +0800 Subject: [PATCH 5/6] Update frontend_utils.py Fix typo --- cosyvoice/utils/frontend_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py index 215cbe4..5514193 100644 --- a/cosyvoice/utils/frontend_utils.py +++ b/cosyvoice/utils/frontend_utils.py @@ -86,7 +86,7 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= text += "。" else: text += "." 
- + st = 0 utts = [] for i, c in enumerate(text): From 0b76dfa1eb8e4b7768180359ec7043024ba7cef4 Mon Sep 17 00:00:00 2001 From: zhuyunfeng <42790740+zhuzizyf@users.noreply.github.com> Date: Sun, 29 Sep 2024 14:41:33 +0800 Subject: [PATCH 6/6] Update frontend_utils.py Fix typo --- cosyvoice/utils/frontend_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py index 5514193..ab01a1f 100644 --- a/cosyvoice/utils/frontend_utils.py +++ b/cosyvoice/utils/frontend_utils.py @@ -86,7 +86,7 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= text += "。" else: text += "." - + st = 0 utts = [] for i, c in enumerate(text):