diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 8e12a1c..ac138b1 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -118,10 +118,10 @@ class CosyVoiceFrontEnd: text = text.replace("\n", "") text = replace_blank(text) text = replace_corner_mark(text) - text = text.replace(".", "、") + text = text.replace(".", "。") text = text.replace(" - ", ",") text = remove_bracket(text) - text = re.sub(r'[,,]+$', '。', text) + text = re.sub(r'[,,、]+$', '。', text) texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False)) else: diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index ea0ec4a..489978d 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -31,8 +31,8 @@ class CosyVoiceModel: self.llm = llm self.flow = flow self.hift = hift - self.token_min_hop_len = 100 - self.token_max_hop_len = 200 + self.token_min_hop_len = 2 * self.flow.input_frame_rate + self.token_max_hop_len = 4 * self.flow.input_frame_rate self.token_overlap_len = 20 # mel fade in out self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256) @@ -87,10 +87,7 @@ class CosyVoiceModel: prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device), prompt_speech_token=llm_prompt_speech_token.to(self.device), prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device), - embedding=llm_embedding.to(self.device).half(), - sampling=25, - max_token_text_ratio=30, - min_token_text_ratio=3): + embedding=llm_embedding.to(self.device).half()): self.tts_speech_token_dict[uuid].append(i) self.llm_end_dict[uuid] = True diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py index eb377f1..00e4af0 100644 --- a/cosyvoice/llm/llm.py +++ b/cosyvoice/llm/llm.py @@ -197,7 +197,7 @@ class TransformerLM(torch.nn.Module): offset = 0 att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device) for i in range(max_len): - y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=0, required_cache_size=-1, + y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=offset, required_cache_size=-1, att_cache=att_cache, cnn_cache=cnn_cache, att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool)) diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py index 7c6e19e..ab01a1f 100644 --- a/cosyvoice/utils/frontend_utils.py +++ b/cosyvoice/utils/frontend_utils.py @@ -80,6 +80,13 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= pounc = ['.', '?', '!', ';', ':'] if comma_split: pounc.extend([',', ',']) + + if text[-1] not in pounc: + if lang == "zh": + text += "。" + else: + text += "." + st = 0 utts = [] for i, c in enumerate(text): @@ -92,11 +99,7 @@ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n= st = i + 2 else: st = i + 1 - if len(utts) == 0: - if lang == "zh": - utts.append(text + '。') - else: - utts.append(text + '.') + final_utts = [] cur_utt = "" for utt in utts: diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml index 25d7269..0420d02 100644 --- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml +++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git a/examples/libritts/cosyvoice/conf/cosyvoice.yaml b/examples/libritts/cosyvoice/conf/cosyvoice.yaml index bca3898..b2ff51c 100644 --- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml +++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml index 0f3495f..0420d02 100644 --- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml +++ b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM pos_enc_layer_type: 'rel_pos_espnet' selfattention_layer_type: 'rel_selfattn' static_chunk_size: 1 + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec input_size: 512 @@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en' diff --git a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml index 6b084f2..b2ff51c 100644 --- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml +++ b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml @@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM text_encoder_input_size: !ref llm_input_size: !ref llm_output_size: !ref - text_token_size: 51866 + text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe speech_token_size: 4096 length_normalized_loss: True lsm_weight: 0 @@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM pos_enc_layer_type: 'rel_pos_espnet' selfattention_layer_type: 'rel_selfattn' static_chunk_size: 1 + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec input_size: 512 @@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 4096 - input_frame_rate: 50 + input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe only_mask_loss: True encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder output_size: 512 @@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener -get_tokenizer: !name:whisper.tokenizer.get_tokenizer +get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe multilingual: True num_languages: 100 language: 'en'