diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index 4292931..0942da6 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -122,12 +122,12 @@ class CosyVoiceFrontEnd:
         return speech_feat, speech_feat_len
 
     def text_normalize(self, text, split=True, text_frontend=True):
-        # NOTE skip text_frontend when ssml symbol in text
-        if '<|' in text and '|>' in text:
-            text_frontend = False
         if isinstance(text, Generator):
             logging.info('get tts_text generator, will skip text_normalize!')
             return [text]
+        # NOTE skip text_frontend when ssml symbol in text
+        if '<|' in text and '|>' in text:
+            text_frontend = False
         if text_frontend is False or text == '':
             return [text] if split is True else text
         text = text.strip()
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 01e76c6..8e67b0c 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -413,18 +413,18 @@ class CosyVoice3Model(CosyVoice2Model):
                                             embedding=embedding.to(self.device),
                                             streaming=stream,
                                             finalize=finalize)
-        tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
-        # append mel cache
-        if self.hift_cache_dict[uuid] is not None:
-            hift_cache_mel = self.hift_cache_dict[uuid]['mel']
-            tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
-            self.hift_cache_dict[uuid]['mel'] = tts_mel
-        else:
-            self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
-        if speed != 1.0:
-            assert token_offset == 0 and finalize is True, 'speed change only support non-stream inference mode'
-            tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
-        tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
-        tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
-        self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
+            tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
+            # append mel cache
+            if self.hift_cache_dict[uuid] is not None:
+                hift_cache_mel = self.hift_cache_dict[uuid]['mel']
+                tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
+                self.hift_cache_dict[uuid]['mel'] = tts_mel
+            else:
+                self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
+            if speed != 1.0:
+                assert token_offset == 0 and finalize is True, 'speed change only support non-stream inference mode'
+                tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
+            tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
+            tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
+            self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
         return tts_speech
diff --git a/cosyvoice/hifigan/generator.py b/cosyvoice/hifigan/generator.py
index c893b11..045cb4e 100644
--- a/cosyvoice/hifigan/generator.py
+++ b/cosyvoice/hifigan/generator.py
@@ -155,11 +155,13 @@ class SineGen(torch.nn.Module):
 
     @torch.no_grad()
     def forward(self, f0):
+        """ sine_tensor, uv = forward(f0)
+        input F0: tensor(batchsize=1, dim=1, length)
+        f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
         """
-        :param f0: [B, 1, sample_len], Hz
-        :return: [B, 1, sample_len]
-        """
-
+        f0 = f0.transpose(1, 2)
         F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
         for i in range(self.harmonic_num + 1):
             F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
@@ -184,7 +186,7 @@ class SineGen(torch.nn.Module):
         # first: set the unvoiced part to 0 by uv
         # then: additive noise
         sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
+        return sine_waves.transpose(1, 2), uv.transpose(1, 2), noise
 
 
 class SineGen2(torch.nn.Module):
@@ -221,7 +223,7 @@
         if causal is True:
             self.rand_ini = torch.rand(1, 9)
             self.rand_ini[:, 0] = 0
-            self.sine_waves = torch.rand(1, 60 * 16000, 9)
+            self.sine_waves = torch.rand(1, 300 * 24000, 9)
 
     def _f02uv(self, f0):
         # generate uv signal
@@ -351,7 +353,7 @@ class SourceModuleHnNSF(torch.nn.Module):
         self.l_tanh = torch.nn.Tanh()
         self.causal = causal
         if causal is True:
-            self.uv = torch.rand(1, 60 * 24000, 1)
+            self.uv = torch.rand(1, 300 * 24000, 1)
 
     def forward(self, x):
         """
diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py
index c0b3400..eacde5b 100644
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -17,6 +17,7 @@ import random
 import time
 import threading
 from typing import Dict, Optional, Callable, List, Generator
+import numpy as np
 import torch
 from torch import nn
 import torch.nn.functional as F
@@ -216,7 +217,7 @@ class TransformerLM(torch.nn.Module):
                                               att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
                                                                              device=lm_input.device)).to(torch.bool))
             logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
-            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
+            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False)
             if top_ids == self.eos_token:
                 break
             # in stream mode, yield token one by one
@@ -544,7 +545,7 @@ class Qwen2LM(TransformerLM):
         cache = None
         # NOTE init prompt_text as text_cache as it is basically impossible prompt_speech_token/prompt_text < 15/5
         text_cache = self.llm.model.model.embed_tokens(prompt_text)
-        next_fill_index = -1
+        next_fill_index = (int(prompt_speech_token.shape[1] / self.mix_ratio[1]) + 1) * self.mix_ratio[1] - prompt_speech_token.shape[1]
         for this_text in text:
             text_cache = torch.concat([text_cache, self.llm.model.model.embed_tokens(this_text)], dim=1)
             # prompt_speech_token_emb not empty, try append to lm_input
@@ -582,7 +583,7 @@
                     top_ids = self.fill_token
                     next_fill_index += (self.mix_ratio[1] + 1)
                 else:
-                    top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
+                    top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True)
                 if top_ids == self.fill_token:
                     next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1
                     logging.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
diff --git a/example.py b/example.py
index 1e507c0..85952ae 100644
--- a/example.py
+++ b/example.py
@@ -15,15 +15,15 @@ def cosyvoice_example():
         torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M')
-    # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
+    # zero_shot usage
     for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav')):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-    # cross_lingual usage
+    # cross_lingual usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
     for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', './asset/cross_lingual_prompt.wav')):
         torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
     # vc usage
-    for i, j in enumerate(cosyvoice.inference_vc('./asset/zero_shot_prompt.wav', './asset/cross_lingual_prompt.wav')):
+    for i, j in enumerate(cosyvoice.inference_vc('./asset/cross_lingual_prompt.wav', './asset/zero_shot_prompt.wav')):
         torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-Instruct')
@@ -65,7 +65,7 @@ def cosyvoice2_example():
         yield '让我心中充满了甜蜜的快乐,'
         yield '笑容如花儿般绽放。'
     for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
-        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+        torchaudio.save('zero_shot_bistream_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
 
 def cosyvoice3_example():
@@ -97,8 +97,8 @@
 
 
 def main():
-    cosyvoice_example()
-    cosyvoice2_example()
+    # cosyvoice_example()
+    # cosyvoice2_example()
     cosyvoice3_example()
diff --git a/vllm_example.py b/vllm_example.py
index f82cfe7..697d7a9 100644
--- a/vllm_example.py
+++ b/vllm_example.py
@@ -31,7 +31,7 @@ def cosyvoice3_example():
 
 
 def main():
-    cosyvoice2_example()
+    # cosyvoice2_example()
     cosyvoice3_example()
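
Note on the llm.py change above: the new initial value of next_fill_index rounds the prompt speech-token length up to the next multiple of the speech block size mix_ratio[1], so the first fill token is scheduled on a block boundary instead of being disabled with -1. A minimal sketch of that arithmetic follows; the helper name and the sample numbers are illustrative assumptions, not part of the patch.

    # Hypothetical helper mirroring the expression from the diff:
    # (int(n / block) + 1) * block - n, with block = mix_ratio[1].
    # // matches int(n / block) for the non-negative lengths used here.
    def first_fill_index(prompt_speech_token_len: int, block: int) -> int:
        # round the prompt length up to the next multiple of block, then count
        # how many newly generated speech tokens are needed to reach that boundary
        return (prompt_speech_token_len // block + 1) * block - prompt_speech_token_len

    assert first_fill_index(100, 15) == 5    # next multiple of 15 after 100 is 105
    assert first_fill_index(15, 15) == 15    # exact multiple: a full new block follows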