mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-05 01:49:25 +08:00
Merge pull request #618 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
This commit is contained in:
@@ -67,6 +67,8 @@ class CosyVoice:
|
||||
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
||||
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
||||
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
||||
if len(i) < 0.5 * len(prompt_text):
|
||||
logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
|
||||
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
|
||||
start_time = time.time()
|
||||
logging.info('synthesis text {}'.format(i))
|
||||
|
||||
@@ -202,6 +202,9 @@ class TransformerLM(torch.nn.Module):
|
||||
att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
|
||||
device=lm_input.device)).to(torch.bool))
|
||||
logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
|
||||
# force continue decode first token
|
||||
if i == 0:
|
||||
logp[:, self.speech_token_size] = -float('inf')
|
||||
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
|
||||
if top_ids == self.speech_token_size:
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user