Mirror of https://github.com/FunAudioLLM/CosyVoice.git
fix bistream bug
@@ -122,12 +122,12 @@ class CosyVoiceFrontEnd:
return speech_feat, speech_feat_len

def text_normalize(self, text, split=True, text_frontend=True):
# NOTE skip text_frontend when ssml symbol in text
if '<|' in text and '|>' in text:
text_frontend = False
if isinstance(text, Generator):
logging.info('get tts_text generator, will skip text_normalize!')
return [text]
# NOTE skip text_frontend when ssml symbol in text
if '<|' in text and '|>' in text:
text_frontend = False
if text_frontend is False or text == '':
return [text] if split is True else text
text = text.strip()
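The reorder above moves the SSML bypass behind the Generator early-return. A plausible reason, not stated in the commit message: Python's in test falls back to iteration for objects without __contains__, so evaluating '<|' in text on a streaming tts_text generator would silently drain it before the isinstance check runs. A minimal standalone sketch of that pitfall (the chunk strings are hypothetical):

def make_tts_text():
    # hypothetical streaming input, for illustration only
    yield 'hello <|endofprompt|>'
    yield 'world'

text = make_tts_text()
print('<|' in text)   # False, but this iterates and consumes the generator
print(list(text))     # [] -- nothing is left for the synthesis loop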
@@ -413,18 +413,18 @@ class CosyVoice3Model(CosyVoice2Model):
embedding=embedding.to(self.device),
streaming=stream,
finalize=finalize)
tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
# append mel cache
if self.hift_cache_dict[uuid] is not None:
hift_cache_mel = self.hift_cache_dict[uuid]['mel']
tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
self.hift_cache_dict[uuid]['mel'] = tts_mel
else:
self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
if speed != 1.0:
assert token_offset == 0 and finalize is True, 'speed change only support non-stream inference mode'
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
# append mel cache
if self.hift_cache_dict[uuid] is not None:
hift_cache_mel = self.hift_cache_dict[uuid]['mel']
tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
self.hift_cache_dict[uuid]['mel'] = tts_mel
else:
self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
if speed != 1.0:
assert token_offset == 0 and finalize is True, 'speed change only support non-stream inference mode'
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
return tts_speech
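Both copies of the block above follow the same streaming pattern: the per-uuid cache accumulates the full mel spectrogram, the HiFT vocoder is re-run on the whole cached mel each chunk, and speech_offset trims away samples that were already emitted. A toy sketch of that bookkeeping, with vocoder as a stand-in callable rather than the repo's API:

import torch

hift_cache = {}

def stream_chunk(uuid, new_mel, vocoder):
    # append mel cache (new_mel: [B, n_mels, T_new])
    if uuid in hift_cache:
        hift_cache[uuid]['mel'] = torch.concat([hift_cache[uuid]['mel'], new_mel], dim=2)
    else:
        hift_cache[uuid] = {'mel': new_mel, 'speech_offset': 0}
    # re-synthesize from the full cached mel, then keep only the unseen samples
    speech = vocoder(hift_cache[uuid]['mel'])                  # [B, n_samples]
    speech = speech[:, hift_cache[uuid]['speech_offset']:]
    hift_cache[uuid]['speech_offset'] += speech.shape[1]
    return speech

Re-running the vocoder over the growing cache trades redundant computation for seamless chunk boundaries, presumably, and the offset bookkeeping then returns only the newly generated tail.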
@@ -155,11 +155,13 @@ class SineGen(torch.nn.Module):
@torch.no_grad()
def forward(self, f0):
""" sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, dim=1, length)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
output uv: tensor(batchsize=1, length, 1)
"""
:param f0: [B, 1, sample_len], Hz
:return: [B, 1, sample_len]
"""

f0 = f0.transpose(1, 2)
F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
for i in range(self.harmonic_num + 1):
F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
@@ -184,7 +186,7 @@ class SineGen(torch.nn.Module):
# first: set the unvoiced part to 0 by uv
# then: additive noise
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
return sine_waves.transpose(1, 2), uv.transpose(1, 2), noise

class SineGen2(torch.nn.Module):
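Taken together, the SineGen hunks above wrap the usual NSF sine-source recipe in input/output transposes: stack per-harmonic frequencies, integrate them into phase, take the sine, then zero the unvoiced part with uv and add noise. The phase-integration step is not visible in these hunks but is standard for this kind of generator; the following self-contained sketch uses illustrative parameter values, not the repo's configuration:

import math
import torch

def sine_source_sketch(f0, sampling_rate=24000, harmonic_num=7,
                       sine_amp=0.1, noise_std=0.003, voiced_threshold=0.0):
    # f0: [B, 1, T] in Hz, zeros where unvoiced (all defaults here are assumptions)
    # per-harmonic normalized frequency, as in the F_mat loop above
    F_mat = torch.zeros((f0.size(0), harmonic_num + 1, f0.size(-1)), device=f0.device)
    for i in range(harmonic_num + 1):
        F_mat[:, i: i + 1, :] = f0 * (i + 1) / sampling_rate
    # integrate frequency into phase and take the sine
    theta = 2 * math.pi * torch.cumsum(F_mat, dim=-1)
    sine_waves = sine_amp * torch.sin(theta)
    # first: set the unvoiced part to 0 by uv; then: additive noise
    uv = (f0 > voiced_threshold).float()
    noise_amp = uv * noise_std + (1 - uv) * sine_amp / 3
    noise = noise_amp * torch.randn_like(sine_waves)
    return sine_waves * uv + noise, uv, noise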
@@ -221,7 +223,7 @@ class SineGen2(torch.nn.Module):
if causal is True:
self.rand_ini = torch.rand(1, 9)
self.rand_ini[:, 0] = 0
self.sine_waves = torch.rand(1, 60 * 16000, 9)
self.sine_waves = torch.rand(1, 300 * 24000, 9)

def _f02uv(self, f0):
# generate uv signal
@@ -351,7 +353,7 @@ class SourceModuleHnNSF(torch.nn.Module):
self.l_tanh = torch.nn.Tanh()
self.causal = causal
if causal is True:
self.uv = torch.rand(1, 60 * 24000, 1)
self.uv = torch.rand(1, 300 * 24000, 1)

def forward(self, x):
"""
@@ -17,6 +17,7 @@ import random
import time
import threading
from typing import Dict, Optional, Callable, List, Generator
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
@@ -216,7 +217,7 @@ class TransformerLM(torch.nn.Module):
att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
device=lm_input.device)).to(torch.bool))
logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False)
if top_ids == self.eos_token:
break
# in stream mode, yield token one by one
@@ -544,7 +545,7 @@ class Qwen2LM(TransformerLM):
cache = None
# NOTE init prompt_text as text_cache as it is basically impossible prompt_speech_token/prompt_text < 15/5
text_cache = self.llm.model.model.embed_tokens(prompt_text)
next_fill_index = -1
next_fill_index = (int(prompt_speech_token.shape[1] / self.mix_ratio[1]) + 1) * self.mix_ratio[1] - prompt_speech_token.shape[1]
for this_text in text:
text_cache = torch.concat([text_cache, self.llm.model.model.embed_tokens(this_text)], dim=1)
# prompt_speech_token_emb not empty, try append to lm_input
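This initialization reads as the core of the bistream fix: rather than starting next_fill_index at -1, it is derived from how far the prompt's speech tokens reach into the current text/speech interleaving block, so the first fill token is scheduled on a block boundary. A worked example using the 15/5 speech-to-text ratio mentioned in the NOTE above (the concrete prompt length is hypothetical):

# hypothetical numbers, for illustration only
mix_ratio = [5, 15]            # 5 text tokens paired with 15 speech tokens per block
prompt_speech_len = 23         # suppose the prompt contributes 23 speech tokens

# same formula as the new line above
next_fill_index = (int(prompt_speech_len / mix_ratio[1]) + 1) * mix_ratio[1] - prompt_speech_len
print(next_fill_index)         # (1 + 1) * 15 - 23 = 7, so the fill token is due after 7 more speech tokens

With the old constant -1 the schedule ignored the prompt's partially filled block, which appears to be the bistream bug being fixed.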
@@ -582,7 +583,7 @@ class Qwen2LM(TransformerLM):
top_ids = self.fill_token
next_fill_index += (self.mix_ratio[1] + 1)
else:
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True)
if top_ids == self.fill_token:
next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1
logging.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
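Read together with the previous hunk, fill tokens are then expected every mix_ratio[1] + 1 output positions (one full speech block plus the fill token itself): the forced branch advances next_fill_index by that stride, while a fill token sampled by the model itself re-synchronizes the schedule to len(out_tokens) + mix_ratio[1] + 1.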