Merge pull request #618 from FunAudioLLM/dev/lyuxiang.lx

Dev/lyuxiang.lx
This commit is contained in:
Xiang Lyu
2024-11-05 09:36:51 +08:00
committed by GitHub
5 changed files with 8 additions and 2 deletions

View File

@@ -67,6 +67,8 @@ class CosyVoice:
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0): def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
prompt_text = self.frontend.text_normalize(prompt_text, split=False) prompt_text = self.frontend.text_normalize(prompt_text, split=False)
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)): for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
if len(i) < 0.5 * len(prompt_text):
logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
start_time = time.time() start_time = time.time()
logging.info('synthesis text {}'.format(i)) logging.info('synthesis text {}'.format(i))

View File

@@ -202,6 +202,9 @@ class TransformerLM(torch.nn.Module):
att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
device=lm_input.device)).to(torch.bool)) device=lm_input.device)).to(torch.bool))
logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
# force continue decode first token
if i == 0:
logp[:, self.speech_token_size] = -float('inf')
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item() top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
if top_ids == self.speech_token_size: if top_ids == self.speech_token_size:
break break

View File

@@ -141,7 +141,7 @@ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
hop_size: 256 hop_size: 256
win_size: 1024 win_size: 1024
fmin: 0 fmin: 0
- fmax: 8000
+ fmax: null
center: False center: False
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
generator: !ref <hift> generator: !ref <hift>

View File

@@ -141,7 +141,7 @@ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
hop_size: 256 hop_size: 256
win_size: 1024 win_size: 1024
fmin: 0 fmin: 0
- fmax: 8000
+ fmax: null
center: False center: False
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
generator: !ref <hift> generator: !ref <hift>

View File

@@ -6,6 +6,7 @@ gdown==5.1.0
gradio==4.32.2 gradio==4.32.2
grpcio==1.57.0 grpcio==1.57.0
grpcio-tools==1.57.0 grpcio-tools==1.57.0
huggingface-hub==0.23.5
hydra-core==1.3.2 hydra-core==1.3.2
HyperPyYAML==1.2.2 HyperPyYAML==1.2.2
inflect==7.3.1 inflect==7.3.1