mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-04 17:39:25 +08:00
Merge pull request #618 from FunAudioLLM/dev/lyuxiang.lx
Dev/lyuxiang.lx
This commit is contained in:
@@ -67,6 +67,8 @@ class CosyVoice:
|
|||||||
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
||||||
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
||||||
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
||||||
|
if len(i) < 0.5 * len(prompt_text):
|
||||||
|
logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
|
||||||
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
|
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
logging.info('synthesis text {}'.format(i))
|
logging.info('synthesis text {}'.format(i))
|
||||||
|
|||||||
@@ -202,6 +202,9 @@ class TransformerLM(torch.nn.Module):
|
|||||||
att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
|
att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
|
||||||
device=lm_input.device)).to(torch.bool))
|
device=lm_input.device)).to(torch.bool))
|
||||||
logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
|
logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
|
||||||
|
# force continue decode first token
|
||||||
|
if i == 0:
|
||||||
|
logp[:, self.speech_token_size] = -float('inf')
|
||||||
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
|
top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
|
||||||
if top_ids == self.speech_token_size:
|
if top_ids == self.speech_token_size:
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -141,7 +141,7 @@ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
|
|||||||
hop_size: 256
|
hop_size: 256
|
||||||
win_size: 1024
|
win_size: 1024
|
||||||
fmin: 0
|
fmin: 0
|
||||||
fmax: 8000
|
fmax: null
|
||||||
center: False
|
center: False
|
||||||
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
|
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
|
||||||
generator: !ref <hift>
|
generator: !ref <hift>
|
||||||
|
|||||||
@@ -141,7 +141,7 @@ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
|
|||||||
hop_size: 256
|
hop_size: 256
|
||||||
win_size: 1024
|
win_size: 1024
|
||||||
fmin: 0
|
fmin: 0
|
||||||
fmax: 8000
|
fmax: null
|
||||||
center: False
|
center: False
|
||||||
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
|
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
|
||||||
generator: !ref <hift>
|
generator: !ref <hift>
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ gdown==5.1.0
|
|||||||
gradio==4.32.2
|
gradio==4.32.2
|
||||||
grpcio==1.57.0
|
grpcio==1.57.0
|
||||||
grpcio-tools==1.57.0
|
grpcio-tools==1.57.0
|
||||||
|
huggingface-hub==0.23.5
|
||||||
hydra-core==1.3.2
|
hydra-core==1.3.2
|
||||||
HyperPyYAML==1.2.2
|
HyperPyYAML==1.2.2
|
||||||
inflect==7.3.1
|
inflect==7.3.1
|
||||||
|
|||||||
Reference in New Issue
Block a user