mirror of https://github.com/FunAudioLLM/CosyVoice.git
fix bug
@@ -193,13 +193,13 @@ class CosyVoiceFrontEnd:
         model_input = self.frontend_sft(tts_text, spk_id)
         # in instruct mode, we remove spk_embedding in llm due to information leakage
         del model_input['llm_embedding']
-        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
+        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text)
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
         return model_input

     def frontend_instruct2(self, tts_text, instruct_text, prompt_wav, resample_rate, zero_shot_spk_id):
-        model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_wav, resample_rate, zero_shot_spk_id)
+        model_input = self.frontend_zero_shot(tts_text, instruct_text, prompt_wav, resample_rate, zero_shot_spk_id)
         del model_input['llm_prompt_speech_token']
         del model_input['llm_prompt_speech_token_len']
         return model_input
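Both frontend changes drop the end-of-prompt marker ('<endofprompt>' in frontend_instruct, '<|endofprompt|>' in frontend_instruct2) at the call site, so the instruct text is tokenized as-is; where the marker is handled after this commit is not visible in the diff. A minimal sketch of the difference, using a hypothetical character-level stand-in for the trained tokenizer behind CosyVoiceFrontEnd._extract_text_token:

def _extract_text_token(text):
    # hypothetical stand-in: the real method runs a trained text tokenizer
    # and returns (token_id_tensor, length)
    tokens = list(text)
    return tokens, len(tokens)

instruct_text = 'Please speak slowly and calmly.'

# before the fix: the marker was appended at tokenization time
old_tokens, old_len = _extract_text_token(instruct_text + '<endofprompt>')

# after the fix: the instruct text goes through unchanged
new_tokens, new_len = _extract_text_token(instruct_text)

assert old_len == new_len + len('<endofprompt>')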
@@ -129,7 +129,7 @@ class CosyVoiceModel:

     def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
+            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                                                       token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                                                       prompt_token=prompt_token.to(self.device),
                                                                       prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
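The model-side fix is identical in CosyVoiceModel here and in CosyVoice2Model and CosyVoice3Model below: the speech token tensor is cast to int32 while being moved to the device, instead of keeping whatever dtype it arrived with. A plausible motivation, not stated in the commit message, is that tokens sampled from the LLM default to int64 while the flow module (especially an ONNX/TensorRT export) expects int32 ids. A self-contained illustration of the combined move-and-cast:

import torch

# tokens built from Python ints default to int64
token = torch.tensor([[12, 873, 5021]])
assert token.dtype == torch.int64

# .to(device, dtype=...) moves and casts in one call; it returns the input
# tensor unchanged when device and dtype already match, so it is a cheap guard
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
token32 = token.to(device, dtype=torch.int32)
assert token32.dtype == torch.int32

# the token_len argument in the diff is built the same way
token_len = torch.tensor([token.shape[1]], dtype=torch.int32).to(device)
assert token_len.item() == 3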
@@ -284,7 +284,7 @@ class CosyVoice2Model(CosyVoiceModel):

     def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+            tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                              token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                              prompt_token=prompt_token.to(self.device),
                                              prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
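One detail from the unchanged context worth noting: torch.cuda.amp.autocast takes enabled as its first positional argument, so autocast(self.fp16) toggles mixed precision with a plain bool and the same code path serves fp16 and fp32 runs. A small sketch of the pattern (the fp16 flag here just mirrors self.fp16 in the diff):

import torch

fp16 = torch.cuda.is_available()
device = 'cuda' if fp16 else 'cpu'

x = torch.randn(4, 8, device=device)
w = torch.randn(8, 8, device=device)

# with enabled=False the context is a no-op, so this is safe on CPU too
with torch.cuda.amp.autocast(fp16):
    y = x @ w

# float16 under CUDA autocast, float32 otherwise
print(y.dtype)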
@@ -413,7 +413,7 @@ class CosyVoice3Model(CosyVoice2Model):

     def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+            tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                              token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                              prompt_token=prompt_token.to(self.device),
                                              prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),