mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-05 18:09:24 +08:00
fix white space
This commit is contained in:
@@ -133,7 +133,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
|
|||||||
option.intra_op_num_threads = 1
|
option.intra_op_num_threads = 1
|
||||||
self.spk_model = onnxruntime.InferenceSession(f"{model_dir}/campplus.onnx", sess_options=option,
|
self.spk_model = onnxruntime.InferenceSession(f"{model_dir}/campplus.onnx", sess_options=option,
|
||||||
providers=["CPUExecutionProvider"])
|
providers=["CPUExecutionProvider"])
|
||||||
|
|
||||||
self.audio_tokenizer = s3tokenizer.load_model(f"{model_dir}/speech_tokenizer_v2_25hz.onnx").to(self.device).eval()
|
self.audio_tokenizer = s3tokenizer.load_model(f"{model_dir}/speech_tokenizer_v2_25hz.onnx").to(self.device).eval()
|
||||||
|
|
||||||
gpu="l20"
|
gpu="l20"
|
||||||
@@ -319,13 +318,11 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
|
|||||||
):
|
):
|
||||||
assert all(sample_rate == 16000 for sample_rate in prompt_audios_sample_rate)
|
assert all(sample_rate == 16000 for sample_rate in prompt_audios_sample_rate)
|
||||||
|
|
||||||
|
|
||||||
prompt_speech_tokens_list, prompt_mels_for_flow, prompt_mels_lens_for_flow, spk_emb_for_flow = self.prepare_prompt_audio(prompt_audios_list, prompt_audios_sample_rate)
|
prompt_speech_tokens_list, prompt_mels_for_flow, prompt_mels_lens_for_flow, spk_emb_for_flow = self.prepare_prompt_audio(prompt_audios_list, prompt_audios_sample_rate)
|
||||||
|
|
||||||
generated_mels, generated_mels_lens = self.forward_flow(prompt_speech_tokens_list, generated_speech_tokens_list, prompt_mels_for_flow, prompt_mels_lens_for_flow, spk_emb_for_flow)
|
generated_mels, generated_mels_lens = self.forward_flow(prompt_speech_tokens_list, generated_speech_tokens_list, prompt_mels_for_flow, prompt_mels_lens_for_flow, spk_emb_for_flow)
|
||||||
|
|
||||||
generated_wavs = self.forward_hift(generated_mels, generated_mels_lens, prompt_mels_lens_for_flow)
|
generated_wavs = self.forward_hift(generated_mels, generated_mels_lens, prompt_mels_lens_for_flow)
|
||||||
|
|
||||||
return generated_wavs
|
return generated_wavs
|
||||||
|
|
||||||
def prepare_prompt_audio(
|
def prepare_prompt_audio(
|
||||||
@@ -333,13 +330,11 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
|
|||||||
):
|
):
|
||||||
assert all(sample_rate == 16000 for sample_rate in prompt_audios_sample_rate)
|
assert all(sample_rate == 16000 for sample_rate in prompt_audios_sample_rate)
|
||||||
|
|
||||||
|
|
||||||
prompt_speech_tokens_list = self.prompt_audio_tokenization(prompt_audios_list)
|
prompt_speech_tokens_list = self.prompt_audio_tokenization(prompt_audios_list)
|
||||||
|
|
||||||
prompt_mels_for_flow, prompt_mels_lens_for_flow = self.get_prompt_mels(prompt_audios_list, prompt_audios_sample_rate)
|
prompt_mels_for_flow, prompt_mels_lens_for_flow = self.get_prompt_mels(prompt_audios_list, prompt_audios_sample_rate)
|
||||||
|
|
||||||
spk_emb_for_flow = self.get_spk_emb(prompt_audios_list)
|
spk_emb_for_flow = self.get_spk_emb(prompt_audios_list)
|
||||||
|
|
||||||
return prompt_speech_tokens_list, prompt_mels_for_flow, prompt_mels_lens_for_flow, spk_emb_for_flow
|
return prompt_speech_tokens_list, prompt_mels_for_flow, prompt_mels_lens_for_flow, spk_emb_for_flow
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -365,7 +365,6 @@ def main(args):
|
|||||||
runner = None
|
runner = None
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported backend: {args.backend}")
|
raise ValueError(f"Unsupported backend: {args.backend}")
|
||||||
|
|
||||||
if 'Step-Audio-2-mini' in args.token2wav_path:
|
if 'Step-Audio-2-mini' in args.token2wav_path:
|
||||||
from token2wav_dit import CosyVoice2_Token2Wav
|
from token2wav_dit import CosyVoice2_Token2Wav
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user