Mirror of https://github.com/FunAudioLLM/CosyVoice.git
clean code
@@ -106,13 +106,10 @@ class TritonPythonModel:
        # Process each request in the batch
        for request in requests:
            target_speech_tokens_tensor = pb_utils.get_input_tensor_by_name(request, "target_speech_tokens").as_numpy()
            # Shift the speech tokens back by the original text vocab size
            target_speech_tokens = torch.from_numpy(target_speech_tokens_tensor)
            target_speech_tokens = target_speech_tokens - ORIGINAL_VOCAB_SIZE
            target_speech_tokens = target_speech_tokens.squeeze().tolist()

            # token_offset is an optional input so that both streaming and offline TTS
            # are supported; it has to be None for offline TTS.

            finalize = pb_utils.get_input_tensor_by_name(request, "finalize").as_numpy().item()

            request_id = request.request_id()
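Note: the subtraction above works because the LLM emits speech tokens whose IDs sit after the text vocabulary, so subtracting ORIGINAL_VOCAB_SIZE maps them back into the codec's token range. A minimal sketch of that arithmetic (the vocab size below is a placeholder, not the model's real value):

    import numpy as np
    import torch

    ORIGINAL_VOCAB_SIZE = 151_663  # placeholder; the real value comes from the model config

    def shift_speech_tokens(tokens: np.ndarray) -> list[int]:
        # Map LLM token IDs back into the speech-codec ID range
        return (torch.from_numpy(tokens) - ORIGINAL_VOCAB_SIZE).squeeze().tolist()

    print(shift_speech_tokens(np.array([[151_663, 151_700]])))  # -> [0, 37]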
@@ -124,23 +121,14 @@ class TritonPythonModel:
                request, "reference_wav_len").as_numpy().item()

            wav_array = torch.from_numpy(wav_array)
            # Prepare inputs: trim the padded prompt audio to its true length
            wav = wav_array[:, :wav_len].squeeze(0)

            spk_id = get_spk_id_from_prompt_audio(wav)

            audio_hat = self.token2wav_model.forward_streaming(
                target_speech_tokens,
                finalize,
                request_id=request_id,
                speaker_id=f"{spk_id}",
                prompt_audio=wav,
                prompt_audio_sample_rate=16000,
            )

            outputs = []
            wav_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio_hat))
            outputs.append(wav_tensor)
            inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
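For context: pb_utils.Tensor.from_dlpack hands the torch tensor to Triton through the DLPack protocol, so the waveform buffer is shared rather than copied back to the host. A minimal sketch of that response path (the helper name is hypothetical):

    import torch
    from torch.utils.dlpack import to_dlpack
    import triton_python_backend_utils as pb_utils  # only importable inside the Triton Python backend

    def make_waveform_response(audio_hat: torch.Tensor):
        # from_dlpack shares the underlying buffer instead of copying it
        wav_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio_hat))
        return pb_utils.InferenceResponse(output_tensors=[wav_tensor])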
@@ -320,7 +320,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
    def forward(
        self, generated_speech_tokens_list: list[list[int]], prompt_audios_list: list[torch.Tensor], prompt_audios_sample_rate: list[int]
    ):
        # All prompt audios must be sampled at 16 kHz
        assert all(sample_rate == 16000 for sample_rate in prompt_audios_sample_rate)
@@ -335,7 +334,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
    def prepare_prompt_audio(
        self, prompt_audios_list: list[torch.Tensor], prompt_audios_sample_rate: list[int]
    ):
        # All prompt audios must be sampled at 16 kHz
        assert all(sample_rate == 16000 for sample_rate in prompt_audios_sample_rate)
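Both methods assert 16 kHz prompt audio rather than resampling it themselves. A caller holding audio at another rate could resample first; a sketch using torchaudio (assumed to be installed alongside torch):

    import torch
    import torchaudio

    def ensure_16k(wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
        # Resample prompt audio to the 16 kHz rate the asserts above require
        if sample_rate == 16000:
            return wav
        return torchaudio.functional.resample(wav, orig_freq=sample_rate, new_freq=16000)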
@@ -385,7 +383,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
        cache_dict = self.get_prompt_audio_cache_for_streaming_tts(
            prompt_speech_tokens_list, prompt_mels_for_flow, prompt_mels_lens_for_flow, spk_emb_for_flow
        )
        self.speaker_cache[speaker_id] = {'prompt_audio_dict': prompt_audio_dict, 'cache_dict': cache_dict}
        print(f"speaker_id {speaker_id} added to cache")

        if request_id not in self.streaming_flow_cache:
            self.streaming_flow_cache[request_id] = {k: v.clone() for k, v in self.speaker_cache[speaker_id]['cache_dict'].items()}
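The clone on the last line implements a two-level cache: prompt features are computed once per speaker, then cloned into per-request streaming state so concurrent requests never mutate shared tensors. A stripped-down sketch of the pattern (names hypothetical):

    import torch

    speaker_cache: dict[str, dict[str, torch.Tensor]] = {}    # filled once per speaker
    streaming_cache: dict[str, dict[str, torch.Tensor]] = {}  # one entry per live request

    def begin_request(request_id: str, speaker_id: str) -> None:
        # Clone so each request mutates its own copy of the speaker's cached state
        streaming_cache[request_id] = {k: v.clone() for k, v in speaker_cache[speaker_id].items()}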
@@ -394,12 +391,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
                source = torch.zeros(1, 1, 0, device='cuda'),
                speech = torch.zeros(1, 0, device='cuda'),
            )

        current_request_cache = self.streaming_flow_cache[request_id]
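The zero-length tensors above act as empty accumulators for the HiFT vocoder cache: concatenating along the time axis then works uniformly from the first streaming chunk onward. A small CPU illustration of the idiom:

    import torch

    cache = torch.zeros(1, 0)    # empty along the time axis
    chunk = torch.randn(1, 480)  # one hop of generated samples
    cache = torch.cat([cache, chunk], dim=1)
    print(cache.shape)  # torch.Size([1, 480])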
@@ -477,7 +468,6 @@ def get_args():
if __name__ == "__main__":
    args = get_args()
    model = CosyVoice2_Token2Wav(model_dir=args.model_dir, enable_trt=args.enable_trt)
    # Create the output directory if it does not already exist
    os.makedirs(args.output_dir, exist_ok=True)
    dataset_name = "yuekai/seed_tts_cosy2"
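dataset_name points at a Hugging Face dataset, so the benchmark loop presumably starts from datasets.load_dataset; the split name below is an assumption, not something this diff shows:

    from datasets import load_dataset

    dataset = load_dataset("yuekai/seed_tts_cosy2", split="test")  # split name is an assumption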