Fix: generate token2wav_request_id from cosyvoice2
- All token2wav requests issued within a single cosyvoice2 request must share the same request_id, so generate a new request_id only when the incoming request does not already carry one, and pass that same id to every token2wav call.
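The heart of the change is a one-line fallback-id pattern. A minimal, runnable sketch of that pattern follows; the helper name resolve_token2wav_request_id is hypothetical, while the `request_id or str(uuid4())` expression mirrors the diff below:

    from uuid import uuid4

    # Hypothetical helper illustrating the commit's fallback-id pattern:
    # reuse the client-supplied request_id when present, mint one otherwise.
    def resolve_token2wav_request_id(request_id):
        # `or` treats both None and "" as missing, so a fresh UUID is
        # generated only when the request carries no usable id.
        return request_id or str(uuid4())

    # All chunked token2wav calls of one cosyvoice2 request share the same id.
    rid = resolve_token2wav_request_id("")
    for chunk in ["tok_chunk_0", "tok_chunk_1"]:
        print(rid, chunk)  # the same rid accompanies every chunk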
@@ -28,6 +28,7 @@ import json
 import os
 import threading
 import time
+from uuid import uuid4

 import numpy as np
 import torch
@@ -364,6 +365,7 @@ class TritonPythonModel:
         # Generate semantic tokens with LLM
         generated_ids_iter = self.forward_llm(input_ids)

+        token2wav_request_id = request_id or str(uuid4())
         if self.decoupled:
             response_sender = request.get_response_sender()

@@ -392,7 +394,7 @@ class TritonPythonModel:
                     this_tts_speech_token = torch.tensor(this_tts_speech_token).unsqueeze(dim=0).to(torch.int32).to(self.device)

                     sub_tts_speech = self.forward_token2wav(
-                        this_tts_speech_token, request_id, prompt_speech_tokens,
+                        this_tts_speech_token, token2wav_request_id, prompt_speech_tokens,
                         prompt_speech_feat, prompt_spk_embedding, token_offset, False
                     )

@@ -427,7 +429,7 @@ class TritonPythonModel:
                 time.sleep(0.02)

             this_tts_speech_token = torch.tensor(semantic_token_ids_arr).unsqueeze(dim=0).to(torch.int32).to(self.device)
-            sub_tts_speech = self.forward_token2wav(this_tts_speech_token, request_id, prompt_speech_tokens, prompt_speech_feat, prompt_spk_embedding, token_offset, True)
+            sub_tts_speech = self.forward_token2wav(this_tts_speech_token, token2wav_request_id, prompt_speech_tokens, prompt_speech_feat, prompt_spk_embedding, token_offset, True)
             audio_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(sub_tts_speech))
             inference_response = pb_utils.InferenceResponse(output_tensors=[audio_tensor])
             response_sender.send(inference_response)
@@ -441,7 +443,7 @@ class TritonPythonModel:
             if generated_ids is None or len(generated_ids) == 0:
                 raise pb_utils.TritonModelException("Generated IDs is None or empty")

-            audio = self.forward_token2wav(generated_ids, request_id, prompt_speech_tokens, prompt_speech_feat, prompt_spk_embedding)
+            audio = self.forward_token2wav(generated_ids, token2wav_request_id, prompt_speech_tokens, prompt_speech_feat, prompt_spk_embedding)

             # Prepare response
             audio_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio))
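For context (the token2wav implementation is not shown in this diff, so treat the sketch below as an assumption): in a decoupled streaming setup the token2wav side typically keeps resumable per-request state, such as how many tokens have already been vocoded, keyed by request_id. If each chunk of one cosyvoice2 request arrived under a different id, every chunk would restart from offset zero instead of resuming the stream. The names STATE and token2wav_chunk are hypothetical:

    from collections import defaultdict

    # Hypothetical per-request state keyed by request_id; a stable id lets
    # successive chunks of one cosyvoice2 request resume where they left off.
    STATE = defaultdict(lambda: {"token_offset": 0})

    def token2wav_chunk(request_id, speech_tokens, finalize=False):
        state = STATE[request_id]            # same id -> same resumable state
        offset = state["token_offset"]
        new_tokens = speech_tokens[offset:]  # only process the unseen tail
        state["token_offset"] = len(speech_tokens)
        if finalize:
            STATE.pop(request_id, None)      # drop state when the stream ends
        return new_tokens                    # stand-in for synthesized audio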