Mirror of https://github.com/FunAudioLLM/CosyVoice.git
clean code
@@ -106,13 +106,10 @@ class TritonPythonModel:
        # Process each request in the batch
        for request in requests:
            target_speech_tokens_tensor = pb_utils.get_input_tensor_by_name(request, "target_speech_tokens").as_numpy()
            # Shift the speech tokens back by the original text vocab size
            target_speech_tokens = torch.from_numpy(target_speech_tokens_tensor)
            target_speech_tokens = target_speech_tokens - ORIGINAL_VOCAB_SIZE
            target_speech_tokens = target_speech_tokens.squeeze().tolist()

            # token_offset is an optional input so that both streaming and offline TTS
            # are supported; it has to be None for offline TTS.

            finalize = pb_utils.get_input_tensor_by_name(request, "finalize").as_numpy().item()

            request_id = request.request_id()
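Note: the subtraction above works because the LLM emits speech tokens whose IDs sit after the text vocabulary, so subtracting ORIGINAL_VOCAB_SIZE maps them back into the codec's token range. A minimal sketch of that arithmetic (the vocab size below is a placeholder, not the model's real value):

    import numpy as np
    import torch

    ORIGINAL_VOCAB_SIZE = 151_663  # placeholder; the real value comes from the model config

    def shift_speech_tokens(tokens: np.ndarray) -> list[int]:
        # Map LLM token IDs back into the speech-codec ID range
        return (torch.from_numpy(tokens) - ORIGINAL_VOCAB_SIZE).squeeze().tolist()

    print(shift_speech_tokens(np.array([[151_663, 151_700]])))  # -> [0, 37]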
@@ -124,23 +121,14 @@ class TritonPythonModel:
                request, "reference_wav_len").as_numpy().item()

            wav_array = torch.from_numpy(wav_array)
            # Prepare inputs: trim the padded prompt audio to its true length
            wav = wav_array[:, :wav_len].squeeze(0)

            spk_id = get_spk_id_from_prompt_audio(wav)

            audio_hat = self.token2wav_model.forward_streaming(
                target_speech_tokens,
                finalize,
                request_id=request_id,
                speaker_id=f"{spk_id}",
                prompt_audio=wav,
                prompt_audio_sample_rate=16000,
            )

            outputs = []
            wav_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio_hat))
            outputs.append(wav_tensor)
            inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
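For context: pb_utils.Tensor.from_dlpack hands the torch tensor to Triton through the DLPack protocol, so the waveform buffer is shared rather than copied back to the host. A minimal sketch of that response path (the helper name is hypothetical):

    import torch
    from torch.utils.dlpack import to_dlpack
    import triton_python_backend_utils as pb_utils  # only importable inside the Triton Python backend

    def make_waveform_response(audio_hat: torch.Tensor):
        # from_dlpack shares the underlying buffer instead of copying it
        wav_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio_hat))
        return pb_utils.InferenceResponse(output_tensors=[wav_tensor])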
@@ -320,7 +320,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
    def forward(
        self, generated_speech_tokens_list: list[list[int]], prompt_audios_list: list[torch.Tensor], prompt_audios_sample_rate: list[int]
    ):
        # All prompt audios must be sampled at 16 kHz
        assert all(sample_rate == 16000 for sample_rate in prompt_audios_sample_rate)
@@ -335,7 +334,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
    def prepare_prompt_audio(
        self, prompt_audios_list: list[torch.Tensor], prompt_audios_sample_rate: list[int]
    ):
        # All prompt audios must be sampled at 16 kHz
        assert all(sample_rate == 16000 for sample_rate in prompt_audios_sample_rate)
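Both methods assert 16 kHz prompt audio rather than resampling it themselves. A caller holding audio at another rate could resample first; a sketch using torchaudio (assumed to be installed alongside torch):

    import torch
    import torchaudio

    def ensure_16k(wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
        # Resample prompt audio to the 16 kHz rate the asserts above require
        if sample_rate == 16000:
            return wav
        return torchaudio.functional.resample(wav, orig_freq=sample_rate, new_freq=16000)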
@@ -385,7 +383,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
        cache_dict = self.get_prompt_audio_cache_for_streaming_tts(
            prompt_speech_tokens_list, prompt_mels_for_flow, prompt_mels_lens_for_flow, spk_emb_for_flow
        )
        self.speaker_cache[speaker_id] = {'prompt_audio_dict': prompt_audio_dict, 'cache_dict': cache_dict}
        print(f"speaker_id {speaker_id} added to cache")

        if request_id not in self.streaming_flow_cache:
            self.streaming_flow_cache[request_id] = {k: v.clone() for k, v in self.speaker_cache[speaker_id]['cache_dict'].items()}
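The clone on the last line implements a two-level cache: prompt features are computed once per speaker, then cloned into per-request streaming state so concurrent requests never mutate shared tensors. A stripped-down sketch of the pattern (names hypothetical):

    import torch

    speaker_cache: dict[str, dict[str, torch.Tensor]] = {}    # filled once per speaker
    streaming_cache: dict[str, dict[str, torch.Tensor]] = {}  # one entry per live request

    def begin_request(request_id: str, speaker_id: str) -> None:
        # Clone so each request mutates its own copy of the speaker's cached state
        streaming_cache[request_id] = {k: v.clone() for k, v in speaker_cache[speaker_id].items()}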
@@ -394,12 +391,6 @@ class CosyVoice2_Token2Wav(torch.nn.Module):
                source = torch.zeros(1, 1, 0, device='cuda'),
                speech = torch.zeros(1, 0, device='cuda'),
            )

        current_request_cache = self.streaming_flow_cache[request_id]
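The zero-length tensors above act as empty accumulators for the HiFT vocoder cache: concatenating along the time axis then works uniformly from the first streaming chunk onward. A small CPU illustration of the idiom:

    import torch

    cache = torch.zeros(1, 0)    # empty along the time axis
    chunk = torch.randn(1, 480)  # one hop of generated samples
    cache = torch.cat([cache, chunk], dim=1)
    print(cache.shape)  # torch.Size([1, 480])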
@@ -477,7 +468,6 @@ def get_args():
if __name__ == "__main__":
    args = get_args()
    model = CosyVoice2_Token2Wav(model_dir=args.model_dir, enable_trt=args.enable_trt)
    # Create the output directory if it does not already exist
    os.makedirs(args.output_dir, exist_ok=True)
    dataset_name = "yuekai/seed_tts_cosy2"
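dataset_name points at a Hugging Face dataset, so the benchmark loop presumably starts from datasets.load_dataset; the split name below is an assumption, not something this diff shows:

    from datasets import load_dataset

    dataset = load_dataset("yuekai/seed_tts_cosy2", split="test")  # split name is an assumption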