Fix lint: normalize blank lines, drop a whitespace-only line, and wrap an over-long call

2026-02-05 18:09:24 +08:00 · 2025-09-03 03:45:17 -07:00
parent 633b991290
commit ad257b06e3
4 changed files with 14 additions and 20 deletions
--- a/runtime/triton_trtllm/model_repo/cosyvoice2/1/model.py
+++ b/runtime/triton_trtllm/model_repo/cosyvoice2/1/model.py
@@ -41,11 +41,11 @@ from transformers import AutoTokenizer
 import torchaudio


-
 from matcha.utils.audio import mel_spectrogram

 torch.set_num_threads(1)

+
 class TritonPythonModel:
    """Triton Python model for Spark TTS.

@@ -193,7 +193,6 @@ class TritonPythonModel:

        return prompt_speech_tokens

-
    def forward_speaker_embedding(self, wav):
        """Forward pass through the speaker embedding component.

@@ -219,7 +218,6 @@ class TritonPythonModel:

        return prompt_spk_embedding

-
    def forward_token2wav(
            self,
            prompt_speech_tokens: torch.Tensor,
@@ -254,7 +252,6 @@ class TritonPythonModel:
            inputs_tensor.append(token_offset_tensor)
            inputs_tensor.append(finalize_tensor)

-
        # Create and execute inference request
        inference_request = pb_utils.InferenceRequest(
            model_name='token2wav',
@@ -281,7 +278,6 @@ class TritonPythonModel:
        input_ids = torch.cat([input_ids, prompt_speech_tokens], dim=1)
        return input_ids

-
    def _extract_speech_feat(self, speech):
        speech_feat = mel_spectrogram(
            speech,
@@ -299,7 +295,6 @@ class TritonPythonModel:
        speech_feat = speech_feat.unsqueeze(dim=0)
        return speech_feat

-
    def _llm_gen_thread(self, generated_ids_iter, semantic_token_ids_arr, llm_is_done_flag):
        for generated_ids in generated_ids_iter:
            generated_ids = generated_ids.tolist()
@@ -338,7 +333,6 @@ class TritonPythonModel:
            prompt_speech_feat = speech_feat[:, :2 * token_len].contiguous().half()
            prompt_speech_tokens = prompt_speech_tokens[:, :token_len].contiguous()

-
            flow_prompt_speech_token_len = prompt_speech_tokens.shape[-1]

            reference_text = pb_utils.get_input_tensor_by_name(request, "reference_text").as_numpy()
@@ -385,7 +379,9 @@ class TritonPythonModel:
                        this_tts_speech_token = semantic_token_ids_arr[:token_offset + this_token_hop_len + self.flow_pre_lookahead_len]
                        this_tts_speech_token = torch.tensor(this_tts_speech_token).unsqueeze(dim=0).to(torch.int32).to(self.device)

-                        sub_tts_speech = self.forward_token2wav(prompt_speech_tokens, prompt_speech_feat, prompt_spk_embedding, this_tts_speech_token, request_id, token_offset, False)
+                        sub_tts_speech = self.forward_token2wav(
+                            prompt_speech_tokens, prompt_speech_feat, prompt_spk_embedding,
+                            this_tts_speech_token, request_id, token_offset, False)

                        audio_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(sub_tts_speech))
                        inference_response = pb_utils.InferenceResponse(output_tensors=[audio_tensor])
@@ -413,7 +409,6 @@ class TritonPythonModel:
                                    else:
                                        this_token_hop_len = self.token_hop_len
                                    this_token_hop_len = max(self.token_hop_len, this_token_hop_len)
-                        
                        chunk_index += 1
                    else:
                        time.sleep(0.02)
--- a/runtime/triton_trtllm/model_repo/speaker_embedding/1/model.py
+++ b/runtime/triton_trtllm/model_repo/speaker_embedding/1/model.py
@@ -143,7 +143,6 @@ class TritonPythonModel:

            embedding = self._extract_spk_embedding(wav_array)

-
            prompt_spk_embedding_tensor = pb_utils.Tensor.from_dlpack(
                "prompt_spk_embedding", to_dlpack(embedding))
            inference_response = pb_utils.InferenceResponse(
--- a/runtime/triton_trtllm/model_repo/token2wav/1/model.py
+++ b/runtime/triton_trtllm/model_repo/token2wav/1/model.py
@@ -49,6 +49,7 @@ logger = logging.getLogger(__name__)
 ORIGINAL_VOCAB_SIZE = 151663
 torch.set_num_threads(1)

+
 class CosyVoice2:

    def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1, device='cuda'):
@@ -123,7 +124,6 @@ class CosyVoice2Model:
        input_names = ["x", "mask", "mu", "cond"]
        return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}

-
    def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
        with torch.cuda.amp.autocast(self.fp16):
            tts_mel, _ = self.flow.inference(token=token.to(self.device),