From 05bdf4c76983c875fdb03f56272ac0da6c0682c9 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Tue, 5 Aug 2025 11:15:42 +0800 Subject: [PATCH] add contributor info --- README.md | 4 ++++ runtime/triton_trtllm/Dockerfile.server | 2 ++ runtime/triton_trtllm/README.md | 2 ++ runtime/triton_trtllm/run.sh | 9 +++++---- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 18fcb49..5e3cfd5 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,10 @@ ## Roadmap +- [x] 2025/08 + + - [x] Thanks to the contribution from NVIDIA Yuekai Zhang, add triton trtllm runtime support + - [x] 2025/07 - [x] release cosyvoice 3.0 eval set diff --git a/runtime/triton_trtllm/Dockerfile.server b/runtime/triton_trtllm/Dockerfile.server index 8e827b4..38494f4 100644 --- a/runtime/triton_trtllm/Dockerfile.server +++ b/runtime/triton_trtllm/Dockerfile.server @@ -1,4 +1,6 @@ FROM nvcr.io/nvidia/tritonserver:25.06-trtllm-python-py3 +LABEL maintainer="zhangyuekai@foxmail.com" + RUN apt-get update && apt-get install -y cmake RUN git clone https://github.com/pytorch/audio.git && cd audio && git checkout c670ad8 && PATH=/usr/local/cuda/bin:$PATH python3 setup.py develop COPY ./requirements.txt /workspace/requirements.txt diff --git a/runtime/triton_trtllm/README.md b/runtime/triton_trtllm/README.md index 017d8fc..b1e091c 100644 --- a/runtime/triton_trtllm/README.md +++ b/runtime/triton_trtllm/README.md @@ -1,5 +1,7 @@ ## Best Practices for Serving CosyVoice with NVIDIA Triton Inference Server +Thanks to the contribution from NVIDIA Yuekai Zhang. + ### Quick Start Launch the service directly with Docker Compose: ```sh diff --git a/runtime/triton_trtllm/run.sh b/runtime/triton_trtllm/run.sh index 922105d..2e81896 100644 --- a/runtime/triton_trtllm/run.sh +++ b/runtime/triton_trtllm/run.sh @@ -1,4 +1,5 @@ - +#!/bin/bash +# Copyright (c) 2025 NVIDIA (authors: Yuekai Zhang) export CUDA_VISIBLE_DEVICES=0 cosyvoice_path=/workspace/CosyVoice export PYTHONPATH=${cosyvoice_path}:$PYTHONPATH @@ -24,8 +25,8 @@ fi if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then echo "Downloading CosyVoice2-0.5B" - huggingface-cli download --local-dir $huggingface_model_local_dir yuekai/cosyvoice2_llm - modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_local_dir + huggingface-cli download --local-dir $huggingface_model_local_dir yuekai/cosyvoice2_llm + modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_local_dir fi @@ -67,7 +68,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then BLS_INSTANCE_NUM=4 TRITON_MAX_BATCH_SIZE=16 DECOUPLED_MODE=False - + python3 scripts/fill_template.py -i ${model_repo}/token2wav/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS} python3 scripts/fill_template.py -i ${model_repo}/audio_tokenizer/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS} python3 scripts/fill_template.py -i ${model_repo}/${cosyvoice2_dir}/config.pbtxt model_dir:${MODEL_DIR},bls_instance_num:${BLS_INSTANCE_NUM},llm_tokenizer_dir:${LLM_TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}