mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-05 18:09:24 +08:00)
add disaggregated deployment
@@ -45,7 +45,8 @@ bash run_stepaudio2_dit_token2wav.sh <start_stage> <stop_stage>
 - **Stage 4**: Runs the gRPC benchmark client for performance testing.
 - **Stage 5**: Runs the offline TTS inference benchmark test.
 - **Stage 6**: Runs a standalone inference script for the Step-Audio2-mini DiT Token2Wav model.
+- **Stage 7**: Launches servers in a disaggregated setup, with the LLM on GPU 0 and Token2Wav servers on GPUs 1-3.
+- **Stage 8**: Runs the benchmark client for the disaggregated server configuration.

 ### Export Models and Launch Server

 Inside the Docker container, prepare the models and start the Triton server by running stages 0-3:
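For example, using the `<start_stage> <stop_stage>` convention above, the two new stages can be driven like this (stage 7 blocks on `wait`, so run stage 8 from a second shell):

```bash
# Bring up the disaggregated servers (LLM on GPU 0, Token2wav on GPUs 1-3)
bash run_stepaudio2_dit_token2wav.sh 7 7

# In another shell: sweep the benchmark client against those servers
bash run_stepaudio2_dit_token2wav.sh 8 8
```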
@@ -100,6 +101,40 @@ The following results were obtained by decoding on a single L20 GPU with the `yuekai/seed_tts_cosy2` dataset.
 | TRTLLM | 16 | 2.01 | 5.03 | 0.0292 |

+### Disaggregated Server
+
+When the LLM and token2wav components are deployed on the same GPU, they compete for resources. To optimize performance, we use a disaggregated setup where the LLM is deployed on one dedicated L20 GPU, taking advantage of in-flight batching for inference. The token2wav module is deployed on separate, dedicated GPUs.
+
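A minimal sketch of this layout, distilled from stage 7 of the run script shown further down (two Token2wav Triton instances per GPU; `$huggingface_model_local_dir`, `$trt_engines_dir`, and `$model_repo` are the script's own variables):

```bash
# LLM on GPU 0: one trtllm-serve instance, relying on in-flight batching
CUDA_VISIBLE_DEVICES=0 trtllm-serve serve --tokenizer $huggingface_model_local_dir \
    $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &

# Token2wav on GPU 1: two Triton instances, each with its own port triple
CUDA_VISIBLE_DEVICES=1 tritonserver --model-repository $model_repo \
    --http-port 17001 --grpc-port 18001 --metrics-port 16001 &
CUDA_VISIBLE_DEVICES=1 tritonserver --model-repository $model_repo \
    --http-port 17002 --grpc-port 18002 --metrics-port 16002 &
```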
+The table below shows the first-chunk latency results for this configuration. In our tests, we deploy two token2wav instances on each dedicated token2wav GPU.
+
+| token2wav_num_gpu | concurrent_task_per_instance | concurrent_tasks_per_gpu | avg (ms) | p50 (ms) | p90 (ms) | p99 (ms) |
+|---|---|---|---|---|---|---|
+| 1 | 1 | 1.00 | 218.53 | 217.86 | 254.07 | 296.49 |
+| 2 | 1 | 1.33 | 218.82 | 219.21 | 256.62 | 303.13 |
+| 3 | 1 | 1.50 | 229.08 | 223.27 | 302.13 | 324.41 |
+| 4 | 1 | 1.60 | 203.87 | 198.23 | 254.92 | 279.31 |
+| 1 | 2 | 2.00 | 293.46 | 280.53 | 370.81 | 407.40 |
+| 2 | 2 | 2.67 | 263.38 | 236.84 | 350.82 | 397.39 |
+| 3 | 2 | 3.00 | 308.09 | 275.48 | 385.22 | 521.45 |
+| 4 | 2 | 3.20 | 271.85 | 253.25 | 359.03 | 387.91 |
+| 1 | 3 | 3.00 | 389.15 | 373.01 | 469.22 | 542.89 |
+| 2 | 3 | 4.00 | 403.48 | 394.80 | 481.24 | 507.75 |
+| 3 | 3 | 4.50 | 406.33 | 391.28 | 495.43 | 571.29 |
+| 4 | 3 | 4.80 | 436.72 | 383.81 | 638.44 | 879.23 |
+| 1 | 4 | 4.00 | 520.12 | 493.98 | 610.38 | 739.85 |
+| 2 | 4 | 5.33 | 494.60 | 490.50 | 605.93 | 708.09 |
+| 3 | 4 | 6.00 | 538.23 | 508.33 | 687.62 | 736.96 |
+| 4 | 4 | 6.40 | 579.68 | 546.20 | 721.53 | 958.04 |
+| 1 | 5 | 5.00 | 635.02 | 623.30 | 786.85 | 819.84 |
+| 2 | 5 | 6.67 | 598.23 | 617.09 | 741.00 | 788.96 |
+| 3 | 5 | 7.50 | 644.78 | 684.40 | 786.45 | 1009.45 |
+| 4 | 5 | 8.00 | 733.92 | 642.26 | 1024.79 | 1281.55 |
+| 1 | 6 | 6.00 | 715.38 | 745.68 | 887.04 | 906.68 |
+| 2 | 6 | 8.00 | 748.31 | 753.94 | 873.59 | 1007.14 |
+| 3 | 6 | 9.00 | 900.27 | 822.28 | 1431.14 | 1800.23 |
+| 4 | 6 | 9.60 | 857.54 | 820.33 | 1150.30 | 1298.53 |
+
+The `concurrent_tasks_per_gpu` column is calculated as:
+
+`concurrent_tasks_per_gpu = concurrent_task_per_instance * num_token2wav_instance_per_gpu (2) * token2wav_gpus / (token2wav_gpus + llm_gpus (1))`
+
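As a sanity check, the whole column can be regenerated from that formula; a small sketch (awk only for the floating-point division):

```bash
# concurrent_tasks_per_gpu = tasks_per_instance * 2 instances/GPU * gpus / (gpus + 1 LLM GPU)
for gpus in 1 2 3 4; do
    for tasks in 1 2 3 4 5 6; do
        awk -v t="$tasks" -v g="$gpus" \
            'BEGIN { printf "gpus=%d tasks=%d -> %.2f\n", g, t, t * 2 * g / (g + 1) }'
    done
done
```

For instance, 2 token2wav GPUs at 1 task per instance gives 1 * 2 * 2 / (2 + 1) ≈ 1.33, matching the second row of the table.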
 ### Acknowledgements

@@ -134,6 +134,8 @@ def write_triton_stats(stats, summary_file):
     compute_output = batch["compute_output"]
     compute_infer = batch["compute_infer"]
     batch_count = int(compute_infer["count"])
+    if batch_count == 0:
+        continue
     assert compute_infer["count"] == compute_output["count"] == compute_input["count"]
     compute_infer_time_ms = int(compute_infer["ns"]) / 1e6
     compute_input_time_ms = int(compute_input["ns"]) / 1e6
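The fields parsed here map onto Triton's model-statistics API (`batch_stats[].compute_input/compute_infer/compute_output`, each a `{count, ns}` pair); the new guard skips entries whose count is zero before per-batch figures are derived from them. A quick way to inspect the raw numbers on a live server, assuming the stage-3 Triton server on HTTP port 18000 and the `cosyvoice2_dit` model name used by `client_grpc.py`:

```bash
# Dump per-batch-size compute_input / compute_infer / compute_output stats
curl -s http://localhost:18000/v2/models/cosyvoice2_dit/stats | python3 -m json.tool
```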
@@ -20,7 +20,7 @@ trt_weights_dir=./trt_weights_${trt_dtype}
 trt_engines_dir=./trt_engines_${trt_dtype}

 model_repo=./model_repo_cosyvoice2_dit
-bls_instance_num=4
+bls_instance_num=10

 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
@@ -58,7 +58,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
     echo "Building TensorRT engines"
     trtllm-build --checkpoint_dir $trt_weights_dir \
         --output_dir $trt_engines_dir \
-        --max_batch_size 16 \
+        --max_batch_size 64 \
         --max_num_tokens 32768 \
         --gemm_plugin $trt_dtype || exit 1
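Note that `trtllm-build --max_batch_size` fixes the engine's upper limit: the runtime batch size requested later via `trtllm-serve --max_batch_size 64` (stages 3 and 7) cannot exceed what the engine was built with, so the two values move together.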
@@ -100,14 +100,14 @@ fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
     echo "Starting Token2wav Triton server and Cosyvoice2 llm using trtllm-serve"
-    mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 16 --kv_cache_free_gpu_memory_fraction 0.4 &
+    mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &
     tritonserver --model-repository $model_repo --http-port 18000 &
     wait
     # Test using curl
     # curl http://localhost:8000/v1/chat/completions \
     #     -H "Content-Type: application/json" \
     #     -d '{
-    #         "model": "trt_engines_bfloat16",
+    #         "model": "",
     #         "messages": [{"role": "user", "content": "Where is New York?"},
     #                      {"role": "assistant", "content": "<|s_1708|><|s_2050|><|s_2159|>"}],
     #         "max_tokens": 512,
@@ -172,3 +172,54 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
 fi

+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+    echo "Disaggregated Server: LLM and Token2wav on different GPUs"
+    echo "Starting LLM server on GPU 0"
+    export CUDA_VISIBLE_DEVICES=0
+    mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &
+
+    echo "Starting Token2wav servers on GPUs 1-3"
+    Token2wav_num_gpus=3
+    http_port=17000
+    grpc_port=18000
+    metrics_port=16000
+    for i in $(seq 0 $(($Token2wav_num_gpus - 1))); do
+        echo "Starting servers on GPU $(($i + 1))"
+        # Two instances of the Token2wav server share the same GPU
+        http_port=$((http_port + 1))
+        grpc_port=$((grpc_port + 1))
+        metrics_port=$((metrics_port + 1))
+        CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
+        http_port=$((http_port + 1))
+        grpc_port=$((grpc_port + 1))
+        metrics_port=$((metrics_port + 1))
+        CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
+    done
+    wait
+fi
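Before pointing the benchmark at these servers, each Token2wav instance can be probed for readiness; a sketch assuming Triton's standard HTTP readiness endpoint and the port scheme above (HTTP ports 17001-17006 for 2 instances x 3 GPUs):

```bash
# 200 means the instance is up and its models are ready
for port in $(seq 17001 17006); do
    code=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:$port/v2/health/ready)
    echo "Token2wav instance on port $port: HTTP $code"
done
```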
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+    echo "Running benchmark client for Disaggregated Server"
+    per_gpu_instances=2
+    mode=streaming
+    BLS_INSTANCE_NUM=$bls_instance_num
+    Token2wav_num_gpus=(1 2 3)
+    concurrent_tasks=(1 2 3 4 5 6)
+    for n_gpu in ${Token2wav_num_gpus[@]}; do
+        echo "Test 1 GPU for LLM server and $n_gpu GPUs for Token2wav servers"
+        for concurrent_task in ${concurrent_tasks[@]}; do
+            num_instances=$((per_gpu_instances * n_gpu))
+            for i in $(seq 1 $num_instances); do
+                port=$(($i + 18000))
+                python3 client_grpc.py \
+                    --server-addr localhost \
+                    --server-port $port \
+                    --model-name cosyvoice2_dit \
+                    --num-tasks $concurrent_task \
+                    --mode $mode \
+                    --huggingface-dataset yuekai/seed_tts_cosy2 \
+                    --log-dir ./log_disagg_concurrent_tasks_${concurrent_task}_per_instance_total_token2wav_instances_${num_instances}_port_${port} &
+            done
+            wait
+        done
+    done
+fi
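In this sweep, `--num-tasks` corresponds to the `concurrent_task_per_instance` column of the README table, one `client_grpc.py` process is launched per Token2wav instance (gRPC ports 18001 and up, matching stage 7), and the inner `wait` finishes one concurrency level before the next begins. The 4-GPU rows of the table would need `Token2wav_num_gpus` extended to include 4 and a fifth GPU available to stage 7.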