Merge branch 'main' into dev/lyuxiang.lx

2026-02-04 09:29:25 +08:00 · 2025-12-04 18:00:17 +08:00
parent f76f5abcc1 6e01309e01
commit d985100326
35 changed files with 3584 additions and 311 deletions
--- a/examples/grpo/cosyvoice2/README.md
+++ b/examples/grpo/cosyvoice2/README.md
@@ -36,7 +36,7 @@ Stage `0` converts raw JSONL files into the parquet format expected by veRL:
 ```bash
 bash run.sh 0 0
 ```
-Create two JSONL files—`train.jsonl` and `test.jsonl`.  
+Create two JSONL files—`train.jsonl` and `test.jsonl`.
 The script will then generate two Parquet files:

 ```
@@ -111,7 +111,7 @@ bash run.sh 5 5

 The script converts the Hugging Face checkpoint back into the format expected by the CosyVoice repository.
 > [!TIP]
->  However, we observed a slight accuracy drop when using the RL-trained model after conversion, compared with the Hugging Face format. 
+>  However, we observed a slight accuracy drop when using the RL-trained model after conversion, compared with the Hugging Face format.

 ## Results

--- a/examples/grpo/cosyvoice2/infer_dataset.py
+++ b/examples/grpo/cosyvoice2/infer_dataset.py
@@ -53,7 +53,7 @@ except RuntimeError:
    pass


-TEMPLATE = "{% for message in messages %}{%- if message['role'] == 'user' %}{{- '<|im_start|>' + message['role'] + '\n' + 'Convert the text to speech: ' + message['content'] + '<|im_end|>\n'}}{%- elif message['role'] == 'assistant' %}{{- '<|im_start|>' + message['role'] + '\n' + '<|SPEECH_GENERATION_START|>' + message['content']}}{%- endif %}{%- endfor %}"
+TEMPLATE = "{% for message in messages %}{%- if message['role'] == 'user' %}{{- '<|im_start|>' + message['role'] + '\n' + 'Convert the text to speech: ' + message['content'] + '<|im_end|>\n'}}{%- elif message['role'] == 'assistant' %}{{- '<|im_start|>' + message['role'] + '\n' + '<|SPEECH_GENERATION_START|>' + message['content']}}{%- endif %}{%- endfor %}"  # noqa: E501


 def audio_decode_cosyvoice2(
--- a/examples/grpo/cosyvoice2/pretrained_to_huggingface.py
+++ b/examples/grpo/cosyvoice2/pretrained_to_huggingface.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
--- a/examples/grpo/cosyvoice2/run.sh
+++ b/examples/grpo/cosyvoice2/run.sh
@@ -33,7 +33,7 @@ fi

 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "stage -1: download official CosyVoice2-0.5B LLM model and convert to huggingface compatible checkpoint"
-  modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_path 
+  modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_path
  python3 pretrained_to_huggingface.py \
    --pretrained-cosyvoice2-path $model_scope_model_path \
    --save-path $sft_model_path
@@ -61,7 +61,7 @@ fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "stage 1: start token2wav asr server for reward function"
  python3 token2wav_asr_server.py --number-of-devices 8
-fi 
+fi

 exp_name=official_llm_aishell3_grpo
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
@@ -125,7 +125,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
      --backend fsdp \
      --local_dir $llm_path/actor \
      --target_dir $llm_path/merged_hf_model || exit 1
-fi 
+fi

 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "stage 4: Test the model"
--- a/examples/grpo/cosyvoice2/scripts/offline-decode-files.py
+++ b/examples/grpo/cosyvoice2/scripts/offline-decode-files.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-#
 # Copyright (c)  2023 by manyeyes
 # Copyright (c)  2023  Xiaomi Corporation

@@ -195,7 +193,7 @@ def write_error_stats(
            hyp = list("".join(hyp))
            results[i] = (cut_id, ref, hyp)

-    for cut_id, ref, hyp in results:
+    for _cut_id, ref, hyp in results:
        ali = kaldialign.align(ref, hyp, ERR, sclite_mode=sclite_mode)
        for ref_word, hyp_word in ali:
            if ref_word == ERR:
--- a/examples/grpo/cosyvoice2/token2wav_asr_server.py
+++ b/examples/grpo/cosyvoice2/token2wav_asr_server.py
@@ -295,7 +295,7 @@ def main():
        metrics_port=8002,
    )

-    device_ids = [i for i in range(args.number_of_devices)]
+    device_ids = list(range(args.number_of_devices))
    device_ids = device_ids * args.number_of_instances_per_device

    with Triton(config=triton_config) as triton: