From 0bc48c11804c4462df33a18bd11b5e1e0d6377df Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 30 Jul 2025 11:05:49 +0000
Subject: [PATCH] update readme

---
 examples/grpo/cosyvoice2/Dockerfile           |  2 +-
 examples/grpo/cosyvoice2/README.md            | 22 +++++++------
 .../cosyvoice2/pretrained_to_huggingface.py   | 17 +++++++---
 ...rements-cosyvoice.txt => requirements.txt} |  0
 examples/grpo/cosyvoice2/run.sh               | 31 ++++++++++++++++---
 .../grpo/cosyvoice2/scripts/compute_wer.sh    |  1 +
 6 files changed, 54 insertions(+), 19 deletions(-)
 rename examples/grpo/cosyvoice2/{requirements-cosyvoice.txt => requirements.txt} (100%)

diff --git a/examples/grpo/cosyvoice2/Dockerfile b/examples/grpo/cosyvoice2/Dockerfile
index 0585c20..17d80ed 100644
--- a/examples/grpo/cosyvoice2/Dockerfile
+++ b/examples/grpo/cosyvoice2/Dockerfile
@@ -1,5 +1,5 @@
 FROM verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2
-COPY requirements-cosyvoice.txt /myworkspace/requirements.txt
+COPY requirements.txt /myworkspace/requirements.txt
 RUN pip install -r /myworkspace/requirements.txt
 RUN pip install -U nvidia-pytriton
 RUN git clone https://github.com/yuekaizhang/verl.git /myworkspace/verl -b thread && cd /myworkspace/verl && pip install --no-deps -e .
diff --git a/examples/grpo/cosyvoice2/README.md b/examples/grpo/cosyvoice2/README.md
index 2e1787a..8783aa1 100644
--- a/examples/grpo/cosyvoice2/README.md
+++ b/examples/grpo/cosyvoice2/README.md
@@ -1,6 +1,6 @@
 # CosyVoice2 LLM Reinforcement Learning Recipe
 
-This recipe demonstrates how to fine-tune the **CosyVoice2** large language model with reinforcement learning algorithms—specifically **GRPO**—using the [veRL](https://github.com/volcengine/verl) framework. Our experiments show that applying GRPO reduces the character error rate (CER) on the CosyVoice3 `zero_shot_zh` set from 4.08 % to 3.36 %.
+This recipe demonstrates how to fine-tune the **CosyVoice2** large language model with reinforcement learning algorithms—specifically **GRPO**—using the [veRL](https://github.com/volcengine/verl) framework. Our experiments show that applying GRPO reduces the character error rate (CER) on the CosyVoice3 `zero_shot_zh` set from 4.08% to 3.36%.
 
 ## Table of Contents
 
@@ -18,6 +18,7 @@ We recommend using the pre-built Docker image below. Alternatively, you can manu
 ```bash
 docker pull soar97/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2
 ```
+If Docker is not available, refer to stage `-2` in `run.sh` to install the dependencies locally.
 
 ## Data Preparation
 
@@ -43,16 +44,16 @@
 data/parquet_tiny/train.parquet
 data/parquet_tiny/test.parquet
 ```
 
-Each sample is automatically wrapped into a cosyvoice2-style prompt so that the LLM learns to output CosyVoice2 speech tokens.
+Each sample is automatically wrapped into a CosyVoice2-style prompt so that the LLM learns to output CosyVoice2 speech tokens.
 
 ## Reward Function & ASR Server
 
-To compute rewards we run a lightweight server that:
+To compute rewards, we run a lightweight server that:
 
 1. Converts generated speech tokens back to a 16 kHz waveform with the **CosyVoice2** pretrained U-Net model.
 2. Transcribes the waveform with **SenseVoice** ASR.
-3. Calculates the pinyin-level error rate relative to the ground-truth text and maps it to a score in the range \[0-1\].
+3. Calculates the pinyin-level error rate relative to the ground-truth text and maps it to a score between 0 and 1.
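+
+Conceptually, the scoring step behaves like the sketch below. This is a simplified illustration rather than the actual implementation in [`reward_tts.py`](./reward_tts.py); `pypinyin` is assumed for romanization, and both helper names are hypothetical:
+
+```python
+from pypinyin import lazy_pinyin
+
+def edit_distance(ref, hyp):
+    # Classic Levenshtein distance over pinyin token lists.
+    dp = list(range(len(hyp) + 1))
+    for i, r in enumerate(ref, 1):
+        prev, dp[0] = dp[0], i
+        for j, h in enumerate(hyp, 1):
+            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (r != h))
+    return dp[-1]
+
+def pinyin_reward(ground_truth: str, transcript: str) -> float:
+    # Error rate = edit distance normalized by reference length,
+    # then mapped to a reward and clipped into [0, 1].
+    ref, hyp = lazy_pinyin(ground_truth), lazy_pinyin(transcript)
+    err = edit_distance(ref, hyp) / max(len(ref), 1)
+    return max(0.0, 1.0 - err)
+```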
 
 Start the server (stage `1`) in a dedicated terminal or on a separate GPU:
 
@@ -61,7 +62,7 @@
 bash run.sh 1 1 # Triton server listens on ports 8000/8001/8002
 ```
 
-The custom reward implementation lives in [`reward_tts.py`](./reward_tts.py) and calls the server to obtain the reward score.
+The custom reward implementation is located in [`reward_tts.py`](./reward_tts.py) and calls the server to obtain the reward score.
 
 ## Training
 
@@ -78,10 +79,12 @@ Key CLI arguments passed to `verl.trainer.main_ppo`:
 * `custom_reward_function.path=reward_tts.py` – custom reward function described above.
 
 Adjust `CUDA_VISIBLE_DEVICES`, batch sizes, and other hyperparameters to match your hardware.
+> [!NOTE]
+> The `lm_head` bias is disabled during training to keep the model compatible with vLLM and the Transformers Qwen implementation.
 
 ## Evaluation
 
-After training completes, collect the sharded FSDP weights and export a Hugging Face-style checkpoint (stage `3`):
+After training is complete, collect the sharded FSDP weights and export a Hugging Face-style checkpoint (stage `3`):
 
 ```bash
 bash run.sh 3 3 # merges weights into $llm_path/merged_hf_model
@@ -107,15 +110,16 @@ bash run.sh 5 5
 ```
 
 The script converts the Hugging Face checkpoint back into the format expected by the CosyVoice repository.
+> [!WARNING]
+> We observed a slight accuracy drop when using the RL-trained model after this conversion, compared with inference from the Hugging Face-format checkpoint.
 
 ## Results
 
 | Model | Seed-TTS `test_zh` CER | CosyVoice3 `zero_shot_zh` CER | Comment |
 |-------|------------------------|------------------------------|---------|
-| CosyVoice2 LLM (official) | 1.45 % | 4.08 % | See the [paper](https://arxiv.org/abs/2412.10117) |
-| CosyVoice2 LLM + GRPO | 1.37 % | **3.36 %** | See the [decoding results](yuekai/official-cosyvoice-llm-grpo-aishell3) |
+| CosyVoice2 LLM (official) | 1.45% | 4.08% | See the [paper](https://arxiv.org/abs/2412.10117) |
+| CosyVoice2 LLM + GRPO | 1.37% | **3.36%** | See the [decoding results](yuekai/official-cosyvoice-llm-grpo-aishell3) (Hugging Face-format model) |
 
 ## Acknowledgement
 
 This work was inspired by the implementation in [ch-tts-llasa-rl-grpo](https://github.com/channel-io/ch-tts-llasa-rl-grpo).
-
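verl resolves `custom_reward_function.path` to a plain Python scoring function, which is the shape [`reward_tts.py`](./reward_tts.py) has to follow. A minimal sketch, assuming verl's default entry-point name `compute_score`; the HTTP route and the `score` response field are hypothetical placeholders, not the actual server protocol:

```python
import requests

def compute_score(data_source, solution_str, ground_truth, extra_info=None):
    """Return a scalar reward in [0, 1] for one rollout.

    solution_str carries the generated CosyVoice2 speech-token string and
    ground_truth the reference transcript; the heavy lifting (token2wav,
    SenseVoice ASR, pinyin scoring) happens inside the reward server
    started by `bash run.sh 1 1`.
    """
    resp = requests.post(
        "http://localhost:8000/v2/models/reward/infer",  # hypothetical route
        json={"tokens": solution_str, "text": ground_truth},  # hypothetical payload
        timeout=60,
    )
    return float(resp.json()["score"])
```
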
diff --git a/examples/grpo/cosyvoice2/pretrained_to_huggingface.py b/examples/grpo/cosyvoice2/pretrained_to_huggingface.py
index e2a9962..161a11f 100644
--- a/examples/grpo/cosyvoice2/pretrained_to_huggingface.py
+++ b/examples/grpo/cosyvoice2/pretrained_to_huggingface.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -94,7 +95,8 @@ if __name__ == "__main__":
     with torch.no_grad():
-        # set the weight and bias of the new lm_head to 0
+        # zero the weights of the new lm_head and mask its bias
         new_lm_head.weight.data.zero_()
-        new_lm_head.bias.data.zero_()
+        # fill the bias with -inf so positions outside the copied speech-token rows can never be sampled
+        new_lm_head.bias.data.fill_(-float('inf'))
         new_lm_head.weight[original_tokenizer_vocab_size:original_tokenizer_vocab_size + cosyvoice2_token_size + 3] = llm_decoder.weight
         new_lm_head.bias[original_tokenizer_vocab_size:original_tokenizer_vocab_size + cosyvoice2_token_size + 3] = llm_decoder.bias
@@ -107,8 +109,7 @@
     eos_token_ids = [original_tokenizer_vocab_size + cosyvoice2_token_size,
                      original_tokenizer_vocab_size + cosyvoice2_token_size + 1,
-                     original_tokenizer_vocab_size + cosyvoice2_token_size + 2,
-                     original_tokenizer_vocab_size + cosyvoice2_token_size + 3]
+                     original_tokenizer_vocab_size + cosyvoice2_token_size + 2]
     llm.generation_config.eos_token_id = eos_token_ids
     llm.generation_config.temperature = 1.0
     llm.generation_config.top_p = 0.8
@@ -121,6 +122,14 @@
     llm.to(torch.bfloat16)
     llm.save_pretrained(args.save_path)
 
-    TEMPLATE = "{%- for message in messages %}{%- if message['role'] == 'user' %}{{- '<|sos|>' + message['content'] + '<|task_id|>' }}{%- elif message['role'] == 'assistant' %}{{- message['content']}}{%- endif %}{%- endfor %}"
+    TEMPLATE = (
+        "{%- for message in messages %}"
+        "{%- if message['role'] == 'user' %}"
+        "{{- '<|sos|>' + message['content'] + '<|task_id|>' }}"
+        "{%- elif message['role'] == 'assistant' %}"
+        "{{- message['content']}}"
+        "{%- endif %}"
+        "{%- endfor %}"
+    )
     tokenizer.chat_template = TEMPLATE
     tokenizer.save_pretrained(args.save_path)
diff --git a/examples/grpo/cosyvoice2/requirements-cosyvoice.txt b/examples/grpo/cosyvoice2/requirements.txt
similarity index 100%
rename from examples/grpo/cosyvoice2/requirements-cosyvoice.txt
rename to examples/grpo/cosyvoice2/requirements.txt
diff --git a/examples/grpo/cosyvoice2/run.sh b/examples/grpo/cosyvoice2/run.sh
index 1a09104..389914e 100644
--- a/examples/grpo/cosyvoice2/run.sh
+++ b/examples/grpo/cosyvoice2/run.sh
@@ -3,7 +3,7 @@ set -eou pipefail
 
 stage=-1
-stop_stage=5
+stop_stage=4
 
 log() {
   # This function is from espnet
@@ -15,6 +15,22 @@ export PYTHONPATH=/workspace/CosyVoice
 model_scope_model_path=./CosyVoice2-0.5B
 sft_model_path=./transformers_cosyvoice2_llm
 
+if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
+  log "stage -2: install dependencies locally if the pre-built docker image is not available"
+  conda create -n cosyvoice2 python=3.10 -y
+  conda activate cosyvoice2
+  # install verl
+  git clone https://github.com/yuekaizhang/verl.git -b thread
+  cd verl
+  USE_MEGATRON=0 bash scripts/install_vllm_sglang_mcore.sh
+  pip install --no-deps -e .
+  cd -
+  # install requirements
+  pip install -r requirements.txt
+  pip install -U nvidia-pytriton
+  git clone https://github.com/yuekaizhang/PytritonSenseVoice.git && cd PytritonSenseVoice && pip install -e . && cd -
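+  # Optional sanity check: confirm the editable installs are importable
+  # before running the later stages.
+  python -c "import verl, pytriton" && echo "local install OK"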
+fi
+
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
   log "stage -1: download official CosyVoice2-0.5B LLM model and convert to huggingface compatible checkpoint"
   modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_path
@@ -24,13 +40,15 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
 
   # Or, you could use the following command to download the huggingface compatible checkpoint
   # huggingface-cli download --local-dir $sft_model_path yuekai/cosyvoice2_llm
+
+  # Note: we remove the lm_head bias to keep the checkpoint compatible with the Qwen2.5-0.5B model in Transformers.
 fi
 
 data_dir=data/parquet_aishell3
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "stage 0: prepare data into verl format"
   mkdir -p $data_dir
-  wget https://huggingface.co/datasets/SparkAudio/voxbox/resolve/main/metadata/aishell-3.jsonl -O data/aishell-3.jsonl
+  wget -O data/aishell-3.jsonl https://huggingface.co/datasets/SparkAudio/voxbox/resolve/main/metadata/aishell-3.jsonl
   # total 88035 samples
   head -n 80000 data/aishell-3.jsonl > data/train.jsonl
   tail -n 100 data/aishell-3.jsonl > data/test.jsonl
@@ -98,7 +116,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
     trainer.val_before_train=False
 fi
 
-step=400
+steps=(100 200 300 400 500)
+for step in "${steps[@]}"; do
 llm_path=./checkpoints/cosyvoice2_grpo/$exp_name/global_step_${step}
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "stage 3: merge the model"
@@ -111,7 +130,7 @@ fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "stage 4: Test the model"
   dataset=zero_shot_zh
-  # dataset=test_zh
+  # dataset=test_zh  # use for the Seed-TTS test_zh set
   output_dir=./outputs_${exp_name}_${step}_${dataset}
 
   token2wav_path=/workspace/CosyVoice2-0.5B
@@ -127,12 +146,14 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 
   bash scripts/compute_wer.sh $output_dir ${dataset}
 fi
+done
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "stage 5: Convert the RL trained model to CosyVoice repo format"
   python3 huggingface_to_pretrained.py \
     --hf-cosyvoice2-llm-path $llm_path/merged_hf_model \
-    --pretrained-cosyvoice2-path /workspace/CosyVoice2-0.5B \
     --output-path /workspace/CosyVoice2-0.5B/llm-new.pt
   # You need to manually move the llm-new.pt to overwrite /workspace/CosyVoice2-0.5B/llm.pt
+  # However, we found that the RL-trained model's accuracy drops slightly after this conversion.
+  # Be careful, or use the Hugging Face-format inference code instead.
 fi
\ No newline at end of file
diff --git a/examples/grpo/cosyvoice2/scripts/compute_wer.sh b/examples/grpo/cosyvoice2/scripts/compute_wer.sh
index 55ae1a7..43a6872 100644
--- a/examples/grpo/cosyvoice2/scripts/compute_wer.sh
+++ b/examples/grpo/cosyvoice2/scripts/compute_wer.sh
@@ -10,6 +10,7 @@ model_path=models/sherpa-onnx-paraformer-zh-2023-09-14
 if [ ! -d $model_path ]; then
   pip install sherpa-onnx
   wget -nc https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
+  mkdir -p models
   tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 -C models
 fi
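
Because the stage `5` conversion can cost a little accuracy, Hugging Face-format inference with the stage `3` merged checkpoint is the safer path. A minimal sketch, assuming the merged output directory from stage `3` and the chat template plus sampling defaults written by `pretrained_to_huggingface.py` (the path and input text are placeholders):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./checkpoints/cosyvoice2_grpo/merged_hf_model"  # placeholder path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)

# The chat template renders "<|sos|>" + text + "<|task_id|>" for user turns.
messages = [{"role": "user", "content": "你好，欢迎使用语音合成。"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt")

# temperature, top_p, and the speech eos ids come from the saved generation_config.
output = model.generate(**inputs, max_new_tokens=512, do_sample=True)
speech_tokens = output[0, inputs["input_ids"].shape[1]:]
print(speech_tokens)  # CosyVoice2 speech tokens for the token2wav stage
```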