#!/usr/bin/env bash

set -eou pipefail

stage=-1
stop_stage=5

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
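
# Each stage below runs only when stage <= N <= stop_stage. For example, to
# re-run just the GRPO training stage, set the two variables above to:
#   stage=2
#   stop_stage=2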

export PYTHONPATH=/workspace/CosyVoice
model_scope_model_path=./CosyVoice2-0.5B
sft_model_path=./transformers_cosyvoice2_llm

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "stage -1: download official CosyVoice2-0.5B LLM model and convert to huggingface compatible checkpoint"
  modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_path
  python3 pretrained_to_huggingface.py \
    --pretrained-cosyvoice2-path $model_scope_model_path \
    --save-path $sft_model_path

  # Or, you could use the following command to download the huggingface compatible checkpoint:
  # huggingface-cli download --local-dir $sft_model_path yuekai/cosyvoice2_llm
fi
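
# Optional sanity check (a sketch, assuming pretrained_to_huggingface.py writes
# a standard Hugging Face layout with config.json, tokenizer files, and weights):
#   ls $sft_model_path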

data_dir=data/parquet_aishell3
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "stage 0: prepare data into verl format"
  mkdir -p $data_dir
  wget https://huggingface.co/datasets/SparkAudio/voxbox/resolve/main/metadata/aishell-3.jsonl -O data/aishell-3.jsonl
  # aishell-3.jsonl contains 88035 samples in total; use the first 80000 for
  # training and the last 100 for testing.
  head -n 80000 data/aishell-3.jsonl > data/train.jsonl
  tail -n 100 data/aishell-3.jsonl > data/test.jsonl
  python prepare_data.py \
    --train_file data/train.jsonl \
    --test_file data/test.jsonl \
    --local_dir $data_dir
fi
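
# Optional: check the row counts of the generated parquet files. A minimal
# sketch, assuming pyarrow is available in the environment:
#   python3 -c "import pyarrow.parquet as pq; print(pq.read_table('$data_dir/train.parquet').num_rows)"
#   python3 -c "import pyarrow.parquet as pq; print(pq.read_table('$data_dir/test.parquet').num_rows)"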

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "stage 1: start token2wav asr server for reward function"
  # Note: this call blocks the current shell; run this stage in a separate
  # terminal (or background the process, see the sketch after this block)
  # before starting stage 2.
  python3 token2wav_asr_server.py --number-of-devices 8
fi
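
# A sketch for running the server in the background instead of a separate
# terminal (log file name and warm-up sleep are illustrative, not from the
# original script):
#   python3 token2wav_asr_server.py --number-of-devices 8 > asr_server.log 2>&1 &
#   sleep 30  # give the server time to load models before starting stage 2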

exp_name=official_llm_aishell3_grpo
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "stage 2: grpo train"
  export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
  export MKL_SERVICE_FORCE_INTEL=TRUE
  n_gpus_per_node=8
  micro_batch_size=4
  train_batch_size=32
  python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=$data_dir/train.parquet \
    data.val_files=$data_dir/test.parquet \
    data.train_batch_size=$train_batch_size \
    data.max_prompt_length=1024 \
    data.max_response_length=512 \
    data.truncation='error' \
    actor_rollout_ref.model.use_remove_padding=False \
    actor_rollout_ref.model.path=$sft_model_path \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$micro_batch_size \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$micro_batch_size \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.do_sample=true \
    actor_rollout_ref.rollout.temperature=0.8 \
    actor_rollout_ref.rollout.top_p=0.95 \
    actor_rollout_ref.rollout.top_k=25 \
    actor_rollout_ref.rollout.n=4 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=true \
    actor_rollout_ref.rollout.val_kwargs.temperature=0.8 \
    actor_rollout_ref.rollout.val_kwargs.top_p=0.95 \
    actor_rollout_ref.rollout.val_kwargs.top_k=25 \
    reward_model.reward_manager=prime \
    custom_reward_function.path=reward_tts.py \
    custom_reward_function.name=compute_score \
    trainer.project_name='cosyvoice2_grpo' \
    trainer.experiment_name=$exp_name \
    trainer.logger=['console','wandb'] \
    trainer.n_gpus_per_node=$n_gpus_per_node \
    trainer.nnodes=1 \
    trainer.save_freq=100 \
    trainer.test_freq=100 \
    trainer.resume_mode='auto' \
    trainer.total_epochs=1 \
    trainer.val_before_train=False
fi
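
# Rollout accounting for the run above: GRPO estimates advantages relative to
# a group of samples per prompt, so rollout.n=4 turns each batch of
# train_batch_size=32 prompts into 4 * 32 = 128 sampled responses per step.
# Sampling with temperature=0.8, top_p=0.95, and top_k=25 keeps each group
# diverse enough for the relative rewards to carry signal; the per-response
# reward comes from compute_score in reward_tts.py, which relies on the
# token2wav + ASR server started in stage 1.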

step=400
llm_path=./checkpoints/cosyvoice2_grpo/$exp_name/global_step_${step}
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "stage 3: merge the FSDP-sharded actor into a huggingface checkpoint"
  python -m verl.model_merger merge \
    --backend fsdp \
    --local_dir $llm_path/actor \
    --target_dir $llm_path/merged_hf_model || exit 1
fi
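
# Optional sanity check (a sketch; assumes the merged checkpoint loads as a
# plain transformers causal LM, which is how the vllm rollout in stage 2
# treats it):
#   python3 -c "from transformers import AutoModelForCausalLM; m = AutoModelForCausalLM.from_pretrained('$llm_path/merged_hf_model'); print(m.config.model_type)"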

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "stage 4: test the model"
  dataset=zero_shot_zh
  # dataset=test_zh
  output_dir=./outputs_${exp_name}_${step}_${dataset}

  token2wav_path=/workspace/CosyVoice2-0.5B
  model_path=$llm_path/merged_hf_model

  CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  torchrun --nproc_per_node=8 \
    infer_dataset.py \
      --output-dir $output_dir \
      --llm-model-name-or-path $model_path \
      --token2wav-path $token2wav_path \
      --split-name ${dataset} || exit 1

  bash scripts/compute_wer.sh $output_dir ${dataset}
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "stage 5: convert the RL-trained model back to the CosyVoice repo format"
  python3 huggingface_to_pretrained.py \
    --hf-cosyvoice2-llm-path $llm_path/merged_hf_model \
    --pretrained-cosyvoice2-path /workspace/CosyVoice2-0.5B \
    --output-path /workspace/CosyVoice2-0.5B/llm-new.pt
  # You need to manually move llm-new.pt over /workspace/CosyVoice2-0.5B/llm.pt;
  # a sketch of that replacement follows this block.
fi
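
# A minimal sketch of that manual replacement (back up the original first):
#   cp /workspace/CosyVoice2-0.5B/llm.pt /workspace/CosyVoice2-0.5B/llm.pt.bak
#   mv /workspace/CosyVoice2-0.5B/llm-new.pt /workspace/CosyVoice2-0.5B/llm.pt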