mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-04 17:39:25 +08:00
online feature
This commit is contained in:
@@ -1 +0,0 @@
|
||||
../../../cosyvoice
|
||||
@@ -54,6 +54,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument('--des_dir',
|
||||
type=str)
|
||||
parser.add_argument('--instruct',
|
||||
type=str)
|
||||
type=str,
|
||||
default='')
|
||||
args = parser.parse_args()
|
||||
main()
|
||||
|
||||
@@ -27,7 +27,7 @@ fi
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
tools/extract_embedding.py --dir data/$x \
|
||||
../../../tools/extract_embedding.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/campplus.onnx
|
||||
done
|
||||
fi
|
||||
@@ -35,7 +35,7 @@ fi
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
tools/extract_speech_token.py --dir data/$x \
|
||||
../../../tools/extract_speech_token.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx
|
||||
done
|
||||
fi
|
||||
@@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
mkdir -p data/$x/parquet
|
||||
tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
--num_processes 10 \
|
||||
--src_dir data/$x \
|
||||
--des_dir data/$x/parquet
|
||||
@@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
||||
for model in llm flow hifigan; do
|
||||
torchrun --nnodes=1 --nproc_per_node=$num_gpus \
|
||||
--rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
|
||||
cosyvoice/bin/train.py \
|
||||
../../../cosyvoice/bin/train.py \
|
||||
--train_engine $train_engine \
|
||||
--config conf/cosyvoice.yaml \
|
||||
--train_data data/train.data.list \
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../../../tools
|
||||
@@ -139,7 +139,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
|
||||
get_tokenizer: !ref <get_tokenizer>
|
||||
allowed_special: !ref <allowed_special>
|
||||
filter: !name:cosyvoice.dataset.processor.filter
|
||||
max_length: 40960
|
||||
max_length: 6000
|
||||
min_length: 100
|
||||
token_max_length: 200
|
||||
token_min_length: 1
|
||||
@@ -158,7 +158,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
||||
center: False
|
||||
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
|
||||
feat_extractor: !ref <feat_extractor>
|
||||
token_mel_ratio: 2
|
||||
num_frames: 960
|
||||
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
|
||||
sample_rate: !ref <sample_rate>
|
||||
hop_size: 480
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../../../cosyvoice
|
||||
@@ -24,27 +24,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
tools/extract_embedding.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/campplus.onnx
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
tools/extract_speech_token.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
|
||||
done
|
||||
fi
|
||||
|
||||
# NOTE embedding/token extraction is not necessary now as we support online feature extraction
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
mkdir -p data/$x/parquet
|
||||
tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
--num_processes 10 \
|
||||
--src_dir data/$x \
|
||||
--des_dir data/$x/parquet
|
||||
@@ -69,12 +54,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
||||
for model in llm flow hifigan; do
|
||||
torchrun --nnodes=1 --nproc_per_node=$num_gpus \
|
||||
--rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
|
||||
cosyvoice/bin/train.py \
|
||||
../../../cosyvoice/bin/train.py \
|
||||
--train_engine $train_engine \
|
||||
--config conf/cosyvoice2.yaml \
|
||||
--train_data data/train.data.list \
|
||||
--cv_data data/dev.data.list \
|
||||
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
|
||||
--onnx_path $pretrained_model_dir \
|
||||
--model $model \
|
||||
--checkpoint $pretrained_model_dir/$model.pt \
|
||||
--model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
|
||||
|
||||
@@ -36,7 +36,7 @@ fi
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
tools/extract_embedding.py --dir data/$x \
|
||||
../../../tools/extract_embedding.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/campplus.onnx
|
||||
done
|
||||
fi
|
||||
@@ -44,7 +44,7 @@ fi
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 train-clean-100_reject train-clean-360_reject dev-clean dev-other test-clean test-other; do
|
||||
tools/extract_speech_token.py --dir data/$x \
|
||||
../../../tools/extract_speech_token.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
|
||||
done
|
||||
fi
|
||||
@@ -53,7 +53,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
mkdir -p data/$x/parquet
|
||||
tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
--num_processes 10 \
|
||||
--dpo \
|
||||
--src_dir data/$x \
|
||||
@@ -80,11 +80,12 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
||||
for model in llm; do
|
||||
torchrun --nnodes=1 --nproc_per_node=$num_gpus \
|
||||
--rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
|
||||
cosyvoice/bin/train.py \
|
||||
../../../cosyvoice/bin/train.py \
|
||||
--train_engine $train_engine \
|
||||
--config conf/cosyvoice2.yaml \
|
||||
--train_data data/train.data.list \
|
||||
--cv_data data/dev.data.list \
|
||||
--onnx_path $pretrained_model_dir \
|
||||
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
|
||||
--model $model \
|
||||
--checkpoint $pretrained_model_dir/$model.pt \
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../../../tools
|
||||
@@ -129,7 +129,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
|
||||
get_tokenizer: !ref <get_tokenizer>
|
||||
allowed_special: !ref <allowed_special>
|
||||
filter: !name:cosyvoice.dataset.processor.filter
|
||||
max_length: 40960
|
||||
max_length: 6000
|
||||
min_length: 100
|
||||
token_max_length: 200
|
||||
token_min_length: 1
|
||||
@@ -148,7 +148,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
||||
center: False
|
||||
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
|
||||
feat_extractor: !ref <feat_extractor>
|
||||
token_mel_ratio: 2
|
||||
num_frames: 960
|
||||
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
|
||||
sample_rate: !ref <sample_rate>
|
||||
hop_size: 480
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../../../cosyvoice
|
||||
@@ -25,36 +25,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
tools/extract_embedding.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/campplus.onnx
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
tools/extract_speech_token.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/speech_tokenizer_v3.onnx
|
||||
done
|
||||
fi
|
||||
|
||||
# NOTE embedding/token extraction is not necessary now as we support online feature extraction
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
|
||||
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
mkdir -p data/$x/parquet
|
||||
tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
--num_processes 10 \
|
||||
--instruct \
|
||||
--src_dir data/$x \
|
||||
--des_dir data/$x/parquet
|
||||
done
|
||||
fi
|
||||
|
||||
# train llm
|
||||
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
||||
export CUDA_VISIBLE_DEVICES="0"
|
||||
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
||||
job_id=1986
|
||||
dist_backend="nccl"
|
||||
@@ -71,12 +55,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
||||
for model in llm flow hifigan; do
|
||||
torchrun --nnodes=1 --nproc_per_node=$num_gpus \
|
||||
--rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
|
||||
cosyvoice/bin/train.py \
|
||||
../../../cosyvoice/bin/train.py \
|
||||
--train_engine $train_engine \
|
||||
--config conf/cosyvoice3.yaml \
|
||||
--train_data data/train.data.list \
|
||||
--cv_data data/dev.data.list \
|
||||
--qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
|
||||
--onnx_path $pretrained_model_dir \
|
||||
--model $model \
|
||||
--checkpoint $pretrained_model_dir/$model.pt \
|
||||
--model_dir `pwd`/exp/cosyvoice3/$model/$train_engine \
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../../../tools
|
||||
@@ -1 +0,0 @@
|
||||
../../../cosyvoice
|
||||
@@ -27,7 +27,7 @@ fi
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
|
||||
for x in dev test train; do
|
||||
tools/extract_embedding.py --dir data/$x \
|
||||
../../../tools/extract_embedding.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/campplus.onnx
|
||||
done
|
||||
fi
|
||||
@@ -35,7 +35,7 @@ fi
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
|
||||
for x in dev test train; do
|
||||
tools/extract_speech_token.py --dir data/$x \
|
||||
../../../tools/extract_speech_token.py --dir data/$x \
|
||||
--onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx
|
||||
done
|
||||
fi
|
||||
@@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
|
||||
for x in dev test train; do
|
||||
mkdir -p data/$x/parquet
|
||||
tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
|
||||
--num_processes 10 \
|
||||
--src_dir data/$x \
|
||||
--des_dir data/$x/parquet
|
||||
@@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
||||
for model in llm flow hifigan; do
|
||||
torchrun --nnodes=1 --nproc_per_node=$num_gpus \
|
||||
--rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
|
||||
cosyvoice/bin/train.py \
|
||||
../../../cosyvoice/bin/train.py \
|
||||
--train_engine $train_engine \
|
||||
--config conf/cosyvoice.yaml \
|
||||
--train_data data/train.data.list \
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../../../tools
|
||||
Reference in New Issue
Block a user