online feature

2026-02-05 18:09:24 +08:00 · 2026-01-28 15:19:07 +00:00
parent 1822c5c908
commit 66b80dbccb
22 changed files with 133 additions and 116 deletions
--- a/examples/libritts/cosyvoice/cosyvoice
+++ b/examples/libritts/cosyvoice/cosyvoice
@@ -1 +0,0 @@
-../../../cosyvoice
--- a/examples/libritts/cosyvoice/local/prepare_data.py
+++ b/examples/libritts/cosyvoice/local/prepare_data.py
@@ -54,6 +54,7 @@ if __name__ == "__main__":
    parser.add_argument('--des_dir',
                        type=str)
    parser.add_argument('--instruct',
-                        type=str)
+                        type=str,
+                        default='')
    args = parser.parse_args()
    main()
--- a/examples/libritts/cosyvoice/run.sh
+++ b/examples/libritts/cosyvoice/run.sh
@@ -27,7 +27,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
+    ../../../tools/extract_embedding.py --dir data/$x \
      --onnx_path $pretrained_model_dir/campplus.onnx
  done
 fi
@@ -35,7 +35,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
+    ../../../tools/extract_speech_token.py --dir data/$x \
      --onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx
  done
 fi
@@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
    mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
      --num_processes 10 \
      --src_dir data/$x \
      --des_dir data/$x/parquet
@@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm flow hifigan; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice.yaml \
      --train_data data/train.data.list \
--- a/examples/libritts/cosyvoice/tools
+++ b/examples/libritts/cosyvoice/tools
@@ -1 +0,0 @@
-../../../tools
--- a/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
+++ b/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
@@ -139,7 +139,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
+    max_length: 6000
    min_length: 100
    token_max_length: 200
    token_min_length: 1
@@ -158,7 +158,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
-    token_mel_ratio: 2
+    num_frames: 960
 compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    sample_rate: !ref <sample_rate>
    hop_size: 480
--- a/examples/libritts/cosyvoice2/cosyvoice
+++ b/examples/libritts/cosyvoice2/cosyvoice
@@ -1 +0,0 @@
-../../../cosyvoice
--- a/examples/libritts/cosyvoice2/run.sh
+++ b/examples/libritts/cosyvoice2/run.sh
@@ -24,27 +24,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  done
 fi

-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/campplus.onnx
-  done
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
-  done
-fi
-
+# NOTE embedding/token extraction is not necessary now as we support online feature extraction
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
    mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
      --num_processes 10 \
      --src_dir data/$x \
      --des_dir data/$x/parquet
@@ -69,12 +54,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm flow hifigan; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice2.yaml \
      --train_data data/train.data.list \
      --cv_data data/dev.data.list \
      --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+      --onnx_path $pretrained_model_dir \
      --model $model \
      --checkpoint $pretrained_model_dir/$model.pt \
      --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
--- a/examples/libritts/cosyvoice2/run_dpo.sh
+++ b/examples/libritts/cosyvoice2/run_dpo.sh
@@ -36,7 +36,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
+    ../../../tools/extract_embedding.py --dir data/$x \
      --onnx_path $pretrained_model_dir/campplus.onnx
  done
 fi
@@ -44,7 +44,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
  for x in train-clean-100 train-clean-360 train-other-500 train-clean-100_reject train-clean-360_reject dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
+    ../../../tools/extract_speech_token.py --dir data/$x \
      --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
  done
 fi
@@ -53,7 +53,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
    mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
      --num_processes 10 \
      --dpo \
      --src_dir data/$x \
@@ -80,11 +80,12 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice2.yaml \
      --train_data data/train.data.list \
      --cv_data data/dev.data.list \
+      --onnx_path $pretrained_model_dir \
      --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
      --model $model \
      --checkpoint $pretrained_model_dir/$model.pt \
--- a/examples/libritts/cosyvoice2/tools
+++ b/examples/libritts/cosyvoice2/tools
@@ -1 +0,0 @@
-../../../tools
--- a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
+++ b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
@@ -129,7 +129,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
+    max_length: 6000
    min_length: 100
    token_max_length: 200
    token_min_length: 1
@@ -148,7 +148,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
-    token_mel_ratio: 2
+    num_frames: 960
 compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    sample_rate: !ref <sample_rate>
    hop_size: 480
--- a/examples/libritts/cosyvoice3/cosyvoice
+++ b/examples/libritts/cosyvoice3/cosyvoice
@@ -1 +0,0 @@
-../../../cosyvoice
--- a/examples/libritts/cosyvoice3/run.sh
+++ b/examples/libritts/cosyvoice3/run.sh
@@ -25,36 +25,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  done
 fi

-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/campplus.onnx
-  done
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/speech_tokenizer_v3.onnx
-  done
-fi
-
+# NOTE embedding/token extraction is not necessary now as we support online feature extraction
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
    mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
      --num_processes 10 \
-      --instruct \
      --src_dir data/$x \
      --des_dir data/$x/parquet
  done
 fi

 # train llm
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export CUDA_VISIBLE_DEVICES="0"
 num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 job_id=1986
 dist_backend="nccl"
@@ -71,12 +55,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm flow hifigan; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice3.yaml \
      --train_data data/train.data.list \
      --cv_data data/dev.data.list \
      --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+      --onnx_path $pretrained_model_dir \
      --model $model \
      --checkpoint $pretrained_model_dir/$model.pt \
      --model_dir `pwd`/exp/cosyvoice3/$model/$train_engine \
--- a/examples/libritts/cosyvoice3/tools
+++ b/examples/libritts/cosyvoice3/tools
@@ -1 +0,0 @@
-../../../tools