From 66b80dbccbd60023034806e5d5d42698d885294e Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Wed, 28 Jan 2026 15:19:07 +0000 Subject: [PATCH] online feature --- cosyvoice/bin/train.py | 13 ++-- cosyvoice/dataset/processor.py | 35 ++++++----- cosyvoice/flow/flow.py | 1 + cosyvoice/llm/llm.py | 1 + cosyvoice/utils/onnx.py | 59 +++++++++++++++++++ examples/libritts/cosyvoice/cosyvoice | 1 - .../libritts/cosyvoice/local/prepare_data.py | 3 +- examples/libritts/cosyvoice/run.sh | 8 +-- examples/libritts/cosyvoice/tools | 1 - .../libritts/cosyvoice2/conf/cosyvoice2.yaml | 4 +- examples/libritts/cosyvoice2/cosyvoice | 1 - examples/libritts/cosyvoice2/run.sh | 22 ++----- examples/libritts/cosyvoice2/run_dpo.sh | 9 +-- examples/libritts/cosyvoice2/tools | 1 - .../libritts/cosyvoice3/conf/cosyvoice3.yaml | 4 +- examples/libritts/cosyvoice3/cosyvoice | 1 - examples/libritts/cosyvoice3/run.sh | 25 ++------ examples/libritts/cosyvoice3/tools | 1 - examples/magicdata-read/cosyvoice/cosyvoice | 1 - examples/magicdata-read/cosyvoice/run.sh | 8 +-- examples/magicdata-read/cosyvoice/tools | 1 - tools/make_parquet_list.py | 50 +++++++--------- 22 files changed, 134 insertions(+), 116 deletions(-) create mode 100644 cosyvoice/utils/onnx.py delete mode 120000 examples/libritts/cosyvoice/cosyvoice delete mode 120000 examples/libritts/cosyvoice/tools delete mode 120000 examples/libritts/cosyvoice2/cosyvoice delete mode 120000 examples/libritts/cosyvoice2/tools delete mode 120000 examples/libritts/cosyvoice3/cosyvoice delete mode 120000 examples/libritts/cosyvoice3/tools delete mode 120000 examples/magicdata-read/cosyvoice/cosyvoice delete mode 120000 examples/magicdata-read/cosyvoice/tools diff --git a/cosyvoice/bin/train.py b/cosyvoice/bin/train.py index 3e4016f..d650161 100644 --- a/cosyvoice/bin/train.py +++ b/cosyvoice/bin/train.py @@ -49,6 +49,7 @@ def get_args(): parser.add_argument('--train_data', required=True, help='train data file') parser.add_argument('--cv_data', required=True, help='cv data file') parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path') + parser.add_argument('--onnx_path', required=False, help='onnx model dir, required for online feature extraction') parser.add_argument('--checkpoint', help='checkpoint model') parser.add_argument('--model_dir', required=True, help='save model dir') parser.add_argument('--tensorboard_dir', @@ -96,6 +97,8 @@ def get_args(): @record def main(): args = get_args() + if args.onnx_path is not None: + os.environ['onnx_path'] = args.onnx_path logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') # gan train has some special initialization logic @@ -104,12 +107,10 @@ def main(): override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model} if gan is True: override_dict.pop('hift') - try: - with open(args.config, 'r') as f: - configs = load_hyperpyyaml(f, overrides={**override_dict, 'qwen_pretrain_path': args.qwen_pretrain_path}) - except Exception: - with open(args.config, 'r') as f: - configs = load_hyperpyyaml(f, overrides=override_dict) + if args.qwen_pretrain_path is not None: + override_dict['qwen_pretrain_path'] = args.qwen_pretrain_path + with open(args.config, 'r') as f: + configs = load_hyperpyyaml(f, overrides=override_dict) if gan is True: configs['train_conf'] = configs['train_conf_gan'] configs['train_conf'].update(vars(args))
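The guard around the environment variable matters for two reasons: os.environ values must be strings (so an absent --onnx_path cannot be assigned directly), and cosyvoice/utils/onnx.py resolves its extractor singletons exactly once, at first import, from this variable. A minimal sketch of that ordering constraint, with a hypothetical model directory:

    import os

    # Stand-in for the import-time check at the bottom of cosyvoice/utils/onnx.py:
    # the online_feature flag is resolved once, when the module is first imported.
    def resolve_online_feature():
        return os.environ.get('onnx_path') is not None

    os.environ['onnx_path'] = '/path/to/pretrained_models'  # hypothetical directory
    assert resolve_online_feature()  # True only because the variable was set before the (simulated) import

Since cosyvoice.dataset.processor (and through it cosyvoice.utils.onnx) is only imported when load_hyperpyyaml resolves the config's !name: tags, setting the variable here in main() is early enough, and dataloader worker processes inherit it.

diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py index 78e84f8..cf4b963 100644 --- a/cosyvoice/dataset/processor.py +++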
b/cosyvoice/dataset/processor.py @@ -16,12 +16,13 @@ import random import pyarrow.parquet as pq from io import BytesIO +import numpy as np import torch import torchaudio from torch.nn.utils.rnn import pad_sequence import torch.nn.functional as F import pyworld as pw - +from cosyvoice.utils.onnx import embedding_extractor, online_feature AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'} @@ -92,9 +93,9 @@ def filter(data, continue if len(sample['text_token']) > token_max_length: continue - if len(sample['speech_token']) == 0: + if online_feature is False and len(sample['speech_token']) == 0: continue - if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0: + if online_feature is False and 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0: continue if num_frames != 0: if len(sample['text_token']) / num_frames < min_output_input_ratio: @@ -155,7 +156,7 @@ def truncate(data, truncate_length=24576, mode='train'): def compute_fbank(data, feat_extractor, - token_mel_ratio=0, + num_frames=-1, mode='train'): """ Extract fbank @@ -170,14 +171,11 @@ def compute_fbank(data, assert 'speech' in sample assert 'utt' in sample assert 'text_token' in sample - waveform = sample['speech'] - feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) - if token_mel_ratio != 0: - # trim to align speech_token and speech_feat - token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0])) - feat = feat[:token_mel_ratio * token_len] - sample["speech_token"] = sample["speech_token"][:token_len] - sample['speech_feat'] = feat + # NOTE cosyvoice2/3 support online token extraction, so pad speech to a whole number of num_frames-sample chunks first, keeping the 25 Hz speech tokens aligned with the mel frames + if num_frames != -1: + num_chunks = int(np.ceil(sample['speech'].shape[1] / num_frames)) + sample['speech'] = torch.concat([sample['speech'], torch.zeros(1, num_chunks * num_frames - sample['speech'].shape[1])], dim=1) + sample['speech_feat'] = feat_extractor(sample['speech']).squeeze(dim=0).transpose(0, 1) yield sample @@ -216,6 +214,10 @@ def parse_embedding(data, normalize, mode='train'): Iterable[{key, feat, label}] """ for sample in data: + if 'utt_embedding' not in sample and 'spk_embedding' not in sample: + speech_16k = torchaudio.transforms.Resample(orig_freq=sample['sample_rate'], new_freq=16000)(sample['speech']) + embedding = embedding_extractor.inference(speech_16k).tolist() + sample['spk_embedding'] = sample['utt_embedding'] = embedding sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32) sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32) if normalize: @@ -256,13 +258,14 @@ def shuffle(data, shuffle_size=10000, mode='train'): Iterable[{key, feat, label}] """ buf = [] + yield_size = int(shuffle_size / 2) for sample in data: buf.append(sample) if len(buf) >= shuffle_size: random.shuffle(buf) - for x in buf: + for x in buf[:yield_size]: yield x - buf = [] + buf = buf[yield_size:] # The sample left over random.shuffle(buf) for x in buf: @@ -420,10 +423,6 @@ def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False): padding_value=0) batch["pitch_feat"] = pitch_feat batch["pitch_feat_len"] = pitch_feat_len - else: - # only gan train needs speech, delete it to save memory - del batch["speech"] - del batch["speech_len"] if dpo is True: reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order] reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)
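The num_frames padding in compute_fbank replaces the old token_mel_ratio trimming: assuming CosyVoice2's 24 kHz sample rate, num_frames: 960 pads every utterance to whole 40 ms chunks, so each chunk yields exactly one 25 Hz speech token and, with hop_size 480, exactly two mel frames. A sketch of the arithmetic with a made-up waveform length:

    import numpy as np
    import torch

    speech = torch.randn(1, 25000)  # made-up 24 kHz waveform, ~1.04 s
    num_frames = 960                # 40 ms at 24 kHz = one 25 Hz speech token
    num_chunks = int(np.ceil(speech.shape[1] / num_frames))
    pad = num_chunks * num_frames - speech.shape[1]
    speech = torch.cat([speech, torch.zeros(1, pad)], dim=1)
    assert speech.shape[1] % num_frames == 0  # one token <-> two mel frames (hop 480) per chunk

diff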
--git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index 63651e0..9b71779 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -19,6 +19,7 @@ import torch.nn as nn from torch.nn import functional as F from omegaconf import DictConfig from cosyvoice.utils.mask import make_pad_mask +from cosyvoice.utils.onnx import SpeechTokenExtractor class MaskedDiffWithXvec(torch.nn.Module): diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py index f48b6da..8401a94 100644 --- a/cosyvoice/llm/llm.py +++ b/cosyvoice/llm/llm.py @@ -28,6 +28,7 @@ from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss from cosyvoice.utils.common import th_accuracy from cosyvoice.utils.file_utils import logging from cosyvoice.utils.mask import make_pad_mask +from cosyvoice.utils.onnx import SpeechTokenExtractor class TransformerLM(torch.nn.Module): diff --git a/cosyvoice/utils/onnx.py b/cosyvoice/utils/onnx.py new file mode 100644 index 0000000..df3bca1 --- /dev/null +++ b/cosyvoice/utils/onnx.py @@ -0,0 +1,59 @@ +import onnxruntime +import torch, random +from torch import nn +import os +import whisper +import numpy as np +import torchaudio.compliance.kaldi as kaldi +import torch.nn.functional as F + + +class SpeechTokenExtractor(): + def __init__(self, model_path): + self.local_rank = int(os.environ.get("LOCAL_RANK", 0)) + option = onnxruntime.SessionOptions() + option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + option.intra_op_num_threads = 1 + self.speech_tokenizer_session = onnxruntime.InferenceSession(model_path, + sess_options=option, + providers=[("CUDAExecutionProvider", {'device_id': self.local_rank})]) + + def inference(self, feat, feat_lengths, device): + ort_out = self.speech_tokenizer_session.run(None, + {self.speech_tokenizer_session.get_inputs()[0].name: + feat.detach().cpu().numpy(), + self.speech_tokenizer_session.get_inputs()[1].name: + feat_lengths.detach().cpu().numpy()}) + speech_token, speech_token_embedding = ort_out[0], ort_out[1] + return torch.tensor(speech_token).to(device), (feat_lengths / 2).to(torch.int32).to(device) + + +class EmbeddingExtractor(): + def __init__(self, model_path): + option = onnxruntime.SessionOptions() + option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + option.intra_op_num_threads = 1 + self.max_len = 10 * 16000 + self.campplus_session = onnxruntime.InferenceSession(model_path, + sess_options=option, + providers=["CPUExecutionProvider"]) + + def inference(self, speech): + if speech.shape[1] > self.max_len: + start_index = random.randint(0, speech.shape[1] - self.max_len) + speech = speech[:, start_index: start_index + self.max_len] + feat = kaldi.fbank(speech, + num_mel_bins=80, + dither=0, + sample_frequency=16000) + feat = feat - feat.mean(dim=0, keepdim=True) + embedding = self.campplus_session.run(None, + {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist() + return torch.tensor(embedding).to(speech.device) + +# module-level singletons, initialized once per process at first import +onnx_path = os.environ.get('onnx_path') +if onnx_path is not None: + embedding_extractor, online_feature = EmbeddingExtractor(model_path=os.path.join(onnx_path, 'campplus.onnx')), True +else: + embedding_extractor, online_feature = None, False \ No newline at end of file
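Both extractors mirror the offline tools (tools/extract_embedding.py, tools/extract_speech_token.py) but run inside the training process: SpeechTokenExtractor pins its CUDAExecutionProvider to LOCAL_RANK so each DDP rank tokenizes on its own GPU and reports token lengths as feat_lengths / 2, while EmbeddingExtractor crops a random 10 s window from over-long 16 kHz speech before CAM++ inference. A hypothetical smoke test for the embedding path; the file names are assumptions:

    import torchaudio
    from cosyvoice.utils.onnx import EmbeddingExtractor

    extractor = EmbeddingExtractor(model_path='pretrained_models/campplus.onnx')  # hypothetical dir
    speech, sr = torchaudio.load('sample.wav')                                    # hypothetical mono wav
    speech_16k = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(speech)
    embedding = extractor.inference(speech_16k)
    print(embedding.shape)  # 1-D speaker embedding (192 dims for CAM++)

diff --git a/examples/libritts/cosyvoice/cosyvoice b/examples/libritts/cosyvoice/cosyvoice deleted file mode 120000 index 3903806..0000000 --- a/examples/libritts/cosyvoice/cosyvoice +++ /dev/null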
@@ -1 +0,0 @@ -../../../cosyvoice \ No newline at end of file diff --git a/examples/libritts/cosyvoice/local/prepare_data.py b/examples/libritts/cosyvoice/local/prepare_data.py index ded523d..2f0b850 100644 --- a/examples/libritts/cosyvoice/local/prepare_data.py +++ b/examples/libritts/cosyvoice/local/prepare_data.py @@ -54,6 +54,7 @@ if __name__ == "__main__": parser.add_argument('--des_dir', type=str) parser.add_argument('--instruct', - type=str) + type=str, + default='') args = parser.parse_args() main() diff --git a/examples/libritts/cosyvoice/run.sh b/examples/libritts/cosyvoice/run.sh index b95a294..5449ce6 100644 --- a/examples/libritts/cosyvoice/run.sh +++ b/examples/libritts/cosyvoice/run.sh @@ -27,7 +27,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir" for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - tools/extract_embedding.py --dir data/$x \ + ../../../tools/extract_embedding.py --dir data/$x \ --onnx_path $pretrained_model_dir/campplus.onnx done fi @@ -35,7 +35,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir" for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - tools/extract_speech_token.py --dir data/$x \ + ../../../tools/extract_speech_token.py --dir data/$x \ --onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx done fi @@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do mkdir -p data/$x/parquet - tools/make_parquet_list.py --num_utts_per_parquet 1000 \ + ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \ --num_processes 10 \ --src_dir data/$x \ --des_dir data/$x/parquet @@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then for model in llm flow hifigan; do torchrun --nnodes=1 --nproc_per_node=$num_gpus \ --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \ - cosyvoice/bin/train.py \ + ../../../cosyvoice/bin/train.py \ --train_engine $train_engine \ --config conf/cosyvoice.yaml \ --train_data data/train.data.list \ diff --git a/examples/libritts/cosyvoice/tools b/examples/libritts/cosyvoice/tools deleted file mode 120000 index c92f417..0000000 --- a/examples/libritts/cosyvoice/tools +++ /dev/null @@ -1 +0,0 @@ -../../../tools \ No newline at end of file diff --git a/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml b/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml index df36109..6421d51 100644 --- a/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml +++ b/examples/libritts/cosyvoice2/conf/cosyvoice2.yaml @@ -139,7 +139,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize get_tokenizer: !ref <get_tokenizer> allowed_special: !ref <allowed_special> filter: !name:cosyvoice.dataset.processor.filter - max_length: 40960 + max_length: 6000 min_length: 100 token_max_length: 200 token_min_length: 1 @@ -158,7 +158,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram center: False compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank feat_extractor: !ref <feat_extractor> - token_mel_ratio: 2 + num_frames: 960 compute_f0:
!name:cosyvoice.dataset.processor.compute_f0 sample_rate: !ref <sample_rate> hop_size: 480 diff --git a/examples/libritts/cosyvoice2/cosyvoice b/examples/libritts/cosyvoice2/cosyvoice deleted file mode 120000 index 3903806..0000000 --- a/examples/libritts/cosyvoice2/cosyvoice +++ /dev/null @@ -1 +0,0 @@ -../../../cosyvoice \ No newline at end of file diff --git a/examples/libritts/cosyvoice2/run.sh b/examples/libritts/cosyvoice2/run.sh index 538c71a..10d177b 100644 --- a/examples/libritts/cosyvoice2/run.sh +++ b/examples/libritts/cosyvoice2/run.sh @@ -24,27 +24,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then done fi -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir" - for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - tools/extract_embedding.py --dir data/$x \ - --onnx_path $pretrained_model_dir/campplus.onnx - done -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir" - for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - tools/extract_speech_token.py --dir data/$x \ - --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx - done -fi - +# NOTE embedding/token extraction is not necessary now as we support online feature extraction if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do mkdir -p data/$x/parquet - tools/make_parquet_list.py --num_utts_per_parquet 1000 \ + ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \ --num_processes 10 \ --src_dir data/$x \ --des_dir data/$x/parquet @@ -69,12 +54,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then for model in llm flow hifigan; do torchrun --nnodes=1 --nproc_per_node=$num_gpus \ --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \ - cosyvoice/bin/train.py \ + ../../../cosyvoice/bin/train.py \ --train_engine $train_engine \ --config conf/cosyvoice2.yaml \ --train_data data/train.data.list \ --cv_data data/dev.data.list \ --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \ + --onnx_path $pretrained_model_dir \ --model $model \ --checkpoint $pretrained_model_dir/$model.pt \ --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \ diff --git a/examples/libritts/cosyvoice2/run_dpo.sh b/examples/libritts/cosyvoice2/run_dpo.sh index 1367e45..cceb441 100644 --- a/examples/libritts/cosyvoice2/run_dpo.sh +++ b/examples/libritts/cosyvoice2/run_dpo.sh @@ -36,7 +36,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir" for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - tools/extract_embedding.py --dir data/$x \ + ../../../tools/extract_embedding.py --dir data/$x \ --onnx_path $pretrained_model_dir/campplus.onnx done fi @@ -44,7 +44,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir" for x in train-clean-100 train-clean-360 train-other-500 train-clean-100_reject
train-clean-360_reject dev-clean dev-other test-clean test-other; do - tools/extract_speech_token.py --dir data/$x \ + ../../../tools/extract_speech_token.py --dir data/$x \ --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx done fi @@ -53,7 +53,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do mkdir -p data/$x/parquet - tools/make_parquet_list.py --num_utts_per_parquet 1000 \ + ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \ --num_processes 10 \ --dpo \ --src_dir data/$x \ --des_dir data/$x/parquet @@ -80,11 +80,12 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then for model in llm; do torchrun --nnodes=1 --nproc_per_node=$num_gpus \ --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \ - cosyvoice/bin/train.py \ + ../../../cosyvoice/bin/train.py \ --train_engine $train_engine \ --config conf/cosyvoice2.yaml \ --train_data data/train.data.list \ --cv_data data/dev.data.list \ + --onnx_path $pretrained_model_dir \ --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \ --model $model \ --checkpoint $pretrained_model_dir/$model.pt \ diff --git a/examples/libritts/cosyvoice2/tools b/examples/libritts/cosyvoice2/tools deleted file mode 120000 index c92f417..0000000 --- a/examples/libritts/cosyvoice2/tools +++ /dev/null @@ -1 +0,0 @@ -../../../tools \ No newline at end of file diff --git a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml index 16ab187..d1da95b 100644 --- a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml +++ b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml @@ -129,7 +129,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize get_tokenizer: !ref <get_tokenizer> allowed_special: !ref <allowed_special> filter: !name:cosyvoice.dataset.processor.filter - max_length: 40960 + max_length: 6000 min_length: 100 token_max_length: 200 token_min_length: 1 @@ -148,7 +148,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram center: False compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank feat_extractor: !ref <feat_extractor> - token_mel_ratio: 2 + num_frames: 960 compute_f0: !name:cosyvoice.dataset.processor.compute_f0 sample_rate: !ref <sample_rate> hop_size: 480 diff --git a/examples/libritts/cosyvoice3/cosyvoice b/examples/libritts/cosyvoice3/cosyvoice deleted file mode 120000 index 3903806..0000000 --- a/examples/libritts/cosyvoice3/cosyvoice +++ /dev/null @@ -1 +0,0 @@ -../../../cosyvoice \ No newline at end of file diff --git a/examples/libritts/cosyvoice3/run.sh b/examples/libritts/cosyvoice3/run.sh index 3eed628..4261822 100644 --- a/examples/libritts/cosyvoice3/run.sh +++ b/examples/libritts/cosyvoice3/run.sh @@ -25,36 +25,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then done fi -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir" - for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - tools/extract_embedding.py --dir data/$x \ - --onnx_path $pretrained_model_dir/campplus.onnx - done -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir" - for x in train-clean-100 train-clean-360 train-other-500 dev-clean
dev-other test-clean test-other; do - tools/extract_speech_token.py --dir data/$x \ - --onnx_path $pretrained_model_dir/speech_tokenizer_v3.onnx - done -fi - +# NOTE embedding/token extraction is not necessary now as we support online feature extraction if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do mkdir -p data/$x/parquet - tools/make_parquet_list.py --num_utts_per_parquet 1000 \ + ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \ --num_processes 10 \ - --instruct \ --src_dir data/$x \ --des_dir data/$x/parquet done fi # train llm -export CUDA_VISIBLE_DEVICES="0,1,2,3" +export CUDA_VISIBLE_DEVICES="0" num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') job_id=1986 dist_backend="nccl" @@ -71,12 +55,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then for model in llm flow hifigan; do torchrun --nnodes=1 --nproc_per_node=$num_gpus \ --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \ - cosyvoice/bin/train.py \ + ../../../cosyvoice/bin/train.py \ --train_engine $train_engine \ --config conf/cosyvoice3.yaml \ --train_data data/train.data.list \ --cv_data data/dev.data.list \ --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \ + --onnx_path $pretrained_model_dir \ --model $model \ --checkpoint $pretrained_model_dir/$model.pt \ --model_dir `pwd`/exp/cosyvoice3/$model/$train_engine \ diff --git a/examples/libritts/cosyvoice3/tools b/examples/libritts/cosyvoice3/tools deleted file mode 120000 index c92f417..0000000 --- a/examples/libritts/cosyvoice3/tools +++ /dev/null @@ -1 +0,0 @@ -../../../tools \ No newline at end of file diff --git a/examples/magicdata-read/cosyvoice/cosyvoice b/examples/magicdata-read/cosyvoice/cosyvoice deleted file mode 120000 index 3903806..0000000 --- a/examples/magicdata-read/cosyvoice/cosyvoice +++ /dev/null @@ -1 +0,0 @@ -../../../cosyvoice \ No newline at end of file diff --git a/examples/magicdata-read/cosyvoice/run.sh b/examples/magicdata-read/cosyvoice/run.sh index 4a69b6b..b3e0336 100644 --- a/examples/magicdata-read/cosyvoice/run.sh +++ b/examples/magicdata-read/cosyvoice/run.sh @@ -27,7 +27,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir" for x in dev test train; do - tools/extract_embedding.py --dir data/$x \ + ../../../tools/extract_embedding.py --dir data/$x \ --onnx_path $pretrained_model_dir/campplus.onnx done fi @@ -35,7 +35,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir" for x in dev test train; do - tools/extract_speech_token.py --dir data/$x \ + ../../../tools/extract_speech_token.py --dir data/$x \ --onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx done fi @@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" for x in dev test train; do mkdir -p data/$x/parquet - tools/make_parquet_list.py --num_utts_per_parquet 1000 \ + ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \ --num_processes 10 \ --src_dir data/$x 
\ --des_dir data/$x/parquet @@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then for model in llm flow hifigan; do torchrun --nnodes=1 --nproc_per_node=$num_gpus \ --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \ - cosyvoice/bin/train.py \ + ../../../cosyvoice/bin/train.py \ --train_engine $train_engine \ --config conf/cosyvoice.yaml \ --train_data data/train.data.list \ diff --git a/examples/magicdata-read/cosyvoice/tools b/examples/magicdata-read/cosyvoice/tools deleted file mode 120000 index c92f417..0000000 --- a/examples/magicdata-read/cosyvoice/tools +++ /dev/null @@ -1 +0,0 @@ -../../../tools \ No newline at end of file diff --git a/tools/make_parquet_list.py b/tools/make_parquet_list.py index 2f35ae6..485d2ab 100755 --- a/tools/make_parquet_list.py +++ b/tools/make_parquet_list.py @@ -29,31 +29,24 @@ def job(utt_list, parquet_file, utt2parquet_file, spk2parquet_file): for utt in tqdm(utt_list): data = open(utt2wav[utt], 'rb').read() data_list.append(data) - wav_list = [utt2wav[utt] for utt in utt_list] - text_list = [utt2text[utt] for utt in utt_list] - spk_list = [utt2spk[utt] for utt in utt_list] - uttembedding_list = [utt2embedding[utt] for utt in utt_list] - spkembedding_list = [spk2embedding[utt2spk[utt]] for utt in utt_list] - speech_token_list = [utt2speech_token.get(utt, []) for utt in utt_list] - if args.dpo: - reject_speech_token_list = [utt2reject_speech_token[utt] for utt in utt_list] - if args.instruct: - instruct_list = [utt2instruct[utt] for utt in utt_list] # save to parquet, utt2parquet_file, spk2parquet_file df = pd.DataFrame() df['utt'] = utt_list - df['wav'] = wav_list df['audio_data'] = data_list - df['text'] = text_list - df['spk'] = spk_list - df['utt_embedding'] = uttembedding_list - df['spk_embedding'] = spkembedding_list - df['speech_token'] = speech_token_list + df['wav'] = [utt2wav[utt] for utt in utt_list] + df['text'] = [utt2text[utt] for utt in utt_list] + df['spk'] = [utt2spk[utt] for utt in utt_list] + if utt2embedding is not None: + df['utt_embedding'] = [utt2embedding[utt] for utt in utt_list] + if spk2embedding is not None: + df['spk_embedding'] = [spk2embedding[utt2spk[utt]] for utt in utt_list] + if utt2speech_token is not None: + df['speech_token'] = [utt2speech_token[utt] for utt in utt_list] + if utt2instruct is not None: + df['instruct'] = [utt2instruct[utt] for utt in utt_list] if args.dpo: - df['reject_speech_token'] = reject_speech_token_list - if args.instruct: - df['instruct'] = instruct_list + df['reject_speech_token'] = [utt2reject_speech_token.get(utt, None) for utt in utt_list] df.to_parquet(parquet_file) with open(utt2parquet_file, 'w') as f: json.dump({k: parquet_file for k in utt_list}, f, ensure_ascii=False, indent=2) @@ -72,10 +65,6 @@ if __name__ == "__main__": type=int, default=1, help='num processes for make parquets') - parser.add_argument('--instruct', - action='store_true', - default=False, - help='has instruct file or not') parser.add_argument('--src_dir', type=str) parser.add_argument('--des_dir', @@ -86,7 +75,7 @@ help='Use Direct Preference Optimization') args = parser.parse_args() - utt2wav, utt2text, utt2spk, utt2instruct = {}, {}, {}, {} + utt2wav, utt2text, utt2spk = {}, {}, {} with open('{}/wav.scp'.format(args.src_dir)) as f: for l in f: l = l.replace('\n', '').split() utt2wav[l[0]] = l[1] @@ -99,16 +88,19 @@ for l in f: l = l.replace('\n', '').split() utt2spk[l[0]] = l[1] - if args.instruct is True: + if
os.path.exists('{}/instruct'.format(args.src_dir)): + utt2instruct = {} with open('{}/instruct'.format(args.src_dir)) as f: for l in f: l = l.replace('\n', '').split() utt2instruct[l[0]] = ' '.join(l[1:]) - utt2embedding = torch.load('{}/utt2embedding.pt'.format(args.src_dir)) - spk2embedding = torch.load('{}/spk2embedding.pt'.format(args.src_dir)) - utt2speech_token = torch.load('{}/utt2speech_token.pt'.format(args.src_dir)) + else: + utt2instruct = None + utt2embedding = torch.load('{}/utt2embedding.pt'.format(args.src_dir)) if os.path.exists('{}/utt2embedding.pt'.format(args.src_dir)) else None + spk2embedding = torch.load('{}/spk2embedding.pt'.format(args.src_dir)) if os.path.exists('{}/spk2embedding.pt'.format(args.src_dir)) else None + utt2speech_token = torch.load('{}/utt2speech_token.pt'.format(args.src_dir)) if os.path.exists('{}/utt2speech_token.pt'.format(args.src_dir)) else None if args.dpo: - utt2reject_speech_token = torch.load('{}_reject/utt2speech_token.pt'.format(args.src_dir)) + utt2reject_speech_token = torch.load('{}_reject/utt2speech_token.pt'.format(args.src_dir)) if os.path.exists('{}_reject/utt2speech_token.pt'.format(args.src_dir)) else {} utts = list(utt2wav.keys()) # Using process pool to speedup
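With these changes every precomputed column in the parquet files becomes optional: embeddings, speech tokens, and instructs are written only when the corresponding source files exist, and the online extractors fill in whatever is missing at training time. A hypothetical readback check (the parquet path is made up):

    import pandas as pd

    df = pd.read_parquet('data/train-clean-100/parquet/some_file.parquet')  # hypothetical path
    assert {'utt', 'audio_data', 'wav', 'text', 'spk'}.issubset(df.columns)
    optional = {'utt_embedding', 'spk_embedding', 'speech_token', 'instruct', 'reject_speech_token'}
    print('precomputed columns present:', optional & set(df.columns))

Downstream readers should treat the optional columns accordingly, as the updated filter() and parse_embedding() in cosyvoice/dataset/processor.py already do.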