online feature

Author: lyuxiang.lx
Date: 2026-01-28 15:19:07 +00:00
parent 1822c5c908
commit 66b80dbccb
22 changed files with 133 additions and 116 deletions

View File

@@ -49,6 +49,7 @@ def get_args():
    parser.add_argument('--train_data', required=True, help='train data file')
    parser.add_argument('--cv_data', required=True, help='cv data file')
    parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
+    parser.add_argument('--onnx_path', required=False, help='onnx path, which is required for online feature extraction')
    parser.add_argument('--checkpoint', help='checkpoint model')
    parser.add_argument('--model_dir', required=True, help='save model dir')
    parser.add_argument('--tensorboard_dir',
@@ -96,6 +97,7 @@ def get_args():
@record
def main():
    args = get_args()
+    os.environ['onnx_path'] = args.onnx_path
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    # gan train has some special initialization logic
@@ -104,12 +106,10 @@ def main():
    override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
    if gan is True:
        override_dict.pop('hift')
-    try:
-        with open(args.config, 'r') as f:
-            configs = load_hyperpyyaml(f, overrides={**override_dict, 'qwen_pretrain_path': args.qwen_pretrain_path})
-    except Exception:
-        with open(args.config, 'r') as f:
-            configs = load_hyperpyyaml(f, overrides=override_dict)
+    if args.qwen_pretrain_path is not None:
+        override_dict['qwen_pretrain_path'] = args.qwen_pretrain_path
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f, overrides=override_dict)
    if gan is True:
        configs['train_conf'] = configs['train_conf_gan']
    configs['train_conf'].update(vars(args))
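Note: train.py hands the ONNX path to the dataloader workers through the environment because cosyvoice/utils/onnx.py builds its extractor singletons at import time. A minimal sketch of that hand-off, with a guard for a missing --onnx_path (the guard is an assumption, not part of this commit):

    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('--onnx_path', required=False, help='onnx path for online feature extraction')
    args = parser.parse_args()

    # os.environ values must be strings, so only export the variable when the flag is given;
    # cosyvoice.utils.onnx later reads os.environ.get('onnx_path') at import time.
    if args.onnx_path is not None:
        os.environ['onnx_path'] = args.onnx_path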

View File

@@ -16,12 +16,13 @@ import random
import pyarrow.parquet as pq
from io import BytesIO
+import numpy as np
import torch
import torchaudio
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import pyworld as pw
+from cosyvoice.utils.onnx import embedding_extractor, online_feature
AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
@@ -92,9 +93,9 @@ def filter(data,
            continue
        if len(sample['text_token']) > token_max_length:
            continue
-        if len(sample['speech_token']) == 0:
+        if online_feature is False and len(sample['speech_token']) == 0:
            continue
-        if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
+        if online_feature is False and 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
            continue
        if num_frames != 0:
            if len(sample['text_token']) / num_frames < min_output_input_ratio:
@@ -155,7 +156,7 @@ def truncate(data, truncate_length=24576, mode='train'):
def compute_fbank(data,
                  feat_extractor,
-                  token_mel_ratio=0,
+                  num_frames=-1,
                  mode='train'):
    """ Extract fbank
@@ -170,14 +171,11 @@ def compute_fbank(data,
        assert 'speech' in sample
        assert 'utt' in sample
        assert 'text_token' in sample
-        waveform = sample['speech']
-        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
-        if token_mel_ratio != 0:
-            # trim to align speech_token and speech_feat
-            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
-            feat = feat[:token_mel_ratio * token_len]
-            sample["speech_token"] = sample["speech_token"][:token_len]
-        sample['speech_feat'] = feat
+        # NOTE in cosyvoice2/3, we support online token extraction, so we need to align speech to 25hz first
+        if num_frames != -1:
+            index = int(np.ceil(sample['speech'].shape[1] / num_frames))
+            sample['speech'] = torch.concat([sample['speech'], torch.zeros(1, index * num_frames - sample['speech'].shape[1])], dim=1)
+        sample['speech_feat'] = feat_extractor(sample['speech']).squeeze(dim=0).transpose(0, 1)
        yield sample
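With the num_frames=960 value used in the updated configs, this padding rounds every waveform up to a whole number of 960-sample blocks (assuming 24 kHz audio and 25 Hz speech tokens, 24000 / 25 = 960), so the fbank length stays aligned with the online token count. A small worked example of the same arithmetic:

    import numpy as np
    import torch

    num_frames = 960                 # assumed: 24000 Hz sample rate / 25 Hz token rate
    speech = torch.zeros(1, 25000)   # toy waveform of 25000 samples

    index = int(np.ceil(speech.shape[1] / num_frames))          # 27 blocks
    pad = torch.zeros(1, index * num_frames - speech.shape[1])  # 920 zero samples
    speech = torch.concat([speech, pad], dim=1)
    print(speech.shape)              # torch.Size([1, 25920]) == 27 * 960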
@@ -216,6 +214,10 @@ def parse_embedding(data, normalize, mode='train'):
            Iterable[{key, feat, label}]
    """
    for sample in data:
+        if 'utt_embedding' not in sample and 'spk_embedding' not in sample:
+            speech_16k = torchaudio.transforms.Resample(orig_freq=sample['sample_rate'], new_freq=16000)(sample['speech'])
+            embedding = embedding_extractor.inference(speech_16k)
+            sample['spk_embedding'] = sample['utt_embedding'] = embedding
        sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
        sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
        if normalize:
@@ -256,13 +258,14 @@ def shuffle(data, shuffle_size=10000, mode='train'):
            Iterable[{key, feat, label}]
    """
    buf = []
+    yield_size = int(shuffle_size / 2)
    for sample in data:
        buf.append(sample)
        if len(buf) >= shuffle_size:
            random.shuffle(buf)
-            for x in buf:
+            for x in buf[:yield_size]:
                yield x
-            buf = []
+            buf = buf[yield_size:]
    # The sample left over
    random.shuffle(buf)
    for x in buf:
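The reworked shuffle now flushes only half of the buffer each time it fills, keeping the other half to mix with later samples; this improves cross-chunk randomness at the cost of holding samples in memory longer. A self-contained sketch of the same pattern:

    import random

    def buffered_shuffle(data, shuffle_size=10000):
        buf = []
        yield_size = int(shuffle_size / 2)
        for sample in data:
            buf.append(sample)
            if len(buf) >= shuffle_size:
                random.shuffle(buf)
                # emit half, keep the rest so it can mix with upcoming samples
                for x in buf[:yield_size]:
                    yield x
                buf = buf[yield_size:]
        # flush whatever is left at the end
        random.shuffle(buf)
        for x in buf:
            yield x

    print(len(list(buffered_shuffle(range(25), shuffle_size=10))))  # 25, every sample is still emitted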
@@ -420,10 +423,6 @@ def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False):
                                     padding_value=0)
        batch["pitch_feat"] = pitch_feat
        batch["pitch_feat_len"] = pitch_feat_len
-    else:
-        # only gan train needs speech, delete it to save memory
-        del batch["speech"]
-        del batch["speech_len"]
    if dpo is True:
        reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
        reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)

View File

@@ -19,6 +19,7 @@ import torch.nn as nn
from torch.nn import functional as F
from omegaconf import DictConfig
from cosyvoice.utils.mask import make_pad_mask
+from cosyvoice.utils.onnx import SpeechTokenExtractor
class MaskedDiffWithXvec(torch.nn.Module):

View File

@@ -28,6 +28,7 @@ from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
from cosyvoice.utils.common import th_accuracy
from cosyvoice.utils.file_utils import logging
from cosyvoice.utils.mask import make_pad_mask
+from cosyvoice.utils.onnx import SpeechTokenExtractor
class TransformerLM(torch.nn.Module):

cosyvoice/utils/onnx.py (new file, 59 lines)
View File

@@ -0,0 +1,59 @@
import onnxruntime
import torch, random
from torch import nn
import os
import whisper
import numpy as np
import torchaudio.compliance.kaldi as kaldi
import torch.nn.functional as F


class SpeechTokenExtractor():

    def __init__(self, model_path):
        self.local_rank = int(os.environ.get("LOCAL_RANK", 0))
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1
        self.speech_tokenizer_session = onnxruntime.InferenceSession(model_path,
                                                                     sess_options=option,
                                                                     providers=[("CUDAExecutionProvider", {'device_id': self.local_rank})])

    def inference(self, feat, feat_lengths, device):
        ort_out = self.speech_tokenizer_session.run(None,
                                                    {self.speech_tokenizer_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
                                                     self.speech_tokenizer_session.get_inputs()[1].name: feat_lengths.detach().cpu().numpy()})
        speech_token, speech_token_embedding = ort_out[0], ort_out[1]
        return torch.tensor(speech_token).to(device), (feat_lengths / 2).to(torch.int32).to(device)


class EmbeddingExtractor():

    def __init__(self, model_path):
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1
        self.max_len = 10 * 16000
        self.campplus_session = onnxruntime.InferenceSession(model_path,
                                                             sess_options=option,
                                                             providers=["CPUExecutionProvider"])

    def inference(self, speech):
        if speech.shape[1] > self.max_len:
            start_index = random.randint(0, speech.shape[1] - self.max_len)
            speech = speech[:, start_index: start_index + self.max_len]
        feat = kaldi.fbank(speech,
                           num_mel_bins=80,
                           dither=0,
                           sample_frequency=16000)
        feat = feat - feat.mean(dim=0, keepdim=True)
        embedding = self.campplus_session.run(None,
                                              {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
        return torch.tensor(embedding).to(speech.device)


# singleton mode, only initialized once
onnx_path = os.environ.get('onnx_path')
if onnx_path is not None:
    embedding_extractor, online_feature = EmbeddingExtractor(model_path=os.path.join(onnx_path, 'campplus.onnx')), True
else:
    embedding_extractor, online_feature = None, False
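Because the extractors are created at module import time from the onnx_path environment variable, every module importing cosyvoice.utils.onnx shares the same sessions. A hedged usage sketch of the consumer side, mirroring what parse_embedding does above (the paths and wav file are illustrative only):

    import os
    os.environ['onnx_path'] = '/path/to/pretrained_models'   # hypothetical; must be set before the import

    import torchaudio
    from cosyvoice.utils.onnx import embedding_extractor, online_feature

    if online_feature:
        speech, sr = torchaudio.load('example.wav')           # hypothetical 1 x T waveform
        speech_16k = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(speech)
        spk_embedding = embedding_extractor.inference(speech_16k)   # 1-D campplus embedding tensor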

View File

@@ -1 +0,0 @@
../../../cosyvoice

View File

@@ -54,6 +54,7 @@ if __name__ == "__main__":
    parser.add_argument('--des_dir',
                        type=str)
    parser.add_argument('--instruct',
-                        type=str)
+                        type=str,
+                        default='')
    args = parser.parse_args()
    main()

View File

@@ -27,7 +27,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
+    ../../../tools/extract_embedding.py --dir data/$x \
      --onnx_path $pretrained_model_dir/campplus.onnx
  done
fi
@@ -35,7 +35,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
+    ../../../tools/extract_speech_token.py --dir data/$x \
      --onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx
  done
fi
@@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mkdir -p data/$x/parquet mkdir -p data/$x/parquet
tools/make_parquet_list.py --num_utts_per_parquet 1000 \ ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
--num_processes 10 \ --num_processes 10 \
--src_dir data/$x \ --src_dir data/$x \
--des_dir data/$x/parquet --des_dir data/$x/parquet
@@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm flow hifigan; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
      --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice.yaml \
      --train_data data/train.data.list \

View File

@@ -1 +0,0 @@
../../../tools

View File

@@ -139,7 +139,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: !ref <allowed_special>
filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
+    max_length: 6000
    min_length: 100
    token_max_length: 200
    token_min_length: 1
@@ -158,7 +158,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
-    token_mel_ratio: 2
+    num_frames: 960
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    sample_rate: !ref <sample_rate>
    hop_size: 480

View File

@@ -1 +0,0 @@
../../../cosyvoice

View File

@@ -24,27 +24,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  done
fi
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/campplus.onnx
-  done
-fi
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
-  done
-fi
+# NOTE embedding/token extraction is not necessary now as we support online feature extraction
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
    mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
      --num_processes 10 \
      --src_dir data/$x \
      --des_dir data/$x/parquet
@@ -69,12 +54,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm flow hifigan; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
      --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice2.yaml \
      --train_data data/train.data.list \
      --cv_data data/dev.data.list \
      --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+      --onnx_path $pretrained_model_dir \
      --model $model \
      --checkpoint $pretrained_model_dir/$model.pt \
      --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \

View File

@@ -36,7 +36,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
+    ../../../tools/extract_embedding.py --dir data/$x \
      --onnx_path $pretrained_model_dir/campplus.onnx
  done
fi
@@ -44,7 +44,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
  for x in train-clean-100 train-clean-360 train-other-500 train-clean-100_reject train-clean-360_reject dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
+    ../../../tools/extract_speech_token.py --dir data/$x \
      --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
  done
fi
@@ -53,7 +53,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mkdir -p data/$x/parquet mkdir -p data/$x/parquet
tools/make_parquet_list.py --num_utts_per_parquet 1000 \ ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
--num_processes 10 \ --num_processes 10 \
--dpo \ --dpo \
--src_dir data/$x \ --src_dir data/$x \
@@ -80,11 +80,12 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
      --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice2.yaml \
      --train_data data/train.data.list \
      --cv_data data/dev.data.list \
+      --onnx_path $pretrained_model_dir \
      --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
      --model $model \
      --checkpoint $pretrained_model_dir/$model.pt \

View File

@@ -1 +0,0 @@
../../../tools

View File

@@ -129,7 +129,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: !ref <allowed_special>
filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
+    max_length: 6000
    min_length: 100
    token_max_length: 200
    token_min_length: 1
@@ -148,7 +148,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
-    token_mel_ratio: 2
+    num_frames: 960
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    sample_rate: !ref <sample_rate>
    hop_size: 480

View File

@@ -1 +0,0 @@
../../../cosyvoice

View File

@@ -25,36 +25,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  done
fi
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/campplus.onnx
-  done
-fi
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/speech_tokenizer_v3.onnx
-  done
-fi
+# NOTE embedding/token extraction is not necessary now as we support online feature extraction
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
    mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
      --num_processes 10 \
-      --instruct \
      --src_dir data/$x \
      --des_dir data/$x/parquet
  done
fi
# train llm
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export CUDA_VISIBLE_DEVICES="0"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
job_id=1986
dist_backend="nccl"
@@ -71,12 +55,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm flow hifigan; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
      --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice3.yaml \
      --train_data data/train.data.list \
      --cv_data data/dev.data.list \
      --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+      --onnx_path $pretrained_model_dir \
      --model $model \
      --checkpoint $pretrained_model_dir/$model.pt \
      --model_dir `pwd`/exp/cosyvoice3/$model/$train_engine \

View File

@@ -1 +0,0 @@
../../../tools

View File

@@ -1 +0,0 @@
../../../cosyvoice

View File

@@ -27,7 +27,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
  for x in dev test train; do
-    tools/extract_embedding.py --dir data/$x \
+    ../../../tools/extract_embedding.py --dir data/$x \
      --onnx_path $pretrained_model_dir/campplus.onnx
  done
fi
@@ -35,7 +35,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
  for x in dev test train; do
-    tools/extract_speech_token.py --dir data/$x \
+    ../../../tools/extract_speech_token.py --dir data/$x \
      --onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx
  done
fi
@@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
for x in dev test train; do for x in dev test train; do
mkdir -p data/$x/parquet mkdir -p data/$x/parquet
tools/make_parquet_list.py --num_utts_per_parquet 1000 \ ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
--num_processes 10 \ --num_processes 10 \
--src_dir data/$x \ --src_dir data/$x \
--des_dir data/$x/parquet --des_dir data/$x/parquet
@@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  for model in llm flow hifigan; do
    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
      --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
      --train_engine $train_engine \
      --config conf/cosyvoice.yaml \
      --train_data data/train.data.list \

View File

@@ -1 +0,0 @@
../../../tools

View File

@@ -29,31 +29,24 @@ def job(utt_list, parquet_file, utt2parquet_file, spk2parquet_file):
    for utt in tqdm(utt_list):
        data = open(utt2wav[utt], 'rb').read()
        data_list.append(data)
-    wav_list = [utt2wav[utt] for utt in utt_list]
-    text_list = [utt2text[utt] for utt in utt_list]
-    spk_list = [utt2spk[utt] for utt in utt_list]
-    uttembedding_list = [utt2embedding[utt] for utt in utt_list]
-    spkembedding_list = [spk2embedding[utt2spk[utt]] for utt in utt_list]
-    speech_token_list = [utt2speech_token.get(utt, []) for utt in utt_list]
-    if args.dpo:
-        reject_speech_token_list = [utt2reject_speech_token[utt] for utt in utt_list]
-    if args.instruct:
-        instruct_list = [utt2instruct[utt] for utt in utt_list]
    # save to parquet, utt2parquet_file, spk2parquet_file
    df = pd.DataFrame()
    df['utt'] = utt_list
-    df['wav'] = wav_list
    df['audio_data'] = data_list
-    df['text'] = text_list
-    df['spk'] = spk_list
-    df['utt_embedding'] = uttembedding_list
-    df['spk_embedding'] = spkembedding_list
-    df['speech_token'] = speech_token_list
+    df['wav'] = [utt2wav[utt] for utt in utt_list]
+    df['text'] = [utt2text[utt] for utt in utt_list]
+    df['spk'] = [utt2spk[utt] for utt in utt_list]
+    if utt2embedding is not None:
+        df['utt_embedding'] = [utt2embedding[utt] for utt in utt_list]
+    if spk2embedding is not None:
+        df['spk_embedding'] = [spk2embedding[utt2spk[utt]] for utt in utt_list]
+    if utt2speech_token is not None:
+        df['speech_token'] = [utt2speech_token[utt] for utt in utt_list]
+    if utt2instruct is not None:
+        df['instruct'] = [utt2instruct[utt] for utt in utt_list]
    if args.dpo:
-        df['reject_speech_token'] = reject_speech_token_list
-    if args.instruct:
-        df['instruct'] = instruct_list
+        df['reject_speech_token'] = [utt2reject_speech_token.get(utt, None) for utt in utt_list]
    df.to_parquet(parquet_file)
    with open(utt2parquet_file, 'w') as f:
        json.dump({k: parquet_file for k in utt_list}, f, ensure_ascii=False, indent=2)
@@ -72,10 +65,6 @@ if __name__ == "__main__":
                        type=int,
                        default=1,
                        help='num processes for make parquets')
-    parser.add_argument('--instruct',
-                        action='store_true',
-                        default=False,
-                        help='has instruct file or not')
    parser.add_argument('--src_dir',
                        type=str)
    parser.add_argument('--des_dir',
@@ -86,7 +75,7 @@ if __name__ == "__main__":
                        help='Use Direct Preference Optimization')
    args = parser.parse_args()
-    utt2wav, utt2text, utt2spk, utt2instruct = {}, {}, {}, {}
+    utt2wav, utt2text, utt2spk = {}, {}, {}
    with open('{}/wav.scp'.format(args.src_dir)) as f:
        for l in f:
            l = l.replace('\n', '').split()
@@ -99,16 +88,19 @@ if __name__ == "__main__":
        for l in f:
            l = l.replace('\n', '').split()
            utt2spk[l[0]] = l[1]
-    if args.instruct is True:
+    if os.path.exists('{}/instruct'.format(args.src_dir)):
+        utt2instruct = {}
        with open('{}/instruct'.format(args.src_dir)) as f:
            for l in f:
                l = l.replace('\n', '').split()
                utt2instruct[l[0]] = ' '.join(l[1:])
-    utt2embedding = torch.load('{}/utt2embedding.pt'.format(args.src_dir))
-    spk2embedding = torch.load('{}/spk2embedding.pt'.format(args.src_dir))
-    utt2speech_token = torch.load('{}/utt2speech_token.pt'.format(args.src_dir))
+    else:
+        utt2instruct = None
+    utt2embedding = torch.load('{}/utt2embedding.pt'.format(args.src_dir)) if os.path.exists('{}/utt2embedding.pt'.format(args.src_dir)) else None
+    spk2embedding = torch.load('{}/spk2embedding.pt'.format(args.src_dir)) if os.path.exists('{}/spk2embedding.pt'.format(args.src_dir)) else None
+    utt2speech_token = torch.load('{}/utt2speech_token.pt'.format(args.src_dir)) if os.path.exists('{}/utt2speech_token.pt'.format(args.src_dir)) else None
    if args.dpo:
-        utt2reject_speech_token = torch.load('{}_reject/utt2speech_token.pt'.format(args.src_dir))
+        utt2reject_speech_token = torch.load('{}_reject/utt2speech_token.pt'.format(args.src_dir)) if os.path.exists('{}_reject/utt2speech_token.pt'.format(args.src_dir)) else {}
    utts = list(utt2wav.keys())
    # Using process pool to speedup
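With the embedding and token columns now optional, parquet files written without the offline .pt files simply omit those columns, and the dataset pipeline above falls back to online extraction. A hedged sketch of checking what a generated file actually contains (the file name is illustrative, not the real naming scheme):

    import pandas as pd

    df = pd.read_parquet('data/train-clean-100/parquet/parquet_000000000.parquet')  # hypothetical path
    print('utt' in df.columns, 'audio_data' in df.columns)               # always present
    print('speech_token' in df.columns, 'utt_embedding' in df.columns)   # only present if offline extraction ran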