mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-04 09:29:25 +08:00)

Commit: online feature
@@ -49,6 +49,7 @@ def get_args():
     parser.add_argument('--train_data', required=True, help='train data file')
     parser.add_argument('--cv_data', required=True, help='cv data file')
     parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
+    parser.add_argument('--onnx_path', required=False, help='onnx path, which is required for online feature extraction')
     parser.add_argument('--checkpoint', help='checkpoint model')
     parser.add_argument('--model_dir', required=True, help='save model dir')
     parser.add_argument('--tensorboard_dir',
@@ -96,6 +97,7 @@ def get_args():
 @record
 def main():
     args = get_args()
+    os.environ['onnx_path'] = args.onnx_path
     logging.basicConfig(level=logging.DEBUG,
                         format='%(asctime)s %(levelname)s %(message)s')
     # gan train has some special initialization logic
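Note: `--onnx_path` is declared with `required=False`, and `os.environ` only accepts string values, so the unconditional assignment above raises a TypeError whenever the flag is omitted. A guard along these lines would avoid that (a sketch, not part of the commit):

    if args.onnx_path is not None:
        os.environ['onnx_path'] = args.onnx_path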
@@ -104,12 +106,10 @@ def main():
     override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
     if gan is True:
         override_dict.pop('hift')
-    try:
-        with open(args.config, 'r') as f:
-            configs = load_hyperpyyaml(f, overrides={**override_dict, 'qwen_pretrain_path': args.qwen_pretrain_path})
-    except Exception:
-        with open(args.config, 'r') as f:
-            configs = load_hyperpyyaml(f, overrides=override_dict)
+    if args.qwen_pretrain_path is not None:
+        override_dict['qwen_pretrain_path'] = args.qwen_pretrain_path
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f, overrides=override_dict)
     if gan is True:
         configs['train_conf'] = configs['train_conf_gan']
     configs['train_conf'].update(vars(args))
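Note: this hunk replaces the old try/except double-load by merging `qwen_pretrain_path` into `override_dict` only when the flag is given. hyperpyyaml by default rejects an override key that is absent from the YAML document, which is presumably what the removed exception path worked around. A minimal sketch of the override behavior (illustrative values, not from the commit):

    from io import StringIO
    from hyperpyyaml import load_hyperpyyaml

    yaml_doc = 'qwen_pretrain_path: /default/path\nllm: null\n'
    # Keys in `overrides` replace the values declared in the YAML document.
    configs = load_hyperpyyaml(StringIO(yaml_doc), overrides={'qwen_pretrain_path': '/my/path'})
    print(configs['qwen_pretrain_path'])  # -> /my/path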
@@ -16,12 +16,13 @@ import random

 import pyarrow.parquet as pq
 from io import BytesIO
+import numpy as np
 import torch
 import torchaudio
 from torch.nn.utils.rnn import pad_sequence
 import torch.nn.functional as F
 import pyworld as pw
+from cosyvoice.utils.onnx import embedding_extractor, online_feature


 AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}

@@ -92,9 +93,9 @@ def filter(data,
             continue
         if len(sample['text_token']) > token_max_length:
             continue
-        if len(sample['speech_token']) == 0:
+        if online_feature is False and len(sample['speech_token']) == 0:
             continue
-        if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
+        if online_feature is False and 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
             continue
         if num_frames != 0:
             if len(sample['text_token']) / num_frames < min_output_input_ratio:
@@ -155,7 +156,7 @@ def truncate(data, truncate_length=24576, mode='train'):

 def compute_fbank(data,
                   feat_extractor,
-                  token_mel_ratio=0,
+                  num_frames=-1,
                   mode='train'):
     """ Extract fbank

@@ -170,14 +171,11 @@ def compute_fbank(data,
         assert 'speech' in sample
         assert 'utt' in sample
         assert 'text_token' in sample
-        waveform = sample['speech']
-        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
-        if token_mel_ratio != 0:
-            # trim to align speech_token and speech_feat
-            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
-            feat = feat[:token_mel_ratio * token_len]
-            sample["speech_token"] = sample["speech_token"][:token_len]
-        sample['speech_feat'] = feat
+        # NOTE in cosyvoice2/3, we support online token extraction, so we need to align speech to 25hz first
+        if num_frames != -1:
+            index = int(np.ceil(sample['speech'].shape[1] / num_frames))
+            sample['speech'] = torch.concat([sample['speech'], torch.zeros(1, index * num_frames - sample['speech'].shape[1])], dim=1)
+        sample['speech_feat'] = feat_extractor(sample['speech']).squeeze(dim=0).transpose(0, 1)
         yield sample

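Note: compute_fbank now pads the waveform up to a multiple of num_frames before mel extraction, instead of trimming feats against a precomputed token sequence; with online extraction there is no speech_token to trim against yet. Under the recipe values in the yaml hunks below (num_frames: 960, hop_size: 480, 24 kHz audio), 960 samples span two mel frames and one 25 Hz speech token, so the padded mels line up roughly 2:1 with tokens extracted later. A quick check of that arithmetic (values assumed from the yaml hunks):

    import math

    sample_rate, hop_size, num_frames = 24000, 480, 960
    assert sample_rate / hop_size == 50    # mel frame rate, Hz
    assert sample_rate / num_frames == 25  # speech token rate, Hz

    speech_len = 25037                     # arbitrary example length in samples
    padded = math.ceil(speech_len / num_frames) * num_frames
    print(padded // hop_size, padded // num_frames)  # 54 mel frames : 27 tokens, i.e. 2:1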
@@ -216,6 +214,10 @@ def parse_embedding(data, normalize, mode='train'):
         Iterable[{key, feat, label}]
     """
     for sample in data:
+        if 'utt_embedding' not in sample and 'spk_embedding' not in sample:
+            speech_16k = torchaudio.transforms.Resample(orig_freq=sample['sample_rate'], new_freq=16000)(sample['speech'])
+            embedding = embedding_extractor.inference(speech_16k)
+            sample['spk_embedding'] = sample['utt_embedding'] = embedding
         sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
         sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
         if normalize:
@@ -256,13 +258,14 @@ def shuffle(data, shuffle_size=10000, mode='train'):
         Iterable[{key, feat, label}]
     """
     buf = []
+    yield_size = int(shuffle_size / 2)
     for sample in data:
         buf.append(sample)
         if len(buf) >= shuffle_size:
             random.shuffle(buf)
-            for x in buf:
+            for x in buf[:yield_size]:
                 yield x
-            buf = []
+            buf = buf[yield_size:]
     # The sample left over
     random.shuffle(buf)
     for x in buf:
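Note: flushing only half of a full buffer and retaining the rest lets samples that arrive later mix with earlier ones, instead of the buffer being shuffled and emptied wholesale at each flush. A standalone toy version of the new behavior (not the processor code itself):

    import random

    def half_flush_shuffle(data, shuffle_size=8):
        buf, yield_size = [], shuffle_size // 2
        for sample in data:
            buf.append(sample)
            if len(buf) >= shuffle_size:
                random.shuffle(buf)
                yield from buf[:yield_size]  # emit half of the buffer...
                buf = buf[yield_size:]       # ...keep half to mix with later samples
        random.shuffle(buf)                  # flush the leftover at end of stream
        yield from buf

    print(list(half_flush_shuffle(range(10))))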
@@ -420,10 +423,6 @@ def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False):
                                   padding_value=0)
         batch["pitch_feat"] = pitch_feat
         batch["pitch_feat_len"] = pitch_feat_len
-    else:
-        # only gan train needs speech, delete it to save memory
-        del batch["speech"]
-        del batch["speech_len"]
     if dpo is True:
         reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
         reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)
@@ -19,6 +19,7 @@ import torch.nn as nn
 from torch.nn import functional as F
 from omegaconf import DictConfig
 from cosyvoice.utils.mask import make_pad_mask
+from cosyvoice.utils.onnx import SpeechTokenExtractor


 class MaskedDiffWithXvec(torch.nn.Module):
@@ -28,6 +28,7 @@ from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
 from cosyvoice.utils.common import th_accuracy
 from cosyvoice.utils.file_utils import logging
 from cosyvoice.utils.mask import make_pad_mask
+from cosyvoice.utils.onnx import SpeechTokenExtractor


 class TransformerLM(torch.nn.Module):

cosyvoice/utils/onnx.py (new file, 59 lines)
@@ -0,0 +1,59 @@
+import onnxruntime
+import torch, random
+from torch import nn
+import os
+import whisper
+import numpy as np
+import torchaudio.compliance.kaldi as kaldi
+import torch.nn.functional as F
+
+
+class SpeechTokenExtractor():
+    def __init__(self, model_path):
+        self.local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        self.speech_tokenizer_session = onnxruntime.InferenceSession(model_path,
+                                                                     sess_options=option,
+                                                                     providers=[("CUDAExecutionProvider", {'device_id': self.local_rank})])
+
+    def inference(self, feat, feat_lengths, device):
+        ort_out = self.speech_tokenizer_session.run(None,
+                                                    {self.speech_tokenizer_session.get_inputs()[0].name:
+                                                     feat.detach().cpu().numpy(),
+                                                     self.speech_tokenizer_session.get_inputs()[1].name:
+                                                     feat_lengths.detach().cpu().numpy()})
+        speech_token, speech_token_embedding = ort_out[0], ort_out[1]
+        return torch.tensor(speech_token).to(device), (feat_lengths / 2).to(torch.int32).to(device)
+
+
+class EmbeddingExtractor():
+    def __init__(self, model_path):
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        self.max_len = 10 * 16000
+        self.campplus_session = onnxruntime.InferenceSession(model_path,
+                                                             sess_options=option,
+                                                             providers=["CPUExecutionProvider"])
+
+    def inference(self, speech):
+        if speech.shape[1] > self.max_len:
+            start_index = random.randint(0, speech.shape[1] - self.max_len)
+            speech = speech[:, start_index: start_index + self.max_len]
+        feat = kaldi.fbank(speech,
+                           num_mel_bins=80,
+                           dither=0,
+                           sample_frequency=16000)
+        feat = feat - feat.mean(dim=0, keepdim=True)
+        embedding = self.campplus_session.run(None,
+                                              {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
+        return torch.tensor(embedding).to(speech.device)
+
+
+# singleton mode, only initialized once
+onnx_path = os.environ.get('onnx_path')
+if onnx_path is not None:
+    embedding_extractor, online_feature = EmbeddingExtractor(model_path=os.path.join(onnx_path, 'campplus.onnx')), True
+else:
+    embedding_extractor, online_feature = None, False
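Note: because the module constructs its extractor at import time from the onnx_path environment variable (the "singleton" comment above), the variable must be exported before anything imports cosyvoice.utils.onnx; train.py sets it right after parsing args. A usage sketch (the directory is illustrative; it must contain campplus.onnx):

    import os

    os.environ['onnx_path'] = '/path/to/pretrained_models'  # set BEFORE the import below
    from cosyvoice.utils.onnx import embedding_extractor, online_feature

    assert online_feature is True and embedding_extractor is not None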
@@ -1 +0,0 @@
-../../../cosyvoice
@@ -54,6 +54,7 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument('--des_dir',
|
parser.add_argument('--des_dir',
|
||||||
type=str)
|
type=str)
|
||||||
parser.add_argument('--instruct',
|
parser.add_argument('--instruct',
|
||||||
type=str)
|
type=str,
|
||||||
|
default='')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
+    ../../../tools/extract_embedding.py --dir data/$x \
       --onnx_path $pretrained_model_dir/campplus.onnx
   done
 fi
@@ -35,7 +35,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
+    ../../../tools/extract_speech_token.py --dir data/$x \
       --onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx
   done
 fi
@@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
     mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
       --num_processes 10 \
       --src_dir data/$x \
       --des_dir data/$x/parquet
@@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   for model in llm flow hifigan; do
     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
       --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
       --train_engine $train_engine \
       --config conf/cosyvoice.yaml \
       --train_data data/train.data.list \
@@ -1 +0,0 @@
-../../../tools
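Note: the `@@ -1 +0,0 @@` hunks like the one above delete the per-example `cosyvoice` and `tools` symlinks (a symlink diffs as a one-line file holding its target path); the run.sh hunks compensate by invoking the scripts through explicit `../../../tools/...` and `../../../cosyvoice/bin/...` relative paths instead.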
@@ -139,7 +139,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
     get_tokenizer: !ref <get_tokenizer>
     allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
+    max_length: 6000
     min_length: 100
     token_max_length: 200
     token_min_length: 1
@@ -158,7 +158,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
-    token_mel_ratio: 2
+    num_frames: 960
 compute_f0: !name:cosyvoice.dataset.processor.compute_f0
     sample_rate: !ref <sample_rate>
     hop_size: 480
@@ -1 +0,0 @@
-../../../cosyvoice
@@ -24,27 +24,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
   done
 fi

-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/campplus.onnx
-  done
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
-  done
-fi
-
+# NOTE embedding/token extraction is not necessary now as we support online feature extraction
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
     mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
       --num_processes 10 \
       --src_dir data/$x \
       --des_dir data/$x/parquet
@@ -69,12 +54,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   for model in llm flow hifigan; do
     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
       --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
       --train_engine $train_engine \
       --config conf/cosyvoice2.yaml \
       --train_data data/train.data.list \
       --cv_data data/dev.data.list \
       --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+      --onnx_path $pretrained_model_dir \
       --model $model \
       --checkpoint $pretrained_model_dir/$model.pt \
       --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
@@ -36,7 +36,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
+    ../../../tools/extract_embedding.py --dir data/$x \
       --onnx_path $pretrained_model_dir/campplus.onnx
   done
 fi
@@ -44,7 +44,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
   for x in train-clean-100 train-clean-360 train-other-500 train-clean-100_reject train-clean-360_reject dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
+    ../../../tools/extract_speech_token.py --dir data/$x \
       --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
   done
 fi
@@ -53,7 +53,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
     mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
       --num_processes 10 \
       --dpo \
       --src_dir data/$x \
@@ -80,11 +80,12 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   for model in llm; do
     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
      --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
       --train_engine $train_engine \
       --config conf/cosyvoice2.yaml \
       --train_data data/train.data.list \
       --cv_data data/dev.data.list \
+      --onnx_path $pretrained_model_dir \
       --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
       --model $model \
       --checkpoint $pretrained_model_dir/$model.pt \
@@ -1 +0,0 @@
-../../../tools
@@ -129,7 +129,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
     get_tokenizer: !ref <get_tokenizer>
     allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
-    max_length: 40960
+    max_length: 6000
     min_length: 100
     token_max_length: 200
     token_min_length: 1
@@ -148,7 +148,7 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
-    token_mel_ratio: 2
+    num_frames: 960
 compute_f0: !name:cosyvoice.dataset.processor.compute_f0
     sample_rate: !ref <sample_rate>
     hop_size: 480
@@ -1 +0,0 @@
-../../../cosyvoice
@@ -25,36 +25,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
   done
 fi

-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_embedding.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/campplus.onnx
-  done
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
-  for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-    tools/extract_speech_token.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/speech_tokenizer_v3.onnx
-  done
-fi
-
+# NOTE embedding/token extraction is not necessary now as we support online feature extraction
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
     mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
       --num_processes 10 \
-      --instruct \
       --src_dir data/$x \
       --des_dir data/$x/parquet
   done
 fi

 # train llm
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export CUDA_VISIBLE_DEVICES="0"
 num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 job_id=1986
 dist_backend="nccl"
@@ -71,12 +55,13 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   for model in llm flow hifigan; do
     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
       --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
       --train_engine $train_engine \
       --config conf/cosyvoice3.yaml \
       --train_data data/train.data.list \
       --cv_data data/dev.data.list \
       --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+      --onnx_path $pretrained_model_dir \
       --model $model \
       --checkpoint $pretrained_model_dir/$model.pt \
       --model_dir `pwd`/exp/cosyvoice3/$model/$train_engine \
@@ -1 +0,0 @@
-../../../tools
@@ -1 +0,0 @@
-../../../cosyvoice
@@ -27,7 +27,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
   for x in dev test train; do
-    tools/extract_embedding.py --dir data/$x \
+    ../../../tools/extract_embedding.py --dir data/$x \
       --onnx_path $pretrained_model_dir/campplus.onnx
   done
 fi
@@ -35,7 +35,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
   for x in dev test train; do
-    tools/extract_speech_token.py --dir data/$x \
+    ../../../tools/extract_speech_token.py --dir data/$x \
       --onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx
   done
 fi
@@ -44,7 +44,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
   for x in dev test train; do
     mkdir -p data/$x/parquet
-    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+    ../../../tools/make_parquet_list.py --num_utts_per_parquet 1000 \
       --num_processes 10 \
       --src_dir data/$x \
       --des_dir data/$x/parquet
@@ -69,7 +69,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   for model in llm flow hifigan; do
     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
       --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
-      cosyvoice/bin/train.py \
+      ../../../cosyvoice/bin/train.py \
       --train_engine $train_engine \
       --config conf/cosyvoice.yaml \
       --train_data data/train.data.list \
@@ -1 +0,0 @@
-../../../tools
@@ -29,31 +29,24 @@ def job(utt_list, parquet_file, utt2parquet_file, spk2parquet_file):
     for utt in tqdm(utt_list):
         data = open(utt2wav[utt], 'rb').read()
         data_list.append(data)
-    wav_list = [utt2wav[utt] for utt in utt_list]
-    text_list = [utt2text[utt] for utt in utt_list]
-    spk_list = [utt2spk[utt] for utt in utt_list]
-    uttembedding_list = [utt2embedding[utt] for utt in utt_list]
-    spkembedding_list = [spk2embedding[utt2spk[utt]] for utt in utt_list]
-    speech_token_list = [utt2speech_token.get(utt, []) for utt in utt_list]
-    if args.dpo:
-        reject_speech_token_list = [utt2reject_speech_token[utt] for utt in utt_list]
-    if args.instruct:
-        instruct_list = [utt2instruct[utt] for utt in utt_list]

     # save to parquet, along with utt2parquet_file and spk2parquet_file
     df = pd.DataFrame()
     df['utt'] = utt_list
-    df['wav'] = wav_list
     df['audio_data'] = data_list
-    df['text'] = text_list
-    df['spk'] = spk_list
-    df['utt_embedding'] = uttembedding_list
-    df['spk_embedding'] = spkembedding_list
-    df['speech_token'] = speech_token_list
+    df['wav'] = [utt2wav[utt] for utt in utt_list]
+    df['text'] = [utt2text[utt] for utt in utt_list]
+    df['spk'] = [utt2spk[utt] for utt in utt_list]
+    if utt2embedding is not None:
+        df['utt_embedding'] = [utt2embedding[utt] for utt in utt_list]
+    if spk2embedding is not None:
+        df['spk_embedding'] = [spk2embedding[utt2spk[utt]] for utt in utt_list]
+    if utt2speech_token is not None:
+        df['speech_token'] = [utt2speech_token[utt] for utt in utt_list]
+    if utt2instruct is not None:
+        df['instruct'] = [utt2instruct[utt] for utt in utt_list]
     if args.dpo:
-        df['reject_speech_token'] = reject_speech_token_list
-    if args.instruct:
-        df['instruct'] = instruct_list
+        df['reject_speech_token'] = [utt2reject_speech_token.get(utt, None) for utt in utt_list]
     df.to_parquet(parquet_file)
     with open(utt2parquet_file, 'w') as f:
         json.dump({k: parquet_file for k in utt_list}, f, ensure_ascii=False, indent=2)
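Note: with the column-by-column guards above, embedding and token columns are written only when the corresponding .pt files existed at packing time, so parquets produced for online training simply omit them, which the online_feature checks in processor.filter and parse_embedding accommodate. A reader can tell which mode a shard was packed for (the file name below is hypothetical):

    import pandas as pd

    df = pd.read_parquet('data/train-clean-100/parquet/example.parquet')
    offline = {'utt_embedding', 'spk_embedding', 'speech_token'} <= set(df.columns)
    print('offline features present:', offline)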
@@ -72,10 +65,6 @@ if __name__ == "__main__":
|
|||||||
type=int,
|
type=int,
|
||||||
default=1,
|
default=1,
|
||||||
help='num processes for make parquets')
|
help='num processes for make parquets')
|
||||||
parser.add_argument('--instruct',
|
|
||||||
action='store_true',
|
|
||||||
default=False,
|
|
||||||
help='has instruct file or not')
|
|
||||||
parser.add_argument('--src_dir',
|
parser.add_argument('--src_dir',
|
||||||
type=str)
|
type=str)
|
||||||
parser.add_argument('--des_dir',
|
parser.add_argument('--des_dir',
|
||||||
@@ -86,7 +75,7 @@ if __name__ == "__main__":
|
|||||||
help='Use Direct Preference Optimization')
|
help='Use Direct Preference Optimization')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
utt2wav, utt2text, utt2spk, utt2instruct = {}, {}, {}, {}
|
utt2wav, utt2text, utt2spk = {}, {}, {}
|
||||||
with open('{}/wav.scp'.format(args.src_dir)) as f:
|
with open('{}/wav.scp'.format(args.src_dir)) as f:
|
||||||
for l in f:
|
for l in f:
|
||||||
l = l.replace('\n', '').split()
|
l = l.replace('\n', '').split()
|
||||||
@@ -99,16 +88,19 @@ if __name__ == "__main__":
|
|||||||
for l in f:
|
for l in f:
|
||||||
l = l.replace('\n', '').split()
|
l = l.replace('\n', '').split()
|
||||||
utt2spk[l[0]] = l[1]
|
utt2spk[l[0]] = l[1]
|
||||||
if args.instruct is True:
|
if os.path.exists('{}/instruct'.format(args.src_dir)):
|
||||||
|
utt2instruct = {}
|
||||||
with open('{}/instruct'.format(args.src_dir)) as f:
|
with open('{}/instruct'.format(args.src_dir)) as f:
|
||||||
for l in f:
|
for l in f:
|
||||||
l = l.replace('\n', '').split()
|
l = l.replace('\n', '').split()
|
||||||
utt2instruct[l[0]] = ' '.join(l[1:])
|
utt2instruct[l[0]] = ' '.join(l[1:])
|
||||||
utt2embedding = torch.load('{}/utt2embedding.pt'.format(args.src_dir))
|
else:
|
||||||
spk2embedding = torch.load('{}/spk2embedding.pt'.format(args.src_dir))
|
utt2instruct = None
|
||||||
utt2speech_token = torch.load('{}/utt2speech_token.pt'.format(args.src_dir))
|
utt2embedding = torch.load('{}/utt2embedding.pt'.format(args.src_dir)) if os.path.exists('{}/utt2embedding.pt'.format(args.src_dir)) else None
|
||||||
|
spk2embedding = torch.load('{}/spk2embedding.pt'.format(args.src_dir)) if os.path.exists('{}/spk2embedding.pt'.format(args.src_dir)) else None
|
||||||
|
utt2speech_token = torch.load('{}/utt2speech_token.pt'.format(args.src_dir)) if os.path.exists('{}/utt2speech_token.pt'.format(args.src_dir)) else None
|
||||||
if args.dpo:
|
if args.dpo:
|
||||||
utt2reject_speech_token = torch.load('{}_reject/utt2speech_token.pt'.format(args.src_dir))
|
utt2reject_speech_token = torch.load('{}_reject/utt2speech_token.pt'.format(args.src_dir)) if os.path.exists('{}_reject/utt2speech_token.pt'.format(args.src_dir)) else {}
|
||||||
utts = list(utt2wav.keys())
|
utts = list(utt2wav.keys())
|
||||||
|
|
||||||
# Using process pool to speedup
|
# Using process pool to speedup
|
||||||
|
|||||||