mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-05 18:09:24 +08:00
add cosyvoice code
This commit is contained in:
114
cosyvoice/bin/inference.py
Normal file
114
cosyvoice/bin/inference.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
import torchaudio
|
||||
from hyperpyyaml import load_hyperpyyaml
|
||||
from tqdm import tqdm
|
||||
from cosyvoice.cli.model import CosyVoiceModel
|
||||
|
||||
from cosyvoice.dataset.dataset import Dataset
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(description='inference with your model')
|
||||
parser.add_argument('--config', required=True, help='config file')
|
||||
parser.add_argument('--prompt_data', required=True, help='prompt data file')
|
||||
parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
|
||||
parser.add_argument('--tts_text', required=True, help='tts input file')
|
||||
parser.add_argument('--llm_model', required=True, help='llm model file')
|
||||
parser.add_argument('--flow_model', required=True, help='flow model file')
|
||||
parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
|
||||
parser.add_argument('--gpu',
|
||||
type=int,
|
||||
default=-1,
|
||||
help='gpu id for this rank, -1 for cpu')
|
||||
parser.add_argument('--mode',
|
||||
default='sft',
|
||||
choices=['sft', 'zero_shot'],
|
||||
help='inference mode')
|
||||
parser.add_argument('--result_dir', required=True, help='asr result file')
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = get_args()
|
||||
logging.basicConfig(level=logging.DEBUG,
|
||||
format='%(asctime)s %(levelname)s %(message)s')
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
|
||||
|
||||
# Init cosyvoice models from configs
|
||||
use_cuda = args.gpu >= 0 and torch.cuda.is_available()
|
||||
device = torch.device('cuda' if use_cuda else 'cpu')
|
||||
with open(args.config, 'r') as f:
|
||||
configs = load_hyperpyyaml(f)
|
||||
|
||||
model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
|
||||
model.load(args.llm_model, args.flow_model, args.hifigan_model)
|
||||
|
||||
test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
|
||||
test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
|
||||
|
||||
del configs
|
||||
os.makedirs(args.result_dir, exist_ok=True)
|
||||
fn = os.path.join(args.result_dir, 'wav.scp')
|
||||
f = open(fn, 'w')
|
||||
with torch.no_grad():
|
||||
for batch_idx, batch in tqdm(enumerate(test_data_loader)):
|
||||
utts = batch["utts"]
|
||||
assert len(utts) == 1, "inference mode only support batchsize 1"
|
||||
text = batch["text"]
|
||||
text_token = batch["text_token"].to(device)
|
||||
text_token_len = batch["text_token_len"].to(device)
|
||||
tts_text = batch["tts_text"]
|
||||
tts_index = batch["tts_index"]
|
||||
tts_text_token = batch["tts_text_token"].to(device)
|
||||
tts_text_token_len = batch["tts_text_token_len"].to(device)
|
||||
speech_token = batch["speech_token"].to(device)
|
||||
speech_token_len = batch["speech_token_len"].to(device)
|
||||
speech_feat = batch["speech_feat"].to(device)
|
||||
speech_feat_len = batch["speech_feat_len"].to(device)
|
||||
utt_embedding = batch["utt_embedding"].to(device)
|
||||
spk_embedding = batch["spk_embedding"].to(device)
|
||||
if args.mode == 'sft':
|
||||
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
|
||||
'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
|
||||
else:
|
||||
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
|
||||
'prompt_text': text_token, 'prompt_text_len': text_token_len,
|
||||
'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
|
||||
'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
|
||||
'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
|
||||
'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
|
||||
model_output = model.inference(**model_input)
|
||||
tts_key = '{}_{}'.format(utts[0], tts_index[0])
|
||||
tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
|
||||
torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
|
||||
f.write('{} {}\n'.format(tts_key, tts_fn))
|
||||
f.flush()
|
||||
f.close()
|
||||
logging.info('Result wav.scp saved in {}'.format(fn))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
137
cosyvoice/bin/train.py
Normal file
137
cosyvoice/bin/train.py
Normal file
@@ -0,0 +1,137 @@
|
||||
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import print_function
|
||||
import argparse
|
||||
import datetime
|
||||
import logging
|
||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||
from copy import deepcopy
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import deepspeed
|
||||
|
||||
from hyperpyyaml import load_hyperpyyaml
|
||||
|
||||
from torch.distributed.elastic.multiprocessing.errors import record
|
||||
|
||||
from cosyvoice.utils.executor import Executor
|
||||
from cosyvoice.utils.train_utils import (
|
||||
init_distributed,
|
||||
init_dataset_and_dataloader,
|
||||
init_optimizer_and_scheduler,
|
||||
init_summarywriter, save_model,
|
||||
wrap_cuda_model, check_modify_and_save_config)
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(description='training your network')
|
||||
parser.add_argument('--train_engine',
|
||||
default='torch_ddp',
|
||||
choices=['torch_ddp', 'deepspeed'],
|
||||
help='Engine for paralleled training')
|
||||
parser.add_argument('--model', required=True, help='model which will be trained')
|
||||
parser.add_argument('--config', required=True, help='config file')
|
||||
parser.add_argument('--train_data', required=True, help='train data file')
|
||||
parser.add_argument('--cv_data', required=True, help='cv data file')
|
||||
parser.add_argument('--checkpoint', help='checkpoint model')
|
||||
parser.add_argument('--model_dir', required=True, help='save model dir')
|
||||
parser.add_argument('--tensorboard_dir',
|
||||
default='tensorboard',
|
||||
help='tensorboard log dir')
|
||||
parser.add_argument('--ddp.dist_backend',
|
||||
dest='dist_backend',
|
||||
default='nccl',
|
||||
choices=['nccl', 'gloo'],
|
||||
help='distributed backend')
|
||||
parser.add_argument('--num_workers',
|
||||
default=0,
|
||||
type=int,
|
||||
help='num of subprocess workers for reading')
|
||||
parser.add_argument('--prefetch',
|
||||
default=100,
|
||||
type=int,
|
||||
help='prefetch number')
|
||||
parser.add_argument('--pin_memory',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='Use pinned memory buffers used for reading')
|
||||
parser.add_argument('--deepspeed.save_states',
|
||||
dest='save_states',
|
||||
default='model_only',
|
||||
choices=['model_only', 'model+optimizer'],
|
||||
help='save model/optimizer states')
|
||||
parser.add_argument('--timeout',
|
||||
default=30,
|
||||
type=int,
|
||||
help='timeout (in seconds) of cosyvoice_join. ' +
|
||||
'30s for aishell & 300s for wenetspeech')
|
||||
parser = deepspeed.add_config_arguments(parser)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
@record
|
||||
def main():
|
||||
args = get_args()
|
||||
logging.basicConfig(level=logging.DEBUG,
|
||||
format='%(asctime)s %(levelname)s %(message)s')
|
||||
|
||||
override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
|
||||
with open(args.config, 'r') as f:
|
||||
configs = load_hyperpyyaml(f, overrides=override_dict)
|
||||
configs['train_conf'].update(vars(args))
|
||||
|
||||
# Init env for ddp
|
||||
init_distributed(args)
|
||||
|
||||
# Get dataset & dataloader
|
||||
train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
|
||||
init_dataset_and_dataloader(args, configs)
|
||||
|
||||
# Do some sanity checks and save config to arsg.model_dir
|
||||
configs = check_modify_and_save_config(args, configs)
|
||||
|
||||
# Tensorboard summary
|
||||
writer = init_summarywriter(args)
|
||||
|
||||
# load checkpoint
|
||||
model = configs[args.model]
|
||||
if args.checkpoint is not None:
|
||||
model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
|
||||
|
||||
# Dispatch model from cpu to gpu
|
||||
model = wrap_cuda_model(args, model)
|
||||
|
||||
# Get optimizer & scheduler
|
||||
model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
|
||||
|
||||
# Save init checkpoints
|
||||
info_dict = deepcopy(configs['train_conf'])
|
||||
save_model(model, 'init', info_dict)
|
||||
|
||||
# Get executor
|
||||
executor = Executor()
|
||||
|
||||
# Start training loop
|
||||
for epoch in range(info_dict['max_epoch']):
|
||||
executor.epoch = epoch
|
||||
train_dataset.set_epoch(epoch)
|
||||
dist.barrier()
|
||||
group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
|
||||
executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
|
||||
dist.destroy_process_group(group_join)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user