add cosyvoice code

2026-02-05 18:09:24 +08:00 · 2024-07-04 21:15:12 +08:00
parent 06984ac149
commit 076829ab84
64 changed files with 8428 additions and 18 deletions
--- a/cosyvoice/bin/inference.py
+++ b/cosyvoice/bin/inference.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+
+import torch
+from torch.utils.data import DataLoader
+import torchaudio
+from hyperpyyaml import load_hyperpyyaml
+from tqdm import tqdm
+from cosyvoice.cli.model import CosyVoiceModel
+
+from cosyvoice.dataset.dataset import Dataset
+
+def get_args():
+    parser = argparse.ArgumentParser(description='inference with your model')
+    parser.add_argument('--config', required=True, help='config file')
+    parser.add_argument('--prompt_data', required=True, help='prompt data file')
+    parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
+    parser.add_argument('--tts_text', required=True, help='tts input file')
+    parser.add_argument('--llm_model', required=True, help='llm model file')
+    parser.add_argument('--flow_model', required=True, help='flow model file')
+    parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
+    parser.add_argument('--gpu',
+                        type=int,
+                        default=-1,
+                        help='gpu id for this rank, -1 for cpu')
+    parser.add_argument('--mode',
+                        default='sft',
+                        choices=['sft', 'zero_shot'],
+                        help='inference mode')
+    parser.add_argument('--result_dir', required=True, help='asr result file')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+
+    # Init cosyvoice models from configs
+    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
+    device = torch.device('cuda' if use_cuda else 'cpu')
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f)
+
+    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
+    model.load(args.llm_model, args.flow_model, args.hifigan_model)
+
+    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
+    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+
+    del configs
+    os.makedirs(args.result_dir, exist_ok=True)
+    fn = os.path.join(args.result_dir, 'wav.scp')
+    f = open(fn, 'w')
+    with torch.no_grad():
+        for batch_idx, batch in tqdm(enumerate(test_data_loader)):
+            utts = batch["utts"]
+            assert len(utts) == 1, "inference mode only support batchsize 1"
+            text = batch["text"]
+            text_token = batch["text_token"].to(device)
+            text_token_len = batch["text_token_len"].to(device)
+            tts_text = batch["tts_text"]
+            tts_index = batch["tts_index"]
+            tts_text_token = batch["tts_text_token"].to(device)
+            tts_text_token_len = batch["tts_text_token_len"].to(device)
+            speech_token = batch["speech_token"].to(device)
+            speech_token_len = batch["speech_token_len"].to(device)
+            speech_feat = batch["speech_feat"].to(device)
+            speech_feat_len = batch["speech_feat_len"].to(device)
+            utt_embedding = batch["utt_embedding"].to(device)
+            spk_embedding = batch["spk_embedding"].to(device)
+            if args.mode == 'sft':
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
+            else:
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'prompt_text': text_token, 'prompt_text_len': text_token_len,
+                               'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
+                               'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+                               'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                               'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
+            model_output = model.inference(**model_input)
+            tts_key = '{}_{}'.format(utts[0], tts_index[0])
+            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
+            torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
+            f.write('{} {}\n'.format(tts_key, tts_fn))
+            f.flush()
+    f.close()
+    logging.info('Result wav.scp saved in {}'.format(fn))
+
+
+if __name__ == '__main__':
+    main()
--- a/cosyvoice/bin/train.py
+++ b/cosyvoice/bin/train.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import datetime
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+from copy import deepcopy
+import torch
+import torch.distributed as dist
+import deepspeed
+
+from hyperpyyaml import load_hyperpyyaml
+
+from torch.distributed.elastic.multiprocessing.errors import record
+
+from cosyvoice.utils.executor import Executor
+from cosyvoice.utils.train_utils import (
+    init_distributed,
+    init_dataset_and_dataloader,
+    init_optimizer_and_scheduler,
+    init_summarywriter, save_model,
+    wrap_cuda_model, check_modify_and_save_config)
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='training your network')
+    parser.add_argument('--train_engine',
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'deepspeed'],
+                        help='Engine for paralleled training')
+    parser.add_argument('--model', required=True, help='model which will be trained')
+    parser.add_argument('--config', required=True, help='config file')
+    parser.add_argument('--train_data', required=True, help='train data file')
+    parser.add_argument('--cv_data', required=True, help='cv data file')
+    parser.add_argument('--checkpoint', help='checkpoint model')
+    parser.add_argument('--model_dir', required=True, help='save model dir')
+    parser.add_argument('--tensorboard_dir',
+                        default='tensorboard',
+                        help='tensorboard log dir')
+    parser.add_argument('--ddp.dist_backend',
+                        dest='dist_backend',
+                        default='nccl',
+                        choices=['nccl', 'gloo'],
+                        help='distributed backend')
+    parser.add_argument('--num_workers',
+                        default=0,
+                        type=int,
+                        help='num of subprocess workers for reading')
+    parser.add_argument('--prefetch',
+                        default=100,
+                        type=int,
+                        help='prefetch number')
+    parser.add_argument('--pin_memory',
+                        action='store_true',
+                        default=False,
+                        help='Use pinned memory buffers used for reading')
+    parser.add_argument('--deepspeed.save_states',
+                        dest='save_states',
+                        default='model_only',
+                        choices=['model_only', 'model+optimizer'],
+                        help='save model/optimizer states')
+    parser.add_argument('--timeout',
+                        default=30,
+                        type=int,
+                        help='timeout (in seconds) of cosyvoice_join. ' +
+                        '30s for aishell & 300s for wenetspeech')
+    parser = deepspeed.add_config_arguments(parser)
+    args = parser.parse_args()
+    return args
+
+
+@record
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f, overrides=override_dict)
+    configs['train_conf'].update(vars(args))
+
+    # Init env for ddp
+    init_distributed(args)
+
+    # Get dataset & dataloader
+    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
+        init_dataset_and_dataloader(args, configs)
+
+    # Do some sanity checks and save config to arsg.model_dir
+    configs = check_modify_and_save_config(args, configs)
+
+    # Tensorboard summary
+    writer = init_summarywriter(args)
+
+    # load checkpoint
+    model = configs[args.model]
+    if args.checkpoint is not None:
+        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
+
+    # Dispatch model from cpu to gpu
+    model = wrap_cuda_model(args, model)
+
+    # Get optimizer & scheduler
+    model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
+
+    # Save init checkpoints
+    info_dict = deepcopy(configs['train_conf'])
+    save_model(model, 'init', info_dict)
+
+    # Get executor
+    executor = Executor()
+
+    # Start training loop
+    for epoch in range(info_dict['max_epoch']):
+        executor.epoch = epoch
+        train_dataset.set_epoch(epoch)
+        dist.barrier()
+        group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
+        executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
+        dist.destroy_process_group(group_join)
+
+if __name__ == '__main__':
+    main()