From ebef63066f9cb50d3685ecbba6ea146b21aaf0ea Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Thu, 11 Dec 2025 09:43:25 +0000 Subject: [PATCH 1/2] add instruct --- cosyvoice/dataset/processor.py | 9 +++++++++ cosyvoice/llm/llm.py | 3 +++ examples/libritts/cosyvoice/local/prepare_data.py | 11 +++++++++-- examples/libritts/cosyvoice3/run.sh | 3 ++- tools/make_parquet_list.py | 13 +++++++++++++ 5 files changed, 36 insertions(+), 3 deletions(-) diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py index 1eec976..f186ed2 100644 --- a/cosyvoice/dataset/processor.py +++ b/cosyvoice/dataset/processor.py @@ -242,6 +242,10 @@ def tokenize(data, get_tokenizer, allowed_special, mode='train'): for sample in data: assert 'text' in sample sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special) + if 'instruct' in sample: + sample['instruct_token'] = tokenizer.encode(sample['instruct'], allowed_special=allowed_special) + else: + sample['instruct_token'] = tokenizer.encode('', allowed_special=allowed_special) yield sample @@ -390,6 +394,9 @@ def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False): text_token = [torch.tensor(sample[i]['text_token']) for i in order] text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32) text_token = pad_sequence(text_token, batch_first=True, padding_value=0) + instruct_token = [torch.tensor(sample[i]['instruct_token']) for i in order] + instruct_token_len = torch.tensor([i.size(0) for i in instruct_token], dtype=torch.int32) + instruct_token = pad_sequence(instruct_token, batch_first=True, padding_value=0) utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0) spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0) batch = { @@ -403,6 +410,8 @@ def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False): "text": text, "text_token": text_token, "text_token_len": text_token_len, + "instruct_token": instruct_token, + "instruct_token_len": instruct_token_len, "utt_embedding": utt_embedding, "spk_embedding": spk_embedding, } diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py index 6b3a7b0..c0b3400 100644 --- a/cosyvoice/llm/llm.py +++ b/cosyvoice/llm/llm.py @@ -674,6 +674,9 @@ class CosyVoice3LM(Qwen2LM): text_token_len = batch['text_token_len'].to(device) speech_token = batch['speech_token'].to(device) speech_token_len = batch['speech_token_len'].to(device) + # NOTE should append instruct_token to sequence, not implemented yet + instruct_token = batch['instruct_token'].to(device) + instruct_token_len = batch['instruct_token_len'].to(device) # 1. 
encode text_token
         text_token_emb = self.llm.model.model.embed_tokens(text_token)
diff --git a/examples/libritts/cosyvoice/local/prepare_data.py b/examples/libritts/cosyvoice/local/prepare_data.py
index 918aef3..fffa9fb 100644
--- a/examples/libritts/cosyvoice/local/prepare_data.py
+++ b/examples/libritts/cosyvoice/local/prepare_data.py
@@ -40,6 +40,11 @@ def main():
     with open('{}/spk2utt'.format(args.des_dir), 'w') as f:
         for k, v in spk2utt.items():
             f.write('{} {}\n'.format(k, ' '.join(v)))
+    if args.instruct:
+        with open('{}/instruct'.format(args.des_dir), 'w') as f:
+            for k in utt2text:
+                # NOTE in CosyVoice3, the instruct text is added to the input sequence
+                f.write('{} You are a helpful assistant.<|endofprompt|>\n'.format(k))
     return
 
 
@@ -49,7 +54,9 @@ if __name__ == "__main__":
                         type=str)
     parser.add_argument('--des_dir',
                         type=str)
-    parser.add_argument('--ref_model',
-                        type=str)
+    parser.add_argument('--instruct',
+                        action='store_true',
+                        default=False,
+                        help='whether to create the instruct file')
     args = parser.parse_args()
     main()
diff --git a/examples/libritts/cosyvoice3/run.sh b/examples/libritts/cosyvoice3/run.sh
index ce20043..4e6ce11 100644
--- a/examples/libritts/cosyvoice3/run.sh
+++ b/examples/libritts/cosyvoice3/run.sh
@@ -20,7 +20,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
   echo "Data preparation, prepare wav.scp/text/utt2spk/spk2utt"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
     mkdir -p data/$x
-    python local/prepare_data.py --src_dir $data_dir/LibriTTS/$x --des_dir data/$x
+    python local/prepare_data.py --src_dir $data_dir/LibriTTS/$x --des_dir data/$x --instruct
   done
 fi
 
@@ -46,6 +46,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     mkdir -p data/$x/parquet
     tools/make_parquet_list.py --num_utts_per_parquet 1000 \
       --num_processes 10 \
+      --instruct \
       --src_dir data/$x \
       --des_dir data/$x/parquet
   done
diff --git a/tools/make_parquet_list.py b/tools/make_parquet_list.py
index 8920841..29f42cc 100755
--- a/tools/make_parquet_list.py
+++ b/tools/make_parquet_list.py
@@ -37,6 +37,8 @@ def job(utt_list, parquet_file, utt2parquet_file, spk2parquet_file):
     speech_token_list = [utt2speech_token.get(utt, []) for utt in utt_list]
     if args.dpo:
         reject_speech_token_list = [utt2reject_speech_token[utt] for utt in utt_list]
+    if args.instruct:
+        instruct_list = [utt2instruct[utt] for utt in utt_list]
 
     # 保存到parquet,utt2parquet_file,spk2parquet_file
     df = pd.DataFrame()
@@ -50,6 +52,8 @@
     df['speech_token'] = speech_token_list
     if args.dpo:
         df['reject_speech_token'] = reject_speech_token_list
+    if args.instruct:
+        df['instruct'] = instruct_list
     df.to_parquet(parquet_file)
     with open(utt2parquet_file, 'w') as f:
         json.dump({k: parquet_file for k in utt_list}, f, ensure_ascii=False, indent=2)
@@ -68,6 +72,10 @@
                         type=int,
                         default=1,
                         help='num processes for make parquets')
+    parser.add_argument('--instruct',
+                        action='store_true',
+                        default=False,
+                        help='whether an instruct file exists in src_dir')
     parser.add_argument('--src_dir',
                         type=str)
     parser.add_argument('--des_dir',
@@ -91,6 +99,11 @@
         for l in f:
             l = l.replace('\n', '').split()
             utt2spk[l[0]] = l[1]
+    if args.instruct:
+        utt2instruct = {}
+        with open('{}/instruct'.format(args.src_dir)) as f:
+            for l in f:
+                utt2instruct[l.split()[0]] = ' '.join(l.split()[1:])
     utt2embedding = torch.load('{}/utt2embedding.pt'.format(args.src_dir))
     spk2embedding =
torch.load('{}/spk2embedding.pt'.format(args.src_dir)) utt2speech_token = torch.load('{}/utt2speech_token.pt'.format(args.src_dir)) From 5bc4b23f02278151c3d2aade403e07a4bdaf1871 Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Fri, 12 Dec 2025 06:53:28 +0000 Subject: [PATCH 2/2] use amp in flow --- cosyvoice/bin/inference_deprecated.py | 126 -------------------------- cosyvoice/cli/model.py | 9 -- cosyvoice/flow/flow_matching.py | 13 +-- 3 files changed, 7 insertions(+), 141 deletions(-) delete mode 100644 cosyvoice/bin/inference_deprecated.py diff --git a/cosyvoice/bin/inference_deprecated.py b/cosyvoice/bin/inference_deprecated.py deleted file mode 100644 index 0d45cc7..0000000 --- a/cosyvoice/bin/inference_deprecated.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import logging -logging.getLogger('matplotlib').setLevel(logging.WARNING) -import os -import torch -from torch.utils.data import DataLoader -import torchaudio -from hyperpyyaml import load_hyperpyyaml -from tqdm import tqdm -from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model -from cosyvoice.dataset.dataset import Dataset - - -def get_args(): - parser = argparse.ArgumentParser(description='inference with your model') - parser.add_argument('--config', required=True, help='config file') - parser.add_argument('--prompt_data', required=True, help='prompt data file') - parser.add_argument('--prompt_utt2data', required=True, help='prompt data file') - parser.add_argument('--tts_text', required=True, help='tts input file') - parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path') - parser.add_argument('--llm_model', required=True, help='llm model file') - parser.add_argument('--flow_model', required=True, help='flow model file') - parser.add_argument('--hifigan_model', required=True, help='hifigan model file') - parser.add_argument('--gpu', - type=int, - default=-1, - help='gpu id for this rank, -1 for cpu') - parser.add_argument('--mode', - default='sft', - choices=['sft', 'zero_shot'], - help='inference mode') - parser.add_argument('--result_dir', required=True, help='asr result file') - args = parser.parse_args() - print(args) - return args - - -def main(): - args = get_args() - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(message)s') - os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # Init cosyvoice models from configs - use_cuda = args.gpu >= 0 and torch.cuda.is_available() - device = torch.device('cuda' if use_cuda else 'cpu') - try: - with open(args.config, 'r') as f: - configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': args.qwen_pretrain_path}) - model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift']) - except Exception: - try: - with open(args.config, 'r') as f: - configs = load_hyperpyyaml(f) - model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) 
- except Exception: - raise TypeError('no valid model_type!') - - model.load(args.llm_model, args.flow_model, args.hifigan_model) - - test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, - tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data) - test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) - - sample_rate = configs['sample_rate'] - del configs - os.makedirs(args.result_dir, exist_ok=True) - fn = os.path.join(args.result_dir, 'wav.scp') - f = open(fn, 'w') - with torch.no_grad(): - for _, batch in tqdm(enumerate(test_data_loader)): - utts = batch["utts"] - assert len(utts) == 1, "inference mode only support batchsize 1" - text_token = batch["text_token"].to(device) - text_token_len = batch["text_token_len"].to(device) - tts_index = batch["tts_index"] - tts_text_token = batch["tts_text_token"].to(device) - tts_text_token_len = batch["tts_text_token_len"].to(device) - speech_token = batch["speech_token"].to(device) - speech_token_len = batch["speech_token_len"].to(device) - speech_feat = batch["speech_feat"].to(device) - speech_feat_len = batch["speech_feat_len"].to(device) - utt_embedding = batch["utt_embedding"].to(device) - spk_embedding = batch["spk_embedding"].to(device) - if args.mode == 'sft': - model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, - 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding} - else: - model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, - 'prompt_text': text_token, 'prompt_text_len': text_token_len, - 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len, - 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len, - 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len, - 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding} - tts_speeches = [] - for model_output in model.tts(**model_input): - tts_speeches.append(model_output['tts_speech']) - tts_speeches = torch.concat(tts_speeches, dim=1) - tts_key = '{}_{}'.format(utts[0], tts_index[0]) - tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key)) - torchaudio.save(tts_fn, tts_speeches, sample_rate=sample_rate, backend='soundfile') - f.write('{} {}\n'.format(tts_key, tts_fn)) - f.flush() - f.close() - logging.info('Result wav.scp saved in {}'.format(fn)) - - -if __name__ == '__main__': - logging.warning('this code has been deprecated, please refer to README for CosyVoice inference usage!') - main() diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 0f9416a..01e76c6 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -38,9 +38,6 @@ class CosyVoiceModel: self.flow = flow self.hift = hift self.fp16 = fp16 - if self.fp16 is True: - self.llm.half() - self.flow.half() self.token_min_hop_len = 2 * self.flow.input_frame_rate self.token_max_hop_len = 4 * self.flow.input_frame_rate self.token_overlap_len = 20 @@ -249,9 +246,6 @@ class CosyVoice2Model(CosyVoiceModel): self.flow = flow self.hift = hift self.fp16 = fp16 - if self.fp16 is True: - self.llm.half() - self.flow.half() # NOTE must matching training static_chunk_size self.token_hop_len = 25 # hift cache @@ -398,9 +392,6 @@ class CosyVoice3Model(CosyVoice2Model): self.flow = flow self.hift = hift self.fp16 = fp16 - if self.fp16 is True: - self.llm.half() - self.flow.half() # NOTE must matching training static_chunk_size self.token_hop_len = 25 # rtf and decoding 
related
diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py
index 7f92df5..a45337a 100644
--- a/cosyvoice/flow/flow_matching.py
+++ b/cosyvoice/flow/flow_matching.py
@@ -91,12 +91,13 @@ class ConditionalCFM(BASECFM):
         sol = []
         # Do not use concat, it may cause memory format changed and trt infer with wrong results!
-        x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
-        mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
-        mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
-        t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
-        spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
-        cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+        # NOTE when flow runs in amp mode, x.dtype is float32, which causes NaN in trt fp16 inference, so set dtype=spks.dtype
+        x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=spks.dtype)
+        mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=spks.dtype)
+        mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=spks.dtype)
+        t_in = torch.zeros([2], device=x.device, dtype=spks.dtype)
+        spks_in = torch.zeros([2, 80], device=x.device, dtype=spks.dtype)
+        cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=spks.dtype)
         for step in range(1, len(t_span)):
             # Classifier-Free Guidance inference introduced in VoiceBox
             x_in[:] = x