This commit is contained in:
lyuxiang.lx
2025-07-07 14:49:51 +08:00
parent 5a00aefa20
commit 3047591fad
3 changed files with 9 additions and 5 deletions

View File

@@ -401,7 +401,8 @@ class Qwen2LM(TransformerLM):
speech_token_combined_emb = self.speech_embedding(speech_token_combined) speech_token_combined_emb = self.speech_embedding(speech_token_combined)
# 3. prepare llm_input/target # 3. prepare llm_input/target
lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(text_token.repeat(2, 1), text_token_emb.repeat(2, 1, 1), text_token_len.repeat(2), speech_token_combined, speech_token_combined_emb, speech_token_combined_len) lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(text_token.repeat(2, 1), text_token_emb.repeat(2, 1, 1), text_token_len.repeat(2), \
speech_token_combined, speech_token_combined_emb, speech_token_combined_len)
lm_target = lm_target.to(device) lm_target = lm_target.to(device)
# 4. run lm forward # 4. run lm forward

View File

@@ -2,7 +2,8 @@ import argparse
import logging import logging
import os import os
from tqdm import tqdm from tqdm import tqdm
import torch, torchaudio import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2 from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav from cosyvoice.utils.file_utils import load_wav
@@ -30,7 +31,7 @@ def main():
if prompt_speech_16k.shape[1] >= 30 * 16000: if prompt_speech_16k.shape[1] >= 30 * 16000:
continue continue
speech_list = [] speech_list = []
for i, j in enumerate(cosyvoice.inference_zero_shot(utt2text[utt], utt2text[utt], prompt_speech_16k, stream=False, text_frontend=False)): for _, j in enumerate(cosyvoice.inference_zero_shot(utt2text[utt], utt2text[utt], prompt_speech_16k, stream=False, text_frontend=False)):
speech_list.append(j['tts_speech']) speech_list.append(j['tts_speech'])
negative_wav = os.path.abspath('{}/wav/{}'.format(args.des_dir, os.path.basename(wav))) negative_wav = os.path.abspath('{}/wav/{}'.format(args.des_dir, os.path.basename(wav)))
torchaudio.save(negative_wav, torch.concat(speech_list, dim=1), cosyvoice.sample_rate, backend='soundfile') torchaudio.save(negative_wav, torch.concat(speech_list, dim=1), cosyvoice.sample_rate, backend='soundfile')

View File

@@ -9,13 +9,15 @@ from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed from cosyvoice.utils.common import set_all_random_seed
from tqdm import tqdm from tqdm import tqdm
def main(): def main():
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True) cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True)
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000) prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i in tqdm(range(100)): for i in tqdm(range(100)):
set_all_random_seed(i) set_all_random_seed(i)
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)): for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
continue continue
if __name__=='__main__':
if __name__ == '__main__':
main() main()