Mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-04 17:39:25 +08:00)

Commit: use automodel
@@ -23,8 +23,10 @@ import torch
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.cli.cosyvoice import AutoModel
+from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, CosyVoice3Model
 from cosyvoice.utils.file_utils import logging
+from cosyvoice.utils.class_utils import get_model_type
 
 
 def get_args():
@@ -57,15 +59,17 @@ def main():
     torch._C._jit_set_profiling_mode(False)
     torch._C._jit_set_profiling_executor(False)
 
-    try:
-        model = CosyVoice(args.model_dir)
-    except Exception:
-        try:
-            model = CosyVoice2(args.model_dir)
-        except Exception:
-            raise TypeError('no valid model_type!')
+    model = AutoModel(model_dir=args.model_dir)
 
-    if not isinstance(model, CosyVoice2):
+    if get_model_type(model.model) == CosyVoiceModel:
+        # 1. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder)
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half())
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
+    elif get_model_type(model.model) == CosyVoice2Model:
         # 1. export llm text_encoder
         llm_text_encoder = model.model.llm.text_encoder
         script = get_optimized_script(llm_text_encoder)
@@ -90,13 +94,7 @@ def main():
         script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
         logging.info('successfully export flow_encoder')
     else:
-        # 3. export flow encoder
-        flow_encoder = model.model.flow.encoder
-        script = get_optimized_script(flow_encoder)
-        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
-        script = get_optimized_script(flow_encoder.half())
-        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
-        logging.info('successfully export flow_encoder')
+        raise ValueError('unsupported model type')
 
 
 if __name__ == '__main__':
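Note: the try/except probing above is replaced by explicit dispatch on the wrapped model's class. get_model_type lives in cosyvoice/utils/class_utils.py, which this commit does not touch; a minimal stand-in consistent with how it is called here, a sketch only, assuming the model classes form a subclass hierarchy checked most-derived first:

    from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, CosyVoice3Model

    def get_model_type(model):
        # Return the concrete model class, checking subclasses before bases
        # so a CosyVoice2Model is not misreported as a plain CosyVoiceModel.
        for cls in (CosyVoice3Model, CosyVoice2Model, CosyVoiceModel):
            if isinstance(model, cls):
                return cls
        raise ValueError('unsupported model type')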
@@ -27,7 +27,7 @@ from tqdm import tqdm
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2, CosyVoice3
+from cosyvoice.cli.cosyvoice import AutoModel
 from cosyvoice.utils.file_utils import logging
 
 
@@ -58,16 +58,7 @@ def main():
     logging.basicConfig(level=logging.DEBUG,
                         format='%(asctime)s %(levelname)s %(message)s')
 
-    try:
-        model = CosyVoice(args.model_dir)
-    except Exception:
-        try:
-            model = CosyVoice2(args.model_dir)
-        except Exception:
-            try:
-                model = CosyVoice3(args.model_dir)
-            except Exception:
-                raise TypeError('no valid model_type!')
+    model = AutoModel(model_dir=args.model_dir)
 
     # 1. export flow decoder estimator
     estimator = model.model.flow.decoder.estimator
@@ -196,7 +196,7 @@ class CosyVoice2(CosyVoice):
 
 class CosyVoice3(CosyVoice2):
 
-    def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
+    def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -215,9 +215,9 @@ class CosyVoice3(CosyVoice2):
                                           '{}/spk2info.pt'.format(model_dir),
                                           configs['allowed_special'])
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
-            load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        if torch.cuda.is_available() is False and (load_trt is True or fp16 is True):
+            load_trt, fp16 = False, False
+            logging.warning('no cuda device, set load_trt/fp16 to False')
         self.model = CosyVoice3Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
@@ -225,8 +225,23 @@ class CosyVoice3(CosyVoice2):
         if load_vllm:
             self.model.load_vllm('{}/vllm'.format(model_dir))
         if load_trt:
+            if self.fp16 is True:
+                logging.warning('DiT tensorRT fp16 engine has some performance issues, use with caution!')
             self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
                                 '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
                                 trt_concurrent,
                                 self.fp16)
         del configs
+
+
+def AutoModel(**kwargs):
+    if not os.path.exists(kwargs['model_dir']):
+        kwargs['model_dir'] = snapshot_download(kwargs['model_dir'])
+    if os.path.exists('{}/cosyvoice.yaml'.format(kwargs['model_dir'])):
+        return CosyVoice(**kwargs)
+    elif os.path.exists('{}/cosyvoice2.yaml'.format(kwargs['model_dir'])):
+        return CosyVoice2(**kwargs)
+    elif os.path.exists('{}/cosyvoice3.yaml'.format(kwargs['model_dir'])):
+        return CosyVoice3(**kwargs)
+    else:
+        raise TypeError('No valid model type found!')
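Note: the new AutoModel factory resolves model_dir (downloading from ModelScope via snapshot_download when the path does not exist locally), then dispatches on which yaml file the directory contains. model_dir must be passed as a keyword, and any extra keyword arguments ride along unchanged to the selected class's __init__. A usage sketch, with the same local paths the examples below use:

    from cosyvoice.cli.cosyvoice import AutoModel

    # A directory containing cosyvoice2.yaml yields a CosyVoice2 instance.
    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B')

    # Extra kwargs pass straight through to the chosen class's __init__.
    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B',
                          load_trt=True, fp16=True)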
@@ -122,6 +122,9 @@ class CosyVoiceFrontEnd:
         return speech_feat, speech_feat_len
 
     def text_normalize(self, text, split=True, text_frontend=True):
+        # NOTE skip text_frontend when ssml symbol in text
+        if '<|' in text and '|>' in text:
+            text_frontend = False
         if isinstance(text, Generator):
             logging.info('get tts_text generator, will skip text_normalize!')
             return [text]
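Note: text_normalize now bypasses the text frontend whenever the input contains a <|...|> control token, so SSML-style markers and language tags survive normalization untouched. A quick check mirroring the guard added above:

    # Only the tagged strings trip the guard and skip text_frontend.
    for text in ['你好,我是通义生成式语音大模型。',
                 '<|zh|>你好,我是通义生成式语音大模型。',
                 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。']:
        print('<|' in text and '|>' in text)   # False, True, True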
@@ -92,29 +92,14 @@ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
 def export_cosyvoice2_vllm(model, model_path, device):
     if os.path.exists(model_path):
         return
-    pad_to = DEFAULT_VOCAB_PADDING_SIZE = 64
-    vocab_size = model.speech_embedding.num_embeddings
-    feature_size = model.speech_embedding.embedding_dim
-    pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
 
     dtype = torch.bfloat16
     # lm_head
     use_bias = True if model.llm_decoder.bias is not None else False
-    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size, bias=use_bias)
-    with torch.no_grad():
-        new_lm_head.weight[:vocab_size] = model.llm_decoder.weight
-        new_lm_head.weight[vocab_size:] = 0
-        if use_bias is True:
-            new_lm_head.bias[:vocab_size] = model.llm_decoder.bias
-            new_lm_head.bias[vocab_size:] = 0
-    model.llm.model.lm_head = new_lm_head
-    new_codec_embed = torch.nn.Linear(in_features=feature_size, out_features=pad_vocab_size)
+    model.llm.model.lm_head = model.llm_decoder
     # embed_tokens
     embed_tokens = model.llm.model.model.embed_tokens
-    with torch.no_grad():
-        new_codec_embed.weight[:vocab_size] = model.speech_embedding.weight
-        new_codec_embed.weight[vocab_size:] = 0
-    model.llm.model.set_input_embeddings(new_codec_embed)
+    model.llm.model.set_input_embeddings(model.speech_embedding)
     model.llm.model.to(device)
     model.llm.model.to(dtype)
     tmp_vocab_size = model.llm.model.config.vocab_size
@@ -122,14 +107,12 @@ def export_cosyvoice2_vllm(model, model_path, device):
     del model.llm.model.generation_config.eos_token_id
     del model.llm.model.config.bos_token_id
     del model.llm.model.config.eos_token_id
-    model.llm.model.config.vocab_size = pad_vocab_size
+    model.llm.model.config.vocab_size = model.speech_embedding.num_embeddings
     model.llm.model.config.tie_word_embeddings = False
     model.llm.model.config.use_bias = use_bias
     model.llm.model.save_pretrained(model_path)
     if use_bias is True:
         os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
-    else:
-        os.system('sed -i s@Qwen2ForCausalLM@Qwen2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
     model.llm.model.config.vocab_size = tmp_vocab_size
     model.llm.model.config.tie_word_embeddings = tmp_tie_embedding
     model.llm.model.set_input_embeddings(embed_tokens)
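Note: the deleted code rounded the speech-token vocabulary up to a multiple of 64 (the DEFAULT_VOCAB_PADDING_SIZE named in the removed lines, matching vLLM's default vocab padding) and copied llm_decoder / speech_embedding weights into freshly allocated padded layers; the new code attaches those modules directly and records the unpadded size in config.vocab_size. For reference, the rounding that disappears (the vocab size below is illustrative; the real value is model.speech_embedding.num_embeddings):

    pad_to = 64
    vocab_size = 6564                                          # illustrative
    pad_vocab_size = ((vocab_size + pad_to - 1) // pad_to) * pad_to
    print(pad_vocab_size)                                      # 6592, next multiple of 64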
example.py (12 lines changed)
@@ -1,6 +1,6 @@
 import sys
 sys.path.append('third_party/Matcha-TTS')
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2, CosyVoice3
+from cosyvoice.cli.cosyvoice import AutoModel
 from cosyvoice.utils.file_utils import load_wav
 import torchaudio
 
@@ -8,14 +8,14 @@ import torchaudio
 def cosyvoice_example():
     """ CosyVoice Usage, check https://fun-audio-llm.github.io/ for more details
     """
-    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-SFT')
     # sft usage
     print(cosyvoice.list_available_spks())
     # change stream=True for chunk stream inference
     for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
         torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
-    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M')
     # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
     for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
@@ -26,7 +26,7 @@ def cosyvoice_example():
     for i, j in enumerate(cosyvoice.inference_vc('./asset/zero_shot_prompt.wav', './asset/cross_lingual_prompt.wav', stream=False)):
         torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
-    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-Instruct')
     # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
     for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.<|endofprompt|>', stream=False)):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
@@ -34,7 +34,7 @@ def cosyvoice_example():
 def cosyvoice2_example():
     """ CosyVoice2 Usage, check https://funaudiollm.github.io/cosyvoice2/ for more details
     """
-    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=False)
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B')
 
     # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
     # zero_shot usage
@@ -68,7 +68,7 @@ def cosyvoice2_example():
 def cosyvoice3_example():
     """ CosyVoice3 Usage, check https://funaudiollm.github.io/cosyvoice3/ for more details
     """
-    cosyvoice = CosyVoice3('pretrained_models/CosyVoice3-0.5B', load_jit=False, load_trt=False, fp16=False)
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice3-0.5B')
     # zero_shot usage
     for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
@@ -4,7 +4,7 @@ from vllm import ModelRegistry
 from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM
 ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)
 
-from cosyvoice.cli.cosyvoice import CosyVoice2, CosyVoice3
+from cosyvoice.cli.cosyvoice import AutoModel
 from cosyvoice.utils.common import set_all_random_seed
 from tqdm import tqdm
 
@@ -12,7 +12,7 @@ from tqdm import tqdm
 def cosyvoice2_example():
     """ CosyVoice2 vllm usage
     """
-    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True)
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B', load_jit=True, load_trt=True, load_vllm=True, fp16=True)
     for i in tqdm(range(100)):
         set_all_random_seed(i)
         for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
@@ -21,7 +21,7 @@ def cosyvoice2_example():
 def cosyvoice3_example():
     """ CosyVoice3 vllm usage
     """
-    cosyvoice = CosyVoice3('pretrained_models/CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=True)
+    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=False)
     for i in tqdm(range(100)):
         set_all_random_seed(i)
         for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
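Note: both vllm examples now go through AutoModel, and flags such as load_jit, load_trt, load_vllm and fp16 simply ride along in **kwargs to whichever class the yaml selects. The CosyVoice3 call also flips fp16 from True to False, consistent with the fp16 DiT TensorRT warning added in cosyvoice.py above:

    # fp16=False sidesteps the fp16 DiT TensorRT engine flagged above as slow.
    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice3-0.5B',
                          load_trt=True, load_vllm=True, fp16=False)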
webui.py (34 lines changed)
@@ -22,8 +22,8 @@ import random
 import librosa
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
-from cosyvoice.utils.file_utils import load_wav, logging
+from cosyvoice.cli.cosyvoice import AutoModel
+from cosyvoice.utils.file_utils import logging
 from cosyvoice.utils.common import set_all_random_seed
 
 inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
@@ -42,23 +42,9 @@ def generate_seed():
         "value": seed
     }
-
-
-def postprocess(speech, top_db=60, hop_length=220, win_length=440):
-    speech, _ = librosa.effects.trim(
-        speech, top_db=top_db,
-        frame_length=win_length,
-        hop_length=hop_length
-    )
-    if speech.abs().max() > max_val:
-        speech = speech / speech.abs().max() * max_val
-    speech = torch.concat([speech, torch.zeros(1, int(cosyvoice.sample_rate * 0.2))], dim=1)
-    return speech
-
-
 def change_instruction(mode_checkbox_group):
     return instruct_dict[mode_checkbox_group]
 
 
 def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
                    seed, stream, speed):
     if prompt_wav_upload is not None:
@@ -118,15 +104,13 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
             yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '3s极速复刻':
         logging.info('get zero_shot inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
+        for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_wav, stream=stream, speed=speed):
             yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '跨语种复刻':
         logging.info('get cross_lingual inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
+        for i in cosyvoice.inference_cross_lingual(tts_text, prompt_wav, stream=stream, speed=speed):
             yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
     else:
         logging.info('get instruct inference request')
@@ -181,16 +165,10 @@ if __name__ == '__main__':
                         default=8000)
     parser.add_argument('--model_dir',
                         type=str,
-                        default='pretrained_models/CosyVoice2-0.5B',
+                        default='pretrained_models/CosyVoice3-0.5B',
                         help='local path or modelscope repo id')
     args = parser.parse_args()
-    try:
-        cosyvoice = CosyVoice(args.model_dir)
-    except Exception:
-        try:
-            cosyvoice = CosyVoice2(args.model_dir)
-        except Exception:
-            raise TypeError('no valid model_type!')
+    cosyvoice = AutoModel(model_dir=args.model_dir)
 
     sft_spk = cosyvoice.list_available_spks()
     if len(sft_spk) == 0:
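Note: with the postprocess helper deleted, the 3s极速复刻 and 跨语种复刻 branches hand the raw prompt wav path straight to inference instead of a trimmed 16 kHz tensor, which suggests the frontend now accepts file paths directly; example.py in this same commit already passes a path the same way:

    # Sketch matching example.py above: the prompt is a file path, not a loaded tensor.
    from cosyvoice.cli.cosyvoice import AutoModel
    import torchaudio

    cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B')
    for i, j in enumerate(cosyvoice.inference_zero_shot(
            '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。',
            '希望你以后能够做的比我还好呦。',
            './asset/zero_shot_prompt.wav', stream=False)):
        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)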