From 06934c38c7122b030a212d7fdb0fdf624d806b6f Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx"
Date: Thu, 26 Sep 2024 14:46:24 +0800
Subject: [PATCH] update vc code

---
 README.md                  | 16 ++++++++--------
 cosyvoice/cli/cosyvoice.py |  2 --
 cosyvoice/cli/frontend.py  |  2 --
 cosyvoice/cli/model.py     |  7 +++----
 4 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index b8fffa5..940bc60 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,7 @@ If you are expert in this field, and you are only interested in training your ow
 # SDK模型下载
 from modelscope import snapshot_download
 snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
+snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
 snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
 snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
 snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
@@ -80,6 +81,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice
 # git模型下载,请确保已安装git lfs
 mkdir -p pretrained_models
 git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
+git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
 git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
@@ -118,7 +120,7 @@ print(cosyvoice.list_avaliable_spks())
 for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
     torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
 
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
 # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
@@ -127,18 +129,16 @@ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来
 prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
     torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
-
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
-# instruct usage, support [laughter][breath]
-for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
-    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
-
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-VC')
 # vc usage
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
     torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
+
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
+# instruct usage, support [laughter][breath]
+for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的勇气智慧。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
+    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
 ```
 
 **Start web demo**
diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py
index d09403e..bedb2ba 100644
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -25,7 +25,6 @@ class CosyVoice:
 
     def __init__(self, model_dir, load_jit=True, load_onnx=False):
         instruct = True if '-Instruct' in model_dir else False
-        vc = True if '-VC' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
             model_dir = snapshot_download(model_dir)
@@ -37,7 +36,6 @@ class CosyVoice:
                                           '{}/speech_tokenizer_v1.onnx'.format(model_dir),
                                           '{}/spk2info.pt'.format(model_dir),
                                           instruct,
-                                          vc,
                                           configs['allowed_special'])
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
         self.model.load('{}/llm.pt'.format(model_dir),
diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index d9d97ee..8e12a1c 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -42,7 +42,6 @@ class CosyVoiceFrontEnd:
                  speech_tokenizer_model: str,
                  spk2info: str = '',
                  instruct: bool = False,
-                 vc: bool = False,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
@@ -59,7 +58,6 @@ class CosyVoiceFrontEnd:
         else:
             self.spk2info = {}
         self.instruct = instruct
-        self.vc = vc
         self.allowed_special = allowed_special
         self.inflect_parser = inflect.engine()
         self.use_ttsfrd = use_ttsfrd
diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py
index 1272b61..b75774c 100644
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -54,10 +54,9 @@ class CosyVoiceModel:
         self.hift_cache_dict = {}
 
     def load(self, llm_model, flow_model, hift_model):
-        if self.llm is not None:
-            self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
-            self.llm.to(self.device).eval()
-            self.llm.half()
+        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
+        self.llm.to(self.device).eval()
+        self.llm.half()
         self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
         self.flow.to(self.device).eval()
         self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
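
Note for reviewers: the hunks above drop the dedicated CosyVoice-VC checkpoint and the vc flag, so voice conversion now runs through the same CosyVoice instance as regular tts. Below is a minimal end-to-end sketch of that flow, stitched together from the README hunks above; the cosyvoice.utils.file_utils location of load_wav is assumed from the repo layout, and the wav filenames are the README's sample prompts.

```python
# Sketch only, not part of the patch: mirrors the updated README usage.
# Assumes the checkpoints were downloaded as in the README hunks above and
# that load_wav lives in cosyvoice.utils.file_utils.
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

# One model instance now serves both tts and vc; no separate
# 'pretrained_models/CosyVoice-VC' directory or vc flag is needed.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')

prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)      # target timbre
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)  # speech to convert

# inference_vc yields dicts with a 'tts_speech' tensor; save each chunk at 22.05 kHz
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
```

One consequence of the model.py hunk: with the `if self.llm is not None` guard removed, CosyVoiceModel.load always loads and halves llm.pt, so this path assumes a checkpoint directory that actually contains llm.pt.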