mirror of
https://github.com/FunAudioLLM/CosyVoice.git
synced 2026-02-04 17:39:25 +08:00
update
This commit is contained in:
10
README.md
10
README.md
@@ -116,27 +116,27 @@ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=True, loa
|
|||||||
print(cosyvoice.list_avaliable_spks())
|
print(cosyvoice.list_avaliable_spks())
|
||||||
# change stream=True for chunk stream inference
|
# change stream=True for chunk stream inference
|
||||||
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
|
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
|
||||||
torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
|
torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||||
|
|
||||||
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
|
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
|
||||||
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
|
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
|
||||||
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
||||||
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
||||||
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], 22050)
|
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||||
# cross_lingual usage
|
# cross_lingual usage
|
||||||
prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
||||||
for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
|
for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
|
||||||
torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
|
torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||||
# vc usage
|
# vc usage
|
||||||
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
||||||
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
||||||
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
|
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
|
||||||
torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
|
torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||||
|
|
||||||
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
||||||
# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
|
# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
|
||||||
for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
|
for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
|
||||||
torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
|
torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Start web demo**
|
**Start web demo**
|
||||||
|
|||||||
@@ -157,6 +157,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
|
|||||||
vocab_size: int = 4096,
|
vocab_size: int = 4096,
|
||||||
input_frame_rate: int = 50,
|
input_frame_rate: int = 50,
|
||||||
only_mask_loss: bool = True,
|
only_mask_loss: bool = True,
|
||||||
|
token_mel_ratio: int = 2,
|
||||||
|
pre_lookahead_len: int = 3,
|
||||||
encoder: torch.nn.Module = None,
|
encoder: torch.nn.Module = None,
|
||||||
decoder: torch.nn.Module = None,
|
decoder: torch.nn.Module = None,
|
||||||
decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
|
decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
|
||||||
@@ -181,6 +183,8 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
|
|||||||
self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
|
self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
|
||||||
self.decoder = decoder
|
self.decoder = decoder
|
||||||
self.only_mask_loss = only_mask_loss
|
self.only_mask_loss = only_mask_loss
|
||||||
|
self.token_mel_ratio = token_mel_ratio
|
||||||
|
self.pre_lookahead_len = pre_lookahead_len
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def inference(self,
|
def inference(self,
|
||||||
@@ -206,7 +210,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
|
|||||||
# text encode
|
# text encode
|
||||||
h, h_lengths = self.encoder(token, token_len)
|
h, h_lengths = self.encoder(token, token_len)
|
||||||
if finalize is False:
|
if finalize is False:
|
||||||
h = h[:, :-self.encoder.pre_lookahead_layer.pre_lookahead_len * self.encoder.up_layer.stride]
|
h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
|
||||||
mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
|
mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
|
||||||
h = self.encoder_proj(h)
|
h = self.encoder_proj(h)
|
||||||
|
|
||||||
|
|||||||
@@ -240,6 +240,8 @@ def get_tokenizer(
|
|||||||
|
|
||||||
class QwenTokenizer():
|
class QwenTokenizer():
|
||||||
def __init__(self, token_path, skip_special_tokens=True):
|
def __init__(self, token_path, skip_special_tokens=True):
|
||||||
|
super().__init__()
|
||||||
|
# NOTE: non-chat model, all these special tokens keep randomly initialized.
|
||||||
special_tokens = {
|
special_tokens = {
|
||||||
'eos_token': '<|endoftext|>',
|
'eos_token': '<|endoftext|>',
|
||||||
'pad_token': '<|endoftext|>',
|
'pad_token': '<|endoftext|>',
|
||||||
@@ -248,6 +250,9 @@ class QwenTokenizer():
|
|||||||
'[breath]', '<strong>', '</strong>', '[noise]',
|
'[breath]', '<strong>', '</strong>', '[noise]',
|
||||||
'[laughter]', '[cough]', '[clucking]', '[accent]',
|
'[laughter]', '[cough]', '[clucking]', '[accent]',
|
||||||
'[quick_breath]',
|
'[quick_breath]',
|
||||||
|
"<laughter>", "</laughter>",
|
||||||
|
"[hissing]", "[sigh]", "[vocalized-noise]",
|
||||||
|
"[lipsmack]", "[mn]"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(token_path)
|
self.tokenizer = AutoTokenizer.from_pretrained(token_path)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
--extra-index-url https://download.pytorch.org/whl/torch_stable.html
|
||||||
conformer==0.3.2
|
conformer==0.3.2
|
||||||
deepspeed==0.14.2; sys_platform == 'linux'
|
deepspeed==0.14.2; sys_platform == 'linux'
|
||||||
diffusers==0.27.2
|
diffusers==0.27.2
|
||||||
@@ -25,8 +25,9 @@ pydantic==2.7.0
|
|||||||
rich==13.7.1
|
rich==13.7.1
|
||||||
soundfile==0.12.1
|
soundfile==0.12.1
|
||||||
tensorboard==2.14.0
|
tensorboard==2.14.0
|
||||||
torch==2.0.1
|
tensorrt-cu12==10.0.1
|
||||||
torchaudio==2.0.2
|
torch==2.3.1+cu121
|
||||||
|
torchaudio==2.3.1+cu121
|
||||||
uvicorn==0.30.0
|
uvicorn==0.30.0
|
||||||
wget==3.2
|
wget==3.2
|
||||||
fastapi==0.111.0
|
fastapi==0.111.0
|
||||||
|
|||||||
Reference in New Issue
Block a user