mirror of https://github.com/FunAudioLLM/CosyVoice.git, synced 2026-02-04 09:29:25 +08:00
fix lint
.github/workflows/lint.yml (vendored, 2 changes)
@@ -52,5 +52,5 @@ jobs:
           set -eux
           pip install flake8==3.8.2 flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0
           flake8 --version
-          flake8 --max-line-length 180 --ignore B006,B008,B905,C408,E402,E731,E741,W503,W504,F401,F403,F405,F841 --exclude ./third_party/,./runtime/python/grpc/cosyvoice_pb2*py
+          flake8 --max-line-length 180 --ignore B006,B008,B905,C408,E402,E731,E741,W503,W504,F401,F403,F405,F722,F841 --exclude ./third_party/,./runtime/python/grpc/cosyvoice_pb2*py
           if [ $? != 0 ]; then exit 1; fi
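Note: the functional change in this hunk is adding F722 ("syntax error in forward annotation") to the ignore list. A minimal sketch of code that would otherwise trip the check, assuming shape-style string annotations of the kind tensor-typing libraries use (the example is illustrative, not from this repository):

```python
# pyflakes parses string annotations as Python expressions, so a descriptive
# shape annotation such as "batch time channels" is reported as F722 unless
# the check is ignored. At runtime the annotation is never evaluated.
import torch


def fuse(x: "batch time channels", y: "batch time channels") -> torch.Tensor:
    # Runs fine when executed; only the linter objects to the annotations.
    return x + y


print(fuse(torch.ones(1, 2, 3), torch.ones(1, 2, 3)).shape)
```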
README.md (18 changes)
@@ -2,7 +2,7 @@
 
 ## 👉🏻 CosyVoice 👈🏻
 
-**CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice3-0.5B); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
+**CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [Modelscope](https://www.modelscope.cn/studios/FunAudioLLM/Fun-CosyVoice3-0.5B); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
 
 **CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
 
@@ -31,8 +31,8 @@
 
 - [x] 2025/12
 
-    - [x] release cosyvoice3-0.5B base model and its training/inference script
-    - [x] release cosyvoice3-0.5B modelscope gradio space
+    - [x] release CosyVoice3-0.5B base model and its training/inference script
+    - [x] release CosyVoice3-0.5B modelscope gradio space
 
 - [x] 2025/08
 
@@ -40,20 +40,20 @@
 
 - [x] 2025/07
 
-    - [x] release cosyvoice 3.0 eval set
+    - [x] release CosyVoice 3.0 eval set
 
 - [x] 2025/05
 
-    - [x] add cosyvoice 2.0 vllm support
+    - [x] add CosyVoice2-0.5B vllm support
 
 - [x] 2024/12
 
-    - [x] 25hz cosyvoice 2.0 released
+    - [x] 25hz CosyVoice2-0.5B released
 
 - [x] 2024/09
 
-    - [x] 25hz cosyvoice base model
-    - [x] 25hz cosyvoice voice conversion model
+    - [x] 25hz CosyVoice-300M base model
+    - [x] 25hz CosyVoice-300M voice conversion function
 
 - [x] 2024/08
 
@@ -122,7 +122,7 @@ pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
 
 ### Basic Usage
 
-We strongly recommend using `CosyVoice2-0.5B` for better performance.
+We strongly recommend using `CosyVoice3-0.5B` for better performance.
 Follow the code in `example.py` for detailed usage of each model.
 ```sh
 python example.py
@@ -156,9 +156,9 @@ class CosyVoice2(CosyVoice):
                                            '{}/spk2info.pt'.format(model_dir),
                                            configs['allowed_special'])
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
-            load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or load_vllm is True or fp16 is True):
+            load_jit, load_trt, load_vllm, fp16 = False, False, False, False
+            logging.warning('no cuda device, set load_jit/load_trt/load_vllm/fp16 to False')
         self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
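The guard above now also covers the vLLM path: on a machine without CUDA, every GPU-only acceleration flag is forced off instead of failing later. A standalone sketch of the same pattern (the function name is illustrative; the flag semantics follow the diff):

```python
import logging

import torch


def resolve_accel_flags(load_jit, load_trt, load_vllm, fp16):
    # Without a CUDA device the JIT/TensorRT/vLLM/fp16 paths cannot run,
    # so all four flags are disabled together, mirroring the guarded branch.
    if torch.cuda.is_available() is False and (load_jit or load_trt or load_vllm or fp16):
        logging.warning('no cuda device, set load_jit/load_trt/load_vllm/fp16 to False')
        return False, False, False, False
    return load_jit, load_trt, load_vllm, fp16


print(resolve_accel_flags(True, False, True, True))  # (False, False, False, False) on CPU
```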
@@ -174,11 +174,7 @@ class CosyVoice2(CosyVoice):
                         self.fp16)
         del configs
 
-    def inference_instruct(self, *args, **kwargs):
-        raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!')
-
     def inference_instruct2(self, tts_text, instruct_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
-        assert isinstance(self.model, CosyVoice2Model) or isinstance(self.model, CosyVoice3Model), 'inference_instruct2 is only implemented for CosyVoice2 and CosyVoice3!'
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
             model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_wav, self.sample_rate, zero_shot_spk_id)
             start_time = time.time()
@@ -436,4 +436,4 @@ class CosyVoice3Model(CosyVoice2Model):
             tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
             tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
             self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
             return tts_speech
@@ -476,7 +476,7 @@ class JointAttnProcessor:
         # Split the attention outputs.
         x, c = (
             x[:, : residual.shape[1]],
-            x[:, residual.shape[1] :],
+            x[:, residual.shape[1]:],
         )
 
         # linear proj
@@ -402,11 +402,12 @@ class CausalMaskedDiffWithDiT(torch.nn.Module):
         assert feat.shape[2] == mel_len2
         return feat.float(), None
 
+
 if __name__ == '__main__':
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
     from hyperpyyaml import load_hyperpyyaml
-    with open('./pretrained_models/CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
+    with open('./pretrained_models/Fun-CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
         configs = load_hyperpyyaml(f, overrides={'llm': None, 'hift': None})
     model = configs['flow']
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -425,6 +426,7 @@ if __name__ == '__main__':
     pred_gt, _ = model.inference(token, token_len, prompt_token, prompt_token_len, prompt_feat, prompt_feat_len, prompt_embedding, streaming=True, finalize=True)
     for i in range(0, max_len, chunk_size):
         finalize = True if i + chunk_size + context_size >= max_len else False
-        pred_chunk, _ = model.inference(token[:, :i + chunk_size + context_size], torch.tensor([token[:, :i + chunk_size + context_size].shape[1]]).to(device), prompt_token, prompt_token_len, prompt_feat, prompt_feat_len, prompt_embedding, streaming=True, finalize=finalize)
+        pred_chunk, _ = model.inference(token[:, :i + chunk_size + context_size], torch.tensor([token[:, :i + chunk_size + context_size].shape[1]]).to(device),
+                                        prompt_token, prompt_token_len, prompt_feat, prompt_feat_len, prompt_embedding, streaming=True, finalize=finalize)
         pred_chunk = pred_chunk[:, :, i * model.token_mel_ratio:]
         print((pred_gt[:, :, i * model.token_mel_ratio: i * model.token_mel_ratio + pred_chunk.shape[2]] - pred_chunk).abs().max().item())
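The `__main__` block above is a streaming-consistency test: it generates the whole utterance in one pass, then regenerates it chunk by chunk and prints the maximum deviation per chunk. A self-contained sketch of that parity check with a trivial causal stand-in for the flow model (all names and sizes are illustrative):

```python
import torch


class ToyCausalModel:
    token_mel_ratio = 2  # mel frames produced per input token

    def inference(self, token, finalize=True):
        # Exactly causal toy mapping: each token yields token_mel_ratio frames.
        return token.repeat_interleave(self.token_mel_ratio, dim=1)


model = ToyCausalModel()
token = torch.arange(32).unsqueeze(0).float()
chunk_size, context_size = 8, 4
pred_gt = model.inference(token)
for i in range(0, token.shape[1], chunk_size):
    finalize = i + chunk_size + context_size >= token.shape[1]
    pred_chunk = model.inference(token[:, :i + chunk_size + context_size], finalize=finalize)
    pred_chunk = pred_chunk[:, i * model.token_mel_ratio:]
    gt = pred_gt[:, i * model.token_mel_ratio: i * model.token_mel_ratio + pred_chunk.shape[1]]
    print((gt - pred_chunk).abs().max().item())  # 0.0 for a perfectly causal model
```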
@@ -100,4 +100,4 @@ class CausalConvRNNF0Predictor(nn.Module):
         for i in range(1, len(self.condnet)):
             x = self.condnet[i](x)
         x = x.transpose(1, 2)
         return torch.abs(self.classifier(x).squeeze(-1))
@@ -342,11 +342,9 @@ class SourceModuleHnNSF(torch.nn.Module):
 
         # to produce sine waveforms
         if sinegen_type == '1':
-            self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
-                                     sine_amp, add_noise_std, voiced_threshod)
+            self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod)
         else:
-            self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
-                                      sine_amp, add_noise_std, voiced_threshod, causal=causal)
+            self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num, sine_amp, add_noise_std, voiced_threshod, causal=causal)
 
         # to merge source harmonics into a single excitation
         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
@@ -675,7 +673,8 @@ class CausalHiFTGenerator(HiFTGenerator):
             x = self.conv_pre(x)
         else:
             x = self.conv_pre(x[:, :, :-self.conv_pre_look_right], x[:, :, -self.conv_pre_look_right:])
-            s_stft_real, s_stft_imag = s_stft_real[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)], s_stft_imag[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)]
+            s_stft_real = s_stft_real[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)]
+            s_stft_imag = s_stft_imag[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)]
         s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
 
         for i in range(self.num_upsamples):
@@ -711,7 +710,7 @@ class CausalHiFTGenerator(HiFTGenerator):
 
     @torch.inference_mode()
     def inference(self, speech_feat: torch.Tensor, finalize: bool = True) -> torch.Tensor:
-        # mel->f0
+        # mel->f0 NOTE f0_predictor precision is crucial for causal inference, move self.f0_predictor to cpu if necessary
         self.f0_predictor.to('cpu')
         f0 = self.f0_predictor(speech_feat.cpu(), finalize=finalize).to(speech_feat)
         # f0->source
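The expanded comment documents why `self.f0_predictor.to('cpu')` follows it: small numeric differences in the F0 prediction can desynchronize chunked causal inference. A minimal sketch of the CPU-pinning pattern with a stand-in module (module and shapes are illustrative):

```python
import torch

f0_predictor = torch.nn.Linear(80, 1)  # stand-in for the real F0 predictor
speech_feat = torch.randn(1, 100, 80)  # mel features, possibly on GPU/fp16

f0_predictor.to('cpu')                # keep the sensitive module in fp32 on CPU
f0 = f0_predictor(speech_feat.cpu())  # compute on CPU regardless of input device
f0 = f0.to(speech_feat)               # cast/move back to match speech_feat
print(f0.shape, f0.dtype)
```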
@@ -729,7 +728,7 @@ if __name__ == '__main__':
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
     from hyperpyyaml import load_hyperpyyaml
-    with open('./pretrained_models/CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
+    with open('./pretrained_models/Fun-CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
         configs = load_hyperpyyaml(f, overrides={'llm': None, 'flow': None})
     model = configs['hift']
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -742,4 +741,4 @@ if __name__ == '__main__':
         finalize = True if i + chunk_size + context_size >= max_len else False
         pred_chunk, _ = model.inference(mel[:, :, : i + chunk_size + context_size], finalize=finalize)
         pred_chunk = pred_chunk[:, i * 480:]
         print((pred_gt[:, i * 480:i * 480 + pred_chunk.shape[1]] - pred_chunk).abs().max().item())
@@ -369,7 +369,8 @@ class Qwen2LM(TransformerLM):
         speech_token_emb = self.speech_embedding(speech_token)
 
         # 3. prepare llm_input/target
-        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb, speech_token, speech_token_emb, speech_token_len)
+        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb,
+                                                                         speech_token, speech_token_emb, speech_token_len)
         lm_target = lm_target.to(device)
 
         # 4. run lm forward
@@ -685,7 +686,8 @@ class CosyVoice3LM(Qwen2LM):
         speech_token_emb = self.speech_embedding(speech_token)
 
         # 3. prepare llm_input/target
-        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb, speech_token, speech_token_emb, speech_token_len)
+        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb,
+                                                                         speech_token, speech_token_emb, speech_token_len)
         lm_target = lm_target.to(device)
 
         # 4. run lm forward
@@ -202,11 +202,11 @@ class CausalConv1dDownSample(torch.nn.Conv1d):
             dtype=None
     ) -> None:
         super(CausalConv1dDownSample, self).__init__(in_channels, out_channels,
                                                      kernel_size, stride,
                                                      padding=0, dilation=dilation,
                                                      groups=groups, bias=bias,
                                                      padding_mode=padding_mode,
                                                      device=device, dtype=dtype)
         assert stride != 1 and dilation == 1
         assert kernel_size % stride == 0
         self.causal_padding = stride - 1
@@ -236,11 +236,11 @@ class CausalConv1dUpsample(torch.nn.Conv1d):
             dtype=None
     ) -> None:
         super(CausalConv1dUpsample, self).__init__(in_channels, out_channels,
                                                    kernel_size, 1,
                                                    padding=0, dilation=dilation,
                                                    groups=groups, bias=bias,
                                                    padding_mode=padding_mode,
                                                    device=device, dtype=dtype)
         assert dilation == 1
         self.causal_padding = kernel_size - 1
         self.upsample = torch.nn.Upsample(scale_factor=stride, mode='nearest')
@@ -255,4 +255,4 @@ class CausalConv1dUpsample(torch.nn.Conv1d):
         x = torch.concat([cache, x], dim=2)
         x = super(CausalConv1dUpsample, self).forward(x)
         assert input_timestep == x.shape[2]
         return x
@@ -52,6 +52,7 @@ instruct_list = ["You are a helpful assistant. 请用广东话表达。<endofpro
                  "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<endofprompt>",
                  "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<endofprompt>"]
 
+
 def pad_list(xs: List[torch.Tensor], pad_value: int):
     """Perform padding for the list of tensors.
 
example.py (36 changes)
@@ -16,20 +16,23 @@ def cosyvoice_example():
 
     cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M')
     # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
-    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav')):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
     # cross_lingual usage
-    for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', './asset/cross_lingual_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.',
+                                                            './asset/cross_lingual_prompt.wav')):
         torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
     # vc usage
-    for i, j in enumerate(cosyvoice.inference_vc('./asset/zero_shot_prompt.wav', './asset/cross_lingual_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_vc('./asset/zero_shot_prompt.wav', './asset/cross_lingual_prompt.wav')):
         torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-Instruct')
     # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
-    for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.<|endofprompt|>', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男',
+                                                       'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.<|endofprompt|>')):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
+
 def cosyvoice2_example():
     """ CosyVoice2 Usage, check https://funaudiollm.github.io/cosyvoice2/ for more details
     """
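This hunk and the next drop explicit `stream=False` arguments, presumably because `False` is already the keyword's default, as the `inference_instruct2(..., stream=False, ...)` signature earlier in this diff suggests. A tiny stub demonstrating why the shorter calls are behavior-preserving (stub only, not the project API):

```python
def inference_stub(text, stream=False):
    # Non-streaming: one complete result; streaming: several partial chunks.
    chunks = ['partial'] * 4 if stream else ['complete']
    for c in chunks:
        yield c


assert list(inference_stub('hi')) == list(inference_stub('hi', stream=False))
print('default call and explicit stream=False are identical')
```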
@@ -37,21 +40,21 @@ def cosyvoice2_example():
 
     # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
     # zero_shot usage
-    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav')):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # save zero_shot spk for future usage
     assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', 'my_zero_shot_spk') is True
-    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk')):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
     cosyvoice.save_spkinfo()
 
     # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
-    for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', './asset/zero_shot_prompt.wav')):
         torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # instruct usage
-    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话<|endofprompt|>', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话<|endofprompt|>', './asset/zero_shot_prompt.wav')):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # bistream usage, you can use generator as input, this is useful when using text llm model as input
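The bistream comment above refers to passing a generator instead of a string; the `text_generator()` used in the next hunk is defined in lines not shown here. A plausible sketch of such a generator, purely illustrative and not the repository's actual definition:

```python
def text_generator():
    # In practice the pieces could be text streamed token-by-token from an LLM.
    for piece in ['收到好友从远方寄来的生日礼物,', '那份意外的惊喜与深深的祝福',
                  '让我心中充满了甜蜜的快乐,', '笑容如花儿般绽放。']:
        yield piece


print(''.join(text_generator()))
```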
@@ -64,28 +67,35 @@ def cosyvoice2_example():
     for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
+
 def cosyvoice3_example():
     """ CosyVoice3 Usage, check https://funaudiollm.github.io/cosyvoice3/ for more details
     """
     cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B')
     # zero_shot usage
-    for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
+                                                        './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L280
-    for i, j in enumerate(cosyvoice.inference_cross_lingual('You are a helpful assistant.<|endofprompt|>[breath]因为他们那一辈人[breath]在乡里面住的要习惯一点,[breath]邻居都很活络,[breath]嗯,都很熟悉。[breath]', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_cross_lingual('You are a helpful assistant.<|endofprompt|>[breath]因为他们那一辈人[breath]在乡里面住的要习惯一点,[breath]邻居都很活络,[breath]嗯,都很熟悉。[breath]',
+                                                            './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # instruct usage, for supported control, check cosyvoice/utils/common.py#L28
-    for i, j in enumerate(cosyvoice.inference_instruct2('好少咯,一般系放嗰啲国庆啊,中秋嗰啲可能会咯。', 'You are a helpful assistant. 请用广东话表达。<|endofprompt|>', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_instruct2('好少咯,一般系放嗰啲国庆啊,中秋嗰啲可能会咯。', 'You are a helpful assistant. 请用广东话表达。<|endofprompt|>',
+                                                        './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>',
+                                                        './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # hotfix usage
-    for i, j in enumerate(cosyvoice.inference_zero_shot('高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_zero_shot('高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
+                                                        './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('hotfix_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
+
 def main():
     cosyvoice_example()
     cosyvoice2_example()
@@ -18,18 +18,22 @@ def cosyvoice2_example():
     for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
         continue
 
+
 def cosyvoice3_example():
     """ CosyVoice3 vllm usage
     """
     cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=False)
     for i in tqdm(range(100)):
         set_all_random_seed(i)
-        for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+        for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
+                                                            './asset/zero_shot_prompt.wav', stream=False)):
             continue
 
+
 def main():
     cosyvoice2_example()
     cosyvoice3_example()
 
+
 if __name__ == '__main__':
     main()
webui.py (4 changes)
@@ -42,9 +42,11 @@ def generate_seed():
         "value": seed
     }
 
+
 def change_instruction(mode_checkbox_group):
     return instruct_dict[mode_checkbox_group]
 
+
 def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
                    seed, stream, speed):
     if prompt_wav_upload is not None:
@@ -168,7 +170,7 @@ if __name__ == '__main__':
                         default='pretrained_models/CosyVoice3-0.5B',
                         help='local path or modelscope repo id')
     args = parser.parse_args()
-    model = AutoModel(model_dir=args.model_dir)
+    cosyvoice = AutoModel(model_dir=args.model_dir)
 
     sft_spk = cosyvoice.list_available_spks()
     if len(sft_spk) == 0:
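This last hunk is a genuine bug fix rather than a style rename: the following lines already read the model through the name `cosyvoice`, so binding it to `model` raised a NameError at startup. A minimal reproduction with a stub class:

```python
class AutoModelStub:
    def list_available_spks(self):
        return []


model = AutoModelStub()      # the old code bound the instance to 'model' ...
try:
    cosyvoice.list_available_spks()   # ... but later read it as 'cosyvoice'
except NameError as e:
    print(e)  # name 'cosyvoice' is not defined
```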