Mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-04 17:39:25 +08:00)

Commit: fix lint
@@ -156,9 +156,9 @@ class CosyVoice2(CosyVoice):
                                           '{}/spk2info.pt'.format(model_dir),
                                           configs['allowed_special'])
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
-            load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
+        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or load_vllm is True or fp16 is True):
+            load_jit, load_trt, load_vllm, fp16 = False, False, False, False
+            logging.warning('no cuda device, set load_jit/load_trt/load_vllm/fp16 to False')
         self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
         self.model.load('{}/llm.pt'.format(model_dir),
                         '{}/flow.pt'.format(model_dir),
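Note: the guard in this hunk silently downgrades acceleration options when no GPU is present, now covering the new load_vllm flag as well. A minimal sketch of the same pattern in isolation (the flag names follow the hunk; the helper itself is illustrative):

    import logging
    import torch

    def resolve_acceleration_flags(load_jit, load_trt, load_vllm, fp16):
        # JIT/TensorRT/vLLM acceleration and fp16 inference all require CUDA,
        # so fall back to plain fp32 eager execution on CPU-only machines.
        if not torch.cuda.is_available() and (load_jit or load_trt or load_vllm or fp16):
            logging.warning('no cuda device, set load_jit/load_trt/load_vllm/fp16 to False')
            return False, False, False, False
        return load_jit, load_trt, load_vllm, fp16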
@@ -174,11 +174,7 @@ class CosyVoice2(CosyVoice):
                         self.fp16)
         del configs
-
-    def inference_instruct(self, *args, **kwargs):
-        raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!')
-
     def inference_instruct2(self, tts_text, instruct_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
         assert isinstance(self.model, CosyVoice2Model) or isinstance(self.model, CosyVoice3Model), 'inference_instruct2 is only implemented for CosyVoice2 and CosyVoice3!'
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
             model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_wav, self.sample_rate, zero_shot_spk_id)
             start_time = time.time()
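Note: with inference_instruct gone, inference_instruct2 is the instruct entry point. A hypothetical call based on the signature above (the model directory, prompt wav handling, instruct text, and the 'tts_speech' output key are assumptions for illustration, not verbatim repo usage):

    # Hypothetical usage sketch; paths and the output key are assumed.
    from cosyvoice.cli.cosyvoice import CosyVoice2

    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B')
    for out in cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物。',
                                             'You are a helpful assistant. 请用开心的语气说。<endofprompt>',
                                             'prompt.wav', stream=False, speed=1.0):
        print(out['tts_speech'].shape)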
@@ -436,4 +436,4 @@ class CosyVoice3Model(CosyVoice2Model):
             tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
             tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
             self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
-            return tts_speech
\ No newline at end of file
+            return tts_speech
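Note: the speech_offset bookkeeping lets each streaming call re-synthesize from the cached features while returning only the samples not yet emitted. The pattern in isolation (a simplified cache, not the repo's hift_cache_dict layout):

    import torch

    cache = {'speech_offset': 0}

    def emit_new_audio(full_speech: torch.Tensor) -> torch.Tensor:
        # Drop samples already returned by earlier chunks, then advance the offset.
        new_speech = full_speech[:, cache['speech_offset']:]
        cache['speech_offset'] += new_speech.shape[1]
        return new_speech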
@@ -476,7 +476,7 @@ class JointAttnProcessor:
         # Split the attention outputs.
         x, c = (
             x[:, : residual.shape[1]],
-            x[:, residual.shape[1] :],
+            x[:, residual.shape[1]:],
         )
 
         # linear proj
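Note: this hunk fixes flake8's E203 ("whitespace before ':'"), which fires on a space before the slice colon; a space after the colon, as in the untouched line above it, is fine. A tiny standalone illustration:

    import torch

    x = torch.arange(10)
    n = 4
    head = x[:n]     # ok
    tail = x[n:]     # E203-clean: no space before the slice colon
    # tail = x[n :]  # flake8 E203: whitespace before ':'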
@@ -402,11 +402,12 @@ class CausalMaskedDiffWithDiT(torch.nn.Module):
         assert feat.shape[2] == mel_len2
         return feat.float(), None
 
+
 if __name__ == '__main__':
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
     from hyperpyyaml import load_hyperpyyaml
-    with open('./pretrained_models/CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
+    with open('./pretrained_models/Fun-CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
         configs = load_hyperpyyaml(f, overrides={'llm': None, 'hift': None})
     model = configs['flow']
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -425,6 +426,7 @@ if __name__ == '__main__':
     pred_gt, _ = model.inference(token, token_len, prompt_token, prompt_token_len, prompt_feat, prompt_feat_len, prompt_embedding, streaming=True, finalize=True)
     for i in range(0, max_len, chunk_size):
         finalize = True if i + chunk_size + context_size >= max_len else False
-        pred_chunk, _ = model.inference(token[:, :i + chunk_size + context_size], torch.tensor([token[:, :i + chunk_size + context_size].shape[1]]).to(device), prompt_token, prompt_token_len, prompt_feat, prompt_feat_len, prompt_embedding, streaming=True, finalize=finalize)
+        pred_chunk, _ = model.inference(token[:, :i + chunk_size + context_size], torch.tensor([token[:, :i + chunk_size + context_size].shape[1]]).to(device),
+                                        prompt_token, prompt_token_len, prompt_feat, prompt_feat_len, prompt_embedding, streaming=True, finalize=finalize)
         pred_chunk = pred_chunk[:, :, i * model.token_mel_ratio:]
-        print((pred_gt[:, :, i * model.token_mel_ratio: i * model.token_mel_ratio + pred_chunk.shape[2]] - pred_chunk).abs().max().item())
\ No newline at end of file
+        print((pred_gt[:, :, i * model.token_mel_ratio: i * model.token_mel_ratio + pred_chunk.shape[2]] - pred_chunk).abs().max().item())
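Note: this __main__ block checks that chunked streaming inference matches a single full pass: each iteration feeds the token prefix plus context_size lookahead tokens, keeps only the frames past i * model.token_mel_ratio, and prints the max absolute deviation from the ground-truth pass. A self-contained analogue with a toy causal "model" (cumulative sum stands in for the flow decoder; all names are illustrative):

    import torch

    token_mel_ratio = 2              # output frames per input token (assumed)
    chunk_size, context_size = 4, 2
    tokens = torch.randn(1, 16)

    def infer(prefix: torch.Tensor) -> torch.Tensor:
        # Toy causal stand-in: outputs for a prefix equal the prefix of the full output.
        return prefix.repeat_interleave(token_mel_ratio, dim=1).cumsum(dim=1)

    pred_gt = infer(tokens)
    for i in range(0, tokens.shape[1], chunk_size):
        pred_chunk = infer(tokens[:, :i + chunk_size + context_size])[:, i * token_mel_ratio:]
        gt = pred_gt[:, i * token_mel_ratio: i * token_mel_ratio + pred_chunk.shape[1]]
        print((gt - pred_chunk).abs().max().item())  # 0.0 for an exactly causal model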
@@ -100,4 +100,4 @@ class CausalConvRNNF0Predictor(nn.Module):
         for i in range(1, len(self.condnet)):
             x = self.condnet[i](x)
         x = x.transpose(1, 2)
-        return torch.abs(self.classifier(x).squeeze(-1))
\ No newline at end of file
+        return torch.abs(self.classifier(x).squeeze(-1))
@@ -342,11 +342,9 @@ class SourceModuleHnNSF(torch.nn.Module):
 
         # to produce sine waveforms
         if sinegen_type == '1':
-            self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
-                                     sine_amp, add_noise_std, voiced_threshod)
+            self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod)
         else:
-            self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
-                                      sine_amp, add_noise_std, voiced_threshod, causal=causal)
+            self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num, sine_amp, add_noise_std, voiced_threshod, causal=causal)
 
         # to merge source harmonics into a single excitation
         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
@@ -675,7 +673,8 @@ class CausalHiFTGenerator(HiFTGenerator):
             x = self.conv_pre(x)
         else:
             x = self.conv_pre(x[:, :, :-self.conv_pre_look_right], x[:, :, -self.conv_pre_look_right:])
-            s_stft_real, s_stft_imag = s_stft_real[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)], s_stft_imag[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)]
+            s_stft_real = s_stft_real[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)]
+            s_stft_imag = s_stft_imag[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)]
         s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
 
         for i in range(self.num_upsamples):
@@ -711,7 +710,7 @@ class CausalHiFTGenerator(HiFTGenerator):
 
     @torch.inference_mode()
     def inference(self, speech_feat: torch.Tensor, finalize: bool = True) -> torch.Tensor:
-        # mel->f0
+        # mel->f0 NOTE f0_predictor precision is crucial for causal inference, move self.f0_predictor to cpu if necessary
         self.f0_predictor.to('cpu')
         f0 = self.f0_predictor(speech_feat.cpu(), finalize=finalize).to(speech_feat)
         # f0->source
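Note: the new comment documents why the F0 predictor runs on CPU: chunked (causal) inference needs numerically stable pitch values across calls, and CPU fp32 avoids fp16 rounding and nondeterministic GPU kernels. The pattern in isolation (a stand-in module; names are illustrative):

    import torch

    predictor = torch.nn.Linear(80, 1)  # stand-in for the pitch predictor
    predictor.to('cpu')                 # keep the precision-sensitive module in fp32 on CPU

    mel = torch.randn(1, 200, 80)       # imagine this on GPU, possibly fp16
    f0 = predictor(mel.cpu().float()).to(mel)  # compute on CPU, cast back to mel's device/dtype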
@@ -729,7 +728,7 @@ if __name__ == '__main__':
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
     from hyperpyyaml import load_hyperpyyaml
-    with open('./pretrained_models/CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
+    with open('./pretrained_models/Fun-CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
         configs = load_hyperpyyaml(f, overrides={'llm': None, 'flow': None})
     model = configs['hift']
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -742,4 +741,4 @@ if __name__ == '__main__':
         finalize = True if i + chunk_size + context_size >= max_len else False
         pred_chunk, _ = model.inference(mel[:, :, : i + chunk_size + context_size], finalize=finalize)
         pred_chunk = pred_chunk[:, i * 480:]
-        print((pred_gt[:, i * 480:i * 480 + pred_chunk.shape[1]] - pred_chunk).abs().max().item())
\ No newline at end of file
+        print((pred_gt[:, i * 480:i * 480 + pred_chunk.shape[1]] - pred_chunk).abs().max().item())
@@ -369,7 +369,8 @@ class Qwen2LM(TransformerLM):
         speech_token_emb = self.speech_embedding(speech_token)
 
         # 3. prepare llm_input/target
-        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb, speech_token, speech_token_emb, speech_token_len)
+        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb,
+                                                                         speech_token, speech_token_emb, speech_token_len)
         lm_target = lm_target.to(device)
 
         # 4. run lm forward
@@ -685,7 +686,8 @@ class CosyVoice3LM(Qwen2LM):
         speech_token_emb = self.speech_embedding(speech_token)
 
         # 3. prepare llm_input/target
-        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb, speech_token, speech_token_emb, speech_token_len)
+        lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb,
+                                                                         speech_token, speech_token_emb, speech_token_len)
         lm_target = lm_target.to(device)
 
         # 4. run lm forward
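Note: both hunks only wrap the same over-long call. prepare_lm_input_target packs, per utterance, the sos embedding, text embeddings, a task-id embedding, and speech-token embeddings into one LM input, with a target that masks everything before the speech tokens out of the loss. A hedged single-sample sketch (IGNORE_ID, the layout, and the omitted end-of-speech token are assumptions, not the repo's exact code):

    import torch

    IGNORE_ID = -100  # assumed loss-mask value

    def pack_one(sos_emb, text_emb, task_id_emb, speech_emb, speech_tokens):
        # Input: [sos, text..., task_id, speech...]; loss only on the speech tokens.
        lm_input = torch.cat([sos_emb, text_emb, task_id_emb, speech_emb], dim=0)
        prefix_len = sos_emb.shape[0] + text_emb.shape[0] + task_id_emb.shape[0]
        lm_target = torch.cat([torch.full((prefix_len,), IGNORE_ID), speech_tokens])
        return lm_input, lm_target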
@@ -202,11 +202,11 @@ class CausalConv1dDownSample(torch.nn.Conv1d):
                  dtype=None
                  ) -> None:
         super(CausalConv1dDownSample, self).__init__(in_channels, out_channels,
-                kernel_size, stride,
-                padding=0, dilation=dilation,
-                groups=groups, bias=bias,
-                padding_mode=padding_mode,
-                device=device, dtype=dtype)
+                                                     kernel_size, stride,
+                                                     padding=0, dilation=dilation,
+                                                     groups=groups, bias=bias,
+                                                     padding_mode=padding_mode,
+                                                     device=device, dtype=dtype)
         assert stride != 1 and dilation == 1
         assert kernel_size % stride == 0
         self.causal_padding = stride - 1
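Note: CausalConv1dDownSample forces padding=0 and handles left context itself so the strided convolution stays causal. One common formulation pads kernel_size - stride zeros on the left (supplied from cache when streaming), giving exactly input_len / stride outputs; this sketch shows that idea, not the class's exact cache bookkeeping:

    import torch
    import torch.nn.functional as F

    conv = torch.nn.Conv1d(4, 8, kernel_size=6, stride=3, padding=0)
    x = torch.randn(1, 4, 30)                        # length divisible by stride
    pad_left = conv.kernel_size[0] - conv.stride[0]  # past context only, no future leakage
    y = conv(F.pad(x, (pad_left, 0)))                # shape (1, 8, 10) == 30 / 3 frames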
@@ -236,11 +236,11 @@ class CausalConv1dUpsample(torch.nn.Conv1d):
                  dtype=None
                  ) -> None:
         super(CausalConv1dUpsample, self).__init__(in_channels, out_channels,
-                kernel_size, 1,
-                padding=0, dilation=dilation,
-                groups=groups, bias=bias,
-                padding_mode=padding_mode,
-                device=device, dtype=dtype)
+                                                   kernel_size, 1,
+                                                   padding=0, dilation=dilation,
+                                                   groups=groups, bias=bias,
+                                                   padding_mode=padding_mode,
+                                                   device=device, dtype=dtype)
         assert dilation == 1
         self.causal_padding = kernel_size - 1
         self.upsample = torch.nn.Upsample(scale_factor=stride, mode='nearest')
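Note: CausalConv1dUpsample first nearest-neighbor upsamples by stride, then applies a stride-1 convolution whose kernel_size - 1 samples of left padding (the causal_padding above) come from zeros or cached history, so the output keeps the upsampled length without seeing future samples. An illustrative sketch:

    import torch
    import torch.nn.functional as F

    stride, kernel_size = 2, 5
    up = torch.nn.Upsample(scale_factor=stride, mode='nearest')
    conv = torch.nn.Conv1d(4, 4, kernel_size, stride=1, padding=0)

    x = torch.randn(1, 4, 16)
    h = up(x)                                  # (1, 4, 32)
    y = conv(F.pad(h, (kernel_size - 1, 0)))   # (1, 4, 32), causal left padding only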
@@ -255,4 +255,4 @@ class CausalConv1dUpsample(torch.nn.Conv1d):
         x = torch.concat([cache, x], dim=2)
         x = super(CausalConv1dUpsample, self).forward(x)
         assert input_timestep == x.shape[2]
-        return x
\ No newline at end of file
+        return x
@@ -52,6 +52,7 @@ instruct_list = ["You are a helpful assistant. 请用广东话表达。<endofprompt>",
                  "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<endofprompt>",
                  "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<endofprompt>"]
 
+
 def pad_list(xs: List[torch.Tensor], pad_value: int):
     """Perform padding for the list of tensors.
 
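Note: this last hunk only adds a blank line before pad_list (PEP 8 expects two blank lines before a top-level def). For reference, a hedged sketch of what an ESPnet-style pad_list with this signature typically does (not necessarily the repo's exact implementation):

    from typing import List

    import torch

    def pad_list(xs: List[torch.Tensor], pad_value: int) -> torch.Tensor:
        """Stack variable-length tensors into one batch, padding with pad_value."""
        max_len = max(x.shape[0] for x in xs)
        out = xs[0].new_full((len(xs), max_len, *xs[0].shape[1:]), pad_value)
        for i, x in enumerate(xs):
            out[i, :x.shape[0]] = x
        return out

    print(pad_list([torch.ones(2), torch.ones(4)], 0))
    # tensor([[1., 1., 0., 0.],
    #         [1., 1., 1., 1.]])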