Mirror of https://github.com/FunAudioLLM/CosyVoice.git (synced 2026-02-05 18:09:24 +08:00)
.github/workflows/lint.yml
@@ -52,5 +52,5 @@ jobs:
 set -eux
 pip install flake8==3.8.2 flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0
 flake8 --version
-flake8 --max-line-length 150 --ignore B006,B008,B905,C408,E402,E731,E741,W503,W504 --exclude ./third_party/,./runtime/python/grpc/cosyvoice_pb2*py
+flake8 --max-line-length 180 --ignore B006,B008,B905,C408,E402,E731,E741,W503,W504 --exclude ./third_party/,./runtime/python/grpc/cosyvoice_pb2*py
 if [ $? != 0 ]; then exit 1; fi
@@ -149,6 +149,11 @@ class CosyVoiceFrontEnd:
         prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
         speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
         speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
+        if resample_rate == 24000:
+            # cosyvoice2, force speech_feat % speech_token = 2
+            token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
+            speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
+            speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
         embedding = self._extract_spk_embedding(prompt_speech_16k)
         model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
                        'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
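The added block is the substantive change here: when resampling the prompt to 24 kHz (CosyVoice2), the mel-feature length is pinned to exactly twice the speech-token length, trimming whichever side overshoots. A minimal rerun of those lines on dummy tensors, assuming `_extract_speech_feat` yields (batch, frames, mel_bins) and `_extract_speech_token` yields (batch, tokens):

```python
import torch

# Dummy prompt features/tokens with the assumed shapes.
speech_feat = torch.randn(1, 103, 80)            # (batch, frames, mel_bins)
speech_feat_len = torch.tensor([103])
speech_token = torch.randint(0, 4096, (1, 50))   # (batch, tokens)
speech_token_len = torch.tensor([50])

# Force the 2:1 frame-to-token ratio, exactly as in the added lines.
token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len

assert speech_feat.shape[1] == 2 * speech_token.shape[1]   # 100 frames vs 50 tokens
```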
@@ -380,8 +380,7 @@ class CosyVoice2Model:
         while True:
             time.sleep(0.1)
             if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len:
-                this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]) \
-                    .unsqueeze(dim=0)
+                this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
                 this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                  prompt_token=flow_prompt_speech_token,
                                                  prompt_feat=prompt_speech_feat,
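Joining the `.unsqueeze(dim=0)` continuation onto one line relies on the 180-character limit raised above; the logic is unchanged. For context, this loop waits until the LLM has emitted `token_hop_len + pre_lookahead_len` tokens beyond what has already been synthesized, then cuts the next chunk for `token2wav`. A rough sketch of that readiness check with made-up numbers (the real values come from the model and flow configuration):

```python
# Hypothetical streaming constants; in the repo they live on the model/flow objects.
token_hop_len = 25
pre_lookahead_len = 3

generated_tokens = list(range(60))   # speech tokens emitted by the LLM so far
token_offset = 25                    # tokens already converted to audio

# Only synthesize once enough lookahead is available past the current offset.
if len(generated_tokens) - token_offset >= token_hop_len + pre_lookahead_len:
    chunk = generated_tokens[:token_offset + token_hop_len + pre_lookahead_len]
    print(f"decode {len(chunk)} tokens, then advance token_offset by {token_hop_len}")
```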
@@ -123,8 +123,8 @@ class ConditionalDecoder(nn.Module):
             input_channel = output_channel
             output_channel = channels[i]
             is_last = i == len(channels) - 1
-            resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal \
-                else ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
+                ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
             transformer_blocks = nn.ModuleList(
                 [
                     BasicTransformerBlock(
@@ -138,7 +138,7 @@ class ConditionalDecoder(nn.Module):
                 ]
             )
             downsample = (
-                Downsample1D(output_channel) if not is_last else \
+                Downsample1D(output_channel) if not is_last else
                 CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
             )
             self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
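Dropping the stray backslash is safe because the whole expression already sits inside parentheses. It also helps to remember how the chained conditional groups: `Downsample1D(...)` when the block is not last, otherwise the causal or plain conv. A tiny illustration of that right-associativity with stand-in strings:

```python
# Python's conditional expression is right-associative:
# a if c1 else b if c2 else c  ==  a if c1 else (b if c2 else c)
def pick(is_last: bool, causal: bool) -> str:
    return (
        "Downsample1D" if not is_last else
        "CausalConv1d" if causal else "Conv1d"
    )

assert pick(is_last=False, causal=True) == "Downsample1D"
assert pick(is_last=True, causal=True) == "CausalConv1d"
assert pick(is_last=True, causal=False) == "Conv1d"
```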
@@ -147,7 +147,7 @@ class ConditionalDecoder(nn.Module):
             input_channel = channels[-1]
             out_channels = channels[-1]
             resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
                 ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
 
             transformer_blocks = nn.ModuleList(
                 [
@@ -251,7 +251,7 @@ class ConditionalDecoder(nn.Module):
             x = rearrange(x, "b c t -> b t c").contiguous()
             # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
             attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
-            attn_mask = mask_to_bias(attn_mask==1, x.dtype)
+            attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
             for transformer_block in transformer_blocks:
                 x = transformer_block(
                     hidden_states=x,
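The `attn_mask == 1` spacing fix is cosmetic (pycodestyle E225); the comparison still produces a boolean mask that is converted into an additive attention bias. A hedged sketch of what a `mask_to_bias`-style helper typically does; the actual helper lives in the repo's utilities and may differ in detail:

```python
import torch

def mask_to_bias_sketch(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # Attendable positions (True) keep bias 0; masked positions get a very
    # large negative bias so softmax drives their attention weights to ~0.
    bias = torch.zeros(mask.shape, dtype=dtype)
    bias.masked_fill_(~mask, torch.finfo(dtype).min)
    return bias

attn_mask = torch.tensor([[True, True, False]])
print(mask_to_bias_sketch(attn_mask, torch.float32))   # [[0., 0., ~-3.4e38]]
```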
@@ -270,7 +270,7 @@ class ConditionalDecoder(nn.Module):
             x = rearrange(x, "b c t -> b t c").contiguous()
             # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
             attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
-            attn_mask = mask_to_bias(attn_mask==1, x.dtype)
+            attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
             for transformer_block in transformer_blocks:
                 x = transformer_block(
                     hidden_states=x,
@@ -287,7 +287,7 @@ class ConditionalDecoder(nn.Module):
             x = rearrange(x, "b c t -> b t c").contiguous()
             # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
             attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
-            attn_mask = mask_to_bias(attn_mask==1, x.dtype)
+            attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
             for transformer_block in transformer_blocks:
                 x = transformer_block(
                     hidden_states=x,
@@ -150,12 +150,12 @@ class ConditionalCFM(BASECFM):
             self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
             # run trt engine
             self.estimator.execute_v2([x.contiguous().data_ptr(),
                                        mask.contiguous().data_ptr(),
                                        mu.contiguous().data_ptr(),
                                        t.contiguous().data_ptr(),
                                        spks.contiguous().data_ptr(),
                                        cond.contiguous().data_ptr(),
                                        x.data_ptr()])
             return x
 
     def compute_loss(self, x1, mask, mu, spks=None, cond=None):
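This hunk only re-wraps the TensorRT call. For reference, `IExecutionContext.execute_v2` takes a flat list of device pointers, one per engine binding, in binding order; in the code above the last binding reuses `x`'s buffer as the output. A rough sketch of that calling pattern, assuming `context` is a `tensorrt.IExecutionContext`, the tensors are contiguous CUDA tensors, and (unlike the code above) a separate output buffer is used for clarity:

```python
import torch

def run_estimator_trt(context, x, mask, mu, t, spks, cond):
    # One device pointer per engine binding, inputs first, output last (assumed order).
    output = torch.empty_like(x)
    bindings = [x.contiguous().data_ptr(),
                mask.contiguous().data_ptr(),
                mu.contiguous().data_ptr(),
                t.contiguous().data_ptr(),
                spks.contiguous().data_ptr(),
                cond.contiguous().data_ptr(),
                output.data_ptr()]
    context.execute_v2(bindings)   # synchronous execution on the bound buffers
    return output
```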
@@ -269,6 +269,7 @@ class QwenTokenizer():
         text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0]
         return text
 
+
 @lru_cache(maxsize=None)
 def get_qwen_tokenizer(
         token_path: str,
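The added line just restores the blank spacing PEP 8 expects before a top-level definition; behavior is unchanged. Since `get_qwen_tokenizer` is wrapped in `@lru_cache(maxsize=None)`, repeated calls with the same arguments return the same tokenizer object instead of reloading it. A small illustration of that caching pattern with a stand-in loader (names and path are placeholders):

```python
from functools import lru_cache

@lru_cache(maxsize=None)
def get_tokenizer_sketch(token_path: str, skip_special_tokens: bool = True):
    # Stand-in for building a tokenizer from disk, as the real function does.
    print(f"loading tokenizer from {token_path}")
    return object()

a = get_tokenizer_sketch("/path/to/tokenizer")
b = get_tokenizer_sketch("/path/to/tokenizer")
assert a is b   # constructed once, served from the cache afterwards
```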
@@ -54,10 +54,7 @@ class Upsample1D(nn.Module):
         self.out_channels = out_channels
         self.stride = stride
         # In this mode, first repeat interpolate, than conv with stride=1
-        self.conv = nn.Conv1d(
-            self.channels, self.out_channels, stride * 2 + 1, stride=1,
-            padding=0,
-        )
+        self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
 
     def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
         outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
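Collapsing the `nn.Conv1d(...)` call onto one line fits the raised 180-character limit; the layer still upsamples by nearest-neighbor interpolation by `stride` and then applies a stride-1 convolution with kernel size `stride * 2 + 1`. A short sketch of that two-step upsampling; the left padding below is an assumption made so the output length matches the interpolated length, not necessarily what the module's `forward` does:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

stride, channels, out_channels = 2, 80, 80
# Same collapsed construction: kernel stride*2+1, stride 1, no built-in padding.
conv = nn.Conv1d(channels, out_channels, stride * 2 + 1, stride=1, padding=0)

x = torch.randn(1, channels, 50)                                    # (batch, channels, time)
up = F.interpolate(x, scale_factor=float(stride), mode="nearest")   # time: 50 -> 100
up = F.pad(up, (stride * 2, 0))                                     # assumed causal left padding
y = conv(up)
print(y.shape)   # torch.Size([1, 80, 100])
```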