fix vocoder train

This commit is contained in:
lyuxiang.lx
2025-03-07 16:39:13 +08:00
parent fcc054f64e
commit a69b7e275d
12 changed files with 108 additions and 17 deletions

View File

@@ -91,7 +91,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
conds = conds.transpose(1, 2)
mask = (~make_pad_mask(feat_len)).to(h)
# NOTE 这一句应该是不需要的应该h已经过length_regulator跟feat一样的shape
# NOTE this is unnecessary, feat/h already same shape
feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
loss, _ = self.decoder.compute_loss(
feat.transpose(1, 2).contiguous(),
@@ -117,7 +117,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
embedding = F.normalize(embedding, dim=1)
embedding = self.spk_embed_affine_layer(embedding)
# concat text and prompt_text
# concat speech token and prompt speech token
token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)

View File

@@ -51,6 +51,7 @@ class InterpolateRegulator(nn.Module):
def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
# in inference mode, interpolate prompt token and token (head/mid/tail) separately, so we can get a clear separation point of mel
# NOTE 20 corresponds to token_overlap_len in cosyvoice/cli/model.py
# x in (B, T, D)
if x2.shape[1] > 40:
x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')