From 038ff9f353b21c98c54b744eaa19ba9b3674c35a Mon Sep 17 00:00:00 2001 From: burkliu Date: Fri, 25 Apr 2025 10:31:43 +0800 Subject: [PATCH] [feature] modify pad to trim --- cosyvoice/dataset/processor.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py index 8ac82a1..08030d6 100644 --- a/cosyvoice/dataset/processor.py +++ b/cosyvoice/dataset/processor.py @@ -177,11 +177,10 @@ def compute_fbank(data, waveform = sample['speech'] feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) - # padding with replicate mode (align to speech_token len * token_mel_ratio) - pad_len = sample["speech_token"].shape[0] * token_mel_ratio - feat.shape[0] - if pad_len > 0: - feat_to_pad = feat[-1:].repeat((pad_len, 1)) - feat = torch.cat([feat, feat_to_pad], dim=0) + # trim to align speech_token and speech_feat + token_len = min(feat.shape[0] // token_mel_ratio, sample["speech_token"].shape[0]) + feat = feat[:token_mel_ratio * token_len] + sample["speech_token"] = sample["speech_token"][:token_len] sample['speech_feat'] = feat yield sample