From fbab274b6a9abe127be61825092ca3a241bff19f Mon Sep 17 00:00:00 2001 From: burkliu Date: Fri, 25 Apr 2025 10:31:43 +0800 Subject: [PATCH] [feature] modify pad to trim Conflicts: cosyvoice/dataset/processor.py --- cosyvoice/dataset/processor.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py index 8424ada..a94eb15 100644 --- a/cosyvoice/dataset/processor.py +++ b/cosyvoice/dataset/processor.py @@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'): def compute_fbank(data, feat_extractor, + token_mel_ratio=0, mode='train'): """ Extract fbank @@ -174,8 +175,13 @@ def compute_fbank(data, assert 'utt' in sample assert 'text_token' in sample waveform = sample['speech'] - mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) - sample['speech_feat'] = mat + feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) + if token_mel_ratio != 0: + # trim to align speech_token and speech_feat + token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0])) + feat = feat[:token_mel_ratio * token_len] + sample["speech_token"] = sample["speech_token"][:token_len] + sample['speech_feat'] = feat yield sample