diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py index 8424ada..a94eb15 100644 --- a/cosyvoice/dataset/processor.py +++ b/cosyvoice/dataset/processor.py @@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'): def compute_fbank(data, feat_extractor, + token_mel_ratio=0, mode='train'): """ Extract fbank @@ -174,8 +175,13 @@ def compute_fbank(data, assert 'utt' in sample assert 'text_token' in sample waveform = sample['speech'] - mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) - sample['speech_feat'] = mat + feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) + if token_mel_ratio != 0: + # trim to align speech_token and speech_feat + token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0])) + feat = feat[:token_mel_ratio * token_len] + sample["speech_token"] = sample["speech_token"][:token_len] + sample['speech_feat'] = feat yield sample