Convert audio to mono while extracting speech tokens

modified:     tools/extract_speech_token.py
This commit is contained in:
hanasay
2025-02-14 15:25:45 +08:00
parent 95e99e0417
commit 296ed4f526

View File

@@ -27,6 +27,9 @@ def single_job(utt):
audio, sample_rate = torchaudio.load(utt2wav[utt], backend='soundfile') audio, sample_rate = torchaudio.load(utt2wav[utt], backend='soundfile')
if sample_rate != 16000: if sample_rate != 16000:
audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio) audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
# Convert audio to mono
if audio.shape[0] > 1:
audio = audio.mean(dim=0, keepdim=True)
if audio.shape[1] / 16000 > 30: if audio.shape[1] / 16000 > 30:
logging.warning('do not support extract speech token for audio longer than 30s') logging.warning('do not support extract speech token for audio longer than 30s')
speech_token = [] speech_token = []