From 296ed4f526743aaa3d2b006e460c730592800748 Mon Sep 17 00:00:00 2001 From: hanasay Date: Fri, 14 Feb 2025 15:25:45 +0800 Subject: [PATCH] =?UTF-8?q?Convert=20audio=20to=20mono=20while=20extract?= =?UTF-8?q?=20speech=20token=20=09modified=EF=BC=9A=20=20=20=20=20tools/ex?= =?UTF-8?q?tract=5Fspeech=5Ftoken.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/extract_speech_token.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/extract_speech_token.py b/tools/extract_speech_token.py index 776b6cf..976a23b 100755 --- a/tools/extract_speech_token.py +++ b/tools/extract_speech_token.py @@ -27,6 +27,9 @@ def single_job(utt): audio, sample_rate = torchaudio.load(utt2wav[utt], backend='soundfile') if sample_rate != 16000: audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio) + # Convert audio to mono + if audio.shape[0] > 1: + audio = audio.mean(dim=0, keepdim=True) if audio.shape[1] / 16000 > 30: logging.warning('do not support extract speech token for audio longer than 30s') speech_token = []