From 296ed4f526743aaa3d2b006e460c730592800748 Mon Sep 17 00:00:00 2001
From: hanasay <hanasay16@gmail.com>
Date: Fri, 14 Feb 2025 15:25:45 +0800
Subject: [PATCH] Convert audio to mono while extracting speech tokens
 (modified: tools/extract_speech_token.py)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tools/extract_speech_token.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/extract_speech_token.py b/tools/extract_speech_token.py
index 776b6cf..976a23b 100755
--- a/tools/extract_speech_token.py
+++ b/tools/extract_speech_token.py
@@ -27,6 +27,9 @@ def single_job(utt):
     audio, sample_rate = torchaudio.load(utt2wav[utt], backend='soundfile')
     if sample_rate != 16000:
         audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
+    # Downmix multi-channel audio to mono by averaging over the channel axis (dim 0)
+    if audio.shape[0] > 1:
+        audio = audio.mean(dim=0, keepdim=True)
     if audio.shape[1] / 16000 > 30:
         logging.warning('do not support extract speech token for audio longer than 30s')
         speech_token = []