From 8c19579b1e2e5f11c1e8f72ac5427a6386451561 Mon Sep 17 00:00:00 2001 From: Alexey <56967387+lifeFedorovAlexey@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:42:23 +0300 Subject: [PATCH] fix: convert all audio to WAV 16kHz PCM before processing (#379) --- musetalk/data/dataset.py | 11 +++++++---- musetalk/utils/audio_utils.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 musetalk/utils/audio_utils.py diff --git a/musetalk/data/dataset.py b/musetalk/data/dataset.py index 4b72449..94c90c7 100755 --- a/musetalk/data/dataset.py +++ b/musetalk/data/dataset.py @@ -15,6 +15,7 @@ from decord.ndarray import cpu from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark from musetalk.data import audio +from musetalk.utils.audio_utils import ensure_wav syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync @@ -171,7 +172,8 @@ class FaceDataset(Dataset): """ if not os.path.exists(wav_path): return None - audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000) + wav_path_converted = ensure_wav(wav_path) + audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000) assert sampling_rate == 16000 while start_index >= 25 * 30: @@ -206,11 +208,12 @@ class FaceDataset(Dataset): if not os.path.exists(wav_path): return None - audio_input, sampling_rate = librosa.load(wav_path, sr=16000) + wav_path_converted = ensure_wav(wav_path) + audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000) assert sampling_rate == 16000 - audio_input = self.mel_feature_extractor(audio_input) - return audio_input, start_index + audio_mel = self.mel_feature_extractor(audio_input_librosa) + return audio_mel, start_index def mel_feature_extractor(self, audio_input): """Extract mel spectrogram features diff --git a/musetalk/utils/audio_utils.py b/musetalk/utils/audio_utils.py new file mode 100644 index 0000000..2d1387e --- /dev/null +++ b/musetalk/utils/audio_utils.py @@ -0,0 +1,17 @@ +import os, subprocess + +def ensure_wav(input_path: str, target_path: str | None = None) -> str: + """ + Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg. + Returns path to the converted .wav (original if already correct). + """ + if not isinstance(input_path, str) or not os.path.exists(input_path): + return input_path + base, ext = os.path.splitext(input_path) + ext = ext.lower() + + if target_path is None: + target_path = base + "_16k.wav" + cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path] + subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + return target_path \ No newline at end of file