From e636166b8523f1bfb0ea4688721f4337071b712e Mon Sep 17 00:00:00 2001
From: Chenghao Mou
Date: Fri, 4 Apr 2025 06:04:56 +0100
Subject: [PATCH] fix: floor (#293)

---
 musetalk/utils/audio_processor.py | 44 +++++++++++++++----------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/musetalk/utils/audio_processor.py b/musetalk/utils/audio_processor.py
index 1c41ceb..7601c1b 100755
--- a/musetalk/utils/audio_processor.py
+++ b/musetalk/utils/audio_processor.py
@@ -1,16 +1,17 @@
-import os
 import math
+import os
+
 import librosa
 import numpy as np
 import torch
-
 from einops import rearrange
 from transformers import AutoFeatureExtractor
 
+
 class AudioProcessor:
     def __init__(self, feature_extractor_path="openai/whisper-tiny/"):
         self.feature_extractor = AutoFeatureExtractor.from_pretrained(feature_extractor_path)
-    
+
     def get_audio_feature(self, wav_path, start_index=0, weight_dtype=None):
         if not os.path.exists(wav_path):
             return None
@@ -19,11 +20,11 @@ class AudioProcessor:
         # Split audio into 30s segments
         segment_length = 30 * sampling_rate
         segments = [librosa_output[i:i + segment_length] for i in range(0, len(librosa_output), segment_length)]
-        
+
         features = []
         for segment in segments:
             audio_feature = self.feature_extractor(
-                segment, 
+                segment,
                 return_tensors="pt",
                 sampling_rate=sampling_rate
             ).input_features
@@ -32,13 +33,13 @@ class AudioProcessor:
             features.append(audio_feature)
 
         return features, len(librosa_output)
-    
+
     def get_whisper_chunk(
-        self, 
-        whisper_input_features, 
-        device, 
-        weight_dtype, 
-        whisper, 
+        self,
+        whisper_input_features,
+        device,
+        weight_dtype,
+        whisper,
         librosa_length,
         fps=25,
         audio_padding_length_left=2,
@@ -48,30 +49,30 @@ class AudioProcessor:
         whisper_feature = []
         # Process multiple 30s mel input features
         for input_feature in whisper_input_features:
-            audio_feats = whisper.encoder(input_feature.to(device), output_hidden_states=True).hidden_states 
+            audio_feats = whisper.encoder(input_feature.to(device), output_hidden_states=True).hidden_states
             audio_feats = torch.stack(audio_feats, dim=2).to(weight_dtype)
             whisper_feature.append(audio_feats)
-        
+
         whisper_feature = torch.cat(whisper_feature, dim=1)
         # Trim the last segment to remove padding
         sr = 16000
         audio_fps = 50
         fps = int(fps)
         whisper_idx_multiplier = audio_fps / fps
-        num_frames = math.floor((librosa_length / sr)) * fps
-        actual_length = math.floor((librosa_length / sr)) * audio_fps
+        num_frames = math.floor((librosa_length / sr) * fps)
+        actual_length = math.floor((librosa_length / sr) * audio_fps)
         whisper_feature = whisper_feature[:,:actual_length,...]
-        
+
         # Calculate padding amount
         padding_nums = math.floor(whisper_idx_multiplier)
         # Add padding at start and end
         whisper_feature = torch.cat([
-            torch.zeros_like(whisper_feature[:, :padding_nums * audio_padding_length_left]), 
-            whisper_feature, 
+            torch.zeros_like(whisper_feature[:, :padding_nums * audio_padding_length_left]),
+            whisper_feature,
             # Add extra padding to prevent out of bounds
             torch.zeros_like(whisper_feature[:, :padding_nums * 3 * audio_padding_length_right])
         ], 1)
-        
+
         audio_prompts = []
         for frame_index in range(num_frames):
             try:
@@ -86,7 +87,7 @@ class AudioProcessor:
                 print(f"num frames: {num_frames}, fps: {fps}, whisper_idx_multiplier: {whisper_idx_multiplier}")
                 print(f"frame_index: {frame_index}, audio_index: {audio_index}-{audio_index + audio_feature_length_per_frame}")
                 exit()
-        
+
         audio_prompts = torch.cat(audio_prompts, dim=0) # T, 10, 5, 384
         audio_prompts = rearrange(audio_prompts, 'b c h w -> b (c h) w')
         return audio_prompts
@@ -97,5 +98,4 @@ if __name__ == "__main__":
     audio_feature, librosa_feature_length = audio_processor.get_audio_feature(wav_path)
     print("Audio Feature shape:", audio_feature.shape)
     print("librosa_feature_length:", librosa_feature_length)
-    
-     
\ No newline at end of file
+
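
A quick illustration of why moving the multiplication inside math.floor()
matters (a minimal sketch: the 10.9 s clip length is hypothetical, while
sr, fps, and audio_fps are the constants visible in get_whisper_chunk):

    import math

    sr = 16000                       # audio sample rate, as in the file
    fps = 25                         # video frame rate, as in the file
    audio_fps = 50                   # Whisper feature rate, as in the file
    librosa_length = int(10.9 * sr)  # hypothetical 10.9 s clip

    # Before the patch: floor the duration to whole seconds, then scale.
    print(math.floor(librosa_length / sr) * fps)        # 10 * 25 = 250
    print(math.floor(librosa_length / sr) * audio_fps)  # 10 * 50 = 500

    # After the patch: scale to frames/features first, then floor.
    print(math.floor((librosa_length / sr) * fps))        # floor(272.5) = 272
    print(math.floor((librosa_length / sr) * audio_fps))  # floor(545.0) = 545

Flooring before scaling truncated num_frames and actual_length to whole
seconds, silently dropping up to fps - 1 video frames (and the matching
encoder features) from the end of every clip; flooring after scaling
only drops the final partial frame.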
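
For the surrounding padding logic (just the arithmetic implied by the code
in this hunk, not a behaviour change): with fps=25, whisper_idx_multiplier
= 50 / 25 = 2.0 and padding_nums = floor(2.0) = 2, so the torch.cat
prepends 2 * audio_padding_length_left = 4 zeroed feature steps and
appends 2 * 3 * audio_padding_length_right zeroed steps; the tripled right
padding matches the in-code comment about preventing out-of-bounds
windows at the last frames.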