mirror of https://github.com/TMElyralab/MuseTalk.git
fix: infer bug
@@ -1,32 +1,58 @@
-import argparse
 import os
-from omegaconf import OmegaConf
-import numpy as np
 import cv2
-import torch
-import glob
-import pickle
-from tqdm import tqdm
 import copy
-
-from musetalk.utils.utils import get_file_type,get_video_fps,datagen
-from musetalk.utils.preprocessing import get_landmark_and_bbox,read_imgs,coord_placeholder
-from musetalk.utils.blending import get_image
-from musetalk.utils.utils import load_all_model
+import glob
+import torch
+import shutil
+import pickle
+import argparse
+import numpy as np
+from tqdm import tqdm
+from omegaconf import OmegaConf
+from transformers import WhisperModel
+
+from musetalk.utils.blending import get_image
+from musetalk.utils.face_parsing import FaceParsing
+from musetalk.utils.audio_processor import AudioProcessor
+from musetalk.utils.utils import get_file_type, get_video_fps, datagen, load_all_model
+from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs, coord_placeholder
 
-# load model weights
-audio_processor, vae, unet, pe = load_all_model()
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-timesteps = torch.tensor([0], device=device)
-
 @torch.no_grad()
 def main(args):
-    global pe
+    # Configure ffmpeg path
+    if args.ffmpeg_path not in os.getenv('PATH'):
+        print("Adding ffmpeg to PATH")
+        os.environ["PATH"] = f"{args.ffmpeg_path}:{os.environ['PATH']}"
+
+    # Set computing device
+    device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
+
+    # Load model weights
+    vae, unet, pe = load_all_model(
+        unet_model_path=args.unet_model_path,
+        vae_type=args.vae_type,
+        unet_config=args.unet_config,
+        device=device
+    )
+    timesteps = torch.tensor([0], device=device)
+
     if args.use_float16 is True:
         pe = pe.half()
         vae.vae = vae.vae.half()
         unet.model = unet.model.half()
 
+    # Initialize audio processor and Whisper model
+    audio_processor = AudioProcessor(feature_extractor_path=args.whisper_dir)
+    weight_dtype = unet.model.dtype
+    whisper = WhisperModel.from_pretrained(args.whisper_dir)
+    whisper = whisper.to(device=device, dtype=weight_dtype).eval()
+    whisper.requires_grad_(False)
+
+    # Initialize face parser
+    fp = FaceParsing()
+
     inference_config = OmegaConf.load(args.inference_config)
     print(inference_config)
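Note: the rewritten entry point threads an explicit device through model loading instead of relying on module-level globals. A minimal sketch of the new initialization order, using only the imports and calls visible in the hunk above and the default paths added to the argument parser below:

    import torch
    from transformers import WhisperModel
    from musetalk.utils.utils import load_all_model
    from musetalk.utils.audio_processor import AudioProcessor
    from musetalk.utils.face_parsing import FaceParsing

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # VAE, UNet and positional encoder are loaded onto the chosen device
    vae, unet, pe = load_all_model(
        unet_model_path="./models/musetalk/pytorch_model.bin",
        vae_type="sd-vae",
        unet_config="./models/musetalk/config.json",
        device=device,
    )

    # Whisper is kept frozen, in the UNet's dtype, as a feature extractor only
    audio_processor = AudioProcessor(feature_extractor_path="./models/whisper")
    whisper = WhisperModel.from_pretrained("./models/whisper")
    whisper = whisper.to(device=device, dtype=unet.model.dtype).eval()
    whisper.requires_grad_(False)

    fp = FaceParsing()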
@@ -64,10 +90,20 @@ def main(args):
         else:
             raise ValueError(f"{video_path} should be a video file, an image file or a directory of images")
 
-        #print(input_img_list)
-        ############################################## extract audio feature ##############################################
-        whisper_feature = audio_processor.audio2feat(audio_path)
-        whisper_chunks = audio_processor.feature2chunks(feature_array=whisper_feature,fps=fps)
+        # Extract audio features
+        whisper_input_features, librosa_length = audio_processor.get_audio_feature(audio_path)
+        whisper_chunks = audio_processor.get_whisper_chunk(
+            whisper_input_features,
+            device,
+            weight_dtype,
+            whisper,
+            librosa_length,
+            fps=fps,
+            audio_padding_length_left=args.audio_padding_length_left,
+            audio_padding_length_right=args.audio_padding_length_right,
+        )
 
         ############################################## preprocess input image ##############################################
         if os.path.exists(crop_coord_save_path) and args.use_saved_coord:
             print("using extracted coordinates")
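The old audio2feat/feature2chunks pair is replaced by a two-step API: get_audio_feature appears to run the Whisper feature extractor over the input audio, and get_whisper_chunk slices the frozen encoder's output into one chunk per video frame, with configurable left/right context padding. A sketch of a standalone call, using the defaults this commit adds to the argument parser (audio_path, device, weight_dtype, whisper and audio_processor are assumed to come from the initialization sketch above):

    whisper_input_features, librosa_length = audio_processor.get_audio_feature(audio_path)
    whisper_chunks = audio_processor.get_whisper_chunk(
        whisper_input_features,
        device,
        weight_dtype,
        whisper,
        librosa_length,
        fps=25,                        # one chunk per output video frame
        audio_padding_length_left=2,   # extra audio context on the left
        audio_padding_length_right=2,  # ... and on the right
    )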
@@ -102,10 +138,7 @@ def main(args):
         gen = datagen(whisper_chunks,input_latent_list_cycle,batch_size)
         res_frame_list = []
         for i, (whisper_batch,latent_batch) in enumerate(tqdm(gen,total=int(np.ceil(float(video_num)/batch_size)))):
-            audio_feature_batch = torch.from_numpy(whisper_batch)
-            audio_feature_batch = audio_feature_batch.to(device=unet.device,
-                                                         dtype=unet.model.dtype) # torch, B, 5*N,384
-            audio_feature_batch = pe(audio_feature_batch)
+            audio_feature_batch = pe(whisper_batch)
             latent_batch = latent_batch.to(dtype=unet.model.dtype)
 
             pred_latents = unet.model(latent_batch, timesteps, encoder_hidden_states=audio_feature_batch).sample
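This hunk looks like the inference bug the commit title refers to: whisper_chunks now arrives from get_whisper_chunk as torch tensors, so torch.from_numpy(whisper_batch), which only accepts numpy arrays, would fail, and the extra device/dtype hop is redundant; the positional encoder pe is now applied to the batch directly. If a caller needed to tolerate both the old numpy batches and the new tensor batches, a defensive helper could look like this (hypothetical, not part of the commit):

    import numpy as np
    import torch

    def to_audio_feature_batch(whisper_batch, pe, device, dtype):
        """Accept an ndarray (old datagen) or a tensor (new datagen)."""
        if isinstance(whisper_batch, np.ndarray):
            whisper_batch = torch.from_numpy(whisper_batch)
        whisper_batch = whisper_batch.to(device=device, dtype=dtype)
        return pe(whisper_batch)  # apply positional encoding exactly once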
@@ -122,10 +155,10 @@ def main(args):
             try:
                 res_frame = cv2.resize(res_frame.astype(np.uint8),(x2-x1,y2-y1))
             except:
                 # print(bbox)
                 continue
-
-            combine_frame = get_image(ori_frame,res_frame,bbox)
+            # Merge results
+            combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], fp=fp)
             cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png",combine_frame)
 
         cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 temp.mp4"
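get_image now receives the crop box as an explicit [x1, y1, x2, y2] list plus the shared FaceParsing instance fp, presumably so the blending step can reuse one parser across all frames rather than working from the raw bbox. The condensed per-frame flow, with variable names as in the hunk above:

    # res_frame: generated face crop; ori_frame: original full frame
    x1, y1, x2, y2 = bbox
    res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
    combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], fp=fp)
    cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png", combine_frame)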
@@ -142,11 +175,11 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--ffmpeg_path", type=str, default="./ffmpeg-4.4-amd64-static/", help="Path to ffmpeg executable")
     parser.add_argument("--inference_config", type=str, default="configs/inference/test_img.yaml")
     parser.add_argument("--bbox_shift", type=int, default=0)
     parser.add_argument("--result_dir", default='./results', help="path to output")
-
-    parser.add_argument("--fps", type=int, default=25)
+    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID to use")
     parser.add_argument("--batch_size", type=int, default=8)
     parser.add_argument("--output_vid_name", type=str, default=None)
     parser.add_argument("--use_saved_coord",
@@ -156,6 +189,12 @@ if __name__ == "__main__":
                         action="store_true",
                         help="Whether use float16 to speed up inference",
     )
-
+    parser.add_argument("--fps", type=int, default=25, help="Video frames per second")
+    parser.add_argument("--unet_model_path", type=str, default="./models/musetalk/pytorch_model.bin", help="Path to UNet model weights")
+    parser.add_argument("--vae_type", type=str, default="sd-vae", help="Type of VAE model")
+    parser.add_argument("--unet_config", type=str, default="./models/musetalk/config.json", help="Path to UNet configuration file")
+    parser.add_argument("--whisper_dir", type=str, default="./models/whisper", help="Directory containing Whisper model")
+    parser.add_argument("--audio_padding_length_left", type=int, default=2, help="Left padding length for audio")
+    parser.add_argument("--audio_padding_length_right", type=int, default=2, help="Right padding length for audio")
     args = parser.parse_args()
     main(args)
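Taken together, the new flags make every model path explicit instead of hard-coded. A typical invocation of the updated script might look like the following (the script filename is an assumption; the paths are simply the defaults added above):

    python inference.py \
        --inference_config configs/inference/test_img.yaml \
        --unet_model_path ./models/musetalk/pytorch_model.bin \
        --unet_config ./models/musetalk/config.json \
        --whisper_dir ./models/whisper \
        --gpu_id 0 --use_float16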