From ef36109b0647bc675351597e8fe392ca89a9e82c Mon Sep 17 00:00:00 2001
From: czk32611 <czk32611@hotmail.com>
Date: Fri, 19 Apr 2024 09:14:49 +0800
Subject: [PATCH] <enhance>(inference): support using an image as video
 input(#17 #34)

---
 README.md                     | 10 +++++-----
 scripts/inference.py          | 19 ++++++++++++-------
 scripts/realtime_inference.py |  6 +++---
 3 files changed, 20 insertions(+), 15 deletions(-)
diff --git a/README.md b/README.md
index deb3d89..6ce4055 100644
--- a/README.md
+++ b/README.md
@@ -244,7 +244,7 @@ Here, we provide the inference script.
 python -m scripts.inference --inference_config configs/inference/test.yaml 
 ```
 configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
-The video_path should be either a video file or a directory of images. 
+The video_path should be a video file, an image file, or a directory of images. 
 
 You are recommended to input video with `25fps`, the same fps used when training the model. If your video is far less than 25fps, you are recommended to apply frame interpolation or directly convert the video to 25fps using ffmpeg.
 
@@ -276,12 +276,12 @@ configs/inference/realtime.yaml is the path to the real-time inference configura
     ```
     Inferring using: data/audio/yongen.wav
     ```
-1. While MuseTalk is inferring, sub-threads can simultaneously stream the results to the users. The generation process can achieve up to 50fps on an NVIDIA Tesla V100.
+1. While MuseTalk is inferring, sub-threads can simultaneously stream the results to the users. The generation process can achieve 30fps+ on an NVIDIA Tesla V100.
     ```
     2%|██▍                                                         | 3/141 [00:00<00:32,  4.30it/s]   # inference process
-    Generating the 6-th frame with FPS: 48.58                                                  # playing process
-    Generating the 7-th frame with FPS: 48.74
-    Generating the 8-th frame with FPS: 49.17
+    Displaying the 6-th frame with FPS: 48.58                                                  # display process
+    Displaying the 7-th frame with FPS: 48.74
+    Displaying the 8-th frame with FPS: 49.17
     3%|███▎                                                        | 4/141 [00:00<00:32,  4.21it/s]
     ```
 1. Set `preparation` to `False` and run this script if you want to genrate more videos using the same avatar.
diff --git a/scripts/inference.py b/scripts/inference.py
index 7afc350..ad9de8d 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -36,7 +36,7 @@ def main(args):
         crop_coord_save_path = os.path.join(result_img_save_path, input_basename+".pkl") # only related to video input
         os.makedirs(result_img_save_path,exist_ok =True)
         
-        if args.output_vid_name=="":
+        if args.output_vid_name is None:
             output_vid_name = os.path.join(args.result_dir, output_basename+".mp4")
         else:
             output_vid_name = os.path.join(args.result_dir, args.output_vid_name)
@@ -48,10 +48,16 @@ def main(args):
             os.system(cmd)
             input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
             fps = get_video_fps(video_path)
-        else: # input img folder
+        elif get_file_type(video_path)=="image":
+            input_img_list = [video_path, ]
+            fps = args.fps
+        elif os.path.isdir(video_path):  # input img folder
             input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
             input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
             fps = args.fps
+        else:
+            raise ValueError(f"{video_path} should be a video file, an image file or a directory of images")
+
         #print(input_img_list)
         ############################################## extract audio feature ##############################################
         whisper_feature = audio_processor.audio2feat(audio_path)
@@ -114,12 +120,12 @@ def main(args):
             
             combine_frame = get_image(ori_frame,res_frame,bbox)
             cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png",combine_frame)
-            
-        cmd_img2video = f"ffmpeg -y -v fatal -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 temp.mp4"
+
+        cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 temp.mp4"
         print(cmd_img2video)
         os.system(cmd_img2video)
         
-        cmd_combine_audio = f"ffmpeg -y -v fatal -i {audio_path} -i temp.mp4 {output_vid_name}"
+        cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i temp.mp4 {output_vid_name}"
         print(cmd_combine_audio)
         os.system(cmd_combine_audio)
         
@@ -135,7 +141,7 @@ if __name__ == "__main__":
 
     parser.add_argument("--fps", type=int, default=25)
     parser.add_argument("--batch_size", type=int, default=8)
-    parser.add_argument("--output_vid_name", type=str,default='')
+    parser.add_argument("--output_vid_name", type=str, default=None)
     parser.add_argument("--use_saved_coord",
                         action="store_true",
                         help='use saved coordinate to save time')
@@ -143,4 +149,3 @@ if __name__ == "__main__":
 
     args = parser.parse_args()
     main(args)
-    
\ No newline at end of file
diff --git a/scripts/realtime_inference.py b/scripts/realtime_inference.py
index e5b7765..e9d1338 100644
--- a/scripts/realtime_inference.py
+++ b/scripts/realtime_inference.py
@@ -206,7 +206,7 @@ class Avatar:
             combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
 
             fps = 1/(time.time()-start+1e-6)
-            print(f"Generating the {self.idx}-th frame with FPS: {fps:.2f}")
+            print(f"Displaying the {self.idx}-th frame with FPS: {fps:.2f}")
             cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png",combine_frame)
             self.idx = self.idx + 1
 
@@ -244,12 +244,12 @@ class Avatar:
         
         if out_vid_name is not None: 
             # optional
-            cmd_img2video = f"ffmpeg -y -v fatal -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 {self.avatar_path}/temp.mp4"
+            cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 {self.avatar_path}/temp.mp4"
             print(cmd_img2video)
             os.system(cmd_img2video)
 
             output_vid = os.path.join(self.video_out_path, out_vid_name+".mp4") # on
-            cmd_combine_audio = f"ffmpeg -y -v fatal -i {audio_path} -i {self.avatar_path}/temp.mp4 {output_vid}"
+            cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {self.avatar_path}/temp.mp4 {output_vid}"
             print(cmd_combine_audio)
             os.system(cmd_combine_audio)