v1.5

2026-02-05 18:09:19 +08:00 · 2025-03-28 16:03:02 +08:00
parent 058f7ddc7f
commit db204311a5
46 changed files with 729 additions and 204 deletions
--- a/musetalk/utils/blending.py
+++ b/musetalk/utils/blending.py
@@ -2,9 +2,6 @@ from PIL import Image
 import numpy as np
 import cv2
 import copy
-from face_parsing import FaceParsing
-
-fp = FaceParsing()

 def get_crop_box(box, expand):
    x, y, x1, y1 = box
@@ -14,46 +11,98 @@ def get_crop_box(box, expand):
    crop_box = [x_c-s, y_c-s, x_c+s, y_c+s]
    return crop_box, s

-def face_seg(image):
-    seg_image = fp(image)
+def face_seg(image, mode="jaw", fp=None):
+    """
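+    Run the face parser `fp` on `image` with the given `mode` and return the resulting mask (`fp` must be callable; the None default cannot be called).
+
+    Args:
+        image (PIL.Image): Input image.
+
+    Returns:
+        PIL.Image: Mask resized to the input image's size, or None when `fp` returns None.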
+    """
+    seg_image = fp(image, mode=mode)  # 使用 FaceParsing 模型解析面部
    if seg_image is None:
-        print("error, no person_segment")
+        print("error, no person_segment")  # parser returned no result; None is returned below
        return None

-    seg_image = seg_image.resize(image.size)
+    seg_image = seg_image.resize(image.size)  # 将掩码图像调整为输入图像的大小
    return seg_image

-def get_image(image,face,face_box,upper_boundary_ratio = 0.5,expand=1.2):
-    #print(image.shape)
-    #print(face.shape)
+
+def get_image(image, face, face_box, upper_boundary_ratio=0.5, expand=1.5, mode="raw", fp=None):
+    """
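+    Paste a face crop back into the full frame, blending it in through a blurred face-parsing mask.
+
+    Args:
+        image (numpy.ndarray): Full original frame (the "body").
+        face (numpy.ndarray): Face crop that is pasted into the face_box region.
+        face_box (tuple): Face bounding box (x, y, x1, y1) in frame coordinates.
+        upper_boundary_ratio (float): Fraction of the expanded crop's height, measured from the top, where the mask is forced to zero so the original pixels are kept.
+        expand (float): Scale factor passed to get_crop_box to enlarge the crop around the face.
+        mode: How the blending mask is built; forwarded to the face parser.
+        fp: Face-parsing callable used by face_seg; must not be None.
+    Returns:
+        numpy.ndarray: Blended frame, in the same channel order as `image`.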
+    """
+    # 将 numpy 数组转换为 PIL 图像
+    body = Image.fromarray(image[:, :, ::-1])  # 身体部分图像(整张图)
+    face = Image.fromarray(face[:, :, ::-1])  # 面部图像
+
+    x, y, x1, y1 = face_box  # 获取面部边界框的坐标
+    crop_box, s = get_crop_box(face_box, expand)  # 计算扩展后的裁剪框
+    x_s, y_s, x_e, y_e = crop_box  # 裁剪框的坐标
+    face_position = (x, y)  # 面部在原始图像中的位置
+
+    # 从身体图像中裁剪出扩展后的面部区域（下巴到边界有距离）
+    face_large = body.crop(crop_box)
+        
+    ori_shape = face_large.size  # 裁剪后图像的原始尺寸
+
+    # 对裁剪后的面部区域进行面部解析，生成掩码
+    mask_image = face_seg(face_large, mode=mode, fp=fp)
    
+    mask_small = mask_image.crop((x - x_s, y - y_s, x1 - x_s, y1 - y_s))  # 裁剪出面部区域的掩码
+    
+    mask_image = Image.new('L', ori_shape, 0)  # 创建一个全黑的掩码图像
+    mask_image.paste(mask_small, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))  # 将面部掩码粘贴到全黑图像上
+    
+    
+    # Keep only the mask rows below top_boundary (the lower, talking region); rows above it stay black
+    width, height = mask_image.size
+    top_boundary = int(height * upper_boundary_ratio)  # 计算上半部分的边界
+    modified_mask_image = Image.new('L', ori_shape, 0)  # 创建一个新的全黑掩码图像
+    modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))  # paste the lower part of the mask back at its original offset
+    
+    
+    # 对掩码进行高斯模糊，使边缘更平滑
+    blur_kernel_size = int(0.05 * ori_shape[0] // 2 * 2) + 1  # 计算模糊核大小
+    mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)  # 高斯模糊
+    #mask_array = np.array(modified_mask_image)
+    mask_image = Image.fromarray(mask_array)  # 将模糊后的掩码转换回 PIL 图像
+    
+    # 将裁剪的面部图像粘贴回扩展后的面部区域
+    face_large.paste(face, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
+    
+    body.paste(face_large, crop_box[:2], mask_image)
+    
+    # 不用掩码，完全用infer
+    #face_large.save("debug/checkpoint_6_face_large.png")
+
+    body = np.array(body)  # 将 PIL 图像转换回 numpy 数组
+
+    return body[:, :, ::-1]  # reverse the channel axis again, undoing the flip applied when the PIL images were built
+
+def get_image_blending(image,face,face_box,mask_array,crop_box):
    body = Image.fromarray(image[:,:,::-1])
    face = Image.fromarray(face[:,:,::-1])

-    x, y, x1, y1 = face_box 
-    #print(x1-x,y1-y)
-    crop_box, s = get_crop_box(face_box, expand)
+    x, y, x1, y1 = face_box
    x_s, y_s, x_e, y_e = crop_box
-    face_position = (x, y)
-
    face_large = body.crop(crop_box)
-    ori_shape = face_large.size

-    mask_image = face_seg(face_large)
-    mask_small = mask_image.crop((x-x_s, y-y_s, x1-x_s, y1-y_s))
-    mask_image = Image.new('L', ori_shape, 0)
-    mask_image.paste(mask_small, (x-x_s, y-y_s, x1-x_s, y1-y_s))
-
-    # keep upper_boundary_ratio of talking area
-    width, height = mask_image.size
-    top_boundary = int(height * upper_boundary_ratio)
-    modified_mask_image = Image.new('L', ori_shape, 0)
-    modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))
-
-    blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
-    mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
    mask_image = Image.fromarray(mask_array)
-    
+    mask_image = mask_image.convert("L")
    face_large.paste(face, (x-x_s, y-y_s, x1-x_s, y1-y_s))
    body.paste(face_large, crop_box[:2], mask_image)
    body = np.array(body)
@@ -84,17 +133,3 @@ def get_image_prepare_material(image,face_box,upper_boundary_ratio = 0.5,expand=
    blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
    mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
    return mask_array,crop_box
-
-def get_image_blending(image,face,face_box,mask_array,crop_box):
-    body = image
-    x, y, x1, y1 = face_box
-    x_s, y_s, x_e, y_e = crop_box
-    face_large = copy.deepcopy(body[y_s:y_e, x_s:x_e])
-    face_large[y-y_s:y1-y_s, x-x_s:x1-x_s]=face
-
-    mask_image = cv2.cvtColor(mask_array,cv2.COLOR_BGR2GRAY)
-    mask_image = (mask_image/255).astype(np.float32)
-
-    body[y_s:y_e, x_s:x_e] = cv2.blendLinear(face_large,body[y_s:y_e, x_s:x_e],mask_image,1-mask_image)
-
-    return body