Files
MuseTalk/musetalk/data/sample_method.py
Zhizhou Zhong 1ab53a626b feat: data preprocessing and training (#294)
* docs: update readme

* docs: update readme

* feat: training codes

* feat: data preprocess

* docs: release training
2025-04-04 22:10:03 +08:00

234 lines
11 KiB
Python
Executable File

import numpy as np
import random
def summarize_tensor(x):
    """
    Build a one-line, ANSI-coloured summary of a tensor-like object.

    Parameters:
        x: Object exposing ``shape``, ``min()``, ``mean()`` and ``max()``, where
            each reduction result provides ``item()``.
    Returns:
        str: The shape (left-justified to 24 characters, blue) followed by the
            min (red), mean (green) and max (yellow), each formatted as a
            signed value with four decimals.
    """
    shape_text = str(tuple(x.shape)).ljust(24)
    lowest = x.min().item()
    average = x.mean().item()
    highest = x.max().item()
    return (
        f"\033[34m{shape_text}\033[0m "
        f"(\033[31mmin {lowest:+.4f}\033[0m / "
        f"\033[32mmean {average:+.4f}\033[0m / "
        f"\033[33mmax {highest:+.4f}\033[0m)"
    )
def calculate_mouth_open_similarity(landmarks_list, select_idx, top_k=50, ascending=True):
    """
    Rank frames by how close their mouth opening is to that of a reference frame.

    The mouth opening of a frame is the Euclidean distance between landmark 165
    and landmark 147 (adjust these indices to the landmark format in use).

    Parameters:
        landmarks_list (sequence): Per-frame landmark sets; each set must be
            indexable at 147 and 165 and yield numeric points.
        select_idx (int): Index of the reference frame.
        top_k (int): Maximum number of frame indices to return. Default is 50.
        ascending (bool): If True, return the frames whose opening is closest
            to the reference; if False, the frames whose opening differs most.
            Default is True.
    Returns:
        similar_landmarks_indices (list): Up to ``top_k`` frame indices ordered
            by absolute opening difference. The reference frame itself is not
            removed (its difference is 0).
        similar_landmarks_distances (list): Signed difference (frame opening
            minus reference opening) for every frame, in frame order.
    """
    # Debug output kept from the original implementation.
    print(np.shape(landmarks_list))

    mouth_open_ratios = np.array(
        [
            np.linalg.norm(
                np.asarray(landmarks[165], dtype=float)
                - np.asarray(landmarks[147], dtype=float)
            )
            for landmarks in landmarks_list
        ],
        dtype=float,
    )

    # One signed difference per frame, relative to the reference frame. The
    # previous code indexed a (N, 1) matrix with a stale loop variable, which
    # reduced the ranking to a single element.
    signed_differences = mouth_open_ratios - mouth_open_ratios[select_idx]
    absolute_differences = np.abs(signed_differences)
    print(absolute_differences.shape)

    ranking_key = absolute_differences if ascending else -absolute_differences
    similar_landmarks_indices = np.argsort(ranking_key)[:top_k].tolist()
    # Note: the distances are deliberately left in frame order (not sorted).
    similar_landmarks_distances = signed_differences.tolist()
    return similar_landmarks_indices, similar_landmarks_distances
#############################################################################################
def get_closed_mouth(landmarks_list, ascending=True, top_k=50):
    """
    Return the indices of the frames with the smallest (or largest) mouth opening.

    The mouth opening of a frame is the Euclidean distance between landmark 165
    and landmark 147 (adjust these indices to the landmark format in use).

    Parameters:
        landmarks_list (list): Per-frame landmark sets, each indexable at 147
            and 165.
        ascending (bool): If True, the most closed mouths come first; if False,
            the most open mouths come first. Default is True.
        top_k (int): Maximum number of indices to return. Default is 50.
    Returns:
        numpy.ndarray: Up to ``top_k`` frame indices in the requested order.
    """
    openings = np.fromiter(
        (
            np.linalg.norm(np.array(frame[165]) - np.array(frame[147]))
            for frame in landmarks_list
        ),
        dtype=float,
        count=len(landmarks_list),
    )
    sort_key = openings if ascending else -openings
    return np.argsort(sort_key)[:top_k]
def calculate_landmarks_similarity(selected_idx, landmarks_list, image_shapes, start_index, end_index, top_k=50, ascending=True):
    """
    Rank frames by how similar a range of their landmarks is to a reference frame.

    The landmarks ``[start_index:end_index]`` of every frame are rescaled to a
    256x256 image, and the score of a frame is the mean point-wise Euclidean
    distance to the rescaled landmarks of the reference frame.

    Parameters:
        selected_idx (int): Index of the reference frame.
        landmarks_list (list): A list containing sets of facial landmarks, each element is a set of landmarks.
        image_shapes (list): A list containing the shape of each image, each element is a (width, height) tuple.
        start_index (int): The starting index of the landmark slice (inclusive).
        end_index (int): The ending index of the landmark slice (exclusive).
        top_k (int): The number of frame indices to return. Default is 50.
        ascending (bool): If True, return the most similar frames (the
            reference frame is excluded); if False, return the most dissimilar
            frames (the reference frame is not filtered out). Default is True.
    Returns:
        similar_landmarks_indices (list): Up to ``top_k`` frame indices in the requested order.
    """
    resized_landmarks = [
        resize_landmark(
            np.array(landmarks)[start_index:end_index],
            w=image_shapes[i][0],
            h=image_shapes[i][1],
            new_w=256,
            new_h=256,
        )
        for i, landmarks in enumerate(landmarks_list)
    ]
    resized_landmarks_array = np.array(resized_landmarks)  # (frames, points, 2) when all frames share a layout

    # Mean per-point distance of every frame to the reference frame.
    reference = resized_landmarks_array[selected_idx]
    distances = np.linalg.norm(resized_landmarks_array - reference[np.newaxis, :], axis=2)
    overall_distances = np.mean(distances, axis=1)

    if ascending:
        sorted_indices = np.argsort(overall_distances)
        # Drop the reference frame by value rather than by assuming it sorts
        # first: a frame with identical landmarks ties at distance 0 and the
        # order among ties is not guaranteed.
        reference_idx = selected_idx % len(resized_landmarks)
        sorted_indices = sorted_indices[sorted_indices != reference_idx]
    else:
        sorted_indices = np.argsort(-overall_distances)
    return sorted_indices[:top_k].tolist()
def process_bbox_musetalk(face_array, landmark_array):
    """
    Merge a face box with the extent of its landmarks into one bounding box.

    Parameters:
        face_array (iterable): Four values (x_min, y_min, x_max, y_max); each is
            converted with ``int``.
        landmark_array (iterable): (x, y) pairs; each coordinate is converted
            with ``int``.
    Returns:
        list: ``[x_min, y_min, x_max, y_max]`` covering both the face box and
            all landmarks, with the minimum corner clamped to be non-negative.
    """
    face_left, face_top, face_right, face_bottom = (int(v) for v in face_array)
    xs = [int(px) for px, _ in landmark_array]
    ys = [int(py) for _, py in landmark_array]

    # Only the top-left corner is clamped at zero; the far corner is left as is.
    left = max(min(face_left, min(xs)), 0)
    top = max(min(face_top, min(ys)), 0)
    right = max(face_right, max(xs))
    bottom = max(face_bottom, max(ys))
    return [left, top, right, bottom]
def shift_landmarks_to_face_coordinates(landmark_list, face_list):
    """
    Express each frame's landmarks relative to the union box of face and landmarks.

    Parameters:
        landmark_list (list): Per-frame landmark sets; each is converted to a
            2-D numpy array whose columns 0 and 1 are x and y.
        face_list (list): Per-frame face entries; each is forwarded to
            ``process_bbox_musetalk`` as the face box (x_min, y_min, x_max, y_max).
    Returns:
        landmark_list_shift (list): The translated landmark arrays.
        bbox_union (list): The union bounding box of each frame.
        face_shapes (list): The (width, height) of each union bounding box.
    """
    landmark_list_shift, bbox_union, face_shapes = [], [], []
    for frame_no, face_box in enumerate(face_list):
        # Convert to a numpy array; this creates a copy, so the input is not modified.
        shifted = np.array(landmark_list[frame_no])
        union_box = process_bbox_musetalk(face_box, shifted)
        left, top, right, bottom = union_box

        # Move the origin to the top-left corner of the union box.
        shifted[:, 0] = shifted[:, 0] - left
        shifted[:, 1] = shifted[:, 1] - top

        landmark_list_shift.append(shifted)
        bbox_union.append(union_box)
        face_shapes.append((right - left, bottom - top))
    return landmark_list_shift, bbox_union, face_shapes
def resize_landmark(landmark, w, h, new_w, new_h):
    """
    Rescale landmark coordinates from a (w, h) image to a (new_w, new_h) image.

    Parameters:
        landmark: Array of (x, y) coordinates that supports element-wise
            division by a two-element list (e.g. a numpy array).
        w, h: Width and height of the image the landmarks refer to.
        new_w, new_h: Width and height of the target image.
    Returns:
        The landmarks normalised by (w, h) and then scaled by (new_w, new_h).
    """
    source_size = [w, h]
    target_size = [new_w, new_h]
    return (landmark / source_size) * target_size
def get_src_idx(drive_idx, T, sample_method, landmarks_list, image_shapes, top_k_ratio):
    """
    Pick a source frame index for the given driving frame.

    Parameters:
    - drive_idx (int): The current drive index.
    - T (int): Window scale; random draws come from
      [drive_idx - 5 * T, drive_idx + 5 * T] (not clamped to the valid frame range).
    - sample_method (str): One of "random", "pose_similarity",
      "pose_similarity_and_closed_mouth" or "pose_similarity_and_mouth_dissimilarity".
    - landmarks_list (list): List of facial landmarks (unused by "random").
    - image_shapes (list): List of image shapes (unused by "random").
    - top_k_ratio (float): Fraction of the frames each ranking keeps as candidates.
    Returns:
    - src_idx (int or None): The sampled source index. For the landmark-based
      methods the result is at least 5 frames away from drive_idx; None is
      returned (after printing the error) when sampling fails.
    Raises:
    - ValueError: If sample_method is not one of the supported names.
    """
    min_gap = 5
    half_window = 5 * T

    def draw_from_window():
        return random.randint(drive_idx - half_window, drive_idx + half_window)

    if sample_method == "random":
        return draw_from_window()

    landmark_methods = (
        "pose_similarity",
        "pose_similarity_and_closed_mouth",
        "pose_similarity_and_mouth_dissimilarity",
    )
    if sample_method not in landmark_methods:
        raise ValueError(f"Unknown sample_method: {sample_method}")

    # Landmark-based sampling is best-effort: any failure is reported and
    # signalled to the caller with None.
    try:
        top_k = int(top_k_ratio * len(landmarks_list))

        # facial contour
        pose_similarity_list = calculate_landmarks_similarity(
            drive_idx, landmarks_list, image_shapes, 0, 16, top_k=top_k, ascending=True
        )

        if sample_method == "pose_similarity":
            # Filter before choosing: re-drawing from the list until a distant
            # frame shows up never terminates when every candidate is too close.
            far_enough = [idx for idx in pose_similarity_list if abs(idx - drive_idx) >= min_gap]
            if not far_enough:
                raise ValueError(
                    f"no pose-similar frame at least {min_gap} frames away from {drive_idx}"
                )
            return random.choice(far_enough)

        if sample_method == "pose_similarity_and_closed_mouth":
            partner_list = get_closed_mouth(landmarks_list, ascending=True, top_k=top_k)
        else:
            # Mouth inner contour for 68 landmarks format
            partner_list = calculate_landmarks_similarity(
                drive_idx, landmarks_list, image_shapes, 60, 67, top_k=top_k, ascending=False
            )

        # Normalise to plain ints so numpy integers and Python ints intersect cleanly.
        common_list = list(set(pose_similarity_list) & {int(idx) for idx in partner_list})
        src_idx = random.choice(common_list) if common_list else draw_from_window()

        if abs(src_idx - drive_idx) < min_gap:
            if half_window < min_gap:
                # Every value in the window is closer than min_gap, so
                # re-drawing could never succeed.
                raise ValueError(
                    f"window of +/-{half_window} frames cannot satisfy a gap of {min_gap}"
                )
            while abs(src_idx - drive_idx) < min_gap:
                src_idx = draw_from_window()
        return src_idx
    except Exception as e:
        print(e)
        return None