Mirror of https://github.com/TMElyralab/MuseTalk.git (synced 2026-02-05 01:49:20 +08:00)
feat: data preprocessing and training (#294)
* docs: update readme
* docs: update readme
* feat: training codes
* feat: data preprocess
* docs: release training
configs/training/gpu.yaml  (new executable file, 21 lines)
@@ -0,0 +1,21 @@
compute_environment: LOCAL_MACHINE
debug: True
deepspeed_config:
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: False
  zero_stage: 2

distributed_type: DEEPSPEED
downcast_bf16: 'no'
gpu_ids: "5, 7" # modify this according to your GPU number
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 2 # it should be the same as the number of GPUs
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
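This is a standard Accelerate launch config using the DeepSpeed ZeRO-2 backend; it would typically be passed to the training entry point via accelerate launch --config_file configs/training/gpu.yaml (the training script itself is added elsewhere in this commit and is not shown here). Below is a minimal sketch, assuming PyYAML is available, of the sanity check the inline comments ask for, namely that num_processes matches the number of GPUs listed in gpu_ids:

import yaml

# Load the Accelerate/DeepSpeed launch config added in this commit.
with open("configs/training/gpu.yaml") as f:
    cfg = yaml.safe_load(f)

# gpu_ids is a comma-separated string such as "5, 7".
gpu_ids = [g.strip() for g in str(cfg["gpu_ids"]).split(",") if g.strip()]

# The inline comment requires num_processes to equal the number of GPUs.
assert cfg["num_processes"] == len(gpu_ids), \
    f"num_processes={cfg['num_processes']} but gpu_ids lists {len(gpu_ids)} GPUs"
print("launch config OK, GPUs:", gpu_ids)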
configs/training/preprocess.yaml  (new executable file, 31 lines)
@@ -0,0 +1,31 @@
clip_len_second: 30 # the length of the video clip
video_root_raw: "./dataset/HDTF/source/" # the path of the original video
val_list_hdtf:
  - RD_Radio7_000
  - RD_Radio8_000
  - RD_Radio9_000
  - WDA_TinaSmith_000
  - WDA_TomCarper_000
  - WDA_TomPerez_000
  - WDA_TomUdall_000
  - WDA_VeronicaEscobar0_000
  - WDA_VeronicaEscobar1_000
  - WDA_WhipJimClyburn_000
  - WDA_XavierBecerra_000
  - WDA_XavierBecerra_001
  - WDA_XavierBecerra_002
  - WDA_ZoeLofgren_000
  - WRA_SteveScalise1_000
  - WRA_TimScott_000
  - WRA_ToddYoung_000
  - WRA_TomCotton_000
  - WRA_TomPrice_000
  - WRA_VickyHartzler_000

# following dir will be automatically generated
video_root_25fps: "./dataset/HDTF/video_root_25fps/"
video_file_list: "./dataset/HDTF/video_file_list.txt"
video_audio_clip_root: "./dataset/HDTF/video_audio_clip_root/"
meta_root: "./dataset/HDTF/meta/"
video_clip_file_list_train: "./dataset/HDTF/train.txt"
video_clip_file_list_val: "./dataset/HDTF/val.txt"
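The preprocessing script that consumes this file is part of the commit but is not reproduced here. As a rough, hypothetical sketch of how the auto-generated train/val lists relate to the fields above (the clip naming and helper behaviour are assumptions, not the actual MuseTalk code):

import os
import yaml

# Rough sketch only; the real preprocessing code in this commit is not shown here.
with open("configs/training/preprocess.yaml") as f:
    cfg = yaml.safe_load(f)

val_ids = set(cfg["val_list_hdtf"])

# Assume clips have already been written to video_audio_clip_root as
# "<identity>/<clip>.mp4" (naming is hypothetical).
train_lines, val_lines = [], []
for identity in sorted(os.listdir(cfg["video_audio_clip_root"])):
    clip_dir = os.path.join(cfg["video_audio_clip_root"], identity)
    for clip in sorted(os.listdir(clip_dir)):
        line = os.path.join(clip_dir, clip)
        (val_lines if identity in val_ids else train_lines).append(line)

with open(cfg["video_clip_file_list_train"], "w") as f:
    f.write("\n".join(train_lines) + "\n")
with open(cfg["video_clip_file_list_val"], "w") as f:
    f.write("\n".join(val_lines) + "\n")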
configs/training/stage1.yaml  (new executable file, 89 lines)
@@ -0,0 +1,89 @@
exp_name: 'test' # Name of the experiment
output_dir: './exp_out/stage1/' # Directory to save experiment outputs
unet_sub_folder: musetalk # Subfolder name for UNet model
random_init_unet: True # Whether to randomly initialize UNet (stage1) or use pretrained weights (stage2)
whisper_path: "./models/whisper" # Path to the Whisper model
pretrained_model_name_or_path: "./models" # Path to pretrained models
resume_from_checkpoint: True # Whether to resume training from a checkpoint
padding_pixel_mouth: 10 # Number of pixels to pad around the mouth region
vae_type: "sd-vae" # Type of VAE model to use
# Validation parameters
num_images_to_keep: 8 # Number of validation images to keep
ref_dropout_rate: 0 # Dropout rate for reference images
syncnet_config_path: "./configs/training/syncnet.yaml" # Path to SyncNet configuration
use_adapted_weight: False # Whether to use adapted weights for loss calculation
cropping_jaw2edge_margin_mean: 10 # Mean margin for jaw-to-edge cropping
cropping_jaw2edge_margin_std: 10 # Standard deviation for jaw-to-edge cropping
crop_type: "crop_resize" # Type of cropping method
random_margin_method: "normal" # Method for random margin generation
num_backward_frames: 16 # Number of frames to use for backward pass in SyncNet

data:
  dataset_key: "HDTF" # Dataset to use for training
  train_bs: 32 # Training batch size (actual batch size is train_bs*n_sample_frames)
  image_size: 256 # Size of input images
  n_sample_frames: 1 # Number of frames to sample per batch
  num_workers: 8 # Number of data loading workers
  audio_padding_length_left: 2 # Left padding length for audio features
  audio_padding_length_right: 2 # Right padding length for audio features
  sample_method: pose_similarity_and_mouth_dissimilarity # Method for sampling frames
  top_k_ratio: 0.51 # Ratio for top-k sampling
  contorl_face_min_size: True # Whether to control minimum face size
  min_face_size: 150 # Minimum face size in pixels

loss_params:
  l1_loss: 1.0 # Weight for L1 loss
  vgg_loss: 0.01 # Weight for VGG perceptual loss
  vgg_layer_weight: [1, 1, 1, 1, 1] # Weights for different VGG layers
  pyramid_scale: [1, 0.5, 0.25, 0.125] # Scales for image pyramid
  gan_loss: 0 # Weight for GAN loss
  fm_loss: [1.0, 1.0, 1.0, 1.0] # Weights for feature matching loss
  sync_loss: 0 # Weight for sync loss
  mouth_gan_loss: 0 # Weight for mouth-specific GAN loss

model_params:
  discriminator_params:
    scales: [1] # Scales for discriminator
    block_expansion: 32 # Expansion factor for discriminator blocks
    max_features: 512 # Maximum number of features in discriminator
    num_blocks: 4 # Number of blocks in discriminator
    sn: True # Whether to use spectral normalization
    image_channel: 3 # Number of image channels
    estimate_jacobian: False # Whether to estimate Jacobian

discriminator_train_params:
  lr: 0.000005 # Learning rate for discriminator
  eps: 0.00000001 # Epsilon for optimizer
  weight_decay: 0.01 # Weight decay for optimizer
  patch_size: 1 # Size of patches for discriminator
  betas: [0.5, 0.999] # Beta parameters for Adam optimizer
  epochs: 10000 # Number of training epochs
  start_gan: 1000 # Step to start GAN training

solver:
  gradient_accumulation_steps: 1 # Number of steps for gradient accumulation
  uncond_steps: 10 # Number of unconditional steps
  mixed_precision: 'fp32' # Precision mode for training
  enable_xformers_memory_efficient_attention: True # Whether to use memory efficient attention
  gradient_checkpointing: True # Whether to use gradient checkpointing
  max_train_steps: 250000 # Maximum number of training steps
  max_grad_norm: 1.0 # Maximum gradient norm for clipping
  # Learning rate parameters
  learning_rate: 2.0e-5 # Base learning rate
  scale_lr: False # Whether to scale learning rate
  lr_warmup_steps: 1000 # Number of warmup steps for learning rate
  lr_scheduler: "linear" # Type of learning rate scheduler
  # Optimizer parameters
  use_8bit_adam: False # Whether to use 8-bit Adam optimizer
  adam_beta1: 0.5 # Beta1 parameter for Adam optimizer
  adam_beta2: 0.999 # Beta2 parameter for Adam optimizer
  adam_weight_decay: 1.0e-2 # Weight decay for Adam optimizer
  adam_epsilon: 1.0e-8 # Epsilon for Adam optimizer

total_limit: 10 # Maximum number of checkpoints to keep
save_model_epoch_interval: 250000 # Interval between model saves
checkpointing_steps: 10000 # Number of steps between checkpoints
val_freq: 2000 # Frequency of validation

seed: 41 # Random seed for reproducibility
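A quick arithmetic check on the stage-1 batch settings (a sketch; it assumes the usual Accelerate semantics where every process draws its own batch of train_bs clips with n_sample_frames frames each):

# Frames seen per optimizer step in stage 1 (illustrative arithmetic only).
train_bs = 32        # data.train_bs
n_sample_frames = 1  # data.n_sample_frames
num_processes = 2    # configs/training/gpu.yaml
grad_accum = 1       # solver.gradient_accumulation_steps

frames_per_step = train_bs * n_sample_frames * num_processes * grad_accum
print(frames_per_step)  # 32 * 1 * 2 * 1 = 64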
configs/training/stage2.yaml  (new executable file, 89 lines)
@@ -0,0 +1,89 @@
exp_name: 'test' # Name of the experiment
output_dir: './exp_out/stage2/' # Directory to save experiment outputs
unet_sub_folder: musetalk # Subfolder name for UNet model
random_init_unet: False # Whether to randomly initialize UNet (stage1) or use pretrained weights (stage2)
whisper_path: "./models/whisper" # Path to the Whisper model
pretrained_model_name_or_path: "./models" # Path to pretrained models
resume_from_checkpoint: True # Whether to resume training from a checkpoint
padding_pixel_mouth: 10 # Number of pixels to pad around the mouth region
vae_type: "sd-vae" # Type of VAE model to use
# Validation parameters
num_images_to_keep: 8 # Number of validation images to keep
ref_dropout_rate: 0 # Dropout rate for reference images
syncnet_config_path: "./configs/training/syncnet.yaml" # Path to SyncNet configuration
use_adapted_weight: False # Whether to use adapted weights for loss calculation
cropping_jaw2edge_margin_mean: 10 # Mean margin for jaw-to-edge cropping
cropping_jaw2edge_margin_std: 10 # Standard deviation for jaw-to-edge cropping
crop_type: "dynamic_margin_crop_resize" # Type of cropping method
random_margin_method: "normal" # Method for random margin generation
num_backward_frames: 16 # Number of frames to use for backward pass in SyncNet

data:
  dataset_key: "HDTF" # Dataset to use for training
  train_bs: 2 # Training batch size (actual batch size is train_bs*n_sample_frames)
  image_size: 256 # Size of input images
  n_sample_frames: 16 # Number of frames to sample per batch
  num_workers: 8 # Number of data loading workers
  audio_padding_length_left: 2 # Left padding length for audio features
  audio_padding_length_right: 2 # Right padding length for audio features
  sample_method: pose_similarity_and_mouth_dissimilarity # Method for sampling frames
  top_k_ratio: 0.51 # Ratio for top-k sampling
  contorl_face_min_size: True # Whether to control minimum face size
  min_face_size: 200 # Minimum face size in pixels

loss_params:
  l1_loss: 1.0 # Weight for L1 loss
  vgg_loss: 0.01 # Weight for VGG perceptual loss
  vgg_layer_weight: [1, 1, 1, 1, 1] # Weights for different VGG layers
  pyramid_scale: [1, 0.5, 0.25, 0.125] # Scales for image pyramid
  gan_loss: 0.01 # Weight for GAN loss
  fm_loss: [1.0, 1.0, 1.0, 1.0] # Weights for feature matching loss
  sync_loss: 0.05 # Weight for sync loss
  mouth_gan_loss: 0.01 # Weight for mouth-specific GAN loss

model_params:
  discriminator_params:
    scales: [1] # Scales for discriminator
    block_expansion: 32 # Expansion factor for discriminator blocks
    max_features: 512 # Maximum number of features in discriminator
    num_blocks: 4 # Number of blocks in discriminator
    sn: True # Whether to use spectral normalization
    image_channel: 3 # Number of image channels
    estimate_jacobian: False # Whether to estimate Jacobian

discriminator_train_params:
  lr: 0.000005 # Learning rate for discriminator
  eps: 0.00000001 # Epsilon for optimizer
  weight_decay: 0.01 # Weight decay for optimizer
  patch_size: 1 # Size of patches for discriminator
  betas: [0.5, 0.999] # Beta parameters for Adam optimizer
  epochs: 10000 # Number of training epochs
  start_gan: 1000 # Step to start GAN training

solver:
  gradient_accumulation_steps: 8 # Number of steps for gradient accumulation
  uncond_steps: 10 # Number of unconditional steps
  mixed_precision: 'fp32' # Precision mode for training
  enable_xformers_memory_efficient_attention: True # Whether to use memory efficient attention
  gradient_checkpointing: True # Whether to use gradient checkpointing
  max_train_steps: 250000 # Maximum number of training steps
  max_grad_norm: 1.0 # Maximum gradient norm for clipping
  # Learning rate parameters
  learning_rate: 5.0e-6 # Base learning rate
  scale_lr: False # Whether to scale learning rate
  lr_warmup_steps: 1000 # Number of warmup steps for learning rate
  lr_scheduler: "linear" # Type of learning rate scheduler
  # Optimizer parameters
  use_8bit_adam: False # Whether to use 8-bit Adam optimizer
  adam_beta1: 0.5 # Beta1 parameter for Adam optimizer
  adam_beta2: 0.999 # Beta2 parameter for Adam optimizer
  adam_weight_decay: 1.0e-2 # Weight decay for Adam optimizer
  adam_epsilon: 1.0e-8 # Epsilon for Adam optimizer

total_limit: 10 # Maximum number of checkpoints to keep
save_model_epoch_interval: 250000 # Interval between model saves
checkpointing_steps: 2000 # Number of steps between checkpoints
val_freq: 2000 # Frequency of validation

seed: 41 # Random seed for reproducibility
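Compared with stage 1, stage 2 starts from the pretrained UNet (random_init_unet: False), samples 16 consecutive frames per clip, and switches on the adversarial, feature-matching and sync terms. A minimal sketch of how such per-term weights are commonly folded into one training loss (OmegaConf and the per-term loss tensors below are assumptions for illustration, not the actual MuseTalk training code):

from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/training/stage2.yaml")
w = cfg.loss_params  # l1_loss, vgg_loss, gan_loss, sync_loss, mouth_gan_loss, ...

def total_loss(l1, vgg, gan, sync, mouth_gan):
    """Combine hypothetical per-term losses with the configured weights."""
    loss = w.l1_loss * l1 + w.vgg_loss * vgg
    if w.gan_loss > 0:
        loss = loss + w.gan_loss * gan              # generator adversarial term
    if w.sync_loss > 0:
        loss = loss + w.sync_loss * sync            # SyncNet lip-sync term
    if w.mouth_gan_loss > 0:
        loss = loss + w.mouth_gan_loss * mouth_gan  # mouth-region adversarial term
    return loss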
configs/training/syncnet.yaml  (new file, 19 lines)
@@ -0,0 +1,19 @@
# This file is modified from LatentSync (https://github.com/bytedance/LatentSync/blob/main/latentsync/configs/training/syncnet_16_pixel.yaml).
model:
  audio_encoder: # input (1, 80, 52)
    in_channels: 1
    block_out_channels: [32, 64, 128, 256, 512, 1024, 2048]
    downsample_factors: [[2, 1], 2, 2, 1, 2, 2, [2, 3]]
    attn_blocks: [0, 0, 0, 0, 0, 0, 0]
    dropout: 0.0
  visual_encoder: # input (48, 128, 256)
    in_channels: 48
    block_out_channels: [64, 128, 256, 256, 512, 1024, 2048, 2048]
    downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
    attn_blocks: [0, 0, 0, 0, 0, 0, 0, 0]
    dropout: 0.0

ckpt:
  resume_ckpt_path: ""
  inference_ckpt_path: ./models/syncnet/latentsync_syncnet.pt # this pretrained model is from LatentSync (https://huggingface.co/ByteDance/LatentSync/tree/main)
  save_ckpt_steps: 2500
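As a quick check on these shapes, the per-block downsample_factors multiply out to the total stride of each encoder. For the visual encoder the (128, 256) input divides exactly (total stride 128 x 256), so the 2048-channel output collapses to a single embedding; the (80, 52) audio input reduces to roughly the same size after rounding, which depends on LatentSync's conv padding and is not shown here. A small sketch that computes the total stride from the values above:

def total_stride(downsample_factors):
    """Multiply per-block factors into a (height, width) total stride.
    Integer entries downsample both axes; [h, w] entries are per-axis."""
    h = w = 1
    for f in downsample_factors:
        fh, fw = (f, f) if isinstance(f, int) else f
        h, w = h * fh, w * fw
    return h, w

# Values copied from configs/training/syncnet.yaml.
print(total_stride([[2, 1], 2, 2, 1, 2, 2, [2, 3]]))  # audio:  (64, 48) for an (80, 52) input
print(total_stride([[1, 2], 2, 2, 2, 2, 2, 2, 2]))    # visual: (128, 256) for a (128, 256) input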