# MuseTalk stage 2 training configuration
# File: configs/training/stage2.yaml
exp_name: 'test' # Name of the experiment
output_dir: './exp_out/stage2/' # Directory to save experiment outputs
unet_sub_folder: musetalk # Subfolder name for UNet model
random_init_unet: False # Whether to randomly initialize UNet (stage1) or use pretrained weights (stage2)
whisper_path: "./models/whisper" # Path to the Whisper model
pretrained_model_name_or_path: "./models" # Path to pretrained models
resume_from_checkpoint: True # Whether to resume training from a checkpoint
padding_pixel_mouth: 10 # Number of pixels to pad around the mouth region
vae_type: "sd-vae" # Type of VAE model to use
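
# A hypothetical illustration (not part of the original file) of how a training
# script might load this config; OmegaConf and the relative path are assumptions:
#
#   from omegaconf import OmegaConf
#   cfg = OmegaConf.load("./configs/training/stage2.yaml")
#   print(cfg.exp_name, cfg.solver.max_train_steps)
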
# Validation parameters
num_images_to_keep: 8 # Number of validation images to keep
ref_dropout_rate: 0 # Dropout rate for reference images
syncnet_config_path: "./configs/training/syncnet.yaml" # Path to SyncNet configuration
use_adapted_weight: False # Whether to use adapted weights for loss calculation
cropping_jaw2edge_margin_mean: 10 # Mean margin for jaw-to-edge cropping
cropping_jaw2edge_margin_std: 10 # Standard deviation for jaw-to-edge cropping
crop_type: "dynamic_margin_crop_resize" # Type of cropping method
random_margin_method: "normal" # Method for random margin generation
num_backward_frames: 16 # Number of frames to use for backward pass in SyncNet
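
# A minimal sketch (an assumption, not the repository's code) of how the dynamic
# cropping settings above are likely consumed: a per-sample jaw-to-edge margin is
# drawn from N(mean, std) and clamped before cropping and resizing. The helper
# name and clamp bounds are hypothetical.
#
#   import numpy as np
#
#   def sample_jaw2edge_margin(mean=10.0, std=10.0, lo=0.0, hi=40.0):
#       # draw a margin in pixels; clamp so extreme draws stay usable
#       return float(np.clip(np.random.normal(mean, std), lo, hi))
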
data:
  dataset_key: "HDTF" # Dataset to use for training
  train_bs: 2 # Training batch size (actual batch size is train_bs * n_sample_frames)
  image_size: 256 # Size of input images
  n_sample_frames: 16 # Number of frames to sample per batch
  num_workers: 8 # Number of data loading workers
  audio_padding_length_left: 2 # Left padding length for audio features
  audio_padding_length_right: 2 # Right padding length for audio features
  sample_method: pose_similarity_and_mouth_dissimilarity # Method for sampling frames
  top_k_ratio: 0.51 # Ratio for top-k sampling
  contorl_face_min_size: True # Whether to enforce a minimum face size (key spelling left as-is to match the consuming code)
  min_face_size: 200 # Minimum face size in pixels
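
# Worked example from the values above: each forward pass sees
# train_bs * n_sample_frames = 2 * 16 = 32 frames per process, and with
# solver.gradient_accumulation_steps = 8 each optimizer update effectively
# covers 32 * 8 = 256 frames.
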
loss_params:
  l1_loss: 1.0 # Weight for L1 reconstruction loss
  vgg_loss: 0.01 # Weight for VGG perceptual loss
  vgg_layer_weight: [1, 1, 1, 1, 1] # Per-layer weights for the VGG perceptual loss
  pyramid_scale: [1, 0.5, 0.25, 0.125] # Scales for the image pyramid
  gan_loss: 0.01 # Weight for GAN loss
  fm_loss: [1.0, 1.0, 1.0, 1.0] # Per-layer weights for the feature-matching loss
  sync_loss: 0.05 # Weight for the SyncNet lip-sync loss
  mouth_gan_loss: 0.01 # Weight for the mouth-region GAN loss
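
# A minimal sketch (assuming a plain weighted sum; the actual reduction may
# differ) of how these weights combine the individual terms; all tensor names
# below are placeholders:
#
#   total = (1.0 * l1 + 0.01 * vgg + 0.01 * gan + 0.05 * sync
#            + 0.01 * mouth_gan + fm)  # fm already weighted per layer by fm_loss
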
model_params:
  discriminator_params:
    scales: [1] # Image scales at which the discriminator operates
    block_expansion: 32 # Channel expansion factor for discriminator blocks
    max_features: 512 # Maximum number of feature channels in the discriminator
    num_blocks: 4 # Number of blocks in the discriminator
    sn: True # Whether to use spectral normalization
    image_channel: 3 # Number of image channels
    estimate_jacobian: False # Whether to estimate the Jacobian

discriminator_train_params:
  lr: 0.000005 # Learning rate for the discriminator
  eps: 0.00000001 # Epsilon for the optimizer
  weight_decay: 0.01 # Weight decay for the optimizer
  patch_size: 1 # Patch size for the discriminator
  betas: [0.5, 0.999] # Beta parameters for the Adam optimizer
  epochs: 10000 # Number of training epochs
  start_gan: 1000 # Step at which GAN training starts
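
# A hedged sketch (not necessarily the repository's code) of constructing the
# discriminator optimizer from the hyperparameters above with torch.optim.Adam:
#
#   import torch
#   d_opt = torch.optim.Adam(discriminator.parameters(), lr=0.000005,
#                            betas=(0.5, 0.999), eps=1e-8, weight_decay=0.01)
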
solver:
  gradient_accumulation_steps: 8 # Number of steps for gradient accumulation
  uncond_steps: 10 # Number of unconditional steps
  mixed_precision: 'fp32' # Precision mode for training ('fp32' = full precision, i.e. mixed precision disabled)
  enable_xformers_memory_efficient_attention: True # Whether to use xFormers memory-efficient attention
  gradient_checkpointing: True # Whether to use gradient checkpointing
  max_train_steps: 250000 # Maximum number of training steps
  max_grad_norm: 1.0 # Maximum gradient norm for clipping

  # Learning rate parameters
  learning_rate: 5.0e-6 # Base learning rate
  scale_lr: False # Whether to scale the learning rate with batch size and accumulation steps
  lr_warmup_steps: 1000 # Number of learning rate warmup steps
  lr_scheduler: "linear" # Type of learning rate scheduler
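
  # A hedged sketch of the schedule these settings describe, using diffusers'
  # get_scheduler (the library choice is an assumption):
  #
  #   from diffusers.optimization import get_scheduler
  #   lr_sched = get_scheduler("linear", optimizer=opt,
  #                            num_warmup_steps=1000, num_training_steps=250000)
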
  # Optimizer parameters
  use_8bit_adam: False # Whether to use the 8-bit Adam optimizer
  adam_beta1: 0.5 # Beta1 parameter for the Adam optimizer
  adam_beta2: 0.999 # Beta2 parameter for the Adam optimizer
  adam_weight_decay: 1.0e-2 # Weight decay for the Adam optimizer
  adam_epsilon: 1.0e-8 # Epsilon for the Adam optimizer

  total_limit: 10 # Maximum number of checkpoints to keep
  save_model_epoch_interval: 250000 # Epoch interval between full model saves (a large value effectively disables epoch-based saving)
  checkpointing_steps: 2000 # Number of steps between checkpoints
  val_freq: 2000 # Validation frequency (in steps)

seed: 41 # Random seed for reproducibility
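
# Note on cadence: with checkpointing_steps = 2000 and max_train_steps = 250000,
# up to 125 checkpoints would be written over a full run; total_limit = 10 caps
# how many are kept on disk.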