feat: data preprocessing and training (#294)

* docs: update readme * docs: update readme * feat: training codes * feat: data preprocess * docs: release training
2026-02-05 18:09:19 +08:00 · 2025-04-04 22:10:03 +08:00
parent e636166b85
commit 1ab53a626b
23 changed files with 3854 additions and 6 deletions
--- a/configs/training/syncnet.yaml
+++ b/configs/training/syncnet.yaml
@@ -0,0 +1,19 @@
+# This file is modified from LatentSync (https://github.com/bytedance/LatentSync/blob/main/latentsync/configs/training/syncnet_16_pixel.yaml).
+model:  # settings for the two encoders below: audio_encoder and visual_encoder
+  audio_encoder: # input (1, 80, 52)
+    in_channels: 1  # equals the first entry of the input shape noted above
+    block_out_channels: [32, 64, 128, 256, 512, 1024, 2048]  # 7 entries, same length as downsample_factors and attn_blocks
+    downsample_factors: [[2, 1], 2, 2, 1, 2, 2, [2, 3]]  # entries are a scalar or a 2-element pair; presumably pairs give per-axis factors - confirm against the encoder code
+    attn_blocks: [0, 0, 0, 0, 0, 0, 0]  # all zeros; presumably no block uses attention - TODO confirm
+    dropout: 0.0
+  visual_encoder: # input (48, 128, 256)
+    in_channels: 48  # equals the first entry of the input shape noted above
+    block_out_channels: [64, 128, 256, 256, 512, 1024, 2048, 2048]  # 8 entries, same length as downsample_factors and attn_blocks; last value (2048) equals the audio encoder's last value
+    downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]  # only the first entry is a pair; presumably per-axis factors - confirm against the encoder code
+    attn_blocks: [0, 0, 0, 0, 0, 0, 0, 0]  # all zeros; presumably no block uses attention - TODO confirm
+    dropout: 0.0
+
+ckpt:  # checkpoint-related settings
+  resume_ckpt_path: ""  # explicit empty string; presumably means there is no checkpoint to resume from - confirm in the training script
+  inference_ckpt_path: ./models/syncnet/latentsync_syncnet.pt # this pretrained model is from LatentSync (https://huggingface.co/ByteDance/LatentSync/tree/main)
+  save_ckpt_steps: 2500  # presumably the number of training steps between checkpoint saves - confirm in the training script