Merge pull request #639 from FunAudioLLM/dev/lyuxiang.lx

Use streaming reads (ParquetFile.iter_batches) instead of loading the whole table, to save memory
2026-02-05 18:09:24 +08:00 · 2024-11-11 22:17:21 +08:00
parent 6d22d0b76f 5ed5bb15c8
commit 7701325969
1 changed file with 12 additions and 11 deletions
--- a/cosyvoice/dataset/processor.py
+++ b/cosyvoice/dataset/processor.py
@@ -40,7 +40,8 @@ def parquet_opener(data, mode='train', tts_data={}):
        assert 'src' in sample
        url = sample['src']
        try:
-            df = pq.read_table(url).to_pandas()
+            for df in pq.ParquetFile(url).iter_batches(batch_size=64):
                df = df.to_pandas()
                for i in range(len(df)):
                    if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
                        continue