add files

2026-02-05 18:09:20 +08:00 · 2025-02-20 12:17:03 +08:00
parent a21dd4555c
commit edd008441b
667 changed files with 473123 additions and 0 deletions
--- a/funasr_local/iterators/init.py
+++ b/funasr_local/iterators/init.py
--- a/funasr_local/iterators/abs_iter_factory.py
+++ b/funasr_local/iterators/abs_iter_factory.py
@@ -0,0 +1,9 @@
+from abc import ABC
+from abc import abstractmethod
+from typing import Iterator
+
+
+class AbsIterFactory(ABC):
+    """Interface of factories that build one iterator per epoch."""
+
+    @abstractmethod
+    def build_iter(self, epoch: int, shuffle: bool = None) -> Iterator:
+        """Build the iterator for ``epoch``.
+
+        Args:
+            epoch: Epoch number the iterator is built for.
+            shuffle: Shuffle flag. The factories in this package treat
+                ``None`` as "use the flag given at construction".
+
+        Raises:
+            NotImplementedError: Always; subclasses must override this method.
+        """
+        raise NotImplementedError()
--- a/funasr_local/iterators/chunk_iter_factory.py
+++ b/funasr_local/iterators/chunk_iter_factory.py
@@ -0,0 +1,215 @@
+import logging
+from typing import Any
+from typing import Dict
+from typing import Iterator
+from typing import List
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr_local.iterators.abs_iter_factory import AbsIterFactory
+from funasr_local.iterators.sequence_iter_factory import SequenceIterFactory
+from funasr_local.samplers.abs_sampler import AbsSampler
+
+
+class ChunkIterFactory(AbsIterFactory):
+    """Creates chunks from a sequence
+
+    Every sample of a per-sample loader is split into fixed-length chunks,
+    chunks of equal length are cached together, and the cache is emitted as
+    mini-batches of ``batch_size`` chunks.
+
+    Examples:
+        >>> batches = [["id1"], ["id2"], ...]
+        >>> batch_size = 128
+        >>> chunk_length = 1000
+        >>> iter_factory = ChunkIterFactory(dataset, batch_size, batches, chunk_length)
+        >>> it = iter_factory.build_iter(epoch)
+        >>> for ids, batch in it:
+        ...     ...
+
+    - The number of mini-batches varies from epoch to epoch and
+      cannot be known in advance,
+      because the IterFactory is not given the length information.
+    - For that reason, "num_iters_per_epoch" can't be implemented
+      for this iterator. "num_samples_per_epoch" is implemented instead.
+    - Chunks still cached at the end of an epoch that do not fill
+      a whole mini-batch are discarded.
+
+    """
+
+    def __init__(
+        self,
+        dataset,
+        batch_size: int,
+        batches: Union[AbsSampler, Sequence[Sequence[Any]]],
+        chunk_length: Union[int, str],
+        chunk_shift_ratio: float = 0.5,
+        num_cache_chunks: int = 1024,
+        num_samples_per_epoch: int = None,
+        seed: int = 0,
+        shuffle: bool = False,
+        num_workers: int = 0,
+        collate_fn=None,
+        pin_memory: bool = False,
+    ):
+        """Set up the per-sample loader and parse the chunk-length candidates.
+
+        Args:
+            dataset: Dataset handed to the underlying SequenceIterFactory.
+            batch_size: Number of chunks stacked into one mini-batch.
+            batches: Per-sample batches; every batch must hold exactly one id.
+            chunk_length: A fixed length (int) or a string of candidates,
+                e.g. "5,8" (5 or 8) or "3-5" (3, 4 or 5).
+            chunk_shift_ratio: The shift between consecutive chunks is
+                ``int(chunk_length * chunk_shift_ratio)``, but at least 1.
+            num_cache_chunks: Mini-batches are generated once more than this
+                many chunks of one length are cached; raised to ``batch_size``
+                if smaller.
+            num_samples_per_epoch: Forwarded to SequenceIterFactory as
+                ``num_iters_per_epoch``.
+            seed: Base seed; also forwarded to SequenceIterFactory.
+            shuffle: Shuffle flag used when build_iter() receives ``None``.
+            num_workers: Forwarded to SequenceIterFactory.
+            collate_fn: Forwarded to SequenceIterFactory.
+            pin_memory: Forwarded to SequenceIterFactory.
+
+        Raises:
+            ValueError: If chunk_length cannot be parsed or is not positive,
+                or if chunk_shift_ratio is not positive.
+        """
+        assert check_argument_types()
+        assert all(len(x) == 1 for x in batches), "batch-size must be 1"
+        # A non-positive ratio can never produce a forward shift
+        if chunk_shift_ratio <= 0:
+            raise ValueError(
+                f"chunk_shift_ratio must be positive: but got {chunk_shift_ratio}"
+            )
+
+        self.per_sample_iter_factory = SequenceIterFactory(
+            dataset=dataset,
+            batches=batches,
+            num_iters_per_epoch=num_samples_per_epoch,
+            seed=seed,
+            shuffle=shuffle,
+            num_workers=num_workers,
+            collate_fn=collate_fn,
+            pin_memory=pin_memory,
+        )
+
+        # The cache must be able to hold at least one full mini-batch
+        self.num_cache_chunks = max(num_cache_chunks, batch_size)
+        if isinstance(chunk_length, str):
+            if len(chunk_length) == 0:
+                raise ValueError("e.g. 5,8 or 3-5: but got empty string")
+
+            self.chunk_lengths = []
+            for x in chunk_length.split(","):
+                try:
+                    sps = list(map(int, x.split("-")))
+                except ValueError as err:
+                    raise ValueError(
+                        f"e.g. 5,8 or 3-5: but got {chunk_length}"
+                    ) from err
+
+                if len(sps) > 2:
+                    raise ValueError(f"e.g. 5,8 or 3-5: but got {chunk_length}")
+                elif len(sps) == 2:
+                    # A reversed range would silently add no candidate at all
+                    if sps[0] > sps[1]:
+                        raise ValueError(f"e.g. 5,8 or 3-5: but got {chunk_length}")
+                    # Append all numbers between the range into the candidates
+                    self.chunk_lengths += list(range(sps[0], sps[1] + 1))
+                else:
+                    self.chunk_lengths += [sps[0]]
+        else:
+            # Single candidates: Fixed chunk length
+            self.chunk_lengths = [chunk_length]
+
+        # A chunk of zero or negative length cannot be cut from any sequence
+        if any(lg <= 0 for lg in self.chunk_lengths):
+            raise ValueError(f"chunk_length must be positive: but got {chunk_length}")
+
+        self.chunk_shift_ratio = chunk_shift_ratio
+        self.batch_size = batch_size
+        self.seed = seed
+        self.shuffle = shuffle
+
+    def build_iter(
+        self,
+        epoch: int,
+        shuffle: bool = None,
+    ) -> Iterator[Tuple[List[str], Dict[str, torch.Tensor]]]:
+        """Yield mini-batches of chunks for the given epoch.
+
+        Args:
+            epoch: Epoch number; added to the seed to initialize the RNG.
+            shuffle: Overrides the shuffle flag given at construction
+                unless it is ``None``.
+
+        Yields:
+            Tuples of (ids, batch): ``ids`` holds the source id of every chunk
+            and ``batch`` maps each key to the stacked chunks.
+
+        Raises:
+            RuntimeError: If a sample has no sequence data or its sequences
+                differ in length.
+        """
+        per_sample_loader = self.per_sample_iter_factory.build_iter(epoch, shuffle)
+
+        if shuffle is None:
+            shuffle = self.shuffle
+        state = np.random.RandomState(epoch + self.seed)
+
+        # NOTE(kamo):
+        #   This iterator supports multiple chunk lengths and
+        #   keep chunks for each lengths here until collecting specified numbers
+        cache_chunks_dict = {}
+        cache_id_list_dict = {}
+        for ids, batch in per_sample_loader:
+            # Must be per-sample-loader
+            assert len(ids) == 1, f"Must be per-sample-loader: {len(ids)}"
+            assert all(len(x) == 1 for x in batch.values())
+
+            # A key is sequence data if a matching "<key>_lengths" entry exists
+            sequence_keys = [key for key in batch if key + "_lengths" in batch]
+            if not sequence_keys:
+                raise RuntimeError(
+                    f"No sequence data found in '{ids[0]}': "
+                    f"each sequence key requires a '<key>_lengths' entry"
+                )
+            # Remove lengths data and get the first sample
+            batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+            id_ = ids[0]
+
+            L = len(batch[sequence_keys[0]])
+            for key in sequence_keys:
+                if len(batch[key]) != L:
+                    raise RuntimeError(
+                        f"All sequences must has same length: "
+                        f"{len(batch[key])} != {L}"
+                    )
+
+            # Select chunk length
+            chunk_lengths = [lg for lg in self.chunk_lengths if lg < L]
+            if len(chunk_lengths) == 0:
+                logging.warning(
+                    f"The length of '{id_}' is {L}, but it is shorter than "
+                    f"any candidates of chunk-length: {self.chunk_lengths}"
+                )
+                continue
+
+            # Index the one-element array explicitly: implicit conversion of
+            # an array with ndim > 0 to a scalar is deprecated in NumPy
+            W = int(state.choice(chunk_lengths, 1)[0])
+            cache_id_list = cache_id_list_dict.setdefault(W, [])
+            cache_chunks = cache_chunks_dict.setdefault(W, {})
+
+            # Shift width to the next chunk; never 0, which would divide by zero
+            S = max(int(W * self.chunk_shift_ratio), 1)
+            # Number of chunks
+            N = (L - W) // S + 1
+            # Random start offset within the frames that would be discarded
+            Z = state.randint(0, (L - W) % S + 1) if shuffle else 0
+
+            # Split a sequence into chunks.
+            # Note that the marginal frames divided by chunk length are discarded
+            for k, v in batch.items():
+                if k not in cache_chunks:
+                    cache_chunks[k] = []
+                if k in sequence_keys:
+                    # Shift chunks with overlapped length for data augmentation
+                    cache_chunks[k] += [v[Z + i * S : Z + i * S + W] for i in range(N)]
+                else:
+                    # If not sequence, use whole data instead of chunk
+                    cache_chunks[k] += [v for _ in range(N)]
+            cache_id_list += [id_ for _ in range(N)]
+
+            if len(cache_id_list) > self.num_cache_chunks:
+                # Emit full mini-batches and keep only the leftover chunks
+                cache_id_list, cache_chunks = yield from self._generate_mini_batches(
+                    cache_id_list,
+                    cache_chunks,
+                    shuffle,
+                    state,
+                )
+
+            cache_id_list_dict[W] = cache_id_list
+            cache_chunks_dict[W] = cache_chunks
+
+        # End of epoch: flush every cache; incomplete mini-batches are dropped
+        for W, cache_id_list in cache_id_list_dict.items():
+            yield from self._generate_mini_batches(
+                cache_id_list,
+                cache_chunks_dict[W],
+                shuffle,
+                state,
+            )
+
+    def _generate_mini_batches(
+        self,
+        id_list: List[str],
+        batches: Dict[str, List[torch.Tensor]],
+        shuffle: bool,
+        state: np.random.RandomState,
+    ):
+        """Yield full mini-batches from cached chunks and return the leftover.
+
+        Args:
+            id_list: Source id of every cached chunk.
+            batches: Cached chunks per key, aligned with ``id_list``.
+            shuffle: Whether to permute the cached chunks first.
+            state: RNG used for the permutation.
+
+        Yields:
+            Tuples of (ids, batch) with exactly ``batch_size`` chunks each.
+
+        Returns:
+            The (id_list, batches) pair of chunks that did not fill a
+            mini-batch (delivered through ``yield from``).
+        """
+        if shuffle:
+            indices = np.arange(0, len(id_list))
+            state.shuffle(indices)
+            batches = {k: [v[i] for i in indices] for k, v in batches.items()}
+            id_list = [id_list[i] for i in indices]
+
+        bs = self.batch_size
+        # Advance an index instead of re-slicing the whole cache every step
+        start = 0
+        while len(id_list) - start >= bs:
+            # Make mini-batch and yield
+            yield (
+                id_list[start : start + bs],
+                {k: torch.stack(v[start : start + bs], 0) for k, v in batches.items()},
+            )
+            start += bs
+
+        return id_list[start:], {k: v[start:] for k, v in batches.items()}
--- a/funasr_local/iterators/multiple_iter_factory.py
+++ b/funasr_local/iterators/multiple_iter_factory.py
@@ -0,0 +1,37 @@
+import logging
+from typing import Callable
+from typing import Collection
+from typing import Iterator
+
+import numpy as np
+from typeguard import check_argument_types
+
+from funasr_local.iterators.abs_iter_factory import AbsIterFactory
+
+
+class MultipleIterFactory(AbsIterFactory):
+    """Chain the iterators of several lazily built iter-factories.
+
+    Each entry of ``build_funcs`` is a zero-argument callable returning an
+    ``AbsIterFactory``; it is only called when iteration reaches it.
+    """
+
+    def __init__(
+        self,
+        build_funcs: Collection[Callable[[], AbsIterFactory]],
+        seed: int = 0,
+        shuffle: bool = False,
+    ):
+        """Store the builders and the default shuffle settings.
+
+        Args:
+            build_funcs: Callables that each build one iter-factory.
+            seed: Added to the epoch number to seed the shuffling.
+            shuffle: Shuffle flag used when build_iter() receives ``None``.
+        """
+        assert check_argument_types()
+        self.seed = seed
+        self.shuffle = shuffle
+        self.build_funcs = list(build_funcs)
+
+    def build_iter(self, epoch: int, shuffle: bool = None) -> Iterator:
+        """Yield the items of every sub-factory's iterator, one after another.
+
+        Args:
+            epoch: Epoch number passed on to every sub-factory.
+            shuffle: Overrides the construction-time flag unless ``None``.
+                When true, the order of the builders is shuffled too.
+        """
+        use_shuffle = self.shuffle if shuffle is None else shuffle
+
+        # Work on a copy so that the stored order is never disturbed
+        ordered = list(self.build_funcs)
+        if use_shuffle:
+            rng = np.random.RandomState(epoch + self.seed)
+            rng.shuffle(ordered)
+
+        for index, make_factory in enumerate(ordered):
+            logging.info(f"Building {index}th iter-factory...")
+            factory = make_factory()
+            assert isinstance(factory, AbsIterFactory), type(factory)
+            yield from factory.build_iter(epoch, use_shuffle)
--- a/funasr_local/iterators/sequence_iter_factory.py
+++ b/funasr_local/iterators/sequence_iter_factory.py
@@ -0,0 +1,143 @@
+from typing import Any
+from typing import Sequence
+from typing import Union
+
+import numpy as np
+from torch.utils.data import DataLoader
+from typeguard import check_argument_types
+
+from funasr_local.iterators.abs_iter_factory import AbsIterFactory
+from funasr_local.samplers.abs_sampler import AbsSampler
+
+
+class RawSampler(AbsSampler):
+    """Sampler backed by an explicitly given collection of batches."""
+
+    def __init__(self, batches):
+        # Kept as given; generate() hands out copies
+        self.batches = batches
+
+    def __len__(self):
+        """Return the number of stored batches."""
+        stored = self.batches
+        return len(stored)
+
+    def __iter__(self):
+        """Iterate over the stored batches in their given order."""
+        stored = self.batches
+        return iter(stored)
+
+    def generate(self, seed):
+        """Return a new list of the stored batches; ``seed`` is not used."""
+        stored = self.batches
+        return list(stored)
+
+
+class SequenceIterFactory(AbsIterFactory):
+    """Build iterator for each epoch.
+
+    This class creates a pytorch DataLoader, with two additions:
+    - The random seed is derived from the epoch number, which keeps the
+      batch order reproducible when training is resumed midway.
+    - The number of batches per epoch can be restricted, which controls
+      the interval between training and evaluation.
+
+    """
+
+    def __init__(
+        self,
+        dataset,
+        batches: Union[AbsSampler, Sequence[Sequence[Any]]],
+        num_iters_per_epoch: int = None,
+        seed: int = 0,
+        shuffle: bool = False,
+        num_workers: int = 0,
+        collate_fn=None,
+        pin_memory: bool = False,
+    ):
+        """Store the dataset, the sampler and the DataLoader options.
+
+        Args:
+            dataset: Dataset given to the DataLoader.
+            batches: A sampler, or plain batches that get wrapped in RawSampler.
+            num_iters_per_epoch: Number of batches per epoch; ``None`` means
+                one pass over everything the sampler generates.
+            seed: Added to the epoch index for generation and shuffling.
+            shuffle: Shuffle flag used when build_iter() receives ``None``.
+            num_workers: Given to the DataLoader.
+            collate_fn: Given to the DataLoader unless ``None``.
+            pin_memory: Given to the DataLoader.
+        """
+        assert check_argument_types()
+
+        # Plain batch lists are wrapped so that a sampler is always available
+        self.sampler = batches if isinstance(batches, AbsSampler) else RawSampler(batches)
+
+        self.dataset = dataset
+        self.num_iters_per_epoch = num_iters_per_epoch
+        self.shuffle = shuffle
+        self.seed = seed
+        self.num_workers = num_workers
+        self.collate_fn = collate_fn
+        # https://discuss.pytorch.org/t/what-is-the-disadvantage-of-using-pin-memory/1702
+        self.pin_memory = pin_memory
+
+    def build_iter(self, epoch: int, shuffle: bool = None) -> DataLoader:
+        """Create the DataLoader that serves the batches of ``epoch``.
+
+        Args:
+            epoch: Epoch number used to pick (and seed) the batches.
+            shuffle: Overrides the construction-time flag unless ``None``.
+        """
+        do_shuffle = self.shuffle if shuffle is None else shuffle
+        sampler = self.sampler
+        seed = self.seed
+        per_epoch = self.num_iters_per_epoch
+
+        def batches_of(pass_index):
+            # Batches of one pass over the corpus, shuffled with that pass's seed
+            generated = sampler.generate(pass_index + seed)
+            if do_shuffle:
+                np.random.RandomState(pass_index + seed).shuffle(generated)
+            return generated
+
+        if per_epoch is None:
+            # No restriction: one epoch is exactly one pass
+            batches = batches_of(epoch)
+        else:
+            total = len(sampler)
+            if per_epoch < total:
+                # If corpus size is larger than the num_per_epoch
+                pass_index, offset = divmod(per_epoch * epoch, total)
+                if offset >= per_epoch:
+                    # The whole window lies inside the current pass
+                    batches = batches_of(pass_index)[offset - per_epoch : offset]
+                else:
+                    # The window starts at the tail of the previous pass
+                    previous = batches_of(pass_index - 1)
+                    current = batches_of(pass_index)
+                    batches = previous[offset - per_epoch :] + current[:offset]
+            else:
+                # If corpus size is less than the num_per_epoch
+                pass_index, cursor = divmod(per_epoch * (epoch - 1), total)
+                remaining = per_epoch
+                batches = []
+                current = batches_of(pass_index)
+                while remaining > 0:
+                    taken = current[cursor : cursor + remaining]
+                    batches += taken
+                    if cursor + remaining >= total:
+                        # Pass exhausted: continue at the start of the next one
+                        pass_index += 1
+                        cursor = 0
+                        current = batches_of(pass_index)
+                    else:
+                        cursor += remaining
+                    remaining -= len(taken)
+
+                assert len(batches) == self.num_iters_per_epoch
+
+        # For backward compatibility for pytorch DataLoader
+        loader_options = {}
+        if self.collate_fn is not None:
+            loader_options["collate_fn"] = self.collate_fn
+
+        return DataLoader(
+            dataset=self.dataset,
+            batch_sampler=batches,
+            num_workers=self.num_workers,
+            pin_memory=self.pin_memory,
+            **loader_options,
+        )