add files

2026-02-05 18:09:20 +08:00 · 2025-02-20 12:17:03 +08:00
parent a21dd4555c
commit edd008441b
667 changed files with 473123 additions and 0 deletions
--- a/funasr_local/samplers/__init__.py
+++ b/funasr_local/samplers/__init__.py
--- a/funasr_local/samplers/abs_sampler.py
+++ b/funasr_local/samplers/abs_sampler.py
@@ -0,0 +1,19 @@
+from abc import ABC
+from abc import abstractmethod
+from typing import Iterator
+from typing import Tuple
+
+from torch.utils.data import Sampler
+
+
+class AbsSampler(Sampler, ABC):
+    """Abstract interface shared by the batch samplers of this package.
+
+    Concrete samplers implement ``__len__`` and ``__iter__``; iteration is
+    annotated as yielding tuples of string keys.
+    """
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Return the length of the sampler (must be overridden)."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def __iter__(self) -> Iterator[Tuple[str, ...]]:
+        """Return an iterator over tuples of keys (must be overridden)."""
+        raise NotImplementedError
+
+    def generate(self, seed):
+        """Return everything produced by iterating this sampler as a list.
+
+        ``seed`` is accepted but not used by this default implementation.
+        """
+        batches = [*self]
+        return batches
--- a/funasr_local/samplers/build_batch_sampler.py
+++ b/funasr_local/samplers/build_batch_sampler.py
@@ -0,0 +1,168 @@
+from typing import List
+from typing import Dict
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from funasr_local.samplers.abs_sampler import AbsSampler
+from funasr_local.samplers.folded_batch_sampler import FoldedBatchSampler
+from funasr_local.samplers.length_batch_sampler import LengthBatchSampler
+from funasr_local.samplers.num_elements_batch_sampler import NumElementsBatchSampler
+from funasr_local.samplers.sorted_batch_sampler import SortedBatchSampler
+from funasr_local.samplers.unsorted_batch_sampler import UnsortedBatchSampler
+
+
+# Maps each batch type name accepted by build_batch_sampler() to a
+# human-readable description of the sampler it selects and of the
+# key/shape file format that sampler reads.
+BATCH_TYPES = dict(
+    unsorted="UnsortedBatchSampler has nothing in particular feature and "
+    "just creates mini-batches which has constant batch_size. "
+    "This sampler doesn't require any length "
+    "information for each feature. "
+    "'key_file' is just a text file which describes each sample name."
+    "\n\n"
+    "    utterance_id_a\n"
+    "    utterance_id_b\n"
+    "    utterance_id_c\n"
+    "\n"
+    "The first column is referred, so 'shape file' can be used, too.\n\n"
+    "    utterance_id_a 100,80\n"
+    "    utterance_id_b 400,80\n"
+    "    utterance_id_c 512,80\n",
+    sorted="SortedBatchSampler sorts samples by the length of the first input "
+    " in order to make each sample in a mini-batch has close length. "
+    "This sampler requires a text file which describes the length for each sample "
+    "\n\n"
+    "    utterance_id_a 1000\n"
+    "    utterance_id_b 1453\n"
+    "    utterance_id_c 1241\n"
+    "\n"
+    "The first element of feature dimensions is referred, "
+    "so 'shape_file' can be also used.\n\n"
+    "    utterance_id_a 1000,80\n"
+    "    utterance_id_b 1453,80\n"
+    "    utterance_id_c 1241,80\n",
+    # The formula mirrors FoldedBatchSampler: int(batch_size / (1 + factor)),
+    # where factor is computed from the first sample of each mini-batch.
+    folded="FoldedBatchSampler supports variable batch_size. "
+    "The batch_size is decided by\n"
+    "    batch_size = base_batch_size // (1 + L // fold_length)\n"
+    "L is referred to the length of the first (shortest) sample "
+    "in the mini-batch. "
+    "This sampler requires length information as same as SortedBatchSampler\n",
+    length="LengthBatchSampler supports variable batch_size. "
+    "This sampler makes mini-batches which have same number of 'bins' as possible "
+    "counting by the total lengths of each feature in the mini-batch. "
+    "This sampler requires a text file which describes the length for each sample. "
+    "\n\n"
+    "    utterance_id_a 1000\n"
+    "    utterance_id_b 1453\n"
+    "    utterance_id_c 1241\n"
+    "\n"
+    "The first element of feature dimensions is referred, "
+    "so 'shape_file' can be also used.\n\n"
+    "    utterance_id_a 1000,80\n"
+    "    utterance_id_b 1453,80\n"
+    "    utterance_id_c 1241,80\n",
+    numel="NumElementsBatchSampler supports variable batch_size. "
+    "Just like LengthBatchSampler, this sampler makes mini-batches"
+    " which have same number of 'bins' as possible "
+    "counting by the total number of elements of each feature "
+    "instead of the length. "
+    "Thus this sampler requires the full information of the dimension of the features. "
+    "\n\n"
+    "    utterance_id_a 1000,80\n"
+    "    utterance_id_b 1453,80\n"
+    "    utterance_id_c 1241,80\n",
+)
+
+
+def build_batch_sampler(
+    type: str,
+    batch_size: int,
+    batch_bins: int,
+    shape_files: Union[Tuple[str, ...], List[str], Dict],
+    sort_in_batch: str = "descending",
+    sort_batch: str = "ascending",
+    drop_last: bool = False,
+    min_batch_size: int = 1,
+    fold_lengths: Sequence[int] = (),
+    padding: bool = True,
+    utt2category_file: str = None,
+) -> AbsSampler:
+    """Helper function to instantiate BatchSampler.
+
+    Args:
+        type: mini-batch type. "unsorted", "sorted", "folded", "numel", or, "length"
+        batch_size: The mini-batch size. Used for "unsorted", "sorted", "folded" mode
+        batch_bins: Used for "numel" and "length" mode
+        shape_files: Text files describing the length and dimension
+            of each features. e.g. uttA 1330,80
+            Only the first entry is used for "unsorted" and "sorted" mode.
+        sort_in_batch: "descending" or "ascending".
+            Used for every mode except "unsorted"
+        sort_batch: "ascending" or "descending".
+            Used for every mode except "unsorted"
+        drop_last: Forwarded to the selected sampler
+        min_batch_size:  Used for "numel", "length" or "folded" mode
+        fold_lengths: Used for "folded" mode. Must have as many entries
+            as shape_files
+        padding: Whether sequences are input as a padded tensor or not.
+            used for "numel" and "length" mode
+        utt2category_file: Optional file mapping each sample to a category.
+            Used for "unsorted" and "folded" mode
+
+    Returns:
+        The sampler instance selected by ``type``.
+
+    Raises:
+        ValueError: If no shape file is given, if ``type`` is not supported,
+            or if "folded" mode gets a different number of fold_lengths
+            and shape_files.
+    """
+    assert check_argument_types()
+    if len(shape_files) == 0:
+        raise ValueError("No shape file are given")
+
+    if type == "unsorted":
+        # Forward utt2category_file as well: UnsortedBatchSampler accepts it,
+        # and dropping it here silently ignored the caller's request.
+        retval = UnsortedBatchSampler(
+            batch_size=batch_size,
+            key_file=shape_files[0],
+            drop_last=drop_last,
+            utt2category_file=utt2category_file,
+        )
+
+    elif type == "sorted":
+        retval = SortedBatchSampler(
+            batch_size=batch_size,
+            shape_file=shape_files[0],
+            sort_in_batch=sort_in_batch,
+            sort_batch=sort_batch,
+            drop_last=drop_last,
+        )
+
+    elif type == "folded":
+        if len(fold_lengths) != len(shape_files):
+            raise ValueError(
+                f"The number of fold_lengths must be equal to "
+                f"the number of shape_files: "
+                f"{len(fold_lengths)} != {len(shape_files)}"
+            )
+        retval = FoldedBatchSampler(
+            batch_size=batch_size,
+            shape_files=shape_files,
+            fold_lengths=fold_lengths,
+            sort_in_batch=sort_in_batch,
+            sort_batch=sort_batch,
+            drop_last=drop_last,
+            min_batch_size=min_batch_size,
+            utt2category_file=utt2category_file,
+        )
+
+    elif type == "numel":
+        retval = NumElementsBatchSampler(
+            batch_bins=batch_bins,
+            shape_files=shape_files,
+            sort_in_batch=sort_in_batch,
+            sort_batch=sort_batch,
+            drop_last=drop_last,
+            padding=padding,
+            min_batch_size=min_batch_size,
+        )
+
+    elif type == "length":
+        retval = LengthBatchSampler(
+            batch_bins=batch_bins,
+            shape_files=shape_files,
+            sort_in_batch=sort_in_batch,
+            sort_batch=sort_batch,
+            drop_last=drop_last,
+            padding=padding,
+            min_batch_size=min_batch_size,
+        )
+
+    else:
+        raise ValueError(f"Not supported: {type}")
+    assert check_return_type(retval)
+    return retval
--- a/funasr_local/samplers/folded_batch_sampler.py
+++ b/funasr_local/samplers/folded_batch_sampler.py
@@ -0,0 +1,156 @@
+from typing import Iterator
+from typing import List
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+from typeguard import check_argument_types
+
+from funasr_local.fileio.read_text import load_num_sequence_text
+from funasr_local.fileio.read_text import read_2column_text
+from funasr_local.samplers.abs_sampler import AbsSampler
+
+
+class FoldedBatchSampler(AbsSampler):
+    """Batch sampler whose batch size shrinks as the samples get longer.
+
+    Samples are sorted in ascending order of the length found in the first
+    shape file and cut into consecutive mini-batches. The size of each
+    mini-batch is ``max(min_batch_size, int(batch_size / (1 + factor)))``
+    where ``factor`` is the largest ``int(length / fold_length)`` over all
+    shape files, evaluated for the first sample of that mini-batch.
+
+    Args:
+        batch_size: Base mini-batch size. Must be positive.
+        shape_files: Shape files, one per feature. All of them must
+            contain the same set of keys.
+        fold_lengths: One fold length per shape file.
+        min_batch_size: Lower bound applied when deciding a batch size.
+            A final mini-batch smaller than this is merged into the others.
+        sort_in_batch: "descending" or "ascending". Order of the samples
+            inside each mini-batch.
+        sort_batch: "ascending" or "descending". Order of the mini-batches
+            (applied per category).
+        drop_last: If True, a final incomplete mini-batch of a category is
+            dropped, but at least one mini-batch is kept per category.
+        utt2category_file: Optional two-column file mapping each key to a
+            category. Mini-batches never mix categories.
+
+    Raises:
+        ValueError: If sort_batch or sort_in_batch has an unsupported value.
+        RuntimeError: If the keys of the given files are mismatched or
+            the first shape file has no entry.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        shape_files: Union[Tuple[str, ...], List[str]],
+        fold_lengths: Sequence[int],
+        min_batch_size: int = 1,
+        sort_in_batch: str = "descending",
+        sort_batch: str = "ascending",
+        drop_last: bool = False,
+        utt2category_file: str = None,
+    ):
+        assert check_argument_types()
+        assert batch_size > 0
+        if sort_batch != "ascending" and sort_batch != "descending":
+            raise ValueError(
+                f"sort_batch must be ascending or descending: {sort_batch}"
+            )
+        if sort_in_batch != "descending" and sort_in_batch != "ascending":
+            raise ValueError(
+                f"sort_in_batch must be ascending or descending: {sort_in_batch}"
+            )
+
+        self.batch_size = batch_size
+        self.shape_files = shape_files
+        self.sort_in_batch = sort_in_batch
+        self.sort_batch = sort_batch
+        self.drop_last = drop_last
+
+        # utt2shape: (Length, ...)
+        #    uttA 100,...
+        #    uttB 201,...
+        utt2shapes = [
+            load_num_sequence_text(s, loader_type="csv_int") for s in shape_files
+        ]
+
+        first_utt2shape = utt2shapes[0]
+        for s, d in zip(shape_files, utt2shapes):
+            if set(d) != set(first_utt2shape):
+                raise RuntimeError(
+                    f"keys are mismatched between {s} != {shape_files[0]}"
+                )
+
+        # Sort samples in ascending order
+        # (shape order should be like (Length, Dim))
+        keys = sorted(first_utt2shape, key=lambda k: first_utt2shape[k][0])
+        if len(keys) == 0:
+            raise RuntimeError(f"0 lines found: {shape_files[0]}")
+
+        # Group the (already sorted) keys by category so that
+        # a mini-batch never mixes categories.
+        category2utt = {}
+        if utt2category_file is not None:
+            utt2category = read_2column_text(utt2category_file)
+            if set(utt2category) != set(first_utt2shape):
+                raise RuntimeError(
+                    "keys are mismatched between "
+                    f"{utt2category_file} != {shape_files[0]}"
+                )
+            for k in keys:
+                category2utt.setdefault(utt2category[k], []).append(k)
+        else:
+            category2utt["default_category"] = keys
+
+        self.batch_list = []
+        for category_keys in category2utt.values():
+            # Decide batch-sizes
+            start = 0
+            batch_sizes = []
+            while True:
+                k = category_keys[start]
+                factor = max(int(d[k][0] / m) for d, m in zip(utt2shapes, fold_lengths))
+                bs = max(min_batch_size, int(batch_size / (1 + factor)))
+                # Drop the incomplete tail, but never end up with 0 batches
+                # for this category. The check must look at this category's
+                # own batch_sizes: self.batch_list is only extended after
+                # the category is finished, so testing it made drop_last
+                # a no-op for the first (or only) category.
+                if (
+                    self.drop_last
+                    and start + bs > len(category_keys)
+                    and len(batch_sizes) > 0
+                ):
+                    break
+
+                bs = min(len(category_keys) - start, bs)
+                batch_sizes.append(bs)
+                start += bs
+                if start >= len(category_keys):
+                    break
+
+            if len(batch_sizes) == 0:
+                # Maybe we can't reach here
+                raise RuntimeError("0 batches")
+
+            # If the last batch-size is smaller than minimum batch_size,
+            # the samples are redistributed to the other mini-batches
+            if len(batch_sizes) > 1 and batch_sizes[-1] < min_batch_size:
+                for i in range(batch_sizes.pop(-1)):
+                    # Cycle over the remaining batches from the back.
+                    # The offset must be -1: with -2 the index runs out of
+                    # range (always when a single batch remains).
+                    batch_sizes[-(i % len(batch_sizes)) - 1] += 1
+
+            if not self.drop_last:
+                # Bug check
+                assert sum(batch_sizes) == len(
+                    category_keys
+                ), f"{sum(batch_sizes)} != {len(category_keys)}"
+
+            # Set mini-batch
+            cur_batch_list = []
+            start = 0
+            for bs in batch_sizes:
+                assert len(category_keys) >= start + bs, "Bug"
+                minibatch_keys = category_keys[start : start + bs]
+                start += bs
+                # Keys are already in ascending order; sort_in_batch was
+                # validated above, so only "descending" needs any work.
+                if sort_in_batch == "descending":
+                    minibatch_keys.reverse()
+                cur_batch_list.append(tuple(minibatch_keys))
+
+            # sort_batch was validated above as well.
+            if sort_batch == "descending":
+                cur_batch_list.reverse()
+            self.batch_list.extend(cur_batch_list)
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"N-batch={len(self)}, "
+            f"batch_size={self.batch_size}, "
+            f"shape_files={self.shape_files}, "
+            f"sort_in_batch={self.sort_in_batch}, "
+            f"sort_batch={self.sort_batch})"
+        )
+
+    def __len__(self):
+        """Return the number of mini-batches."""
+        return len(self.batch_list)
+
+    def __iter__(self) -> Iterator[Tuple[str, ...]]:
+        """Iterate over the precomputed mini-batches (tuples of keys)."""
+        return iter(self.batch_list)
--- a/funasr_local/samplers/length_batch_sampler.py
+++ b/funasr_local/samplers/length_batch_sampler.py
@@ -0,0 +1,147 @@
+from typing import Iterator
+from typing import List
+from typing import Dict
+from typing import Tuple
+from typing import Union
+
+from typeguard import check_argument_types
+
+from funasr_local.fileio.read_text import load_num_sequence_text
+from funasr_local.samplers.abs_sampler import AbsSampler
+
+
+class LengthBatchSampler(AbsSampler):
+    """Batch sampler that fills mini-batches up to a budget of length bins.
+
+    Samples are sorted in ascending order of the length found in the first
+    shape source and appended to the current mini-batch one by one. The
+    mini-batch is closed as soon as its bin count exceeds ``batch_bins``
+    and it holds at least ``min_batch_size`` samples.
+
+    Args:
+        batch_bins: Bin budget of a mini-batch. Must be positive.
+        shape_files: Shape files, one per feature, or a single
+            already-loaded dict mapping each key to its shape.
+        min_batch_size: Minimum number of samples in a mini-batch.
+            A final mini-batch smaller than this is merged into the others.
+        sort_in_batch: "descending" or "ascending". Order of the samples
+            inside each mini-batch.
+        sort_batch: "ascending" or "descending". Order of the mini-batches.
+        drop_last: If True, the final unfinished mini-batch is dropped
+            unless it would be the only one.
+        padding: If True, bins are counted as
+            ``batch size x length of the newest (longest) sample``;
+            otherwise as the sum of the lengths.
+
+    Raises:
+        ValueError: If sort_batch or sort_in_batch has an unsupported value.
+        RuntimeError: If the keys of the shape files are mismatched or
+            no sample is found.
+    """
+
+    def __init__(
+        self,
+        batch_bins: int,
+        shape_files: Union[Tuple[str, ...], List[str], Dict],
+        min_batch_size: int = 1,
+        sort_in_batch: str = "descending",
+        sort_batch: str = "ascending",
+        drop_last: bool = False,
+        padding: bool = True,
+    ):
+        assert check_argument_types()
+        assert batch_bins > 0
+        if sort_batch != "ascending" and sort_batch != "descending":
+            raise ValueError(
+                f"sort_batch must be ascending or descending: {sort_batch}"
+            )
+        if sort_in_batch != "descending" and sort_in_batch != "ascending":
+            raise ValueError(
+                f"sort_in_batch must be ascending or descending: {sort_in_batch}"
+            )
+
+        self.batch_bins = batch_bins
+        self.shape_files = shape_files
+        self.sort_in_batch = sort_in_batch
+        self.sort_batch = sort_batch
+        self.drop_last = drop_last
+
+        # utt2shape: (Length, ...)
+        #    uttA 100,...
+        #    uttB 201,...
+        if isinstance(shape_files, dict):
+            utt2shapes = [shape_files]
+            # A dict has no file name and cannot be indexed with [0];
+            # use a placeholder so that error messages can still be built.
+            first_source = "shape_files (dict)"
+        else:
+            utt2shapes = [
+                load_num_sequence_text(s, loader_type="csv_int") for s in shape_files
+            ]
+            first_source = shape_files[0]
+
+        first_utt2shape = utt2shapes[0]
+        for s, d in zip(shape_files, utt2shapes):
+            if set(d) != set(first_utt2shape):
+                raise RuntimeError(
+                    f"keys are mismatched between {s} != {first_source}"
+                )
+
+        # Sort samples in ascending order
+        # (shape order should be like (Length, Dim))
+        keys = sorted(first_utt2shape, key=lambda k: first_utt2shape[k][0])
+        if len(keys) == 0:
+            raise RuntimeError(f"0 lines found: {first_source}")
+
+        # Decide batch-sizes
+        batch_sizes = []
+        current_batch_keys = []
+        # Running sum of lengths of the current mini-batch (padding=False);
+        # kept incrementally instead of re-summing the whole batch per key.
+        unpadded_bins = 0
+        for key in keys:
+            current_batch_keys.append(key)
+            # shape: (Length, dim1, dim2, ...)
+            if padding:
+                # bins = bs x max_length
+                # (keys are ascending, so the newest key is the longest)
+                bins = sum(len(current_batch_keys) * sh[key][0] for sh in utt2shapes)
+            else:
+                # bins = sum of lengths
+                unpadded_bins += sum(sh[key][0] for sh in utt2shapes)
+                bins = unpadded_bins
+
+            if bins > batch_bins and len(current_batch_keys) >= min_batch_size:
+                batch_sizes.append(len(current_batch_keys))
+                current_batch_keys = []
+                unpadded_bins = 0
+
+        # Keep the unfinished tail unless drop_last is set;
+        # it is kept anyway if there would be no batch at all.
+        if len(current_batch_keys) != 0 and (
+            not self.drop_last or len(batch_sizes) == 0
+        ):
+            batch_sizes.append(len(current_batch_keys))
+
+        if len(batch_sizes) == 0:
+            # Maybe we can't reach here
+            raise RuntimeError("0 batches")
+
+        # If the last batch-size is smaller than minimum batch_size,
+        # the samples are redistributed to the other mini-batches
+        if len(batch_sizes) > 1 and batch_sizes[-1] < min_batch_size:
+            for i in range(batch_sizes.pop(-1)):
+                batch_sizes[-(i % len(batch_sizes)) - 1] += 1
+
+        if not self.drop_last:
+            # Bug check
+            assert sum(batch_sizes) == len(keys), f"{sum(batch_sizes)} != {len(keys)}"
+
+        # Set mini-batch: cut the sorted keys into consecutive chunks.
+        # Every batch size is >= 1 and their sum never exceeds len(keys).
+        self.batch_list = []
+        start = 0
+        for bs in batch_sizes:
+            minibatch_keys = keys[start : start + bs]
+            start += bs
+            # Keys are already in ascending order; sort_in_batch was
+            # validated above, so only "descending" needs any work.
+            if sort_in_batch == "descending":
+                minibatch_keys.reverse()
+            self.batch_list.append(tuple(minibatch_keys))
+
+        # sort_batch was validated above as well.
+        if sort_batch == "descending":
+            self.batch_list.reverse()
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"N-batch={len(self)}, "
+            f"batch_bins={self.batch_bins}, "
+            f"sort_in_batch={self.sort_in_batch}, "
+            f"sort_batch={self.sort_batch})"
+        )
+
+    def __len__(self):
+        """Return the number of mini-batches."""
+        return len(self.batch_list)
+
+    def __iter__(self) -> Iterator[Tuple[str, ...]]:
+        """Iterate over the precomputed mini-batches (tuples of keys)."""
+        return iter(self.batch_list)
--- a/funasr_local/samplers/num_elements_batch_sampler.py
+++ b/funasr_local/samplers/num_elements_batch_sampler.py
@@ -0,0 +1,160 @@
+from typing import Iterator
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+from typeguard import check_argument_types
+
+from funasr_local.fileio.read_text import load_num_sequence_text
+from funasr_local.samplers.abs_sampler import AbsSampler
+
+
+class NumElementsBatchSampler(AbsSampler):
+    """Batch sampler that fills mini-batches up to a budget of elements.
+
+    Samples are sorted in ascending order of the length found in the first
+    shape file and appended to the current mini-batch one by one. The
+    mini-batch is closed as soon as its bin count exceeds ``batch_bins``
+    and it holds at least ``min_batch_size`` samples.
+
+    Args:
+        batch_bins: Bin budget of a mini-batch. Must be positive.
+        shape_files: Shape files, one per feature. All of them must
+            contain the same set of keys.
+        min_batch_size: Minimum number of samples in a mini-batch.
+            A final mini-batch smaller than this is merged into the others.
+        sort_in_batch: "descending" or "ascending". Order of the samples
+            inside each mini-batch.
+        sort_batch: "ascending" or "descending". Order of the mini-batches.
+        drop_last: If True, the final unfinished mini-batch is dropped
+            unless it would be the only one.
+        padding: If True, bins are counted as ``batch size x length of the
+            newest (longest) sample x feature size`` and every sample must
+            share the same feature dimensions; otherwise bins are the sum
+            of the products of the full shapes.
+
+    Raises:
+        ValueError: If sort_batch or sort_in_batch has an unsupported value.
+        RuntimeError: If the keys of the shape files are mismatched, no
+            sample is found, or padding=True and the feature dimensions
+            differ between samples.
+    """
+
+    def __init__(
+        self,
+        batch_bins: int,
+        shape_files: Union[Tuple[str, ...], List[str]],
+        min_batch_size: int = 1,
+        sort_in_batch: str = "descending",
+        sort_batch: str = "ascending",
+        drop_last: bool = False,
+        padding: bool = True,
+    ):
+        assert check_argument_types()
+        assert batch_bins > 0
+        if sort_batch != "ascending" and sort_batch != "descending":
+            raise ValueError(
+                f"sort_batch must be ascending or descending: {sort_batch}"
+            )
+        if sort_in_batch != "descending" and sort_in_batch != "ascending":
+            raise ValueError(
+                f"sort_in_batch must be ascending or descending: {sort_in_batch}"
+            )
+
+        self.batch_bins = batch_bins
+        self.shape_files = shape_files
+        self.sort_in_batch = sort_in_batch
+        self.sort_batch = sort_batch
+        self.drop_last = drop_last
+
+        # utt2shape: (Length, ...)
+        #    uttA 100,...
+        #    uttB 201,...
+        utt2shapes = [
+            load_num_sequence_text(s, loader_type="csv_int") for s in shape_files
+        ]
+
+        first_utt2shape = utt2shapes[0]
+        for s, d in zip(shape_files, utt2shapes):
+            if set(d) != set(first_utt2shape):
+                raise RuntimeError(
+                    f"keys are mismatched between {s} != {shape_files[0]}"
+                )
+
+        # Sort samples in ascending order
+        # (shape order should be like (Length, Dim))
+        keys = sorted(first_utt2shape, key=lambda k: first_utt2shape[k][0])
+        if len(keys) == 0:
+            raise RuntimeError(f"0 lines found: {shape_files[0]}")
+        if padding:
+            # If padding case, the feat-dim must be same over whole corpus,
+            # therefore the first sample is referred
+            feat_dims = [np.prod(d[keys[0]][1:]) for d in utt2shapes]
+            # Reference dimensions are loop-invariant: compute them once.
+            ref_dims = [tuple(d[keys[0]][1:]) for d in utt2shapes]
+        else:
+            feat_dims = None
+            ref_dims = None
+
+        # Decide batch-sizes
+        batch_sizes = []
+        current_batch_keys = []
+        # Running number of elements of the current mini-batch
+        # (padding=False); kept incrementally instead of re-summing the
+        # whole batch for every key.
+        unpadded_bins = 0
+        for key in keys:
+            current_batch_keys.append(key)
+            # shape: (Length, dim1, dim2, ...)
+            if padding:
+                for d, s, ref in zip(utt2shapes, shape_files, ref_dims):
+                    if tuple(d[key][1:]) != ref:
+                        raise RuntimeError(
+                            "If padding=True, the "
+                            f"feature dimension must be unified: {s}",
+                        )
+                # bins = bs x max_length x feature size
+                # (keys are ascending, so the newest key is the longest)
+                bins = sum(
+                    len(current_batch_keys) * sh[key][0] * d
+                    for sh, d in zip(utt2shapes, feat_dims)
+                )
+            else:
+                unpadded_bins += sum(np.prod(d[key]) for d in utt2shapes)
+                bins = unpadded_bins
+
+            if bins > batch_bins and len(current_batch_keys) >= min_batch_size:
+                batch_sizes.append(len(current_batch_keys))
+                current_batch_keys = []
+                unpadded_bins = 0
+
+        # Keep the unfinished tail unless drop_last is set;
+        # it is kept anyway if there would be no batch at all.
+        if len(current_batch_keys) != 0 and (
+            not self.drop_last or len(batch_sizes) == 0
+        ):
+            batch_sizes.append(len(current_batch_keys))
+
+        if len(batch_sizes) == 0:
+            # Maybe we can't reach here
+            raise RuntimeError("0 batches")
+
+        # If the last batch-size is smaller than minimum batch_size,
+        # the samples are redistributed to the other mini-batches
+        if len(batch_sizes) > 1 and batch_sizes[-1] < min_batch_size:
+            for i in range(batch_sizes.pop(-1)):
+                batch_sizes[-(i % len(batch_sizes)) - 1] += 1
+
+        if not self.drop_last:
+            # Bug check
+            assert sum(batch_sizes) == len(keys), f"{sum(batch_sizes)} != {len(keys)}"
+
+        # Set mini-batch: cut the sorted keys into consecutive chunks.
+        # Every batch size is >= 1 and their sum never exceeds len(keys).
+        self.batch_list = []
+        start = 0
+        for bs in batch_sizes:
+            minibatch_keys = keys[start : start + bs]
+            start += bs
+            # Keys are already in ascending order; sort_in_batch was
+            # validated above, so only "descending" needs any work.
+            if sort_in_batch == "descending":
+                minibatch_keys.reverse()
+            self.batch_list.append(tuple(minibatch_keys))
+
+        # sort_batch was validated above as well.
+        if sort_batch == "descending":
+            self.batch_list.reverse()
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"N-batch={len(self)}, "
+            f"batch_bins={self.batch_bins}, "
+            f"sort_in_batch={self.sort_in_batch}, "
+            f"sort_batch={self.sort_batch})"
+        )
+
+    def __len__(self):
+        """Return the number of mini-batches."""
+        return len(self.batch_list)
+
+    def __iter__(self) -> Iterator[Tuple[str, ...]]:
+        """Iterate over the precomputed mini-batches (tuples of keys)."""
+        return iter(self.batch_list)
--- a/funasr_local/samplers/sorted_batch_sampler.py
+++ b/funasr_local/samplers/sorted_batch_sampler.py
@@ -0,0 +1,95 @@
+import logging
+from typing import Iterator
+from typing import Tuple
+
+from typeguard import check_argument_types
+
+from funasr_local.fileio.read_text import load_num_sequence_text
+from funasr_local.samplers.abs_sampler import AbsSampler
+
+
+class SortedBatchSampler(AbsSampler):
+    """BatchSampler with sorted samples by length.
+
+    Args:
+        batch_size: Number of samples per mini-batch. Must be positive.
+        shape_file: Text file giving the shape of each sample; the first
+            element of the shape is used as the length.
+        sort_in_batch: 'descending' or 'ascending'. Order of the samples
+            inside each mini-batch.
+        sort_batch: 'ascending' or 'descending'. Order of the mini-batches.
+        drop_last: If True, every mini-batch holds exactly batch_size
+            samples (or all samples if there are fewer than batch_size) and
+            the remaining samples are dropped. Otherwise all samples are
+            split as evenly as possible.
+
+    Raises:
+        ValueError: If sort_in_batch or sort_batch has an unsupported value.
+        RuntimeError: If shape_file has no entry.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        shape_file: str,
+        sort_in_batch: str = "descending",
+        sort_batch: str = "ascending",
+        drop_last: bool = False,
+    ):
+        assert check_argument_types()
+        assert batch_size > 0
+        self.batch_size = batch_size
+        self.shape_file = shape_file
+        self.sort_in_batch = sort_in_batch
+        self.sort_batch = sort_batch
+        self.drop_last = drop_last
+
+        # utt2shape: (Length, ...)
+        #    uttA 100,...
+        #    uttB 201,...
+        utt2shape = load_num_sequence_text(shape_file, loader_type="csv_int")
+        if sort_in_batch == "descending":
+            # Sort samples in descending order (required by RNN)
+            keys = sorted(utt2shape, key=lambda k: -utt2shape[k][0])
+        elif sort_in_batch == "ascending":
+            # Sort samples in ascending order
+            keys = sorted(utt2shape, key=lambda k: utt2shape[k][0])
+        else:
+            # None is not an accepted value, so the message must not offer it.
+            raise ValueError(
+                f"sort_in_batch must be ascending or descending: {sort_in_batch}"
+            )
+        if len(keys) == 0:
+            raise RuntimeError(f"0 lines found: {shape_file}")
+
+        # Apply max(, 1) to avoid 0-batches
+        N = max(len(keys) // batch_size, 1)
+        if not self.drop_last:
+            # Split keys evenly as possible as. Note that If N != 1,
+            # the these batches always have size of batch_size at minimum.
+            # Each batch is a tuple, matching __iter__'s annotation and
+            # the drop_last branch below.
+            self.batch_list = [
+                tuple(keys[i * len(keys) // N : (i + 1) * len(keys) // N])
+                for i in range(N)
+            ]
+        else:
+            self.batch_list = [
+                tuple(keys[i * batch_size : (i + 1) * batch_size]) for i in range(N)
+            ]
+
+        # The batches currently follow the order of sort_in_batch;
+        # flip them when a different batch order is requested.
+        if sort_in_batch != sort_batch:
+            if sort_batch not in ("ascending", "descending"):
+                raise ValueError(
+                    f"sort_batch must be ascending or descending: {sort_batch}"
+                )
+            self.batch_list.reverse()
+
+        if len(self.batch_list) == 0:
+            # N >= 1, so this should be unreachable; kept as a safety net.
+            raise RuntimeError("0 batches")
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"N-batch={len(self)}, "
+            f"batch_size={self.batch_size}, "
+            f"shape_file={self.shape_file}, "
+            f"sort_in_batch={self.sort_in_batch}, "
+            f"sort_batch={self.sort_batch})"
+        )
+
+    def __len__(self):
+        """Return the number of mini-batches."""
+        return len(self.batch_list)
+
+    def __iter__(self) -> Iterator[Tuple[str, ...]]:
+        """Iterate over the precomputed mini-batches (tuples of keys)."""
+        return iter(self.batch_list)
--- a/funasr_local/samplers/unsorted_batch_sampler.py
+++ b/funasr_local/samplers/unsorted_batch_sampler.py
@@ -0,0 +1,91 @@
+import logging
+from typing import Iterator
+from typing import Tuple
+
+from typeguard import check_argument_types
+
+from funasr_local.fileio.read_text import read_2column_text
+from funasr_local.samplers.abs_sampler import AbsSampler
+
+
+class UnsortedBatchSampler(AbsSampler):
+    """BatchSampler with constant batch-size.
+
+    Any sorting is not done in this class,
+    so no length information is required,
+    This class is convenient for decoding mode,
+    or not seq2seq learning e.g. classification.
+
+    Args:
+        batch_size: Number of samples per mini-batch. Must be positive.
+        key_file: Text file whose first column lists the sample keys.
+        drop_last: If True, every mini-batch of a category holds exactly
+            batch_size samples (or all of them if the category has fewer)
+            and the remaining samples are dropped. Otherwise the samples
+            of a category are split as evenly as possible.
+        utt2category_file: Optional two-column file mapping each key to a
+            category. Mini-batches never mix categories.
+
+    Raises:
+        RuntimeError: If key_file has no entry or its keys differ from
+            those of utt2category_file.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        key_file: str,
+        drop_last: bool = False,
+        utt2category_file: str = None,
+    ):
+        assert check_argument_types()
+        assert batch_size > 0
+        self.batch_size = batch_size
+        self.key_file = key_file
+        self.drop_last = drop_last
+
+        # utt2shape:
+        #    uttA <anything is o.k>
+        #    uttB <anything is o.k>
+        utt2any = read_2column_text(key_file)
+        if len(utt2any) == 0:
+            logging.warning(f"{key_file} is empty")
+        # In this case the, the first column in only used
+        keys = list(utt2any)
+        if len(keys) == 0:
+            raise RuntimeError(f"0 lines found: {key_file}")
+
+        category2utt = {}
+        if utt2category_file is not None:
+            utt2category = read_2column_text(utt2category_file)
+            if set(utt2category) != set(keys):
+                raise RuntimeError(
+                    f"keys are mismatched between {utt2category_file} != {key_file}"
+                )
+            for k, v in utt2category.items():
+                category2utt.setdefault(v, []).append(k)
+        else:
+            category2utt["default_category"] = keys
+
+        self.batch_list = []
+        for category_keys in category2utt.values():
+            n_keys = len(category_keys)
+            # Apply max(, 1) to avoid 0-batches
+            N = max(n_keys // batch_size, 1)
+            if not self.drop_last:
+                # Split keys evenly as possible as. Note that If N != 1,
+                # the these batches always have size of batch_size at minimum.
+                # The boundaries must be derived from this category's own
+                # size: using the total number of keys put every sample of
+                # a small category into the first batch and left the rest
+                # empty.
+                cur_batch_list = [
+                    tuple(category_keys[i * n_keys // N : (i + 1) * n_keys // N])
+                    for i in range(N)
+                ]
+            else:
+                cur_batch_list = [
+                    tuple(category_keys[i * batch_size : (i + 1) * batch_size])
+                    for i in range(N)
+                ]
+            self.batch_list.extend(cur_batch_list)
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"N-batch={len(self)}, "
+            f"batch_size={self.batch_size}, "
+            f"key_file={self.key_file})"
+        )
+
+    def __len__(self):
+        """Return the number of mini-batches."""
+        return len(self.batch_list)
+
+    def __iter__(self) -> Iterator[Tuple[str, ...]]:
+        """Iterate over the precomputed mini-batches (tuples of keys)."""
+        return iter(self.batch_list)