add files

2026-02-05 18:09:20 +08:00 · 2025-02-20 12:17:03 +08:00
parent a21dd4555c
commit edd008441b
667 changed files with 473123 additions and 0 deletions
--- a/funasr_local/fileio/__init__.py
+++ b/funasr_local/fileio/__init__.py
--- a/funasr_local/fileio/datadir_writer.py
+++ b/funasr_local/fileio/datadir_writer.py
@@ -0,0 +1,78 @@
+from pathlib import Path
+from typing import Union
+import warnings
+
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+
+class DatadirWriter:
+    """Writer that lays out a Kaldi-style data directory.
+
+    A writer acts as a directory node once it has been indexed
+    (``writer[name]`` returns a child writer for ``path / name``) or as a file
+    node once it has been assigned to (``writer[key] = value`` appends the line
+    ``"<key> <value>"`` to ``path``).  One writer cannot play both roles.
+
+    Examples:
+        >>> with DatadirWriter("output") as writer:
+        ...     # Child writer for "output/sub.txt"; the file itself is
+        ...     # opened by the first assignment below
+        ...     subwriter = writer["sub.txt"]
+        ...     # Writes the line "uttidA some/where/a.wav"
+        ...     subwriter["uttidA"] = "some/where/a.wav"
+        ...     subwriter["uttidB"] = "some/where/b.wav"
+
+    """
+
+    def __init__(self, p: Union[Path, str]):
+        assert check_argument_types()
+        self.path = Path(p)
+        # Open text handle once this node is used as a file, else None.
+        self.fd = None
+        # Keys already written through this node (used for duplicate checks).
+        self.keys = set()
+        # Child writers by name; the attribute spelling is kept as it is
+        # because it is visible to code outside this class.
+        self.chilidren = {}
+        self.has_children = False
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def __getitem__(self, key: str) -> "DatadirWriter":
+        """Return the child writer for ``self.path / key``, creating it once.
+
+        Raises:
+            RuntimeError: If this writer has already been used as a file.
+        """
+        assert check_argument_types()
+        if self.fd is not None:
+            raise RuntimeError("This writer points out a file")
+
+        try:
+            child = self.chilidren[key]
+        except KeyError:
+            child = self.chilidren[key] = DatadirWriter(self.path / key)
+            self.has_children = True
+
+        assert check_return_type(child)
+        return child
+
+    def __setitem__(self, key: str, value: str):
+        """Append the line ``"<key> <value>"`` to the file at ``self.path``.
+
+        The file (and its parent directories) is created on the first call.
+        A repeated key only triggers a warning; the line is still written.
+
+        Raises:
+            RuntimeError: If this writer has already been used as a directory.
+        """
+        assert check_argument_types()
+        if self.has_children:
+            raise RuntimeError("This writer points out a directory")
+        if key in self.keys:
+            warnings.warn(f"Duplicated: {key}")
+
+        out = self.fd
+        if out is None:
+            self.path.parent.mkdir(parents=True, exist_ok=True)
+            out = self.fd = self.path.open("w", encoding="utf-8")
+
+        self.keys.add(key)
+        out.write(f"{key} {value}\n")
+        # Flushed per entry so each line reaches the file immediately.
+        out.flush()
+
+    def close(self):
+        """Close this node.
+
+        A file node closes its handle.  A directory node closes every child
+        in insertion order and warns whenever two consecutive children did
+        not receive the same set of keys.
+        """
+        if not self.has_children:
+            if self.fd is not None:
+                self.fd.close()
+            return
+
+        previous = None
+        for current in self.chilidren.values():
+            current.close()
+            if previous is not None and previous.keys != current.keys:
+                warnings.warn(
+                    f"Ids are mismatching between {previous.path} and {current.path}"
+                )
+            previous = current
--- a/funasr_local/fileio/npy_scp.py
+++ b/funasr_local/fileio/npy_scp.py
@@ -0,0 +1,97 @@
+import collections.abc
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+from typeguard import check_argument_types
+
+from funasr_local.fileio.read_text import read_2column_text
+
+
+class NpyScpWriter:
+    """Save numpy arrays as ``.npy`` files and index them in a scp file.
+
+    Each assignment ``writer[key] = array`` stores the array as
+    ``<outdir>/<key>.npy`` and appends the line ``"<key> <path>"`` to the
+    scp file.
+
+    Examples:
+        key1 /some/path/a.npy
+        key2 /some/path/b.npy
+        key3 /some/path/c.npy
+        key4 /some/path/d.npy
+        ...
+
+        >>> writer = NpyScpWriter('./data/', './data/feat.scp')
+        >>> writer['aa'] = numpy_array
+        >>> writer['bb'] = numpy_array
+
+    """
+
+    def __init__(self, outdir: Union[Path, str], scpfile: Union[Path, str]):
+        assert check_argument_types()
+        self.dir = Path(outdir)
+        self.dir.mkdir(parents=True, exist_ok=True)
+
+        scp_path = Path(scpfile)
+        scp_path.parent.mkdir(parents=True, exist_ok=True)
+        self.fscp = scp_path.open("w", encoding="utf-8")
+
+        # key -> path (as str) of the .npy file written for that key
+        self.data = {}
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def __setitem__(self, key, value):
+        """Save ``value`` to ``<outdir>/<key>.npy`` and record it in the scp."""
+        assert isinstance(value, np.ndarray), type(value)
+        target = self.dir / f"{key}.npy"
+        # The key may contain sub-directories, so create them on demand.
+        target.parent.mkdir(parents=True, exist_ok=True)
+
+        location = str(target)
+        np.save(location, value)
+        self.fscp.write(f"{key} {location}\n")
+        self.data[key] = location
+
+    def get_path(self, key):
+        """Return the path of the file that was written for ``key``."""
+        return self.data[key]
+
+    def close(self):
+        """Close the underlying scp file."""
+        self.fscp.close()
+
+
+class NpyScpReader(collections.abc.Mapping):
+    """Read-only mapping from keys to numpy arrays listed in a scp file.
+
+    The scp file is parsed once at construction into ``self.data``
+    (key -> path); arrays are loaded from disk with ``np.load`` on every
+    lookup.
+
+    Examples:
+        key1 /some/path/a.npy
+        key2 /some/path/b.npy
+        key3 /some/path/c.npy
+        key4 /some/path/d.npy
+        ...
+
+        >>> reader = NpyScpReader('npy.scp')
+        >>> array = reader['key1']
+
+    """
+
+    def __init__(self, fname: Union[Path, str]):
+        assert check_argument_types()
+        self.fname = Path(fname)
+        self.data = read_2column_text(fname)
+
+    def get_path(self, key):
+        """Return the path stored for ``key`` without loading the array."""
+        return self.data[key]
+
+    def __getitem__(self, key) -> np.ndarray:
+        """Load and return the array stored for ``key``.
+
+        Raises:
+            KeyError: If ``key`` is not listed in the scp file.
+        """
+        p = self.data[key]
+        return np.load(p)
+
+    def __contains__(self, item):
+        # Membership must be answered from the parsed table.  Returning the
+        # item itself made ``key in reader`` truthy for any non-empty key,
+        # including keys that are not listed in the scp file.
+        return item in self.data
+
+    def __len__(self):
+        return len(self.data)
+
+    def __iter__(self):
+        return iter(self.data)
+
+    def keys(self):
+        return self.data.keys()
--- a/funasr_local/fileio/rand_gen_dataset.py
+++ b/funasr_local/fileio/rand_gen_dataset.py
@@ -0,0 +1,86 @@
+import collections
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+from typeguard import check_argument_types
+
+from funasr_local.fileio.read_text import load_num_sequence_text
+
+
+class FloatRandomGenerateDataset(collections.abc.Mapping):
+    """Mapping that yields random float arrays whose shapes come from a file.
+
+    The shape file is read once with ``load_num_sequence_text``; each lookup
+    draws a fresh array from ``np.random.randn`` with the listed shape and
+    casts it to the configured dtype.
+
+    Examples:
+        shape.txt
+        uttA 123,83
+        uttB 34,83
+        >>> dataset = FloatRandomGenerateDataset("shape.txt")
+        >>> array = dataset["uttA"]
+        >>> assert array.shape == (123, 83)
+        >>> array = dataset["uttB"]
+        >>> assert array.shape == (34, 83)
+
+    """
+
+    def __init__(
+        self,
+        shape_file: Union[Path, str],
+        dtype: Union[str, np.dtype] = "float32",
+        loader_type: str = "csv_int",
+    ):
+        assert check_argument_types()
+        # utterance id -> list of dimension sizes
+        self.utt2shape = load_num_sequence_text(Path(shape_file), loader_type)
+        self.dtype = np.dtype(dtype)
+
+    def __getitem__(self, item) -> np.ndarray:
+        """Return a newly sampled array with the shape registered for ``item``."""
+        dims = self.utt2shape[item]
+        sampled = np.random.randn(*dims)
+        return sampled.astype(self.dtype)
+
+    def __len__(self):
+        return len(self.utt2shape)
+
+    def __iter__(self):
+        return iter(self.utt2shape)
+
+
+class IntRandomGenerateDataset(collections.abc.Mapping):
+    """Generate random integer arrays whose shapes come from shape.txt.
+
+    The shape file is read once with ``load_num_sequence_text``; each lookup
+    calls ``np.random.randint(low, high, size=shape, dtype=dtype)``.
+
+    Args:
+        shape_file: Text file mapping each utterance id to its shape.
+        low: Forwarded to ``np.random.randint`` as ``low``.
+        high: Forwarded to ``np.random.randint`` as ``high``.  When it is
+            ``None``, numpy draws from ``[0, low)`` instead of ``[low, high)``.
+        dtype: Integer dtype of the generated arrays.
+        loader_type: Format of ``shape_file`` as understood by
+            ``load_num_sequence_text``.
+
+    Examples:
+        shape.txt
+        uttA 123,83
+        uttB 34,83
+        >>> dataset = IntRandomGenerateDataset("shape.txt", low=0, high=10)
+        >>> array = dataset["uttA"]
+        >>> assert array.shape == (123, 83)
+        >>> array = dataset["uttB"]
+        >>> assert array.shape == (34, 83)
+
+    """
+
+    def __init__(
+        self,
+        shape_file: Union[Path, str],
+        low: int,
+        # None is a legal value here, so the annotation says so explicitly.
+        high: Union[int, None] = None,
+        dtype: Union[str, np.dtype] = "int64",
+        loader_type: str = "csv_int",
+    ):
+        assert check_argument_types()
+        shape_file = Path(shape_file)
+        # utterance id -> list of dimension sizes
+        self.utt2shape = load_num_sequence_text(shape_file, loader_type)
+        self.dtype = np.dtype(dtype)
+        self.low = low
+        self.high = high
+
+    def __iter__(self):
+        return iter(self.utt2shape)
+
+    def __len__(self):
+        return len(self.utt2shape)
+
+    def __getitem__(self, item) -> np.ndarray:
+        """Return a newly sampled integer array for ``item``.
+
+        Raises:
+            KeyError: If ``item`` is not listed in the shape file.
+        """
+        shape = self.utt2shape[item]
+        return np.random.randint(self.low, self.high, size=shape, dtype=self.dtype)
--- a/funasr_local/fileio/read_text.py
+++ b/funasr_local/fileio/read_text.py
@@ -0,0 +1,81 @@
+import logging
+from pathlib import Path
+from typing import Dict
+from typing import List
+from typing import Union
+
+from typeguard import check_argument_types
+
+
+def read_2column_text(path: Union[Path, str]) -> Dict[str, str]:
+    """Read a text file having 2 column as dict object.
+
+    Each line is split on the first run of whitespace into a key and a value;
+    a line holding only a key maps to the empty string.  Blank lines are
+    skipped.
+
+    Args:
+        path: Text file to read (decoded as UTF-8).
+
+    Returns:
+        Dict mapping each key to the rest of its line, in file order.
+
+    Raises:
+        RuntimeError: If the same key appears on more than one line.
+
+    Examples:
+        wav.scp:
+            key1 /some/path/a.wav
+            key2 /some/path/b.wav
+
+        >>> read_2column_text('wav.scp')
+        {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'}
+
+    """
+    assert check_argument_types()
+
+    data = {}
+    with Path(path).open("r", encoding="utf-8") as f:
+        for linenum, line in enumerate(f, 1):
+            sps = line.rstrip().split(maxsplit=1)
+            if not sps:
+                # A blank (or whitespace-only) line has no key.  Unpacking it
+                # below used to fail with an opaque "not enough values to
+                # unpack" error, so it is ignored instead.
+                continue
+            if len(sps) == 1:
+                k, v = sps[0], ""
+            else:
+                k, v = sps
+            if k in data:
+                raise RuntimeError(f"{k} is duplicated ({path}:{linenum})")
+            data[k] = v
+    return data
+
+
+def load_num_sequence_text(
+    path: Union[Path, str], loader_type: str = "csv_int"
+) -> Dict[str, List[Union[float, int]]]:
+    """Read a text file indicating sequences of number
+
+    Args:
+        path: 2-column text file; each line is "<key> <numbers>".
+        loader_type: "text_int" / "text_float" for numbers separated by a
+            single space, "csv_int" / "csv_float" for numbers separated by
+            commas.
+
+    Returns:
+        Dict mapping each key to the list of its parsed numbers.
+
+    Raises:
+        ValueError: If ``loader_type`` is not supported, or if a field cannot
+            be parsed as a number (the path, key and value are logged first).
+
+    Examples:
+        key1 1 2 3
+        key2 34 5 6
+
+        >>> d = load_num_sequence_text('text', loader_type='text_int')
+        >>> np.testing.assert_array_equal(d["key1"], np.array([1, 2, 3]))
+    """
+    assert check_argument_types()
+    if loader_type == "text_int":
+        delimiter = " "
+        dtype = int
+    elif loader_type == "text_float":
+        delimiter = " "
+        dtype = float
+    elif loader_type == "csv_int":
+        delimiter = ","
+        dtype = int
+    elif loader_type == "csv_float":
+        delimiter = ","
+        dtype = float
+    else:
+        raise ValueError(f"Not supported loader_type={loader_type}")
+
+    # path looks like:
+    #   utta 1,0
+    #   uttb 3,4,5
+    # -> return {'utta': [1, 0],
+    #            'uttb': [3, 4, 5]}
+    d = read_2column_text(path)
+
+    # Using for-loop instead of dict-comprehension for debuggability
+    retval = {}
+    for k, v in d.items():
+        try:
+            retval[k] = [dtype(i) for i in v.split(delimiter)]
+        except (TypeError, ValueError):
+            # int()/float() raise ValueError for a malformed string, so
+            # catching TypeError alone never logged the offending entry.
+            logging.error(f'Error happened with path="{path}", id="{k}", value="{v}"')
+            raise
+    return retval
--- a/funasr_local/fileio/sound_scp.py
+++ b/funasr_local/fileio/sound_scp.py
@@ -0,0 +1,136 @@
+import collections.abc
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+import soundfile
+import librosa
+from typeguard import check_argument_types
+
+from funasr_local.fileio.read_text import read_2column_text
+
+
+class SoundScpReader(collections.abc.Mapping):
+    """Reader class for 'wav.scp'.
+
+    The scp file is parsed once at construction into ``self.data``
+    (key -> audio path); audio is loaded with ``librosa.load`` on every
+    lookup and resampled to ``dest_sample_rate``.
+
+    Examples:
+        key1 /some/path/a.wav
+        key2 /some/path/b.wav
+        key3 /some/path/c.wav
+        key4 /some/path/d.wav
+        ...
+
+        >>> reader = SoundScpReader('wav.scp')
+        >>> rate, array = reader['key1']
+
+    """
+
+    def __init__(
+        self,
+        fname,
+        dtype=np.int16,
+        always_2d: bool = False,
+        normalize: bool = False,
+        dest_sample_rate: int = 16000,
+    ):
+        assert check_argument_types()
+        self.fname = fname
+        self.dtype = dtype
+        self.always_2d = always_2d
+        self.normalize = normalize
+        self.data = read_2column_text(fname)
+        self.dest_sample_rate = dest_sample_rate
+
+    def __getitem__(self, key):
+        """Load the audio registered for ``key``.
+
+        Returns:
+            Tuple ``(rate, array)`` in that order, as returned by
+            ``librosa.load`` (which itself returns ``(array, rate)``).
+
+        Raises:
+            KeyError: If ``key`` is not listed in the scp file.
+        """
+        wav = self.data[key]
+        if self.normalize:
+            # No dtype is passed here, so librosa's default dtype applies.
+            array, rate = librosa.load(
+                wav, sr=self.dest_sample_rate, mono=not self.always_2d
+            )
+        else:
+            # NOTE(review): self.dtype (np.int16 by default) is forwarded to
+            # librosa.load; confirm that librosa produces the intended sample
+            # scaling for integer dtypes.
+            array, rate = librosa.load(
+                wav, sr=self.dest_sample_rate, mono=not self.always_2d, dtype=self.dtype
+            )
+
+        return rate, array
+
+    def get_path(self, key):
+        """Return the audio path stored for ``key`` without loading it."""
+        return self.data[key]
+
+    def __contains__(self, item):
+        # Membership must be answered from the parsed table.  Returning the
+        # item itself made ``key in reader`` truthy for any non-empty key,
+        # including keys that are not listed in the scp file.
+        return item in self.data
+
+    def __len__(self):
+        return len(self.data)
+
+    def __iter__(self):
+        return iter(self.data)
+
+    def keys(self):
+        return self.data.keys()
+
+
+class SoundScpWriter:
+    """Save audio signals as sound files and index them in a 'wav.scp' file.
+
+    Each assignment ``writer[key] = (rate, signal)`` writes the signal to
+    ``<outdir>/<key>.<format>`` with ``soundfile.write`` and appends the line
+    ``"<key> <path>"`` to the scp file.
+
+    Examples:
+        key1 /some/path/a.wav
+        key2 /some/path/b.wav
+        key3 /some/path/c.wav
+        key4 /some/path/d.wav
+        ...
+
+        >>> writer = SoundScpWriter('./data/', './data/feat.scp')
+        >>> writer['aa'] = 16000, numpy_array
+        >>> writer['bb'] = 16000, numpy_array
+
+    """
+
+    def __init__(
+        self,
+        outdir: Union[Path, str],
+        scpfile: Union[Path, str],
+        format="wav",
+        dtype=None,
+    ):
+        assert check_argument_types()
+        self.dir = Path(outdir)
+        self.dir.mkdir(parents=True, exist_ok=True)
+
+        scp_path = Path(scpfile)
+        scp_path.parent.mkdir(parents=True, exist_ok=True)
+        self.fscp = scp_path.open("w", encoding="utf-8")
+
+        # Used as the file extension of every written sound file.
+        self.format = format
+        # Stored only; no method of this class reads it.
+        self.dtype = dtype
+
+        # key -> path (as str) of the sound file written for that key
+        self.data = {}
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def __setitem__(self, key: str, value):
+        """Write ``value = (rate, signal)`` to disk and record it in the scp.
+
+        Raises:
+            RuntimeError: If ``signal`` is neither 1- nor 2-dimensional.
+        """
+        rate, signal = value
+        assert isinstance(rate, int), type(rate)
+        assert isinstance(signal, np.ndarray), type(signal)
+
+        ndim = signal.ndim
+        if ndim == 1:
+            # Add a trailing axis so the signal is always 2-dimensional.
+            signal = signal[:, None]
+        elif ndim != 2:
+            raise RuntimeError(f"Input signal must be 1 or 2 dimension: {ndim}")
+
+        target = self.dir / f"{key}.{self.format}"
+        # The key may contain sub-directories, so create them on demand.
+        target.parent.mkdir(parents=True, exist_ok=True)
+
+        location = str(target)
+        soundfile.write(location, signal, rate)
+        self.fscp.write(f"{key} {location}\n")
+        self.data[key] = location
+
+    def get_path(self, key):
+        """Return the path of the file that was written for ``key``."""
+        return self.data[key]
+
+    def close(self):
+        """Close the underlying scp file."""
+        self.fscp.close()