add files

This commit is contained in:
烨玮
2025-02-20 12:17:03 +08:00
parent a21dd4555c
commit edd008441b
667 changed files with 473123 additions and 0 deletions

View File

View File

@@ -0,0 +1,78 @@
from pathlib import Path
from typing import Union
import warnings
from typeguard import check_argument_types
from typeguard import check_return_type
class DatadirWriter:
    """Build a Kaldi-like data directory through nested item access.

    Reading an item (``writer[name]``) returns a child writer bound to
    ``<path>/<name>``. Assigning an item (``writer[key] = value``) appends a
    ``"key value"`` line to the file located at the writer's own path.
    A given writer is used either as a directory (it has children) or as a
    file (it holds an open handle); mixing both raises ``RuntimeError``.

    Examples:
        >>> with DatadirWriter("output") as writer:
        ...     # Child writer bound to output/sub.txt
        ...     # (the file itself is opened on the first assignment)
        ...     subwriter = writer["sub.txt"]
        ...     # Write "uttidA some/where/a.wav"
        ...     subwriter["uttidA"] = "some/where/a.wav"
        ...     subwriter["uttidB"] = "some/where/b.wav"

    """

    def __init__(self, p: Union[Path, str]):
        assert check_argument_types()
        self.path = Path(p)
        # Child writers keyed by their relative name.
        self.chilidren = {}
        # Lazily opened text handle; stays None while used as a directory.
        self.fd = None
        self.has_children = False
        # Ids written so far, used for duplicate / mismatch warnings.
        self.keys = set()

    def __enter__(self):
        return self

    def __getitem__(self, key: str) -> "DatadirWriter":
        """Return the child writer for ``key``, creating it on first access."""
        assert check_argument_types()

        if self.fd is not None:
            raise RuntimeError("This writer points out a file")

        try:
            child = self.chilidren[key]
        except KeyError:
            child = self.chilidren[key] = DatadirWriter(self.path / key)
            self.has_children = True

        assert check_return_type(child)
        return child

    def __setitem__(self, key: str, value: str):
        """Append the line ``"<key> <value>"`` to this writer's file."""
        assert check_argument_types()

        if self.has_children:
            raise RuntimeError("This writer points out a directory")
        if key in self.keys:
            warnings.warn(f"Duplicated: {key}")

        # Open the output lazily so that pure directory writers never
        # create a file.
        if self.fd is None:
            self.path.parent.mkdir(parents=True, exist_ok=True)
            self.fd = self.path.open("w", encoding="utf-8")

        self.keys.add(key)
        self.fd.write(f"{key} {value}\n")
        self.fd.flush()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close this writer, or every child writer when it is a directory.

        While closing children, a warning is emitted whenever two
        consecutive children were given different sets of ids.
        """
        if not self.has_children:
            if self.fd is not None:
                self.fd.close()
            return

        previous = None
        for current in self.chilidren.values():
            current.close()
            if previous is not None and previous.keys != current.keys:
                warnings.warn(
                    f"Ids are mismatching between {previous.path} and {current.path}"
                )
            previous = current

View File

@@ -0,0 +1,97 @@
import collections.abc
from pathlib import Path
from typing import Union
import numpy as np
from typeguard import check_argument_types
from funasr_local.fileio.read_text import read_2column_text
class NpyScpWriter:
    """Save numpy arrays as ``.npy`` files and list them in an scp file.

    Each assignment stores the array as ``<outdir>/<key>.npy`` and appends a
    ``"<key> <path>"`` line to the scp file.

    Examples:
        key1 /some/path/a.npy
        key2 /some/path/b.npy
        key3 /some/path/c.npy
        key4 /some/path/d.npy
        ...

        >>> writer = NpyScpWriter('./data/', './data/feat.scp')
        >>> writer['aa'] = numpy_array
        >>> writer['bb'] = numpy_array

    """

    def __init__(self, outdir: Union[Path, str], scpfile: Union[Path, str]):
        assert check_argument_types()

        # Directory receiving the .npy files.
        self.dir = Path(outdir)
        self.dir.mkdir(parents=True, exist_ok=True)

        # The scp index is opened once and kept open until close().
        scp_path = Path(scpfile)
        scp_path.parent.mkdir(parents=True, exist_ok=True)
        self.fscp = scp_path.open("w", encoding="utf-8")

        # key -> path (as str) of every array written so far.
        self.data = {}

    def get_path(self, key):
        """Return the path string recorded for ``key``."""
        return self.data[key]

    def __setitem__(self, key, value):
        """Write ``value`` to ``<outdir>/<key>.npy`` and register it."""
        assert isinstance(value, np.ndarray), type(value)

        npy_path = self.dir / f"{key}.npy"
        # The key may contain path separators, so make sure its folder exists.
        npy_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(str(npy_path), value)

        self.fscp.write(f"{key} {npy_path}\n")
        # Remember where the array went.
        self.data[key] = str(npy_path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close the scp file handle."""
        self.fscp.close()
class NpyScpReader(collections.abc.Mapping):
    """Read-only mapping from the keys of an scp file to numpy arrays.

    The scp file is parsed once at construction into a ``key -> path``
    dictionary; arrays are loaded from disk with ``np.load`` on every item
    access.

    Examples:
        key1 /some/path/a.npy
        key2 /some/path/b.npy
        key3 /some/path/c.npy
        key4 /some/path/d.npy
        ...

        >>> reader = NpyScpReader('npy.scp')
        >>> array = reader['key1']

    """

    def __init__(self, fname: Union[Path, str]):
        assert check_argument_types()
        self.fname = Path(fname)
        # key -> path string, as listed in the scp file.
        self.data = read_2column_text(fname)

    def get_path(self, key):
        """Return the path string listed for ``key`` (raises KeyError if absent)."""
        return self.data[key]

    def __getitem__(self, key) -> np.ndarray:
        """Load and return the array stored at the path listed for ``key``."""
        p = self.data[key]
        return np.load(p)

    def __contains__(self, item):
        # Membership must be decided by the scp entries. Returning ``item``
        # itself made ``key in reader`` true for every truthy key.
        return item in self.data

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        return iter(self.data)

    def keys(self):
        return self.data.keys()

View File

@@ -0,0 +1,86 @@
import collections
from pathlib import Path
from typing import Union
import numpy as np
from typeguard import check_argument_types
from funasr_local.fileio.read_text import load_num_sequence_text
class FloatRandomGenerateDataset(collections.abc.Mapping):
    """Mapping that yields random float arrays shaped as listed in a file.

    The shape file is read once with ``load_num_sequence_text``; each item
    access draws a fresh array from ``np.random.randn`` with the listed
    shape and casts it to the configured dtype.

    Examples:
        shape.txt
        uttA 123,83
        uttB 34,83
        >>> dataset = FloatRandomGenerateDataset("shape.txt")
        >>> array = dataset["uttA"]
        >>> assert array.shape == (123, 83)
        >>> array = dataset["uttB"]
        >>> assert array.shape == (34, 83)

    """

    def __init__(
        self,
        shape_file: Union[Path, str],
        dtype: Union[str, np.dtype] = "float32",
        loader_type: str = "csv_int",
    ):
        assert check_argument_types()
        # utterance id -> list of dimension sizes
        self.utt2shape = load_num_sequence_text(Path(shape_file), loader_type)
        self.dtype = np.dtype(dtype)

    def __iter__(self):
        return iter(self.utt2shape)

    def __len__(self):
        return len(self.utt2shape)

    def __getitem__(self, item) -> np.ndarray:
        """Return a newly drawn standard-normal array for utterance ``item``."""
        dims = self.utt2shape[item]
        samples = np.random.randn(*dims)
        return samples.astype(self.dtype)
class IntRandomGenerateDataset(collections.abc.Mapping):
    """Generate random integer arrays whose shapes come from shape.txt.

    The shape file is read once with ``load_num_sequence_text``; each item
    access draws a fresh array with
    ``np.random.randint(low, high, size=shape, dtype=dtype)``.

    Args:
        shape_file: Text file mapping each utterance id to its shape.
        low: Passed to ``np.random.randint`` as ``low``.
        high: Passed to ``np.random.randint`` as ``high``. When ``None``,
            numpy samples from ``[0, low)`` instead of ``[low, high)``.
        dtype: Integer dtype of the generated arrays.
        loader_type: Format selector forwarded to ``load_num_sequence_text``.

    Examples:
        shape.txt
        uttA 123,83
        uttB 34,83
        >>> dataset = IntRandomGenerateDataset("shape.txt", low=0, high=10)
        >>> array = dataset["uttA"]
        >>> assert array.shape == (123, 83)
        >>> array = dataset["uttB"]
        >>> assert array.shape == (34, 83)

    """

    def __init__(
        self,
        shape_file: Union[Path, str],
        low: int,
        # Explicitly optional: None is the documented default.
        high: Union[int, None] = None,
        dtype: Union[str, np.dtype] = "int64",
        loader_type: str = "csv_int",
    ):
        assert check_argument_types()
        shape_file = Path(shape_file)
        # utterance id -> list of dimension sizes
        self.utt2shape = load_num_sequence_text(shape_file, loader_type)
        self.dtype = np.dtype(dtype)
        self.low = low
        self.high = high

    def __iter__(self):
        return iter(self.utt2shape)

    def __len__(self):
        return len(self.utt2shape)

    def __getitem__(self, item) -> np.ndarray:
        """Return a newly drawn random integer array for utterance ``item``."""
        shape = self.utt2shape[item]
        return np.random.randint(self.low, self.high, size=shape, dtype=self.dtype)

View File

@@ -0,0 +1,81 @@
import logging
from pathlib import Path
from typing import Dict
from typing import List
from typing import Union
from typeguard import check_argument_types
def read_2column_text(path: Union[Path, str]) -> Dict[str, str]:
    """Read a text file having 2 columns as a dict object.

    Each line is split at its first run of whitespace: the first token is
    the key and the remainder of the line is the value. A line holding
    only a key maps to the empty string. Blank or whitespace-only lines are
    skipped.

    Args:
        path: Location of the text file (read as UTF-8).

    Returns:
        Dictionary mapping every key to its value string.

    Raises:
        RuntimeError: If the same key appears on more than one line.

    Examples:
        wav.scp:
            key1 /some/path/a.wav
            key2 /some/path/b.wav

        >>> read_2column_text('wav.scp')
        {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'}

    """
    assert check_argument_types()

    data = {}
    with Path(path).open("r", encoding="utf-8") as f:
        for linenum, line in enumerate(f, 1):
            sps = line.rstrip().split(maxsplit=1)
            if not sps:
                # A blank line yields no tokens; without this guard the
                # two-value unpacking below fails with an unrelated
                # "not enough values to unpack" ValueError.
                continue
            if len(sps) == 1:
                k, v = sps[0], ""
            else:
                k, v = sps
            if k in data:
                raise RuntimeError(f"{k} is duplicated ({path}:{linenum})")
            data[k] = v
    return data
def load_num_sequence_text(
    path: Union[Path, str], loader_type: str = "csv_int"
) -> Dict[str, List[Union[float, int]]]:
    """Read a text file indicating sequences of numbers.

    The file is first parsed by ``read_2column_text``; the value column of
    each line is then split on the delimiter selected by ``loader_type`` and
    every field is converted to a number.

    Args:
        path: Location of the text file.
        loader_type: One of ``"text_int"`` / ``"text_float"`` (fields
            separated by a single space) or ``"csv_int"`` / ``"csv_float"``
            (fields separated by a comma).

    Returns:
        Dictionary mapping each key to a list of ``int`` or ``float``.

    Raises:
        ValueError: If ``loader_type`` is not supported, or if a field
            cannot be converted to a number (logged before re-raising).

    Examples:
        key1 1 2 3
        key2 34 5 6

        >>> d = load_num_sequence_text('text', loader_type='text_int')
        >>> np.testing.assert_array_equal(d["key1"], np.array([1, 2, 3]))

    """
    assert check_argument_types()

    if loader_type == "text_int":
        delimiter = " "
        dtype = int
    elif loader_type == "text_float":
        delimiter = " "
        dtype = float
    elif loader_type == "csv_int":
        delimiter = ","
        dtype = int
    elif loader_type == "csv_float":
        delimiter = ","
        dtype = float
    else:
        raise ValueError(f"Not supported loader_type={loader_type}")

    # path looks like:
    #   utta 1,0
    #   uttb 3,4,5
    # -> return {'utta': [1, 0],
    #            'uttb': [3, 4, 5]}
    d = read_2column_text(path)

    # Using for-loop instead of dict-comprehension for debuggability
    retval = {}
    for k, v in d.items():
        try:
            retval[k] = [dtype(i) for i in v.split(delimiter)]
        except (TypeError, ValueError):
            # int()/float() raise ValueError on a malformed string, so it
            # must be caught here for the offending line to be reported.
            logging.error(f'Error happened with path="{path}", id="{k}", value="{v}"')
            raise
    return retval

View File

@@ -0,0 +1,136 @@
import collections.abc
from pathlib import Path
from typing import Union
import numpy as np
import soundfile
import librosa
from typeguard import check_argument_types
from funasr_local.fileio.read_text import read_2column_text
class SoundScpReader(collections.abc.Mapping):
    """Reader class for 'wav.scp'.

    The scp file is parsed once at construction into a ``key -> path``
    dictionary; audio is loaded with ``librosa.load`` on every item access,
    requesting ``dest_sample_rate`` as the sampling rate.

    Examples:
        key1 /some/path/a.wav
        key2 /some/path/b.wav
        key3 /some/path/c.wav
        key4 /some/path/d.wav
        ...

        >>> reader = SoundScpReader('wav.scp')
        >>> rate, array = reader['key1']

    """

    def __init__(
        self,
        fname,
        dtype=np.int16,
        always_2d: bool = False,
        normalize: bool = False,
        dest_sample_rate: int = 16000,
    ):
        assert check_argument_types()
        self.fname = fname
        self.dtype = dtype
        self.always_2d = always_2d
        self.normalize = normalize
        # key -> path string, as listed in the scp file.
        self.data = read_2column_text(fname)
        self.dest_sample_rate = dest_sample_rate

    def __getitem__(self, key):
        """Load the audio listed for ``key`` and return ``(rate, array)``."""
        wav = self.data[key]
        if self.normalize:
            # No dtype is passed, so librosa.load uses its own default
            # sample type.
            array, rate = librosa.load(
                wav, sr=self.dest_sample_rate, mono=not self.always_2d
            )
        else:
            # NOTE(review): the requested dtype is forwarded to librosa.load;
            # confirm that the default np.int16 yields the intended sample
            # scaling.
            array, rate = librosa.load(
                wav, sr=self.dest_sample_rate, mono=not self.always_2d, dtype=self.dtype
            )

        return rate, array

    def get_path(self, key):
        """Return the path string listed for ``key`` (raises KeyError if absent)."""
        return self.data[key]

    def __contains__(self, item):
        # Membership must be decided by the scp entries. Returning ``item``
        # itself made ``key in reader`` true for every truthy key.
        return item in self.data

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        return iter(self.data)

    def keys(self):
        return self.data.keys()
class SoundScpWriter:
    """Save audio signals as sound files and list them in an scp file.

    Each assignment takes a ``(rate, signal)`` pair, writes the signal to
    ``<outdir>/<key>.<format>`` with ``soundfile.write`` and appends a
    ``"<key> <path>"`` line to the scp file.

    Examples:
        key1 /some/path/a.wav
        key2 /some/path/b.wav
        key3 /some/path/c.wav
        key4 /some/path/d.wav
        ...

        >>> writer = SoundScpWriter('./data/', './data/feat.scp')
        >>> writer['aa'] = 16000, numpy_array
        >>> writer['bb'] = 16000, numpy_array

    """

    def __init__(
        self,
        outdir: Union[Path, str],
        scpfile: Union[Path, str],
        format="wav",
        dtype=None,
    ):
        assert check_argument_types()

        # Directory receiving the audio files.
        self.dir = Path(outdir)
        self.dir.mkdir(parents=True, exist_ok=True)

        # The scp index is opened once and kept open until close().
        scp_path = Path(scpfile)
        scp_path.parent.mkdir(parents=True, exist_ok=True)
        self.fscp = scp_path.open("w", encoding="utf-8")

        self.format = format
        self.dtype = dtype

        # key -> path (as str) of every file written so far.
        self.data = {}

    def __setitem__(self, key: str, value):
        """Write the ``(rate, signal)`` pair to disk and register it."""
        rate, signal = value
        assert isinstance(rate, int), type(rate)
        assert isinstance(signal, np.ndarray), type(signal)

        if signal.ndim == 1:
            # Add a trailing axis so the data is always two-dimensional.
            signal = signal[:, None]
        elif signal.ndim != 2:
            raise RuntimeError(f"Input signal must be 1 or 2 dimension: {signal.ndim}")

        out_path = self.dir / f"{key}.{self.format}"
        # The key may contain path separators, so make sure its folder exists.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        soundfile.write(str(out_path), signal, rate)

        self.fscp.write(f"{key} {out_path}\n")
        # Remember where the audio went.
        self.data[key] = str(out_path)

    def get_path(self, key):
        """Return the path string recorded for ``key``."""
        return self.data[key]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close the scp file handle."""
        self.fscp.close()