add files

Author: 烨玮
Date: 2025-02-20 12:17:03 +08:00
Parent: a21dd4555c
Commit: edd008441b

667 changed files with 473,123 additions and 0 deletions

@@ -0,0 +1,108 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
from pathlib import Path
from typing import Iterable
from typing import Union
import numpy as np
from funasr_local.utils.cli_utils import get_commandline_args
def aggregate_stats_dirs(
input_dir: Iterable[Union[str, Path]],
output_dir: Union[str, Path],
log_level: str,
skip_sum_stats: bool,
):
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) (levelname)s: %(message)s",
)
input_dirs = [Path(p) for p in input_dir]
output_dir = Path(output_dir)
for mode in ["train", "valid"]:
with (input_dirs[0] / mode / "batch_keys").open("r", encoding="utf-8") as f:
batch_keys = [line.strip() for line in f if line.strip() != ""]
with (input_dirs[0] / mode / "stats_keys").open("r", encoding="utf-8") as f:
stats_keys = [line.strip() for line in f if line.strip() != ""]
(output_dir / mode).mkdir(parents=True, exist_ok=True)
for key in batch_keys:
with (output_dir / mode / f"{key}_shape").open(
"w", encoding="utf-8"
) as fout:
for idir in input_dirs:
with (idir / mode / f"{key}_shape").open(
"r", encoding="utf-8"
) as fin:
# Read all lines up front so the keys can be sorted,
# because their order may change when num_workers >= 1
lines = fin.readlines()
lines = sorted(lines, key=lambda x: x.split()[0])
for line in lines:
fout.write(line)
for key in stats_keys:
if not skip_sum_stats:
sum_stats = None
for idir in input_dirs:
stats = np.load(idir / mode / f"{key}_stats.npz")
if sum_stats is None:
sum_stats = dict(**stats)
else:
for k in stats:
sum_stats[k] += stats[k]
np.savez(output_dir / mode / f"{key}_stats.npz", **sum_stats)
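# Per-shard npz arrays are summed elementwise (they act as accumulators,
# e.g. count / sum / sum-of-squares); every shard is assumed to contain
# the same set of keys.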
# if --write_collected_feats=true
p = Path(mode) / "collect_feats" / f"{key}.scp"
scp = input_dirs[0] / p
if scp.exists():
(output_dir / p).parent.mkdir(parents=True, exist_ok=True)
with (output_dir / p).open("w", encoding="utf-8") as fout:
for idir in input_dirs:
with (idir / p).open("r", encoding="utf-8") as fin:
for line in fin:
fout.write(line)
def get_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Aggregate statistics directories into one directory",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument(
"--skip_sum_stats",
default=False,
action="store_true",
help="Skip computing the sum of statistics.",
)
parser.add_argument("--input_dir", action="append", help="Input directories")
parser.add_argument("--output_dir", required=True, help="Output directory")
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
aggregate_stats_dirs(**kwargs)
if __name__ == "__main__":
main()
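
A minimal usage sketch (the directory names are hypothetical placeholders, assuming stats shards produced by a prior collect-stats run):

main([
    "--input_dir", "exp/stats.1",
    "--input_dir", "exp/stats.2",
    "--output_dir", "exp/stats_agg",
])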


@@ -0,0 +1,640 @@
#!/usr/bin/env python3
# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import argparse
import logging
import sys
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
import numpy as np
import torch
from typeguard import check_argument_types
from typeguard import check_return_type
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.modules.beam_search.batch_beam_search import BatchBeamSearch
from funasr_local.modules.beam_search.batch_beam_search_online_sim import BatchBeamSearchOnlineSim
from funasr_local.modules.beam_search.beam_search import BeamSearch
from funasr_local.modules.beam_search.beam_search import Hypothesis
from funasr_local.modules.scorers.ctc import CTCPrefixScorer
from funasr_local.modules.scorers.length_bonus import LengthBonus
from funasr_local.modules.scorers.scorer_interface import BatchScorerInterface
from funasr_local.modules.subsampling import TooShortUttError
from funasr_local.tasks.asr import ASRTask
from funasr_local.tasks.lm import LMTask
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils import asr_utils, wav_utils, postprocess_utils
from funasr_local.models.frontend.wav_frontend import WavFrontend
header_colors = '\033[95m'
end_colors = '\033[0m'
class Speech2Text:
"""Speech2Text class
Examples:
>>> import soundfile
>>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
"""
def __init__(
self,
asr_train_config: Union[Path, str] = None,
asr_model_file: Union[Path, str] = None,
cmvn_file: Union[Path, str] = None,
lm_train_config: Union[Path, str] = None,
lm_file: Union[Path, str] = None,
token_type: str = None,
bpemodel: str = None,
device: str = "cpu",
maxlenratio: float = 0.0,
minlenratio: float = 0.0,
batch_size: int = 1,
dtype: str = "float32",
beam_size: int = 20,
ctc_weight: float = 0.5,
lm_weight: float = 1.0,
ngram_weight: float = 0.9,
penalty: float = 0.0,
nbest: int = 1,
streaming: bool = False,
frontend_conf: dict = None,
**kwargs,
):
assert check_argument_types()
# 1. Build ASR model
scorers = {}
asr_model, asr_train_args = ASRTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file, device
)
frontend = None
if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
logging.info("asr_model: {}".format(asr_model))
logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
decoder = asr_model.decoder
ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
token_list = asr_model.token_list
scorers.update(
decoder=decoder,
ctc=ctc,
length_bonus=LengthBonus(len(token_list)),
)
# 2. Build Language model
if lm_train_config is not None:
lm, lm_train_args = LMTask.build_model_from_file(
lm_train_config, lm_file, device
)
scorers["lm"] = lm.lm
# 3. Build ngram model
# ngram is not supported now
ngram = None
scorers["ngram"] = ngram
# 4. Build BeamSearch object
# transducer is not supported now
beam_search_transducer = None
weights = dict(
decoder=1.0 - ctc_weight,
ctc=ctc_weight,
lm=lm_weight,
ngram=ngram_weight,
length_bonus=penalty,
)
beam_search = BeamSearch(
beam_size=beam_size,
weights=weights,
scorers=scorers,
sos=asr_model.sos,
eos=asr_model.eos,
vocab_size=len(token_list),
token_list=token_list,
pre_beam_score_key=None if ctc_weight == 1.0 else "full",
)
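# A hypothesis's running score is the weighted sum of the scorers configured
# above:
#   score = (1 - ctc_weight) * decoder + ctc_weight * ctc
#           + lm_weight * lm + ngram_weight * ngram + penalty * length_bonus
# (the ngram term is inactive here, since scorers["ngram"] is None).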
# 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
if token_type is None:
token_type = asr_train_args.token_type
if bpemodel is None:
bpemodel = asr_train_args.bpemodel
if token_type is None:
tokenizer = None
elif token_type == "bpe":
if bpemodel is not None:
tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
else:
tokenizer = None
else:
tokenizer = build_tokenizer(token_type=token_type)
converter = TokenIDConverter(token_list=token_list)
logging.info(f"Text tokenizer: {tokenizer}")
self.asr_model = asr_model
self.asr_train_args = asr_train_args
self.converter = converter
self.tokenizer = tokenizer
self.beam_search = beam_search
self.beam_search_transducer = beam_search_transducer
self.maxlenratio = maxlenratio
self.minlenratio = minlenratio
self.device = device
self.dtype = dtype
self.nbest = nbest
self.frontend = frontend
@torch.no_grad()
def __call__(
self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
) -> List[
Tuple[
Optional[str],
List[str],
List[int],
Union[Hypothesis],
]
]:
"""Inference
Args:
speech: Input speech data
Returns:
text, token, token_int, hyp
"""
assert check_argument_types()
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
self.asr_model.frontend = None
else:
feats = speech
feats_len = speech_lengths
lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
batch = {"speech": feats, "speech_lengths": feats_len}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
enc, _ = self.asr_model.encode(**batch)
if isinstance(enc, tuple):
enc = enc[0]
assert len(enc) == 1, len(enc)
# c. Pass the encoder result to the beam search
nbest_hyps = self.beam_search(
x=enc[0], maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
)
nbest_hyps = nbest_hyps[: self.nbest]
results = []
for hyp in nbest_hyps:
assert isinstance(hyp, (Hypothesis)), type(hyp)
# remove sos/eos and get results
last_pos = -1
if isinstance(hyp.yseq, list):
token_int = hyp.yseq[1:last_pos]
else:
token_int = hyp.yseq[1:last_pos].tolist()
# remove blank symbol id, which is assumed to be 0
token_int = list(filter(lambda x: x != 0, token_int))
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
if self.tokenizer is not None:
text = self.tokenizer.tokens2text(token)
else:
text = None
results.append((text, token, token_int, hyp))
assert check_return_type(results)
return results
def inference(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
**kwargs,
):
inference_pipeline = inference_modelscope(
maxlenratio=maxlenratio,
minlenratio=minlenratio,
batch_size=batch_size,
beam_size=beam_size,
ngpu=ngpu,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
penalty=penalty,
log_level=log_level,
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
raw_inputs=raw_inputs,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
key_file=key_file,
word_lm_train_config=word_lm_train_config,
bpemodel=bpemodel,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
output_dir=output_dir,
dtype=dtype,
seed=seed,
ngram_weight=ngram_weight,
nbest=nbest,
num_workers=num_workers,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
# data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2text
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
bpemodel=bpemodel,
device=device,
maxlenratio=maxlenratio,
minlenratio=minlenratio,
dtype=dtype,
beam_size=beam_size,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
ngram_weight=ngram_weight,
penalty=penalty,
nbest=nbest,
streaming=streaming,
)
logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
speech2text = Speech2Text(**speech2text_kwargs)
def _forward(data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
**kwargs,
):
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
fs=fs,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
finish_count = 0
file_count = 1
# 7. Start the for-loop
# FIXME(kamo): The output format should be discussed
asr_result_list = []
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
writer = DatadirWriter(output_path)
else:
writer = None
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
# batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
# N-best list of (text, token, token_int, hyp_object)
try:
results = speech2text(**batch)
except TooShortUttError as e:
logging.warning(f"Utterance {keys} {e}")
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
results = [[" ", ["sil"], [2], hyp]] * nbest
# Only supporting batch_size==1
key = keys[0]
for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
# Create a directory: outdir/{n}best_recog
if writer is not None:
ibest_writer = writer[f"{n}best_recog"]
# Write the result to each file
ibest_writer["token"][key] = " ".join(token)
# ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text
return asr_result_list
return _forward
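# Note: inference_modelscope builds the model once and returns the _forward
# closure, so a caller can reuse it across requests, e.g. (placeholder paths):
#   pipeline = inference_modelscope(
#       maxlenratio=0.0, minlenratio=0.0, batch_size=1, beam_size=20,
#       ngpu=0, ctc_weight=0.5, lm_weight=0.0, penalty=0.0, log_level="INFO",
#       asr_train_config="exp/asr/config.yaml",
#       asr_model_file="exp/asr/model.pb")
#   results = pipeline([("data/test/wav.scp", "speech", "sound")], None)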
def get_parser():
parser = config_argparse.ArgumentParser(
description="ASR Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global cmvn file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--word_lm_train_config",
type=str,
help="Word LM training configuration",
)
group.add_argument(
"--word_lm_file",
type=str,
help="Word LM parameter file",
)
group.add_argument(
"--ngram_file",
type=str,
help="N-gram parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=20, help="Beam size")
group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
group.add_argument(
"--maxlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain max output length. "
"If maxlenratio=0.0 (default), it uses a end-detect "
"function "
"to automatically find maximum hypothesis lengths."
"If maxlenratio<0.0, its absolute value is interpreted"
"as a constant max output length",
)
group.add_argument(
"--minlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain min output length",
)
group.add_argument(
"--ctc_weight",
type=float,
default=0.5,
help="CTC weight in joint decoding",
)
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()
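
A hypothetical command-line sketch for this decoder (every path below is a placeholder, not taken from this commit):

main([
    "--asr_train_config", "exp/asr/config.yaml",
    "--asr_model_file", "exp/asr/model.pb",
    "--data_path_and_name_and_type", "data/test/wav.scp,speech,sound",
    "--output_dir", "exp/decode",
])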


@@ -0,0 +1,345 @@
#!/usr/bin/env python3
import argparse
import logging
import os
import sys
from typing import Union, Dict, Any
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
def get_parser():
parser = config_argparse.ArgumentParser(
description="ASR Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--njob",
type=int,
default=1,
help="The number of jobs for each gpu",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=True,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--vad_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--vad_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global CMVN file",
)
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--word_lm_train_config",
type=str,
help="Word LM training configuration",
)
group.add_argument(
"--word_lm_file",
type=str,
help="Word LM parameter file",
)
group.add_argument(
"--ngram_file",
type=str,
help="N-gram parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group.add_argument(
"--beam_search_config",
default={},
help="The keyword arguments for transducer beam search.",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=5, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=20, help="Beam size")
group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
group.add_argument(
"--maxlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain max output length. "
"If maxlenratio=0.0 (default), it uses a end-detect "
"function "
"to automatically find maximum hypothesis lengths."
"If maxlenratio<0.0, its absolute value is interpreted"
"as a constant max output length",
)
group.add_argument(
"--minlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain min output length",
)
group.add_argument(
"--ctc_weight",
type=float,
default=0.0,
help="CTC weight in joint decoding",
)
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group.add_argument("--simu_streaming", type=str2bool, default=False)
group.add_argument("--chunk_size", type=int, default=16)
group.add_argument("--left_context", type=int, default=16)
group.add_argument("--right_context", type=int, default=0)
group.add_argument(
"--display_partial_hypotheses",
type=bool,
default=False,
help="Whether to display partial hypotheses during chunk-by-chunk inference.",
)
group = parser.add_argument_group("Dynamic quantization related")
group.add_argument(
"--quantize_asr_model",
type=bool,
default=False,
help="Apply dynamic quantization to ASR model.",
)
group.add_argument(
"--quantize_modules",
nargs="*",
default=None,
help="""Module names to apply dynamic quantization on.
The module names are provided as a list, where each name is separated
by a comma (e.g.: --quantize-config=[Linear,LSTM,GRU]).
Each specified name should be an attribute of 'torch.nn', e.g.:
torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...""",
)
group.add_argument(
"--quantize_dtype",
type=str,
default="qint8",
choices=["float16", "qint8"],
help="Dtype for dynamic quantization.",
)
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
group.add_argument("--token_num_relax", type=int, default=1, help="")
group.add_argument("--decoding_ind", type=int, default=0, help="")
group.add_argument("--decoding_mode", type=str, default="model1", help="")
group.add_argument(
"--ctc_weight2",
type=float,
default=0.0,
help="CTC weight in joint decoding",
)
return parser
def inference_launch(**kwargs):
if 'mode' in kwargs:
mode = kwargs['mode']
else:
logging.info("Decoding mode is not specified.")
return None
if mode == "asr":
from funasr_local.bin.asr_inference import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "uniasr":
from funasr_local.bin.asr_inference_uniasr import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "uniasr_vad":
from funasr_local.bin.asr_inference_uniasr_vad import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "paraformer":
from funasr_local.bin.asr_inference_paraformer import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "paraformer_streaming":
from funasr_local.bin.asr_inference_paraformer_streaming import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "paraformer_vad":
from funasr_local.bin.asr_inference_paraformer_vad import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "paraformer_punc":
logging.info("Unknown decoding mode: {}".format(mode))
return None
elif mode == "paraformer_vad_punc":
from funasr_local.bin.asr_inference_paraformer_vad_punc import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "vad":
from funasr_local.bin.vad_inference import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "mfcca":
from funasr_local.bin.asr_inference_mfcca import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "rnnt":
from funasr_local.bin.asr_inference_rnnt import inference_modelscope
return inference_modelscope(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
def inference_launch_funasr_local(**kwargs):
if 'mode' in kwargs:
mode = kwargs['mode']
else:
logging.info("Decoding mode is not specified.")
return None
if mode == "asr":
from funasr_local.bin.asr_inference import inference
return inference(**kwargs)
elif mode == "uniasr":
from funasr_local.bin.asr_inference_uniasr import inference
return inference(**kwargs)
elif mode == "paraformer":
from funasr_local.bin.asr_inference_paraformer import inference
return inference(**kwargs)
elif mode == "paraformer_vad_punc":
from funasr_local.bin.asr_inference_paraformer_vad_punc import inference
return inference(**kwargs)
elif mode == "vad":
from funasr_local.bin.vad_inference import inference
return inference(**kwargs)
elif mode == "mfcca":
from funasr_local.bin.asr_inference_mfcca import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "rnnt":
from funasr_local.bin.asr_inference_rnnt import inference
return inference(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
parser.add_argument(
"--mode",
type=str,
default="asr",
help="The decoding mode",
)
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
# set logging messages
logging.basicConfig(
level=args.log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("Decoding args: {}".format(kwargs))
# gpu setting
if args.ngpu > 0:
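# output_dir is expected to end in ".<jobid>" (e.g. exp/decode/output.3);
# with --njob jobs per GPU, job j is pinned to gpuid_list[(j - 1) // njob].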
jobid = int(args.output_dir.split(".")[-1])
gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
inference_launch_funasr_local(**kwargs)
if __name__ == "__main__":
main()
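
A hedged dispatch sketch (paths are placeholders; the exact keyword set each mode accepts is defined by that module's inference_modelscope):

decode_kwargs = dict(
    maxlenratio=0.0, minlenratio=0.0, batch_size=1, beam_size=20, ngpu=0,
    ctc_weight=0.0, lm_weight=0.0, penalty=0.0, log_level="INFO",
    asr_train_config="exp/paraformer/config.yaml",  # placeholder path
    asr_model_file="exp/paraformer/model.pb",       # placeholder path
)
pipeline = inference_launch(mode="paraformer", **decode_kwargs)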


@@ -0,0 +1,767 @@
#!/usr/bin/env python3
# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import argparse
import logging
import sys
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
import numpy as np
import torch
from typeguard import check_argument_types
from typeguard import check_return_type
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.modules.beam_search.batch_beam_search import BatchBeamSearch
from funasr_local.modules.beam_search.beam_search import BeamSearch
from funasr_local.modules.beam_search.beam_search import Hypothesis
from funasr_local.modules.scorers.ctc import CTCPrefixScorer
from funasr_local.modules.scorers.length_bonus import LengthBonus
from funasr_local.modules.scorers.scorer_interface import BatchScorerInterface
from funasr_local.modules.subsampling import TooShortUttError
from funasr_local.tasks.asr import ASRTaskMFCCA as ASRTask
from funasr_local.tasks.lm import LMTask
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils import asr_utils, wav_utils, postprocess_utils
global_asr_language: str = 'zh-cn'
global_sample_rate: Union[int, Dict[Any, int]] = {
'audio_fs': 16000,
'model_fs': 16000
}
class Speech2Text:
"""Speech2Text class
Examples:
>>> import soundfile
>>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
"""
def __init__(
self,
asr_train_config: Union[Path, str] = None,
asr_model_file: Union[Path, str] = None,
cmvn_file: Union[Path, str] = None,
lm_train_config: Union[Path, str] = None,
lm_file: Union[Path, str] = None,
token_type: str = None,
bpemodel: str = None,
device: str = "cpu",
maxlenratio: float = 0.0,
minlenratio: float = 0.0,
batch_size: int = 1,
dtype: str = "float32",
beam_size: int = 20,
ctc_weight: float = 0.5,
lm_weight: float = 1.0,
ngram_weight: float = 0.9,
penalty: float = 0.0,
nbest: int = 1,
streaming: bool = False,
**kwargs,
):
assert check_argument_types()
# 1. Build ASR model
scorers = {}
asr_model, asr_train_args = ASRTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file, device
)
logging.info("asr_model: {}".format(asr_model))
logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
decoder = asr_model.decoder
ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
token_list = asr_model.token_list
scorers.update(
decoder=decoder,
ctc=ctc,
length_bonus=LengthBonus(len(token_list)),
)
# 2. Build Language model
if lm_train_config is not None:
lm, lm_train_args = LMTask.build_model_from_file(
lm_train_config, lm_file, device
)
lm.to(device)
scorers["lm"] = lm.lm
# 3. Build ngram model
# ngram is not supported now
ngram = None
scorers["ngram"] = ngram
# 4. Build BeamSearch object
# transducer is not supported now
beam_search_transducer = None
weights = dict(
decoder=1.0 - ctc_weight,
ctc=ctc_weight,
lm=lm_weight,
ngram=ngram_weight,
length_bonus=penalty,
)
beam_search = BeamSearch(
beam_size=beam_size,
weights=weights,
scorers=scorers,
sos=asr_model.sos,
eos=asr_model.eos,
vocab_size=len(token_list),
token_list=token_list,
pre_beam_score_key=None if ctc_weight == 1.0 else "full",
)
#beam_search.__class__ = BatchBeamSearch
# 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
if token_type is None:
token_type = asr_train_args.token_type
if bpemodel is None:
bpemodel = asr_train_args.bpemodel
if token_type is None:
tokenizer = None
elif token_type == "bpe":
if bpemodel is not None:
tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
else:
tokenizer = None
else:
tokenizer = build_tokenizer(token_type=token_type)
converter = TokenIDConverter(token_list=token_list)
logging.info(f"Text tokenizer: {tokenizer}")
self.asr_model = asr_model
self.asr_train_args = asr_train_args
self.converter = converter
self.tokenizer = tokenizer
self.beam_search = beam_search
self.beam_search_transducer = beam_search_transducer
self.maxlenratio = maxlenratio
self.minlenratio = minlenratio
self.device = device
self.dtype = dtype
self.nbest = nbest
@torch.no_grad()
def __call__(
self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
) -> List[
Tuple[
Optional[str],
List[str],
List[int],
Union[Hypothesis],
]
]:
"""Inference
Args:
speech: Input speech data
Returns:
text, token, token_int, hyp
"""
assert check_argument_types()
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if speech.dim() == 3:
speech = torch.squeeze(speech, 2)
#speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
speech = speech.to(getattr(torch, self.dtype))
# lengths: (1,)
lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
batch = {"speech": speech, "speech_lengths": lengths}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
enc, _ = self.asr_model.encode(**batch)
assert len(enc) == 1, len(enc)
# c. Pass the encoder result to the beam search
nbest_hyps = self.beam_search(
x=enc[0], maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
)
nbest_hyps = nbest_hyps[: self.nbest]
results = []
for hyp in nbest_hyps:
assert isinstance(hyp, (Hypothesis)), type(hyp)
# remove sos/eos and get results
last_pos = -1
if isinstance(hyp.yseq, list):
token_int = hyp.yseq[1:last_pos]
else:
token_int = hyp.yseq[1:last_pos].tolist()
# remove blank symbol id, which is assumed to be 0
token_int = list(filter(lambda x: x != 0, token_int))
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
if self.tokenizer is not None:
text = self.tokenizer.tokens2text(token)
else:
text = None
results.append((text, token, token_int, hyp))
assert check_return_type(results)
return results
# def inference(
# maxlenratio: float,
# minlenratio: float,
# batch_size: int,
# beam_size: int,
# ngpu: int,
# ctc_weight: float,
# lm_weight: float,
# penalty: float,
# log_level: Union[int, str],
# data_path_and_name_and_type,
# asr_train_config: Optional[str],
# asr_model_file: Optional[str],
# cmvn_file: Optional[str] = None,
# lm_train_config: Optional[str] = None,
# lm_file: Optional[str] = None,
# token_type: Optional[str] = None,
# key_file: Optional[str] = None,
# word_lm_train_config: Optional[str] = None,
# bpemodel: Optional[str] = None,
# allow_variable_data_keys: bool = False,
# streaming: bool = False,
# output_dir: Optional[str] = None,
# dtype: str = "float32",
# seed: int = 0,
# ngram_weight: float = 0.9,
# nbest: int = 1,
# num_workers: int = 1,
# **kwargs,
# ):
# assert check_argument_types()
# if batch_size > 1:
# raise NotImplementedError("batch decoding is not implemented")
# if word_lm_train_config is not None:
# raise NotImplementedError("Word LM is not implemented")
# if ngpu > 1:
# raise NotImplementedError("only single GPU decoding is supported")
#
# logging.basicConfig(
# level=log_level,
# format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
# )
#
# if ngpu >= 1 and torch.cuda.is_available():
# device = "cuda"
# else:
# device = "cpu"
#
# # 1. Set random-seed
# set_all_random_seed(seed)
#
# # 2. Build speech2text
# speech2text_kwargs = dict(
# asr_train_config=asr_train_config,
# asr_model_file=asr_model_file,
# cmvn_file=cmvn_file,
# lm_train_config=lm_train_config,
# lm_file=lm_file,
# token_type=token_type,
# bpemodel=bpemodel,
# device=device,
# maxlenratio=maxlenratio,
# minlenratio=minlenratio,
# dtype=dtype,
# beam_size=beam_size,
# ctc_weight=ctc_weight,
# lm_weight=lm_weight,
# ngram_weight=ngram_weight,
# penalty=penalty,
# nbest=nbest,
# streaming=streaming,
# )
# logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
# speech2text = Speech2Text(**speech2text_kwargs)
#
# # 3. Build data-iterator
# loader = ASRTask.build_streaming_iterator(
# data_path_and_name_and_type,
# dtype=dtype,
# batch_size=batch_size,
# key_file=key_file,
# num_workers=num_workers,
# preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
# collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
# allow_variable_data_keys=allow_variable_data_keys,
# inference=True,
# )
#
# finish_count = 0
# file_count = 1
# # 7. Start the for-loop
# # FIXME(kamo): The output format should be discussed
# asr_result_list = []
# if output_dir is not None:
# writer = DatadirWriter(output_dir)
# else:
# writer = None
#
# for keys, batch in loader:
# assert isinstance(batch, dict), type(batch)
# assert all(isinstance(s, str) for s in keys), keys
# _bs = len(next(iter(batch.values())))
# assert len(keys) == _bs, f"{len(keys)} != {_bs}"
# #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
#
# # N-best list of (text, token, token_int, hyp_object)
# try:
# results = speech2text(**batch)
# except TooShortUttError as e:
# logging.warning(f"Utterance {keys} {e}")
# hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
# results = [[" ", ["<space>"], [2], hyp]] * nbest
#
# # Only supporting batch_size==1
# key = keys[0]
# for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
# # Create a directory: outdir/{n}best_recog
# if writer is not None:
# ibest_writer = writer[f"{n}best_recog"]
#
# # Write the result to each file
# ibest_writer["token"][key] = " ".join(token)
# ibest_writer["token_int"][key] = " ".join(map(str, token_int))
# ibest_writer["score"][key] = str(hyp.score)
#
# if text is not None:
# text_postprocessed = postprocess_utils.sentence_postprocess(token)
# item = {'key': key, 'value': text_postprocessed}
# asr_result_list.append(item)
# finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
# if writer is not None:
# ibest_writer["text"][key] = text
# return asr_result_list
def inference(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
**kwargs,
):
inference_pipeline = inference_modelscope(
maxlenratio=maxlenratio,
minlenratio=minlenratio,
batch_size=batch_size,
beam_size=beam_size,
ngpu=ngpu,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
penalty=penalty,
log_level=log_level,
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
raw_inputs=raw_inputs,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
key_file=key_file,
word_lm_train_config=word_lm_train_config,
bpemodel=bpemodel,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
output_dir=output_dir,
dtype=dtype,
seed=seed,
ngram_weight=ngram_weight,
nbest=nbest,
num_workers=num_workers,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
# data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2text
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
bpemodel=bpemodel,
device=device,
maxlenratio=maxlenratio,
minlenratio=minlenratio,
dtype=dtype,
beam_size=beam_size,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
ngram_weight=ngram_weight,
penalty=penalty,
nbest=nbest,
streaming=streaming,
)
logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
speech2text = Speech2Text(**speech2text_kwargs)
def _forward(data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
**kwargs,
):
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
fs=fs,
mc=True,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
finish_count = 0
file_count = 1
# 7. Start the for-loop
# FIXME(kamo): The output format should be discussed
asr_result_list = []
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
writer = DatadirWriter(output_path)
else:
writer = None
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
# batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
# N-best list of (text, token, token_int, hyp_object)
try:
results = speech2text(**batch)
except TooShortUttError as e:
logging.warning(f"Utterance {keys} {e}")
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
results = [[" ", ["<space>"], [2], hyp]] * nbest
# Only supporting batch_size==1
key = keys[0]
for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
# Create a directory: outdir/{n}best_recog
if writer is not None:
ibest_writer = writer[f"{n}best_recog"]
# Write the result to each file
ibest_writer["token"][key] = " ".join(token)
# ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
text_postprocessed = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text
return asr_result_list
return _forward
def get_parser():
parser = config_argparse.ArgumentParser(
description="ASR Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global cmvn file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--word_lm_train_config",
type=str,
help="Word LM training configuration",
)
group.add_argument(
"--word_lm_file",
type=str,
help="Word LM parameter file",
)
group.add_argument(
"--ngram_file",
type=str,
help="N-gram parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=20, help="Beam size")
group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
group.add_argument(
"--maxlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain max output length. "
"If maxlenratio=0.0 (default), it uses a end-detect "
"function "
"to automatically find maximum hypothesis lengths."
"If maxlenratio<0.0, its absolute value is interpreted"
"as a constant max output length",
)
group.add_argument(
"--minlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain min output length",
)
group.add_argument(
"--ctc_weight",
type=float,
default=0.5,
help="CTC weight in joint decoding",
)
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()
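
For this multi-channel (MFCCA) variant, _forward builds its loader with mc=True, so batches keep their channel axis, and __call__ squeezes a singleton third dimension before encoding. As above, the modelscope entry point builds once and decodes per call; a hedged sketch with placeholder paths:

pipeline = inference_modelscope(
    maxlenratio=0.0, minlenratio=0.0, batch_size=1, beam_size=20,
    ngpu=0, ctc_weight=0.5, lm_weight=0.0, penalty=0.0, log_level="INFO",
    asr_train_config="exp/mfcca/config.yaml",  # placeholder path
    asr_model_file="exp/mfcca/model.pb",       # placeholder path
)
results = pipeline([("data/test/wav.scp", "speech", "sound")], None)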

File diff suppressed because it is too large


@@ -0,0 +1,761 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
import time
import copy
import os
import codecs
import tempfile
import requests
import yaml
from pathlib import Path
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
from typing import Any
from typing import List
import numpy as np
import torch
import torchaudio
from typeguard import check_argument_types
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
from funasr_local.modules.beam_search.beam_search import Hypothesis
from funasr_local.modules.scorers.ctc import CTCPrefixScorer
from funasr_local.modules.scorers.length_bonus import LengthBonus
from funasr_local.modules.subsampling import TooShortUttError
from funasr_local.tasks.asr import ASRTaskParaformer as ASRTask
from funasr_local.tasks.lm import LMTask
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils import asr_utils, wav_utils, postprocess_utils
from funasr_local.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
from funasr_local.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
np.set_printoptions(threshold=np.inf)
class Speech2Text:
"""Speech2Text class
Examples:
>>> import soundfile
>>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
"""
def __init__(
self,
asr_train_config: Union[Path, str] = None,
asr_model_file: Union[Path, str] = None,
cmvn_file: Union[Path, str] = None,
lm_train_config: Union[Path, str] = None,
lm_file: Union[Path, str] = None,
token_type: str = None,
bpemodel: str = None,
device: str = "cpu",
maxlenratio: float = 0.0,
minlenratio: float = 0.0,
dtype: str = "float32",
beam_size: int = 20,
ctc_weight: float = 0.5,
lm_weight: float = 1.0,
ngram_weight: float = 0.9,
penalty: float = 0.0,
nbest: int = 1,
frontend_conf: dict = None,
hotword_list_or_file: str = None,
**kwargs,
):
assert check_argument_types()
# 1. Build ASR model
scorers = {}
asr_model, asr_train_args = ASRTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file, device
)
frontend = None
if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
frontend = WavFrontendOnline(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
logging.info("asr_model: {}".format(asr_model))
logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
if asr_model.ctc is not None:
ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
scorers.update(
ctc=ctc
)
token_list = asr_model.token_list
scorers.update(
length_bonus=LengthBonus(len(token_list)),
)
# 2. Build Language model
if lm_train_config is not None:
lm, lm_train_args = LMTask.build_model_from_file(
lm_train_config, lm_file, device
)
scorers["lm"] = lm.lm
# 3. Build ngram model
# ngram is not supported now
ngram = None
scorers["ngram"] = ngram
# 4. Build BeamSearch object
# transducer is not supported now
beam_search_transducer = None
weights = dict(
decoder=1.0 - ctc_weight,
ctc=ctc_weight,
lm=lm_weight,
ngram=ngram_weight,
length_bonus=penalty,
)
beam_search = BeamSearch(
beam_size=beam_size,
weights=weights,
scorers=scorers,
sos=asr_model.sos,
eos=asr_model.eos,
vocab_size=len(token_list),
token_list=token_list,
pre_beam_score_key=None if ctc_weight == 1.0 else "full",
)
beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
for scorer in scorers.values():
if isinstance(scorer, torch.nn.Module):
scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
logging.info(f"Decoding device={device}, dtype={dtype}")
# 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
if token_type is None:
token_type = asr_train_args.token_type
if bpemodel is None:
bpemodel = asr_train_args.bpemodel
if token_type is None:
tokenizer = None
elif token_type == "bpe":
if bpemodel is not None:
tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
else:
tokenizer = None
else:
tokenizer = build_tokenizer(token_type=token_type)
converter = TokenIDConverter(token_list=token_list)
logging.info(f"Text tokenizer: {tokenizer}")
self.asr_model = asr_model
self.asr_train_args = asr_train_args
self.converter = converter
self.tokenizer = tokenizer
# 6. [Optional] Build hotword list from str, local file or url
is_use_lm = lm_weight != 0.0 and lm_file is not None
if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
beam_search = None
self.beam_search = beam_search
logging.info(f"Beam_search: {self.beam_search}")
self.beam_search_transducer = beam_search_transducer
self.maxlenratio = maxlenratio
self.minlenratio = minlenratio
self.device = device
self.dtype = dtype
self.nbest = nbest
self.frontend = frontend
self.encoder_downsampling_factor = 1
if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
self.encoder_downsampling_factor = 4
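# The conv2d / data2vec input layers subsample frames by a factor of 4;
# infer() multiplies encoder output lengths by this factor to recover
# input frame counts (see enc_len_batch_total).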
@torch.no_grad()
def __call__(
self, cache: dict, speech: Union[torch.Tensor], speech_lengths: Union[torch.Tensor] = None
):
"""Inference
Args:
speech: Input speech data
Returns:
text, token, token_int, hyp
"""
assert check_argument_types()
results = []
cache_en = cache["encoder"]
if speech.shape[1] < 16 * 60 and cache_en["is_final"]:
if cache_en["start_idx"] == 0:
return []
cache_en["tail_chunk"] = True
feats = cache_en["feats"]
feats_len = torch.tensor([feats.shape[1]])
self.asr_model.frontend = None
results = self.infer(feats, feats_len, cache)
return results
else:
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths, cache_en["is_final"])
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
self.asr_model.frontend = None
else:
feats = speech
feats_len = speech_lengths
if feats.shape[1] != 0:
if cache_en["is_final"]:
if feats.shape[1] + cache_en["chunk_size"][2] < cache_en["chunk_size"][1]:
cache_en["last_chunk"] = True
else:
# first chunk
feats_chunk1 = feats[:, :cache_en["chunk_size"][1], :]
feats_len = torch.tensor([feats_chunk1.shape[1]])
results_chunk1 = self.infer(feats_chunk1, feats_len, cache)
# last chunk
cache_en["last_chunk"] = True
feats_chunk2 = feats[:, -(feats.shape[1] + cache_en["chunk_size"][2] - cache_en["chunk_size"][1]):, :]
feats_len = torch.tensor([feats_chunk2.shape[1]])
results_chunk2 = self.infer(feats_chunk2, feats_len, cache)
return ["".join(results_chunk1 + results_chunk2)]
results = self.infer(feats, feats_len, cache)
return results
@torch.no_grad()
def infer(self, feats: torch.Tensor, feats_len: torch.Tensor, cache: dict = None):
batch = {"speech": feats, "speech_lengths": feats_len}
batch = to_device(batch, device=self.device)
# b. Forward Encoder
enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache=cache)
if isinstance(enc, tuple):
enc = enc[0]
# assert len(enc) == 1, len(enc)
enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
predictor_outs = self.asr_model.calc_predictor_chunk(enc, cache)
pre_acoustic_embeds, pre_token_length = predictor_outs[0], predictor_outs[1]
if torch.max(pre_token_length) < 1:
return []
decoder_outs = self.asr_model.cal_decoder_with_predictor_chunk(enc, pre_acoustic_embeds, cache)
decoder_out = decoder_outs
results = []
b, n, d = decoder_out.size()
for i in range(b):
x = enc[i, :enc_len[i], :]
am_scores = decoder_out[i, :pre_token_length[i], :]
if self.beam_search is not None:
nbest_hyps = self.beam_search(
x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
)
nbest_hyps = nbest_hyps[: self.nbest]
else:
yseq = am_scores.argmax(dim=-1)
score = am_scores.max(dim=-1)[0]
score = torch.sum(score, dim=-1)
# pad with mask tokens to ensure compatibility with sos/eos tokens
yseq = torch.tensor(
[self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
)
nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
for hyp in nbest_hyps:
assert isinstance(hyp, (Hypothesis)), type(hyp)
# remove sos/eos and get results
last_pos = -1
if isinstance(hyp.yseq, list):
token_int = hyp.yseq[1:last_pos]
else:
token_int = hyp.yseq[1:last_pos].tolist()
# remove symbol ids 0 (blank) and 2
token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
if self.tokenizer is not None:
text = self.tokenizer.tokens2text(token)
else:
text = None
results.append(text)
# assert check_return_type(results)
return results
def inference(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
**kwargs,
):
inference_pipeline = inference_modelscope(
maxlenratio=maxlenratio,
minlenratio=minlenratio,
batch_size=batch_size,
beam_size=beam_size,
ngpu=ngpu,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
penalty=penalty,
log_level=log_level,
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
raw_inputs=raw_inputs,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
key_file=key_file,
word_lm_train_config=word_lm_train_config,
bpemodel=bpemodel,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
output_dir=output_dir,
dtype=dtype,
seed=seed,
ngram_weight=ngram_weight,
nbest=nbest,
num_workers=num_workers,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
# data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
output_dir: Optional[str] = None,
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
export_mode = False
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
batch_size = 1
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2text
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
bpemodel=bpemodel,
device=device,
maxlenratio=maxlenratio,
minlenratio=minlenratio,
dtype=dtype,
beam_size=beam_size,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
ngram_weight=ngram_weight,
penalty=penalty,
nbest=nbest,
)
speech2text = Speech2Text(**speech2text_kwargs)
def _load_bytes(input):
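# Interpret the raw bytes as 16-bit signed PCM and rescale to float32 in
# [-1.0, 1.0), which is what the acoustic frontend expects.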
middle_data = np.frombuffer(input, dtype=np.int16)
middle_data = np.asarray(middle_data)
if middle_data.dtype.kind not in 'iu':
raise TypeError("'middle_data' must be an array of integers")
dtype = np.dtype('float32')
if dtype.kind != 'f':
raise TypeError("'dtype' must be a floating point type")
i = np.iinfo(middle_data.dtype)
abs_max = 2 ** (i.bits - 1)
offset = i.min + abs_max
array = ((middle_data.astype(dtype) - offset) / abs_max).astype(np.float32)
return array
def _read_yaml(yaml_path: Union[str, Path]) -> Dict:
if not Path(yaml_path).exists():
raise FileNotFoundError(f'The {yaml_path} does not exist.')
with open(str(yaml_path), 'rb') as f:
data = yaml.load(f, Loader=yaml.Loader)
return data
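# The streaming cache carries per-session state between calls: the "encoder"
# entry holds the CIF accumulators (cif_hidden, cif_alphas) and a rolling
# feature buffer of chunk_size[0] + chunk_size[2] frames of context, while the
# "decoder" entry caches the FSMN memory of already emitted tokens.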
def _prepare_cache(cache: dict = None, chunk_size=[5, 10, 5], batch_size=1):
# avoid a mutable default argument, which would be shared across calls
if cache is None:
cache = {}
if len(cache) > 0:
return cache
config = _read_yaml(asr_train_config)
enc_output_size = config["encoder_conf"]["output_size"]
feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
"cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
"feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
cache["encoder"] = cache_en
cache_de = {"decode_fsmn": None}
cache["decoder"] = cache_de
return cache
def _cache_reset(cache: dict = None, chunk_size=[5, 10, 5], batch_size=1):
# avoid a mutable default argument, which would be shared across calls
if cache is None:
cache = {}
if len(cache) > 0:
config = _read_yaml(asr_train_config)
enc_output_size = config["encoder_conf"]["output_size"]
feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
"cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
"feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
cache["encoder"] = cache_en
cache_de = {"decode_fsmn": None}
cache["decoder"] = cache_de
return cache
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
**kwargs,
):
# 3. Build data-iterator
if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
raw_inputs = _load_bytes(data_path_and_name_and_type[0])
raw_inputs = torch.tensor(raw_inputs)
if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0]
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, np.ndarray):
raw_inputs = torch.tensor(raw_inputs)
is_final = False
cache = {}
chunk_size = [5, 10, 5]
if param_dict is not None and "cache" in param_dict:
cache = param_dict["cache"]
if param_dict is not None and "is_final" in param_dict:
is_final = param_dict["is_final"]
if param_dict is not None and "chunk_size" in param_dict:
chunk_size = param_dict["chunk_size"]
# 7. Start for-loop
# FIXME(kamo): The output format should be discussed
raw_inputs = torch.unsqueeze(raw_inputs, dim=0)
asr_result_list = []
cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
item = {}
if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
sample_offset = 0
speech_length = raw_inputs.shape[1]
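# At 16 kHz each low-frame-rate frame spans 60 ms (960 samples), so a chunk
# of chunk_size[1] frames corresponds to chunk_size[1] * 960 waveform samples.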
stride_size = chunk_size[1] * 960
cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
final_result = ""
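# Note: range() fixes its step when the loop starts; shrinking stride_size
# inside the body only narrows the final slice and flags it as final.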
for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
if sample_offset + stride_size >= speech_length - 1:
stride_size = speech_length - sample_offset
cache["encoder"]["is_final"] = True
else:
cache["encoder"]["is_final"] = False
input_lens = torch.tensor([stride_size])
asr_result = speech2text(cache, raw_inputs[:, sample_offset: sample_offset + stride_size], input_lens)
if len(asr_result) != 0:
final_result += asr_result[0]
item = {'key': "utt", 'value': [final_result]}
else:
input_lens = torch.tensor([raw_inputs.shape[1]])
cache["encoder"]["is_final"] = is_final
asr_result = speech2text(cache, raw_inputs, input_lens)
item = {'key': "utt", 'value': asr_result}
asr_result_list.append(item)
if is_final:
cache = _cache_reset(cache, chunk_size=chunk_size, batch_size=1)
return asr_result_list
return _forward
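# A minimal streaming usage sketch (model paths and chunking are hypothetical;
# CTC and LM are disabled so decoding falls back to the greedy path):
#
#     pipeline = inference_modelscope(
#         maxlenratio=0.0, minlenratio=0.0, batch_size=1, beam_size=1, ngpu=0,
#         ctc_weight=0.0, lm_weight=0.0, penalty=0.0, log_level="INFO",
#         asr_train_config="config.yaml", asr_model_file="model.pb",
#         cmvn_file="am.mvn")
#     param_dict = {"cache": {}, "is_final": False, "chunk_size": [5, 10, 5]}
#     for i, chunk in enumerate(chunks):  # chunks: list of np.ndarray waveforms
#         param_dict["is_final"] = i == len(chunks) - 1
#         results = pipeline(None, raw_inputs=chunk, param_dict=param_dict)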
def get_parser():
parser = config_argparse.ArgumentParser(
description="ASR Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
parser.add_argument(
"--hotword",
type=str_or_none,
default=None,
help="hotword file path or hotwords seperated by space"
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global cmvn file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--word_lm_train_config",
type=str,
help="Word LM training configuration",
)
group.add_argument(
"--word_lm_file",
type=str,
help="Word LM parameter file",
)
group.add_argument(
"--ngram_file",
type=str,
help="N-gram parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=20, help="Beam size")
group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
group.add_argument(
"--maxlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain max output length. "
"If maxlenratio=0.0 (default), it uses a end-detect "
"function "
"to automatically find maximum hypothesis lengths."
"If maxlenratio<0.0, its absolute value is interpreted"
"as a constant max output length",
)
group.add_argument(
"--minlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain min output length",
)
group.add_argument(
"--ctc_weight",
type=float,
default=0.5,
help="CTC weight in joint decoding",
)
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group.add_argument(
"--frontend_conf",
default=None,
help="",
)
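# note: argparse's type=list turns a string into a list of its characters;
# raw_inputs is intended to be passed programmatically, not via the CLI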
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
param_dict = {'hotword': args.hotword}
kwargs = vars(args)
kwargs.pop("config", None)
kwargs['param_dict'] = param_dict
inference(**kwargs)
if __name__ == "__main__":
main()
# from modelscope.pipelines import pipeline
# from modelscope.utils.constant import Tasks
#
# inference_16k_pipline = pipeline(
# task=Tasks.auto_speech_recognition,
# model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
#
# rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
# print(rec_result)

View File

@@ -0,0 +1,549 @@
#!/usr/bin/env python3
import json
import argparse
import logging
import sys
import time
from pathlib import Path
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
from typing import Any
from typing import List
import math
import numpy as np
import torch
from typeguard import check_argument_types
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
from funasr_local.modules.beam_search.beam_search import Hypothesis
from funasr_local.modules.scorers.ctc import CTCPrefixScorer
from funasr_local.modules.scorers.length_bonus import LengthBonus
from funasr_local.modules.subsampling import TooShortUttError
from funasr_local.tasks.asr import ASRTaskParaformer as ASRTask
from funasr_local.tasks.lm import LMTask
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils import asr_utils, wav_utils, postprocess_utils
from funasr_local.models.frontend.wav_frontend import WavFrontend
from funasr_local.tasks.vad import VADTask
from funasr_local.bin.punctuation_infer import Text2Punc
from funasr_local.bin.asr_inference_paraformer_vad_punc import Speech2Text
from funasr_local.bin.asr_inference_paraformer_vad_punc import Speech2VadSegment
def inference(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
vad_infer_config: Optional[str] = None,
vad_model_file: Optional[str] = None,
vad_cmvn_file: Optional[str] = None,
time_stamp_writer: bool = False,
punc_infer_config: Optional[str] = None,
punc_model_file: Optional[str] = None,
**kwargs,
):
inference_pipeline = inference_modelscope(
maxlenratio=maxlenratio,
minlenratio=minlenratio,
batch_size=batch_size,
beam_size=beam_size,
ngpu=ngpu,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
penalty=penalty,
log_level=log_level,
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
raw_inputs=raw_inputs,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
key_file=key_file,
word_lm_train_config=word_lm_train_config,
bpemodel=bpemodel,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
output_dir=output_dir,
dtype=dtype,
seed=seed,
ngram_weight=ngram_weight,
nbest=nbest,
num_workers=num_workers,
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
time_stamp_writer=time_stamp_writer,
punc_infer_config=punc_infer_config,
punc_model_file=punc_model_file,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
# data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
vad_infer_config: Optional[str] = None,
vad_model_file: Optional[str] = None,
vad_cmvn_file: Optional[str] = None,
time_stamp_writer: bool = True,
punc_infer_config: Optional[str] = None,
punc_model_file: Optional[str] = None,
outputs_dict: Optional[bool] = True,
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if param_dict is not None:
hotword_list_or_file = param_dict.get('hotword')
else:
hotword_list_or_file = None
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2vadsegment
speech2vadsegment_kwargs = dict(
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
device=device,
dtype=dtype,
)
# logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
# 3. Build speech2text
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
bpemodel=bpemodel,
device=device,
maxlenratio=maxlenratio,
minlenratio=minlenratio,
dtype=dtype,
beam_size=beam_size,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
ngram_weight=ngram_weight,
penalty=penalty,
nbest=nbest,
hotword_list_or_file=hotword_list_or_file,
)
speech2text = Speech2Text(**speech2text_kwargs)
text2punc = None
if punc_model_file is not None:
text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype)
if output_dir is not None:
writer = DatadirWriter(output_dir)
ibest_writer = writer["1best_recog"]
ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list)
def _forward(data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
**kwargs,
):
hotword_list_or_file = None
if param_dict is not None:
hotword_list_or_file = param_dict.get('hotword')
if 'hotword' in kwargs:
hotword_list_or_file = kwargs['hotword']
if speech2text.hotword_list is None:
speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
fs=fs,
batch_size=1,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
if param_dict is not None:
use_timestamp = param_dict.get('use_timestamp', True)
else:
use_timestamp = True
finish_count = 0
file_count = 1
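# lfr_factor reflects the low-frame-rate frontend: with the default lfr_n = 6,
# one encoder output frame covers 60 ms of audio.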
lfr_factor = 6
# 7. Start for-loop
asr_result_list = []
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
writer = None
if output_path is not None:
writer = DatadirWriter(output_path)
ibest_writer = writer["1best_recog"]
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
vad_results = speech2vadsegment(**batch)
fbanks, vadsegments = vad_results[0], vad_results[1]
for i, segments in enumerate(vadsegments):
result_segments = [["", [], [], ]]
for j, segment_idx in enumerate(segments):
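# VAD segment boundaries are in milliseconds; fbank frames are 10 ms apart,
# so dividing by 10 converts times to frame indices.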
bed_idx, end_idx = int(segment_idx[0] / 10), int(segment_idx[1] / 10)
segment = fbanks[:, bed_idx:end_idx, :].to(device)
speech_lengths = torch.Tensor([end_idx - bed_idx]).int().to(device)
batch = {"speech": segment, "speech_lengths": speech_lengths, "begin_time": vadsegments[i][j][0],
"end_time": vadsegments[i][j][1]}
results = speech2text(**batch)
if len(results) < 1:
continue
result_cur = [results[0][:-2]]
if j == 0:
result_segments = result_cur
else:
result_segments = [[result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]]
key = keys[0]
result = result_segments[0]
text, token, token_int = result[0], result[1], result[2]
time_stamp = None if len(result) < 4 else result[3]
if use_timestamp and time_stamp is not None:
postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
else:
postprocessed_result = postprocess_utils.sentence_postprocess(token)
text_postprocessed = ""
time_stamp_postprocessed = ""
text_postprocessed_punc = postprocessed_result
if len(postprocessed_result) == 3:
text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
postprocessed_result[1], \
postprocessed_result[2]
else:
text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
text_postprocessed_punc = text_postprocessed
if len(word_lists) > 0 and text2punc is not None:
text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
item = {'key': key, 'value': text_postprocessed_punc}
if text_postprocessed != "":
item['text_postprocessed'] = text_postprocessed
if time_stamp_postprocessed != "":
item['time_stamp'] = time_stamp_postprocessed
asr_result_list.append(item)
finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
if writer is not None:
# Write the result to each file
ibest_writer["token"][key] = " ".join(token)
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["vad"][key] = "{}".format(vadsegments)
ibest_writer["text"][key] = " ".join(word_lists)
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
if time_stamp_postprocessed is not None:
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc))
return asr_result_list
return _forward
def get_parser():
parser = config_argparse.ArgumentParser(
description="ASR Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global cmvn file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--word_lm_train_config",
type=str,
help="Word LM training configuration",
)
group.add_argument(
"--word_lm_file",
type=str,
help="Word LM parameter file",
)
group.add_argument(
"--ngram_file",
type=str,
help="N-gram parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=20, help="Beam size")
group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
group.add_argument(
"--maxlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain max output length. "
"If maxlenratio=0.0 (default), it uses a end-detect "
"function "
"to automatically find maximum hypothesis lengths."
"If maxlenratio<0.0, its absolute value is interpreted"
"as a constant max output length",
)
group.add_argument(
"--minlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain min output length",
)
group.add_argument(
"--ctc_weight",
type=float,
default=0.5,
help="CTC weight in joint decoding",
)
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group.add_argument("--time_stamp_writer", type=str2bool, default=False)
group.add_argument(
"--frontend_conf",
default=None,
help="",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
group.add_argument(
"--vad_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--vad_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--vad_cmvn_file",
type=str,
help="vad, Global cmvn file",
)
group.add_argument(
"--punc_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--punc_model_file",
type=str,
help="VAD model parameter file",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,881 @@
#!/usr/bin/env python3
import json
import argparse
import logging
import sys
import time
import os
import codecs
import tempfile
import requests
from pathlib import Path
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
from typing import Any
from typing import List
import math
import copy
import numpy as np
import torch
from typeguard import check_argument_types
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
from funasr_local.modules.beam_search.beam_search import Hypothesis
from funasr_local.modules.scorers.ctc import CTCPrefixScorer
from funasr_local.modules.scorers.length_bonus import LengthBonus
from funasr_local.modules.subsampling import TooShortUttError
from funasr_local.tasks.asr import ASRTaskParaformer as ASRTask
from funasr_local.tasks.lm import LMTask
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils import asr_utils, wav_utils, postprocess_utils
from funasr_local.models.frontend.wav_frontend import WavFrontend
from funasr_local.tasks.vad import VADTask
from funasr_local.bin.vad_inference import Speech2VadSegment
from funasr_local.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard
from funasr_local.bin.punctuation_infer import Text2Punc
from funasr_local.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
header_colors = '\033[95m'
end_colors = '\033[0m'
class Speech2Text:
"""Speech2Text class
Examples:
>>> import soundfile
>>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
"""
def __init__(
self,
asr_train_config: Union[Path, str] = None,
asr_model_file: Union[Path, str] = None,
cmvn_file: Union[Path, str] = None,
lm_train_config: Union[Path, str] = None,
lm_file: Union[Path, str] = None,
token_type: str = None,
bpemodel: str = None,
device: str = "cpu",
maxlenratio: float = 0.0,
minlenratio: float = 0.0,
dtype: str = "float32",
beam_size: int = 20,
ctc_weight: float = 0.5,
lm_weight: float = 1.0,
ngram_weight: float = 0.9,
penalty: float = 0.0,
nbest: int = 1,
frontend_conf: dict = None,
hotword_list_or_file: str = None,
**kwargs,
):
assert check_argument_types()
# 1. Build ASR model
scorers = {}
asr_model, asr_train_args = ASRTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file=cmvn_file, device=device
)
frontend = None
if asr_model.frontend is not None and asr_train_args.frontend_conf is not None:
frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
# logging.info("asr_model: {}".format(asr_model))
# logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
if asr_model.ctc is not None:
ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
scorers.update(
ctc=ctc
)
token_list = asr_model.token_list
scorers.update(
length_bonus=LengthBonus(len(token_list)),
)
# 2. Build Language model
if lm_train_config is not None:
lm, lm_train_args = LMTask.build_model_from_file(
lm_train_config, lm_file, device
)
scorers["lm"] = lm.lm
# 3. Build ngram model
# ngram is not supported yet
ngram = None
scorers["ngram"] = ngram
# 4. Build BeamSearch object
# transducer is not supported yet
beam_search_transducer = None
weights = dict(
decoder=1.0 - ctc_weight,
ctc=ctc_weight,
lm=lm_weight,
ngram=ngram_weight,
length_bonus=penalty,
)
beam_search = BeamSearch(
beam_size=beam_size,
weights=weights,
scorers=scorers,
sos=asr_model.sos,
eos=asr_model.eos,
vocab_size=len(token_list),
token_list=token_list,
pre_beam_score_key=None if ctc_weight == 1.0 else "full",
)
beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
for scorer in scorers.values():
if isinstance(scorer, torch.nn.Module):
scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
logging.info(f"Decoding device={device}, dtype={dtype}")
# 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
if token_type is None:
token_type = asr_train_args.token_type
if bpemodel is None:
bpemodel = asr_train_args.bpemodel
if token_type is None:
tokenizer = None
elif token_type == "bpe":
if bpemodel is not None:
tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
else:
tokenizer = None
else:
tokenizer = build_tokenizer(token_type=token_type)
converter = TokenIDConverter(token_list=token_list)
logging.info(f"Text tokenizer: {tokenizer}")
self.asr_model = asr_model
self.asr_train_args = asr_train_args
self.converter = converter
self.tokenizer = tokenizer
# 6. [Optional] Build hotword list from str, local file or url
self.hotword_list = None
self.hotword_list = self.generate_hotwords_list(hotword_list_or_file)
is_use_lm = lm_weight != 0.0 and lm_file is not None
if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
beam_search = None
self.beam_search = beam_search
logging.info(f"Beam_search: {self.beam_search}")
self.beam_search_transducer = beam_search_transducer
self.maxlenratio = maxlenratio
self.minlenratio = minlenratio
self.device = device
self.dtype = dtype
self.nbest = nbest
self.frontend = frontend
self.encoder_downsampling_factor = 1
if asr_train_args.encoder_conf["input_layer"] == "conv2d":
self.encoder_downsampling_factor = 4
@torch.no_grad()
def __call__(
self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
begin_time: int = 0, end_time: int = None,
):
"""Inference
Args:
speech: Input speech data
Returns:
text, token, token_int, hyp
"""
assert check_argument_types()
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if self.frontend is not None:
# feats, feats_len = self.frontend.forward(speech, speech_lengths)
# fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths)
feats, feats_len = self.frontend.forward_lfr_cmvn(speech, speech_lengths)
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
self.asr_model.frontend = None
else:
feats = speech
feats_len = speech_lengths
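# Features are lfr_m stacked 80-dim fbanks, so the last feature dimension
# divided by 80 recovers lfr_m; lfr_factor = lfr_m - 1 (6 for the default
# 7-frame stacking).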
lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
batch = {"speech": feats, "speech_lengths": feats_len}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
enc, enc_len = self.asr_model.encode(**batch)
if isinstance(enc, tuple):
enc = enc[0]
# assert len(enc) == 1, len(enc)
enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
predictor_outs[2], predictor_outs[3]
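# The CIF predictor emits a fractional token count per utterance; round it
# to an integer length before decoding.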
pre_token_length = pre_token_length.round().long()
if torch.max(pre_token_length) < 1:
return []
if not isinstance(self.asr_model, ContextualParaformer):
if self.hotword_list:
logging.warning("Hotword is given but asr model is not a ContextualParaformer.")
decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
else:
decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list)
decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
if isinstance(self.asr_model, BiCifParaformer):
# upsampled alphas and peaks are used later for timestamp prediction
_, _, us_alphas, us_peaks = self.asr_model.calc_predictor_timestamp(enc, enc_len,
pre_token_length)
results = []
b, n, d = decoder_out.size()
for i in range(b):
x = enc[i, :enc_len[i], :]
am_scores = decoder_out[i, :pre_token_length[i], :]
if self.beam_search is not None:
nbest_hyps = self.beam_search(
x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
)
nbest_hyps = nbest_hyps[: self.nbest]
else:
yseq = am_scores.argmax(dim=-1)
score = am_scores.max(dim=-1)[0]
score = torch.sum(score, dim=-1)
# pad with mask tokens to ensure compatibility with sos/eos tokens
yseq = torch.tensor(
[self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
)
nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
for hyp in nbest_hyps:
assert isinstance(hyp, (Hypothesis)), type(hyp)
# remove sos/eos and get results
last_pos = -1
if isinstance(hyp.yseq, list):
token_int = hyp.yseq[1:last_pos]
else:
token_int = hyp.yseq[1:last_pos].tolist()
# remove symbol ids 0 (blank) and 2
token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
if len(token_int) == 0:
continue
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
if self.tokenizer is not None:
text = self.tokenizer.tokens2text(token)
else:
text = None
if isinstance(self.asr_model, BiCifParaformer):
_, timestamp = ts_prediction_lfr6_standard(us_alphas[i],
us_peaks[i],
copy.copy(token),
vad_offset=begin_time)
results.append((text, token, token_int, timestamp, enc_len_batch_total, lfr_factor))
else:
results.append((text, token, token_int, enc_len_batch_total, lfr_factor))
# assert check_return_type(results)
return results
def generate_hotwords_list(self, hotword_list_or_file):
# for None
if hotword_list_or_file is None:
hotword_list = None
# for local txt inputs
elif os.path.exists(hotword_list_or_file) and hotword_list_or_file.endswith('.txt'):
logging.info("Attempting to parse hotwords from local txt...")
hotword_list = []
hotword_str_list = []
with codecs.open(hotword_list_or_file, 'r') as fin:
for line in fin.readlines():
hw = line.strip()
hotword_str_list.append(hw)
hotword_list.append(self.converter.tokens2ids([i for i in hw]))
hotword_list.append([self.asr_model.sos])
hotword_str_list.append('<s>')
logging.info("Initialized hotword list from file: {}, hotword list: {}."
.format(hotword_list_or_file, hotword_str_list))
# for url, download and generate txt
elif hotword_list_or_file.startswith('http'):
logging.info("Attempting to parse hotwords from url...")
# mkdtemp keeps the directory alive (TemporaryDirectory().name may be
# removed as soon as the object is garbage-collected)
work_dir = tempfile.mkdtemp()
text_file_path = os.path.join(work_dir, os.path.basename(hotword_list_or_file))
local_file = requests.get(hotword_list_or_file)
with open(text_file_path, "wb") as fout:
fout.write(local_file.content)
hotword_list_or_file = text_file_path
hotword_list = []
hotword_str_list = []
with codecs.open(hotword_list_or_file, 'r') as fin:
for line in fin.readlines():
hw = line.strip()
hotword_str_list.append(hw)
hotword_list.append(self.converter.tokens2ids([i for i in hw]))
hotword_list.append([self.asr_model.sos])
hotword_str_list.append('<s>')
logging.info("Initialized hotword list from file: {}, hotword list: {}."
.format(hotword_list_or_file, hotword_str_list))
# for text str input
elif not hotword_list_or_file.endswith('.txt'):
logging.info("Attempting to parse hotwords as str...")
hotword_list = []
hotword_str_list = []
for hw in hotword_list_or_file.strip().split():
hotword_str_list.append(hw)
hotword_list.append(self.converter.tokens2ids([i for i in hw]))
hotword_list.append([self.asr_model.sos])
hotword_str_list.append('<s>')
logging.info("Hotword list: {}.".format(hotword_str_list))
else:
hotword_list = None
return hotword_list
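# Hotwords may be supplied as a space-separated string, a local .txt file with
# one hotword per line, or an http(s) URL to such a file; e.g. (hypothetical):
#   speech2text.generate_hotwords_list("阿里巴巴 达摩院")
#   speech2text.generate_hotwords_list("/path/to/hotwords.txt")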
def inference(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
vad_infer_config: Optional[str] = None,
vad_model_file: Optional[str] = None,
vad_cmvn_file: Optional[str] = None,
time_stamp_writer: bool = False,
punc_infer_config: Optional[str] = None,
punc_model_file: Optional[str] = None,
**kwargs,
):
inference_pipeline = inference_modelscope(
maxlenratio=maxlenratio,
minlenratio=minlenratio,
batch_size=batch_size,
beam_size=beam_size,
ngpu=ngpu,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
penalty=penalty,
log_level=log_level,
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
raw_inputs=raw_inputs,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
key_file=key_file,
word_lm_train_config=word_lm_train_config,
bpemodel=bpemodel,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
output_dir=output_dir,
dtype=dtype,
seed=seed,
ngram_weight=ngram_weight,
nbest=nbest,
num_workers=num_workers,
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
time_stamp_writer=time_stamp_writer,
punc_infer_config=punc_infer_config,
punc_model_file=punc_model_file,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
# data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
vad_infer_config: Optional[str] = None,
vad_model_file: Optional[str] = None,
vad_cmvn_file: Optional[str] = None,
time_stamp_writer: bool = True,
punc_infer_config: Optional[str] = None,
punc_model_file: Optional[str] = None,
outputs_dict: Optional[bool] = True,
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if param_dict is not None:
hotword_list_or_file = param_dict.get('hotword')
else:
hotword_list_or_file = None
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2vadsegment
speech2vadsegment_kwargs = dict(
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
device=device,
dtype=dtype,
)
# logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
# 3. Build speech2text
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
bpemodel=bpemodel,
device=device,
maxlenratio=maxlenratio,
minlenratio=minlenratio,
dtype=dtype,
beam_size=beam_size,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
ngram_weight=ngram_weight,
penalty=penalty,
nbest=nbest,
hotword_list_or_file=hotword_list_or_file,
)
speech2text = Speech2Text(**speech2text_kwargs)
text2punc = None
if punc_model_file is not None:
text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype)
if output_dir is not None:
writer = DatadirWriter(output_dir)
ibest_writer = writer["1best_recog"]
ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list)
def _forward(data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
**kwargs,
):
hotword_list_or_file = None
if param_dict is not None:
hotword_list_or_file = param_dict.get('hotword')
if 'hotword' in kwargs:
hotword_list_or_file = kwargs['hotword']
if speech2text.hotword_list is None:
speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
fs=fs,
batch_size=1,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
if param_dict is not None:
use_timestamp = param_dict.get('use_timestamp', True)
else:
use_timestamp = True
finish_count = 0
file_count = 1
lfr_factor = 6
# 7. Start for-loop
asr_result_list = []
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
writer = None
if output_path is not None:
writer = DatadirWriter(output_path)
ibest_writer = writer["1best_recog"]
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
vad_results = speech2vadsegment(**batch)
fbanks, vadsegments = vad_results[0], vad_results[1]
for i, segments in enumerate(vadsegments):
result_segments = [["", [], [], []]]
for j, segment_idx in enumerate(segments):
bed_idx, end_idx = int(segment_idx[0] / 10), int(segment_idx[1] / 10)
segment = fbanks[:, bed_idx:end_idx, :].to(device)
speech_lengths = torch.Tensor([end_idx - bed_idx]).int().to(device)
batch = {"speech": segment, "speech_lengths": speech_lengths, "begin_time": vadsegments[i][j][0],
"end_time": vadsegments[i][j][1]}
results = speech2text(**batch)
if len(results) < 1:
continue
result_cur = [results[0][:-2]]
if j == 0:
result_segments = result_cur
else:
result_segments = [
[result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]]
key = keys[0]
result = result_segments[0]
text, token, token_int = result[0], result[1], result[2]
time_stamp = None if len(result) < 4 else result[3]
if use_timestamp and time_stamp is not None:
postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
else:
postprocessed_result = postprocess_utils.sentence_postprocess(token)
text_postprocessed = ""
time_stamp_postprocessed = ""
text_postprocessed_punc = postprocessed_result
if len(postprocessed_result) == 3:
text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
postprocessed_result[1], \
postprocessed_result[2]
else:
text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
text_postprocessed_punc = text_postprocessed
punc_id_list = []
if len(word_lists) > 0 and text2punc is not None:
text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
item = {'key': key, 'value': text_postprocessed_punc}
if text_postprocessed != "":
item['text_postprocessed'] = text_postprocessed
if time_stamp_postprocessed != "":
item['time_stamp'] = time_stamp_postprocessed
item['sentences'] = time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed)
asr_result_list.append(item)
finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
if writer is not None:
# Write the result to each file
ibest_writer["token"][key] = " ".join(token)
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["vad"][key] = "{}".format(vadsegments)
ibest_writer["text"][key] = " ".join(word_lists)
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
if time_stamp_postprocessed is not None:
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc))
return asr_result_list
return _forward
def get_parser():
parser = config_argparse.ArgumentParser(
description="ASR Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global cmvn file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--word_lm_train_config",
type=str,
help="Word LM training configuration",
)
group.add_argument(
"--word_lm_file",
type=str,
help="Word LM parameter file",
)
group.add_argument(
"--ngram_file",
type=str,
help="N-gram parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=20, help="Beam size")
group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
group.add_argument(
"--maxlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain max output length. "
"If maxlenratio=0.0 (default), it uses a end-detect "
"function "
"to automatically find maximum hypothesis lengths."
"If maxlenratio<0.0, its absolute value is interpreted"
"as a constant max output length",
)
group.add_argument(
"--minlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain min output length",
)
group.add_argument(
"--ctc_weight",
type=float,
default=0.5,
help="CTC weight in joint decoding",
)
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group.add_argument("--time_stamp_writer", type=str2bool, default=False)
group.add_argument(
"--frontend_conf",
default=None,
help="",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
group.add_argument(
"--vad_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--vad_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--vad_cmvn_file",
type=str,
help="vad, Global cmvn file",
)
group.add_argument(
"--punc_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--punc_model_file",
type=str,
help="VAD model parameter file",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,737 @@
#!/usr/bin/env python3
""" Inference class definition for Transducer models."""
from __future__ import annotations
import argparse
import logging
import math
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
from packaging.version import parse as V
from typeguard import check_argument_types, check_return_type
from funasr_local.modules.beam_search.beam_search_transducer import (
BeamSearchTransducer,
Hypothesis,
)
from funasr_local.modules.nets_utils import TooShortUttError
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.tasks.asr import ASRTransducerTask
from funasr_local.tasks.lm import LMTask
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.types import str2bool, str2triple_str, str_or_none
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.models.frontend.wav_frontend import WavFrontend
class Speech2Text:
"""Speech2Text class for Transducer models.
Args:
asr_train_config: ASR model training config path.
asr_model_file: ASR model path.
beam_search_config: Beam search config path.
lm_train_config: Language Model training config path.
lm_file: Language Model path.
token_type: Type of token units.
bpemodel: BPE model path.
device: Device to use for inference.
beam_size: Size of beam during search.
dtype: Data type.
lm_weight: Language model weight.
quantize_asr_model: Whether to apply dynamic quantization to ASR model.
quantize_modules: List of module names to apply dynamic quantization on.
quantize_dtype: Dynamic quantization data type.
nbest: Number of final hypotheses.
streaming: Whether to perform chunk-by-chunk inference.
chunk_size: Number of frames in chunk AFTER subsampling.
left_context: Number of frames in left context AFTER subsampling.
right_context: Number of frames in right context AFTER subsampling.
display_partial_hypotheses: Whether to display partial hypotheses.
"""
def __init__(
self,
asr_train_config: Union[Path, str] = None,
asr_model_file: Union[Path, str] = None,
cmvn_file: Union[Path, str] = None,
beam_search_config: Dict[str, Any] = None,
lm_train_config: Union[Path, str] = None,
lm_file: Union[Path, str] = None,
token_type: str = None,
bpemodel: str = None,
device: str = "cpu",
beam_size: int = 5,
dtype: str = "float32",
lm_weight: float = 1.0,
quantize_asr_model: bool = False,
quantize_modules: List[str] = None,
quantize_dtype: str = "qint8",
nbest: int = 1,
streaming: bool = False,
simu_streaming: bool = False,
chunk_size: int = 16,
left_context: int = 32,
right_context: int = 0,
display_partial_hypotheses: bool = False,
) -> None:
"""Construct a Speech2Text object."""
super().__init__()
assert check_argument_types()
asr_model, asr_train_args = ASRTransducerTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file, device
)
frontend = None
if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
if quantize_asr_model:
if quantize_modules is not None:
if not all([q in ["LSTM", "Linear"] for q in quantize_modules]):
raise ValueError(
"Only 'Linear' and 'LSTM' modules are currently supported"
" by PyTorch and in --quantize_modules"
)
q_config = set([getattr(torch.nn, q) for q in quantize_modules])
else:
q_config = {torch.nn.Linear}
if quantize_dtype == "float16" and (V(torch.__version__) < V("1.5.0")):
raise ValueError(
"float16 dtype for dynamic quantization is not supported with torch"
" version < 1.5.0. Switching to qint8 dtype instead."
)
q_dtype = getattr(torch, quantize_dtype)
asr_model = torch.quantization.quantize_dynamic(
asr_model, q_config, dtype=q_dtype
).eval()
else:
asr_model.to(dtype=getattr(torch, dtype)).eval()
if lm_train_config is not None:
lm, lm_train_args = LMTask.build_model_from_file(
lm_train_config, lm_file, device
)
lm_scorer = lm.lm
else:
lm_scorer = None
# 4. Build BeamSearch object
if beam_search_config is None:
beam_search_config = {}
beam_search = BeamSearchTransducer(
asr_model.decoder,
asr_model.joint_network,
beam_size,
lm=lm_scorer,
lm_weight=lm_weight,
nbest=nbest,
**beam_search_config,
)
token_list = asr_model.token_list
if token_type is None:
token_type = asr_train_args.token_type
if bpemodel is None:
bpemodel = asr_train_args.bpemodel
if token_type is None:
tokenizer = None
elif token_type == "bpe":
if bpemodel is not None:
tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
else:
tokenizer = None
else:
tokenizer = build_tokenizer(token_type=token_type)
converter = TokenIDConverter(token_list=token_list)
logging.info(f"Text tokenizer: {tokenizer}")
self.asr_model = asr_model
self.asr_train_args = asr_train_args
self.device = device
self.dtype = dtype
self.nbest = nbest
self.converter = converter
self.tokenizer = tokenizer
self.beam_search = beam_search
self.streaming = streaming
self.simu_streaming = simu_streaming
self.chunk_size = max(chunk_size, 0)
self.left_context = left_context
self.right_context = max(right_context, 0)
if not streaming or chunk_size == 0:
self.streaming = False
self.asr_model.encoder.dynamic_chunk_training = False
if not simu_streaming or chunk_size == 0:
self.simu_streaming = False
self.asr_model.encoder.dynamic_chunk_training = False
self.frontend = frontend
self.window_size = self.chunk_size + self.right_context
self._ctx = self.asr_model.encoder.get_encoder_input_size(
self.window_size
)
#self.last_chunk_length = (
# self.asr_model.encoder.embed.min_frame_length + self.right_context + 1
#) * self.hop_length
self.last_chunk_length = (
self.asr_model.encoder.embed.min_frame_length + self.right_context + 1
)
self.reset_inference_cache()
def reset_inference_cache(self) -> None:
"""Reset Speech2Text parameters."""
self.frontend_cache = None
self.asr_model.encoder.reset_streaming_cache(
self.left_context, device=self.device
)
self.beam_search.reset_inference_cache()
self.num_processed_frames = torch.tensor([[0]], device=self.device)
@torch.no_grad()
def streaming_decode(
self,
speech: Union[torch.Tensor, np.ndarray],
is_final: bool = True,
) -> List[Hypothesis]:
"""Speech2Text streaming call.
Args:
speech: Chunk of speech data. (S)
is_final: Whether speech corresponds to the final chunk of data.
Returns:
nbest_hypothesis: N-best hypothesis.
"""
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if is_final:
if self.streaming and speech.size(0) < self.last_chunk_length:
pad = torch.zeros(
self.last_chunk_length - speech.size(0), speech.size(1), dtype=speech.dtype
)
speech = torch.cat([speech, pad], dim=0)
# feats, feats_length = self.apply_frontend(speech, is_final=is_final)
feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
if self.asr_model.normalize is not None:
feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)
feats = to_device(feats, device=self.device)
feats_lengths = to_device(feats_lengths, device=self.device)
enc_out = self.asr_model.encoder.chunk_forward(
feats,
feats_lengths,
self.num_processed_frames,
chunk_size=self.chunk_size,
left_context=self.left_context,
right_context=self.right_context,
)
nbest_hyps = self.beam_search(enc_out[0], is_final=is_final)
self.num_processed_frames += self.chunk_size
if is_final:
self.reset_inference_cache()
return nbest_hyps
@torch.no_grad()
def simu_streaming_decode(self, speech: Union[torch.Tensor, np.ndarray]) -> List[Hypothesis]:
"""Speech2Text call.
Args:
speech: Speech data. (S)
Returns:
nbest_hypothesis: N-best hypothesis.
"""
assert check_argument_types()
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
if self.asr_model.normalize is not None:
feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)
feats = to_device(feats, device=self.device)
feats_lengths = to_device(feats_lengths, device=self.device)
enc_out = self.asr_model.encoder.simu_chunk_forward(feats, feats_lengths, self.chunk_size, self.left_context, self.right_context)
nbest_hyps = self.beam_search(enc_out[0])
return nbest_hyps
@torch.no_grad()
def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> List[Hypothesis]:
"""Speech2Text call.
Args:
speech: Speech data. (S)
Returns:
nbest_hypothesis: N-best hypothesis.
"""
assert check_argument_types()
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
feats = to_device(feats, device=self.device)
feats_lengths = to_device(feats_lengths, device=self.device)
enc_out, _ = self.asr_model.encoder(feats, feats_lengths)
nbest_hyps = self.beam_search(enc_out[0])
return nbest_hyps
def hypotheses_to_results(self, nbest_hyps: List[Hypothesis]) -> List[Any]:
"""Build partial or final results from the hypotheses.
Args:
nbest_hyps: N-best hypothesis.
Returns:
results: Results containing different representation for the hypothesis.
"""
results = []
for hyp in nbest_hyps:
token_int = list(filter(lambda x: x != 0, hyp.yseq))
token = self.converter.ids2tokens(token_int)
if self.tokenizer is not None:
text = self.tokenizer.tokens2text(token)
else:
text = None
results.append((text, token, token_int, hyp))
assert check_return_type(results)
return results
@staticmethod
def from_pretrained(
model_tag: Optional[str] = None,
**kwargs: Optional[Any],
) -> Speech2Text:
"""Build Speech2Text instance from the pretrained model.
Args:
model_tag: Model tag of the pretrained models.
Return:
: Speech2Text instance.
"""
if model_tag is not None:
try:
from espnet_model_zoo.downloader import ModelDownloader
except ImportError:
logging.error(
"`espnet_model_zoo` is not installed. "
"Please install via `pip install -U espnet_model_zoo`."
)
raise
d = ModelDownloader()
kwargs.update(**d.download_and_unpack(model_tag))
return Speech2Text(**kwargs)
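# Usage sketch (the model tag is a hypothetical placeholder): with model_tag set,
# from_pretrained downloads and unpacks the archive via espnet_model_zoo and
# forwards the extracted paths to __init__:
#   speech2text = Speech2Text.from_pretrained(model_tag="owner/rnnt_model_tag")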
def inference(
output_dir: str,
batch_size: int,
dtype: str,
beam_size: int,
ngpu: int,
seed: int,
lm_weight: float,
nbest: int,
num_workers: int,
log_level: Union[int, str],
data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
asr_train_config: Optional[str],
asr_model_file: Optional[str],
cmvn_file: Optional[str],
beam_search_config: Optional[dict],
lm_train_config: Optional[str],
lm_file: Optional[str],
model_tag: Optional[str],
token_type: Optional[str],
bpemodel: Optional[str],
key_file: Optional[str],
allow_variable_data_keys: bool,
quantize_asr_model: Optional[bool],
quantize_modules: Optional[List[str]],
quantize_dtype: Optional[str],
streaming: Optional[bool],
simu_streaming: Optional[bool],
chunk_size: Optional[int],
left_context: Optional[int],
right_context: Optional[int],
display_partial_hypotheses: bool,
**kwargs,
) -> None:
"""Transducer model inference.
Args:
output_dir: Output directory path.
batch_size: Batch decoding size.
dtype: Data type.
beam_size: Beam size.
ngpu: Number of GPUs.
seed: Random number generator seed.
lm_weight: Weight of language model.
nbest: Number of final hypotheses.
num_workers: Number of workers.
log_level: Level of verbose for logs.
data_path_and_name_and_type:
asr_train_config: ASR model training config path.
asr_model_file: ASR model path.
beam_search_config: Beam search config path.
lm_train_config: Language Model training config path.
lm_file: Language Model path.
model_tag: Model tag.
token_type: Type of token units.
bpemodel: BPE model path.
key_file: File key.
allow_variable_data_keys: Whether to allow variable data keys.
quantize_asr_model: Whether to apply dynamic quantization to ASR model.
quantize_modules: List of module names to apply dynamic quantization on.
quantize_dtype: Dynamic quantization data type.
streaming: Whether to perform chunk-by-chunk inference.
chunk_size: Number of frames in chunk AFTER subsampling.
left_context: Number of frames in left context AFTER subsampling.
right_context: Number of frames in right context AFTER subsampling.
display_partial_hypotheses: Whether to display partial hypotheses.
"""
assert check_argument_types()
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1:
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2text
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
beam_search_config=beam_search_config,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
bpemodel=bpemodel,
device=device,
dtype=dtype,
beam_size=beam_size,
lm_weight=lm_weight,
nbest=nbest,
quantize_asr_model=quantize_asr_model,
quantize_modules=quantize_modules,
quantize_dtype=quantize_dtype,
streaming=streaming,
simu_streaming=simu_streaming,
chunk_size=chunk_size,
left_context=left_context,
right_context=right_context,
)
speech2text = Speech2Text.from_pretrained(
model_tag=model_tag,
**speech2text_kwargs,
)
# 3. Build data-iterator
loader = ASRTransducerTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=ASRTransducerTask.build_preprocess_fn(
speech2text.asr_train_args, False
),
collate_fn=ASRTransducerTask.build_collate_fn(
speech2text.asr_train_args, False
),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
# 4. Start for-loop
with DatadirWriter(output_dir) as writer:
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
assert len(batch.keys()) == 1
try:
if speech2text.streaming:
speech = batch["speech"]
_steps = len(speech) // speech2text._ctx
_end = 0
for i in range(_steps):
_end = (i + 1) * speech2text._ctx
speech2text.streaming_decode(
speech[i * speech2text._ctx : _end], is_final=False
)
final_hyps = speech2text.streaming_decode(
speech[_end : len(speech)], is_final=True
)
elif speech2text.simu_streaming:
final_hyps = speech2text.simu_streaming_decode(**batch)
else:
final_hyps = speech2text(**batch)
results = speech2text.hypotheses_to_results(final_hyps)
except TooShortUttError as e:
logging.warning(f"Utterance {keys} {e}")
hyp = Hypothesis(score=0.0, yseq=[], dec_state=None)
results = [[" ", ["<space>"], [2], hyp]] * nbest
key = keys[0]
for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
ibest_writer = writer[f"{n}best_recog"]
ibest_writer["token"][key] = " ".join(token)
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
ibest_writer["text"][key] = text
def get_parser():
"""Get Transducer model inference parser."""
parser = config_argparse.ArgumentParser(
description="ASR Transducer Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=True,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global cmvn file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=5, help="Beam size")
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument(
"--beam_search_config",
default={},
help="The keyword arguments for transducer beam search.",
)
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
group = parser.add_argument_group("Dynamic quantization related")
parser.add_argument(
"--quantize_asr_model",
type=bool,
default=False,
help="Apply dynamic quantization to ASR model.",
)
parser.add_argument(
"--quantize_modules",
nargs="*",
default=None,
help="""Module names to apply dynamic quantization on.
The module names are provided as a list, where each name is separated
by a comma (e.g.: --quantize-config=[Linear,LSTM,GRU]).
Each specified name should be an attribute of 'torch.nn', e.g.:
torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...""",
)
parser.add_argument(
"--quantize_dtype",
type=str,
default="qint8",
choices=["float16", "qint8"],
help="Dtype for dynamic quantization.",
)
group = parser.add_argument_group("Streaming related")
parser.add_argument(
"--streaming",
type=bool,
default=False,
help="Whether to perform chunk-by-chunk inference.",
)
parser.add_argument(
"--simu_streaming",
type=bool,
default=False,
help="Whether to simulate chunk-by-chunk inference.",
)
parser.add_argument(
"--chunk_size",
type=int,
default=16,
help="Number of frames in chunk AFTER subsampling.",
)
parser.add_argument(
"--left_context",
type=int,
default=32,
help="Number of frames in left context of the chunk AFTER subsampling.",
)
parser.add_argument(
"--right_context",
type=int,
default=0,
help="Number of frames in right context of the chunk AFTER subsampling.",
)
parser.add_argument(
"--display_partial_hypotheses",
type=bool,
default=False,
help="Whether to display partial hypotheses during chunk-by-chunk inference.",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()
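# Streaming usage sketch (paths are hypothetical; mirrors the chunked loop in
# inference() above, with feats a (time, dim) feature array):
#   s2t = Speech2Text(
#       asr_train_config="exp/rnnt/config.yaml",
#       asr_model_file="exp/rnnt/model.pb",
#       cmvn_file="exp/rnnt/am.mvn",
#       streaming=True, chunk_size=16, left_context=32, right_context=0,
#   )
#   _end = 0
#   for i in range(len(feats) // s2t._ctx):
#       _end = (i + 1) * s2t._ctx
#       s2t.streaming_decode(feats[i * s2t._ctx:_end], is_final=False)
#   hyps = s2t.streaming_decode(feats[_end:], is_final=True)
#   text, token, token_int, hyp = s2t.hypotheses_to_results(hyps)[0]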

View File

@@ -0,0 +1,694 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
from pathlib import Path
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
from typing import Any
import numpy as np
import torch
from typeguard import check_argument_types
from typeguard import check_return_type
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.modules.beam_search.beam_search import BeamSearchScama as BeamSearch
from funasr_local.modules.beam_search.beam_search import Hypothesis
from funasr_local.modules.scorers.ctc import CTCPrefixScorer
from funasr_local.modules.scorers.length_bonus import LengthBonus
from funasr_local.modules.subsampling import TooShortUttError
from funasr_local.tasks.asr import ASRTaskUniASR as ASRTask
from funasr_local.tasks.lm import LMTask
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils import asr_utils, wav_utils, postprocess_utils
from funasr_local.models.frontend.wav_frontend import WavFrontend
class Speech2Text:
"""Speech2Text class
Examples:
>>> import soundfile
>>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
"""
def __init__(
self,
asr_train_config: Union[Path, str] = None,
asr_model_file: Union[Path, str] = None,
cmvn_file: Union[Path, str] = None,
lm_train_config: Union[Path, str] = None,
lm_file: Union[Path, str] = None,
token_type: str = None,
bpemodel: str = None,
device: str = "cpu",
maxlenratio: float = 0.0,
minlenratio: float = 0.0,
dtype: str = "float32",
beam_size: int = 20,
ctc_weight: float = 0.5,
lm_weight: float = 1.0,
ngram_weight: float = 0.9,
penalty: float = 0.0,
nbest: int = 1,
token_num_relax: int = 1,
decoding_ind: int = 0,
decoding_mode: str = "model1",
frontend_conf: dict = None,
**kwargs,
):
assert check_argument_types()
# 1. Build ASR model
scorers = {}
asr_model, asr_train_args = ASRTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file, device
)
frontend = None
if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
if decoding_mode == "model1":
decoder = asr_model.decoder
else:
decoder = asr_model.decoder2
if asr_model.ctc is not None:
ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
scorers.update(
ctc=ctc
)
token_list = asr_model.token_list
scorers.update(
decoder=decoder,
length_bonus=LengthBonus(len(token_list)),
)
# 2. Build Language model
if lm_train_config is not None:
lm, lm_train_args = LMTask.build_model_from_file(
lm_train_config, lm_file, device
)
scorers["lm"] = lm.lm
# 3. Build ngram model
# ngram is not supported now
ngram = None
scorers["ngram"] = ngram
# 4. Build BeamSearch object
# transducer is not supported now
beam_search_transducer = None
weights = dict(
decoder=1.0 - ctc_weight,
ctc=ctc_weight,
lm=lm_weight,
ngram=ngram_weight,
length_bonus=penalty,
)
beam_search = BeamSearch(
beam_size=beam_size,
weights=weights,
scorers=scorers,
sos=asr_model.sos,
eos=asr_model.eos,
vocab_size=len(token_list),
token_list=token_list,
pre_beam_score_key=None if ctc_weight == 1.0 else "full",
)
beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
for scorer in scorers.values():
if isinstance(scorer, torch.nn.Module):
scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
# logging.info(f"Beam_search: {beam_search}")
logging.info(f"Decoding device={device}, dtype={dtype}")
# 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
if token_type is None:
token_type = asr_train_args.token_type
if bpemodel is None:
bpemodel = asr_train_args.bpemodel
if token_type is None:
tokenizer = None
elif token_type == "bpe":
if bpemodel is not None:
tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
else:
tokenizer = None
else:
tokenizer = build_tokenizer(token_type=token_type)
converter = TokenIDConverter(token_list=token_list)
logging.info(f"Text tokenizer: {tokenizer}")
self.asr_model = asr_model
self.asr_train_args = asr_train_args
self.converter = converter
self.tokenizer = tokenizer
self.beam_search = beam_search
self.beam_search_transducer = beam_search_transducer
self.maxlenratio = maxlenratio
self.minlenratio = minlenratio
self.device = device
self.dtype = dtype
self.nbest = nbest
self.token_num_relax = token_num_relax
self.decoding_ind = decoding_ind
self.decoding_mode = decoding_mode
self.frontend = frontend
@torch.no_grad()
def __call__(
self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
) -> List[
Tuple[
Optional[str],
List[str],
List[int],
Union[Hypothesis],
]
]:
"""Inference
Args:
speech: Input speech data
Returns:
text, token, token_int, hyp
"""
assert check_argument_types()
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
self.asr_model.frontend = None
else:
feats = speech
feats_len = speech_lengths
lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
feats_raw = feats.clone().to(self.device)
batch = {"speech": feats, "speech_lengths": feats_len}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
_, enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
if isinstance(enc, tuple):
enc = enc[0]
assert len(enc) == 1, len(enc)
if self.decoding_mode == "model1":
predictor_outs = self.asr_model.calc_predictor_mask(enc, enc_len)
else:
enc, enc_len = self.asr_model.encode2(enc, enc_len, feats_raw, feats_len, ind=self.decoding_ind)
predictor_outs = self.asr_model.calc_predictor_mask2(enc, enc_len)
scama_mask = predictor_outs[4]
pre_token_length = predictor_outs[1]
pre_acoustic_embeds = predictor_outs[0]
maxlen = pre_token_length.sum().item() + self.token_num_relax
minlen = max(0, pre_token_length.sum().item() - self.token_num_relax)
# c. Pass the encoder result to the beam search
nbest_hyps = self.beam_search(
x=enc[0], scama_mask=scama_mask, pre_acoustic_embeds=pre_acoustic_embeds, maxlenratio=self.maxlenratio,
minlenratio=self.minlenratio, maxlen=int(maxlen), minlen=int(minlen),
)
nbest_hyps = nbest_hyps[: self.nbest]
results = []
for hyp in nbest_hyps:
assert isinstance(hyp, (Hypothesis)), type(hyp)
# remove sos/eos and get results
last_pos = -1
if isinstance(hyp.yseq, list):
token_int = hyp.yseq[1:last_pos]
else:
token_int = hyp.yseq[1:last_pos].tolist()
# remove blank symbol id, which is assumed to be 0
token_int = list(filter(lambda x: x != 0, token_int))
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
token = list(filter(lambda x: x != "<gbg>", token))
if self.tokenizer is not None:
text = self.tokenizer.tokens2text(token)
else:
text = None
results.append((text, token, token_int, hyp))
assert check_return_type(results)
return results
def inference(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
ngram_file: Optional[str] = None,
cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
token_num_relax: int = 1,
decoding_ind: int = 0,
decoding_mode: str = "model1",
**kwargs,
):
inference_pipeline = inference_modelscope(
maxlenratio=maxlenratio,
minlenratio=minlenratio,
batch_size=batch_size,
beam_size=beam_size,
ngpu=ngpu,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
penalty=penalty,
log_level=log_level,
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
raw_inputs=raw_inputs,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
key_file=key_file,
word_lm_train_config=word_lm_train_config,
bpemodel=bpemodel,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
output_dir=output_dir,
dtype=dtype,
seed=seed,
ngram_weight=ngram_weight,
ngram_file=ngram_file,
nbest=nbest,
num_workers=num_workers,
token_num_relax=token_num_relax,
decoding_ind=decoding_ind,
decoding_mode=decoding_mode,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
# data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
ngram_file: Optional[str] = None,
cmvn_file: Optional[str] = None,
# raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
token_num_relax: int = 1,
decoding_ind: int = 0,
decoding_mode: str = "model1",
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
if param_dict is not None and "decoding_model" in param_dict:
if param_dict["decoding_model"] == "fast":
decoding_ind = 0
decoding_mode = "model1"
elif param_dict["decoding_model"] == "normal":
decoding_ind = 0
decoding_mode = "model2"
elif param_dict["decoding_model"] == "offline":
decoding_ind = 1
decoding_mode = "model2"
else:
raise NotImplementedError("unsupported decoding model {}".format(param_dict["decoding_model"]))
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2text
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
lm_train_config=lm_train_config,
lm_file=lm_file,
ngram_file=ngram_file,
token_type=token_type,
bpemodel=bpemodel,
device=device,
maxlenratio=maxlenratio,
minlenratio=minlenratio,
dtype=dtype,
beam_size=beam_size,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
ngram_weight=ngram_weight,
penalty=penalty,
nbest=nbest,
streaming=streaming,
token_num_relax=token_num_relax,
decoding_ind=decoding_ind,
decoding_mode=decoding_mode,
)
speech2text = Speech2Text(**speech2text_kwargs)
def _forward(data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
**kwargs,
):
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
fs=fs,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
finish_count = 0
file_count = 1
# 7. Start for-loop
# FIXME(kamo): The output format should be discussed
asr_result_list = []
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
writer = DatadirWriter(output_path)
else:
writer = None
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
#batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
# N-best list of (text, token, token_int, hyp_object)
try:
results = speech2text(**batch)
except TooShortUttError as e:
logging.warning(f"Utterance {keys} {e}")
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
results = [[" ", ["sil"], [2], hyp]] * nbest
# Only supporting batch_size==1
key = keys[0]
logging.info(f"Utterance: {key}")
for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
# Create a directory: outdir/{n}best_recog
if writer is not None:
ibest_writer = writer[f"{n}best_recog"]
# Write the result to each file
ibest_writer["token"][key] = " ".join(token)
# ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = " ".join(word_lists)
return asr_result_list
return _forward
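# The returned callable accepts either an scp-style data list or raw waveforms
# and yields items shaped as built in _forward above:
#   [{"key": "<utterance_id>", "value": "<postprocessed text>"}, ...]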
def get_parser():
parser = config_argparse.ArgumentParser(
description="ASR Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global cmvn file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--word_lm_train_config",
type=str,
help="Word LM training configuration",
)
group.add_argument(
"--word_lm_file",
type=str,
help="Word LM parameter file",
)
group.add_argument(
"--ngram_file",
type=str,
help="N-gram parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=20, help="Beam size")
group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
group.add_argument(
"--maxlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain max output length. "
"If maxlenratio=0.0 (default), it uses a end-detect "
"function "
"to automatically find maximum hypothesis lengths."
"If maxlenratio<0.0, its absolute value is interpreted"
"as a constant max output length",
)
group.add_argument(
"--minlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain min output length",
)
group.add_argument(
"--ctc_weight",
type=float,
default=0.5,
help="CTC weight in joint decoding",
)
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
group.add_argument("--token_num_relax", type=int, default=1, help="")
group.add_argument("--decoding_ind", type=int, default=0, help="")
group.add_argument("--decoding_mode", type=str, default="model1", help="")
group.add_argument(
"--ctc_weight2",
type=float,
default=0.0,
help="CTC weight in joint decoding",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()
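# Pipeline sketch (paths and the scp data type are hypothetical placeholders;
# the "decoding_model" values map to the decoding_ind/decoding_mode pairs
# handled in inference_modelscope above):
#   pipeline = inference_modelscope(
#       maxlenratio=0.0, minlenratio=0.0, batch_size=1, beam_size=20, ngpu=0,
#       ctc_weight=0.5, lm_weight=1.0, penalty=0.0, log_level="INFO",
#       asr_train_config="exp/uniasr/config.yaml",
#       asr_model_file="exp/uniasr/model.pb",
#       cmvn_file="exp/uniasr/am.mvn",
#       param_dict={"decoding_model": "fast"},  # "fast" | "normal" | "offline"
#   )
#   results = pipeline([("data/test/wav.scp", "speech", "sound")], None)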

View File

@@ -0,0 +1,695 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
from pathlib import Path
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
from typing import Any
import numpy as np
import torch
from typeguard import check_argument_types
from typeguard import check_return_type
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.modules.beam_search.beam_search import BeamSearchScama as BeamSearch
from funasr_local.modules.beam_search.beam_search import Hypothesis
from funasr_local.modules.scorers.ctc import CTCPrefixScorer
from funasr_local.modules.scorers.length_bonus import LengthBonus
from funasr_local.modules.subsampling import TooShortUttError
from funasr_local.tasks.asr import ASRTaskUniASR as ASRTask
from funasr_local.tasks.lm import LMTask
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils import asr_utils, wav_utils, postprocess_utils
from funasr_local.models.frontend.wav_frontend import WavFrontend
header_colors = '\033[95m'
end_colors = '\033[0m'
class Speech2Text:
"""Speech2Text class
Examples:
>>> import soundfile
>>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
"""
def __init__(
self,
asr_train_config: Union[Path, str] = None,
asr_model_file: Union[Path, str] = None,
cmvn_file: Union[Path, str] = None,
lm_train_config: Union[Path, str] = None,
lm_file: Union[Path, str] = None,
token_type: str = None,
bpemodel: str = None,
device: str = "cpu",
maxlenratio: float = 0.0,
minlenratio: float = 0.0,
dtype: str = "float32",
beam_size: int = 20,
ctc_weight: float = 0.5,
lm_weight: float = 1.0,
ngram_weight: float = 0.9,
penalty: float = 0.0,
nbest: int = 1,
token_num_relax: int = 1,
decoding_ind: int = 0,
decoding_mode: str = "model1",
frontend_conf: dict = None,
**kwargs,
):
assert check_argument_types()
# 1. Build ASR model
scorers = {}
asr_model, asr_train_args = ASRTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file, device
)
frontend = None
if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
if decoding_mode == "model1":
decoder = asr_model.decoder
else:
decoder = asr_model.decoder2
if asr_model.ctc is not None:
ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
scorers.update(
ctc=ctc
)
token_list = asr_model.token_list
scorers.update(
decoder=decoder,
length_bonus=LengthBonus(len(token_list)),
)
# 2. Build Language model
if lm_train_config is not None:
lm, lm_train_args = LMTask.build_model_from_file(
lm_train_config, lm_file, device
)
scorers["lm"] = lm.lm
# 3. Build ngram model
# ngram is not supported now
ngram = None
scorers["ngram"] = ngram
# 4. Build BeamSearch object
# transducer is not supported now
beam_search_transducer = None
weights = dict(
decoder=1.0 - ctc_weight,
ctc=ctc_weight,
lm=lm_weight,
ngram=ngram_weight,
length_bonus=penalty,
)
beam_search = BeamSearch(
beam_size=beam_size,
weights=weights,
scorers=scorers,
sos=asr_model.sos,
eos=asr_model.eos,
vocab_size=len(token_list),
token_list=token_list,
pre_beam_score_key=None if ctc_weight == 1.0 else "full",
)
beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
for scorer in scorers.values():
if isinstance(scorer, torch.nn.Module):
scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
# logging.info(f"Beam_search: {beam_search}")
logging.info(f"Decoding device={device}, dtype={dtype}")
# 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
if token_type is None:
token_type = asr_train_args.token_type
if bpemodel is None:
bpemodel = asr_train_args.bpemodel
if token_type is None:
tokenizer = None
elif token_type == "bpe":
if bpemodel is not None:
tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
else:
tokenizer = None
else:
tokenizer = build_tokenizer(token_type=token_type)
converter = TokenIDConverter(token_list=token_list)
logging.info(f"Text tokenizer: {tokenizer}")
self.asr_model = asr_model
self.asr_train_args = asr_train_args
self.converter = converter
self.tokenizer = tokenizer
self.beam_search = beam_search
self.beam_search_transducer = beam_search_transducer
self.maxlenratio = maxlenratio
self.minlenratio = minlenratio
self.device = device
self.dtype = dtype
self.nbest = nbest
self.token_num_relax = token_num_relax
self.decoding_ind = decoding_ind
self.decoding_mode = decoding_mode
self.frontend = frontend
@torch.no_grad()
def __call__(
self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
) -> List[
Tuple[
Optional[str],
List[str],
List[int],
Union[Hypothesis],
]
]:
"""Inference
Args:
speech: Input speech data
Returns:
text, token, token_int, hyp
"""
assert check_argument_types()
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
self.asr_model.frontend = None
else:
feats = speech
feats_len = speech_lengths
lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
feats_raw = feats.clone().to(self.device)
batch = {"speech": feats, "speech_lengths": feats_len}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
_, enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
if isinstance(enc, tuple):
enc = enc[0]
assert len(enc) == 1, len(enc)
if self.decoding_mode == "model1":
predictor_outs = self.asr_model.calc_predictor_mask(enc, enc_len)
else:
enc, enc_len = self.asr_model.encode2(enc, enc_len, feats_raw, feats_len, ind=self.decoding_ind)
predictor_outs = self.asr_model.calc_predictor_mask2(enc, enc_len)
scama_mask = predictor_outs[4]
pre_token_length = predictor_outs[1]
pre_acoustic_embeds = predictor_outs[0]
maxlen = pre_token_length.sum().item() + self.token_num_relax
minlen = max(0, pre_token_length.sum().item() - self.token_num_relax)
# c. Pass the encoder result to the beam search
nbest_hyps = self.beam_search(
x=enc[0], scama_mask=scama_mask, pre_acoustic_embeds=pre_acoustic_embeds, maxlenratio=self.maxlenratio,
minlenratio=self.minlenratio, maxlen=int(maxlen), minlen=int(minlen),
)
nbest_hyps = nbest_hyps[: self.nbest]
results = []
for hyp in nbest_hyps:
assert isinstance(hyp, (Hypothesis)), type(hyp)
# remove sos/eos and get results
last_pos = -1
if isinstance(hyp.yseq, list):
token_int = hyp.yseq[1:last_pos]
else:
token_int = hyp.yseq[1:last_pos].tolist()
# remove blank symbol id, which is assumed to be 0
token_int = list(filter(lambda x: x != 0, token_int))
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
token = list(filter(lambda x: x != "<gbg>", token))
if self.tokenizer is not None:
text = self.tokenizer.tokens2text(token)
else:
text = None
results.append((text, token, token_int, hyp))
assert check_return_type(results)
return results
def inference(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
ngram_file: Optional[str] = None,
cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
token_num_relax: int = 1,
decoding_ind: int = 0,
decoding_mode: str = "model1",
**kwargs,
):
inference_pipeline = inference_modelscope(
maxlenratio=maxlenratio,
minlenratio=minlenratio,
batch_size=batch_size,
beam_size=beam_size,
ngpu=ngpu,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
penalty=penalty,
log_level=log_level,
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
raw_inputs=raw_inputs,
lm_train_config=lm_train_config,
lm_file=lm_file,
token_type=token_type,
key_file=key_file,
word_lm_train_config=word_lm_train_config,
bpemodel=bpemodel,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
output_dir=output_dir,
dtype=dtype,
seed=seed,
ngram_weight=ngram_weight,
ngram_file=ngram_file,
nbest=nbest,
num_workers=num_workers,
token_num_relax=token_num_relax,
decoding_ind=decoding_ind,
decoding_mode=decoding_mode,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
maxlenratio: float,
minlenratio: float,
batch_size: int,
beam_size: int,
ngpu: int,
ctc_weight: float,
lm_weight: float,
penalty: float,
log_level: Union[int, str],
# data_path_and_name_and_type,
asr_train_config: Optional[str],
asr_model_file: Optional[str],
ngram_file: Optional[str] = None,
cmvn_file: Optional[str] = None,
# raw_inputs: Union[np.ndarray, torch.Tensor] = None,
lm_train_config: Optional[str] = None,
lm_file: Optional[str] = None,
token_type: Optional[str] = None,
key_file: Optional[str] = None,
word_lm_train_config: Optional[str] = None,
bpemodel: Optional[str] = None,
allow_variable_data_keys: bool = False,
streaming: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
ngram_weight: float = 0.9,
nbest: int = 1,
num_workers: int = 1,
token_num_relax: int = 1,
decoding_ind: int = 0,
decoding_mode: str = "model1",
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
if param_dict is not None and "decoding_model" in param_dict:
if param_dict["decoding_model"] == "fast":
decoding_ind = 0
decoding_mode = "model1"
elif param_dict["decoding_model"] == "normal":
decoding_ind = 0
decoding_mode = "model2"
elif param_dict["decoding_model"] == "offline":
decoding_ind = 1
decoding_mode = "model2"
else:
raise NotImplementedError("unsupported decoding model {}".format(param_dict["decoding_model"]))
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2text
speech2text_kwargs = dict(
asr_train_config=asr_train_config,
asr_model_file=asr_model_file,
cmvn_file=cmvn_file,
lm_train_config=lm_train_config,
lm_file=lm_file,
ngram_file=ngram_file,
token_type=token_type,
bpemodel=bpemodel,
device=device,
maxlenratio=maxlenratio,
minlenratio=minlenratio,
dtype=dtype,
beam_size=beam_size,
ctc_weight=ctc_weight,
lm_weight=lm_weight,
ngram_weight=ngram_weight,
penalty=penalty,
nbest=nbest,
streaming=streaming,
token_num_relax=token_num_relax,
decoding_ind=decoding_ind,
decoding_mode=decoding_mode,
)
speech2text = Speech2Text(**speech2text_kwargs)
def _forward(data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
**kwargs,
):
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
fs=fs,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
finish_count = 0
file_count = 1
# 7. Start for-loop
# FIXME(kamo): The output format should be discussed
asr_result_list = []
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
writer = DatadirWriter(output_path)
else:
writer = None
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
#batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
# N-best list of (text, token, token_int, hyp_object)
try:
results = speech2text(**batch)
except TooShortUttError as e:
logging.warning(f"Utterance {keys} {e}")
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
results = [[" ", ["sil"], [2], hyp]] * nbest
# Only supporting batch_size==1
key = keys[0]
logging.info(f"Utterance: {key}")
for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
# Create a directory: outdir/{n}best_recog
if writer is not None:
ibest_writer = writer[f"{n}best_recog"]
# Write the result to each file
ibest_writer["token"][key] = " ".join(token)
# ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = " ".join(word_lists)
return asr_result_list
return _forward
def get_parser():
parser = config_argparse.ArgumentParser(
description="ASR Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--asr_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--asr_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global cmvn file",
)
group.add_argument(
"--lm_train_config",
type=str,
help="LM training configuration",
)
group.add_argument(
"--lm_file",
type=str,
help="LM parameter file",
)
group.add_argument(
"--word_lm_train_config",
type=str,
help="Word LM training configuration",
)
group.add_argument(
"--word_lm_file",
type=str,
help="Word LM parameter file",
)
group.add_argument(
"--ngram_file",
type=str,
help="N-gram parameter file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("Beam-search related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
group.add_argument("--beam_size", type=int, default=20, help="Beam size")
group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
group.add_argument(
"--maxlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain max output length. "
"If maxlenratio=0.0 (default), it uses a end-detect "
"function "
"to automatically find maximum hypothesis lengths."
"If maxlenratio<0.0, its absolute value is interpreted"
"as a constant max output length",
)
group.add_argument(
"--minlenratio",
type=float,
default=0.0,
help="Input length ratio to obtain min output length",
)
group.add_argument(
"--ctc_weight",
type=float,
default=0.5,
help="CTC weight in joint decoding",
)
group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group = parser.add_argument_group("Text converter related")
group.add_argument(
"--token_type",
type=str_or_none,
default=None,
choices=["char", "bpe", None],
help="The token type for ASR model. "
"If not given, refers from the training args",
)
group.add_argument(
"--bpemodel",
type=str_or_none,
default=None,
help="The model path of sentencepiece. "
"If not given, refers from the training args",
)
group.add_argument("--token_num_relax", type=int, default=1, help="")
group.add_argument("--decoding_ind", type=int, default=0, help="")
group.add_argument("--decoding_mode", type=str, default="model1", help="")
group.add_argument(
"--ctc_weight2",
type=float,
default=0.0,
help="CTC weight in joint decoding",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
import os
from funasr_local.tasks.asr import ASRTask
# for ASR Training
def parse_args():
parser = ASRTask.get_parser()
parser.add_argument(
"--gpu_id",
type=int,
default=0,
help="local gpu id.",
)
args = parser.parse_args()
return args
def main(args=None, cmd=None):
# for ASR Training
ASRTask.main(args=args, cmd=cmd)
if __name__ == '__main__':
args = parse_args()
# setup local gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
# DDP settings
if args.ngpu > 1:
args.distributed = True
else:
args.distributed = False
assert args.num_worker_count == 1
# re-compute batch size: when dataset type is small
if args.dataset_type == "small":
if args.batch_size is not None:
args.batch_size = args.batch_size * args.ngpu
if args.batch_bins is not None:
args.batch_bins = args.batch_bins * args.ngpu
main(args=args)
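# Launch sketch (values are hypothetical; all flags except --gpu_id come from
# ASRTask.get_parser()):
#   python train_asr.py --gpu_id 0 --ngpu 1 --num_worker_count 1 \
#       --dataset_type small --batch_size 32
# Note that for dataset_type "small" the configured batch_size/batch_bins are
# multiplied by ngpu in the __main__ block above.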

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
import os
from funasr_local.tasks.asr import ASRTaskParaformer as ASRTask
# for ASR Training
def parse_args():
parser = ASRTask.get_parser()
parser.add_argument(
"--gpu_id",
type=int,
default=0,
help="local gpu id.",
)
args = parser.parse_args()
return args
def main(args=None, cmd=None):
# for ASR Training
ASRTask.main(args=args, cmd=cmd)
if __name__ == '__main__':
args = parse_args()
# setup local gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
# DDP settings
if args.ngpu > 1:
args.distributed = True
else:
args.distributed = False
assert args.num_worker_count == 1
# re-compute batch size: when dataset type is small
if args.dataset_type == "small":
if args.batch_size is not None:
args.batch_size = args.batch_size * args.ngpu
if args.batch_bins is not None:
args.batch_bins = args.batch_bins * args.ngpu
main(args=args)

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
import os
from funasr_local.tasks.asr import ASRTransducerTask
# for ASR Training
def parse_args():
parser = ASRTransducerTask.get_parser()
parser.add_argument(
"--gpu_id",
type=int,
default=0,
help="local gpu id.",
)
args = parser.parse_args()
return args
def main(args=None, cmd=None):
# for ASR Training
ASRTransducerTask.main(args=args, cmd=cmd)
if __name__ == '__main__':
args = parse_args()
# setup local gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
# DDP settings
if args.ngpu > 1:
args.distributed = True
else:
args.distributed = False
assert args.num_worker_count == 1
# re-compute batch size: when dataset type is small
if args.dataset_type == "small":
if args.batch_size is not None:
args.batch_size = args.batch_size * args.ngpu
if args.batch_bins is not None:
args.batch_bins = args.batch_bins * args.ngpu
main(args=args)

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
import os
from funasr_local.tasks.asr import ASRTaskUniASR
# for ASR Training
def parse_args():
parser = ASRTaskUniASR.get_parser()
parser.add_argument(
"--gpu_id",
type=int,
default=0,
help="local gpu id.",
)
args = parser.parse_args()
return args
def main(args=None, cmd=None):
# for ASR Training
ASRTaskUniASR.main(args=args, cmd=cmd)
if __name__ == '__main__':
args = parse_args()
# setup local gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
# DDP settings
if args.ngpu > 1:
args.distributed = True
else:
args.distributed = False
assert args.num_worker_count == 1
# re-compute batch size: when dataset type is small
if args.dataset_type == "small":
if args.batch_size is not None:
args.batch_size = args.batch_size * args.ngpu
if args.batch_bins is not None:
args.batch_bins = args.batch_bins * args.ngpu
main(args=args)

View File

@@ -0,0 +1,145 @@
import os
import yaml
def update_dct(fin_configs, root):
if root == {}:
return fin_configs  # nothing to merge: keep the existing configs
for root_key, root_value in root.items():
if not isinstance(root[root_key], dict):
fin_configs[root_key] = root[root_key]
else:
if root_key in fin_configs.keys():
result = update_dct(fin_configs[root_key], root[root_key])
fin_configs[root_key] = result
else:
fin_configs[root_key] = root[root_key]
return fin_configs
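# A minimal illustration of the recursive merge above (values are made up):
#   update_dct({"optim_conf": {"lr": 1e-3, "eps": 1e-8}}, {"optim_conf": {"lr": 5e-4}})
#   -> {"optim_conf": {"lr": 5e-4, "eps": 1e-8}}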
def parse_args(mode):
if mode == "asr":
from funasr_local.tasks.asr import ASRTask as ASRTask
elif mode == "paraformer":
from funasr_local.tasks.asr import ASRTaskParaformer as ASRTask
elif mode == "paraformer_vad_punc":
from funasr_local.tasks.asr import ASRTaskParaformer as ASRTask
elif mode == "uniasr":
from funasr_local.tasks.asr import ASRTaskUniASR as ASRTask
elif mode == "mfcca":
from funasr_local.tasks.asr import ASRTaskMFCCA as ASRTask
elif mode == "tp":
from funasr_local.tasks.asr import ASRTaskAligner as ASRTask
else:
raise ValueError("Unknown mode: {}".format(mode))
parser = ASRTask.get_parser()
args = parser.parse_args()
return args, ASRTask
def build_trainer(modelscope_dict,
data_dir,
output_dir,
train_set="train",
dev_set="validation",
distributed=False,
dataset_type="small",
batch_bins=None,
max_epoch=None,
optim=None,
lr=None,
scheduler=None,
scheduler_conf=None,
specaug=None,
specaug_conf=None,
param_dict=None,
**kwargs):
mode = modelscope_dict['mode']
args, ASRTask = parse_args(mode=mode)
# ddp related
if args.local_rank is not None:
distributed = True
else:
distributed = False
args.local_rank = args.local_rank if args.local_rank is not None else 0
local_rank = args.local_rank
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[args.local_rank])
else:
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.local_rank)
config = modelscope_dict['am_model_config']
finetune_config = modelscope_dict['finetune_config']
init_param = modelscope_dict['init_model']
cmvn_file = modelscope_dict['cmvn_file']
seg_dict_file = modelscope_dict['seg_dict']
# overwrite parameters
with open(config) as f:
configs = yaml.safe_load(f)
with open(finetune_config) as f:
finetune_configs = yaml.safe_load(f)
# set data_types
if dataset_type == "large":
finetune_configs["dataset_conf"]["data_types"] = "sound,text"
finetune_configs = update_dct(configs, finetune_configs)
for key, value in finetune_configs.items():
if hasattr(args, key):
setattr(args, key, value)
# prepare data
args.dataset_type = dataset_type
if args.dataset_type == "small":
args.train_data_path_and_name_and_type = [["{}/{}/wav.scp".format(data_dir, train_set), "speech", "sound"],
["{}/{}/text".format(data_dir, train_set), "text", "text"]]
args.valid_data_path_and_name_and_type = [["{}/{}/wav.scp".format(data_dir, dev_set), "speech", "sound"],
["{}/{}/text".format(data_dir, dev_set), "text", "text"]]
elif args.dataset_type == "large":
args.train_data_file = None
args.valid_data_file = None
else:
raise ValueError(f"Not supported dataset_type={args.dataset_type}")
args.init_param = [init_param]
args.cmvn_file = cmvn_file
if os.path.exists(seg_dict_file):
args.seg_dict_file = seg_dict_file
else:
args.seg_dict_file = None
args.data_dir = data_dir
args.train_set = train_set
args.dev_set = dev_set
args.output_dir = output_dir
args.gpu_id = args.local_rank
args.config = finetune_config
if optim is not None:
args.optim = optim
if lr is not None:
args.optim_conf["lr"] = lr
if scheduler is not None:
args.scheduler = scheduler
if scheduler_conf is not None:
args.scheduler_conf = scheduler_conf
if specaug is not None:
args.specaug = specaug
if specaug_conf is not None:
args.specaug_conf = specaug_conf
if max_epoch is not None:
args.max_epoch = max_epoch
if batch_bins is not None:
if args.dataset_type == "small":
args.batch_bins = batch_bins
elif args.dataset_type == "large":
args.dataset_conf["batch_conf"]["batch_size"] = batch_bins
else:
raise ValueError(f"Not supported dataset_type={args.dataset_type}")
if args.normalize in ["null", "none", "None"]:
args.normalize = None
if args.patience in ["null", "none", "None"]:
args.patience = None
args.local_rank = local_rank
args.distributed = distributed
ASRTask.finetune_args = args
return ASRTask
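# A minimal usage sketch (all paths and values below are illustrative
# placeholders, not files shipped with this repo):
#   modelscope_dict = {"mode": "paraformer", "am_model_config": "config.yaml",
#                      "finetune_config": "finetune.yaml", "init_model": "model.pb",
#                      "cmvn_file": "am.mvn", "seg_dict": "seg_dict"}
#   trainer = build_trainer(modelscope_dict, data_dir="data", output_dir="exp")
#   trainer.main(args=trainer.finetune_args)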

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python3
import os
from funasr_local.tasks.data2vec import Data2VecTask
def parse_args():
parser = Data2VecTask.get_parser()
parser.add_argument(
"--gpu_id",
type=int,
default=0,
help="local gpu id.",
)
args = parser.parse_args()
return args
def main(args=None, cmd=None):
# for data2vec Training
Data2VecTask.main(args=args, cmd=cmd)
if __name__ == '__main__':
args = parse_args()
# setup local gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
# DDP settings
if args.ngpu > 1:
args.distributed = True
else:
args.distributed = False
assert args.num_worker_count == 1
# re-compute batch size: when dataset type is small
if args.dataset_type == "small":
if args.batch_size is not None:
args.batch_size = args.batch_size * args.ngpu
if args.batch_bins is not None:
args.batch_bins = args.batch_bins * args.ngpu
main(args=args)

View File

@@ -0,0 +1,185 @@
#!/usr/bin/env python3
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import argparse
import logging
import os
import sys
from typing import Union, Dict, Any
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
def get_parser():
parser = config_argparse.ArgumentParser(
description="Speaker Verification",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--njob",
type=int,
default=1,
help="The number of jobs for each gpu",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=True)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--vad_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--vad_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--diar_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--diar_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global CMVN file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("The inference configuration related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument(
"--diar_smooth_size",
type=int,
default=121,
help="The smoothing size for post-processing"
)
return parser
def inference_launch(mode, **kwargs):
if mode == "sond":
from funasr_local.bin.sond_inference import inference_modelscope
return inference_modelscope(mode=mode, **kwargs)
elif mode == "sond_demo":
from funasr_local.bin.sond_inference import inference_modelscope
param_dict = {
"extract_profile": True,
"sv_train_config": "sv.yaml",
"sv_model_file": "sv.pb",
}
if "param_dict" in kwargs and kwargs["param_dict"] is not None:
for key in param_dict:
if key not in kwargs["param_dict"]:
kwargs["param_dict"][key] = param_dict[key]
else:
kwargs["param_dict"] = param_dict
return inference_modelscope(mode=mode, **kwargs)
elif mode == "eend-ola":
from funasr_local.bin.eend_ola_inference import inference_modelscope
return inference_modelscope(mode=mode, **kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
parser.add_argument(
"--mode",
type=str,
default="sond",
help="The decoding mode",
)
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
# set logging messages
logging.basicConfig(
level=args.log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("Decoding args: {}".format(kwargs))
# gpu setting
if args.ngpu > 0:
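# Convention assumed by this launcher: output_dir ends with ".<jobid>",
# e.g. "exp/decode/output.3" -> jobid 3, and every njob consecutive jobs
# share one gpu from gpuid_list.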
jobid = int(args.output_dir.split(".")[-1])
gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
inference_launch(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
import os
from funasr_local.tasks.diar import DiarTask
# for ASR Training
def parse_args():
parser = DiarTask.get_parser()
parser.add_argument(
"--gpu_id",
type=int,
default=0,
help="local gpu id.",
)
args = parser.parse_args()
return args
def main(args=None, cmd=None):
# for ASR Training
DiarTask.main(args=args, cmd=cmd)
if __name__ == '__main__':
args = parse_args()
# setup local gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
# DDP settings
if args.ngpu > 1:
args.distributed = True
else:
args.distributed = False
assert args.num_worker_count == 1
# re-compute batch size: when dataset type is small
if args.dataset_type == "small":
if args.batch_size is not None:
args.batch_size = args.batch_size * args.ngpu
if args.batch_bins is not None:
args.batch_bins = args.batch_bins * args.ngpu
main(args=args)

View File

@@ -0,0 +1,429 @@
#!/usr/bin/env python3
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
import numpy as np
import torch
from scipy.signal import medfilt
from typeguard import check_argument_types
from funasr_local.models.frontend.wav_frontend import WavFrontendMel23
from funasr_local.tasks.diar import EENDOLADiarTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
class Speech2Diarization:
"""Speech2Diarlization class
Examples:
>>> import soundfile
>>> import numpy as np
>>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
>>> profile = np.load("profiles.npy")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2diar(audio, profile)
{"spk1": [(int, int), ...], ...}
"""
def __init__(
self,
diar_train_config: Union[Path, str] = None,
diar_model_file: Union[Path, str] = None,
device: str = "cpu",
dtype: str = "float32",
):
assert check_argument_types()
# 1. Build Diarization model
diar_model, diar_train_args = EENDOLADiarTask.build_model_from_file(
config_file=diar_train_config,
model_file=diar_model_file,
device=device
)
frontend = None
if diar_train_args.frontend is not None and diar_train_args.frontend_conf is not None:
frontend = WavFrontendMel23(**diar_train_args.frontend_conf)
# set up seed for eda
np.random.seed(diar_train_args.seed)
torch.manual_seed(diar_train_args.seed)
torch.cuda.manual_seed(diar_train_args.seed)
os.environ['PYTORCH_SEED'] = str(diar_train_args.seed)
logging.info("diar_model: {}".format(diar_model))
logging.info("diar_train_args: {}".format(diar_train_args))
diar_model.to(dtype=getattr(torch, dtype)).eval()
self.diar_model = diar_model
self.diar_train_args = diar_train_args
self.device = device
self.dtype = dtype
self.frontend = frontend
@torch.no_grad()
def __call__(
self,
speech: Union[torch.Tensor, np.ndarray],
speech_lengths: Union[torch.Tensor, np.ndarray] = None
):
"""Inference
Args:
speech: Input speech data
Returns:
diarization results
"""
assert check_argument_types()
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
self.diar_model.frontend = None
else:
feats = speech
feats_len = speech_lengths
batch = {"speech": feats, "speech_lengths": feats_len}
batch = to_device(batch, device=self.device)
results = self.diar_model.estimate_sequential(**batch)
return results
@staticmethod
def from_pretrained(
model_tag: Optional[str] = None,
**kwargs: Optional[Any],
):
"""Build Speech2Diarization instance from the pretrained model.
Args:
model_tag (Optional[str]): Model tag of the pretrained models.
Currently, the tags of espnet_model_zoo are supported.
Returns:
Speech2Diarization: Speech2Diarization instance.
"""
if model_tag is not None:
try:
from espnet_model_zoo.downloader import ModelDownloader
except ImportError:
logging.error(
"`espnet_model_zoo` is not installed. "
"Please install via `pip install -U espnet_model_zoo`."
)
raise
d = ModelDownloader()
kwargs.update(**d.download_and_unpack(model_tag))
return Speech2Diarization(**kwargs)
def inference_modelscope(
diar_train_config: str,
diar_model_file: str,
output_dir: Optional[str] = None,
batch_size: int = 1,
dtype: str = "float32",
ngpu: int = 1,
num_workers: int = 0,
log_level: Union[int, str] = "INFO",
key_file: Optional[str] = None,
model_tag: Optional[str] = None,
allow_variable_data_keys: bool = True,
streaming: bool = False,
param_dict: Optional[dict] = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("param_dict: {}".format(param_dict))
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Build speech2diar
speech2diar_kwargs = dict(
diar_train_config=diar_train_config,
diar_model_file=diar_model_file,
device=device,
dtype=dtype,
)
logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
speech2diar = Speech2Diarization.from_pretrained(
model_tag=model_tag,
**speech2diar_kwargs,
)
speech2diar.diar_model.eval()
def output_results_str(results: dict, uttid: str):
rst = []
mid = uttid.rsplit("-", 1)[0]
for key in results:
results[key] = [(x[0] / 100, x[1] / 100) for x in results[key]]
template = "SPEAKER {} 0 {:.2f} {:.2f} <NA> <NA> {} <NA> <NA>"
for spk, segs in results.items():
rst.extend([template.format(mid, st, ed, spk) for st, ed in segs])
return "\n".join(rst)
def _forward(
data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None,
output_dir_v2: Optional[str] = None,
param_dict: Optional[dict] = None,
):
# 2. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [(raw_inputs[0], "speech", "sound")]
loader = EENDOLADiarTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=EENDOLADiarTask.build_preprocess_fn(speech2diar.diar_train_args, False),
collate_fn=EENDOLADiarTask.build_collate_fn(speech2diar.diar_train_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
# 3. Start for-loop
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
os.makedirs(output_path, exist_ok=True)
output_writer = open("{}/result.txt".format(output_path), "w")
result_list = []
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
# batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
results = speech2diar(**batch)
# post process
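# medfilt smooths each speaker's 0/1 frame activity over an 11-frame
# window; after zero-padding, np.diff marks the change points, which come
# in (segment start, segment end) pairs consumed below.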
a = results[0][0].cpu().numpy()
a = medfilt(a, (11, 1))
rst = []
for spkid, frames in enumerate(a.T):
frames = np.pad(frames, (1, 1), 'constant')
changes, = np.where(np.diff(frames, axis=0) != 0)
fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
for s, e in zip(changes[::2], changes[1::2]):
st = s / 10.
dur = (e - s) / 10.
rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
# Only supporting batch_size==1
value = "\n".join(rst)
item = {"key": keys[0], "value": value}
result_list.append(item)
if output_path is not None:
output_writer.write(value)
output_writer.flush()
if output_path is not None:
output_writer.close()
return result_list
return _forward
def inference(
data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
diar_train_config: Optional[str],
diar_model_file: Optional[str],
output_dir: Optional[str] = None,
batch_size: int = 1,
dtype: str = "float32",
ngpu: int = 0,
seed: int = 0,
num_workers: int = 1,
log_level: Union[int, str] = "INFO",
key_file: Optional[str] = None,
model_tag: Optional[str] = None,
allow_variable_data_keys: bool = True,
streaming: bool = False,
smooth_size: int = 83,
dur_threshold: int = 10,
out_format: str = "vad",
**kwargs,
):
inference_pipeline = inference_modelscope(
diar_train_config=diar_train_config,
diar_model_file=diar_model_file,
output_dir=output_dir,
batch_size=batch_size,
dtype=dtype,
ngpu=ngpu,
seed=seed,
num_workers=num_workers,
log_level=log_level,
key_file=key_file,
model_tag=model_tag,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
smooth_size=smooth_size,
dur_threshold=dur_threshold,
out_format=out_format,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs=None)
def get_parser():
parser = config_argparse.ArgumentParser(
description="Speaker verification/x-vector extraction",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--diar_train_config",
type=str,
help="diarization training configuration",
)
group.add_argument(
"--diar_model_file",
type=str,
help="diarization model parameter file",
)
group.add_argument(
"--dur_threshold",
type=int,
default=10,
help="The threshold for short segments in number frames"
)
group.add_argument(
"--smooth_size",
type=int,
default=83,
help="The smoothing window length, in number of frames"
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
parser.add_argument("--streaming", type=str2bool, default=False)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
logging.info("args: {}".format(kwargs))
if args.output_dir is None:
jobid, n_gpu = 1, 1
gpuid = args.gpuid_list.split(",")[jobid - 1]
else:
jobid = int(args.output_dir.split(".")[-1])
n_gpu = len(args.gpuid_list.split(","))
gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
results_list = inference(**kwargs)
for results in results_list:
print("{} {}".format(results["key"], results["value"]))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,211 @@
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
import sys
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
import numpy as np
import torch
from torch.nn.parallel import data_parallel
from typeguard import check_argument_types
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.tasks.lm import LMTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.forward_adaptor import ForwardAdaptor
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.types import float_or_none
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
def calc_perplexity(
output_dir: str,
batch_size: int,
dtype: str,
ngpu: int,
seed: int,
num_workers: int,
log_level: Union[int, str],
data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
key_file: Optional[str],
train_config: Optional[str],
model_file: Optional[str],
log_base: Optional[float],
allow_variable_data_keys: bool,
):
assert check_argument_types()
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1:
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build LM
model, train_args = LMTask.build_model_from_file(config_file=train_config, model_file=model_file, device=device)
# Wrap the model to make model.nll() data-parallel
wrapped_model = ForwardAdaptor(model, "nll")
wrapped_model.to(dtype=getattr(torch, dtype)).eval()
logging.info(f"Model:\n{model}")
# 3. Build data-iterator
loader = LMTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=LMTask.build_preprocess_fn(train_args, False),
collate_fn=LMTask.build_collate_fn(train_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
# 4. Start for-loop
with DatadirWriter(output_dir) as writer:
total_nll = 0.0
total_ntokens = 0
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
with torch.no_grad():
batch = to_device(batch, device)
if ngpu <= 1:
# NOTE(kamo): data_parallel also should work with ngpu=1,
# but for debuggability it's better to keep this block.
nll, lengths = wrapped_model(**batch)
else:
nll, lengths = data_parallel(
wrapped_model, (), range(ngpu), module_kwargs=batch
)
assert _bs == len(nll) == len(lengths), (_bs, len(nll), len(lengths))
# nll: (B, L) -> (B,)
nll = nll.detach().cpu().numpy().sum(1)
# lengths: (B,)
lengths = lengths.detach().cpu().numpy()
total_nll += nll.sum()
total_ntokens += lengths.sum()
for key, _nll, ntoken in zip(keys, nll, lengths):
if log_base is None:
utt_ppl = np.exp(_nll / ntoken)
else:
utt_ppl = log_base ** (_nll / ntoken / np.log(log_base))
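# Both branches compute the same quantity, PPL = exp(NLL / N): for any
# base b, b ** (NLL / N / ln(b)) = exp(NLL / N) because b ** x = exp(x * ln(b)).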
# Write PPL of each utts for debugging or analysis
writer["utt2nll"][key] = str(-_nll)
writer["utt2ppl"][key] = str(utt_ppl)
writer["utt2ntokens"][key] = str(ntoken)
if log_base is None:
ppl = np.exp(total_nll / total_ntokens)
else:
ppl = log_base ** (total_nll / total_ntokens / np.log(log_base))
with (Path(output_dir) / "ppl").open("w", encoding="utf-8") as f:
f.write(f"{ppl}\n")
with (Path(output_dir) / "base").open("w", encoding="utf-8") as f:
if log_base is None:
_log_base = np.e
else:
_log_base = log_base
f.write(f"{_log_base}\n")
logging.info(f"PPL={ppl}")
def get_parser():
parser = config_argparse.ArgumentParser(
description="Calc perplexity",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
parser.add_argument(
"--log_base",
type=float_or_none,
default=None,
help="The base of logarithm for Perplexity. "
"If None, napier's constant is used.",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=True,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument("--train_config", type=str)
group.add_argument("--model_file", type=str)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
calc_perplexity(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,406 @@
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
import sys
import os
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
from typing import Any
from typing import List
import numpy as np
import torch
from torch.nn.parallel import data_parallel
from typeguard import check_argument_types
from funasr_local.tasks.lm import LMTask
from funasr_local.datasets.preprocessor import LMPreprocessor
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.forward_adaptor import ForwardAdaptor
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.types import float_or_none
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
def inference(
output_dir: str,
batch_size: int,
dtype: str,
ngpu: int,
seed: int,
num_workers: int,
log_level: Union[int, str],
train_config: Optional[str],
model_file: Optional[str],
log_base: Optional[float],
key_file: Optional[str] = None,
allow_variable_data_keys: bool = False,
split_with_space: Optional[bool] = False,
seg_dict_file: Optional[str] = None,
data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
raw_inputs: Union[List[Any], bytes, str] = None,
**kwargs,
):
inference_pipeline = inference_modelscope(
output_dir=output_dir,
raw_inputs=raw_inputs,
batch_size=batch_size,
dtype=dtype,
ngpu=ngpu,
seed=seed,
num_workers=num_workers,
log_level=log_level,
key_file=key_file,
train_config=train_config,
model_file=model_file,
log_base=log_base,
allow_variable_data_keys=allow_variable_data_keys,
split_with_space=split_with_space,
seg_dict_file=seg_dict_file,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
batch_size: int,
dtype: str,
ngpu: int,
seed: int,
num_workers: int,
log_level: Union[int, str],
key_file: Optional[str],
train_config: Optional[str],
model_file: Optional[str],
log_base: Optional[float] = 10,
allow_variable_data_keys: bool = False,
split_with_space: Optional[bool] = False,
seg_dict_file: Optional[str] = None,
output_dir: Optional[str] = None,
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build Model
model, train_args = LMTask.build_model_from_file(
train_config, model_file, device)
wrapped_model = ForwardAdaptor(model, "nll")
wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval()
logging.info(f"Model:\n{model}")
preprocessor = LMPreprocessor(
train=False,
token_type=train_args.token_type,
token_list=train_args.token_list,
bpemodel=train_args.bpemodel,
text_cleaner=train_args.cleaner,
g2p_type=train_args.g2p,
text_name="text",
non_linguistic_symbols=train_args.non_linguistic_symbols,
split_with_space=split_with_space,
seg_dict_file=seg_dict_file
)
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[List[Any], bytes, str] = None,
output_dir_v2: Optional[str] = None,
param_dict: dict = None,
):
results = []
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
writer = DatadirWriter(output_path)
else:
writer = None
if raw_inputs is not None:
line = raw_inputs.strip()
key = "lm demo"
if line == "":
item = {'key': key, 'value': ""}
results.append(item)
return results
batch = {}
batch['text'] = line
if preprocessor is not None:
batch = preprocessor(key, batch)
# Force data-precision
for name in batch:
value = batch[name]
if not isinstance(value, np.ndarray):
raise RuntimeError(
f"All values must be converted to np.ndarray object "
f'by preprocessing, but "{name}" is still {type(value)}.'
)
# Cast to desired type
if value.dtype.kind == "f":
value = value.astype("float32")
elif value.dtype.kind == "i":
value = value.astype("long")
else:
raise NotImplementedError(f"Not supported dtype: {value.dtype}")
batch[name] = value
batch["text_lengths"] = torch.from_numpy(
np.array([len(batch["text"])], dtype='int32'))
batch["text"] = np.expand_dims(batch["text"], axis=0)
with torch.no_grad():
batch = to_device(batch, device)
if ngpu <= 1:
nll, lengths = wrapped_model(**batch)
else:
nll, lengths = data_parallel(
wrapped_model, (), range(ngpu), module_kwargs=batch
)
## compute ppl
ppl_out_batch = ""
ids2tokens = preprocessor.token_id_converter.ids2tokens
for sent_ids, sent_nll in zip(batch['text'], nll):
pre_word = "<s>"
cur_word = None
sent_lst = ids2tokens(sent_ids) + ['</s>']
ppl_out = " ".join(sent_lst) + "\n"
for word, word_nll in zip(sent_lst, sent_nll):
cur_word = word
word_nll = -word_nll.cpu()
if log_base is None:
word_prob = np.exp(word_nll)
else:
word_prob = log_base ** (word_nll / np.log(log_base))
ppl_out += ' p( {cur} | {pre} ) = {prob} [ {word_nll} ]\n'.format(
cur=cur_word,
pre=pre_word,
prob=round(word_prob.item(), 8),
word_nll=round(word_nll.item(), 8)
)
pre_word = cur_word
sent_nll_mean = sent_nll.mean().cpu().numpy()
sent_nll_sum = sent_nll.sum().cpu().numpy()
if log_base is None:
sent_ppl = np.exp(sent_nll_mean)
else:
sent_ppl = log_base ** (sent_nll_mean / np.log(log_base))
ppl_out += 'logprob= {sent_nll} ppl= {sent_ppl}\n\n'.format(
sent_nll=round(-sent_nll_sum.item(), 4),
sent_ppl=round(sent_ppl.item(), 4)
)
ppl_out_batch += ppl_out
item = {'key': key, 'value': ppl_out}
if writer is not None:
writer["ppl"][key+":\n"] = ppl_out
results.append(item)
return results
# 3. Build data-iterator
loader = LMTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=preprocessor,
collate_fn=LMTask.build_collate_fn(train_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
# 4. Start for-loop
total_nll = 0.0
total_ntokens = 0
ppl_out_all = ""
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
ppl_out_batch = ""
with torch.no_grad():
batch = to_device(batch, device)
if ngpu <= 1:
# NOTE(kamo): data_parallel also should work with ngpu=1,
# but for debuggability it's better to keep this block.
nll, lengths = wrapped_model(**batch)
else:
nll, lengths = data_parallel(
wrapped_model, (), range(ngpu), module_kwargs=batch
)
## print ppl
ids2tokens = preprocessor.token_id_converter.ids2tokens
for key, sent_ids, sent_nll in zip(keys, batch['text'], nll):
pre_word = "<s>"
cur_word = None
sent_lst = ids2tokens(sent_ids) + ['</s>']
ppl_out = " ".join(sent_lst) + "\n"
for word, word_nll in zip(sent_lst, sent_nll):
cur_word = word
word_nll = -word_nll.cpu()
if log_base is None:
word_prob = np.exp(word_nll)
else:
word_prob = log_base ** (word_nll / np.log(log_base))
ppl_out += ' p( {cur} | {pre} ) = {prob} [ {word_nll} ]\n'.format(
cur=cur_word,
pre=pre_word,
prob=round(word_prob.item(), 8),
word_nll=round(word_nll.item(), 8)
)
pre_word = cur_word
sent_nll_mean = sent_nll.mean().cpu().numpy()
sent_nll_sum = sent_nll.sum().cpu().numpy()
if log_base is None:
sent_ppl = np.exp(sent_nll_mean)
else:
sent_ppl = log_base ** (sent_nll_mean / np.log(log_base))
ppl_out += 'logprob= {sent_nll} ppl= {sent_ppl}\n\n'.format(
sent_nll=round(-sent_nll_sum.item(), 4),
sent_ppl=round(sent_ppl.item(), 4)
)
ppl_out_batch += ppl_out
utt2nll = round(-sent_nll_sum.item(), 5)
item = {'key': key, 'value': ppl_out}
if writer is not None:
writer["ppl"][key+":\n"] = ppl_out
writer["utt2nll"][key] = str(utt2nll)
results.append(item)
ppl_out_all += ppl_out_batch
assert _bs == len(nll) == len(lengths), (_bs, len(nll), len(lengths))
# nll: (B, L) -> (B,)
nll = nll.detach().cpu().numpy().sum(1)
# lengths: (B,)
lengths = lengths.detach().cpu().numpy()
total_nll += nll.sum()
total_ntokens += lengths.sum()
if log_base is None:
ppl = np.exp(total_nll / total_ntokens)
else:
ppl = log_base ** (total_nll / total_ntokens / np.log(log_base))
avg_ppl = 'logprob= {total_nll} ppl= {total_ppl}\n'.format(
total_nll=round(-total_nll.item(), 4),
total_ppl=round(ppl.item(), 4)
)
item = {'key': 'AVG PPL', 'value': avg_ppl}
ppl_out_all += avg_ppl
if writer is not None:
writer["ppl"]["AVG PPL : "] = avg_ppl
results.append(item)
return results
return _forward
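# A minimal usage sketch (paths are illustrative placeholders):
#   pipeline = inference_modelscope(batch_size=1, dtype="float32", ngpu=0,
#       seed=0, num_workers=0, log_level="INFO", key_file=None,
#       train_config="lm.yaml", model_file="lm.pb")
#   results = pipeline(None, raw_inputs="今天 天气 怎么样")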
def get_parser():
parser = config_argparse.ArgumentParser(
description="Calc perplexity",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
parser.add_argument(
"--log_base",
type=float_or_none,
default=10,
help="The base of logarithm for Perplexity. "
"If None, napier's constant is used.",
required=False
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
action="append",
required=False
)
group.add_argument(
"--raw_inputs",
type=str,
required=False
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group.add_argument("--split_with_space", type=str2bool, default=False)
group.add_argument("--seg_dict_file", type=str_or_none)
group = parser.add_argument_group("The model configuration related")
group.add_argument("--train_config", type=str)
group.add_argument("--model_file", type=str)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
inference(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
import argparse
import logging
import os
import sys
from typing import Union, Dict, Any
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils.types import float_or_none
def get_parser():
parser = config_argparse.ArgumentParser(
description="Calc perplexity",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument("--gpuid_list", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument("--njob", type=int, default=1, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
parser.add_argument(
"--log_base",
type=float_or_none,
default=10,
help="The base of logarithm for Perplexity. "
"If None, napier's constant is used.",
required=False
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
action="append",
required=False
)
group.add_argument(
"--raw_inputs",
type=str,
required=False
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group.add_argument("--split_with_space", type=str2bool, default=False)
group.add_argument("--seg_dict_file", type=str_or_none)
group = parser.add_argument_group("The model configuration related")
group.add_argument("--train_config", type=str)
group.add_argument("--model_file", type=str)
group.add_argument("--mode", type=str, default="lm")
return parser
def inference_launch(mode, **kwargs):
if mode == "transformer":
from funasr_local.bin.lm_inference import inference_modelscope
return inference_modelscope(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
# set logging messages
logging.basicConfig(
level=args.log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("Decoding args: {}".format(kwargs))
# gpu setting
if args.ngpu > 0:
jobid = int(args.output_dir.split(".")[-1])
gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
kwargs.pop("gpuid_list", None)
kwargs.pop("njob", None)
results = inference_launch(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
import os
from funasr_local.tasks.lm import LMTask
# for LM Training
def parse_args():
parser = LMTask.get_parser()
parser.add_argument(
"--gpu_id",
type=int,
default=0,
help="local gpu id.",
)
args = parser.parse_args()
return args
def main(args=None, cmd=None):
# for LM Training
LMTask.main(args=args, cmd=cmd)
if __name__ == '__main__':
args = parse_args()
# setup local gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
# DDP settings
if args.ngpu > 1:
args.distributed = True
else:
args.distributed = False
assert args.num_worker_count == 1
# re-compute batch size: when dataset type is small
if args.dataset_type == "small" and args.ngpu != 0:
if args.batch_size is not None:
args.batch_size = args.batch_size * args.ngpu
if args.batch_bins is not None:
args.batch_bins = args.batch_bins * args.ngpu
main(args=args)

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
import argparse
import logging
import os
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="decoding configs",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("--model_name",
type=str,
default="speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
help="model name in modelscope")
parser.add_argument("--model_revision",
type=str,
default="v1.0.4",
help="model revision in modelscope")
parser.add_argument("--local_model_path",
type=str,
default=None,
help="local model path, usually for fine-tuning")
parser.add_argument("--wav_list",
type=str,
help="input wav list")
parser.add_argument("--output_file",
type=str,
help="saving decoding results")
parser.add_argument(
"--njob",
type=int,
default=1,
help="The number of jobs for each gpu",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
args = parser.parse_args()
# set logging messages
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("Decoding args: {}".format(args))
# gpu setting
if args.ngpu > 0:
jobid = int(args.output_file.split(".")[-1])
gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
if args.local_model_path is None:
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
model="damo/{}".format(args.model_name),
model_revision=args.model_revision)
else:
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
model=args.local_model_path)
with open(args.wav_list, 'r') as f_wav:
wav_lines = f_wav.readlines()
with open(args.output_file, "w") as f_out:
for line in wav_lines:
wav_id, wav_path = line.strip().split()
logging.info("decoding, utt_id: ['{}']".format(wav_id))
rec_result = inference_pipeline(audio_in=wav_path)
if 'text' in rec_result:
text = rec_result["text"]
else:
text = ''
f_out.write(wav_id + " " + text + "\n")
logging.info("best hypo: {} \n".format(text))

View File

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
import argparse
import logging
import os
import sys
from typing import Union, Dict, Any
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils.types import float_or_none
def get_parser():
parser = config_argparse.ArgumentParser(
description="Punctuation inference",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument("--gpuid_list", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument("--njob", type=int, default=1, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group = parser.add_argument_group("Input data related")
group.add_argument("--data_path_and_name_and_type", type=str2triple_str, action="append", required=False)
group.add_argument("--raw_inputs", type=str, required=False)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--cache", type=list, required=False)
group.add_argument("--param_dict", type=dict, required=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument("--train_config", type=str)
group.add_argument("--model_file", type=str)
group.add_argument("--mode", type=str, default="punc")
return parser
def inference_launch(mode, **kwargs):
if mode == "punc":
from funasr_local.bin.punctuation_infer import inference_modelscope
return inference_modelscope(**kwargs)
if mode == "punc_VadRealtime":
from funasr_local.bin.punctuation_infer_vadrealtime import inference_modelscope
return inference_modelscope(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
# set logging messages
logging.basicConfig(
level=args.log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("Decoding args: {}".format(kwargs))
# gpu setting
if args.ngpu > 0:
jobid = int(args.output_dir.split(".")[-1])
gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
kwargs.pop("gpuid_list", None)
kwargs.pop("njob", None)
results = inference_launch(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import os
from funasr_local.tasks.punctuation import PunctuationTask
def parse_args():
parser = PunctuationTask.get_parser()
parser.add_argument(
"--gpu_id",
type=int,
default=0,
help="local gpu id.",
)
parser.add_argument(
"--punc_list",
type=str,
default=None,
help="Punctuation list",
)
args = parser.parse_args()
return args
def main(args=None, cmd=None):
"""
punc training.
"""
PunctuationTask.main(args=args, cmd=cmd)
if __name__ == "__main__":
args = parse_args()
# setup local gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
# DDP settings
if args.ngpu > 1:
args.distributed = True
else:
args.distributed = False
main(args=args)

View File

@@ -0,0 +1,320 @@
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
import sys
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Any
from typing import List
import numpy as np
import torch
from typeguard import check_argument_types
from funasr_local.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.tasks.punctuation import PunctuationTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.forward_adaptor import ForwardAdaptor
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.datasets.preprocessor import split_to_mini_sentence
class Text2Punc:
def __init__(
self,
train_config: Optional[str],
model_file: Optional[str],
device: str = "cpu",
dtype: str = "float32",
):
# Build Model
model, train_args = PunctuationTask.build_model_from_file(train_config, model_file, device)
self.device = device
# Wrap the model so that model.inference() can be called through it
self.wrapped_model = ForwardAdaptor(model, "inference")
self.wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval()
# logging.info(f"Model:\n{model}")
self.punc_list = train_args.punc_list
self.period = 0
for i in range(len(self.punc_list)):
if self.punc_list[i] == ",":
self.punc_list[i] = "，"
elif self.punc_list[i] == "?":
self.punc_list[i] = "？"
elif self.punc_list[i] == "。":
self.period = i
self.preprocessor = CodeMixTokenizerCommonPreprocessor(
train=False,
token_type=train_args.token_type,
token_list=train_args.token_list,
bpemodel=train_args.bpemodel,
text_cleaner=train_args.cleaner,
g2p_type=train_args.g2p,
text_name="text",
non_linguistic_symbols=train_args.non_linguistic_symbols,
)
@torch.no_grad()
def __call__(self, text: Union[list, str], split_size=20):
data = {"text": text}
result = self.preprocessor(data=data, uid="12938712838719")
split_text = self.preprocessor.pop_split_text_data(result)
mini_sentences = split_to_mini_sentence(split_text, split_size)
mini_sentences_id = split_to_mini_sentence(data["text"], split_size)
assert len(mini_sentences) == len(mini_sentences_id)
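# split_to_mini_sentence chunks the token sequence, e.g. 50 tokens with
# split_size=20 become chunks of 20, 20 and 10, punctuated in turn with
# the tail of each chunk carried over via cache_sent below.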
cache_sent = []
cache_sent_id = torch.from_numpy(np.array([], dtype='int32'))
new_mini_sentence = ""
new_mini_sentence_punc = []
cache_pop_trigger_limit = 200
for mini_sentence_i in range(len(mini_sentences)):
mini_sentence = mini_sentences[mini_sentence_i]
mini_sentence_id = mini_sentences_id[mini_sentence_i]
mini_sentence = cache_sent + mini_sentence
mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
data = {
"text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0),
"text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')),
}
data = to_device(data, self.device)
y, _ = self.wrapped_model(**data)
_, indices = y.view(-1, y.shape[-1]).topk(1, dim=1)
punctuations = indices
if indices.size()[0] != 1:
punctuations = torch.squeeze(indices)
assert punctuations.size()[0] == len(mini_sentence)
# Search for the last Period/QuestionMark as cache
if mini_sentence_i < len(mini_sentences) - 1:
sentenceEnd = -1
last_comma_index = -1
for i in range(len(punctuations) - 2, 1, -1):
if self.punc_list[punctuations[i]] == "" or self.punc_list[punctuations[i]] == "":
sentenceEnd = i
break
if last_comma_index < 0 and self.punc_list[punctuations[i]] == "":
last_comma_index = i
if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
# The sentence is too long; cut it off at the last comma.
sentenceEnd = last_comma_index
punctuations[sentenceEnd] = self.period
cache_sent = mini_sentence[sentenceEnd + 1:]
cache_sent_id = mini_sentence_id[sentenceEnd + 1:]
mini_sentence = mini_sentence[0:sentenceEnd + 1]
punctuations = punctuations[0:sentenceEnd + 1]
# if len(punctuations) == 0:
# continue
punctuations_np = punctuations.cpu().numpy()
new_mini_sentence_punc += [int(x) for x in punctuations_np]
words_with_punc = []
for i in range(len(mini_sentence)):
if i > 0:
if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1:
mini_sentence[i] = " " + mini_sentence[i]
words_with_punc.append(mini_sentence[i])
if self.punc_list[punctuations[i]] != "_":
words_with_punc.append(self.punc_list[punctuations[i]])
new_mini_sentence += "".join(words_with_punc)
# Add Period for the end of the sentence
new_mini_sentence_out = new_mini_sentence
new_mini_sentence_punc_out = new_mini_sentence_punc
if mini_sentence_i == len(mini_sentences) - 1:
if new_mini_sentence[-1] == "" or new_mini_sentence[-1] == "":
new_mini_sentence_out = new_mini_sentence[:-1] + ""
new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
elif new_mini_sentence[-1] != "" and new_mini_sentence[-1] != "":
new_mini_sentence_out = new_mini_sentence + ""
new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
return new_mini_sentence_out, new_mini_sentence_punc_out
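# A minimal usage sketch (file names are illustrative placeholders):
#   text2punc = Text2Punc("punc_config.yaml", "punc.pb", device="cpu")
#   sentence, punc_ids = text2punc("跨境河流是养育沿岸人民的生命之源")
#   # sentence is the input with predicted punctuation inserted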
def inference(
batch_size: int,
dtype: str,
ngpu: int,
seed: int,
num_workers: int,
output_dir: str,
log_level: Union[int, str],
train_config: Optional[str],
model_file: Optional[str],
key_file: Optional[str] = None,
data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
raw_inputs: Union[List[Any], bytes, str] = None,
cache: List[Any] = None,
param_dict: dict = None,
**kwargs,
):
inference_pipeline = inference_modelscope(
output_dir=output_dir,
batch_size=batch_size,
dtype=dtype,
ngpu=ngpu,
seed=seed,
num_workers=num_workers,
log_level=log_level,
key_file=key_file,
train_config=train_config,
model_file=model_file,
param_dict=param_dict,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
batch_size: int,
dtype: str,
ngpu: int,
seed: int,
num_workers: int,
log_level: Union[int, str],
key_file: Optional[str],
train_config: Optional[str],
model_file: Optional[str],
output_dir: Optional[str] = None,
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
text2punc = Text2Punc(train_config, model_file, device)
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[List[Any], bytes, str] = None,
output_dir_v2: Optional[str] = None,
cache: List[Any] = None,
param_dict: dict = None,
):
results = []
split_size = 20
        if raw_inputs is not None:
line = raw_inputs.strip()
key = "demo"
if line == "":
item = {'key': key, 'value': ""}
results.append(item)
return results
result, _ = text2punc(line)
item = {'key': key, 'value': result}
results.append(item)
return results
for inference_text, _, _ in data_path_and_name_and_type:
with open(inference_text, "r", encoding="utf-8") as fin:
for line in fin:
line = line.strip()
segs = line.split("\t")
if len(segs) != 2:
continue
key = segs[0]
if len(segs[1]) == 0:
continue
result, _ = text2punc(segs[1])
item = {'key': key, 'value': result}
results.append(item)
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
        if output_path is not None:
output_file_name = "infer.out"
Path(output_path).mkdir(parents=True, exist_ok=True)
output_file_path = (Path(output_path) / output_file_name).absolute()
with open(output_file_path, "w", encoding="utf-8") as fout:
for item_i in results:
key_out = item_i["key"]
value_out = item_i["value"]
fout.write(f"{key_out}\t{value_out}\n")
return results
return _forward
def get_parser():
parser = config_argparse.ArgumentParser(
description="Punctuation inference",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group = parser.add_argument_group("Input data related")
group.add_argument("--data_path_and_name_and_type", type=str2triple_str, action="append", required=False)
group.add_argument("--raw_inputs", type=str, required=False)
group.add_argument("--cache", type=list, required=False)
group.add_argument("--param_dict", type=dict, required=False)
group.add_argument("--key_file", type=str_or_none)
group = parser.add_argument_group("The model configuration related")
group.add_argument("--train_config", type=str)
group.add_argument("--model_file", type=str)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
# kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()
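# Usage sketch (illustrative, not part of the original file): a minimal CLI
# invocation of this offline punctuation script, assuming it is saved as
# punc_inference.py; all paths below are hypothetical placeholders. Each input
# line is expected as "<key>\t<raw text>", and the punctuated results are
# written to <output_dir>/infer.out in the same "<key>\t<text>" format.
#
#   python punc_inference.py \
#       --train_config exp/punc/config.yaml \
#       --model_file exp/punc/model.pb \
#       --data_path_and_name_and_type "data/test/text,text,text" \
#       --output_dir exp/punc/decode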

View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
import sys
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Any
from typing import List
import numpy as np
import torch
from typeguard import check_argument_types
from funasr_local.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.tasks.punctuation import PunctuationTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.forward_adaptor import ForwardAdaptor
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.datasets.preprocessor import split_to_mini_sentence
class Text2Punc:
def __init__(
self,
train_config: Optional[str],
model_file: Optional[str],
device: str = "cpu",
dtype: str = "float32",
):
# Build Model
model, train_args = PunctuationTask.build_model_from_file(train_config, model_file, device)
self.device = device
        # Wrap the model so that its "inference" method is exposed as forward()
self.wrapped_model = ForwardAdaptor(model, "inference")
self.wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval()
# logging.info(f"Model:\n{model}")
self.punc_list = train_args.punc_list
self.period = 0
        # Normalize half-width punctuation tokens to their full-width forms
        # and record the index of the period token.
        for i in range(len(self.punc_list)):
            if self.punc_list[i] == ",":
                self.punc_list[i] = "，"
            elif self.punc_list[i] == "?":
                self.punc_list[i] = "？"
            elif self.punc_list[i] == "。":
                self.period = i
self.preprocessor = CodeMixTokenizerCommonPreprocessor(
train=False,
token_type=train_args.token_type,
token_list=train_args.token_list,
bpemodel=train_args.bpemodel,
text_cleaner=train_args.cleaner,
g2p_type=train_args.g2p,
text_name="text",
non_linguistic_symbols=train_args.non_linguistic_symbols,
)
print("start decoding!!!")
@torch.no_grad()
def __call__(self, text: Union[list, str], cache: list, split_size=20):
if cache is not None and len(cache) > 0:
precache = "".join(cache)
else:
precache = ""
cache = []
data = {"text": precache + text}
result = self.preprocessor(data=data, uid="12938712838719")
split_text = self.preprocessor.pop_split_text_data(result)
mini_sentences = split_to_mini_sentence(split_text, split_size)
mini_sentences_id = split_to_mini_sentence(data["text"], split_size)
assert len(mini_sentences) == len(mini_sentences_id)
cache_sent = []
cache_sent_id = torch.from_numpy(np.array([], dtype='int32'))
sentence_punc_list = []
        sentence_words_list = []
cache_pop_trigger_limit = 200
skip_num = 0
for mini_sentence_i in range(len(mini_sentences)):
mini_sentence = mini_sentences[mini_sentence_i]
mini_sentence_id = mini_sentences_id[mini_sentence_i]
mini_sentence = cache_sent + mini_sentence
mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
data = {
"text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0),
"text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')),
"vad_indexes": torch.from_numpy(np.array([len(cache)], dtype='int32')),
}
data = to_device(data, self.device)
y, _ = self.wrapped_model(**data)
_, indices = y.view(-1, y.shape[-1]).topk(1, dim=1)
punctuations = indices
if indices.size()[0] != 1:
punctuations = torch.squeeze(indices)
assert punctuations.size()[0] == len(mini_sentence)
# Search for the last Period/QuestionMark as cache
if mini_sentence_i < len(mini_sentences) - 1:
sentenceEnd = -1
last_comma_index = -1
for i in range(len(punctuations) - 2, 1, -1):
if self.punc_list[punctuations[i]] == "" or self.punc_list[punctuations[i]] == "":
sentenceEnd = i
break
if last_comma_index < 0 and self.punc_list[punctuations[i]] == "":
last_comma_index = i
if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
                    # The sentence is too long; cut it off at the last comma.
sentenceEnd = last_comma_index
punctuations[sentenceEnd] = self.period
cache_sent = mini_sentence[sentenceEnd + 1:]
cache_sent_id = mini_sentence_id[sentenceEnd + 1:]
mini_sentence = mini_sentence[0:sentenceEnd + 1]
punctuations = punctuations[0:sentenceEnd + 1]
punctuations_np = punctuations.cpu().numpy()
sentence_punc_list += [self.punc_list[int(x)] for x in punctuations_np]
sentence_words_list += mini_sentence
assert len(sentence_punc_list) == len(sentence_words_list)
words_with_punc = []
sentence_punc_list_out = []
for i in range(0, len(sentence_words_list)):
if i > 0:
if len(sentence_words_list[i][0].encode()) == 1 and len(sentence_words_list[i - 1][-1].encode()) == 1:
sentence_words_list[i] = " " + sentence_words_list[i]
if skip_num < len(cache):
skip_num += 1
else:
words_with_punc.append(sentence_words_list[i])
if skip_num >= len(cache):
sentence_punc_list_out.append(sentence_punc_list[i])
if sentence_punc_list[i] != "_":
words_with_punc.append(sentence_punc_list[i])
sentence_out = "".join(words_with_punc)
sentenceEnd = -1
for i in range(len(sentence_punc_list) - 2, 1, -1):
if sentence_punc_list[i] == "" or sentence_punc_list[i] == "":
sentenceEnd = i
break
cache_out = sentence_words_list[sentenceEnd + 1 :]
if sentence_out[-1] in self.punc_list:
sentence_out = sentence_out[:-1]
sentence_punc_list_out[-1] = "_"
return sentence_out, sentence_punc_list_out, cache_out
def inference(
batch_size: int,
dtype: str,
ngpu: int,
seed: int,
num_workers: int,
output_dir: str,
log_level: Union[int, str],
train_config: Optional[str],
model_file: Optional[str],
key_file: Optional[str] = None,
data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
raw_inputs: Union[List[Any], bytes, str] = None,
cache: List[Any] = None,
param_dict: dict = None,
**kwargs,
):
inference_pipeline = inference_modelscope(
output_dir=output_dir,
batch_size=batch_size,
dtype=dtype,
ngpu=ngpu,
seed=seed,
num_workers=num_workers,
log_level=log_level,
key_file=key_file,
train_config=train_config,
model_file=model_file,
param_dict=param_dict,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs, cache)
def inference_modelscope(
batch_size: int,
dtype: str,
ngpu: int,
seed: int,
num_workers: int,
log_level: Union[int, str],
#cache: list,
key_file: Optional[str],
train_config: Optional[str],
model_file: Optional[str],
output_dir: Optional[str] = None,
param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
text2punc = Text2Punc(train_config, model_file, device)
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[List[Any], bytes, str] = None,
output_dir_v2: Optional[str] = None,
cache: List[Any] = None,
param_dict: dict = None,
):
results = []
split_size = 10
cache_in = param_dict["cache"]
        if raw_inputs is not None:
line = raw_inputs.strip()
key = "demo"
if line == "":
item = {'key': key, 'value': ""}
results.append(item)
return results
result, _, cache = text2punc(line, cache_in)
param_dict["cache"] = cache
item = {'key': key, 'value': result}
results.append(item)
return results
return results
return _forward
def get_parser():
parser = config_argparse.ArgumentParser(
description="Punctuation inference",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group = parser.add_argument_group("Input data related")
group.add_argument("--data_path_and_name_and_type", type=str2triple_str, action="append", required=False)
group.add_argument("--raw_inputs", type=str, required=False)
group.add_argument("--cache", type=list, required=False)
group.add_argument("--param_dict", type=dict, required=False)
group.add_argument("--key_file", type=str_or_none)
group = parser.add_argument_group("The model configuration related")
group.add_argument("--train_config", type=str)
group.add_argument("--model_file", type=str)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
# kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()
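# Usage sketch (illustrative, not part of the original file): unlike the
# offline script above, this streaming variant threads a cache of trailing,
# not-yet-finalized words through param_dict between calls. The model paths
# below are hypothetical placeholders.
#
#   pipeline = inference_modelscope(
#       batch_size=1, dtype="float32", ngpu=0, seed=0, num_workers=0,
#       log_level="INFO", key_file=None,
#       train_config="exp/punc/config.yaml", model_file="exp/punc/model.pb",
#   )
#   param_dict = {"cache": []}
#   for chunk in ["跨境河流是养育沿岸", "人民的生命之源"]:
#       results = pipeline(None, raw_inputs=chunk, param_dict=param_dict)
#       print(results[0]["value"])  # punctuated text for the finalized part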

View File

@@ -0,0 +1,577 @@
#!/usr/bin/env python3
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from collections import OrderedDict
import numpy as np
import soundfile
import torch
from torch.nn import functional as F
from typeguard import check_argument_types
from typeguard import check_return_type
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.tasks.diar import DiarTask
from funasr_local.tasks.asr import ASRTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from scipy.ndimage import median_filter
from funasr_local.utils.misc import statistic_model_parameters
from funasr_local.datasets.iterable_dataset import load_bytes
class Speech2Diarization:
"""Speech2Xvector class
Examples:
>>> import soundfile
>>> import numpy as np
>>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
>>> profile = np.load("profiles.npy")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2diar(audio, profile)
{"spk1": [(int, int), ...], ...}
"""
def __init__(
self,
diar_train_config: Union[Path, str] = None,
diar_model_file: Union[Path, str] = None,
device: Union[str, torch.device] = "cpu",
batch_size: int = 1,
dtype: str = "float32",
streaming: bool = False,
smooth_size: int = 83,
dur_threshold: float = 10,
):
assert check_argument_types()
        # 1. Build Diarization model
diar_model, diar_train_args = DiarTask.build_model_from_file(
config_file=diar_train_config,
model_file=diar_model_file,
device=device
)
logging.info("diar_model: {}".format(diar_model))
logging.info("model parameter number: {}".format(statistic_model_parameters(diar_model)))
logging.info("diar_train_args: {}".format(diar_train_args))
diar_model.to(dtype=getattr(torch, dtype)).eval()
self.diar_model = diar_model
self.diar_train_args = diar_train_args
self.token_list = diar_train_args.token_list
self.smooth_size = smooth_size
self.dur_threshold = dur_threshold
self.device = device
self.dtype = dtype
def smooth_multi_labels(self, multi_label):
multi_label = median_filter(multi_label, (self.smooth_size, 1), mode="constant", cval=0.0).astype(int)
return multi_label
@staticmethod
def calc_spk_turns(label_arr, spk_list):
turn_list = []
length = label_arr.shape[0]
n_spk = label_arr.shape[1]
for k in range(n_spk):
if spk_list[k] == "None":
continue
in_utt = False
start = 0
for i in range(length):
if label_arr[i, k] == 1 and in_utt is False:
start = i
in_utt = True
if label_arr[i, k] == 0 and in_utt is True:
turn_list.append([spk_list[k], start, i - start])
in_utt = False
if in_utt:
turn_list.append([spk_list[k], start, length - start])
return turn_list
@staticmethod
def seq2arr(seq, vec_dim=8):
        def int2vec(x, vec_dim=8, dtype=int):
b = ('{:0' + str(vec_dim) + 'b}').format(x)
# little-endian order: lower bit first
return (np.array(list(b)[::-1]) == '1').astype(dtype)
# process oov
seq = np.array([int(x) for x in seq])
new_seq = []
for i, x in enumerate(seq):
if x < 2 ** vec_dim:
new_seq.append(x)
else:
idx_list = np.where(seq < 2 ** vec_dim)[0]
idx = np.abs(idx_list - i).argmin()
new_seq.append(seq[idx_list[idx]])
return np.row_stack([int2vec(x, vec_dim) for x in new_seq])
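    # Worked example for seq2arr above (illustrative): with vec_dim=3, the
    # power-set label 6 == 0b110 decodes, lower bit first, to the multi-hot
    # vector [0, 1, 1], i.e. speakers 2 and 3 are active in that frame.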
def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"):
logits_idx = raw_logits.argmax(-1) # B, T, vocab_size -> B, T
# upsampling outputs to match inputs
ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio
        logits_idx = F.interpolate(
logits_idx.unsqueeze(1).float(),
size=(ut, ),
mode="nearest",
).squeeze(1).long()
logits_idx = logits_idx[0].tolist()
pse_labels = [self.token_list[x] for x in logits_idx]
if output_format == "pse_labels":
return pse_labels, None
multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num] # remove padding speakers
multi_labels = self.smooth_multi_labels(multi_labels)
if output_format == "binary_labels":
return multi_labels, None
spk_list = ["spk{}".format(i + 1) for i in range(spk_num)]
spk_turns = self.calc_spk_turns(multi_labels, spk_list)
results = OrderedDict()
for spk, st, dur in spk_turns:
if spk not in results:
results[spk] = []
if dur > self.dur_threshold:
results[spk].append((st, st+dur))
# sort segments in start time ascending
for spk in results:
results[spk] = sorted(results[spk], key=lambda x: x[0])
return results, pse_labels
@torch.no_grad()
def __call__(
self,
speech: Union[torch.Tensor, np.ndarray],
profile: Union[torch.Tensor, np.ndarray],
output_format: str = "speaker_turn"
):
"""Inference
Args:
speech: Input speech data
profile: Speaker profiles
Returns:
diarization results for each speaker
"""
assert check_argument_types()
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if isinstance(profile, np.ndarray):
profile = torch.tensor(profile)
# data: (Nsamples,) -> (1, Nsamples)
speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
profile = profile.unsqueeze(0).to(getattr(torch, self.dtype))
# lengths: (1,)
speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
profile_lengths = profile.new_full([1], dtype=torch.long, fill_value=profile.size(1))
batch = {"speech": speech, "speech_lengths": speech_lengths,
"profile": profile, "profile_lengths": profile_lengths}
# a. To device
batch = to_device(batch, device=self.device)
logits = self.diar_model.prediction_forward(**batch)
results, pse_labels = self.post_processing(logits, profile.shape[1], output_format)
return results, pse_labels
@staticmethod
def from_pretrained(
model_tag: Optional[str] = None,
**kwargs: Optional[Any],
):
"""Build Speech2Xvector instance from the pretrained model.
Args:
model_tag (Optional[str]): Model tag of the pretrained models.
Currently, the tags of espnet_model_zoo are supported.
Returns:
Speech2Xvector: Speech2Xvector instance.
"""
if model_tag is not None:
try:
from espnet_model_zoo.downloader import ModelDownloader
except ImportError:
logging.error(
"`espnet_model_zoo` is not installed. "
"Please install via `pip install -U espnet_model_zoo`."
)
raise
d = ModelDownloader()
kwargs.update(**d.download_and_unpack(model_tag))
return Speech2Diarization(**kwargs)
def inference_modelscope(
diar_train_config: str,
diar_model_file: str,
output_dir: Optional[str] = None,
batch_size: int = 1,
dtype: str = "float32",
ngpu: int = 0,
seed: int = 0,
num_workers: int = 0,
log_level: Union[int, str] = "INFO",
key_file: Optional[str] = None,
model_tag: Optional[str] = None,
allow_variable_data_keys: bool = True,
streaming: bool = False,
smooth_size: int = 83,
dur_threshold: int = 10,
out_format: str = "vad",
param_dict: Optional[dict] = None,
mode: str = "sond",
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("param_dict: {}".format(param_dict))
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2a. Build speech2xvec [Optional]
if mode == "sond_demo" and param_dict is not None and "extract_profile" in param_dict and param_dict["extract_profile"]:
assert "sv_train_config" in param_dict, "sv_train_config must be provided param_dict."
assert "sv_model_file" in param_dict, "sv_model_file must be provided in param_dict."
sv_train_config = param_dict["sv_train_config"]
sv_model_file = param_dict["sv_model_file"]
if "model_dir" in param_dict:
sv_train_config = os.path.join(param_dict["model_dir"], sv_train_config)
sv_model_file = os.path.join(param_dict["model_dir"], sv_model_file)
from funasr_local.bin.sv_inference import Speech2Xvector
speech2xvector_kwargs = dict(
sv_train_config=sv_train_config,
sv_model_file=sv_model_file,
device=device,
dtype=dtype,
streaming=streaming,
embedding_node="resnet1_dense"
)
logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs))
speech2xvector = Speech2Xvector.from_pretrained(
model_tag=model_tag,
**speech2xvector_kwargs,
)
speech2xvector.sv_model.eval()
# 2b. Build speech2diar
speech2diar_kwargs = dict(
diar_train_config=diar_train_config,
diar_model_file=diar_model_file,
device=device,
dtype=dtype,
streaming=streaming,
smooth_size=smooth_size,
dur_threshold=dur_threshold,
)
logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
speech2diar = Speech2Diarization.from_pretrained(
model_tag=model_tag,
**speech2diar_kwargs,
)
speech2diar.diar_model.eval()
def output_results_str(results: dict, uttid: str):
rst = []
mid = uttid.rsplit("-", 1)[0]
for key in results:
results[key] = [(x[0]/100, x[1]/100) for x in results[key]]
if out_format == "vad":
for spk, segs in results.items():
rst.append("{} {}".format(spk, segs))
else:
template = "SPEAKER {} 0 {:.2f} {:.2f} <NA> <NA> {} <NA> <NA>"
for spk, segs in results.items():
rst.extend([template.format(mid, st, ed, spk) for st, ed in segs])
return "\n".join(rst)
def _forward(
data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None,
output_dir_v2: Optional[str] = None,
param_dict: Optional[dict] = None,
):
logging.info("param_dict: {}".format(param_dict))
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, (list, tuple)):
if not isinstance(raw_inputs[0], List):
raw_inputs = [raw_inputs]
            assert all([len(example) >= 2 for example in raw_inputs]), \
                "Each test case in raw_inputs must contain at least 2 items (speech plus profiles)."
def prepare_dataset():
for idx, example in enumerate(raw_inputs):
# read waveform file
example = [load_bytes(x) if isinstance(x, bytes) else x
for x in example]
example = [soundfile.read(x)[0] if isinstance(x, str) else x
for x in example]
# convert torch tensor to numpy array
                    example = [x.numpy() if isinstance(x, torch.Tensor) else x
                               for x in example]
speech = example[0]
logging.info("Extracting profiles for {} waveforms".format(len(example)-1))
profile = [speech2xvector.calculate_embedding(x) for x in example[1:]]
profile = torch.cat(profile, dim=0)
yield ["test{}".format(idx)], {"speech": [speech], "profile": [profile]}
loader = prepare_dataset()
else:
raise TypeError("raw_inputs must be a list or tuple in [speech, profile1, profile2, ...] ")
else:
# 3. Build data-iterator
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=None,
collate_fn=None,
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
# 7. Start for-loop
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
os.makedirs(output_path, exist_ok=True)
output_writer = open("{}/result.txt".format(output_path), "w")
pse_label_writer = open("{}/labels.txt".format(output_path), "w")
logging.info("Start to diarize...")
result_list = []
for idx, (keys, batch) in enumerate(loader):
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
results, pse_labels = speech2diar(**batch)
# Only supporting batch_size==1
key, value = keys[0], output_results_str(results, keys[0])
item = {"key": key, "value": value}
result_list.append(item)
if output_path is not None:
                output_writer.write(value + "\n")
output_writer.flush()
pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels)))
pse_label_writer.flush()
if idx % 100 == 0:
logging.info("Processing {:5d}: {}".format(idx, key))
if output_path is not None:
output_writer.close()
pse_label_writer.close()
return result_list
return _forward
def inference(
data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
diar_train_config: Optional[str],
diar_model_file: Optional[str],
output_dir: Optional[str] = None,
batch_size: int = 1,
dtype: str = "float32",
ngpu: int = 0,
seed: int = 0,
num_workers: int = 1,
log_level: Union[int, str] = "INFO",
key_file: Optional[str] = None,
model_tag: Optional[str] = None,
allow_variable_data_keys: bool = True,
streaming: bool = False,
smooth_size: int = 83,
dur_threshold: int = 10,
out_format: str = "vad",
**kwargs,
):
inference_pipeline = inference_modelscope(
diar_train_config=diar_train_config,
diar_model_file=diar_model_file,
output_dir=output_dir,
batch_size=batch_size,
dtype=dtype,
ngpu=ngpu,
seed=seed,
num_workers=num_workers,
log_level=log_level,
key_file=key_file,
model_tag=model_tag,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
smooth_size=smooth_size,
dur_threshold=dur_threshold,
out_format=out_format,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs=None)
def get_parser():
parser = config_argparse.ArgumentParser(
description="Speaker verification/x-vector extraction",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--diar_train_config",
type=str,
help="diarization training configuration",
)
group.add_argument(
"--diar_model_file",
type=str,
help="diarization model parameter file",
)
group.add_argument(
"--dur_threshold",
type=int,
default=10,
help="The threshold for short segments in number frames"
)
parser.add_argument(
"--smooth_size",
type=int,
default=83,
help="The smoothing window length in number frames"
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
parser.add_argument("--streaming", type=str2bool, default=False)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
logging.info("args: {}".format(kwargs))
if args.output_dir is None:
jobid, n_gpu = 1, 1
gpuid = args.gpuid_list.split(",")[jobid-1]
else:
jobid = int(args.output_dir.split(".")[-1])
n_gpu = len(args.gpuid_list.split(","))
gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
results_list = inference(**kwargs)
for results in results_list:
print("{} {}".format(results["key"], results["value"]))
if __name__ == "__main__":
main()
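# Usage sketch (illustrative, not part of the original file): running SOND
# diarization on one recording plus per-speaker enrollment audio through
# raw_inputs. All paths and the param_dict entries below are hypothetical
# placeholders.
#
#   pipeline = inference_modelscope(
#       diar_train_config="exp/diar/config.yaml",
#       diar_model_file="exp/diar/model.pb",
#       ngpu=0, mode="sond_demo",
#       param_dict={"extract_profile": True,
#                   "sv_train_config": "sv.yaml",
#                   "sv_model_file": "sv.pb",
#                   "model_dir": "exp/sv"},
#   )
#   results = pipeline(raw_inputs=["record.wav", "spk1.wav", "spk2.wav"])
#   # results[0]["value"] holds "spkX [(start, end), ...]" lines (out_format="vad")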

View File

@@ -0,0 +1,443 @@
#!/usr/bin/env python3
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
import numpy as np
import torch
from kaldiio import WriteHelper
from typeguard import check_argument_types
from typeguard import check_return_type
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.tasks.sv import SVTask
from funasr_local.tasks.asr import ASRTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils.misc import statistic_model_parameters
class Speech2Xvector:
"""Speech2Xvector class
Examples:
>>> import soundfile
>>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2xvector(audio)
        (embedding, None, None)
"""
def __init__(
self,
sv_train_config: Union[Path, str] = None,
sv_model_file: Union[Path, str] = None,
device: str = "cpu",
batch_size: int = 1,
dtype: str = "float32",
streaming: bool = False,
embedding_node: str = "resnet1_dense",
):
assert check_argument_types()
        # 1. Build SV model
sv_model, sv_train_args = SVTask.build_model_from_file(
config_file=sv_train_config,
model_file=sv_model_file,
device=device
)
logging.info("sv_model: {}".format(sv_model))
logging.info("model parameter number: {}".format(statistic_model_parameters(sv_model)))
logging.info("sv_train_args: {}".format(sv_train_args))
sv_model.to(dtype=getattr(torch, dtype)).eval()
self.sv_model = sv_model
self.sv_train_args = sv_train_args
self.device = device
self.dtype = dtype
self.embedding_node = embedding_node
@torch.no_grad()
def calculate_embedding(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
# data: (Nsamples,) -> (1, Nsamples)
speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
# lengths: (1,)
lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
batch = {"speech": speech, "speech_lengths": lengths}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
enc, ilens = self.sv_model.encode(**batch)
# c. Forward Pooling
pooling = self.sv_model.pooling_layer(enc)
# d. Forward Decoder
outputs, embeddings = self.sv_model.decoder(pooling)
if self.embedding_node not in embeddings:
raise ValueError("Required embedding node {} not in {}".format(
self.embedding_node, embeddings.keys()))
return embeddings[self.embedding_node]
@torch.no_grad()
def __call__(
self, speech: Union[torch.Tensor, np.ndarray],
ref_speech: Optional[Union[torch.Tensor, np.ndarray]] = None,
) -> Tuple[torch.Tensor, Union[torch.Tensor, None], Union[torch.Tensor, None]]:
"""Inference
Args:
speech: Input speech data
ref_speech: Reference speech to compare
Returns:
embedding, ref_embedding, similarity_score
"""
assert check_argument_types()
self.sv_model.eval()
embedding = self.calculate_embedding(speech)
ref_emb, score = None, None
if ref_speech is not None:
ref_emb = self.calculate_embedding(ref_speech)
score = torch.cosine_similarity(embedding, ref_emb)
results = (embedding, ref_emb, score)
assert check_return_type(results)
return results
@staticmethod
def from_pretrained(
model_tag: Optional[str] = None,
**kwargs: Optional[Any],
):
"""Build Speech2Xvector instance from the pretrained model.
Args:
model_tag (Optional[str]): Model tag of the pretrained models.
Currently, the tags of espnet_model_zoo are supported.
Returns:
Speech2Xvector: Speech2Xvector instance.
"""
if model_tag is not None:
try:
from espnet_model_zoo.downloader import ModelDownloader
except ImportError:
logging.error(
"`espnet_model_zoo` is not installed. "
"Please install via `pip install -U espnet_model_zoo`."
)
raise
d = ModelDownloader()
kwargs.update(**d.download_and_unpack(model_tag))
return Speech2Xvector(**kwargs)
def inference_modelscope(
output_dir: Optional[str] = None,
batch_size: int = 1,
dtype: str = "float32",
ngpu: int = 1,
seed: int = 0,
num_workers: int = 0,
log_level: Union[int, str] = "INFO",
key_file: Optional[str] = None,
sv_train_config: Optional[str] = "sv.yaml",
sv_model_file: Optional[str] = "sv.pb",
model_tag: Optional[str] = None,
allow_variable_data_keys: bool = True,
streaming: bool = False,
embedding_node: str = "resnet1_dense",
sv_threshold: float = 0.9465,
param_dict: Optional[dict] = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("param_dict: {}".format(param_dict))
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2xvector
speech2xvector_kwargs = dict(
sv_train_config=sv_train_config,
sv_model_file=sv_model_file,
device=device,
dtype=dtype,
streaming=streaming,
embedding_node=embedding_node
)
logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs))
speech2xvector = Speech2Xvector.from_pretrained(
model_tag=model_tag,
**speech2xvector_kwargs,
)
speech2xvector.sv_model.eval()
def _forward(
data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
param_dict: Optional[dict] = None,
):
logging.info("param_dict: {}".format(param_dict))
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
# 3. Build data-iterator
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=None,
collate_fn=None,
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
        # 7. Start for-loop
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
embd_writer, ref_embd_writer, score_writer = None, None, None
if output_path is not None:
os.makedirs(output_path, exist_ok=True)
embd_writer = WriteHelper("ark,scp:{}/xvector.ark,{}/xvector.scp".format(output_path, output_path))
sv_result_list = []
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
embedding, ref_embedding, score = speech2xvector(**batch)
# Only supporting batch_size==1
key = keys[0]
normalized_score = 0.0
if score is not None:
score = score.item()
normalized_score = max(score - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
item = {"key": key, "value": normalized_score}
else:
item = {"key": key, "value": embedding.squeeze(0).cpu().numpy()}
sv_result_list.append(item)
if output_path is not None:
embd_writer(key, embedding[0].cpu().numpy())
if ref_embedding is not None:
if ref_embd_writer is None:
ref_embd_writer = WriteHelper(
"ark,scp:{}/ref_xvector.ark,{}/ref_xvector.scp".format(output_path, output_path)
)
score_writer = open(os.path.join(output_path, "score.txt"), "w")
ref_embd_writer(key, ref_embedding[0].cpu().numpy())
score_writer.write("{} {:.6f}\n".format(key, normalized_score))
if output_path is not None:
embd_writer.close()
if ref_embd_writer is not None:
ref_embd_writer.close()
score_writer.close()
return sv_result_list
return _forward
def inference(
output_dir: Optional[str],
batch_size: int,
dtype: str,
ngpu: int,
seed: int,
num_workers: int,
log_level: Union[int, str],
data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
key_file: Optional[str],
sv_train_config: Optional[str],
sv_model_file: Optional[str],
model_tag: Optional[str],
allow_variable_data_keys: bool = True,
streaming: bool = False,
embedding_node: str = "resnet1_dense",
sv_threshold: float = 0.9465,
**kwargs,
):
inference_pipeline = inference_modelscope(
output_dir=output_dir,
batch_size=batch_size,
dtype=dtype,
ngpu=ngpu,
seed=seed,
num_workers=num_workers,
log_level=log_level,
key_file=key_file,
sv_train_config=sv_train_config,
sv_model_file=sv_model_file,
model_tag=model_tag,
allow_variable_data_keys=allow_variable_data_keys,
streaming=streaming,
embedding_node=embedding_node,
sv_threshold=sv_threshold,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs=None)
def get_parser():
parser = config_argparse.ArgumentParser(
description="Speaker verification/x-vector extraction",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--sv_train_config",
type=str,
help="SV training configuration",
)
group.add_argument(
"--sv_model_file",
type=str,
help="SV model parameter file",
)
group.add_argument(
"--sv_threshold",
type=float,
default=0.9465,
help="The threshold for verification"
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
parser.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
parser.add_argument("--streaming", type=str2bool, default=False)
parser.add_argument("--embedding_node", type=str, default="resnet1_dense")
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
logging.info("args: {}".format(kwargs))
if args.output_dir is None:
jobid, n_gpu = 1, 1
gpuid = args.gpuid_list.split(",")[jobid-1]
else:
jobid = int(args.output_dir.split(".")[-1])
n_gpu = len(args.gpuid_list.split(","))
gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
results_list = inference(**kwargs)
for results in results_list:
print("{} {}".format(results["key"], results["value"]))
if __name__ == "__main__":
main()
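# Usage sketch (illustrative, not part of the original file): scoring two
# utterances with the in-process API instead of the CLI; config/model paths
# are hypothetical placeholders.
#
#   import soundfile
#   sv = Speech2Xvector(sv_train_config="exp/sv/config.yaml",
#                       sv_model_file="exp/sv/sv.pb")
#   enroll, _ = soundfile.read("enroll.wav")
#   test, _ = soundfile.read("test.wav")
#   emb, ref_emb, score = sv(test, ref_speech=enroll)
#   # score is the raw cosine similarity; the CLI path additionally rescales
#   # it with sv_threshold, as in _forward above.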

View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import argparse
import logging
import os
import sys
from typing import Union, Dict, Any
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
def get_parser():
parser = config_argparse.ArgumentParser(
description="Speaker Verification",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--njob",
type=int,
default=1,
help="The number of jobs for each gpu",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=True)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--vad_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--vad_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--sv_train_config",
type=str,
help="ASR training configuration",
)
group.add_argument(
"--sv_model_file",
type=str,
help="ASR model parameter file",
)
group.add_argument(
"--cmvn_file",
type=str,
help="Global CMVN file",
)
group.add_argument(
"--model_tag",
type=str,
help="Pretrained model tag. If specify this option, *_train_config and "
"*_file will be overwritten",
)
group = parser.add_argument_group("The inference configuration related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
group.add_argument(
"--sv_threshold",
type=float,
default=0.9465,
help="The threshold for verification"
)
parser.add_argument(
"--embedding_node",
type=str,
default="resnet1_dense",
help="The network node to extract embedding"
)
return parser
def inference_launch(mode, **kwargs):
if mode == "sv":
from funasr_local.bin.sv_inference import inference_modelscope
return inference_modelscope(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
parser.add_argument(
"--mode",
type=str,
default="sv",
help="The decoding mode",
)
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
# set logging messages
logging.basicConfig(
level=args.log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("Decoding args: {}".format(kwargs))
# gpu setting
if args.ngpu > 0:
jobid = int(args.output_dir.split(".")[-1])
gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
inference_launch(**kwargs)
if __name__ == "__main__":
main()
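# Usage sketch (illustrative, not part of the original file): this launcher
# only builds the pipeline dispatched from --mode; paths below are
# hypothetical placeholders.
#
#   pipeline = inference_launch(
#       mode="sv", ngpu=0,
#       sv_train_config="exp/sv/config.yaml", sv_model_file="exp/sv/sv.pb",
#   )
#   results = pipeline([("data/test/wav.scp", "speech", "sound")])
#
# Note that when --ngpu > 0 the job id is parsed from the numeric suffix of
# --output_dir (e.g. "decode.1"), so that suffix must be present on GPU runs.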

View File

@@ -0,0 +1,283 @@
#!/usr/bin/env python3
import argparse
from collections import Counter
import logging
from pathlib import Path
import sys
from typing import List
from typing import Optional
from typeguard import check_argument_types
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.text.build_tokenizer import build_tokenizer
from funasr_local.text.cleaner import TextCleaner
from funasr_local.text.phoneme_tokenizer import g2p_choices
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str_or_none
def field2slice(field: Optional[str]) -> slice:
"""Convert field string to slice
Note that field string accepts 1-based integer.
Examples:
>>> field2slice("1-")
slice(0, None, None)
>>> field2slice("1-3")
slice(0, 3, None)
>>> field2slice("-3")
slice(None, 3, None)
"""
field = field.strip()
try:
if "-" in field:
# e.g. "2-" or "2-5" or "-7"
s1, s2 = field.split("-", maxsplit=1)
if s1.strip() == "":
s1 = None
else:
s1 = int(s1)
if s1 == 0:
raise ValueError("1-based string")
if s2.strip() == "":
s2 = None
else:
s2 = int(s2)
else:
# e.g. "2"
s1 = int(field)
s2 = s1 + 1
if s1 == 0:
raise ValueError("must be 1 or more value")
except ValueError:
raise RuntimeError(f"Format error: e.g. '2-', '2-5', or '-5': {field}")
if s1 is None:
slic = slice(None, s2)
else:
# -1 because of 1-based integer following "cut" command
# e.g "1-3" -> slice(0, 3)
slic = slice(s1 - 1, s2)
return slic
def tokenize(
input: str,
output: str,
field: Optional[str],
delimiter: Optional[str],
token_type: str,
space_symbol: str,
non_linguistic_symbols: Optional[str],
bpemodel: Optional[str],
log_level: str,
write_vocabulary: bool,
vocabulary_size: int,
remove_non_linguistic_symbols: bool,
cutoff: int,
add_symbol: List[str],
cleaner: Optional[str],
g2p: Optional[str],
):
assert check_argument_types()
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if input == "-":
fin = sys.stdin
else:
fin = Path(input).open("r", encoding="utf-8")
if output == "-":
fout = sys.stdout
else:
p = Path(output)
p.parent.mkdir(parents=True, exist_ok=True)
fout = p.open("w", encoding="utf-8")
cleaner = TextCleaner(cleaner)
tokenizer = build_tokenizer(
token_type=token_type,
bpemodel=bpemodel,
delimiter=delimiter,
space_symbol=space_symbol,
non_linguistic_symbols=non_linguistic_symbols,
remove_non_linguistic_symbols=remove_non_linguistic_symbols,
g2p_type=g2p,
)
counter = Counter()
if field is not None:
field = field2slice(field)
for line in fin:
line = line.rstrip()
if field is not None:
# e.g. field="2-"
# uttidA hello world!! -> hello world!!
tokens = line.split(delimiter)
tokens = tokens[field]
if delimiter is None:
line = " ".join(tokens)
else:
line = delimiter.join(tokens)
line = cleaner(line)
tokens = tokenizer.text2tokens(line)
if not write_vocabulary:
fout.write(" ".join(tokens) + "\n")
else:
for t in tokens:
counter[t] += 1
if not write_vocabulary:
return
    # Remove symbols given via --add_symbol from the counter so they are not
    # duplicated when the vocabulary is written out below.
for symbol_and_id in add_symbol:
# e.g symbol="<blank>:0"
try:
symbol, idx = symbol_and_id.split(":")
except ValueError:
raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
symbol = symbol.strip()
if symbol in counter:
del counter[symbol]
# ======= write_vocabulary mode from here =======
# Sort by the number of occurrences in descending order
# and filter lower frequency words than cutoff value
words_and_counts = list(
filter(lambda x: x[1] > cutoff, sorted(counter.items(), key=lambda x: -x[1]))
)
# Restrict the vocabulary size
if vocabulary_size > 0:
if vocabulary_size < len(add_symbol):
raise RuntimeError(f"vocabulary_size is too small: {vocabulary_size}")
words_and_counts = words_and_counts[: vocabulary_size - len(add_symbol)]
# Parse the values of --add_symbol
for symbol_and_id in add_symbol:
# e.g symbol="<blank>:0"
try:
symbol, idx = symbol_and_id.split(":")
idx = int(idx)
except ValueError:
raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
symbol = symbol.strip()
# e.g. idx=0 -> append as the first symbol
# e.g. idx=-1 -> append as the last symbol
if idx < 0:
idx = len(words_and_counts) + 1 + idx
words_and_counts.insert(idx, (symbol, None))
# Write words
for w, c in words_and_counts:
fout.write(w + "\n")
# Logging
total_count = sum(counter.values())
invocab_count = sum(c for w, c in words_and_counts if c is not None)
logging.info(f"OOV rate = {(total_count - invocab_count) / total_count * 100} %")
def get_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Tokenize texts",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument(
"--input", "-i", required=True, help="Input text. - indicates sys.stdin"
)
parser.add_argument(
"--output", "-o", required=True, help="Output text. - indicates sys.stdout"
)
parser.add_argument(
"--field",
"-f",
help="The target columns of the input text as 1-based integer. e.g 2-",
)
parser.add_argument(
"--token_type",
"-t",
default="char",
choices=["char", "bpe", "word", "phn"],
help="Token type",
)
parser.add_argument("--delimiter", "-d", default=None, help="The delimiter")
parser.add_argument("--space_symbol", default="<space>", help="The space symbol")
parser.add_argument("--bpemodel", default=None, help="The bpemodel file path")
parser.add_argument(
"--non_linguistic_symbols",
type=str_or_none,
help="non_linguistic_symbols file path",
)
parser.add_argument(
"--remove_non_linguistic_symbols",
type=str2bool,
default=False,
help="Remove non-language-symbols from tokens",
)
parser.add_argument(
"--cleaner",
type=str_or_none,
choices=[None, "tacotron", "jaconv", "vietnamese", "korean_cleaner"],
default=None,
help="Apply text cleaning",
)
parser.add_argument(
"--g2p",
type=str_or_none,
choices=g2p_choices,
default=None,
help="Specify g2p method if --token_type=phn",
)
group = parser.add_argument_group("write_vocabulary mode related")
group.add_argument(
"--write_vocabulary",
type=str2bool,
default=False,
help="Write tokens list instead of tokenized text per line",
)
group.add_argument("--vocabulary_size", type=int, default=0, help="Vocabulary size")
group.add_argument(
"--cutoff",
default=0,
type=int,
help="cut-off frequency used for write-vocabulary mode",
)
group.add_argument(
"--add_symbol",
type=str,
default=[],
action="append",
help="Append symbol e.g. --add_symbol '<blank>:0' --add_symbol '<unk>:1'",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
tokenize(**kwargs)
if __name__ == "__main__":
main()
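# Usage sketch (illustrative, not part of the original file): building a
# character vocabulary from the second column of a Kaldi-style text file,
# assuming this script is saved as tokenize_text.py; file names below are
# hypothetical placeholders.
#
#   python tokenize_text.py -i data/train/text -o data/vocab/char_list \
#       --field 2- --token_type char \
#       --write_vocabulary true --vocabulary_size 5000 \
#       --add_symbol "<blank>:0" --add_symbol "<unk>:1" --add_symbol "<sos/eos>:-1"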

View File

@@ -0,0 +1,399 @@
import argparse
import logging
import sys
import json
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
import numpy as np
import torch
from typeguard import check_argument_types
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.datasets.preprocessor import LMPreprocessor
from funasr_local.tasks.asr import ASRTaskAligner as ASRTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.models.frontend.wav_frontend import WavFrontend
from funasr_local.text.token_id_converter import TokenIDConverter
from funasr_local.utils.timestamp_tools import ts_prediction_lfr6_standard
header_colors = '\033[95m'
end_colors = '\033[0m'
global_asr_language: str = 'zh-cn'
global_sample_rate: Union[int, Dict[Any, int]] = {
'audio_fs': 16000,
'model_fs': 16000
}
class SpeechText2Timestamp:
def __init__(
self,
timestamp_infer_config: Union[Path, str] = None,
timestamp_model_file: Union[Path, str] = None,
timestamp_cmvn_file: Union[Path, str] = None,
device: str = "cpu",
dtype: str = "float32",
**kwargs,
):
assert check_argument_types()
# 1. Build ASR model
tp_model, tp_train_args = ASRTask.build_model_from_file(
timestamp_infer_config, timestamp_model_file, device=device
)
if 'cuda' in device:
tp_model = tp_model.cuda() # force model to cuda
frontend = None
if tp_train_args.frontend is not None:
frontend = WavFrontend(cmvn_file=timestamp_cmvn_file, **tp_train_args.frontend_conf)
logging.info("tp_model: {}".format(tp_model))
logging.info("tp_train_args: {}".format(tp_train_args))
tp_model.to(dtype=getattr(torch, dtype)).eval()
logging.info(f"Decoding device={device}, dtype={dtype}")
self.tp_model = tp_model
self.tp_train_args = tp_train_args
token_list = self.tp_model.token_list
self.converter = TokenIDConverter(token_list=token_list)
self.device = device
self.dtype = dtype
self.frontend = frontend
self.encoder_downsampling_factor = 1
if tp_train_args.encoder_conf["input_layer"] == "conv2d":
self.encoder_downsampling_factor = 4
@torch.no_grad()
def __call__(
self,
speech: Union[torch.Tensor, np.ndarray],
speech_lengths: Union[torch.Tensor, np.ndarray] = None,
text_lengths: Union[torch.Tensor, np.ndarray] = None
):
assert check_argument_types()
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
self.tp_model.frontend = None
else:
feats = speech
feats_len = speech_lengths
# lfr_factor = max(1, (feats.size()[-1]//80)-1)
batch = {"speech": feats, "speech_lengths": feats_len}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
enc, enc_len = self.tp_model.encode(**batch)
if isinstance(enc, tuple):
enc = enc[0]
# c. Forward Predictor
_, _, us_alphas, us_peaks = self.tp_model.calc_predictor_timestamp(enc, enc_len, text_lengths.to(self.device)+1)
return us_alphas, us_peaks
def inference(
batch_size: int,
ngpu: int,
log_level: Union[int, str],
data_path_and_name_and_type,
timestamp_infer_config: Optional[str],
timestamp_model_file: Optional[str],
timestamp_cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
key_file: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
num_workers: int = 1,
split_with_space: bool = True,
seg_dict_file: Optional[str] = None,
**kwargs,
):
inference_pipeline = inference_modelscope(
batch_size=batch_size,
ngpu=ngpu,
log_level=log_level,
timestamp_infer_config=timestamp_infer_config,
timestamp_model_file=timestamp_model_file,
timestamp_cmvn_file=timestamp_cmvn_file,
key_file=key_file,
allow_variable_data_keys=allow_variable_data_keys,
output_dir=output_dir,
dtype=dtype,
seed=seed,
num_workers=num_workers,
split_with_space=split_with_space,
seg_dict_file=seg_dict_file,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
batch_size: int,
ngpu: int,
log_level: Union[int, str],
# data_path_and_name_and_type,
timestamp_infer_config: Optional[str],
timestamp_model_file: Optional[str],
timestamp_cmvn_file: Optional[str] = None,
# raw_inputs: Union[np.ndarray, torch.Tensor] = None,
key_file: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
num_workers: int = 1,
split_with_space: bool = True,
seg_dict_file: Optional[str] = None,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
    # 2. Build speechtext2timestamp
speechtext2timestamp_kwargs = dict(
timestamp_infer_config=timestamp_infer_config,
timestamp_model_file=timestamp_model_file,
timestamp_cmvn_file=timestamp_cmvn_file,
device=device,
dtype=dtype,
)
logging.info("speechtext2timestamp_kwargs: {}".format(speechtext2timestamp_kwargs))
speechtext2timestamp = SpeechText2Timestamp(**speechtext2timestamp_kwargs)
preprocessor = LMPreprocessor(
train=False,
token_type=speechtext2timestamp.tp_train_args.token_type,
token_list=speechtext2timestamp.tp_train_args.token_list,
bpemodel=None,
text_cleaner=None,
g2p_type=None,
text_name="text",
non_linguistic_symbols=speechtext2timestamp.tp_train_args.non_linguistic_symbols,
split_with_space=split_with_space,
seg_dict_file=seg_dict_file,
)
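    # The preprocessor tokenizes the reference text with the model's token list,
    # so 'text' reaches the loader as token ids that the converter can map back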
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
**kwargs
):
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
writer = None
if output_path is not None:
writer = DatadirWriter(output_path)
            tp_writer = writer["timestamp_prediction"]
else:
tp_writer = None
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = ASRTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=preprocessor,
collate_fn=ASRTask.build_collate_fn(speechtext2timestamp.tp_train_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
tp_result_list = []
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
logging.info("timestamp predicting, utt_id: {}".format(keys))
            _batch = {'speech': batch['speech'],
                      'speech_lengths': batch['speech_lengths'],
                      'text_lengths': batch['text_lengths']}
us_alphas, us_cif_peak = speechtext2timestamp(**_batch)
for batch_id in range(_bs):
key = keys[batch_id]
token = speechtext2timestamp.converter.ids2tokens(batch['text'][batch_id])
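                # Map upsampled CIF peaks to per-token start/end times; "lfr6" refers
                # to the 60 ms low-frame-rate features, and force_time_shift is
                # presumably a fixed offset compensating the encoder lookahead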
ts_str, ts_list = ts_prediction_lfr6_standard(us_alphas[batch_id], us_cif_peak[batch_id], token, force_time_shift=-3.0)
                logging.info(ts_str)
                item = {'key': key, 'value': ts_str, 'timestamp': ts_list}
                if tp_writer is not None:
                    tp_writer["tp_sync"][key + '#'] = ts_str
                    tp_writer["tp_time"][key + '#'] = str(ts_list)
tp_result_list.append(item)
return tp_result_list
return _forward
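# Pipeline usage sketch (the scp/text paths and data types below are illustrative
# assumptions, not fixed by this file):
#   pipeline = inference_modelscope(
#       batch_size=1, ngpu=0, log_level="INFO",
#       timestamp_infer_config="config.yaml",
#       timestamp_model_file="model.pb",
#       timestamp_cmvn_file="am.mvn",
#   )
#   results = pipeline([("data/wav.scp", "speech", "sound"),
#                       ("data/text", "text", "text")])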
def get_parser():
parser = config_argparse.ArgumentParser(
description="Timestamp Prediction Inference",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=0,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
    group.add_argument(
        "--timestamp_infer_config",
        type=str,
        help="Timestamp prediction infer configuration",
    )
    group.add_argument(
        "--timestamp_model_file",
        type=str,
        help="Timestamp prediction model parameter file",
    )
    group.add_argument(
        "--timestamp_cmvn_file",
        type=str,
        help="Global CMVN file",
    )
group = parser.add_argument_group("infer related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
    group.add_argument(
        "--seg_dict_file",
        type=str,
        default=None,
        help="The segmentation dictionary file used for text tokenization",
    )
    group.add_argument(
        "--split_with_space",
        type=str2bool,
        default=False,
        help="Whether to split the input text with spaces",
    )
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
import argparse
import logging
import os
import sys
from typing import Union, Dict, Any
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
def get_parser():
parser = config_argparse.ArgumentParser(
description="Timestamp Prediction Inference",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--njob",
type=int,
default=1,
help="The number of jobs for each gpu",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=True,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--timestamp_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--timestamp_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--timestamp_cmvn_file",
type=str,
help="Global CMVN file",
)
group = parser.add_argument_group("The inference configuration related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
return parser
def inference_launch(mode, **kwargs):
if mode == "tp_norm":
from funasr_local.bin.tp_inference import inference_modelscope
return inference_modelscope(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
parser.add_argument(
"--mode",
type=str,
default="tp_norm",
help="The decoding mode",
)
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
# set logging messages
logging.basicConfig(
level=args.log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("Decoding args: {}".format(kwargs))
# gpu setting
if args.ngpu > 0:
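        # The trailing suffix of output_dir encodes the job id (e.g. "logdir/inference.3"),
        # which is mapped onto gpuid_list so that njob jobs share each visible GPU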
jobid = int(args.output_dir.split(".")[-1])
gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
inference_launch(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,575 @@
import argparse
import logging
import os
import sys
import json
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
import math
import numpy as np
import torch
from typeguard import check_argument_types
from typeguard import check_return_type
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.modules.scorers.scorer_interface import BatchScorerInterface
from funasr_local.modules.subsampling import TooShortUttError
from funasr_local.tasks.vad import VADTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.utils import asr_utils, wav_utils, postprocess_utils
from funasr_local.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
header_colors = '\033[95m'
end_colors = '\033[0m'
global_asr_language: str = 'zh-cn'
global_sample_rate: Union[int, Dict[Any, int]] = {
'audio_fs': 16000,
'model_fs': 16000
}
class Speech2VadSegment:
"""Speech2VadSegment class
Examples:
>>> import soundfile
>>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2segment(audio)
[[10, 230], [245, 450], ...]
"""
def __init__(
self,
vad_infer_config: Union[Path, str] = None,
vad_model_file: Union[Path, str] = None,
vad_cmvn_file: Union[Path, str] = None,
device: str = "cpu",
batch_size: int = 1,
dtype: str = "float32",
**kwargs,
):
assert check_argument_types()
# 1. Build vad model
vad_model, vad_infer_args = VADTask.build_model_from_file(
vad_infer_config, vad_model_file, device
)
frontend = None
if vad_infer_args.frontend is not None:
frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf)
logging.info("vad_model: {}".format(vad_model))
logging.info("vad_infer_args: {}".format(vad_infer_args))
vad_model.to(dtype=getattr(torch, dtype)).eval()
self.vad_model = vad_model
self.vad_infer_args = vad_infer_args
self.device = device
self.dtype = dtype
self.frontend = frontend
self.batch_size = batch_size
@torch.no_grad()
def __call__(
        self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
        in_cache: Optional[Dict[str, torch.Tensor]] = None
    ) -> Tuple[torch.Tensor, List[List[int]]]:
"""Inference
Args:
speech: Input speech data
Returns:
            fbanks, segments
"""
        assert check_argument_types()
        if in_cache is None:
            # avoid the shared-mutable-default pitfall of `in_cache=dict()`
            in_cache = {}
        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)
if self.frontend is not None:
self.frontend.filter_length_max = math.inf
fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths)
feats, feats_len = self.frontend.forward_lfr_cmvn(fbanks, fbanks_len)
fbanks = to_device(fbanks, device=self.device)
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
else:
raise Exception("Need to extract feats first, please configure frontend configuration")
# b. Forward Encoder streaming
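        # Decode in chunks of at most 6000 frames (60 s at a 10 ms frame shift);
        # the matching waveform slice uses the 160-sample shift and 400-sample
        # window of 16 kHz fbank extraction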
t_offset = 0
step = min(feats_len.max(), 6000)
        segments = [[] for _ in range(self.batch_size)]  # [[]] * n would alias a single list
for t_offset in range(0, feats_len, min(step, feats_len - t_offset)):
if t_offset + step >= feats_len - 1:
step = feats_len - t_offset
is_final = True
else:
is_final = False
batch = {
"feats": feats[:, t_offset:t_offset + step, :],
"waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)],
"is_final": is_final,
"in_cache": in_cache
}
# a. To device
#batch = to_device(batch, device=self.device)
segments_part, in_cache = self.vad_model(**batch)
if segments_part:
for batch_num in range(0, self.batch_size):
segments[batch_num] += segments_part[batch_num]
return fbanks, segments
class Speech2VadSegmentOnline(Speech2VadSegment):
"""Speech2VadSegmentOnline class
Examples:
>>> import soundfile
>>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2segment(audio)
[[10, 230], [245, 450], ...]
"""
def __init__(self, **kwargs):
super(Speech2VadSegmentOnline, self).__init__(**kwargs)
vad_cmvn_file = kwargs.get('vad_cmvn_file', None)
self.frontend = None
if self.vad_infer_args.frontend is not None:
self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf)
@torch.no_grad()
def __call__(
        self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
        in_cache: Optional[Dict[str, torch.Tensor]] = None, is_final: bool = False, max_end_sil: int = 800
    ) -> Tuple[torch.Tensor, List[List[int]], Dict[str, torch.Tensor]]:
"""Inference
Args:
speech: Input speech data
Returns:
            fbanks, segments, in_cache
"""
        assert check_argument_types()
        if in_cache is None:
            # avoid the shared-mutable-default pitfall of `in_cache=dict()`
            in_cache = {}
        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)
        batch_size = speech.shape[0]
        segments = [[] for _ in range(batch_size)]  # [[]] * n would alias a single list
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final)
fbanks, _ = self.frontend.get_fbank()
else:
raise Exception("Need to extract feats first, please configure frontend configuration")
if feats.shape[0]:
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
waveforms = self.frontend.get_waveforms()
batch = {
"feats": feats,
"waveform": waveforms,
"in_cache": in_cache,
"is_final": is_final,
"max_end_sil": max_end_sil
}
# a. To device
batch = to_device(batch, device=self.device)
segments, in_cache = self.vad_model.forward_online(**batch)
# in_cache.update(batch['in_cache'])
# in_cache = {key: value for key, value in batch['in_cache'].items()}
return fbanks, segments, in_cache
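# Streaming usage sketch (assumes the caller chunks a 16 kHz waveform; file
# names are placeholders):
#   seg = Speech2VadSegmentOnline(vad_infer_config="vad.yaml",
#                                 vad_model_file="vad.pb",
#                                 vad_cmvn_file="vad.mvn")
#   cache = {}
#   for i, (chunk, chunk_len) in enumerate(chunks):
#       fbanks, segments, cache = seg(chunk, chunk_len, in_cache=cache,
#                                     is_final=(i == len(chunks) - 1))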
def inference(
batch_size: int,
ngpu: int,
log_level: Union[int, str],
data_path_and_name_and_type,
vad_infer_config: Optional[str],
vad_model_file: Optional[str],
vad_cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
key_file: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
num_workers: int = 1,
online: bool = False,
**kwargs,
):
if not online:
inference_pipeline = inference_modelscope(
batch_size=batch_size,
ngpu=ngpu,
log_level=log_level,
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
key_file=key_file,
allow_variable_data_keys=allow_variable_data_keys,
output_dir=output_dir,
dtype=dtype,
seed=seed,
num_workers=num_workers,
**kwargs,
)
else:
inference_pipeline = inference_modelscope_online(
batch_size=batch_size,
ngpu=ngpu,
log_level=log_level,
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
key_file=key_file,
allow_variable_data_keys=allow_variable_data_keys,
output_dir=output_dir,
dtype=dtype,
seed=seed,
num_workers=num_workers,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
batch_size: int,
ngpu: int,
log_level: Union[int, str],
# data_path_and_name_and_type,
vad_infer_config: Optional[str],
vad_model_file: Optional[str],
vad_cmvn_file: Optional[str] = None,
# raw_inputs: Union[np.ndarray, torch.Tensor] = None,
key_file: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
num_workers: int = 1,
**kwargs,
):
assert check_argument_types()
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2vadsegment
speech2vadsegment_kwargs = dict(
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
device=device,
dtype=dtype,
)
logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None
):
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = VADTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
        # 7. Start the decoding loop
        # FIXME(kamo): The output format should be discussed
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
writer = DatadirWriter(output_path)
            ibest_writer = writer["1best_recog"]
else:
writer = None
ibest_writer = None
vad_results = []
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
# do vad segment
_, results = speech2vadsegment(**batch)
for i, _ in enumerate(keys):
if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
results[i] = json.dumps(results[i])
item = {'key': keys[i], 'value': results[i]}
vad_results.append(item)
                if writer is not None:
                    # json.loads is only needed when results were serialized above
                    if isinstance(results[i], str):
                        results[i] = json.loads(results[i])
                    ibest_writer["text"][keys[i]] = "{}".format(results[i])
return vad_results
return _forward
def inference_modelscope_online(
batch_size: int,
ngpu: int,
log_level: Union[int, str],
# data_path_and_name_and_type,
vad_infer_config: Optional[str],
vad_model_file: Optional[str],
vad_cmvn_file: Optional[str] = None,
# raw_inputs: Union[np.ndarray, torch.Tensor] = None,
key_file: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
num_workers: int = 1,
**kwargs,
):
assert check_argument_types()
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2vadsegment
speech2vadsegment_kwargs = dict(
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
device=device,
dtype=dtype,
)
logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs)
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
):
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = VADTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
        # 7. Start the decoding loop
        # FIXME(kamo): The output format should be discussed
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
writer = DatadirWriter(output_path)
            ibest_writer = writer["1best_recog"]
else:
writer = None
ibest_writer = None
vad_results = []
        batch_in_cache = param_dict.get('in_cache', dict()) if param_dict is not None else dict()
is_final = param_dict.get('is_final', False) if param_dict is not None else False
max_end_sil = param_dict.get('max_end_sil', 800) if param_dict is not None else 800
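        # param_dict carries streaming state across calls: 'in_cache' holds the
        # model cache, 'is_final' flushes it, and 'max_end_sil' is the trailing
        # silence threshold (presumably in milliseconds) that closes a segment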
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
batch['in_cache'] = batch_in_cache
batch['is_final'] = is_final
batch['max_end_sil'] = max_end_sil
# do vad segment
            _, results, batch_in_cache = speech2vadsegment(**batch)
            if param_dict is not None:
                param_dict['in_cache'] = batch_in_cache
if results:
for i, _ in enumerate(keys):
if results[i]:
if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
results[i] = json.dumps(results[i])
item = {'key': keys[i], 'value': results[i]}
vad_results.append(item)
                        if writer is not None:
                            # json.loads is only needed when results were serialized above
                            if isinstance(results[i], str):
                                results[i] = json.loads(results[i])
                            ibest_writer["text"][keys[i]] = "{}".format(results[i])
return vad_results
return _forward
def get_parser():
parser = config_argparse.ArgumentParser(
description="VAD Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--vad_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--vad_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--vad_cmvn_file",
type=str,
help="Global cmvn file",
)
    group.add_argument(
        "--online",
        type=str2bool,
        default=False,
        help="Online (streaming) decoding mode",
    )
group = parser.add_argument_group("infer related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import torch
torch.set_num_threads(1)
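# Pin intra-op parallelism to one thread per process; the launch scripts run
# many decoding jobs in parallel, so per-job threading would oversubscribe the CPU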
import argparse
import logging
import os
import sys
from typing import Union, Dict, Any
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
def get_parser():
parser = config_argparse.ArgumentParser(
description="VAD Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--njob",
type=int,
default=1,
help="The number of jobs for each gpu",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=True,
action="append",
)
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--vad_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--vad_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--vad_cmvn_file",
type=str,
help="Global CMVN file",
)
group.add_argument(
"--vad_train_config",
type=str,
help="VAD training configuration",
)
group = parser.add_argument_group("The inference configuration related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
return parser
def inference_launch(mode, **kwargs):
if mode == "offline":
from funasr_local.bin.vad_inference import inference_modelscope
return inference_modelscope(**kwargs)
elif mode == "online":
from funasr_local.bin.vad_inference import inference_modelscope_online
return inference_modelscope_online(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
parser.add_argument(
"--mode",
type=str,
default="vad",
help="The decoding mode",
)
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
# set logging messages
logging.basicConfig(
level=args.log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("Decoding args: {}".format(kwargs))
# gpu setting
if args.ngpu > 0:
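        # As in the other launchers, the job id parsed from the output_dir suffix
        # selects one GPU from gpuid_list, with njob jobs per GPU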
jobid = int(args.output_dir.split(".")[-1])
gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
inference_launch(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,347 @@
import argparse
import logging
import os
import sys
import json
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
import numpy as np
import torch
from typeguard import check_argument_types
from typeguard import check_return_type
from funasr_local.fileio.datadir_writer import DatadirWriter
from funasr_local.tasks.vad import VADTask
from funasr_local.torch_utils.device_funcs import to_device
from funasr_local.torch_utils.set_all_random_seed import set_all_random_seed
from funasr_local.utils import config_argparse
from funasr_local.utils.cli_utils import get_commandline_args
from funasr_local.utils.types import str2bool
from funasr_local.utils.types import str2triple_str
from funasr_local.utils.types import str_or_none
from funasr_local.models.frontend.wav_frontend import WavFrontendOnline
from funasr_local.models.frontend.wav_frontend import WavFrontend
from funasr_local.bin.vad_inference import Speech2VadSegment
header_colors = '\033[95m'
end_colors = '\033[0m'
class Speech2VadSegmentOnline(Speech2VadSegment):
"""Speech2VadSegmentOnline class
Examples:
>>> import soundfile
>>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2segment(audio)
[[10, 230], [245, 450], ...]
"""
def __init__(self, **kwargs):
super(Speech2VadSegmentOnline, self).__init__(**kwargs)
vad_cmvn_file = kwargs.get('vad_cmvn_file', None)
self.frontend = None
if self.vad_infer_args.frontend is not None:
self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf)
@torch.no_grad()
def __call__(
        self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
        in_cache: Optional[Dict[str, torch.Tensor]] = None, is_final: bool = False, max_end_sil: int = 800
    ) -> Tuple[torch.Tensor, List[List[int]], Dict[str, torch.Tensor]]:
"""Inference
Args:
speech: Input speech data
Returns:
            fbanks, segments, in_cache
"""
        assert check_argument_types()
        if in_cache is None:
            # avoid the shared-mutable-default pitfall of `in_cache=dict()`
            in_cache = {}
        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)
        batch_size = speech.shape[0]
        segments = [[] for _ in range(batch_size)]  # [[]] * n would alias a single list
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final)
fbanks, _ = self.frontend.get_fbank()
else:
raise Exception("Need to extract feats first, please configure frontend configuration")
if feats.shape[0]:
feats = to_device(feats, device=self.device)
feats_len = feats_len.int()
waveforms = self.frontend.get_waveforms()
batch = {
"feats": feats,
"waveform": waveforms,
"in_cache": in_cache,
"is_final": is_final,
"max_end_sil": max_end_sil
}
# a. To device
batch = to_device(batch, device=self.device)
segments, in_cache = self.vad_model.forward_online(**batch)
# in_cache.update(batch['in_cache'])
# in_cache = {key: value for key, value in batch['in_cache'].items()}
return fbanks, segments, in_cache
def inference(
batch_size: int,
ngpu: int,
log_level: Union[int, str],
data_path_and_name_and_type,
vad_infer_config: Optional[str],
vad_model_file: Optional[str],
vad_cmvn_file: Optional[str] = None,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
key_file: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
num_workers: int = 1,
**kwargs,
):
inference_pipeline = inference_modelscope(
batch_size=batch_size,
ngpu=ngpu,
log_level=log_level,
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
key_file=key_file,
allow_variable_data_keys=allow_variable_data_keys,
output_dir=output_dir,
dtype=dtype,
seed=seed,
num_workers=num_workers,
**kwargs,
)
return inference_pipeline(data_path_and_name_and_type, raw_inputs)
def inference_modelscope(
batch_size: int,
ngpu: int,
log_level: Union[int, str],
# data_path_and_name_and_type,
vad_infer_config: Optional[str],
vad_model_file: Optional[str],
vad_cmvn_file: Optional[str] = None,
# raw_inputs: Union[np.ndarray, torch.Tensor] = None,
key_file: Optional[str] = None,
allow_variable_data_keys: bool = False,
output_dir: Optional[str] = None,
dtype: str = "float32",
seed: int = 0,
num_workers: int = 1,
**kwargs,
):
assert check_argument_types()
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
raise NotImplementedError("only single GPU decoding is supported")
logging.basicConfig(
level=log_level,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
# 1. Set random-seed
set_all_random_seed(seed)
# 2. Build speech2vadsegment
speech2vadsegment_kwargs = dict(
vad_infer_config=vad_infer_config,
vad_model_file=vad_model_file,
vad_cmvn_file=vad_cmvn_file,
device=device,
dtype=dtype,
)
logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs)
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
fs: dict = None,
param_dict: dict = None,
):
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
loader = VADTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
batch_size=batch_size,
key_file=key_file,
num_workers=num_workers,
preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
allow_variable_data_keys=allow_variable_data_keys,
inference=True,
)
        # 7. Start the decoding loop
        # FIXME(kamo): The output format should be discussed
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
if output_path is not None:
writer = DatadirWriter(output_path)
            ibest_writer = writer["1best_recog"]
else:
writer = None
ibest_writer = None
vad_results = []
        batch_in_cache = param_dict.get('in_cache', dict()) if param_dict is not None else dict()
is_final = param_dict.get('is_final', False) if param_dict is not None else False
max_end_sil = param_dict.get('max_end_sil', 800) if param_dict is not None else 800
for keys, batch in loader:
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
assert len(keys) == _bs, f"{len(keys)} != {_bs}"
batch['in_cache'] = batch_in_cache
batch['is_final'] = is_final
batch['max_end_sil'] = max_end_sil
# do vad segment
            _, results, batch_in_cache = speech2vadsegment(**batch)
            if param_dict is not None:
                param_dict['in_cache'] = batch_in_cache
if results:
for i, _ in enumerate(keys):
if results[i]:
if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
results[i] = json.dumps(results[i])
item = {'key': keys[i], 'value': results[i]}
vad_results.append(item)
                        if writer is not None:
                            # json.loads is only needed when results were serialized above
                            if isinstance(results[i], str):
                                results[i] = json.loads(results[i])
                            ibest_writer["text"][keys[i]] = "{}".format(results[i])
return vad_results
return _forward
def get_parser():
parser = config_argparse.ArgumentParser(
description="VAD Decoding",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Note(kamo): Use '_' instead of '-' as separator.
# '-' is confusing if written in yaml.
parser.add_argument(
"--log_level",
type=lambda x: x.upper(),
default="INFO",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
help="The verbose level of logging",
)
parser.add_argument("--output_dir", type=str, required=False)
parser.add_argument(
"--ngpu",
type=int,
default=0,
help="The number of gpus. 0 indicates CPU mode",
)
parser.add_argument(
"--gpuid_list",
type=str,
default="",
help="The visible gpus",
)
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument(
"--dtype",
default="float32",
choices=["float16", "float32", "float64"],
help="Data type",
)
parser.add_argument(
"--num_workers",
type=int,
default=1,
help="The number of workers used for DataLoader",
)
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
type=str2triple_str,
required=False,
action="append",
)
group.add_argument("--raw_inputs", type=list, default=None)
# example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
group.add_argument("--key_file", type=str_or_none)
group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
group = parser.add_argument_group("The model configuration related")
group.add_argument(
"--vad_infer_config",
type=str,
help="VAD infer configuration",
)
group.add_argument(
"--vad_model_file",
type=str,
help="VAD model parameter file",
)
group.add_argument(
"--vad_cmvn_file",
type=str,
help="Global cmvn file",
)
group = parser.add_argument_group("infer related")
group.add_argument(
"--batch_size",
type=int,
default=1,
help="The batch size for inference",
)
return parser
def main(cmd=None):
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
kwargs = vars(args)
kwargs.pop("config", None)
inference(**kwargs)
if __name__ == "__main__":
main()