mirror of
https://github.com/HumanAIGC/lite-avatar.git
synced 2026-02-05 18:09:20 +08:00
add files
This commit is contained in:
63
funasr_local/text/build_tokenizer.py
Normal file
63
funasr_local/text/build_tokenizer.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from typing import Union
|
||||
|
||||
from typeguard import check_argument_types
|
||||
|
||||
from funasr_local.text.abs_tokenizer import AbsTokenizer
|
||||
from funasr_local.text.char_tokenizer import CharTokenizer
|
||||
from funasr_local.text.phoneme_tokenizer import PhonemeTokenizer
|
||||
from funasr_local.text.sentencepiece_tokenizer import SentencepiecesTokenizer
|
||||
from funasr_local.text.word_tokenizer import WordTokenizer
|
||||
|
||||
|
||||
def build_tokenizer(
    token_type: str,
    bpemodel: Union[Path, str, Iterable[str], None] = None,
    non_linguistic_symbols: Union[Path, str, Iterable[str], None] = None,
    remove_non_linguistic_symbols: bool = False,
    space_symbol: str = "<space>",
    delimiter: Union[str, None] = None,
    g2p_type: Union[str, None] = None,
) -> AbsTokenizer:
    """Instantiate the tokenizer that corresponds to ``token_type``.

    Args:
        token_type: Selects the tokenizer class. One of ``"bpe"``,
            ``"word"``, ``"char"`` or ``"phn"``.
        bpemodel: Passed to ``SentencepiecesTokenizer``; required when
            ``token_type`` is ``"bpe"`` and unused otherwise.
        non_linguistic_symbols: Forwarded to the word, char and phoneme
            tokenizers. Not used for ``"bpe"``.
        remove_non_linguistic_symbols: Forwarded to the char and phoneme
            tokenizers. For ``"word"`` it only takes effect when
            ``non_linguistic_symbols`` is also given. Not supported for
            ``"bpe"``.
        space_symbol: Forwarded to the char and phoneme tokenizers.
        delimiter: Forwarded to the word tokenizer.
        g2p_type: Forwarded to the phoneme tokenizer.

    Returns:
        A ``SentencepiecesTokenizer``, ``WordTokenizer``, ``CharTokenizer``
        or ``PhonemeTokenizer`` instance, depending on ``token_type``.

    Raises:
        ValueError: If ``token_type`` is ``"bpe"`` and ``bpemodel`` is None,
            or if ``token_type`` is not one of the supported values.
        RuntimeError: If ``token_type`` is ``"bpe"`` and
            ``remove_non_linguistic_symbols`` is true.
    """
    # Runtime type check of the arguments via typeguard. Because it is wrapped
    # in ``assert`` it is skipped entirely when Python runs with -O.
    # NOTE(review): check_argument_types looks like a typeguard 2.x API --
    # confirm the pinned typeguard version still provides it.
    assert check_argument_types()

    if token_type == "bpe":
        if bpemodel is None:
            raise ValueError('bpemodel is required if token_type = "bpe"')
        if remove_non_linguistic_symbols:
            raise RuntimeError(
                "remove_non_linguistic_symbols is not implemented for token_type=bpe"
            )
        return SentencepiecesTokenizer(bpemodel)

    if token_type == "word":
        # Symbol removal is only configured when a symbol list was supplied;
        # otherwise the flag is ignored and a plain WordTokenizer is returned.
        if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
            return WordTokenizer(
                delimiter=delimiter,
                non_linguistic_symbols=non_linguistic_symbols,
                remove_non_linguistic_symbols=True,
            )
        return WordTokenizer(delimiter=delimiter)

    if token_type == "char":
        return CharTokenizer(
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    if token_type == "phn":
        return PhonemeTokenizer(
            g2p_type=g2p_type,
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    # The message names the real parameter (``token_type``) so that the error
    # points the caller at the argument they actually passed.
    raise ValueError(
        f"token_type must be one of bpe, word, char or phn: {token_type}"
    )
|
||||
Reference in New Issue
Block a user