mirror of
https://github.com/HumanAIGC/lite-avatar.git
synced 2026-02-05 18:09:20 +08:00
add files
This commit is contained in:
38
funasr_local/text/sentencepiece_tokenizer.py
Normal file
38
funasr_local/text/sentencepiece_tokenizer.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from typing import List
|
||||
from typing import Union
|
||||
|
||||
import sentencepiece as spm
|
||||
from typeguard import check_argument_types
|
||||
|
||||
from funasr_local.text.abs_tokenizer import AbsTokenizer
|
||||
|
||||
|
||||
class SentencepiecesTokenizer(AbsTokenizer):
    """Tokenizer that splits and joins text using a SentencePiece model file.

    The ``SentencePieceProcessor`` is not created in ``__init__``; it is built
    on the first call to ``text2tokens`` or ``tokens2text`` and then reused.
    """

    def __init__(self, model: Union[Path, str]):
        """Remember the model path; the processor itself is built lazily.

        Args:
            model: Path to the SentencePiece model file. Stored as ``str``.
        """
        assert check_argument_types()
        self.model = str(model)
        # NOTE(kamo):
        # Don't build SentencePieceProcessor in __init__()
        # because it's not picklable and it may cause following error,
        # "TypeError: can't pickle SwigPyObject objects",
        # when giving it as argument of "multiprocessing.Process()".
        self.sp = None

    def __getstate__(self):
        """Return the instance state for pickling, without the processor.

        Once the processor has been lazily built, the instance would carry the
        object that the note in ``__init__`` describes as not picklable.
        Dropping it here keeps the tokenizer picklable after first use; the
        processor is rebuilt on demand from ``self.model`` after unpickling.
        """
        state = self.__dict__.copy()
        state["sp"] = None
        return state

    def __repr__(self):
        return f'{self.__class__.__name__}(model="{self.model}")'

    def _build_sentence_piece_processor(self):
        """Create and load the SentencePiece processor if not done yet."""
        # Build SentencePieceProcessor lazily.
        if self.sp is None:
            # Load into a local first: if load() raises, self.sp stays None so
            # a later call retries instead of silently using an unloaded
            # processor.
            processor = spm.SentencePieceProcessor()
            processor.load(self.model)
            self.sp = processor

    def text2tokens(self, line: str) -> List[str]:
        """Encode ``line`` into a list of SentencePiece pieces."""
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsPieces(line)

    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Decode an iterable of SentencePiece pieces back into text."""
        self._build_sentence_piece_processor()
        # Materialize the iterable before handing it to DecodePieces.
        return self.sp.DecodePieces(list(tokens))
|
||||
Reference in New Issue
Block a user