This commit is contained in:
lyuxiang.lx
2024-12-12 15:43:17 +08:00
parent 0bf706c26f
commit 2345ce6be2
4 changed files with 19 additions and 9 deletions

View File

@@ -240,6 +240,8 @@ def get_tokenizer(
class QwenTokenizer():
def __init__(self, token_path, skip_special_tokens=True):
super().__init__()
# NOTE: this is a non-chat model, so all of these special tokens remain randomly initialized.
special_tokens = {
'eos_token': '<|endoftext|>',
'pad_token': '<|endoftext|>',
@@ -248,6 +250,9 @@ class QwenTokenizer():
'[breath]', '<strong>', '</strong>', '[noise]',
'[laughter]', '[cough]', '[clucking]', '[accent]',
'[quick_breath]',
"<laughter>", "</laughter>",
"[hissing]", "[sigh]", "[vocalized-noise]",
"[lipsmack]", "[mn]"
]
}
self.tokenizer = AutoTokenizer.from_pretrained(token_path)