This commit is contained in:
lyuxiang.lx
2024-12-12 15:43:17 +08:00
parent 0bf706c26f
commit 2345ce6be2
4 changed files with 19 additions and 9 deletions

View File

@@ -240,6 +240,8 @@ def get_tokenizer(
class QwenTokenizer():
def __init__(self, token_path, skip_special_tokens=True):
super().__init__()
# NOTE: this is a non-chat model, so all of these special tokens remain randomly initialized.
special_tokens = {
'eos_token': '<|endoftext|>',
'pad_token': '<|endoftext|>',
@@ -248,6 +250,9 @@ class QwenTokenizer():
'[breath]', '<strong>', '</strong>', '[noise]',
'[laughter]', '[cough]', '[clucking]', '[accent]',
'[quick_breath]',
"<laughter>", "</laughter>",
"[hissing]", "[sigh]", "[vocalized-noise]",
"[lipsmack]", "[mn]"
]
}
self.tokenizer = AutoTokenizer.from_pretrained(token_path)