Mirror of https://github.com/FunAudioLLM/CosyVoice.git · synced 2026-02-05 18:09:24 +08:00
add huggingface to pretrained
examples/grpo/cosyvoice2/pretrained_to_huggingface.py · 124 lines added · new file
@@ -0,0 +1,124 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
|
||||
Usage: Instruct TTS
|
||||
python3 infer.py \
|
||||
--token2wav-path /workspace/CosyVoice2-0.5B \
|
||||
--prompt-text "吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。" \
|
||||
--prompt-speech-path ./assets/prompt_audio.wav \
|
||||
--model-path ./transformers_cosyvoice2_llm \
|
||||
--input-text "用四川话说<|endofprompt|>扁担长,板凳宽,扁担绑在板凳上。吃葡萄不吐葡萄皮,不吃葡萄倒吐葡萄皮。"
|
||||
"""
import sys
from argparse import ArgumentParser

import torch
from transformers import AutoTokenizer

# Matcha-TTS must be on sys.path before the cosyvoice modules are imported.
sys.path.append("/workspace/CosyVoice/third_party/Matcha-TTS")

from cosyvoice.cli.cosyvoice import CosyVoice2


def get_args():
    parser = ArgumentParser()

    parser.add_argument(
        "--pretrained-cosyvoice2-path",
        type=str,
        default="/workspace/CosyVoice2-0.5B",
        help="Pretrained CosyVoice2 model directory, default to %(default)r",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default="./transformers_cosyvoice2_llm",
        help="The path to save the converted model, default to %(default)r",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = get_args()
    cosy2_model = CosyVoice2(
        args.pretrained_cosyvoice2_path, load_jit=False, load_trt=False, fp16=False
    )

    # Handles to the Qwen2 backbone and the CosyVoice2-specific speech modules.
    llm = cosy2_model.model.llm.llm.model
    speech_embedding = cosy2_model.model.llm.speech_embedding
    llm_decoder = cosy2_model.model.llm.llm_decoder
    llm_embedding = cosy2_model.model.llm.llm_embedding
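
    # Sanity checks on the assumed module shapes: llm_decoder scores the 6561
    # speech tokens plus 3 EOS variants, and llm_embedding holds exactly the
    # <|sos|> and <|task_id|> vectors (assumptions about CosyVoice2 internals,
    # asserted here so a mismatch fails early rather than mid-copy).
    assert llm_decoder.weight.shape[0] == 6561 + 3
    assert llm_embedding.weight.shape[0] == 2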

    tokenizer = AutoTokenizer.from_pretrained(f"{args.pretrained_cosyvoice2_path}/CosyVoice-BlankEN")
    special_tokens = {
        'eos_token': '<|endoftext|>',
        'pad_token': '<|endoftext|>',
        'additional_special_tokens': [
            '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
            '[breath]', '<strong>', '</strong>', '[noise]',
            '[laughter]', '[cough]', '[clucking]', '[accent]',
            '[quick_breath]',
            "<laughter>", "</laughter>",
            "[hissing]", "[sigh]", "[vocalized-noise]",
            "[lipsmack]", "[mn]"
        ]
    }
    tokenizer.add_special_tokens(special_tokens)

    # Append the speech codec tokens and control tokens after the text vocabulary.
    original_tokenizer_vocab_size = len(tokenizer)
    cosyvoice2_token_size = 6561
    new_tokens = [f"<|s_{i}|>" for i in range(cosyvoice2_token_size)] + [
        "<|eos1|>", "<|eos2|>", "<|eos3|>", "<|sos|>", "<|task_id|>"
    ]
    num_added_tokens = tokenizer.add_tokens(new_tokens)
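
    # Resulting vocabulary layout (a sketch of the expected index map, writing
    # V for original_tokenizer_vocab_size):
    #   [V, V + 6561)       <|s_0|> ... <|s_6560|>
    #   V + 6561 + {0,1,2}  <|eos1|>, <|eos2|>, <|eos3|>
    #   V + 6561 + 3        <|sos|>
    #   V + 6561 + 4        <|task_id|>
    assert num_added_tokens == cosyvoice2_token_size + 5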

    llm.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=128)
    vocab_size = llm.get_input_embeddings().weight.shape[0]

    # Build a unified lm_head over the full (padded) vocabulary; the original
    # llm_decoder only covers the speech tokens and the three EOS variants.
    feature_size = speech_embedding.embedding_dim
    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=vocab_size, bias=True)

    with torch.no_grad():
        # Zero-initialize, then copy the speech-token rows from llm_decoder.
        new_lm_head.weight.data.zero_()
        new_lm_head.bias.data.zero_()
        new_lm_head.weight[original_tokenizer_vocab_size:original_tokenizer_vocab_size + cosyvoice2_token_size + 3] = llm_decoder.weight
        new_lm_head.bias[original_tokenizer_vocab_size:original_tokenizer_vocab_size + cosyvoice2_token_size + 3] = llm_decoder.bias

    llm.lm_head = new_lm_head
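
    # Note: rows outside the copied speech range keep zero weight and bias, so
    # the exported model's text-token logits carry no information until later
    # fine-tuning updates them.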

    input_embeddings = llm.get_input_embeddings()

    with torch.no_grad():
        # Copy the speech-token embeddings, then the <|sos|> / <|task_id|>
        # embeddings from CosyVoice2's two-row llm_embedding table.
        input_embeddings.weight[original_tokenizer_vocab_size:original_tokenizer_vocab_size + cosyvoice2_token_size + 3] = speech_embedding.weight
        input_embeddings.weight[original_tokenizer_vocab_size + cosyvoice2_token_size + 3:original_tokenizer_vocab_size + cosyvoice2_token_size + 3 + 2] = llm_embedding.weight

    # All three EOS variants terminate generation.
    eos_token_ids = [
        original_tokenizer_vocab_size + cosyvoice2_token_size,
        original_tokenizer_vocab_size + cosyvoice2_token_size + 1,
        original_tokenizer_vocab_size + cosyvoice2_token_size + 2,
    ]
    llm.generation_config.eos_token_id = eos_token_ids
    llm.generation_config.temperature = 1.0
    llm.generation_config.top_p = 0.8
    llm.generation_config.top_k = 25

    llm.config.eos_token_id = original_tokenizer_vocab_size + cosyvoice2_token_size
    llm.config.vocab_size = vocab_size
    llm.config.tie_word_embeddings = False
    llm.config.use_bias = True
    llm.to(torch.bfloat16)
    llm.save_pretrained(args.save_path)

    # Chat template: a user turn becomes "<|sos|>{text}<|task_id|>"; an
    # assistant turn (speech tokens) is emitted verbatim.
    TEMPLATE = "{%- for message in messages %}{%- if message['role'] == 'user' %}{{- '<|sos|>' + message['content'] + '<|task_id|>' }}{%- elif message['role'] == 'assistant' %}{{- message['content']}}{%- endif %}{%- endfor %}"
    tokenizer.chat_template = TEMPLATE
    tokenizer.save_pretrained(args.save_path)
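
    # Load-back sketch for the exported checkpoint (hypothetical usage, not
    # executed here; turning the generated speech tokens into a waveform still
    # requires the CosyVoice2 token2wav stack, e.g. via infer.py above):
    #
    #   from transformers import AutoModelForCausalLM, AutoTokenizer
    #   tok = AutoTokenizer.from_pretrained(args.save_path)
    #   model = AutoModelForCausalLM.from_pretrained(args.save_path, torch_dtype=torch.bfloat16)
    #   text = tok.apply_chat_template(
    #       [{"role": "user", "content": "你好<|endofprompt|>"}], tokenize=False
    #   )
    #   speech_ids = model.generate(**tok(text, return_tensors="pt"))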