mirror of https://github.com/OpenBMB/MiniCPM-V.git
Update to MiniCPM-V 2.6
@@ -105,7 +105,7 @@ def data_collator(examples, padding_value=0, max_length=2048):
    }


-def conversation_to_ids(conversation, tokenizer, llm_type=None):
+def conversation_to_ids(conversation, tokenizer, llm_type=None, new_schema=False):
    """
    for single image multi-turn conversation
    conversation: [{'role': 'user', 'content': 'Describe this image'},
@@ -115,6 +115,10 @@ def conversation_to_ids(conversation, tokenizer, llm_type=None):
        input_ids, context, raw_msg = conversation_to_ids_llama3(
            conversation, tokenizer
        )
+    elif llm_type == "qwen2":
+        input_ids, context, raw_msg = conversation_to_ids_qwen2(
+            conversation, tokenizer
+        )
    else:
        input_ids, context, raw_msg = conversation_to_ids_minicpm(
            conversation, tokenizer
@@ -125,6 +129,7 @@ def conversation_to_ids(conversation, tokenizer, llm_type=None):

    # build target
    target = torch.full_like(ids, -100, dtype=torch.int32)

    for i in range(1, len(ids)):
        if context[i] == 0:
            target[i - 1] = ids[i]
@@ -133,14 +138,21 @@ def conversation_to_ids(conversation, tokenizer, llm_type=None):
                target[i - 1] = tokenizer.eot_id
            else:
                target[i - 1] = tokenizer.eos_id

    # build image bound
-    image_start_tokens = torch.where(ids == tokenizer.im_start_id)[0]
-    image_start_tokens += 1
-    image_end_tokens = torch.where(ids == tokenizer.im_end_id)[0]
+    if new_schema:
+        start_cond = (ids == tokenizer.im_start_id) | (ids == tokenizer.slice_start_id)
+        end_cond = (ids == tokenizer.im_end_id) | (ids == tokenizer.slice_end_id)
+        image_start_tokens = torch.where(start_cond)[0]
+        image_start_tokens += 1
+        image_end_tokens = torch.where(end_cond)[0]
+    else:
+        image_start_tokens = torch.where(ids == tokenizer.im_start_id)[0]
+        image_start_tokens += 1
+        image_end_tokens = torch.where(ids == tokenizer.im_end_id)[0]
    if len(image_start_tokens) != len(image_end_tokens):
        print("image start token != image end tokens")

    if len(image_start_tokens) > 0:
        image_bound = torch.hstack(
            [image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)]
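A minimal, runnable sketch of what the new-schema branch computes; the special-token ids below are made-up stand-ins for the tokenizer attributes used in the real code:

```python
import torch

# Hypothetical ids standing in for tokenizer.im_start_id, im_end_id,
# slice_start_id and slice_end_id; the real values come from the tokenizer.
IM_START, IM_END = 101, 102
SLICE_START, SLICE_END = 103, 104
UNK = 0

# Toy sequence: one image span followed by one slice span, then two text tokens.
ids = torch.tensor([IM_START, UNK, UNK, IM_END, SLICE_START, UNK, UNK, SLICE_END, 7, 8])

start_cond = (ids == IM_START) | (ids == SLICE_START)
end_cond = (ids == IM_END) | (ids == SLICE_END)

image_start_tokens = torch.where(start_cond)[0] + 1  # first placeholder position inside each span
image_end_tokens = torch.where(end_cond)[0]          # position of the closing token

image_bound = torch.hstack(
    [image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)]
)
print(image_bound)  # rows [[1, 3], [5, 7]]: one (start, end) pair per image/slice span
```

Under the old branch only the im_start/im_end pair is matched, so slice regions would not contribute rows to image_bound.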
@@ -230,6 +242,46 @@ def conversation_to_ids_llama3(conversation, tokenizer):
    return input_ids, context, raw_msg


+def conversation_to_ids_qwen2(conversation, tokenizer):
+    raw_msg = ""
+    chat = []
+    context = []
+    for idx, msg in enumerate(conversation):
+        role = msg["role"]
+        message = msg["content"]
+        assert role in ["user", "assistant"]
+        if role == "user":
+            prefix = "user"
+        else:
+            prefix = "assistant"
+        chat.append({"role":prefix, "content":message})
+        raw_msg += prefix + message
+    assert set([i['role'] for i in chat]) & set(['assistant'])
+
+    ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
+    input_ids = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=False)
+    input_ids = np.array(input_ids)
+
+    start_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_start|>'))[0]
+    assistant_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('assistant'))[0]
+    end_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_end|>'))[0]
+
+    context = np.ones_like(input_ids, dtype=np.int8)
+
+    for assistant_idx in assistant_idxs:
+        if assistant_idx-1 in set(start_idxs):
+            st = assistant_idx + 1
+            for end_idx in end_idxs:
+                if end_idx > st:
+                    context[st: end_idx + 1] = 0
+                    break
+
+    input_ids = np.hstack(input_ids)
+    context = np.hstack(context)
+    return input_ids, context, raw_msg
+
+
def preprocess(
    image,
    conversation,
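The supervision mask built by conversation_to_ids_qwen2 can be illustrated without a real tokenizer by running the same index logic over token strings; this is only a toy stand-in for the ChatML ids produced by apply_chat_template:

```python
import numpy as np

# Token strings instead of ids, purely for illustration.
toks = np.array([
    "<|im_start|>", "user", "Hi", "<|im_end|>",
    "<|im_start|>", "assistant", "Hello", "there", "<|im_end|>",
])

start_idxs = np.where(toks == "<|im_start|>")[0]
assistant_idxs = np.where(toks == "assistant")[0]
end_idxs = np.where(toks == "<|im_end|>")[0]

# context == 1 marks prompt tokens (ignored in the loss);
# context == 0 marks assistant tokens that become training targets.
context = np.ones(len(toks), dtype=np.int8)
for a in assistant_idxs:
    if a - 1 in set(start_idxs):       # only "assistant" directly after <|im_start|>
        st = a + 1
        for e in end_idxs:
            if e > st:
                context[st:e + 1] = 0  # mask the reply plus its closing <|im_end|>
                break

print(context.tolist())  # [1, 1, 1, 1, 1, 1, 0, 0, 0]
```

In conversation_to_ids the positions with context == 0 are the ones copied into the shifted target tensor, so only the assistant replies are supervised.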
@@ -256,8 +308,14 @@ def preprocess(
    default_image_placeholder = (
        tokenizer.im_start + tokenizer.unk_token * query_nums + tokenizer.im_end
    )
+    new_schema = False
+    use_image_id = False
+    if llm_type=='qwen2':
+        new_schema = True
+        use_image_id = True
    if slice_config:
        images = []
+        image_id_cnt = 0
        source_image, patches, best_grid = slice_image(
            image,
            slice_config["max_slice_nums"],
@@ -270,9 +328,11 @@ def preprocess(
        for i in range(len(patches)):
            for j in range(len(patches[0])):
                images.append(patches[i][j])
+        if use_image_id:
+            image_placeholder = f'{tokenizer.im_id_start}{idx}{tokenizer.im_id_end}' + image_placeholder
+            image_id_cnt += 1
        image_placeholder += get_grid_placeholder(
-            tokenizer, best_grid, query_nums)
+            tokenizer, best_grid, query_nums, new_schema = new_schema)
        images = [transform(i) for i in images]
    else:
        images = [transform(image)]
@@ -286,7 +346,7 @@ def preprocess(
            image_placeholder + "\n" + conversation[0]["content"]
        )

-    input_dict = conversation_to_ids(conversation, tokenizer, llm_type)
+    input_dict = conversation_to_ids(conversation, tokenizer, llm_type, new_schema)

    if batch_vision:
        tgt_sizes = []
@@ -424,7 +484,7 @@ def split_to_patches(image, grid):
    return patches


-def get_grid_placeholder(tokenizer, grid, query_num):
+def get_grid_placeholder(tokenizer, grid, query_num, new_schema=False):
    image_placeholder = (
        tokenizer.im_start + tokenizer.unk_token * query_num + tokenizer.im_end
    )
@@ -437,7 +497,10 @@ def get_grid_placeholder(tokenizer, grid, query_num):
        for j in range(cols):
            lines.append(image_placeholder)
        slices.append("".join(lines))
-    slice_placeholder = tokenizer.slice_start + \
+    if new_schema:
+        slice_placeholder = '\n'.join(slices)
+    else:
+        slice_placeholder = tokenizer.slice_start + \
            "\n".join(slices) + tokenizer.slice_end
    return slice_placeholder
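To make the new-schema behaviour of get_grid_placeholder concrete, here is a toy re-implementation with string stand-ins for the tokenizer's special tokens; the literal token strings are assumptions, the real ones are tokenizer attributes:

```python
# Toy version of get_grid_placeholder: grid = (cols, rows), as in the real code.
def toy_grid_placeholder(grid, query_num, new_schema=False):
    im_start, im_end = "<image>", "</image>"
    slice_start, slice_end = "<slice>", "</slice>"
    unk = "<unk>"
    image_placeholder = im_start + unk * query_num + im_end

    cols, rows = grid
    slices = []
    for _ in range(rows):
        lines = [image_placeholder for _ in range(cols)]
        slices.append("".join(lines))

    if new_schema:
        # new schema (qwen2 / MiniCPM-V 2.6): rows joined by newlines,
        # without wrapping the whole grid in slice_start/slice_end here
        return "\n".join(slices)
    # older schema: the whole grid is wrapped in slice_start ... slice_end
    return slice_start + "\n".join(slices) + slice_end


print(toy_grid_placeholder((2, 2), query_num=1, new_schema=False))
# <slice><image><unk></image><image><unk></image>
# <image><unk></image><image><unk></image></slice>
print(toy_grid_placeholder((2, 2), query_num=1, new_schema=True))
# <image><unk></image><image><unk></image>
# <image><unk></image><image><unk></image>
```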
@@ -455,4 +518,4 @@ def reshape_by_patch(image_tensor, patch_size):
    patches = patches.reshape(image_tensor.size(0), patch_size, patch_size, -1)
    patches = patches.permute(0, 1, 3, 2).reshape(
        image_tensor.size(0), patch_size, -1)
    return patches
@@ -6,6 +6,8 @@ from dataclasses import dataclass, field
from functools import partial
from typing import Dict, List, Optional, Union, Literal, Tuple
from types import MethodType
+from torchvision import transforms

import torch
import transformers
from accelerate.utils import DistributedType
@@ -130,6 +132,18 @@ def make_supervised_data_module(
    )


+def build_transform():
+    IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_MEAN
+    IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_STD
+    return transforms.Compose(
+        [
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD
+            ),
+        ]
+    )
+
+
def get_parameter_number(model):
    trainable_params, all_param = 0, 0
    for param in model.parameters():
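For reference, the transform returned by build_transform only converts a PIL image to a tensor and normalizes it with 0.5 mean/std; resizing and slicing are handled earlier in preprocess/slice_image. A quick standalone sanity check (the 448x448 size and dummy image are arbitrary):

```python
from PIL import Image
from torchvision import transforms

# Same pipeline as build_transform() above: ToTensor -> Normalize with 0.5 mean/std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

img = Image.new("RGB", (448, 448), color=(128, 128, 128))  # dummy gray image
x = transform(img)
print(x.shape)                         # torch.Size([3, 448, 448])
print(x.min().item(), x.max().item())  # ~0.004 for a uniform gray-128 image
```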
@@ -248,10 +262,11 @@ def train():
    else:
        batch_vision = False

+    transform_func = build_transform()
    data_module = make_supervised_data_module(
        tokenizer=tokenizer,
        data_args=data_args,
-        transform=model.transform,
+        transform=transform_func,
        data_collator=data_collator,
        slice_config=slice_config,
        llm_type=llm_type,
@@ -6,12 +6,15 @@ NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001

-MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+MODEL="openbmb/MiniCPM-V-2_6"
+# or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="path/to/training_data"
EVAL_DATA="path/to/test_data"
-LLM_TYPE="llama3" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+LLM_TYPE="qwen2" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3"

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
@@ -28,10 +31,10 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
    --remove_unused_columns false \
    --label_names "labels" \
    --prediction_loss_only false \
-    --bf16 false \
-    --bf16_full_eval false \
-    --fp16 true \
-    --fp16_full_eval true \
+    --bf16 true \
+    --bf16_full_eval true \
+    --fp16 false \
+    --fp16_full_eval false \
    --do_train \
    --do_eval \
    --tune_vision true \
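The switch from fp16 to bf16 assumes GPUs with bfloat16 support (Ampere or newer). A small convenience check before launching, not part of the repository scripts:

```python
import torch

# bf16 training needs hardware support (e.g. A100/H100, RTX 30xx or newer).
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    print("bf16 supported: keep --bf16 true")
else:
    print("bf16 not supported: consider --bf16 false / --fp16 true instead")
```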
@@ -40,8 +43,8 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
    --max_slice_nums 9 \
    --max_steps 10000 \
    --eval_steps 1000 \
-    --output_dir output/output_minicpmv2 \
-    --logging_dir output/output_minicpmv2 \
+    --output_dir output/output_minicpmv26 \
+    --logging_dir output/output_minicpmv26 \
    --logging_strategy "steps" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
@@ -6,13 +6,14 @@ NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001

-MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="path/to/training_data"
EVAL_DATA="path/to/test_data"
-LLM_TYPE="llama3" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+LLM_TYPE="qwen2"
+# if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+# if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE=llama3

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
@@ -42,12 +43,12 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
    --max_slice_nums 9 \
    --max_steps 10000 \
    --eval_steps 1000 \
-    --output_dir output/output_minicpmv2_lora \
-    --logging_dir output/output_minicpmv2_lora \
+    --output_dir output/output__lora \
+    --logging_dir output/output_lora \
    --logging_strategy "steps" \
-    --per_device_train_batch_size 2 \
+    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
+    --gradient_accumulation_steps 1 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 1000 \
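With the new settings the LoRA script uses a per-device batch size of 1 and no gradient accumulation, so the effective global batch size is simply the number of processes. A worked example; the GPU count is an assumption (it is set by GPUS_PER_NODE and NNODES in the script):

```python
# Effective global batch size = per_device_train_batch_size
#                               * gradient_accumulation_steps
#                               * world_size (GPUS_PER_NODE * NNODES)
per_device_train_batch_size = 1   # new value
gradient_accumulation_steps = 1   # new value
world_size = 8                    # assumption: 8 GPUs on one node

print(per_device_train_batch_size * gradient_accumulation_steps * world_size)  # 8
# The previous values (batch size 2, accumulation 8) gave 2 * 8 * 8 = 128 samples
# per optimizer step on the same hardware, so you may want to adjust the learning
# rate or accumulation steps if you need to match the old effective batch size.
```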
@@ -1,6 +1,76 @@
# MiniCPM-V Finetuning


-We offer the official scripts for easy finetuning of the pretrained **MiniCPM-Llama3-V 2.5** and **MiniCPM-V 2.0** on downstream tasks. Our finetune scripts use transformers Trainer and DeepSpeed by default.
+We offer the official scripts for easy finetuning of the pretrained **MiniCPM-V-2_6**, **MiniCPM-Llama3-V 2.5** and **MiniCPM-V 2.0** on downstream tasks. Our finetune scripts use transformers Trainer and DeepSpeed by default.

### Data preparation

To prepare your finetuning data, you should formulate each sample as a dictionary consisting of an id, an image path list with an image, and a list of conversations. Then save the data samples in JSON files.

For the vision-language example with an image, you are required to provide **\<image\>** to define the position to insert the image embeddings. If you don't provide \<image\>, the image will be placed at the front of the conversation.

<details>
  <summary>
    <b>vision-language example (vl_finetune_data.json) with 1 sample.</b>
  </summary>

```
[
    {
        "id": "0",
        "image": "path/to/image_0.jpg",
        "conversations": [
            {
                "role": "user",
                "content": "<image>\nHow many desserts are on the white plate?"
            },
            {
                "role": "assistant",
                "content": "There are three desserts on the white plate."
            },
            {
                "role": "user",
                "content": "What type of desserts are they?"
            },
            {
                "role": "assistant",
                "content": "The desserts are cakes with bananas and pecans on top. They share similarities with donuts, but the presence of bananas and pecans differentiates them."
            },
            {
                "role": "user",
                "content": "What is the setting of the image?"
            },
            {
                "role": "assistant",
                "content": "The image is set on a table top with a plate containing the three desserts."
            }
        ]
    }
]
```

</details>
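A small sanity check over the prepared file can catch most formatting mistakes before training starts. This sketch only assumes the fields shown in the example above; the file name is a placeholder for whatever you pass as DATA:

```python
import json

# Hypothetical file name; use the path you pass as DATA.
with open("vl_finetune_data.json", "r", encoding="utf-8") as f:
    samples = json.load(f)

for sample in samples:
    # Required top-level fields, per the example above.
    assert {"id", "image", "conversations"} <= set(sample)
    for turn in sample["conversations"]:
        assert turn["role"] in ("user", "assistant")  # mirrors the assert in dataset.py
        assert isinstance(turn["content"], str)
    n_tags = sum(t["content"].count("<image>") for t in sample["conversations"])
    print(sample["id"], "image tags:", n_tags)  # 0 means the image is prepended automatically

print(f"checked {len(samples)} samples")
```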
### Full-parameter finetuning

Full-parameter finetuning requires updating all parameters of the LLM in the whole training process. Please specify the correct MODEL path and DATA path in the shell scripts.

```shell
MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2
DATA="path/to/training_data" # json file
EVAL_DATA="path/to/test_data" # json file
```

To launch your training, run the following script:

```
sh finetune_ds.sh
```

#### Customizing Hyperparameters
To tailor the training process according to your specific requirements, you can adjust various hyperparameters. For comprehensive documentation on available hyperparameters and their functionalities, you can refer to the [official Transformers documentation](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments). Experimentation and fine-tuning of these parameters are essential for achieving optimal model performance tailored to your specific task and dataset.
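Most of the flags passed to finetune.py are standard transformers TrainingArguments fields, so they can also be constructed and inspected in Python. The values below simply mirror a few of the script settings and are illustrative, not tuned recommendations:

```python
from transformers import TrainingArguments

# A few of the knobs from finetune_ds.sh expressed as TrainingArguments fields.
args = TrainingArguments(
    output_dir="output/output_minicpmv26",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-6,          # illustrative value
    max_steps=10000,
    logging_strategy="steps",
    logging_steps=10,
)
print(args.learning_rate, args.max_steps)
```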
@@ -55,10 +125,10 @@ For the vision-language example with image, you are required to provide **\<imag
Full-parameter finetuning requires updating all parameters of the LLM in the whole training process. Please specify the correct MODEL path, DATA path and LLM_TYPE in the shell scripts.

```shell
-MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2
DATA="path/to/training_data" # json file
EVAL_DATA="path/to/test_data" # json file
-LLM_TYPE="llama3" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+LLM_TYPE="qwen2" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3"
```

To launch your training, run the following script:
@@ -82,7 +152,7 @@ After training, you could load the model with the path to the adapter. We advise
```
from peft import PeftModel
from transformers import AutoModel
-model_type="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+model_type = "openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2
path_to_adapter="path_to_your_fine_tuned_checkpoint"

model = AutoModel.from_pretrained(
```
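For reference, a hedged sketch of how a base model and a LoRA adapter are typically combined with peft; the trust_remote_code and dtype choices are assumptions, and the exact call in the repository README may differ:

```python
import torch
from peft import PeftModel
from transformers import AutoModel

model_type = "openbmb/MiniCPM-V-2_6"                      # base model used for finetuning
path_to_adapter = "path_to_your_fine_tuned_checkpoint"    # placeholder path

# Load the base model, then attach the LoRA weights produced by finetune_lora.sh.
model = AutoModel.from_pretrained(
    model_type,
    trust_remote_code=True,     # assumption: MiniCPM-V ships custom modeling code
    torch_dtype=torch.bfloat16, # assumption: matches the bf16 training setup
)
lora_model = PeftModel.from_pretrained(model, path_to_adapter).eval()
```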
@@ -1,3 +1,4 @@
import torch
import torch.nn as nn
import deepspeed