mirror of https://github.com/OpenBMB/MiniCPM-V.git
Update to MiniCPM-V 2.6
@@ -105,7 +105,7 @@ def data_collator(examples, padding_value=0, max_length=2048):
    }


-def conversation_to_ids(conversation, tokenizer, llm_type=None):
+def conversation_to_ids(conversation, tokenizer, llm_type=None, new_schema=False):
    """
    for single image multi-turn conversation
    conversation: [{'role': 'user', 'content': 'Describe this image'},
@@ -115,6 +115,10 @@ def conversation_to_ids(conversation, tokenizer, llm_type=None):
        input_ids, context, raw_msg = conversation_to_ids_llama3(
            conversation, tokenizer
        )
+    elif llm_type == "qwen2":
+        input_ids, context, raw_msg = conversation_to_ids_qwen2(
+            conversation, tokenizer
+        )
    else:
        input_ids, context, raw_msg = conversation_to_ids_minicpm(
            conversation, tokenizer
@@ -125,6 +129,7 @@ def conversation_to_ids(conversation, tokenizer, llm_type=None):

    # build target
    target = torch.full_like(ids, -100, dtype=torch.int32)

    for i in range(1, len(ids)):
        if context[i] == 0:
            target[i - 1] = ids[i]
@@ -133,14 +138,21 @@ def conversation_to_ids(conversation, tokenizer, llm_type=None):
                target[i - 1] = tokenizer.eot_id
            else:
                target[i - 1] = tokenizer.eos_id

    # build image bound
-    image_start_tokens = torch.where(ids == tokenizer.im_start_id)[0]
-    image_start_tokens += 1
-    image_end_tokens = torch.where(ids == tokenizer.im_end_id)[0]
+    if new_schema:
+        start_cond = (ids == tokenizer.im_start_id) | (ids == tokenizer.slice_start_id)
+        end_cond = (ids == tokenizer.im_end_id) | (ids == tokenizer.slice_end_id)
+        image_start_tokens = torch.where(start_cond)[0]
+        image_start_tokens += 1
+        image_end_tokens = torch.where(end_cond)[0]
+    else:
+        image_start_tokens = torch.where(ids == tokenizer.im_start_id)[0]
+        image_start_tokens += 1
+        image_end_tokens = torch.where(ids == tokenizer.im_end_id)[0]
    if len(image_start_tokens) != len(image_end_tokens):
        print("image start token != image end tokens")

    if len(image_start_tokens) > 0:
        image_bound = torch.hstack(
            [image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)]
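A minimal, runnable sketch of what the new-schema branch computes; the special-token ids below are made-up stand-ins for the tokenizer attributes used in the real code:

```python
import torch

# Hypothetical ids standing in for tokenizer.im_start_id, im_end_id,
# slice_start_id and slice_end_id; the real values come from the tokenizer.
IM_START, IM_END = 101, 102
SLICE_START, SLICE_END = 103, 104
UNK = 0

# Toy sequence: one image span followed by one slice span, then two text tokens.
ids = torch.tensor([IM_START, UNK, UNK, IM_END, SLICE_START, UNK, UNK, SLICE_END, 7, 8])

start_cond = (ids == IM_START) | (ids == SLICE_START)
end_cond = (ids == IM_END) | (ids == SLICE_END)

image_start_tokens = torch.where(start_cond)[0] + 1  # first placeholder position inside each span
image_end_tokens = torch.where(end_cond)[0]          # position of the closing token

image_bound = torch.hstack(
    [image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)]
)
print(image_bound)  # rows [[1, 3], [5, 7]]: one (start, end) pair per image/slice span
```

Under the old branch only the im_start/im_end pair is matched, so slice regions would not contribute rows to image_bound.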
@@ -230,6 +242,46 @@ def conversation_to_ids_llama3(conversation, tokenizer):
    return input_ids, context, raw_msg


+def conversation_to_ids_qwen2(conversation, tokenizer):
+    raw_msg = ""
+    chat = []
+    context = []
+    for idx, msg in enumerate(conversation):
+        role = msg["role"]
+        message = msg["content"]
+        assert role in ["user", "assistant"]
+        if role == "user":
+            prefix = "user"
+        else:
+            prefix = "assistant"
+        chat.append({"role":prefix, "content":message})
+        raw_msg += prefix + message
+    assert set([i['role'] for i in chat]) & set(['assistant'])
+
+    ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
+    input_ids = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=False)
+    input_ids = np.array(input_ids)
+
+    start_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_start|>'))[0]
+    assistant_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('assistant'))[0]
+    end_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_end|>'))[0]
+
+    context = np.ones_like(input_ids, dtype=np.int8)
+
+    for assistant_idx in assistant_idxs:
+        if assistant_idx-1 in set(start_idxs):
+            st = assistant_idx + 1
+            for end_idx in end_idxs:
+                if end_idx > st:
+                    context[st: end_idx + 1] = 0
+                    break
+
+    input_ids = np.hstack(input_ids)
+    context = np.hstack(context)
+    return input_ids, context, raw_msg
+
+
def preprocess(
    image,
    conversation,
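The supervision mask built by conversation_to_ids_qwen2 can be illustrated without a real tokenizer by running the same index logic over token strings; this is only a toy stand-in for the ChatML ids produced by apply_chat_template:

```python
import numpy as np

# Token strings instead of ids, purely for illustration.
toks = np.array([
    "<|im_start|>", "user", "Hi", "<|im_end|>",
    "<|im_start|>", "assistant", "Hello", "there", "<|im_end|>",
])

start_idxs = np.where(toks == "<|im_start|>")[0]
assistant_idxs = np.where(toks == "assistant")[0]
end_idxs = np.where(toks == "<|im_end|>")[0]

# context == 1 marks prompt tokens (ignored in the loss);
# context == 0 marks assistant tokens that become training targets.
context = np.ones(len(toks), dtype=np.int8)
for a in assistant_idxs:
    if a - 1 in set(start_idxs):       # only "assistant" directly after <|im_start|>
        st = a + 1
        for e in end_idxs:
            if e > st:
                context[st:e + 1] = 0  # mask the reply plus its closing <|im_end|>
                break

print(context.tolist())  # [1, 1, 1, 1, 1, 1, 0, 0, 0]
```

In conversation_to_ids the positions with context == 0 are the ones copied into the shifted target tensor, so only the assistant replies are supervised.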
@@ -256,8 +308,14 @@ def preprocess(
    default_image_placeholder = (
        tokenizer.im_start + tokenizer.unk_token * query_nums + tokenizer.im_end
    )
+    new_schema = False
+    use_image_id = False
+    if llm_type=='qwen2':
+        new_schema = True
+        use_image_id = True
    if slice_config:
        images = []
+        image_id_cnt = 0
        source_image, patches, best_grid = slice_image(
            image,
            slice_config["max_slice_nums"],
@@ -270,9 +328,11 @@ def preprocess(
        for i in range(len(patches)):
            for j in range(len(patches[0])):
                images.append(patches[i][j])
+        if use_image_id:
+            image_placeholder = f'{tokenizer.im_id_start}{idx}{tokenizer.im_id_end}' + image_placeholder
+            image_id_cnt += 1
        image_placeholder += get_grid_placeholder(
-            tokenizer, best_grid, query_nums)
+            tokenizer, best_grid, query_nums, new_schema = new_schema)
        images = [transform(i) for i in images]
    else:
        images = [transform(image)]
@@ -286,7 +346,7 @@ def preprocess(
            image_placeholder + "\n" + conversation[0]["content"]
        )

-    input_dict = conversation_to_ids(conversation, tokenizer, llm_type)
+    input_dict = conversation_to_ids(conversation, tokenizer, llm_type, new_schema)

    if batch_vision:
        tgt_sizes = []
@@ -424,7 +484,7 @@ def split_to_patches(image, grid):
    return patches


-def get_grid_placeholder(tokenizer, grid, query_num):
+def get_grid_placeholder(tokenizer, grid, query_num, new_schema=False):
    image_placeholder = (
        tokenizer.im_start + tokenizer.unk_token * query_num + tokenizer.im_end
    )
@@ -437,7 +497,10 @@ def get_grid_placeholder(tokenizer, grid, query_num):
        for j in range(cols):
            lines.append(image_placeholder)
        slices.append("".join(lines))
-    slice_placeholder = tokenizer.slice_start + \
+    if new_schema:
+        slice_placeholder = '\n'.join(slices)
+    else:
+        slice_placeholder = tokenizer.slice_start + \
            "\n".join(slices) + tokenizer.slice_end
    return slice_placeholder
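To make the new-schema behaviour of get_grid_placeholder concrete, here is a toy re-implementation with string stand-ins for the tokenizer's special tokens; the literal token strings are assumptions, the real ones are tokenizer attributes:

```python
# Toy version of get_grid_placeholder: grid = (cols, rows), as in the real code.
def toy_grid_placeholder(grid, query_num, new_schema=False):
    im_start, im_end = "<image>", "</image>"
    slice_start, slice_end = "<slice>", "</slice>"
    unk = "<unk>"
    image_placeholder = im_start + unk * query_num + im_end

    cols, rows = grid
    slices = []
    for _ in range(rows):
        lines = [image_placeholder for _ in range(cols)]
        slices.append("".join(lines))

    if new_schema:
        # new schema (qwen2 / MiniCPM-V 2.6): rows joined by newlines,
        # without wrapping the whole grid in slice_start/slice_end here
        return "\n".join(slices)
    # older schema: the whole grid is wrapped in slice_start ... slice_end
    return slice_start + "\n".join(slices) + slice_end


print(toy_grid_placeholder((2, 2), query_num=1, new_schema=False))
# <slice><image><unk></image><image><unk></image>
# <image><unk></image><image><unk></image></slice>
print(toy_grid_placeholder((2, 2), query_num=1, new_schema=True))
# <image><unk></image><image><unk></image>
# <image><unk></image><image><unk></image>
```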
@@ -455,4 +518,4 @@ def reshape_by_patch(image_tensor, patch_size):
    patches = patches.reshape(image_tensor.size(0), patch_size, patch_size, -1)
    patches = patches.permute(0, 1, 3, 2).reshape(
        image_tensor.size(0), patch_size, -1)
    return patches
@@ -6,6 +6,8 @@ from dataclasses import dataclass, field
from functools import partial
from typing import Dict, List, Optional, Union, Literal, Tuple
from types import MethodType
+from torchvision import transforms

import torch
import transformers
from accelerate.utils import DistributedType
@@ -130,6 +132,18 @@ def make_supervised_data_module(
    )


+def build_transform():
+    IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_MEAN
+    IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_STD
+    return transforms.Compose(
+        [
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD
+            ),
+        ]
+    )
+
+
def get_parameter_number(model):
    trainable_params, all_param = 0, 0
    for param in model.parameters():
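For reference, the transform returned by build_transform only converts a PIL image to a tensor and normalizes it with 0.5 mean/std; resizing and slicing are handled earlier in preprocess/slice_image. A quick standalone sanity check (the 448x448 size and dummy image are arbitrary):

```python
from PIL import Image
from torchvision import transforms

# Same pipeline as build_transform() above: ToTensor -> Normalize with 0.5 mean/std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

img = Image.new("RGB", (448, 448), color=(128, 128, 128))  # dummy gray image
x = transform(img)
print(x.shape)                         # torch.Size([3, 448, 448])
print(x.min().item(), x.max().item())  # ~0.004 for a uniform gray-128 image
```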
@@ -248,10 +262,11 @@ def train():
    else:
        batch_vision = False

+    transform_func = build_transform()
    data_module = make_supervised_data_module(
        tokenizer=tokenizer,
        data_args=data_args,
-        transform=model.transform,
+        transform=transform_func,
        data_collator=data_collator,
        slice_config=slice_config,
        llm_type=llm_type,
@@ -6,12 +6,15 @@ NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001

-MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+MODEL="openbmb/MiniCPM-V-2_6"
+# or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="path/to/training_data"
EVAL_DATA="path/to/test_data"
-LLM_TYPE="llama3" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+LLM_TYPE="qwen2" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3"

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
@@ -28,10 +31,10 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
    --remove_unused_columns false \
    --label_names "labels" \
    --prediction_loss_only false \
-    --bf16 false \
-    --bf16_full_eval false \
-    --fp16 true \
-    --fp16_full_eval true \
+    --bf16 true \
+    --bf16_full_eval true \
+    --fp16 false \
+    --fp16_full_eval false \
    --do_train \
    --do_eval \
    --tune_vision true \
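The switch from fp16 to bf16 assumes GPUs with bfloat16 support (Ampere or newer). A small convenience check before launching, not part of the repository scripts:

```python
import torch

# bf16 training needs hardware support (e.g. A100/H100, RTX 30xx or newer).
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    print("bf16 supported: keep --bf16 true")
else:
    print("bf16 not supported: consider --bf16 false / --fp16 true instead")
```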
@@ -40,8 +43,8 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
    --max_slice_nums 9 \
    --max_steps 10000 \
    --eval_steps 1000 \
-    --output_dir output/output_minicpmv2 \
-    --logging_dir output/output_minicpmv2 \
+    --output_dir output/output_minicpmv26 \
+    --logging_dir output/output_minicpmv26 \
    --logging_strategy "steps" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
@@ -6,13 +6,14 @@ NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001

-MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="path/to/training_data"
EVAL_DATA="path/to/test_data"
-LLM_TYPE="llama3" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+LLM_TYPE="qwen2"
+# if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+# if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE=llama3

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
@@ -42,12 +43,12 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
    --max_slice_nums 9 \
    --max_steps 10000 \
    --eval_steps 1000 \
-    --output_dir output/output_minicpmv2_lora \
-    --logging_dir output/output_minicpmv2_lora \
+    --output_dir output/output__lora \
+    --logging_dir output/output_lora \
    --logging_strategy "steps" \
-    --per_device_train_batch_size 2 \
+    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
+    --gradient_accumulation_steps 1 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 1000 \
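With the new settings the LoRA script uses a per-device batch size of 1 and no gradient accumulation, so the effective global batch size is simply the number of processes. A worked example; the GPU count is an assumption (it is set by GPUS_PER_NODE and NNODES in the script):

```python
# Effective global batch size = per_device_train_batch_size
#                               * gradient_accumulation_steps
#                               * world_size (GPUS_PER_NODE * NNODES)
per_device_train_batch_size = 1   # new value
gradient_accumulation_steps = 1   # new value
world_size = 8                    # assumption: 8 GPUs on one node

print(per_device_train_batch_size * gradient_accumulation_steps * world_size)  # 8
# The previous values (batch size 2, accumulation 8) gave 2 * 8 * 8 = 128 samples
# per optimizer step on the same hardware, so you may want to adjust the learning
# rate or accumulation steps if you need to match the old effective batch size.
```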
@@ -1,6 +1,76 @@
# MiniCPM-V Finetuning


-We offer the official scripts for easy finetuning of the pretrained **MiniCPM-Llama3-V 2.5** and **MiniCPM-V 2.0** on downstream tasks. Our finetune scripts use transformers Trainer and DeepSpeed by default.
+We offer the official scripts for easy finetuning of the pretrained **MiniCPM-V-2_6**, **MiniCPM-Llama3-V 2.5** and **MiniCPM-V 2.0** on downstream tasks. Our finetune scripts use transformers Trainer and DeepSpeed by default.

### Data preparation

To prepare your finetuning data, you should formulate each sample as a dictionary consisting of an id, an image path list with an image, and a list of conversations. Then save the data samples in JSON files.

For the vision-language example with an image, you are required to provide **\<image\>** to define the position to insert the image embeddings. If you don't provide \<image\>, the image will be placed at the front of the conversation.

<details>
  <summary>
    <b>vision-language example (vl_finetune_data.json) with 1 sample.</b>
  </summary>

```
[
    {
        "id": "0",
        "image": "path/to/image_0.jpg",
        "conversations": [
            {
                "role": "user",
                "content": "<image>\nHow many desserts are on the white plate?"
            },
            {
                "role": "assistant",
                "content": "There are three desserts on the white plate."
            },
            {
                "role": "user",
                "content": "What type of desserts are they?"
            },
            {
                "role": "assistant",
                "content": "The desserts are cakes with bananas and pecans on top. They share similarities with donuts, but the presence of bananas and pecans differentiates them."
            },
            {
                "role": "user",
                "content": "What is the setting of the image?"
            },
            {
                "role": "assistant",
                "content": "The image is set on a table top with a plate containing the three desserts."
            }
        ]
    }
]
```

</details>
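A small sanity check over the prepared file can catch most formatting mistakes before training starts. This sketch only assumes the fields shown in the example above; the file name is a placeholder for whatever you pass as DATA:

```python
import json

# Hypothetical file name; use the path you pass as DATA.
with open("vl_finetune_data.json", "r", encoding="utf-8") as f:
    samples = json.load(f)

for sample in samples:
    # Required top-level fields, per the example above.
    assert {"id", "image", "conversations"} <= set(sample)
    for turn in sample["conversations"]:
        assert turn["role"] in ("user", "assistant")  # mirrors the assert in dataset.py
        assert isinstance(turn["content"], str)
    n_tags = sum(t["content"].count("<image>") for t in sample["conversations"])
    print(sample["id"], "image tags:", n_tags)  # 0 means the image is prepended automatically

print(f"checked {len(samples)} samples")
```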
### Full-parameter finetuning

Full-parameter finetuning requires updating all parameters of the LLM in the whole training process. Please specify the correct MODEL path and DATA path in the shell scripts.

```shell
MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2
DATA="path/to/training_data" # json file
EVAL_DATA="path/to/test_data" # json file
```

To launch your training, run the following script:

```
sh finetune_ds.sh
```

#### Customizing Hyperparameters
To tailor the training process according to your specific requirements, you can adjust various hyperparameters. For comprehensive documentation on available hyperparameters and their functionalities, you can refer to the [official Transformers documentation](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments). Experimentation and fine-tuning of these parameters are essential for achieving optimal model performance tailored to your specific task and dataset.
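Most of the flags passed to finetune.py are standard transformers TrainingArguments fields, so they can also be constructed and inspected in Python. The values below simply mirror a few of the script settings and are illustrative, not tuned recommendations:

```python
from transformers import TrainingArguments

# A few of the knobs from finetune_ds.sh expressed as TrainingArguments fields.
args = TrainingArguments(
    output_dir="output/output_minicpmv26",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-6,          # illustrative value
    max_steps=10000,
    logging_strategy="steps",
    logging_steps=10,
)
print(args.learning_rate, args.max_steps)
```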
@@ -55,10 +125,10 @@ For the vision-language example with image, you are required to provide **\<imag
Full-parameter finetuning requires updating all parameters of the LLM in the whole training process. Please specify the correct MODEL path, DATA path and LLM_TYPE in the shell scripts.

```shell
-MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+MODEL="openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2
DATA="path/to/training_data" # json file
EVAL_DATA="path/to/test_data" # json file
-LLM_TYPE="llama3" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+LLM_TYPE="qwen2" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3"
```

To launch your training, run the following script:
@@ -82,7 +152,7 @@ After training, you could load the model with the path to the adapter. We advise
```
from peft import PeftModel
from transformers import AutoModel
-model_type="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2
+model_type = "openbmb/MiniCPM-V-2_6" # or openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2
path_to_adapter="path_to_your_fine_tuned_checkpoint"

model = AutoModel.from_pretrained(
```
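For reference, a hedged sketch of how a base model and a LoRA adapter are typically combined with peft; the trust_remote_code and dtype choices are assumptions, and the exact call in the repository README may differ:

```python
import torch
from peft import PeftModel
from transformers import AutoModel

model_type = "openbmb/MiniCPM-V-2_6"                      # base model used for finetuning
path_to_adapter = "path_to_your_fine_tuned_checkpoint"    # placeholder path

# Load the base model, then attach the LoRA weights produced by finetune_lora.sh.
model = AutoModel.from_pretrained(
    model_type,
    trust_remote_code=True,     # assumption: MiniCPM-V ships custom modeling code
    torch_dtype=torch.bfloat16, # assumption: matches the bf16 training setup
)
lora_model = PeftModel.from_pretrained(model, path_to_adapter).eval()
```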
@@ -1,3 +1,4 @@
import torch
import torch.nn as nn
import deepspeed