Mirror of https://github.com/OpenBMB/MiniCPM-V.git, synced 2026-02-05 10:19:18 +08:00
Model Fine-tuning Memory Usage Statistics (#160)
@@ -66,24 +66,26 @@ class SupervisedDataset(Dataset):
         return ret


-def data_collator(examples, padding_value=0):
-    input_ids = pad_sequence(
+def data_collator(examples, padding_value=0, max_length=2048):
+    def trim_and_pad(seq, batch_first, padding_value):
+        return pad_sequence([s[:max_length] for s in seq], batch_first=True, padding_value=padding_value)
+
+    input_ids = trim_and_pad(
         [example["input_ids"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
     )
-    position_ids = pad_sequence(
+    position_ids = trim_and_pad(
         [example["position_ids"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
     )
-    targets = pad_sequence(
+    targets = trim_and_pad(
         [example["labels"] for example in examples],
         batch_first=True,
-        padding_value=padding_value,
+        padding_value=-100,
     )
-    attention_mask = pad_sequence(
+    attention_mask = trim_and_pad(
         [example["attention_mask"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
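To make the effect of the new collator easier to see, here is a self-contained sketch of the same trim-then-pad idea on toy tensors. It is simplified relative to the diff (it only handles `input_ids`, `labels`, and `attention_mask`, and the tensor values are made up for illustration):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

def data_collator(examples, padding_value=0, max_length=2048):
    # Same idea as the new collator: truncate each sequence to max_length,
    # then right-pad the batch to the longest remaining sequence.
    def trim_and_pad(seq, padding_value):
        return pad_sequence([s[:max_length] for s in seq],
                            batch_first=True, padding_value=padding_value)

    input_ids = trim_and_pad([e["input_ids"] for e in examples], padding_value)
    # Labels are padded with -100 so padded positions are ignored by the loss.
    targets = trim_and_pad([e["labels"] for e in examples], -100)
    attention_mask = trim_and_pad([e["attention_mask"] for e in examples], padding_value)
    return {"input_ids": input_ids, "labels": targets, "attention_mask": attention_mask}

# Toy batch: one sequence longer than max_length, one shorter.
examples = [
    {"input_ids": torch.arange(6), "labels": torch.arange(6),
     "attention_mask": torch.ones(6, dtype=torch.long)},
    {"input_ids": torch.arange(3), "labels": torch.arange(3),
     "attention_mask": torch.ones(3, dtype=torch.long)},
]
batch = data_collator(examples, max_length=4)
print(batch["input_ids"].shape)  # torch.Size([2, 4]) -- trimmed to max_length, padded to batch max
print(batch["labels"][1])        # tensor([   0,    1,    2, -100]) -- pad positions masked out
```

Padding the labels with `-100` matters because that is the default `ignore_index` of PyTorch's cross-entropy loss, so padded positions do not contribute to the training loss.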
@@ -3,6 +3,7 @@ import json
 import logging
 import os
 from dataclasses import dataclass, field
+from functools import partial
 from typing import Dict, List, Optional, Union, Literal, Tuple
 from types import MethodType
 import torch
@@ -133,6 +134,7 @@ def make_supervised_data_module(
     patch_size=14,
     query_nums=64,
     batch_vision=False,
+    max_length=2048,
 ) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
     dataset_cls = SupervisedDataset
@@ -169,7 +171,7 @@ def make_supervised_data_module(
     return dict(
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        data_collator=data_collator,
+        data_collator= partial(data_collator, max_length=max_length),
     )

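The `partial` wiring above is what lets `max_length` reach the collator without changing the signature the trainer ultimately calls. A minimal sketch of the pattern, using a placeholder collator (`my_collator` is illustrative, not code from this repo):

```python
from functools import partial

def my_collator(examples, padding_value=0, max_length=2048):
    # Stand-in for data_collator: only the call signature matters here.
    return {"num_examples": len(examples), "max_length": max_length}

# Bind max_length once, where the training arguments are known...
collator = partial(my_collator, max_length=512)

# ...and the bound callable still takes just the batch, which is the
# shape of call the trainer makes on its data_collator.
print(collator([{"a": 1}, {"a": 2}]))   # {'num_examples': 2, 'max_length': 512}
```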
@@ -287,6 +289,7 @@ def train():
         patch_size=model.config.patch_size,
         query_nums=model.config.query_num,
         batch_vision=batch_vision,
+        max_length=training_args.model_max_length,
     )

     trainer = CPMTrainer(
@@ -42,7 +42,7 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
     --output_dir output/output_minicpmv2_lora \
     --logging_dir output/output_minicpmv2_lora \
     --logging_strategy "steps" \
-    --per_device_train_batch_size w \
+    --per_device_train_batch_size 2 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 1 \
     --evaluation_strategy "steps" \
@@ -57,5 +57,4 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
     --logging_steps 1 \
     --gradient_checkpointing true \
     --deepspeed ds_config_zero2.json \
-    --report_to "tensorboard" \ # wandb
+    --report_to "tensorboard" # wandb
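A side note on the corrected flag: the global batch size seen by the optimizer is the per-device batch size times the gradient accumulation steps times the number of data-parallel workers. A quick sanity check with the values from this script (the 2-GPU launch is an assumption for illustration):

```python
# Global batch size = per-device batch * gradient accumulation * data-parallel world size.
per_device_train_batch_size = 2   # value set by the fix above
gradient_accumulation_steps = 1
num_gpus = 2                      # assumed number of workers launched by torchrun

global_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(global_batch_size)  # 4
```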
@@ -90,6 +90,22 @@ model = AutoPeftModelForCausalLM.from_pretrained(
 ).eval()
 ```

+### Model Fine-tuning Memory Usage Statistics
+
+The following table shows the model's memory usage when fine-tuning on NVIDIA A100 (80 GiB) GPUs with different numbers of GPUs. Fine-tuning was performed with DeepSpeed ZeRO-2 optimization and gradient checkpointing, with the maximum length set to 2048 and the batch size set to 1.
+
+| Fine-tuning Method           | GPUs: 2       | GPUs: 4  | GPUs: 8  |
+|------------------------------|---------------|----------|----------|
+| LoRA Fine-tuning             | 31.2 GiB      | 29.3 GiB | 28.4 GiB |
+| Full Parameters Fine-tuning  | Out of memory | 75.0 GiB | 51.2 GiB |
+
+### Notes
+
+- **Fine-tuning Method**: Two strategies are compared, LoRA fine-tuning and full-parameter fine-tuning.
+- **Number of GPUs**: The table lists memory usage for configurations with 2, 4, and 8 GPUs.
+- **Memory Usage**: Expressed in GiB, this is the memory required by each fine-tuning method under the corresponding GPU configuration.
+- **Out of memory**: Indicates that memory was insufficient for full-parameter fine-tuning under that GPU configuration.
+
 ### Finetuning FAQs
 <details>
 <summary>Q: How do I use the `flash_attention_2` implementation when loading a pretrained model?</summary>
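If you want to reproduce numbers like those in the table for your own setup, PyTorch's peak-allocation counters are a reasonable starting point. A minimal sketch, assuming a CUDA device is available (this is a generic measurement recipe, not the exact method used for the table; `nvidia-smi` or `torch.cuda.max_memory_reserved()` will usually report somewhat higher figures):

```python
import torch

def report_peak_memory(tag=""):
    # Peak memory allocated by tensors on the current CUDA device, in GiB.
    peak_gib = torch.cuda.max_memory_allocated() / 1024**3
    print(f"[{tag}] peak allocated: {peak_gib:.1f} GiB")

torch.cuda.reset_peak_memory_stats()
# ... run one or more training steps here ...
report_peak_memory("after training step")
```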