Model Fine-tuning Memory Usage Statistics (#160)

Author: qianyu chen
Date: 2024-05-28 11:41:27 +08:00
Committed by: GitHub
Parent: 7e12387362
Commit: f592fedb2e
4 changed files with 30 additions and 10 deletions

View File

@@ -66,24 +66,26 @@ class SupervisedDataset(Dataset):
         return ret
 
-def data_collator(examples, padding_value=0):
-    input_ids = pad_sequence(
+def data_collator(examples, padding_value=0, max_length=2048):
+    def trim_and_pad(seq, batch_first, padding_value):
+        return pad_sequence([s[:max_length] for s in seq], batch_first=True, padding_value=padding_value)
+
+    input_ids = trim_and_pad(
         [example["input_ids"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
     )
-    position_ids = pad_sequence(
+    position_ids = trim_and_pad(
         [example["position_ids"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
     )
-    targets = pad_sequence(
+    targets = trim_and_pad(
         [example["labels"] for example in examples],
         batch_first=True,
-        padding_value=padding_value,
+        padding_value=-100,
     )
-    attention_mask = pad_sequence(
+    attention_mask = trim_and_pad(
         [example["attention_mask"] for example in examples],
         batch_first=True,
         padding_value=padding_value,

View File

@@ -3,6 +3,7 @@ import json
 import logging
 import os
 from dataclasses import dataclass, field
+from functools import partial
 from typing import Dict, List, Optional, Union, Literal, Tuple
 from types import MethodType
 import torch
@@ -133,6 +134,7 @@ def make_supervised_data_module(
     patch_size=14,
     query_nums=64,
     batch_vision=False,
+    max_length=2048,
 ) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
     dataset_cls = SupervisedDataset
@@ -169,7 +171,7 @@ def make_supervised_data_module(
     return dict(
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        data_collator=data_collator,
+        data_collator=partial(data_collator, max_length=max_length),
     )
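Since the HF `Trainer` calls the collator with only the list of features, `functools.partial` is used above to bind `max_length` ahead of time. A minimal sketch of the same idea (the collator body here is a stand-in, not the real one from the first file):

```python
from functools import partial

def data_collator(examples, padding_value=0, max_length=2048):
    # Stand-in body for illustration; the real collator is the trim-and-pad
    # version shown in the first diff above.
    return {"num_examples": len(examples), "max_length": max_length}

# partial() pre-binds max_length, leaving a one-argument callable that can be
# passed as data_collator to the Trainer: collator(examples) is equivalent to
# data_collator(examples, max_length=1024).
collator = partial(data_collator, max_length=1024)
print(collator([{"input_ids": [1, 2, 3]}]))  # {'num_examples': 1, 'max_length': 1024}
```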
@@ -287,6 +289,7 @@ def train():
         patch_size=model.config.patch_size,
         query_nums=model.config.query_num,
         batch_vision=batch_vision,
+        max_length=training_args.model_max_length,
     )
     trainer = CPMTrainer(
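The collator's cap now comes from `training_args.model_max_length`. As a hypothetical sketch only (the actual definition in the training script may differ), such a field is typically declared on a dataclass that extends `transformers.TrainingArguments`:

```python
from dataclasses import dataclass, field
import transformers

@dataclass
class TrainingArguments(transformers.TrainingArguments):
    # Hypothetical declaration: upper bound on sequence length, forwarded to
    # the data collator as max_length via make_supervised_data_module(...).
    model_max_length: int = field(
        default=2048,
        metadata={"help": "Maximum sequence length; longer sequences are truncated."},
    )
```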

View File

@@ -42,7 +42,7 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
     --output_dir output/output_minicpmv2_lora \
     --logging_dir output/output_minicpmv2_lora \
     --logging_strategy "steps" \
-    --per_device_train_batch_size w \
+    --per_device_train_batch_size 2 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 1 \
     --evaluation_strategy "steps" \
@@ -57,5 +57,4 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
     --logging_steps 1 \
     --gradient_checkpointing true \
     --deepspeed ds_config_zero2.json \
-    --report_to "tensorboard" \ # wandb
+    --report_to "tensorboard" # wandb

View File

@@ -90,6 +90,22 @@ model = AutoPeftModelForCausalLM.from_pretrained(
 ).eval()
 ```
+
+### Model Fine-tuning Memory Usage Statistics
+
+The following table shows the GPU memory usage when fine-tuning the model on NVIDIA A100 (80 GiB) GPUs with different numbers of GPUs. All runs use DeepSpeed ZeRO-2 and gradient checkpointing, with the maximum sequence length set to 2048 and the batch size set to 1.
+
+| Fine-tuning Method          | GPUs: 2       | GPUs: 4  | GPUs: 8  |
+|-----------------------------|---------------|----------|----------|
+| LoRA Fine-tuning            | 31.2 GiB      | 29.3 GiB | 28.4 GiB |
+| Full-parameter Fine-tuning  | Out of memory | 75.0 GiB | 51.2 GiB |
+
+### Notes
+- **Fine-tuning Method**: Two strategies are compared, LoRA fine-tuning and full-parameter fine-tuning.
+- **Number of GPUs**: Memory usage is listed for configurations with 2, 4, and 8 GPUs.
+- **Memory Usage**: Given in GiB; the memory required by each fine-tuning method under the corresponding GPU configuration.
+- **Out of memory**: GPU memory was insufficient for full-parameter fine-tuning under that configuration.
+
 ### Finetuning FAQs
 <details>
 <summary>Q: How do I use the `flash_attention_2` implementation when loading a pretrained model?</summary>