diff --git a/finetune/dataset.py b/finetune/dataset.py
index 1774fa4..567330d 100644
--- a/finetune/dataset.py
+++ b/finetune/dataset.py
@@ -66,24 +66,26 @@ class SupervisedDataset(Dataset):
         return ret
 
 
+def data_collator(examples, padding_value=0, max_length=2048):
+    def trim_and_pad(seq, batch_first, padding_value):
+        return pad_sequence([s[:max_length] for s in seq], batch_first=batch_first, padding_value=padding_value)
 
-def data_collator(examples, padding_value=0):
-    input_ids = pad_sequence(
+    input_ids = trim_and_pad(
         [example["input_ids"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
     )
-    position_ids = pad_sequence(
+    position_ids = trim_and_pad(
         [example["position_ids"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
     )
-    targets = pad_sequence(
+    targets = trim_and_pad(
         [example["labels"] for example in examples],
         batch_first=True,
-        padding_value=padding_value,
+        padding_value=-100,
     )
-    attention_mask = pad_sequence(
+    attention_mask = trim_and_pad(
         [example["attention_mask"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
diff --git a/finetune/finetune.py b/finetune/finetune.py
index 986abba..2738555 100644
--- a/finetune/finetune.py
+++ b/finetune/finetune.py
@@ -3,6 +3,7 @@ import json
 import logging
 import os
 from dataclasses import dataclass, field
+from functools import partial
 from typing import Dict, List, Optional, Union, Literal, Tuple
 from types import MethodType
 import torch
@@ -133,6 +134,7 @@ def make_supervised_data_module(
     patch_size=14,
     query_nums=64,
     batch_vision=False,
+    max_length=2048,
 ) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
     dataset_cls = SupervisedDataset
@@ -169,7 +171,7 @@ def make_supervised_data_module(
     return dict(
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        data_collator=data_collator,
+        data_collator=partial(data_collator, max_length=max_length),
     )
 
 
@@ -287,6 +289,7 @@ def train():
         patch_size=model.config.patch_size,
         query_nums=model.config.query_num,
         batch_vision=batch_vision,
+        max_length=training_args.model_max_length,
     )
 
     trainer = CPMTrainer(
diff --git a/finetune/finetune_lora.sh b/finetune/finetune_lora.sh
index 6c5e3a1..be3fb38 100644
--- a/finetune/finetune_lora.sh
+++ b/finetune/finetune_lora.sh
@@ -42,7 +42,7 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
     --output_dir output/output_minicpmv2_lora \
    --logging_dir output/output_minicpmv2_lora \
    --logging_strategy "steps" \
-    --per_device_train_batch_size w \
+    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "steps" \
@@ -57,5 +57,4 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
    --logging_steps 1 \
    --gradient_checkpointing true \
    --deepspeed ds_config_zero2.json \
-    --report_to "tensorboard" \ # wandb
-
+    --report_to "tensorboard" # wandb
diff --git a/finetune/readme.md b/finetune/readme.md
index a347e33..6bf6d8d 100644
--- a/finetune/readme.md
+++ b/finetune/readme.md
@@ -90,6 +90,22 @@ model = AutoPeftModelForCausalLM.from_pretrained(
 ).eval()
 ```
 
+
+### Model Fine-tuning Memory Usage Statistics
+
+The following table shows the model's GPU memory usage when fine-tuning on NVIDIA A100 (80 GiB) GPUs with different numbers of GPUs. Fine-tuning was performed with DeepSpeed ZeRO-2 optimization and gradient checkpointing, with the maximum length set to 2048 and the batch size set to 1.
+
+| Fine-tuning Method | 2 GPUs | 4 GPUs | 8 GPUs |
+|--------------------|--------|--------|--------|
+| LoRA Fine-tuning | 31.2 GiB | 29.3 GiB | 28.4 GiB |
+| Full-parameter Fine-tuning | Out of memory | 75.0 GiB | 51.2 GiB |
+
+### Notes
+- **Fine-tuning Method**: The two fine-tuning strategies compared: LoRA fine-tuning and full-parameter fine-tuning.
+- **Number of GPUs**: The table lists memory usage for configurations with 2, 4, and 8 GPUs.
+- **Memory Usage**: Reported in GiB; the memory required by each fine-tuning method under the corresponding GPU configuration.
+- **Out of memory**: Indicates that GPU memory was insufficient for full-parameter fine-tuning under that configuration.
+
 ### Finetuning FAQs
Q: How do I use the `flash_attention_2` implementation when loading a pretrained model?
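One common way to request it (a minimal sketch, not necessarily the officially documented route for every MiniCPM-V revision: it assumes a `transformers` release recent enough to accept the `attn_implementation` argument, an installed `flash-attn` package, and an illustrative model path) is to pass the flag at load time:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Illustrative model path; substitute the checkpoint you are fine-tuning.
model_path = "openbmb/MiniCPM-V-2"

# Sketch: load the model with FlashAttention-2 enabled.
# Assumes transformers >= 4.36 and flash-attn are installed; FlashAttention-2
# requires half precision (fp16/bf16) and a CUDA device.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```

Some model revisions may instead read an `_attn_implementation` field from their `config.json`; if the keyword argument appears to have no effect, check the model's configuration file.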