diff --git a/finetune/dataset.py b/finetune/dataset.py
index 1774fa4..567330d 100644
--- a/finetune/dataset.py
+++ b/finetune/dataset.py
@@ -66,24 +66,26 @@ class SupervisedDataset(Dataset):
         return ret
 
 
+def data_collator(examples, padding_value=0, max_length=2048):
+    def trim_and_pad(seq, batch_first, padding_value):
+        return pad_sequence([s[:max_length] for s in seq], batch_first=batch_first, padding_value=padding_value)
 
-def data_collator(examples, padding_value=0):
-    input_ids = pad_sequence(
+    input_ids = trim_and_pad(
         [example["input_ids"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
     )
-    position_ids = pad_sequence(
+    position_ids = trim_and_pad(
         [example["position_ids"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
     )
-    targets = pad_sequence(
+    targets = trim_and_pad(
         [example["labels"] for example in examples],
         batch_first=True,
-        padding_value=padding_value,
+        padding_value=-100,
     )
-    attention_mask = pad_sequence(
+    attention_mask = trim_and_pad(
         [example["attention_mask"] for example in examples],
         batch_first=True,
         padding_value=padding_value,
diff --git a/finetune/finetune.py b/finetune/finetune.py
index 986abba..2738555 100644
--- a/finetune/finetune.py
+++ b/finetune/finetune.py
@@ -3,6 +3,7 @@ import json
 import logging
 import os
 from dataclasses import dataclass, field
+from functools import partial
 from typing import Dict, List, Optional, Union, Literal, Tuple
 from types import MethodType
 import torch
@@ -133,6 +134,7 @@ def make_supervised_data_module(
     patch_size=14,
     query_nums=64,
     batch_vision=False,
+    max_length=2048,
 ) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
     dataset_cls = SupervisedDataset
@@ -169,7 +171,7 @@ def make_supervised_data_module(
     return dict(
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        data_collator=data_collator,
+        data_collator=partial(data_collator, max_length=max_length),
     )
 
 
@@ -287,6 +289,7 @@ def train():
         patch_size=model.config.patch_size,
         query_nums=model.config.query_num,
         batch_vision=batch_vision,
+        max_length=training_args.model_max_length,
     )
 
     trainer = CPMTrainer(
diff --git a/finetune/finetune_lora.sh b/finetune/finetune_lora.sh
index 6c5e3a1..be3fb38 100644
--- a/finetune/finetune_lora.sh
+++ b/finetune/finetune_lora.sh
@@ -42,7 +42,7 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
     --output_dir output/output_minicpmv2_lora \
    --logging_dir output/output_minicpmv2_lora \
    --logging_strategy "steps" \
-    --per_device_train_batch_size w \
+    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "steps" \
@@ -57,5 +57,4 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
    --logging_steps 1 \
    --gradient_checkpointing true \
    --deepspeed ds_config_zero2.json \
-    --report_to "tensorboard" \ # wandb
-
+    --report_to "tensorboard" # wandb
diff --git a/finetune/readme.md b/finetune/readme.md
index a347e33..6bf6d8d 100644
--- a/finetune/readme.md
+++ b/finetune/readme.md
@@ -90,6 +90,22 @@ model = AutoPeftModelForCausalLM.from_pretrained(
 ).eval()
 ```
 
+
+### Model Fine-tuning Memory Usage Statistics
+
+The following table shows the model's GPU memory usage when fine-tuning on NVIDIA A100 (80 GiB) GPUs with different numbers of GPUs. Fine-tuning was performed with DeepSpeed ZeRO-2 optimization and gradient checkpointing, with the maximum length set to 2048 and the batch size set to 1.
+
+| Fine-tuning Method | 2 GPUs | 4 GPUs | 8 GPUs |
+|--------------------|--------|--------|--------|
+| LoRA Fine-tuning | 31.2 GiB | 29.3 GiB | 28.4 GiB |
+| Full-parameter Fine-tuning | Out of memory | 75.0 GiB | 51.2 GiB |
+
+### Notes
+- **Fine-tuning Method**: The two fine-tuning strategies compared: LoRA fine-tuning and full-parameter fine-tuning.
+- **Number of GPUs**: The table lists memory usage for configurations with 2, 4, and 8 GPUs.
+- **Memory Usage**: Reported in GiB; the memory required by each fine-tuning method under the corresponding GPU configuration.
+- **Out of memory**: Indicates that GPU memory was insufficient for full-parameter fine-tuning under that configuration.
+
 ### Finetuning FAQs
Q: How do I use the `flash_attention_2` implementation when loading a pretrained model?
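One common way to request it (a minimal sketch, not necessarily the officially documented route for every MiniCPM-V revision: it assumes a `transformers` release recent enough to accept the `attn_implementation` argument, an installed `flash-attn` package, and an illustrative model path) is to pass the flag at load time:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Illustrative model path; substitute the checkpoint you are fine-tuning.
model_path = "openbmb/MiniCPM-V-2"

# Sketch: load the model with FlashAttention-2 enabled.
# Assumes transformers >= 4.36 and flash-attn are installed; FlashAttention-2
# requires half precision (fp16/bf16) and a CUDA device.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```

Some model revisions may instead read an `_attn_implementation` field from their `config.json`; if the keyword argument appears to have no effect, check the model's configuration file.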