From e002c0e6ec2418e25e4019fb6ca911a2dd18adc5 Mon Sep 17 00:00:00 2001 From: qianyu chen <38046403+qyc-98@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:32:17 +0800 Subject: [PATCH 1/9] =?UTF-8?q?zero3=E6=94=AF=E6=8C=81=20(#273)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update for lora's modules_to_save --- finetune/finetune.py | 8 +++----- finetune/readme.md | 28 +++++++++++++--------------- finetune/trainer.py | 41 ++++++++++------------------------------- 3 files changed, 26 insertions(+), 51 deletions(-) diff --git a/finetune/finetune.py b/finetune/finetune.py index 3333454..671d6c7 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -250,6 +250,9 @@ def train(): rank0_print("Currently using LoRA for fine-tuning the MiniCPM-V model.") for name, param in model.llm.named_parameters(): param.requires_grad = False + modules_to_save = ['embed_tokens','resampler'] + if training_args.tune_vision: + modules_to_save.append('vpm') lora_config = LoraConfig( r=lora_args.lora_r, lora_alpha=lora_args.lora_alpha, @@ -257,7 +260,6 @@ def train(): lora_dropout=lora_args.lora_dropout, bias=lora_args.lora_bias, layers_to_transform=lora_args.lora_layers_to_transform, - task_type="CAUSAL_LM", ) if not hasattr(model, 'get_input_embeddings'): def get_input_embeddings(self): @@ -268,10 +270,6 @@ def train(): model, use_gradient_checkpointing=training_args.gradient_checkpointing ) model = get_peft_model(model, lora_config) - model.base_model.resampler.requires_grad_(True) - model.base_model.llm.model.embed_tokens.weight.requires_grad_(True) - if training_args.tune_vision: - model.base_model.vpm.requires_grad_(True) if training_args.gradient_checkpointing: model.enable_input_require_grads() diff --git a/finetune/readme.md b/finetune/readme.md index 2f4e095..c656c0d 100644 --- a/finetune/readme.md +++ b/finetune/readme.md @@ -80,20 +80,16 @@ sh finetune_lora.sh After training, you could load the model with the path to the adapter. We advise you to use absolute path for your pretrained model. This is because LoRA only saves the adapter and the absolute path in the adapter configuration json file is used for finding out the pretrained model to load. ``` -from peft import AutoPeftModelForCausalLM +from peft import AutoPeftModel -path_to_adapter="path_to_adapter" +path_to_adapter="path_to_your_fine_tuned_checkpoint" -model = AutoPeftModelForCausalLM.from_pretrained( +model = AutoPeftModel.from_pretrained( # path to the output directory path_to_adapter, device_map="auto", trust_remote_code=True -).eval() - -vpm_resampler_embedtokens_weight = torch.load(f"{path_to_adapter}/vpm_resampler_embedtokens.pt") - -msg = model.load_state_dict(vpm_resampler_embedtokens_weight, strict=False) +).eval().cuda() ``` @@ -173,14 +169,16 @@ A: The error as described in [issues 168](https://github.com/OpenBMB/MiniCPM-V/i 1.**Reload the Fine-Tuned Model:** Make sure you correctly load the checkpoint that has been fine-tuned using lora techniques. 
Use the following code example to guide you: ```python - from peft import AutoPeftModelForCausalLM + from peft import AutoPeftModel - model = AutoPeftModelForCausalLM.from_pretrained( - 'path_to_your_fine_tuned_checkpoint', # Path to your fine-tuned checkpoint directory - output='output/minicpmv2_lora', - device_map='auto', - trust_remote_code=True - ).eval() +path_to_adapter="path_to_your_fine_tuned_checkpoint" + +model = AutoPeftModel.from_pretrained( + # path to the output directory + path_to_adapter, + device_map="auto", + trust_remote_code=True +).eval().cuda() ``` 2.**Update the `model_minicpmv.py` File:** - **Verification:** Make sure you verify and update your `model_minicpmv.py` file to ensure it is the latest version. diff --git a/finetune/trainer.py b/finetune/trainer.py index fa57bd0..cc45c97 100644 --- a/finetune/trainer.py +++ b/finetune/trainer.py @@ -15,19 +15,12 @@ class CPMTrainer(Trainer): else: labels = None self.model.resampler.pos_embed = self.model.resampler.pos_embed.to(self.model.device) - if is_deepspeed_zero3_enabled(): - with deepspeed.zero.GatheredParameters(self.model.resampler.attn.parameters(), modifier_rank=0): - if not self.args.use_lora: - outputs = self.model(data = inputs, use_cache=False) - else: - with self.model._enable_peft_forward_hooks(**inputs): - outputs = self.model.base_model(data = inputs, use_cache=False) + + if not self.args.use_lora: + outputs = self.model(data = inputs, use_cache=False) else: - if not self.args.use_lora: - outputs = self.model(data = inputs, use_cache=False) - else: - with self.model._enable_peft_forward_hooks(**inputs): - outputs = self.model.base_model(data = inputs, use_cache=False) + with self.model._enable_peft_forward_hooks(**inputs): + outputs = self.model.base_model(data = inputs, use_cache=False) if labels is not None: # Flatten the tokens @@ -215,11 +208,7 @@ class CPMTrainer(Trainer): with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: - if is_deepspeed_zero3_enabled(): - with deepspeed.zero.GatheredParameters(self.model.resampler.attn.parameters(), modifier_rank=0): - self.accelerator.backward(loss) - else: - self.accelerator.backward(loss) + self.accelerator.backward(loss) return loss.detach() / self.args.gradient_accumulation_steps @@ -249,20 +238,10 @@ class CPMTrainer(Trainer): else: torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - if self.args.use_lora: - from collections import OrderedDict - state_dict_vision = OrderedDict() - for key, values in state_dict.items(): - if 'vpm' in key or 'resampler' in key or 'embed_tokens' in key: - state_dict_vision[key] = values - self.model.save_pretrained( - output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors - ) - torch.save(state_dict_vision, f"{output_dir}/vpm_resampler_embedtokens.pt", ) - else: - self.model.save_pretrained( - output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors - ) + + self.model.save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) From dc68caefbdd7cf32eca80d3145a8ef38b1052671 Mon Sep 17 00:00:00 2001 From: qianyu chen <38046403+qyc-98@users.noreply.github.com> Date: Wed, 17 Jul 2024 09:30:28 +0800 Subject: [PATCH 2/9] Update trainer.py --- finetune/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/finetune/trainer.py b/finetune/trainer.py index cc45c97..2bf33c4 100644 --- a/finetune/trainer.py +++ 
b/finetune/trainer.py @@ -14,7 +14,6 @@ class CPMTrainer(Trainer): labels = inputs.pop("labels") else: labels = None - self.model.resampler.pos_embed = self.model.resampler.pos_embed.to(self.model.device) if not self.args.use_lora: outputs = self.model(data = inputs, use_cache=False) From cb8d4fc5a40c8d688e23e004258287e1ed5fba27 Mon Sep 17 00:00:00 2001 From: qianyu chen <38046403+qyc-98@users.noreply.github.com> Date: Wed, 17 Jul 2024 09:30:35 +0800 Subject: [PATCH 3/9] Update finetune.py --- finetune/finetune.py | 1 + 1 file changed, 1 insertion(+) diff --git a/finetune/finetune.py b/finetune/finetune.py index 671d6c7..d2d2cff 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -260,6 +260,7 @@ def train(): lora_dropout=lora_args.lora_dropout, bias=lora_args.lora_bias, layers_to_transform=lora_args.lora_layers_to_transform, + modules_to_save=modules_to_save, ) if not hasattr(model, 'get_input_embeddings'): def get_input_embeddings(self): From 6c58d38e9e14b2eafb71940927bae34bb229bae1 Mon Sep 17 00:00:00 2001 From: qianyu chen <38046403+qyc-98@users.noreply.github.com> Date: Wed, 17 Jul 2024 10:10:36 +0800 Subject: [PATCH 4/9] Update readme.md --- finetune/readme.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/finetune/readme.md b/finetune/readme.md index c656c0d..4fe6026 100644 --- a/finetune/readme.md +++ b/finetune/readme.md @@ -80,12 +80,18 @@ sh finetune_lora.sh After training, you could load the model with the path to the adapter. We advise you to use absolute path for your pretrained model. This is because LoRA only saves the adapter and the absolute path in the adapter configuration json file is used for finding out the pretrained model to load. ``` -from peft import AutoPeftModel - +from peft import PeftModel +from transformers import AutoModel +model_type="openbmb/MiniCPM-Llama3-V-2_5" # or openbmb/MiniCPM-V-2 path_to_adapter="path_to_your_fine_tuned_checkpoint" -model = AutoPeftModel.from_pretrained( - # path to the output directory +model = AutoModel.from_pretrained( + model_type, + trust_remote_code=True + ) + +lora_model = PeftModel.from_pretrained( + model, path_to_adapter, device_map="auto", trust_remote_code=True From 091236ad34667b27f9915907e2d0295c42893a0b Mon Sep 17 00:00:00 2001 From: qianyu chen <38046403+qyc-98@users.noreply.github.com> Date: Wed, 17 Jul 2024 14:37:18 +0800 Subject: [PATCH 5/9] Update finetune_lora.sh --- finetune/finetune_lora.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetune/finetune_lora.sh b/finetune/finetune_lora.sh index 49c375b..22cf5a2 100644 --- a/finetune/finetune_lora.sh +++ b/finetune/finetune_lora.sh @@ -37,7 +37,7 @@ torchrun $DISTRIBUTED_ARGS finetune.py \ --tune_vision true \ --tune_llm false \ --use_lora true \ - --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj)" \ + --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)" \ --model_max_length 2048 \ --max_slice_nums 9 \ --max_steps 10000 \ From 247a576a6b2a38bd2203e828d05b36ccee76daae Mon Sep 17 00:00:00 2001 From: qianyu chen <38046403+qyc-98@users.noreply.github.com> Date: Thu, 18 Jul 2024 19:23:45 +0800 Subject: [PATCH 6/9] Update finetune.py --- finetune/finetune.py | 48 +------------------------------------------- 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/finetune/finetune.py b/finetune/finetune.py index d2d2cff..1c42a17 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -66,42 +66,6 @@ class LoraArguments: 
lora_layer_replication: Optional[List[Tuple[int, int]]] = None lora_layers_to_transform: Optional[List[int]] = None lora_layers_pattern: Optional[str] = None - -def maybe_zero_3(param): - if hasattr(param, "ds_id"): - assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE - with zero.GatheredParameters([param]): - param = param.data.detach().cpu().clone() - else: - param = param.detach().cpu().clone() - return param - - -# Borrowed from peft.utils.get_peft_model_state_dict -def get_peft_state_maybe_zero_3(named_params, bias): - if bias == "none": - to_return = {k: t for k, t in named_params if "lora_" in k} - elif bias == "all": - to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} - elif bias == "lora_only": - to_return = {} - maybe_lora_bias = {} - lora_bias_names = set() - for k, t in named_params: - if "lora_" in k: - to_return[k] = t - bias_name = k.split("lora_")[0] + "bias" - lora_bias_names.add(bias_name) - elif "bias" in k: - maybe_lora_bias[k] = t - for k, t in maybe_lora_bias: - if bias_name in lora_bias_names: - to_return[bias_name] = t - else: - raise NotImplementedError - to_return = {k: maybe_zero_3(v) for k, v in to_return.items()} - return to_return - local_rank = None def rank0_print(*args): @@ -111,18 +75,8 @@ def rank0_print(*args): def safe_save_model_for_hf_trainer(trainer, output_dir: str, bias="none"): """Collects the state dict and dump to disk.""" - # check if zero3 mode enabled - if deepspeed.is_deepspeed_zero3_enabled(): - state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict() - else: - if trainer.args.use_lora: - state_dict = get_peft_state_maybe_zero_3( - trainer.model.named_parameters(), bias - ) - else: - state_dict = trainer.model.state_dict() if trainer.args.should_save and trainer.args.local_rank == 0: - trainer._save(output_dir, state_dict=state_dict) + trainer.save_model(output_dir,) def make_supervised_data_module( From ffa1e24a6cb06a56e18038481892c7cb4b8d4999 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:08:05 +0800 Subject: [PATCH 7/9] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e46b405..110dce4 100644 --- a/README.md +++ b/README.md @@ -614,7 +614,7 @@ MiniCPM-Llama3-V 2.5 can run with llama.cpp now! See our fork of [llama.cpp](htt ### Inference with vLLM
-Click to see how to inference MiniCPM-V 2.0 with vLLM (MiniCPM-Llama3-V 2.5 coming soon) +Click to see how to inference MiniCPM-V 2.0 and MiniCPM-Llama3-V 2.5 with vLLM Because our pull request to vLLM is still waiting for reviewing, we fork this repository to build and test our vLLM demo. Here are the steps: 1. Clone our version of vLLM: @@ -624,6 +624,7 @@ git clone https://github.com/OpenBMB/vllm.git 2. Install vLLM: ```shell cd vllm +git checkout minicpmv pip install -e . ``` 3. Install timm: From d31583d1bf082f2267cd6d78d7334f9fff97d993 Mon Sep 17 00:00:00 2001 From: Hongji Zhu Date: Fri, 19 Jul 2024 16:23:33 +0800 Subject: [PATCH 8/9] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 110dce4..f59a9d1 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Join our 💬 WeChat ## News #### 📌 Pinned - +* [2024.07.19] MiniCPM-Llama3-V 2.5 supports vLLM now! See [here](#vllm). * [2024.05.28] 🚀🚀🚀 MiniCPM-Llama3-V 2.5 now fully supports its feature in llama.cpp and ollama! Please pull the latest code **of our provided forks** ([llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md), [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5)). GGUF models in various sizes are available [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main). MiniCPM-Llama3-V 2.5 series is **not supported by the official repositories yet**, and we are working hard to merge PRs. Please stay tuned! * [2024.05.28] 💫 We now support LoRA fine-tuning for MiniCPM-Llama3-V 2.5, using only 2 V100 GPUs! See more statistics [here](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics). * [2024.05.23] 🔍 We've released a comprehensive comparison between Phi-3-vision-128k-instruct and MiniCPM-Llama3-V 2.5, including benchmarks evaluations, multilingual capabilities, and inference efficiency 🌟📊🌍🚀. Click [here](./docs/compare_with_phi-3_vision.md) to view more details. From ba27a162aa236e17036bb903dc13862753f8e484 Mon Sep 17 00:00:00 2001 From: Hongji Zhu Date: Fri, 19 Jul 2024 19:19:45 +0800 Subject: [PATCH 9/9] Update readme, support vLLM --- README_en.md | 7 +++++-- README_zh.md | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README_en.md b/README_en.md index 8c950fd..f59a9d1 100644 --- a/README_en.md +++ b/README_en.md @@ -29,7 +29,7 @@ Join our 💬 WeChat ## News #### 📌 Pinned - +* [2024.07.19] MiniCPM-Llama3-V 2.5 supports vLLM now! See [here](#vllm). * [2024.05.28] 🚀🚀🚀 MiniCPM-Llama3-V 2.5 now fully supports its feature in llama.cpp and ollama! Please pull the latest code **of our provided forks** ([llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md), [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5)). GGUF models in various sizes are available [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main). MiniCPM-Llama3-V 2.5 series is **not supported by the official repositories yet**, and we are working hard to merge PRs. Please stay tuned! * [2024.05.28] 💫 We now support LoRA fine-tuning for MiniCPM-Llama3-V 2.5, using only 2 V100 GPUs! See more statistics [here](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics). 
* [2024.05.23] 🔍 We've released a comprehensive comparison between Phi-3-vision-128k-instruct and MiniCPM-Llama3-V 2.5, including benchmarks evaluations, multilingual capabilities, and inference efficiency 🌟📊🌍🚀. Click [here](./docs/compare_with_phi-3_vision.md) to view more details. @@ -566,6 +566,8 @@ You will get the following output: "The Airbus A380 is a double-deck, wide-body, four-engine jet airliner made by Airbus. It is the world's largest passenger airliner and is known for its long-haul capabilities. The aircraft was developed to improve efficiency and comfort for passengers traveling over long distances. It has two full-length passenger decks, which can accommodate more passengers than a typical single-aisle airplane. The A380 has been operated by airlines such as Lufthansa, Singapore Airlines, and Emirates, among others. It is widely recognized for its unique design and significant impact on the aviation industry." ``` +### Inference on Multiple GPUs +You can run MiniCPM-Llama3-V 2.5 on multiple low VRAM GPUs (12 GB or 16 GB) by distributing the model's layers across multiple GPUs. Please refer to this [tutorial](https://github.com/OpenBMB/MiniCPM-V/blob/main/docs/inference_on_multiple_gpus.md) for detailed instructions on how to load the model and inference using multiple low VRAM GPUs. ### Inference on Mac @@ -612,7 +614,7 @@ MiniCPM-Llama3-V 2.5 can run with llama.cpp now! See our fork of [llama.cpp](htt ### Inference with vLLM
-Click to see how to inference MiniCPM-V 2.0 with vLLM (MiniCPM-Llama3-V 2.5 coming soon) +Click to see how to inference MiniCPM-V 2.0 and MiniCPM-Llama3-V 2.5 with vLLM Because our pull request to vLLM is still waiting for reviewing, we fork this repository to build and test our vLLM demo. Here are the steps: 1. Clone our version of vLLM: @@ -622,6 +624,7 @@ git clone https://github.com/OpenBMB/vllm.git 2. Install vLLM: ```shell cd vllm +git checkout minicpmv pip install -e . ``` 3. Install timm: diff --git a/README_zh.md b/README_zh.md index 1a44d8c..2d8895c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -32,6 +32,7 @@ #### 📌 置顶 +* [2024.07.19] MiniCPM-Llama3-V 2.5 现已支持[vLLM](#vllm) ! * [2024.05.28] 💥 MiniCPM-Llama3-V 2.5 现在在 llama.cpp 和 ollama 中完全支持其功能!**请拉取我们最新的 fork 来使用**:[llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) & [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5)。我们还发布了各种大小的 GGUF 版本,请点击[这里](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main)查看。请注意,**目前官方仓库尚未支持 MiniCPM-Llama3-V 2.5**,我们也正积极推进将这些功能合并到 llama.cpp & ollama 官方仓库,敬请关注! * [2024.05.28] 💫 我们现在支持 MiniCPM-Llama3-V 2.5 的 LoRA 微调,更多内存使用统计信息可以在[这里](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics)找到。 * [2024.05.23] 🔍 我们添加了Phi-3-vision-128k-instruct 与 MiniCPM-Llama3-V 2.5的全面对比,包括基准测试评估、多语言能力和推理效率 🌟📊🌍🚀。点击[这里](./docs/compare_with_phi-3_vision.md)查看详细信息。 @@ -643,7 +644,7 @@ MiniCPM-Llama3-V 2.5 现在支持llama.cpp啦! 用法请参考我们的fork [lla ### vLLM 部署
-点击查看 MiniCPM-V 2.0 利用vLLM 部署运行的方法(MiniCPM-Llama3-V 2.5 支持vLLM将在近期推出) +点击查看 MiniCPM-V 2.0 和 MiniCPM-Llama3-V 2.5 利用vLLM 部署运行的方法 由于我们对 vLLM 提交的 PR 还在 review 中,因此目前我们 fork 了一个 vLLM 仓库以供测试使用。 1. 首先克隆我们 fork 的 vLLM 库: @@ -653,6 +654,7 @@ git clone https://github.com/OpenBMB/vllm.git 2. 安装 vLLM 库: ```shell cd vllm +git checkout minicpmv pip install -e . ``` 3. 安装 timm 库:
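
Taken together, these patches move the extra trainable modules (`embed_tokens`, `resampler`, and optionally `vpm`) into the LoRA config's `modules_to_save`, so PEFT now saves and restores them together with the adapter and the separate `vpm_resampler_embedtokens.pt` file is no longer needed; this is also what lets the ZeRO-3 gather/save workarounds in `trainer.py` and `finetune.py` be deleted. The sketch below is a minimal illustration, not part of the patches, of how a checkpoint produced by the updated `finetune_lora.sh` might be reloaded and then merged into a standalone model, for example before serving it with the vLLM fork described above. The paths are placeholders, `merged_output_dir` is a hypothetical destination, and the `merge_and_unload` step is standard PEFT usage assumed here rather than something the patches add.

```python
import torch
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer

model_type = "openbmb/MiniCPM-Llama3-V-2_5"             # or "openbmb/MiniCPM-V-2"
path_to_adapter = "path_to_your_fine_tuned_checkpoint"  # the output_dir used by finetune_lora.sh
merged_output_dir = "path_to_merged_model"              # hypothetical destination directory

# Load the frozen base model first, then attach the trained adapter on top of it.
# Because the adapter was trained with modules_to_save=['embed_tokens', 'resampler']
# (plus 'vpm' when --tune_vision is true), PeftModel restores those fully fine-tuned
# modules from the checkpoint as well, so no extra .pt file has to be loaded.
model = AutoModel.from_pretrained(
    model_type,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
lora_model = PeftModel.from_pretrained(
    model,
    path_to_adapter,
    device_map="auto",
    trust_remote_code=True,
).eval()

# Optionally fold the LoRA weights back into the base model and save a standalone
# checkpoint, which is convenient when the model is served elsewhere.
merged_model = lora_model.merge_and_unload()
merged_model.save_pretrained(merged_output_dir, safe_serialization=True)
AutoTokenizer.from_pretrained(model_type, trust_remote_code=True).save_pretrained(merged_output_dir)
```

Merging is optional: keeping the adapter separate, as in the updated readme examples, works as well, but a merged checkpoint avoids having to distribute both the base model reference and the adapter directory.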