
@@ -522,34 +1301,44 @@ pip install -r requirements.txt
```python
-from chat import MiniCPMVChat, img2base64
import torch
-import json
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
torch.manual_seed(0)
-chat_model = MiniCPMVChat('openbmb/MiniCPM-Llama3-V-2_5')
+model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
-im_64 = img2base64('./assets/airplane.jpeg')
+image = Image.open('./assets/airplane.jpeg').convert('RGB')
# First round chat
-msgs = [{"role": "user", "content": "Tell me the model of this aircraft."}]
+question = "Tell me the model of this aircraft."
+msgs = [{'role': 'user', 'content': [image, question]}]
-inputs = {"image": im_64, "question": json.dumps(msgs)}
-answer = chat_model.chat(inputs)
+answer = model.chat(
+ image=None,
+ msgs=msgs,
+ tokenizer=tokenizer
+)
print(answer)
# Second round chat
# pass history context of multi-turn conversation
-msgs.append({"role": "assistant", "content": answer})
-msgs.append({"role": "user", "content": "Introduce something about Airbus A380."})
+msgs.append({"role": "assistant", "content": [answer]})
+msgs.append({"role": "user", "content": ["Introduce something about Airbus A380."]})
-inputs = {"image": im_64, "question": json.dumps(msgs)}
-answer = chat_model.chat(inputs)
+answer = model.chat(
+ image=None,
+ msgs=msgs,
+ tokenizer=tokenizer
+)
print(answer)
```
-可以得到以下输出:
+You will get the following output:
```
"The aircraft in the image is an Airbus A380, which can be identified by its large size, double-deck structure, and the distinctive shape of its wings and engines. The A380 is a wide-body aircraft known for being the world's largest passenger airliner, designed for long-haul flights. It has four engines, which are characteristic of large commercial aircraft. The registration number on the aircraft can also provide specific information about the model if looked up in an aviation database."
@@ -557,15 +1346,137 @@ print(answer)
"The Airbus A380 is a double-deck, wide-body, four-engine jet airliner made by Airbus. It is the world's largest passenger airliner and is known for its long-haul capabilities. The aircraft was developed to improve efficiency and comfort for passengers traveling over long distances. It has two full-length passenger decks, which can accommodate more passengers than a typical single-aisle airplane. The A380 has been operated by airlines such as Lufthansa, Singapore Airlines, and Emirates, among others. It is widely recognized for its unique design and significant impact on the aviation industry."
```
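+
+The same `chat` interface can also stream the reply token by token. A minimal sketch, assuming the `sampling=True` / `stream=True` options described on the MiniCPM-V 2.6 model card, which make `chat` return a generator of text chunks:
+
+```python
+# Streaming variant: reuses `model`, `tokenizer` and `msgs` from the example above.
+res = model.chat(
+    image=None,
+    msgs=msgs,
+    tokenizer=tokenizer,
+    sampling=True,   # streaming requires sampling-based decoding
+    stream=True      # return a generator of text chunks instead of a single string
+)
+
+generated_text = ""
+for new_text in res:
+    generated_text += new_text
+    print(new_text, flush=True, end='')
+```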
-
-
-
-### Mac 推理
+#### Chat with multiple images
-点击查看 MiniCPM-Llama3-V 2.5 / MiniCPM-V 2.0 基于Mac MPS运行 (Apple silicon 或 AMD GPUs)的示例。
 Click to view Python code running MiniCPM-V 2.6 with multiple images as input.
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+
+model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
+
+image1 = Image.open('image1.jpg').convert('RGB')
+image2 = Image.open('image2.jpg').convert('RGB')
+question = 'Compare image 1 and image 2, tell me about the differences between image 1 and image 2.'
+
+msgs = [{'role': 'user', 'content': [image1, image2, question]}]
+
+answer = model.chat(
+ image=None,
+ msgs=msgs,
+ tokenizer=tokenizer
+)
+print(answer)
+```
+
+
+#### In-context few-shot learning
+
+ Click to view Python code running MiniCPM-V 2.6 with few-shot input.
```python
-# test.py Need more than 16GB memory to run.
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+
+model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
+
+question = "production date"
+image1 = Image.open('example1.jpg').convert('RGB')
+answer1 = "2023.08.04"
+image2 = Image.open('example2.jpg').convert('RGB')
+answer2 = "2007.04.24"
+image_test = Image.open('test.jpg').convert('RGB')
+
+msgs = [
+ {'role': 'user', 'content': [image1, question]}, {'role': 'assistant', 'content': [answer1]},
+ {'role': 'user', 'content': [image2, question]}, {'role': 'assistant', 'content': [answer2]},
+ {'role': 'user', 'content': [image_test, question]}
+]
+
+answer = model.chat(
+ image=None,
+ msgs=msgs,
+ tokenizer=tokenizer
+)
+print(answer)
+```
+
+
+#### Chat with video
+
+ Click to view Python code running MiniCPM-V 2.6 with video input.
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+from decord import VideoReader, cpu # pip install decord
+
+model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
+
+MAX_NUM_FRAMES=64 # if cuda OOM set a smaller number
+
+def encode_video(video_path):
+ def uniform_sample(l, n):
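+        # Evenly sample n indices from l: take the middle element of each of n equal-sized chunks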
+ gap = len(l) / n
+ idxs = [int(i * gap + gap / 2) for i in range(n)]
+ return [l[i] for i in idxs]
+
+ vr = VideoReader(video_path, ctx=cpu(0))
+    sample_fps = round(vr.get_avg_fps() / 1)  # frame stride ≈ source FPS, i.e. sample about one frame per second
+ frame_idx = [i for i in range(0, len(vr), sample_fps)]
+ if len(frame_idx) > MAX_NUM_FRAMES:
+ frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+ frames = vr.get_batch(frame_idx).asnumpy()
+ frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+ print('num frames:', len(frames))
+ return frames
+
+video_path="video_test.mp4"
+frames = encode_video(video_path)
+question = "Describe the video"
+msgs = [
+ {'role': 'user', 'content': frames + [question]},
+]
+
+# Set decode params for video
+params = {}
+params["use_image_id"] = False
+params["max_slice_nums"] = 2 # use 1 if cuda OOM and video resolution > 448*448
+
+answer = model.chat(
+ image=None,
+ msgs=msgs,
+ tokenizer=tokenizer,
+ **params
+)
+print(answer)
+```
+
+
+
+### Inference on Multiple GPUs
+You can run MiniCPM-Llama3-V 2.5 on multiple low VRAM GPUs (12 GB or 16 GB) by distributing the model's layers across multiple GPUs. Please refer to this [tutorial](https://github.com/OpenBMB/MiniCPM-V/blob/main/docs/inference_on_multiple_gpus.md) for detailed instructions on how to load the model and inference using multiple low VRAM GPUs.
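+
+The linked tutorial builds an explicit per-layer device map. As a simpler, hedged sketch (not the tutorial's exact code), you can also let Hugging Face Accelerate split the layers automatically with `device_map='auto'`; the `max_memory` caps below are illustrative values for two 12 GB cards and may need adjusting per model:
+
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+# Shard the model across all visible GPUs (requires `pip install accelerate`).
+model = AutoModel.from_pretrained(
+    'openbmb/MiniCPM-Llama3-V-2_5',
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    device_map='auto',                    # let Accelerate place layers on the available GPUs
+    max_memory={0: '11GiB', 1: '11GiB'}   # illustrative per-GPU caps for two 12 GB cards
+)
+model = model.eval()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
+# model.chat(...) can then be used exactly as in the single-GPU examples above.
+```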
+
+
+### Inference on Mac
+
+Click to view an example of running MiniCPM-Llama3-V 2.5 on 💻 Mac with MPS (Apple silicon or AMD GPUs).
+
+```python
+# test.py  Requires more than 16 GB of memory to run.
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
@@ -589,121 +1500,160 @@ answer, context, _ = model.chat(
)
print(answer)
```
-运行:
+Run with the following command:
```shell
PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py
```
+### Deployment on Mobile Phone
+MiniCPM-V 2.0 can be deployed on mobile phones with Android operating systems. 🚀 Click [MiniCPM-V 2.0](https://github.com/OpenBMB/mlc-MiniCPM) to install the apk.
-### 手机端部署
-MiniCPM-V 2.0 可运行在Android手机上, 点击[2.0](https://github.com/OpenBMB/mlc-MiniCPM)安装apk使用; MiniCPM-Llama3-V 2.5 将很快推出,敬请期待。
+### Inference with llama.cpp
+MiniCPM-V 2.6 can now run with llama.cpp! See [our fork of llama.cpp](https://github.com/OpenBMB/llama.cpp/tree/minicpmv-main/examples/llava/README-minicpmv2.6.md) for more details. This implementation supports smooth inference at 16~18 tokens/s on iPad (test environment: iPad Pro + M4).
+
+### Inference with ollama
+MiniCPM-V 2.6 can now run with ollama! See [our fork of ollama](https://github.com/OpenBMB/ollama/blob/minicpm-v2.6/examples/minicpm-v2.6/README.md) for more details. This implementation supports smooth inference at 16~18 tokens/s on iPad (test environment: iPad Pro + M4).
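+
+The local ollama server can also be called from Python through the official `ollama` client once the model has been created as described in the fork's README. A minimal sketch; the tag `minicpm-v2.6` below is a placeholder for whatever name you gave the model:
+
+```python
+# pip install ollama  (an ollama server with the model must be running locally)
+import ollama
+
+response = ollama.chat(
+    model='minicpm-v2.6',                      # placeholder: use your local model tag
+    messages=[{
+        'role': 'user',
+        'content': 'What is in this image?',
+        'images': ['./assets/airplane.jpeg'],  # image files are passed by path
+    }]
+)
+print(response['message']['content'])
+```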
+
+### Inference with vLLM
-### 本地WebUI Demo部署
-点击查看本地WebUI demo 在 NVIDIA GPU, Mac等不同设备部署方法
-
-```shell
-pip install -r requirements.txt
-```
-
-```shell
-# For NVIDIA GPUs, run:
-python web_demo_2.5.py --device cuda
+vLLM now officially supports MiniCPM-V 2.6, MiniCPM-Llama3-V 2.5 and MiniCPM-V 2.0. Click to view the details.
-# For Mac with MPS (Apple silicon or AMD GPUs), run:
-PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.5.py --device mps
+1. Install vLLM (>= 0.5.4):
+```shell
+pip install vllm
```
+2. Install timm (optional; only MiniCPM-V 2.0 needs it):
+```shell
+pip install timm==0.9.10
+```
+3. Run the example (for image input):
+```python
+from transformers import AutoTokenizer
+from PIL import Image
+from vllm import LLM, SamplingParams
+
+MODEL_NAME = "openbmb/MiniCPM-V-2_6"
+# Also available for previous models
+# MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5"
+# MODEL_NAME = "HwwwH/MiniCPM-V-2"
+
+image = Image.open("xxx.png").convert("RGB")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+llm = LLM(
+ model=MODEL_NAME,
+ trust_remote_code=True,
+ gpu_memory_utilization=1,
+ max_model_len=2048
+)
+
+messages = [{
+ "role":
+ "user",
+ "content":
+    # One "(<image>./</image>)" placeholder per input image
+    "(<image>./</image>)" + \
+ "\nWhat is the content of this image?"
+}]
+prompt = tokenizer.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True
+)
+
+# Single Inference
+inputs = {
+ "prompt": prompt,
+ "multi_modal_data": {
+ "image": image
+        # For multiple images, the number of images should equal the number of `(<image>./</image>)` placeholders
+ # "image": [image, image]
+ },
+}
+# Batch Inference
+# inputs = [{
+# "prompt": prompt,
+# "multi_modal_data": {
+# "image": image
+# },
+# } for _ in range(2)]
+
+
+# Stop tokens for MiniCPM-V 2.6
+stop_tokens = ['<|im_end|>', '<|endoftext|>']
+stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+# For MiniCPM-V 2.0:
+# stop_token_ids = [tokenizer.eos_id]
+# For MiniCPM-Llama3-V 2.5:
+# stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+
+sampling_params = SamplingParams(
+ stop_token_ids=stop_token_ids,
+ use_beam_search=True,
+ temperature=0,
+ best_of=3,
+ max_tokens=1024
+)
+
+outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+print(outputs[0].outputs[0].text)
+```
+4. Click [here](https://modelbest.feishu.cn/wiki/C2BWw4ZP0iCDy7kkCPCcX2BHnOf?from=from_copylink) if you want to use vLLM with *video* input or to get more details about `vLLM`; a rough sketch of the video case follows.
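+
+As a rough, hedged sketch of the video case (extrapolated from the image example above and the frame-sampling helper in the "Chat with video" section, not the wiki's exact code): sample frames from the video, repeat one `(<image>./</image>)` placeholder per frame in the prompt, and pass the frame list as the image input. Recent vLLM versions may additionally need `limit_mm_per_prompt` to allow several images in one prompt:
+
+```python
+# Reuses `tokenizer`, `sampling_params` and the `encode_video` helper defined earlier in this README.
+frames = encode_video("video_test.mp4")
+
+llm = LLM(
+    model=MODEL_NAME,
+    trust_remote_code=True,
+    max_model_len=4096,
+    limit_mm_per_prompt={"image": len(frames)}   # one image slot per sampled frame (assumption)
+)
+
+messages = [{
+    "role": "user",
+    "content": "(<image>./</image>)" * len(frames) + "\nDescribe the video"
+}]
+prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+inputs = {
+    "prompt": prompt,
+    "multi_modal_data": {"image": frames},       # list of PIL frames, one per placeholder
+}
+outputs = llm.generate(inputs, sampling_params=sampling_params)
+print(outputs[0].outputs[0].text)
+```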
-### llama.cpp 部署
-MiniCPM-Llama3-V 2.5 现在支持llama.cpp啦! 用法请参考我们的fork [llama.cpp](https://github.com/OpenBMB/llama.cpp/tree/minicpm-v2.5/examples/minicpmv), 在手机上可以支持 6~8 token/s 的流畅推理(测试环境:Xiaomi 14 pro + Snapdragon 8 Gen 3)。
+## Fine-tuning
-### vLLM 部署
-
-点击查看 vLLM 部署运行的方法
-由于我们对 vLLM 提交的 PR 还在 review 中,因此目前我们 fork 了一个 vLLM 仓库以供测试使用。
+### Simple Fine-tuning
-1. 首先克隆我们 fork 的 vLLM 库:
-```shell
-git clone https://github.com/OpenBMB/vllm.git
-```
-2. 安装 vLLM 库:
-```shell
-cd vllm
-pip install -e .
-```
-3. 安装 timm 库:
-```shell
-pip install timm=0.9.10
-```
-4. 测试运行示例程序:
-```shell
-python examples/minicpmv_example.py
-```
+We support simple fine-tuning of MiniCPM-V 2.0 and MiniCPM-Llama3-V 2.5 with the Hugging Face Transformers library.
+
+[Reference Document](./finetune/readme.md)
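+
+For orientation only, the snippet below writes a tiny training file in the conversation-style JSON layout used by the fine-tuning scripts (an image path plus user/assistant turns, with an `<image>` placeholder in the user content). The field names here are illustrative; follow the reference document above for the authoritative schema:
+
+```python
+import json
+
+# Hypothetical single-sample dataset; replace the path and texts with your own data.
+samples = [{
+    "id": "0",
+    "image": "path/to/image_0.jpg",
+    "conversations": [
+        {"role": "user", "content": "<image>\nWhat is in the picture?"},
+        {"role": "assistant", "content": "A white airplane parked at the gate."}
+    ]
+}]
+
+with open("train.json", "w", encoding="utf-8") as f:
+    json.dump(samples, f, ensure_ascii=False, indent=2)
+```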
+
+### With the SWIFT Framework
+
+We now support fine-tuning the MiniCPM-V series with the SWIFT framework. SWIFT supports the training, inference, evaluation and deployment of nearly 200 LLMs and MLLMs. It provides the lightweight training solutions from PEFT as well as a complete adapters library covering the latest techniques such as NEFTune, LoRA+ and LLaMA-PRO.
+
+Best Practices: [MiniCPM-V 1.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md), [MiniCPM-V 2.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md), [MiniCPM-V 2.6](https://github.com/modelscope/ms-swift/issues/1613).
+
+## FAQs
+Click here to view the [FAQs](./docs/faqs.md).
+
+## Model License
+
+* This repository is released under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE) License.
+
+* The usage of MiniCPM-V model weights must strictly follow [MiniCPM Model License.md](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md).
+
+* The models and weights of MiniCPM are completely free for academic research. After filling out a ["questionnaire"](https://modelbest.feishu.cn/share/base/form/shrcnpV5ZT9EJ6xYjh3Kx0J6v8g) for registration, they are also available for free commercial use.
+
+
+## Statement
+
+As LMMs, MiniCPM-V models (including OmniLMM) generate content by learning from a large amount of multimodal corpora, but they cannot comprehend, express personal opinions or make value judgements. Anything generated by MiniCPM-V models does not represent the views and positions of the model developers.
+
+We will not be liable for any problems arising from the use of MiniCPM-V models, including but not limited to data security issues, risks of public opinion, or any risks and problems arising from the misdirection, misuse, dissemination or improper use of the models.
-
+## Institutions
+
+This project is developed by the following institutions:
+
+- [THUNLP](https://nlp.csai.tsinghua.edu.cn/)
+- [ModelBest](https://modelbest.cn/)
+- [Zhihu](https://www.zhihu.com/)
+
+## 🌟 Star History
-## 微调
+
-### 简易微调
-
-我们支持使用 Huggingface Transformers 库简易地微调 MiniCPM-V 2.0 和 MiniCPM-Llama3-V 2.5 模型。
-
-[参考文档](./finetune/readme.md)
-
-### 使用 SWIFT 框架
-
-我们支持使用 SWIFT 框架微调 MiniCPM-V 系列模型。SWIFT 支持近 200 种大语言模型和多模态大模型的训练、推理、评测和部署。支持 PEFT 提供的轻量训练方案和完整的 Adapters 库支持的最新训练技术如 NEFTune、LoRA+、LLaMA-PRO 等。
-
-
-参考文档:[MiniCPM-V 1.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md), [MiniCPM-V 2.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md)
-
-## 未来计划
-
-- [x] 支持 MiniCPM-V 系列模型微调
-- [ ] 实时多模态交互代码开源
-
-
-
-
-## 模型协议
-
-本仓库中代码依照 Apache-2.0 协议开源
-
-本项目中模型权重的使用遵循 “[通用模型许可协议-来源说明-宣传限制-商业授权](https://github.com/OpenBMB/General-Model-License/blob/main/通用模型许可协议-来源说明-宣传限制-商业授权.md)”。
-
-本项目中模型权重对学术研究完全开放。
-
-如需将模型用于商业用途,请联系 cpm@modelbest.cn 来获取书面授权,登记后可以免费商业使用。
-
-
-## 声明
-
-作为多模态大模型,MiniCPM-V 系列模型(包括 OmniLMM)通过学习大量的多模态数据来生成内容,但它无法理解、表达个人观点或价值判断,它所输出的任何内容都不代表模型开发者的观点和立场。
-
-因此用户在使用本项目的系列模型生成的内容时,应自行负责对其进行评估和验证。如果由于使用本项目的系列开源模型而导致的任何问题,包括但不限于数据安全问题、公共舆论风险,或模型被误导、滥用、传播或不当利用所带来的任何风险和问题,我们将不承担任何责任。
-
-
-## 机构
-
-本项目由以下机构共同开发:
-
-- [清华大学自然语言处理实验室](https://nlp.csai.tsinghua.edu.cn/)
-- [面壁智能](https://modelbest.cn/)
-- [知乎](https://www.zhihu.com/)
-
-## 其他多模态项目
-
-👏 欢迎了解我们更多的多模态项目:
-
-[VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V)
-
-## 🌟 Star History
-
-
+
-## 引用
+## Key Techniques and Other Multimodal Projects
-如果您觉得我们模型/代码/论文有帮助,请给我们 ⭐ 和 引用 📝,感谢!
+👏 Welcome to explore key techniques of MiniCPM-V and other multimodal projects of our team:
+
+[VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V)
+
+
+## Citation
+
+If you find our model/code/paper helpful, please consider citing our papers 📝 and starring us ⭐️!
```bib
-@article{yu2023rlhf,
- title={Rlhf-v: Towards trustworthy mllms via behavior alignment from fine-grained correctional human feedback},
- author={Yu, Tianyu and Yao, Yuan and Zhang, Haoye and He, Taiwen and Han, Yifeng and Cui, Ganqu and Hu, Jinyi and Liu, Zhiyuan and Zheng, Hai-Tao and Sun, Maosong and others},
- journal={arXiv preprint arXiv:2312.00849},
- year={2023}
-}
-@article{viscpm,
- title={Large Multilingual Models Pivot Zero-Shot Multimodal Learning across Languages},
- author={Jinyi Hu and Yuan Yao and Chongyi Wang and Shan Wang and Yinxu Pan and Qianyu Chen and Tianyu Yu and Hanghao Wu and Yue Zhao and Haoye Zhang and Xu Han and Yankai Lin and Jiao Xue and Dahai Li and Zhiyuan Liu and Maosong Sun},
- journal={arXiv preprint arXiv:2308.12038},
- year={2023}
-}
-@article{xu2024llava-uhd,
- title={{LLaVA-UHD}: an LMM Perceiving Any Aspect Ratio and High-Resolution Images},
- author={Xu, Ruyi and Yao, Yuan and Guo, Zonghao and Cui, Junbo and Ni, Zanlin and Ge, Chunjiang and Chua, Tat-Seng and Liu, Zhiyuan and Huang, Gao},
- journal={arXiv preprint arXiv:2403.11703},
- year={2024}
-}
-@article{yu2024rlaifv,
- title={RLAIF-V: Aligning MLLMs through Open-Source AI Feedback for Super GPT-4V Trustworthiness},
- author={Yu, Tianyu and Zhang, Haoye and Yao, Yuan and Dang, Yunkai and Chen, Da and Lu, Xiaoman and Cui, Ganqu and He, Taiwen and Liu, Zhiyuan and Chua, Tat-Seng and Sun, Maosong},
- journal={arXiv preprint arXiv:2405.17220},
+@article{yao2024minicpm,
+ title={MiniCPM-V: A GPT-4V Level MLLM on Your Phone},
+ author={Yao, Yuan and Yu, Tianyu and Zhang, Ao and Wang, Chongyi and Cui, Junbo and Zhu, Hongji and Cai, Tianchi and Li, Haoyu and Zhao, Weilin and He, Zhihui and others},
+ journal={arXiv preprint arXiv:2408.01800},
year={2024}
}
```
diff --git a/finetune/finetune_ds.sh b/finetune/finetune_ds.sh
index 4197447..c049471 100644
--- a/finetune/finetune_ds.sh
+++ b/finetune/finetune_ds.sh
@@ -13,7 +13,7 @@ MODEL="openbmb/MiniCPM-V-2_6"
DATA="path/to/trainging_data"
EVAL_DATA="path/to/test_data"
LLM_TYPE="qwen2" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3"
-MODEL_MAX_Length=4096 # if use openbmb/MiniCPM-V-2 or openbmb/MiniCPM-Llama3-V-2_5, please set MODEL_MAX_Length=2048
+MODEL_MAX_Length=2048 # if conducting multi-image SFT, set MODEL_MAX_Length=4096
DISTRIBUTED_ARGS="