diff --git a/.github/ISSUE_TEMPLATE/llamacpp.yaml b/.github/ISSUE_TEMPLATE/llamacpp.yaml
new file mode 100644
index 0000000..c5e370d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/llamacpp.yaml
@@ -0,0 +1,78 @@
+name: "llamacpp issue"
+description: 创建新功能请求 | Create a new ticket for a new feature request
+title: "[llamacpp] -
"
+labels: [
+ "question"
+]
+body:
+ - type: input
+ id: start_date
+ attributes:
+ label: "起始日期 | Start Date"
+ description: |
+ 起始开发日期
+ Start of development
+ placeholder: "month/day/year"
+ validations:
+ required: false
+ - type: textarea
+ id: implementation_pr
+ attributes:
+ label: "实现PR | Implementation PR"
+ description: |
+ 实现该功能的Pull request
+ Pull request implementing the feature
+ placeholder: "#Pull Request ID"
+ validations:
+ required: false
+ - type: textarea
+ id: reference_issues
+ attributes:
+ label: "相关Issues | Reference Issues"
+ description: |
+ 与该功能相关的issues
+ Related issues
+ placeholder: "#Issues IDs"
+ validations:
+ required: false
+ - type: textarea
+ id: summary
+ attributes:
+ label: "摘要 | Summary"
+ description: |
+ 简要描述新功能的特点
+ Provide a brief explanation of the feature
+ placeholder: |
+ Describe your feature request in a few lines
+ validations:
+ required: true
+ - type: textarea
+ id: basic_example
+ attributes:
+ label: "基本示例 | Basic Example"
+ description: Provide a few basic examples of your feature.
+ placeholder: A few specific words about your feature request.
+ validations:
+ required: true
+ - type: textarea
+ id: drawbacks
+ attributes:
+ label: "缺陷 | Drawbacks"
+ description: |
+ 该新功能有哪些缺陷/可能造成哪些影响?
+ What are the drawbacks/impacts of your feature request?
+ placeholder: |
+ Identify the drawbacks and impacts while being neutral on your feature request
+ validations:
+ required: true
+ - type: textarea
+ id: unresolved_question
+ attributes:
+ label: "未解决问题 | Unresolved questions"
+ description: |
+ 有哪些尚未解决的问题?
+ What questions still remain unresolved?
+ placeholder: |
+ Identify any unresolved issues.
+ validations:
+ required: false
diff --git a/.github/ISSUE_TEMPLATE/ollama.yaml b/.github/ISSUE_TEMPLATE/ollama.yaml
new file mode 100644
index 0000000..e640151
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/ollama.yaml
@@ -0,0 +1,78 @@
+name: "ollama issue"
+description: 创建新功能请求 | Create a new ticket for a new feature request
+title: "[ollama] - "
+labels: [
+ "question"
+]
+body:
+ - type: input
+ id: start_date
+ attributes:
+ label: "起始日期 | Start Date"
+ description: |
+ 起始开发日期
+ Start of development
+ placeholder: "month/day/year"
+ validations:
+ required: false
+ - type: textarea
+ id: implementation_pr
+ attributes:
+ label: "实现PR | Implementation PR"
+ description: |
+ 实现该功能的Pull request
+ Pull request implementing the feature
+ placeholder: "#Pull Request ID"
+ validations:
+ required: false
+ - type: textarea
+ id: reference_issues
+ attributes:
+ label: "相关Issues | Reference Issues"
+ description: |
+ 与该功能相关的issues
+ Related issues
+ placeholder: "#Issues IDs"
+ validations:
+ required: false
+ - type: textarea
+ id: summary
+ attributes:
+ label: "摘要 | Summary"
+ description: |
+ 简要描述新功能的特点
+ Provide a brief explanation of the feature
+ placeholder: |
+ Describe your feature request in a few lines
+ validations:
+ required: true
+ - type: textarea
+ id: basic_example
+ attributes:
+ label: "基本示例 | Basic Example"
+ description: Provide a few basic examples of your feature.
+ placeholder: A few specific words about your feature request.
+ validations:
+ required: true
+ - type: textarea
+ id: drawbacks
+ attributes:
+ label: "缺陷 | Drawbacks"
+ description: |
+ 该新功能有哪些缺陷/可能造成哪些影响?
+ What are the drawbacks/impacts of your feature request?
+ placeholder: |
+ Identify the drawbacks and impacts while being neutral on your feature request
+ validations:
+ required: true
+ - type: textarea
+ id: unresolved_question
+ attributes:
+ label: "未解决问题 | Unresolved questions"
+ description: |
+ 有哪些尚未解决的问题?
+ What questions still remain unresolved?
+ placeholder: |
+ Identify any unresolved issues.
+ validations:
+ required: false
diff --git a/.github/ISSUE_TEMPLATE/vllm.yaml b/.github/ISSUE_TEMPLATE/vllm.yaml
new file mode 100644
index 0000000..74f98a1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/vllm.yaml
@@ -0,0 +1,78 @@
+name: "vllm issue"
+description: 创建新功能请求 | Create a new ticket for a new feature request
+title: "[vllm] - "
+labels: [
+ "question"
+]
+body:
+ - type: input
+ id: start_date
+ attributes:
+ label: "起始日期 | Start Date"
+ description: |
+ 起始开发日期
+ Start of development
+ placeholder: "month/day/year"
+ validations:
+ required: false
+ - type: textarea
+ id: implementation_pr
+ attributes:
+ label: "实现PR | Implementation PR"
+ description: |
+ 实现该功能的Pull request
+ Pull request implementing the feature
+ placeholder: "#Pull Request ID"
+ validations:
+ required: false
+ - type: textarea
+ id: reference_issues
+ attributes:
+ label: "相关Issues | Reference Issues"
+ description: |
+ 与该功能相关的issues
+ Related issues
+ placeholder: "#Issues IDs"
+ validations:
+ required: false
+ - type: textarea
+ id: summary
+ attributes:
+ label: "摘要 | Summary"
+ description: |
+ 简要描述新功能的特点
+ Provide a brief explanation of the feature
+ placeholder: |
+ Describe your feature request in a few lines
+ validations:
+ required: true
+ - type: textarea
+ id: basic_example
+ attributes:
+ label: "基本示例 | Basic Example"
+ description: Provide a few basic examples of your feature.
+ placeholder: A few specific words about your feature request.
+ validations:
+ required: true
+ - type: textarea
+ id: drawbacks
+ attributes:
+ label: "缺陷 | Drawbacks"
+ description: |
+ 该新功能有哪些缺陷/可能造成哪些影响?
+ What are the drawbacks/impacts of your feature request?
+ placeholder: |
+ Identify the drawbacks and impacts while being neutral on your feature request
+ validations:
+ required: true
+ - type: textarea
+ id: unresolved_question
+ attributes:
+ label: "未解决问题 | Unresolved questions"
+ description: |
+ 有哪些尚未解决的问题?
+ What questions still remain unresolved?
+ placeholder: |
+ Identify any unresolved issues.
+ validations:
+ required: false
diff --git a/README.md b/README.md
index 1697267..446b92e 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Join our 💬 WeChat
#### 📌 Pinned
* [2024.08.06] 🔥🔥🔥 We open-source MiniCPM-V 2.6, which outperforms GPT-4V on single image, multi-image and video understanding. It advances popular features of MiniCPM-Llama3-V 2.5, and can support real-time video understanding on iPad. Try it now!
* [2024.08.03] MiniCPM-Llama3-V 2.5 technical report is released! See [here](https://arxiv.org/abs/2408.01800).
-* [2024.07.19] MiniCPM-Llama3-V 2.5 supports vLLM now! See [here](#vllm).
+* [2024.07.19] MiniCPM-Llama3-V 2.5 supports vLLM now! See [here](#inference-with-vllm).
* [2024.05.28] 🚀🚀🚀 MiniCPM-Llama3-V 2.5 now fully supports its feature in llama.cpp and ollama! Please pull the latest code **of our provided forks** ([llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md), [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5)). GGUF models in various sizes are available [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main). MiniCPM-Llama3-V 2.5 series is **not supported by the official repositories yet**, and we are working hard to merge PRs. Please stay tuned!
* [2024.05.28] 💫 We now support LoRA fine-tuning for MiniCPM-Llama3-V 2.5, using only 2 V100 GPUs! See more statistics [here](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics).
* [2024.05.23] 🔍 We've released a comprehensive comparison between Phi-3-vision-128k-instruct and MiniCPM-Llama3-V 2.5, including benchmarks evaluations, multilingual capabilities, and inference efficiency 🌟📊🌍🚀. Click [here](./docs/compare_with_phi-3_vision.md) to view more details.
@@ -45,7 +45,7 @@ Join our 💬 WeChat
* [2024.05.25] MiniCPM-Llama3-V 2.5 now supports streaming outputs and customized system prompts. Try it [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage)!
* [2024.05.24] We release the MiniCPM-Llama3-V 2.5 [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf), which supports [llama.cpp](#inference-with-llamacpp) inference and provides a 6~8 token/s smooth decoding on mobile phones. Try it now!
* [2024.05.20] We open-soure MiniCPM-Llama3-V 2.5, it has improved OCR capability and supports 30+ languages, representing the first end-side MLLM achieving GPT-4V level performance! We provide [efficient inference](#deployment-on-mobile-phone) and [simple fine-tuning](./finetune/readme.md). Try it now!
-* [2024.04.23] MiniCPM-V-2.0 supports vLLM now! Click [here](#vllm) to view more details.
+* [2024.04.23] MiniCPM-V-2.0 supports vLLM now! Click [here](#inference-with-vllm) to view more details.
* [2024.04.18] We create a HuggingFace Space to host the demo of MiniCPM-V 2.0 at [here](https://huggingface.co/spaces/openbmb/MiniCPM-V-2)!
* [2024.04.17] MiniCPM-V-2.0 supports deploying [WebUI Demo](#webui-demo) now!
* [2024.04.15] MiniCPM-V-2.0 now also supports [fine-tuning](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) with the SWIFT framework!
@@ -1504,7 +1504,7 @@ PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py
### Deployment on Mobile Phone
-MiniCPM-Llama3-V 2.5 and MiniCPM-V 2.0 can be deployed on mobile phones with Android operating systems. 🚀 Click [MiniCPM-Llama3-V 2.5](http://minicpm.modelbest.cn/android/modelbest-release-20240528_182155.apk) / [MiniCPM-V 2.0](https://github.com/OpenBMB/mlc-MiniCPM) to install apk.
+MiniCPM-V 2.0 can be deployed on mobile phones with Android operating systems. 🚀 Click [MiniCPM-V 2.0](https://github.com/OpenBMB/mlc-MiniCPM) to install apk.
### Inference with llama.cpp
MiniCPM-V 2.6 can run with llama.cpp now! See [our fork of llama.cpp](https://github.com/OpenBMB/llama.cpp/tree/minicpmv-main/examples/llava/README-minicpmv2.6.md) for more detail. This implementation supports smooth inference of 16~18 token/s on iPad (test environment:iPad Pro + M4).
@@ -1515,25 +1515,89 @@ MiniCPM-V 2.6 can run with ollama now! See [our fork of ollama](https://github.c
### Inference with vLLM
- vLLM now officially supports MiniCPM-V 2.0, MiniCPM-Llama3-V 2.5 and MiniCPM-V 2.6, Click to see.
+ vLLM now officially supports MiniCPM-V 2.6, MiniCPM-Llama3-V 2.5, and MiniCPM-V 2.0. Click to see.
-1. Clone the official vLLM:
+1. Install vLLM (>= 0.5.4):
```shell
-git clone https://github.com/vllm-project/vllm.git
+pip install vllm
```
-2. Install vLLM:
-```shell
-cd vllm
-pip install -e .
-```
-3. Install timm: (optional, MiniCPM-V 2.0 need timm)
+2. Install timm (optional; only MiniCPM-V 2.0 needs it):
```shell
pip install timm==0.9.10
```
-4. Run the example:(Attention: If you use model in local path, please update the model code to the latest version on Hugging Face.)
-```shell
-python examples/minicpmv_example.py
+3. Run the example (for image input):
+```python
+from transformers import AutoTokenizer
+from PIL import Image
+from vllm import LLM, SamplingParams
+
+MODEL_NAME = "openbmb/MiniCPM-V-2_6"
+# Also available for previous models
+# MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5"
+# MODEL_NAME = "HwwwH/MiniCPM-V-2"
+
+image = Image.open("xxx.png").convert("RGB")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+llm = LLM(
+ model=MODEL_NAME,
+ trust_remote_code=True,
+ gpu_memory_utilization=1,
+ max_model_len=2048
+)
+
+messages = [{
+ "role":
+ "user",
+ "content":
+ # One "(./)" placeholder per input image
+ "(./)" + \
+ "\nWhat is the content of this image?"
+}]
+prompt = tokenizer.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True
+)
+
+# Single Inference
+inputs = {
+ "prompt": prompt,
+ "multi_modal_data": {
+ "image": image
+ # Multi images, the number of images should be equal to that of `(./)`
+ # "image": [image, image]
+ },
+}
+# Batch Inference
+# inputs = [{
+# "prompt": prompt,
+# "multi_modal_data": {
+# "image": image
+# },
+# } for _ in range(2)]
+
+
+# 2.6
+stop_tokens = ['<|im_end|>', '<|endoftext|>']
+stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+# 2.0
+# stop_token_ids = [tokenizer.eos_id]
+# 2.5
+# stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+
+sampling_params = SamplingParams(
+ stop_token_ids=stop_token_ids,
+ use_beam_search=True,
+ temperature=0,
+ best_of=3,
+ max_tokens=1024
+)
+
+outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+print(outputs[0].outputs[0].text)
```
+4. Click [here](https://modelbest.feishu.cn/wiki/C2BWw4ZP0iCDy7kkCPCcX2BHnOf?from=from_copylink) for *video* inference and more details about `vLLM`; a rough frame-sampling sketch also follows below.
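+
+If you want a quick starting point before reading that wiki, the sketch below adapts the multi-image pattern from the example above to video by sampling a few frames with OpenCV. It is an untested illustration: the frame count, the file name, and the one-`(./)`-placeholder-per-frame convention are our own assumptions, and it reuses `llm`, `tokenizer`, and `sampling_params` from the example above. Follow the wiki for the officially supported usage.
+```python
+# Hypothetical video sketch (assumptions noted above): feed sampled frames
+# as a list of images, one "(./)" placeholder per frame.
+import cv2
+from PIL import Image
+
+NUM_FRAMES = 8  # assumption: a small, evenly spaced sample of frames
+
+cap = cv2.VideoCapture("xxx.mp4")
+total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+frames = []
+for i in range(NUM_FRAMES):
+    cap.set(cv2.CAP_PROP_POS_FRAMES, i * total // NUM_FRAMES)
+    ok, frame = cap.read()
+    if ok:
+        # OpenCV decodes frames as BGR; convert to RGB PIL images
+        frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
+cap.release()
+
+video_messages = [{
+    "role": "user",
+    # one "(./)" placeholder per sampled frame
+    "content": "(./)" * len(frames) + "\nDescribe what happens in this video."
+}]
+video_prompt = tokenizer.apply_chat_template(
+    video_messages, tokenize=False, add_generation_prompt=True
+)
+video_inputs = {
+    "prompt": video_prompt,
+    # many frames may require a larger max_model_len than the 2048 used above
+    "multi_modal_data": {"image": frames},
+}
+outputs = llm.generate(video_inputs, sampling_params=sampling_params)
+print(outputs[0].outputs[0].text)
+```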
## Fine-tuning
diff --git a/README_en.md b/README_en.md
index e352647..3801cd1 100644
--- a/README_en.md
+++ b/README_en.md
@@ -30,7 +30,7 @@ Join our 💬 WeChat
#### 📌 Pinned
* [2024.08.06] 🔥🔥🔥 We open-source MiniCPM-V 2.6, which outperforms GPT-4V on single image, multi-image and video understanding. It advances popular features of MiniCPM-Llama3-V 2.5, and can support real-time video understanding on iPad. Try it now!
* [2024.08.03] MiniCPM-Llama3-V 2.5 technical report is released! See [here](https://arxiv.org/abs/2408.01800).
-* [2024.07.19] MiniCPM-Llama3-V 2.5 supports vLLM now! See [here](#vllm).
+* [2024.07.19] MiniCPM-Llama3-V 2.5 supports vLLM now! See [here](#inference-with-vllm).
* [2024.05.28] 🚀🚀🚀 MiniCPM-Llama3-V 2.5 now fully supports its feature in llama.cpp and ollama! Please pull the latest code **of our provided forks** ([llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md), [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5)). GGUF models in various sizes are available [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main). MiniCPM-Llama3-V 2.5 series is **not supported by the official repositories yet**, and we are working hard to merge PRs. Please stay tuned!
* [2024.05.28] 💫 We now support LoRA fine-tuning for MiniCPM-Llama3-V 2.5, using only 2 V100 GPUs! See more statistics [here](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics).
* [2024.05.23] 🔍 We've released a comprehensive comparison between Phi-3-vision-128k-instruct and MiniCPM-Llama3-V 2.5, including benchmarks evaluations, multilingual capabilities, and inference efficiency 🌟📊🌍🚀. Click [here](./docs/compare_with_phi-3_vision.md) to view more details.
@@ -45,7 +45,7 @@ Join our 💬 WeChat
* [2024.05.25] MiniCPM-Llama3-V 2.5 now supports streaming outputs and customized system prompts. Try it [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage)!
* [2024.05.24] We release the MiniCPM-Llama3-V 2.5 [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf), which supports [llama.cpp](#inference-with-llamacpp) inference and provides a 6~8 token/s smooth decoding on mobile phones. Try it now!
* [2024.05.20] We open-soure MiniCPM-Llama3-V 2.5, it has improved OCR capability and supports 30+ languages, representing the first end-side MLLM achieving GPT-4V level performance! We provide [efficient inference](#deployment-on-mobile-phone) and [simple fine-tuning](./finetune/readme.md). Try it now!
-* [2024.04.23] MiniCPM-V-2.0 supports vLLM now! Click [here](#vllm) to view more details.
+* [2024.04.23] MiniCPM-V-2.0 supports vLLM now! Click [here](#inference-with-vllm) to view more details.
* [2024.04.18] We create a HuggingFace Space to host the demo of MiniCPM-V 2.0 at [here](https://huggingface.co/spaces/openbmb/MiniCPM-V-2)!
* [2024.04.17] MiniCPM-V-2.0 supports deploying [WebUI Demo](#webui-demo) now!
* [2024.04.15] MiniCPM-V-2.0 now also supports [fine-tuning](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) with the SWIFT framework!
@@ -1517,23 +1517,87 @@ MiniCPM-V 2.6 can run with ollama now! See [our fork of ollama](https://github.c
vLLM now officially supports MiniCPM-V 2.0, MiniCPM-Llama3-V 2.5 and MiniCPM-V 2.6, Click to see.
-1. Clone the official vLLM:
+1. Install vLLM (>= 0.5.4):
```shell
-git clone https://github.com/vllm-project/vllm.git
+pip install vllm
```
-2. Install vLLM:
-```shell
-cd vllm
-pip install -e .
-```
-3. Install timm: (optional, MiniCPM-V 2.0 need timm)
+2. Install timm (optional; only MiniCPM-V 2.0 needs it):
```shell
pip install timm==0.9.10
```
-4. Run the example:(Attention: If you use model in local path, please update the model code to the latest version on Hugging Face.)
-```shell
-python examples/minicpmv_example.py
+3. Run the example (for image input):
+```python
+from transformers import AutoTokenizer
+from PIL import Image
+from vllm import LLM, SamplingParams
+
+MODEL_NAME = "openbmb/MiniCPM-V-2_6"
+# Also available for previous models
+# MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5"
+# MODEL_NAME = "HwwwH/MiniCPM-V-2"
+
+image = Image.open("xxx.png").convert("RGB")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+llm = LLM(
+ model=MODEL_NAME,
+ trust_remote_code=True,
+ gpu_memory_utilization=1,
+ max_model_len=2048
+)
+
+messages = [{
+ "role":
+ "user",
+ "content":
+ # One "(./)" placeholder per input image (see the multi-image sketch below)
+ "(./)" + \
+ "\nWhat is the content of this image?"
+}]
+prompt = tokenizer.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True
+)
+
+# Single Inference
+inputs = {
+ "prompt": prompt,
+ "multi_modal_data": {
+ "image": image
+ # Multi images, the number of images should be equal to that of `(./)`
+ # "image": [image, image]
+ },
+}
+# Batch Inference
+# inputs = [{
+# "prompt": prompt,
+# "multi_modal_data": {
+# "image": image
+# },
+# } for _ in range(2)]
+
+
+# 2.6
+stop_tokens = ['<|im_end|>', '<|endoftext|>']
+stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+# 2.0
+# stop_token_ids = [tokenizer.eos_id]
+# 2.5
+# stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+
+sampling_params = SamplingParams(
+ stop_token_ids=stop_token_ids,
+ use_beam_search=True,
+ temperature=0,
+ best_of=3,
+ max_tokens=1024
+)
+
+outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+print(outputs[0].outputs[0].text)
```
+4. Click [here](https://modelbest.feishu.cn/wiki/C2BWw4ZP0iCDy7kkCPCcX2BHnOf?from=from_copylink) for *video* inference and more details about `vLLM`.
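+
+As a brief illustration of the multi-image comment in the example above (the file names below are placeholders, and the snippet reuses `tokenizer`, `llm`, and `sampling_params` from that example), the number of `(./)` placeholders in the prompt must equal the number of images passed in `multi_modal_data`:
+```python
+# Hypothetical two-image variant of the example above.
+image1 = Image.open("page1.png").convert("RGB")  # placeholder file name
+image2 = Image.open("page2.png").convert("RGB")  # placeholder file name
+
+multi_messages = [{
+    "role": "user",
+    # two "(./)" placeholders for two images
+    "content": "(./)" + "(./)" + "\nWhat are the differences between these two images?"
+}]
+multi_prompt = tokenizer.apply_chat_template(
+    multi_messages, tokenize=False, add_generation_prompt=True
+)
+multi_inputs = {
+    "prompt": multi_prompt,
+    "multi_modal_data": {"image": [image1, image2]},
+}
+outputs = llm.generate(multi_inputs, sampling_params=sampling_params)
+print(outputs[0].outputs[0].text)
+```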
## Fine-tuning
diff --git a/README_zh.md b/README_zh.md
index 27d29fd..20e6708 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -35,7 +35,7 @@
* [2024.08.06] 🔥🔥🔥 我们开源了 MiniCPM-V 2.6,该模型在单图、多图和视频理解方面取得了优于 GPT-4V 的表现。我们还进一步提升了 MiniCPM-Llama3-V 2.5 的多项亮点能力,并首次支持了 iPad 上的实时视频理解。欢迎试用!
* [2024.08.03] MiniCPM-Llama3-V 2.5 技术报告已发布!欢迎点击[这里](https://arxiv.org/abs/2408.01800)查看。
-* [2024.07.19] MiniCPM-Llama3-V 2.5 现已支持[vLLM](#vllm) !
+* [2024.07.19] MiniCPM-Llama3-V 2.5 现已支持[vLLM](#vllm-部署-) !
* [2024.05.28] 💥 MiniCPM-Llama3-V 2.5 现在在 llama.cpp 和 ollama 中完全支持其功能!**请拉取我们最新的 fork 来使用**:[llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) & [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5)。我们还发布了各种大小的 GGUF 版本,请点击[这里](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main)查看。请注意,**目前官方仓库尚未支持 MiniCPM-Llama3-V 2.5**,我们也正积极推进将这些功能合并到 llama.cpp & ollama 官方仓库,敬请关注!
* [2024.05.28] 💫 我们现在支持 MiniCPM-Llama3-V 2.5 的 LoRA 微调,更多内存使用统计信息可以在[这里](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics)找到。
* [2024.05.23] 🔍 我们添加了Phi-3-vision-128k-instruct 与 MiniCPM-Llama3-V 2.5的全面对比,包括基准测试评估、多语言能力和推理效率 🌟📊🌍🚀。点击[这里](./docs/compare_with_phi-3_vision.md)查看详细信息。
@@ -51,7 +51,7 @@
* [2024.05.25] MiniCPM-Llama3-V 2.5 [支持流式输出和自定义系统提示词](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage)了,欢迎试用!
* [2024.05.24] 我们开源了 MiniCPM-Llama3-V 2.5 [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf),支持 [llama.cpp](#llamacpp-部署) 推理!实现端侧 6-8 tokens/s 的流畅解码,欢迎试用!
* [2024.05.20] 我们开源了 MiniCPM-Llama3-V 2.5,增强了 OCR 能力,支持 30 多种语言,并首次在端侧实现了 GPT-4V 级的多模态能力!我们提供了[高效推理](#手机端部署)和[简易微调](./finetune/readme.md)的支持,欢迎试用!
-* [2024.04.23] 我们增加了MiniCPM-V 2.0对 [vLLM](#vllm) 的支持,欢迎体验!
+* [2024.04.23] 我们增加了MiniCPM-V 2.0对 [vLLM](#vllm-部署-) 的支持,欢迎体验!
* [2024.04.18] 我们在 HuggingFace Space 新增了 MiniCPM-V 2.0 的 [demo](https://huggingface.co/spaces/openbmb/MiniCPM-V-2),欢迎体验!
* [2024.04.17] MiniCPM-V 2.0 现在支持用户部署本地 [WebUI Demo](#本地webui-demo部署) 了,欢迎试用!
* [2024.04.15] MiniCPM-V 2.0 现在可以通过 SWIFT 框架 [微调](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) 了,支持流式输出!
@@ -1513,7 +1513,7 @@ PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py
### 手机端部署
-MiniCPM-Llama3-V 2.5 和 MiniCPM-V 2.0 可运行在Android手机上,点击[MiniCPM-Llama3-V 2.5](http://minicpm.modelbest.cn/android/modelbest-release-20240528_182155.apk) / [MiniCPM-V 2.0](https://github.com/OpenBMB/mlc-MiniCPM)安装apk使用;
+MiniCPM-V 2.0 可运行在Android手机上,点击[MiniCPM-V 2.0](https://github.com/OpenBMB/mlc-MiniCPM)安装apk使用;
### 本地WebUI Demo部署
@@ -1525,10 +1525,7 @@ pip install -r requirements.txt
```shell
# For NVIDIA GPUs, run:
-python web_demo_2.5.py --device cuda
-
-# For Mac with MPS (Apple silicon or AMD GPUs), run:
-PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.5.py --device mps
+python web_demo_2.6.py --device cuda
```
@@ -1540,26 +1537,89 @@ MiniCPM-V 2.6 现在支持ollama啦! 用法请参考[我们的fork ollama](https
### vLLM 部署
-点击查看, vLLM 现已官方支持MiniCPM-V 2.0 、MiniCPM-Llama3-V 2.5 和 MiniCPM-V 2.6
+点击查看, vLLM 现已官方支持MiniCPM-V 2.6、MiniCPM-Llama3-V 2.5 和 MiniCPM-V 2.0
-1. 首先克隆官方的 vLLM 库:
+1. 安装 vLLM(>=0.5.4):
```shell
-git clone https://github.com/vllm-project/vllm.git
-```
-2. 安装 vLLM 库:
-```shell
-cd vllm
-pip install -e .
+pip install vllm
```
3. 安装 timm 库: (可选,MiniCPM-V 2.0需安装)
```shell
pip install timm=0.9.10
```
4. 运行示例代码:(注意:如果使用本地路径的模型,请确保模型代码已更新到Hugging Face上的最新版)
-```shell
-python examples/minicpmv_example.py
-```
+```python
+from transformers import AutoTokenizer
+from PIL import Image
+from vllm import LLM, SamplingParams
+MODEL_NAME = "openbmb/MiniCPM-V-2_6"
+# Also available for previous models
+# MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5"
+# MODEL_NAME = "HwwwH/MiniCPM-V-2"
+
+image = Image.open("xxx.png").convert("RGB")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+llm = LLM(
+ model=MODEL_NAME,
+ trust_remote_code=True,
+ gpu_memory_utilization=1,
+ max_model_len=2048
+)
+
+messages = [{
+ "role":
+ "user",
+ "content":
+ # One "(./)" placeholder per input image
+ "(./)" + \
+ "\nWhat is the content of this image?"
+}]
+prompt = tokenizer.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True
+)
+
+# Single Inference
+inputs = {
+ "prompt": prompt,
+ "multi_modal_data": {
+ "image": image
+ # Multi images, the number of images should be equal to that of `(./)`
+ # "image": [image, image]
+ },
+}
+# Batch Inference
+# inputs = [{
+# "prompt": prompt,
+# "multi_modal_data": {
+# "image": image
+# },
+# } for _ in range(2)]
+
+
+# 2.6
+stop_tokens = ['<|im_end|>', '<|endoftext|>']
+stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+# 2.0
+# stop_token_ids = [tokenizer.eos_id]
+# 2.5
+# stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+
+sampling_params = SamplingParams(
+ stop_token_ids=stop_token_ids,
+ use_beam_search=True,
+ temperature=0,
+ best_of=3,
+ max_tokens=1024
+)
+
+outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+print(outputs[0].outputs[0].text)
+```
+5. [点击此处](https://modelbest.feishu.cn/wiki/C2BWw4ZP0iCDy7kkCPCcX2BHnOf?from=from_copylink)查看视频推理用法以及更多有关 `vLLM` 的信息。
@@ -1650,4 +1710,4 @@ python examples/minicpmv_example.py
journal={arXiv preprint 2408.01800},
year={2024},
}
-```
\ No newline at end of file
+```
diff --git a/assets/minicpmv2_6/ICL-Mem.png b/assets/minicpmv2_6/ICL-Mem.png
index 48453d5..e9517ba 100644
Binary files a/assets/minicpmv2_6/ICL-Mem.png and b/assets/minicpmv2_6/ICL-Mem.png differ
diff --git a/assets/minicpmv2_6/ICL-elec.png b/assets/minicpmv2_6/ICL-elec.png
index 39c7dcd..de32d77 100644
Binary files a/assets/minicpmv2_6/ICL-elec.png and b/assets/minicpmv2_6/ICL-elec.png differ
diff --git a/assets/minicpmv2_6/multi_img-bike.png b/assets/minicpmv2_6/multi_img-bike.png
index 0f89782..c19975d 100644
Binary files a/assets/minicpmv2_6/multi_img-bike.png and b/assets/minicpmv2_6/multi_img-bike.png differ
diff --git a/assets/minicpmv2_6/multi_img-code.png b/assets/minicpmv2_6/multi_img-code.png
index e7790a6..abc4a6c 100644
Binary files a/assets/minicpmv2_6/multi_img-code.png and b/assets/minicpmv2_6/multi_img-code.png differ
diff --git a/assets/minicpmv2_6/multi_img-menu.png b/assets/minicpmv2_6/multi_img-menu.png
index 90e78bf..2a0ba36 100644
Binary files a/assets/minicpmv2_6/multi_img-menu.png and b/assets/minicpmv2_6/multi_img-menu.png differ
diff --git a/assets/minicpmv2_6/multiling-medal.png b/assets/minicpmv2_6/multiling-medal.png
index 0aab601..af75d3d 100644
Binary files a/assets/minicpmv2_6/multiling-medal.png and b/assets/minicpmv2_6/multiling-medal.png differ
diff --git a/assets/minicpmv2_6/multiling-olympic.png b/assets/minicpmv2_6/multiling-olympic.png
index 0f4c594..3c869e8 100644
Binary files a/assets/minicpmv2_6/multiling-olympic.png and b/assets/minicpmv2_6/multiling-olympic.png differ