Compare commits
79 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
82ae1419eb | ||
|
|
17827f0c57 | ||
|
|
520c2c476b | ||
|
|
0cd0132b43 | ||
|
|
37b4f05f9c | ||
|
|
48d65128fc | ||
|
|
d2086b19da | ||
|
|
b2d728908b | ||
|
|
6880a27c5f | ||
|
|
4a333deb8c | ||
|
|
90dd7e88a6 | ||
|
|
8c2f41fef5 | ||
|
|
076466dd5a | ||
|
|
592ba7519e | ||
|
|
53bcece5c8 | ||
|
|
168ae8fe46 | ||
|
|
28632248d5 | ||
|
|
74aa48ebeb | ||
|
|
22431c9436 | ||
|
|
6d4da2ee5a | ||
|
|
bb0e3c2a92 | ||
|
|
1d3c5f455e | ||
|
|
9d37b1c2a0 | ||
|
|
c130da1b4d | ||
|
|
91cf50f813 | ||
|
|
6cb4a3bf82 | ||
|
|
af22b8f2ed | ||
|
|
7233ef5473 | ||
|
|
c821cbd7c8 | ||
|
|
a846468195 | ||
|
|
f8da52c35c | ||
|
|
67afdeb934 | ||
|
|
3cde81287d | ||
|
|
e45524cbf7 | ||
|
|
0d8b90df97 | ||
|
|
d16875b120 | ||
|
|
1c89161d65 | ||
|
|
da79d55ad4 | ||
|
|
b9a95ee0ea | ||
|
|
02c68764d4 | ||
|
|
509e934a59 | ||
|
|
3d050a5dd4 | ||
|
|
d01532f89c | ||
|
|
bffc715128 | ||
|
|
af96e66e01 | ||
|
|
eb072b30a0 | ||
|
|
16a79219cb | ||
|
|
663d96c887 | ||
|
|
1dcb4e2fee | ||
|
|
fe7b3d27de | ||
|
|
9d0531b236 | ||
|
|
5443a7c4d7 | ||
|
|
fcecab8045 | ||
|
|
06e220c8f4 | ||
|
|
2ef22c138e | ||
|
|
51f3f36614 | ||
|
|
03111d5c5b | ||
|
|
4f7eba0c29 | ||
|
|
3acd3f9891 | ||
|
|
d828902a98 | ||
|
|
8438ec2147 | ||
|
|
b91fff3ea8 | ||
|
|
e2559a5ca2 | ||
|
|
8185ac321d | ||
|
|
539e70177c | ||
|
|
6e8f1d7a66 | ||
|
|
50214bfa52 | ||
|
|
2d9919ac69 | ||
|
|
48c0611a3f | ||
|
|
afc3b105bd | ||
|
|
732f5e62e4 | ||
|
|
949fc4e843 | ||
|
|
ebb1a5e0a7 | ||
|
|
7084bbfa9f | ||
|
|
523fb11263 | ||
|
|
b2b2b7bd70 | ||
|
|
0234793a3b | ||
|
|
4b5828acb1 | ||
|
|
11ca385133 |
5
.vscode/settings.json
vendored
@@ -1,5 +0,0 @@
|
|||||||
{
|
|
||||||
"githubPullRequests.ignoredPullRequestBranches": [
|
|
||||||
"main"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
2
LICENSE
@@ -186,7 +186,7 @@
|
|||||||
same "printed page" as the copyright notice for easier
|
same "printed page" as the copyright notice for easier
|
||||||
identification within third-party archives.
|
identification within third-party archives.
|
||||||
|
|
||||||
Copyright 2024 OpenBMB
|
Copyright OpenBMB
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
|||||||
4803
README_zh.md
|
Before Width: | Height: | Size: 163 KiB |
|
Before Width: | Height: | Size: 163 KiB |
|
Before Width: | Height: | Size: 159 KiB |
BIN
assets/join.png
Normal file
|
After Width: | Height: | Size: 868 B |
BIN
assets/minicpm-o-45-framework.pdf
Normal file
BIN
assets/minicpm-o-45-framework.png
Normal file
|
After Width: | Height: | Size: 359 KiB |
BIN
assets/minicpm-o-45-radar.png
Normal file
|
After Width: | Height: | Size: 1.1 MiB |
|
Before Width: | Height: | Size: 90 KiB |
BIN
assets/minicpm-v-4dot5-framework.png
Normal file
|
After Width: | Height: | Size: 957 KiB |
|
Before Width: | Height: | Size: 562 KiB |
|
Before Width: | Height: | Size: 219 KiB |
|
Before Width: | Height: | Size: 222 KiB |
|
Before Width: | Height: | Size: 209 KiB |
|
Before Width: | Height: | Size: 216 KiB |
|
Before Width: | Height: | Size: 115 KiB |
|
Before Width: | Height: | Size: 84 KiB |
|
Before Width: | Height: | Size: 47 KiB |
|
Before Width: | Height: | Size: 47 KiB |
|
Before Width: | Height: | Size: 148 KiB |
|
Before Width: | Height: | Size: 628 KiB |
BIN
assets/minicpm_o_45_main_exp_table.png
Normal file
|
After Width: | Height: | Size: 304 KiB |
BIN
assets/minicpm_v_and_minicpm_o_title.png
Normal file
|
After Width: | Height: | Size: 81 KiB |
BIN
assets/minicpmo4_5/assistant_ref.mp4
Normal file
BIN
assets/minicpmo4_5/assistant_response.mp4
Normal file
BIN
assets/minicpmo4_5/elon_musk_ref.mp4
Normal file
BIN
assets/minicpmo4_5/elon_musk_response.mp4
Normal file
BIN
assets/minicpmo4_5/en_cot.png
Normal file
|
After Width: | Height: | Size: 3.0 MiB |
BIN
assets/minicpmo4_5/en_doc.png
Normal file
|
After Width: | Height: | Size: 4.4 MiB |
BIN
assets/minicpmo4_5/video_play.png
Normal file
|
After Width: | Height: | Size: 7.5 MiB |
BIN
assets/minicpmo4_5/zh_doc.png
Normal file
|
After Width: | Height: | Size: 2.9 MiB |
|
Before Width: | Height: | Size: 234 KiB |
|
Before Width: | Height: | Size: 307 KiB |
BIN
assets/minicpmv4/iphone_cn.gif
Normal file
|
After Width: | Height: | Size: 5.5 MiB |
BIN
assets/minicpmv4/iphone_cn_funny_points.gif
Normal file
|
After Width: | Height: | Size: 6.3 MiB |
BIN
assets/minicpmv4/iphone_en.gif
Normal file
|
After Width: | Height: | Size: 15 MiB |
BIN
assets/minicpmv4/iphone_en_information_extraction.gif
Normal file
|
After Width: | Height: | Size: 3.1 MiB |
BIN
assets/minicpmv4/minicpm-v-4-case.png
Normal file
|
After Width: | Height: | Size: 3.7 MiB |
BIN
assets/minicpmv4_5/MiniCPM-V 4.5-8.26.mp4
Normal file
BIN
assets/minicpmv4_5/MiniCPM-V 4.5-8.26_img.jpeg
Normal file
|
After Width: | Height: | Size: 356 KiB |
BIN
assets/minicpmv4_5/en_case1.png
Normal file
|
After Width: | Height: | Size: 3.2 MiB |
BIN
assets/minicpmv4_5/en_case2.png
Normal file
|
After Width: | Height: | Size: 870 KiB |
BIN
assets/minicpmv4_5/en_case3.jpeg
Normal file
|
After Width: | Height: | Size: 1.8 MiB |
BIN
assets/minicpmv4_5/en_case4.jpeg
Normal file
|
After Width: | Height: | Size: 1.5 MiB |
BIN
assets/minicpmv4_5/en_extra.jpg
Normal file
|
After Width: | Height: | Size: 2.4 MiB |
BIN
assets/minicpmv4_5/v45_cn_handwriting.gif
Normal file
|
After Width: | Height: | Size: 2.4 MiB |
BIN
assets/minicpmv4_5/v45_cn_travel.gif
Normal file
|
After Width: | Height: | Size: 7.5 MiB |
BIN
assets/minicpmv4_5/v45_en_cot.gif
Normal file
|
After Width: | Height: | Size: 22 MiB |
BIN
assets/minicpmv4_5/v45_en_handwriting.gif
Normal file
|
After Width: | Height: | Size: 3.8 MiB |
BIN
assets/minicpmv4_5/zh_case1.jpeg
Normal file
|
After Width: | Height: | Size: 2.6 MiB |
BIN
assets/minicpmv4_5/zh_case2.jpeg
Normal file
|
After Width: | Height: | Size: 2.0 MiB |
BIN
assets/minicpmv4_5/zh_extra.jpeg
Normal file
|
After Width: | Height: | Size: 2.1 MiB |
BIN
assets/minicpmv_4_5_evaluation_result.png
Normal file
|
After Width: | Height: | Size: 589 KiB |
BIN
assets/radar_minicpm_v45.png
Normal file
|
After Width: | Height: | Size: 1.5 MiB |
BIN
assets/radar_minicpmo4.5.png
Normal file
|
After Width: | Height: | Size: 1.2 MiB |
BIN
assets/star-history-25-09-02.png
Normal file
|
After Width: | Height: | Size: 108 KiB |
|
Before Width: | Height: | Size: 52 KiB After Width: | Height: | Size: 12 KiB |
BIN
docs/MiniCPM_V_4_5_Technical_Report.pdf
Normal file
@@ -13,6 +13,7 @@
|
|||||||
- [Inference](#Inference)
|
- [Inference](#Inference)
|
||||||
|
|
||||||
## Support Models
|
## Support Models
|
||||||
|
* [openbmb/MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4)
|
||||||
* [openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6)
|
* [openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6)
|
||||||
* [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)
|
* [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)
|
||||||
|
|
||||||
|
|||||||
176
docs/minicpm-llama-v-2-5_languages.md
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
- English
|
||||||
|
- 中文
|
||||||
|
- 한국어
|
||||||
|
- 日本語
|
||||||
|
- Deutsch
|
||||||
|
- Français
|
||||||
|
- Português
|
||||||
|
- Español
|
||||||
|
- မြန်မာဘာသာ
|
||||||
|
- ไทย
|
||||||
|
- Tiếng Việt
|
||||||
|
- Türkçe
|
||||||
|
- ܣܘܪܝܝܐ
|
||||||
|
- العربية
|
||||||
|
- हिन्दी
|
||||||
|
- বাংলা
|
||||||
|
- नेपाली
|
||||||
|
- Türkmençe
|
||||||
|
- Тоҷикӣ
|
||||||
|
- Кыргызча
|
||||||
|
- Русский
|
||||||
|
- Українська
|
||||||
|
- Беларуская
|
||||||
|
- ქართული
|
||||||
|
- Azərbaycanca
|
||||||
|
- Հայերեն
|
||||||
|
- Polski
|
||||||
|
- Lietuvių
|
||||||
|
- Eesti
|
||||||
|
- Latviešu
|
||||||
|
- Čeština
|
||||||
|
- Slovenčina
|
||||||
|
- Magyar
|
||||||
|
- Slovenščina
|
||||||
|
- Hrvatski
|
||||||
|
- Bosanski
|
||||||
|
- Crnogorski
|
||||||
|
- Српски
|
||||||
|
- Shqip
|
||||||
|
- Română
|
||||||
|
- Български
|
||||||
|
- Македонски
|
||||||
|
|
||||||
|
|
||||||
|
## 支持语言
|
||||||
|
|
||||||
|
英语
|
||||||
|
|
||||||
|
中文
|
||||||
|
|
||||||
|
韩语
|
||||||
|
|
||||||
|
日语
|
||||||
|
|
||||||
|
德语
|
||||||
|
|
||||||
|
法语
|
||||||
|
|
||||||
|
葡萄牙语
|
||||||
|
|
||||||
|
西班牙语
|
||||||
|
|
||||||
|
缅甸语
|
||||||
|
|
||||||
|
泰语
|
||||||
|
|
||||||
|
越南语
|
||||||
|
|
||||||
|
土耳其语
|
||||||
|
|
||||||
|
叙利亚语
|
||||||
|
|
||||||
|
阿拉伯语
|
||||||
|
|
||||||
|
印地语
|
||||||
|
|
||||||
|
孟加拉语
|
||||||
|
|
||||||
|
尼泊尔语
|
||||||
|
|
||||||
|
土库曼语
|
||||||
|
|
||||||
|
塔吉克语
|
||||||
|
|
||||||
|
吉尔吉斯语
|
||||||
|
|
||||||
|
俄语
|
||||||
|
|
||||||
|
乌克兰语
|
||||||
|
|
||||||
|
白俄罗斯语
|
||||||
|
|
||||||
|
格鲁吉亚语
|
||||||
|
|
||||||
|
阿塞拜疆语
|
||||||
|
|
||||||
|
亚美尼亚语
|
||||||
|
|
||||||
|
波兰语
|
||||||
|
|
||||||
|
立陶宛语
|
||||||
|
|
||||||
|
爱沙尼亚语
|
||||||
|
|
||||||
|
拉脱维亚语
|
||||||
|
|
||||||
|
捷克语
|
||||||
|
|
||||||
|
斯洛伐克语
|
||||||
|
|
||||||
|
匈牙利语
|
||||||
|
|
||||||
|
斯洛文尼亚语
|
||||||
|
|
||||||
|
克罗地亚语
|
||||||
|
|
||||||
|
波斯尼亚语
|
||||||
|
|
||||||
|
黑山语
|
||||||
|
|
||||||
|
塞尔维亚语
|
||||||
|
|
||||||
|
阿尔巴尼亚语
|
||||||
|
|
||||||
|
罗马尼亚语
|
||||||
|
|
||||||
|
保加利亚
|
||||||
|
|
||||||
|
马其顿语
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Supported Languages
|
||||||
|
|
||||||
|
English
|
||||||
|
Chinese
|
||||||
|
Korean
|
||||||
|
Japanese
|
||||||
|
German
|
||||||
|
French
|
||||||
|
Portuguese
|
||||||
|
Spanish
|
||||||
|
Burmese
|
||||||
|
Thai
|
||||||
|
Vietnamese
|
||||||
|
Turkish
|
||||||
|
Syriac
|
||||||
|
Arabic
|
||||||
|
Hindi
|
||||||
|
Bengali
|
||||||
|
Nepali
|
||||||
|
Turkmen
|
||||||
|
Tajik
|
||||||
|
Kyrgyz
|
||||||
|
Russian
|
||||||
|
Ukrainian
|
||||||
|
Belarusian
|
||||||
|
Georgian
|
||||||
|
Azerbaijani
|
||||||
|
Armenian
|
||||||
|
Polish
|
||||||
|
Lithuanian
|
||||||
|
Estonian
|
||||||
|
Latvian
|
||||||
|
Czech
|
||||||
|
Slovak
|
||||||
|
Hungarian
|
||||||
|
Slovenian
|
||||||
|
Croatian
|
||||||
|
Bosnian
|
||||||
|
Montenegrin
|
||||||
|
Serbian
|
||||||
|
Albanian
|
||||||
|
Romanian
|
||||||
|
Bulgarian
|
||||||
|
Macedonian
|
||||||
@@ -15,7 +15,7 @@
|
|||||||
Leveraging the latest [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) method (the newest technique in the [RLHF-V](https://github.com/RLHF-V) [CVPR'24] series), MiniCPM-Llama3-V 2.5 exhibits more trustworthy behavior. It achieves a **10.3%** hallucination rate on Object HalBench, lower than GPT-4V-1106 (13.6%), achieving the best-level performance within the open-source community. [Data released](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset).
|
Leveraging the latest [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) method (the newest technique in the [RLHF-V](https://github.com/RLHF-V) [CVPR'24] series), MiniCPM-Llama3-V 2.5 exhibits more trustworthy behavior. It achieves a **10.3%** hallucination rate on Object HalBench, lower than GPT-4V-1106 (13.6%), achieving the best-level performance within the open-source community. [Data released](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset).
|
||||||
|
|
||||||
- 🌏 **Multilingual Support.**
|
- 🌏 **Multilingual Support.**
|
||||||
Thanks to the strong multilingual capabilities of Llama 3 and the cross-lingual generalization technique from [VisCPM](https://github.com/OpenBMB/VisCPM), MiniCPM-Llama3-V 2.5 extends its bilingual (Chinese-English) multimodal capabilities to **over 30 languages including German, French, Spanish, Italian, Korean etc.** [All Supported Languages](./assets/minicpm-llama-v-2-5_languages.md).
|
Thanks to the strong multilingual capabilities of Llama 3 and the cross-lingual generalization technique from [VisCPM](https://github.com/OpenBMB/VisCPM), MiniCPM-Llama3-V 2.5 extends its bilingual (Chinese-English) multimodal capabilities to **over 30 languages including German, French, Spanish, Italian, Korean etc.** [All Supported Languages](../docs/minicpm-llama-v-2-5_languages.md).
|
||||||
|
|
||||||
- 🚀 **Efficient Deployment.**
|
- 🚀 **Efficient Deployment.**
|
||||||
MiniCPM-Llama3-V 2.5 systematically employs **model quantization, CPU optimizations, NPU optimizations and compilation optimizations**, achieving high-efficiency deployment on end-side devices. For mobile phones with Qualcomm chips, we have integrated the NPU acceleration framework QNN into llama.cpp for the first time. After systematic optimization, MiniCPM-Llama3-V 2.5 has realized a **150x acceleration in end-side MLLM image encoding** and a **3x speedup in language decoding**.
|
MiniCPM-Llama3-V 2.5 systematically employs **model quantization, CPU optimizations, NPU optimizations and compilation optimizations**, achieving high-efficiency deployment on end-side devices. For mobile phones with Qualcomm chips, we have integrated the NPU acceleration framework QNN into llama.cpp for the first time. After systematic optimization, MiniCPM-Llama3-V 2.5 has realized a **150x acceleration in end-side MLLM image encoding** and a **3x speedup in language decoding**.
|
||||||
|
|||||||
964
docs/minicpm_o2dot6_en.md
Normal file
@@ -0,0 +1,964 @@
|
|||||||
|
## MiniCPM-o 2.6
|
||||||
|
|
||||||
|
> Archieve at: 2026-02-02
|
||||||
|
|
||||||
|
**MiniCPM-o 2.6** is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip-400M, Whisper-medium-300M, ChatTTS-200M, and Qwen2.5-7B with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-V 2.6, and introduces new features for real-time speech conversation and multimodal live streaming. Notable features of MiniCPM-o 2.6 include:
|
||||||
|
|
||||||
|
- 🔥 **Leading Visual Capability.**
|
||||||
|
MiniCPM-o 2.6 achieves an average score of 70.2 on OpenCompass, a comprehensive evaluation of 8 popular benchmarks. **With only 8B parameters, it surpasses widely used proprietary models like GPT-4o-202405, Gemini 1.5 Pro, and Claude 3.5 Sonnet** for single image understanding. It also **outperforms GPT-4V and Claude 3.5 Sonnet** in multi-image and video understanding, and shows promising in-context learning capability.
|
||||||
|
|
||||||
|
- 🎙 **State-of-the-art Speech Capability.** MiniCPM-o 2.6 supports **bilingual real-time speech conversation with configurable voices** in English and Chinese. It **outperforms GPT-4o-realtime on audio understanding tasks** such as ASR and STT translation, and shows **state-of-the-art performance on speech conversation in both semantic and acoustic evaluations in the open-source community**. It also allows for fun features such as emotion/speed/style control, end-to-end voice cloning, role play, etc.
|
||||||
|
|
||||||
|
- 🎬 **Strong Multimodal Live Streaming Capability.** As a new feature, MiniCPM-o 2.6 can **accept continuous video and audio streams independent of user queries, and support real-time speech interaction**. It **outperforms GPT-4o-202408 and Claude 3.5 Sonnet and shows state-of-the-art performance in the open-source community on StreamingBench**, a comprehensive benchmark for real-time video understanding, omni-source (video & audio) understanding, and multimodal contextual understanding.
|
||||||
|
|
||||||
|
- 💪 **Strong OCR Capability and Others.**
|
||||||
|
Advancing popular visual capabilities from MiniCPM-V series, MiniCPM-o 2.6 can process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344). It achieves **state-of-the-art performance on OCRBench for models under 25B, surpassing proprietary models such as GPT-4o-202405**.
|
||||||
|
Based on the latest [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) and [VisCPM](https://github.com/OpenBMB/VisCPM) techniques, it features **trustworthy behaviors**, outperforming GPT-4o and Claude 3.5 Sonnet on MMHal-Bench, and supports **multilingual capabilities** on more than 30 languages.
|
||||||
|
|
||||||
|
|
||||||
|
- 🚀 **Superior Efficiency.**
|
||||||
|
In addition to its friendly size, MiniCPM-o 2.6 also shows **state-of-the-art token density** (i.e., the number of pixels encoded into each visual token). **It produces only 640 tokens when processing a 1.8M pixel image, which is 75% fewer than most models**. This directly improves the inference speed, first-token latency, memory usage, and power consumption. As a result, MiniCPM-o 2.6 can efficiently support **multimodal live streaming** on end-side devices such as iPads.
|
||||||
|
|
||||||
|
- 💫 **Easy Usage.**
|
||||||
|
MiniCPM-o 2.6 can be easily used in various ways: (1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-omni/examples/llava/README-minicpmo2.6.md) support for efficient CPU inference on local devices, (2) [int4](https://huggingface.co/openbmb/MiniCPM-o-2_6-int4) and [GGUF](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) format quantized models in 16 sizes, (3) [vLLM](#efficient-inference-with-llamacpp-ollama-vllm) support for high-throughput and memory-efficient inference, (4) fine-tuning on new domains and tasks with [LLaMA-Factory](./docs/llamafactory_train_and_infer.md), (5) quick [local WebUI demo](#chat-with-our-demo-on-gradio), and (6) online web demo on [server](https://minicpm-omni-webdemo-us.modelbest.cn/).
|
||||||
|
|
||||||
|
**Model Architecture.**
|
||||||
|
|
||||||
|
- **End-to-end Omni-modal Architecture.** Different modality encoders/decoders are connected and trained in an **end-to-end** fashion to fully exploit rich multimodal knowledge. The model is trained in a fully end-to-end manner with only CE loss.
|
||||||
|
- **Omni-modal Live Streaming Mechanism.** (1) We change the offline modality encoder/decoders into online ones for **streaming inputs/outputs.** (2) We devise a **time-division multiplexing (TDM) mechanism** for omni-modality streaming processing in the LLM backbone. It divides parallel omni-modality streams into sequential info within small periodic time slices.
|
||||||
|
- **Configurable Speech Modeling Design.** We devise a multimodal system prompt, including traditional text system prompt, and **a new audio system prompt to determine the assistant voice**. This enables flexible voice configurations in inference time, and also facilitates end-to-end voice cloning and description-based voice creation.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="./assets/minicpm-o-26-framework-v2.png" , width=80%>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
### Evaluation <!-- omit in toc -->
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="./assets/radar.jpg", width=80%>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view visual understanding results.</summary>
|
||||||
|
|
||||||
|
**Image Understanding**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Token Density<sup>+</sup></th>
|
||||||
|
<th>OpenCompass</th>
|
||||||
|
<th>OCRBench</th>
|
||||||
|
<th>MathVista mini</th>
|
||||||
|
<th>ChartQA</th>
|
||||||
|
<th>MMVet</th>
|
||||||
|
<th>MMStar</th>
|
||||||
|
<th>MME</th>
|
||||||
|
<th>MMB1.1 test</th>
|
||||||
|
<th>AI2D</th>
|
||||||
|
<th>MMMU val</th>
|
||||||
|
<th>HallusionBench</th>
|
||||||
|
<th>TextVQA val</th>
|
||||||
|
<th>DocVQA test</th>
|
||||||
|
<th>MathVerse mini</th>
|
||||||
|
<th>MathVision</th>
|
||||||
|
<th>MMHal Score</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="19" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-20240513</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td><u>69.9</u></td>
|
||||||
|
<td>736</td>
|
||||||
|
<td>61.3</td>
|
||||||
|
<td>85.7</td>
|
||||||
|
<td><strong>69.1</strong></td>
|
||||||
|
<td>63.9</td>
|
||||||
|
<td>2328.7</td>
|
||||||
|
<td>82.2</td>
|
||||||
|
<td>84.6</td>
|
||||||
|
<td><strong>69.2</strong></td>
|
||||||
|
<td><strong>55.0</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>92.8</td>
|
||||||
|
<td><strong>50.2</strong></td>
|
||||||
|
<td><strong>30.4</strong></td>
|
||||||
|
<td><u>3.6</u></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude3.5-Sonnet</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>750</td>
|
||||||
|
<td>67.9</td>
|
||||||
|
<td>788</td>
|
||||||
|
<td>61.6</td>
|
||||||
|
<td><strong>90.8</strong></td>
|
||||||
|
<td>66.0</td>
|
||||||
|
<td>62.2</td>
|
||||||
|
<td>1920.0</td>
|
||||||
|
<td>78.5</td>
|
||||||
|
<td>80.2</td>
|
||||||
|
<td><u>65.9</u></td>
|
||||||
|
<td>49.9</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><strong>95.2</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>64.4</td>
|
||||||
|
<td>754</td>
|
||||||
|
<td>57.7</td>
|
||||||
|
<td>81.3</td>
|
||||||
|
<td>64.0</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>2110.6</td>
|
||||||
|
<td>73.9</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td>45.6</td>
|
||||||
|
<td>73.5</td>
|
||||||
|
<td>86.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>19.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-mini-20240718</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td>64.1</td>
|
||||||
|
<td>785</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
<td>54.8</td>
|
||||||
|
<td>2003.4</td>
|
||||||
|
<td>76.0</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>46.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="19" align="left"><strong>Open Source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Cambrian-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td><u>1820</u></td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td>591</td>
|
||||||
|
<td>50.3</td>
|
||||||
|
<td>75.6</td>
|
||||||
|
<td>53.2</td>
|
||||||
|
<td>54.2</td>
|
||||||
|
<td>2049.9</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>79.5</td>
|
||||||
|
<td>50.4</td>
|
||||||
|
<td>41.6</td>
|
||||||
|
<td>76.7</td>
|
||||||
|
<td>75.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4V-9B</td>
|
||||||
|
<td>13B</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>776</td>
|
||||||
|
<td>51.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>58.0</td>
|
||||||
|
<td>54.8</td>
|
||||||
|
<td>2018.8</td>
|
||||||
|
<td>67.9</td>
|
||||||
|
<td>71.2</td>
|
||||||
|
<td>46.9</td>
|
||||||
|
<td>45.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Pixtral-12B</td>
|
||||||
|
<td>12B</td>
|
||||||
|
<td>256</td>
|
||||||
|
<td>61.0</td>
|
||||||
|
<td>685</td>
|
||||||
|
<td>56.9</td>
|
||||||
|
<td>81.8</td>
|
||||||
|
<td>58.5</td>
|
||||||
|
<td>54.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>72.7</td>
|
||||||
|
<td>79.0</td>
|
||||||
|
<td>51.1</td>
|
||||||
|
<td>47.0</td>
|
||||||
|
<td>75.7</td>
|
||||||
|
<td>90.7</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VITA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
<td>741</td>
|
||||||
|
<td>66.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>52.7</td>
|
||||||
|
<td>60.2</td>
|
||||||
|
<td>2328.1</td>
|
||||||
|
<td>76.8</td>
|
||||||
|
<td>79.2</td>
|
||||||
|
<td>52.6</td>
|
||||||
|
<td>44.6</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">DeepSeek-VL2-27B (4B)</td>
|
||||||
|
<td>27B</td>
|
||||||
|
<td>672</td>
|
||||||
|
<td>66.4</td>
|
||||||
|
<td>809</td>
|
||||||
|
<td>63.9</td>
|
||||||
|
<td>86.0</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>61.9</td>
|
||||||
|
<td>2253.0</td>
|
||||||
|
<td>81.2</td>
|
||||||
|
<td>83.8</td>
|
||||||
|
<td>54.0</td>
|
||||||
|
<td>45.3</td>
|
||||||
|
<td><u>84.2</u></td>
|
||||||
|
<td>93.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>67.1</td>
|
||||||
|
<td><u>866</u></td>
|
||||||
|
<td>58.2</td>
|
||||||
|
<td>83.0</td>
|
||||||
|
<td>62.0</td>
|
||||||
|
<td>60.7</td>
|
||||||
|
<td>2326.0</td>
|
||||||
|
<td>81.8</td>
|
||||||
|
<td>83.0</td>
|
||||||
|
<td>54.1</td>
|
||||||
|
<td>50.6</td>
|
||||||
|
<td><strong>84.3</strong></td>
|
||||||
|
<td><u>94.5</u></td>
|
||||||
|
<td>31.9</td>
|
||||||
|
<td>16.3</td>
|
||||||
|
<td>3.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-OneVision-72B</td>
|
||||||
|
<td>72B</td>
|
||||||
|
<td>182</td>
|
||||||
|
<td>68.1</td>
|
||||||
|
<td>741</td>
|
||||||
|
<td>67.5</td>
|
||||||
|
<td>83.7</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td><strong>65.8</strong></td>
|
||||||
|
<td>2261.0</td>
|
||||||
|
<td><strong>85.0</strong></td>
|
||||||
|
<td><u>85.6</u></td>
|
||||||
|
<td>56.8</td>
|
||||||
|
<td>49.0</td>
|
||||||
|
<td>80.5</td>
|
||||||
|
<td>91.3</td>
|
||||||
|
<td>39.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>706</td>
|
||||||
|
<td>68.3</td>
|
||||||
|
<td>822</td>
|
||||||
|
<td><u>64.4</u></td>
|
||||||
|
<td>84.8</td>
|
||||||
|
<td>62.8</td>
|
||||||
|
<td>62.8</td>
|
||||||
|
<td>2344.0</td>
|
||||||
|
<td><u>83.6</u></td>
|
||||||
|
<td>84.5</td>
|
||||||
|
<td>56.0</td>
|
||||||
|
<td>50.1</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
<td>93.0</td>
|
||||||
|
<td>39.5</td>
|
||||||
|
<td>19.7</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>2822</strong></td>
|
||||||
|
<td>65.2</td>
|
||||||
|
<td>852*</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td>79.4</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>57.5</td>
|
||||||
|
<td><u>2348.4*</u></td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>82.1</td>
|
||||||
|
<td>49.8*</td>
|
||||||
|
<td>48.1*</td>
|
||||||
|
<td>80.1</td>
|
||||||
|
<td>90.8</td>
|
||||||
|
<td>25.7</td>
|
||||||
|
<td>18.3</td>
|
||||||
|
<td>3.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>2822</strong></td>
|
||||||
|
<td><strong>70.2</strong></td>
|
||||||
|
<td><strong>897*</strong></td>
|
||||||
|
<td><strong>71.9*</strong></td>
|
||||||
|
<td><u>86.9*</u></td>
|
||||||
|
<td><u>67.5</u></td>
|
||||||
|
<td><u>64.0</u></td>
|
||||||
|
<td><strong>2372.0*</strong></td>
|
||||||
|
<td>80.5</td>
|
||||||
|
<td><strong>85.8</strong></td>
|
||||||
|
<td>50.4*</td>
|
||||||
|
<td><u>51.9</u></td>
|
||||||
|
<td>82.0</td>
|
||||||
|
<td>93.5</td>
|
||||||
|
<td><u>41.4*</u></td>
|
||||||
|
<td><u>23.1*</u></td>
|
||||||
|
<td><strong>3.8</strong></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
* We evaluate this benchmark using chain-of-thought prompting. Specifically, for MME, we used this technique only for the Cognition set.
|
||||||
|
|
||||||
|
|
||||||
|
<sup>+</sup> Token Density: number of pixels encoded into each visual token at maximum resolution, i.e., # pixels at maximum resolution / # visual tokens.
|
||||||
|
|
||||||
|
Note: For proprietary models, we calculate token density based on the image encoding charging strategy defined in the official API documentation, which provides an upper-bound estimation.
|
||||||
|
|
||||||
|
|
||||||
|
**Multi-image and Video Understanding**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>BLINK val</th>
|
||||||
|
<th>Mantis Eval</th>
|
||||||
|
<th>MIRB</th>
|
||||||
|
<th>Video-MME (wo / w subs)</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="6" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-20240513</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><strong>68.0</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><strong>71.9/77.2<strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT4V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>54.6</td>
|
||||||
|
<td>62.7</td>
|
||||||
|
<td>53.1</td>
|
||||||
|
<td>59.9/63.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="6" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VITA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>45.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>56.1/58.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-Interleave 14B</td>
|
||||||
|
<td>14B</td>
|
||||||
|
<td>52.6</td>
|
||||||
|
<td>66.4</td>
|
||||||
|
<td>30.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-OneVision-72B</td>
|
||||||
|
<td>72B</td>
|
||||||
|
<td>55.4</td>
|
||||||
|
<td><strong>77.6</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><u>66.2/69.5</u></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MANTIS 8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>49.1</td>
|
||||||
|
<td>59.5</td>
|
||||||
|
<td>34.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>53.2</td>
|
||||||
|
<td>69.6*</td>
|
||||||
|
<td><strong>67.6*</strong></td>
|
||||||
|
<td>63.3/69.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>54.8</td>
|
||||||
|
<td>67.7</td>
|
||||||
|
<td>52.5</td>
|
||||||
|
<td>64.2/66.9</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>53.0</td>
|
||||||
|
<td>69.1</td>
|
||||||
|
<td>53.8</td>
|
||||||
|
<td>60.9/63.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><u>56.7</u></td>
|
||||||
|
<td><u>71.9</u></td>
|
||||||
|
<td><u>58.6</u></td>
|
||||||
|
<td>63.9/67.9</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
* We evaluate officially released checkpoints by ourselves.
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view audio understanding and speech conversation results.</summary>
|
||||||
|
|
||||||
|
**Audio Understanding**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Task</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th colspan="3">ASR (zh)</th>
|
||||||
|
<th colspan="3">ASR (en)</th>
|
||||||
|
<th colspan="2">AST</th>
|
||||||
|
<th>Emotion</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Metric</th>
|
||||||
|
<td></td>
|
||||||
|
<th colspan="3">CER↓</th>
|
||||||
|
<th colspan="3">WER↓</th>
|
||||||
|
<th colspan="2">BLEU↑</th>
|
||||||
|
<th>ACC↑</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Dataset</th>
|
||||||
|
<td></td>
|
||||||
|
<th>AISHELL-1</th>
|
||||||
|
<th>Fleurs zh</th>
|
||||||
|
<th>WenetSpeech test-net</th>
|
||||||
|
<th>LibriSpeech test-clean</th>
|
||||||
|
<th>GigaSpeech</th>
|
||||||
|
<th>TED-LIUM</th>
|
||||||
|
<th>CoVoST en2zh</th>
|
||||||
|
<th>CoVoST zh2en</th>
|
||||||
|
<th>MELD emotion</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-Realtime</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>7.3*</td>
|
||||||
|
<td><u>5.4*</u></td>
|
||||||
|
<td>28.9*</td>
|
||||||
|
<td>2.6*</td>
|
||||||
|
<td>12.9*</td>
|
||||||
|
<td>4.8*</td>
|
||||||
|
<td>37.1*</td>
|
||||||
|
<td>15.7*</td>
|
||||||
|
<td>33.2*</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>4.5*</td>
|
||||||
|
<td>5.9*</td>
|
||||||
|
<td>14.3*</td>
|
||||||
|
<td>2.9*</td>
|
||||||
|
<td>10.6*</td>
|
||||||
|
<td><strong>3.0*</strong></td>
|
||||||
|
<td><u>47.3*</u></td>
|
||||||
|
<td>22.6*</td>
|
||||||
|
<td>48.4*</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Open-Source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-Audio-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>7.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><strong>1.6</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>45.2</td>
|
||||||
|
<td><u>24.4</u></td>
|
||||||
|
<td><strong>55.3</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-Audio-7B-Instruct</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>2.6*</td>
|
||||||
|
<td>6.9*</td>
|
||||||
|
<td><u>10.3*</u></td>
|
||||||
|
<td>3.1*</td>
|
||||||
|
<td><u>9.7</u>*</td>
|
||||||
|
<td>5.9*</td>
|
||||||
|
<td>39.5*</td>
|
||||||
|
<td>22.9*</td>
|
||||||
|
<td>17.4*</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VITA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>2.16</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>8.4</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4-Voice-Base</td>
|
||||||
|
<td>9B</td>
|
||||||
|
<td><u>2.5</u></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>2.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>1.6</strong></td>
|
||||||
|
<td><strong>4.4</strong></td>
|
||||||
|
<td><strong>6.9</strong></td>
|
||||||
|
<td><u>1.7</u></td>
|
||||||
|
<td><strong>8.7</strong></td>
|
||||||
|
<td><strong>3.0</strong></td>
|
||||||
|
<td><strong>48.2</strong></td>
|
||||||
|
<td><strong>27.2</strong></td>
|
||||||
|
<td><u>52.4</u></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
* We evaluate officially released checkpoints by ourselves.<br><br>
|
||||||
|
|
||||||
|
**Speech Generation**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Task</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th colspan="9">SpeechQA</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Metric</th>
|
||||||
|
<th></th>
|
||||||
|
<th colspan="3">ACC↑</th>
|
||||||
|
<th>G-Eval (10 point)↑</th>
|
||||||
|
<th>Semantic ELO score↑</th>
|
||||||
|
<th>Acoustic ELO score↑</th>
|
||||||
|
<th>Overall ELO score↑</th>
|
||||||
|
<th>UTMOS↑</th>
|
||||||
|
<th>ASR-WER↓</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Dataset</th>
|
||||||
|
<th></th>
|
||||||
|
<th>Speech Llama Q.</th>
|
||||||
|
<th>Speech Web Q.</th>
|
||||||
|
<th>Speech Trivia QA</th>
|
||||||
|
<th>Speech AlpacaEval</th>
|
||||||
|
<th colspan="5">AudioArena</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-Realtime</td>
|
||||||
|
<td></td>
|
||||||
|
<td><strong>71.7</strong></td>
|
||||||
|
<td><strong>51.6</strong></td>
|
||||||
|
<td><strong>69.7</strong></td>
|
||||||
|
<td><strong>7.4</strong></td>
|
||||||
|
<td><strong>1157</strong></td>
|
||||||
|
<td><strong>1203</strong></td>
|
||||||
|
<td><strong>1200</strong></td>
|
||||||
|
<td><strong>4.2</strong></td>
|
||||||
|
<td><strong>2.3</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Open-Source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4-Voice</td>
|
||||||
|
<td>9B</td>
|
||||||
|
<td>50.0</td>
|
||||||
|
<td>32.0</td>
|
||||||
|
<td>36.4</td>
|
||||||
|
<td><u>5.1</u></td>
|
||||||
|
<td>999</td>
|
||||||
|
<td>1147</td>
|
||||||
|
<td>1035</td>
|
||||||
|
<td><u>4.1</u></td>
|
||||||
|
<td><u>11.7</u></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Llama-Omni</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>45.3</td>
|
||||||
|
<td>22.9</td>
|
||||||
|
<td>10.7</td>
|
||||||
|
<td>3.9</td>
|
||||||
|
<td>960</td>
|
||||||
|
<td>878</td>
|
||||||
|
<td>897</td>
|
||||||
|
<td>3.2</td>
|
||||||
|
<td>24.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VITA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>46.7</td>
|
||||||
|
<td>28.1</td>
|
||||||
|
<td>23.3</td>
|
||||||
|
<td>2.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Moshi</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>43.7</td>
|
||||||
|
<td>23.8</td>
|
||||||
|
<td>16.7</td>
|
||||||
|
<td>2.4</td>
|
||||||
|
<td>871</td>
|
||||||
|
<td>808</td>
|
||||||
|
<td>875</td>
|
||||||
|
<td>2.8</td>
|
||||||
|
<td>8.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Mini-Omni</td>
|
||||||
|
<td>1B</td>
|
||||||
|
<td>22.0</td>
|
||||||
|
<td>12.8</td>
|
||||||
|
<td>6.9</td>
|
||||||
|
<td>2.5</td>
|
||||||
|
<td>926</td>
|
||||||
|
<td>803</td>
|
||||||
|
<td>865</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
<td>10.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><u>61.0</u></td>
|
||||||
|
<td><u>40.0</u></td>
|
||||||
|
<td><u>40.2</u></td>
|
||||||
|
<td><u>5.1</u></td>
|
||||||
|
<td><u>1088</u></td>
|
||||||
|
<td><u>1163</u></td>
|
||||||
|
<td><u>1131</u></td>
|
||||||
|
<td><strong>4.2</strong></td>
|
||||||
|
<td>9.8</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
All results are from AudioEvals, and the evaluation methods along with further details can be found in <a href="https://github.com/OpenBMB/UltraEval-Audio" target="_blank">AudioEvals</a>.<br><br>
|
||||||
|
|
||||||
|
**End-to-end Voice Cloning**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Task</th>
|
||||||
|
<th colspan="2">Voice cloning</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Metric</th>
|
||||||
|
<th>SIMO↑</th>
|
||||||
|
<th>SIMO↑</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Dataset</th>
|
||||||
|
<th>Seed-TTS test-zh</th>
|
||||||
|
<th>Seed-TTS test-en</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">F5-TTS</td>
|
||||||
|
<td><strong>76</strong></td>
|
||||||
|
<td><strong>67</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">CosyVoice</td>
|
||||||
|
<td><u>75</u></td>
|
||||||
|
<td><u>64</u></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">FireRedTTS</td>
|
||||||
|
<td>63</td>
|
||||||
|
<td>46</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>57</td>
|
||||||
|
<td>47</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view multimodal live streaming results.</summary>
|
||||||
|
|
||||||
|
**Multimodal Live Streaming**: results on StreamingBench
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Real-Time Video Understanding</th>
|
||||||
|
<th>Omni-Source Understanding</th>
|
||||||
|
<th>Contextual Understanding</th>
|
||||||
|
<th>Overall</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="7" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><u>77.4</u></td>
|
||||||
|
<td><strong>67.8</strong></td>
|
||||||
|
<td><strong>51.1</strong></td>
|
||||||
|
<td><strong>70.3</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-202408</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>74.5</td>
|
||||||
|
<td>51.0</td>
|
||||||
|
<td><u>48.0</u></td>
|
||||||
|
<td>64.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude-3.5-Sonnet</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>74.0</td>
|
||||||
|
<td>41.4</td>
|
||||||
|
<td>37.8</td>
|
||||||
|
<td>59.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="9" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VILA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>61.5</td>
|
||||||
|
<td>37.5</td>
|
||||||
|
<td>26.7</td>
|
||||||
|
<td>49.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LongVA</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>63.1</td>
|
||||||
|
<td>35.9</td>
|
||||||
|
<td>30.2</td>
|
||||||
|
<td>50.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-Next-Video-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>69.8</td>
|
||||||
|
<td>41.7</td>
|
||||||
|
<td>34.3</td>
|
||||||
|
<td>56.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>71.2</td>
|
||||||
|
<td>40.7</td>
|
||||||
|
<td>33.1</td>
|
||||||
|
<td>57.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>70.1</td>
|
||||||
|
<td>42.7</td>
|
||||||
|
<td>34.1</td>
|
||||||
|
<td>57.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VITA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>70.9</td>
|
||||||
|
<td>40.8</td>
|
||||||
|
<td>35.8</td>
|
||||||
|
<td>57.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-OneVision-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>74.3</td>
|
||||||
|
<td>40.8</td>
|
||||||
|
<td>31.0</td>
|
||||||
|
<td>58.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternLM-XC2.5-OL-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>75.4</td>
|
||||||
|
<td>46.2</td>
|
||||||
|
<td>33.6</td>
|
||||||
|
<td>60.8</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>72.4</td>
|
||||||
|
<td>40.2</td>
|
||||||
|
<td>33.4</td>
|
||||||
|
<td>57.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>79.9</strong></td>
|
||||||
|
<td><u>53.4</u></td>
|
||||||
|
<td>38.5</td>
|
||||||
|
<td><u>66.0</u></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
### Examples <!-- omit in toc -->
|
||||||
|
|
||||||
|
We deploy MiniCPM-o 2.6 on end devices. The demo video is the raw-speed recording on an iPad Pro and a Web demo.
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<a href="https://www.youtube.com/watch?v=vRIMbxJzStY&t=2s"><img src="./assets/minicpmo2_6/2dot6_o_demo_video_img.png" width=70%></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<br>
|
||||||
|
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="assets/minicpmo2_6/minicpmo2_6_math_intersect.png" alt="math" style="margin-bottom: 5px;">
|
||||||
|
<img src="assets/minicpmo2_6/minicpmo2_6_diagram_train_NN.png" alt="diagram" style="margin-bottom: 5px;">
|
||||||
|
<img src="assets/minicpmo2_6/minicpmo2_6_multi-image_bike.png" alt="bike" style="margin-bottom: 5px;">
|
||||||
|
</div>
|
||||||
|
|
||||||
927
docs/minicpm_o2dot6_zh.md
Normal file
@@ -0,0 +1,927 @@
|
|||||||
|
## MiniCPM-o 2.6
|
||||||
|
|
||||||
|
> Archived at: 2026-02-02
|
||||||
|
|
||||||
|
MiniCPM-o 2.6 是 MiniCPM-o 系列的最新、性能最佳模型。该模型基于 SigLip-400M、Whisper-medium-300M、ChatTTS-200M 和 Qwen2.5-7B 构建,共 8B 参数,通过端到端方式训练和推理。相比 MiniCPM-V 2.6,该模型在性能上有了显著提升,并支持了实时语音对话和多模态流式交互的新功能。MiniCPM-o 2.6 的主要特性包括:
|
||||||
|
|
||||||
|
|
||||||
|
- 🔥 **领先的视觉能力。**
|
||||||
|
MiniCPM-o 2.6 在 OpenCompass 榜单上(综合 8 个主流多模态评测基准)平均得分 70.2,**以 8B 量级的大小在单图理解方面超越了 GPT-4o-202405、Gemini 1.5 Pro 和 Claude 3.5 Sonnet 等主流商用闭源多模态大模型**。此外,它的多图和视频理解表现也**优于 GPT-4V 和 Claude 3.5 Sonnet**,并展现出了优秀的上下文学习能力。
|
||||||
|
|
||||||
|
- 🎙 **出色的语音能力。**
|
||||||
|
MiniCPM-o 2.6 **支持可配置声音的中英双语实时对话**。MiniCPM-o 2.6 在语音理解任务(如 ASR 和 STT 等)**优于 GPT-4o-realtime**,并在语音对话的语义和声学评估中展现了**开源模型中最高的语音生成性能**。它还支持情绪/语速/风格控制、语音克隆、角色扮演等进阶能力。
|
||||||
|
|
||||||
|
- 🎬 **强大的多模态流式交互能力。**
|
||||||
|
作为一项新功能,MiniCPM-o 2.6 能够**接受连续的视频和音频流,并和用户进行实时语音交互**。在针对实时视频理解、全模态视音频理解、多模态上下文理解的综合评测基准 StreamingBench 中,MiniCPM-o 2.6 取得开源社区最佳水平,并**超过了 GPT-4o-202408 和 Claude 3.5 Sonnet**。
|
||||||
|
|
||||||
|
- 💪 **强大的 OCR 能力及其他功能。**
|
||||||
|
MiniCPM-o 2.6 进一步优化了 MiniCPM-V 2.6 的众多视觉理解能力,其可以处理任意长宽比的图像,像素数可达 180 万(如 1344x1344)。在 OCRBench 上取得**25B 以下最佳水平,超过 GPT-4o-202405 等商用闭源模型**。基于最新的 [RLHF-V](https://rlhf-v.github.io/)、[RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) 和 [VisCPM](https://github.com/OpenBMB/VisCPM) 技术,其具备了**可信的多模态行为**,在 MMHal-Bench 上超过了 GPT-4o 和 Claude 3.5,并支持英语、中文、德语、法语、意大利语、韩语等**30多种语言**。
|
||||||
|
|
||||||
|
- 🚀 **卓越的效率。**
|
||||||
|
除了对个人用户友好的模型大小,MiniCPM-o 2.6 还表现出**最先进的视觉 token 密度**(即每个视觉 token 编码的像素数量)。它**仅需 640 个 token 即可处理 180 万像素图像,比大多数模型少 75%**。这一特性优化了模型的推理速度、首 token 延迟、内存占用和功耗。因此,MiniCPM-o 2.6 可以支持 iPad 等终端设备上的高效**多模态实时流式交互**。
|
||||||
|
|
||||||
|
|
||||||
|
- 💫 **易于使用。**
|
||||||
|
MiniCPM-o 2.6 可以通过多种方式轻松使用:(1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-omni/examples/llava/README-minicpmo2.6.md) 支持在本地设备上进行高效的 CPU 推理,(2) [int4](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4) 和 [GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) 格式的量化模型,有 16 种尺寸,(3) [vLLM](#基于-llamacppollamavllm-的高效推理) 支持高吞吐量和内存高效的推理,(4) 通过[LLaMA-Factory](./docs/llamafactory_train_and_infer.md)框架针对新领域和任务进行微调,(5) 使用 [Gradio](#本地-webui-demo-) 快速设置本地 WebUI 演示,(6) 部署于服务器的在线 [demo](https://minicpm-omni-webdemo-us.modelbest.cn/)。
|
||||||
|
|
||||||
|
**模型架构。**
|
||||||
|
|
||||||
|
- **端到端全模态架构。** 通过**端到端**的方式连接和训练不同模态的编/解码模块以充分利用丰富的多模态知识。模型完全使用 CE 损失端到端训练。
|
||||||
|
- **全模态流式机制。** (1) 我们将不同模态的离线编/解码器改造为适用于**流式输入/输出**的在线模块。 (2) 我们针对大语言模型基座设计了**时分复用的全模态流式信息处理机制**,将平行的不同模态的信息流拆分重组为周期性时间片序列。
|
||||||
|
- **可配置的声音方案。** 我们设计了新的多模态系统提示,包含传统文本系统提示词,和**用于指定模型声音的语音系统提示词**。模型可在推理时灵活地通过文字或语音样例控制声音风格,并支持端到端声音克隆和音色创建等高级能力。
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="./assets/minicpm-o-26-framework-v2.png" width=80%>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<br>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### 性能评估 <!-- omit in toc -->
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="./assets/radar.jpg" width=80%>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看视觉理解能力详细评测结果。</summary>
|
||||||
|
|
||||||
|
**图像理解能力**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Token Density<sup>+</sup></th>
|
||||||
|
<th>OpenCompass</th>
|
||||||
|
<th>OCRBench</th>
|
||||||
|
<th>MathVista mini</th>
|
||||||
|
<th>ChartQA</th>
|
||||||
|
<th>MMVet</th>
|
||||||
|
<th>MMStar</th>
|
||||||
|
<th>MME</th>
|
||||||
|
<th>MMB1.1 test</th>
|
||||||
|
<th>AI2D</th>
|
||||||
|
<th>MMMU val</th>
|
||||||
|
<th>HallusionBench</th>
|
||||||
|
<th>TextVQA val</th>
|
||||||
|
<th>DocVQA test</th>
|
||||||
|
<th>MathVerse mini</th>
|
||||||
|
<th>MathVision</th>
|
||||||
|
<th>MMHal Score</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="19" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-20240513</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td><u>69.9</u></td>
|
||||||
|
<td>736</td>
|
||||||
|
<td>61.3</td>
|
||||||
|
<td>85.7</td>
|
||||||
|
<td><strong>69.1</strong></td>
|
||||||
|
<td>63.9</td>
|
||||||
|
<td>2328.7</td>
|
||||||
|
<td>82.2</td>
|
||||||
|
<td>84.6</td>
|
||||||
|
<td><strong>69.2</strong></td>
|
||||||
|
<td><strong>55.0</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>92.8</td>
|
||||||
|
<td><strong>50.2</strong></td>
|
||||||
|
<td><strong>30.4</strong></td>
|
||||||
|
<td><u>3.6</u></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude3.5-Sonnet</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>750</td>
|
||||||
|
<td>67.9</td>
|
||||||
|
<td>788</td>
|
||||||
|
<td>61.6</td>
|
||||||
|
<td><strong>90.8</strong></td>
|
||||||
|
<td>66.0</td>
|
||||||
|
<td>62.2</td>
|
||||||
|
<td>1920.0</td>
|
||||||
|
<td>78.5</td>
|
||||||
|
<td>80.2</td>
|
||||||
|
<td><u>65.9</u></td>
|
||||||
|
<td>49.9</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><strong>95.2</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>64.4</td>
|
||||||
|
<td>754</td>
|
||||||
|
<td>57.7</td>
|
||||||
|
<td>81.3</td>
|
||||||
|
<td>64.0</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>2110.6</td>
|
||||||
|
<td>73.9</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td>45.6</td>
|
||||||
|
<td>73.5</td>
|
||||||
|
<td>86.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>19.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-mini-20240718</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td>64.1</td>
|
||||||
|
<td>785</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
<td>54.8</td>
|
||||||
|
<td>2003.4</td>
|
||||||
|
<td>76.0</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>46.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="19" align="left"><strong>Open Source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Cambrian-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td><u>1820</u></td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td>591</td>
|
||||||
|
<td>50.3</td>
|
||||||
|
<td>75.6</td>
|
||||||
|
<td>53.2</td>
|
||||||
|
<td>54.2</td>
|
||||||
|
<td>2049.9</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>79.5</td>
|
||||||
|
<td>50.4</td>
|
||||||
|
<td>41.6</td>
|
||||||
|
<td>76.7</td>
|
||||||
|
<td>75.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4V-9B</td>
|
||||||
|
<td>13B</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>776</td>
|
||||||
|
<td>51.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>58.0</td>
|
||||||
|
<td>54.8</td>
|
||||||
|
<td>2018.8</td>
|
||||||
|
<td>67.9</td>
|
||||||
|
<td>71.2</td>
|
||||||
|
<td>46.9</td>
|
||||||
|
<td>45.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Pixtral-12B</td>
|
||||||
|
<td>12B</td>
|
||||||
|
<td>256</td>
|
||||||
|
<td>61.0</td>
|
||||||
|
<td>685</td>
|
||||||
|
<td>56.9</td>
|
||||||
|
<td>81.8</td>
|
||||||
|
<td>58.5</td>
|
||||||
|
<td>54.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>72.7</td>
|
||||||
|
<td>79.0</td>
|
||||||
|
<td>51.1</td>
|
||||||
|
<td>47.0</td>
|
||||||
|
<td>75.7</td>
|
||||||
|
<td>90.7</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">DeepSeek-VL2-27B (4B)</td>
|
||||||
|
<td>27B</td>
|
||||||
|
<td>672</td>
|
||||||
|
<td>66.4</td>
|
||||||
|
<td>809</td>
|
||||||
|
<td>63.9</td>
|
||||||
|
<td>86.0</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>61.9</td>
|
||||||
|
<td>2253.0</td>
|
||||||
|
<td>81.2</td>
|
||||||
|
<td>83.8</td>
|
||||||
|
<td>54.0</td>
|
||||||
|
<td>45.3</td>
|
||||||
|
<td><u>84.2</u></td>
|
||||||
|
<td>93.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>67.1</td>
|
||||||
|
<td><u>866</u></td>
|
||||||
|
<td>58.2</td>
|
||||||
|
<td>83.0</td>
|
||||||
|
<td>62.0</td>
|
||||||
|
<td>60.7</td>
|
||||||
|
<td>2326.0</td>
|
||||||
|
<td>81.8</td>
|
||||||
|
<td>83.0</td>
|
||||||
|
<td>54.1</td>
|
||||||
|
<td>50.6</td>
|
||||||
|
<td><strong>84.3</strong></td>
|
||||||
|
<td><u>94.5</u></td>
|
||||||
|
<td>31.9</td>
|
||||||
|
<td>16.3</td>
|
||||||
|
<td>3.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-OneVision-72B</td>
|
||||||
|
<td>72B</td>
|
||||||
|
<td>182</td>
|
||||||
|
<td>68.1</td>
|
||||||
|
<td>741</td>
|
||||||
|
<td>67.5</td>
|
||||||
|
<td>83.7</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td><strong>65.8</strong></td>
|
||||||
|
<td>2261.0</td>
|
||||||
|
<td><strong>85.0</strong></td>
|
||||||
|
<td><u>85.6</u></td>
|
||||||
|
<td>56.8</td>
|
||||||
|
<td>49.0</td>
|
||||||
|
<td>80.5</td>
|
||||||
|
<td>91.3</td>
|
||||||
|
<td>39.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>706</td>
|
||||||
|
<td>68.3</td>
|
||||||
|
<td>822</td>
|
||||||
|
<td><u>64.4</u></td>
|
||||||
|
<td>84.8</td>
|
||||||
|
<td>62.8</td>
|
||||||
|
<td>62.8</td>
|
||||||
|
<td>2344.0</td>
|
||||||
|
<td><u>83.6</u></td>
|
||||||
|
<td>84.5</td>
|
||||||
|
<td>56.0</td>
|
||||||
|
<td>50.1</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
<td>93.0</td>
|
||||||
|
<td>39.5</td>
|
||||||
|
<td>19.7</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>2822</strong></td>
|
||||||
|
<td>65.2</td>
|
||||||
|
<td>852*</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td>79.4</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>57.5</td>
|
||||||
|
<td><u>2348.4*</u></td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>82.1</td>
|
||||||
|
<td>49.8*</td>
|
||||||
|
<td>48.1*</td>
|
||||||
|
<td>80.1</td>
|
||||||
|
<td>90.8</td>
|
||||||
|
<td>25.7</td>
|
||||||
|
<td>18.3</td>
|
||||||
|
<td>3.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>2822</strong></td>
|
||||||
|
<td><strong>70.2</strong></td>
|
||||||
|
<td><strong>897*</strong></td>
|
||||||
|
<td><strong>71.9*</strong></td>
|
||||||
|
<td><u>86.9*</u></td>
|
||||||
|
<td><u>67.5</u></td>
|
||||||
|
<td><u>64.0</u></td>
|
||||||
|
<td><strong>2372.0*</strong></td>
|
||||||
|
<td>80.5</td>
|
||||||
|
<td><strong>85.8</strong></td>
|
||||||
|
<td>50.4*</td>
|
||||||
|
<td><u>51.9</u></td>
|
||||||
|
<td>82.0</td>
|
||||||
|
<td>93.5</td>
|
||||||
|
<td><u>41.4*</u></td>
|
||||||
|
<td><u>23.1*</u></td>
|
||||||
|
<td><strong>3.8</strong></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
* 我们使用思维链提示词来评估这些基准,对于 MME 我们只在 Cognition 任务上使用了思维链。
|
||||||
|
+ Token Density:每个视觉 token 在最大分辨率下编码的像素数,即最大分辨率下的像素数 / 视觉 token 数。
|
||||||
|
|
||||||
|
注意:闭源模型的 Token Density 由 API 收费方式估算得到。
|
||||||
|
|
||||||
|
**多图和视频理解能力**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>BLINK val</th>
|
||||||
|
<th>Mantis Eval</th>
|
||||||
|
<th>MIRB</th>
|
||||||
|
<th>Video-MME (wo / w subs)</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="6" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-20240513</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><strong>68</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><strong>71.9/77.2</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT4V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>54.6</td>
|
||||||
|
<td>62.7</td>
|
||||||
|
<td>53.1</td>
|
||||||
|
<td>59.9/63.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="6" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-Interleave 14B</td>
|
||||||
|
<td>14B</td>
|
||||||
|
<td>52.6</td>
|
||||||
|
<td>66.4</td>
|
||||||
|
<td>30.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-OneVision-72B</td>
|
||||||
|
<td>72B</td>
|
||||||
|
<td>55.4</td>
|
||||||
|
<td><strong>77.6</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><u>66.2/69.5</u></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MANTIS 8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>49.1</td>
|
||||||
|
<td>59.5</td>
|
||||||
|
<td>34.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>53.2</td>
|
||||||
|
<td>69.6*</td>
|
||||||
|
<td><strong>67.6*</strong></td>
|
||||||
|
<td>63.3/69.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>54.8</td>
|
||||||
|
<td>67.7</td>
|
||||||
|
<td>52.5</td>
|
||||||
|
<td>64.2/66.9</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>53</td>
|
||||||
|
<td>69.1</td>
|
||||||
|
<td>53.8</td>
|
||||||
|
<td>60.9/63.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><u>56.7</u></td>
|
||||||
|
<td><u>71.9</u></td>
|
||||||
|
<td><u>58.6</u></td>
|
||||||
|
<td>63.9/67.9</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
* 正式开源模型权重的评测结果。
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看语音理解和生成能力的详细评测结果。</summary>
|
||||||
|
|
||||||
|
**语音理解能力**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Task</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th colspan="3">ASR (zh)</th>
|
||||||
|
<th colspan="3">ASR (en)</th>
|
||||||
|
<th colspan="2">AST</th>
|
||||||
|
<th>Emotion</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Metric</th>
|
||||||
|
<td></td>
|
||||||
|
<th colspan="3">CER↓</th>
|
||||||
|
<th colspan="3">WER↓</th>
|
||||||
|
<th colspan="2">BLEU↑</th>
|
||||||
|
<th>ACC↑</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Dataset</th>
|
||||||
|
<td></td>
|
||||||
|
<th>AISHELL-1</th>
|
||||||
|
<th>Fleurs zh</th>
|
||||||
|
<th>WenetSpeech test-net</th>
|
||||||
|
<th>LibriSpeech test-clean</th>
|
||||||
|
<th>GigaSpeech</th>
|
||||||
|
<th>TED-LIUM</th>
|
||||||
|
<th>CoVoST en2zh</th>
|
||||||
|
<th>CoVoST zh2en</th>
|
||||||
|
<th>MELD emotion</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-Realtime</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>7.3*</td>
|
||||||
|
<td><u>5.4*</u></td>
|
||||||
|
<td>28.9*</td>
|
||||||
|
<td>2.6*</td>
|
||||||
|
<td>12.9*</td>
|
||||||
|
<td>4.8*</td>
|
||||||
|
<td>37.1*</td>
|
||||||
|
<td>15.7*</td>
|
||||||
|
<td>33.2*</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>4.5*</td>
|
||||||
|
<td>5.9*</td>
|
||||||
|
<td>14.3*</td>
|
||||||
|
<td>2.9*</td>
|
||||||
|
<td>10.6*</td>
|
||||||
|
<td><strong>3.0*</strong></td>
|
||||||
|
<td><u>47.3*</u></td>
|
||||||
|
<td>22.6*</td>
|
||||||
|
<td>48.4*</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Open-Source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-Audio-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>7.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><strong>1.6</strong></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>45.2</td>
|
||||||
|
<td><u>24.4</u></td>
|
||||||
|
<td><strong>55.3</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-Audio-7B-Instruct</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>2.6*</td>
|
||||||
|
<td>6.9*</td>
|
||||||
|
<td><u>10.3*</u></td>
|
||||||
|
<td>3.1*</td>
|
||||||
|
<td><u>9.7</u>*</td>
|
||||||
|
<td>5.9*</td>
|
||||||
|
<td>39.5*</td>
|
||||||
|
<td>22.9*</td>
|
||||||
|
<td>17.4*</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4-Voice-Base</td>
|
||||||
|
<td>9B</td>
|
||||||
|
<td><u>2.5</u></td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>2.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>1.6</strong></td>
|
||||||
|
<td><strong>4.4</strong></td>
|
||||||
|
<td><strong>6.9</strong></td>
|
||||||
|
<td><u>1.7</u></td>
|
||||||
|
<td><strong>8.7</strong></td>
|
||||||
|
<td><strong>3.0</strong></td>
|
||||||
|
<td><strong>48.2</strong></td>
|
||||||
|
<td><strong>27.2</strong></td>
|
||||||
|
<td><u>52.4</u></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
* 正式开源模型权重的评测结果。<br><br>
|
||||||
|
|
||||||
|
**语音生成能力。**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Task</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th colspan="9">SpeechQA</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Metric</th>
|
||||||
|
<th></th>
|
||||||
|
<th colspan="3">ACC↑</th>
|
||||||
|
<th>G-Eval (10 point)↑</th>
|
||||||
|
<th>Semantic ELO score↑</th>
|
||||||
|
<th>Acoustic ELO score↑</th>
|
||||||
|
<th>Overall ELO score↑</th>
|
||||||
|
<th>UTMOS↑</th>
|
||||||
|
<th>ASR-WER↓</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Dataset</th>
|
||||||
|
<th></th>
|
||||||
|
<th>Speech Llama Q.</th>
|
||||||
|
<th>Speech Web Q.</th>
|
||||||
|
<th>Speech Trivia QA</th>
|
||||||
|
<th>Speech AlpacaEval</th>
|
||||||
|
<th colspan="5">AudioArena</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-Realtime</td>
|
||||||
|
<td></td>
|
||||||
|
<td><strong>71.7</strong></td>
|
||||||
|
<td><strong>51.6</strong></td>
|
||||||
|
<td><strong>69.7</strong></td>
|
||||||
|
<td><strong>7.4</strong></td>
|
||||||
|
<td><strong>1157</strong></td>
|
||||||
|
<td><strong>1203</strong></td>
|
||||||
|
<td><strong>1200</strong></td>
|
||||||
|
<td><strong>4.2</strong></td>
|
||||||
|
<td><strong>2.3</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Open-Source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4-Voice</td>
|
||||||
|
<td>9B</td>
|
||||||
|
<td>50.0</td>
|
||||||
|
<td>32.0</td>
|
||||||
|
<td>36.4</td>
|
||||||
|
<td><u>5.1</u></td>
|
||||||
|
<td>999</td>
|
||||||
|
<td>1147</td>
|
||||||
|
<td>1035</td>
|
||||||
|
<td><u>4.1</u></td>
|
||||||
|
<td><u>11.7</u></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Llama-Omni</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>45.3</td>
|
||||||
|
<td>22.9</td>
|
||||||
|
<td>10.7</td>
|
||||||
|
<td>3.9</td>
|
||||||
|
<td>960</td>
|
||||||
|
<td>878</td>
|
||||||
|
<td>897</td>
|
||||||
|
<td>3.2</td>
|
||||||
|
<td>24.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VITA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>46.7</td>
|
||||||
|
<td>28.1</td>
|
||||||
|
<td>23.3</td>
|
||||||
|
<td>2.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Moshi</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>43.7</td>
|
||||||
|
<td>23.8</td>
|
||||||
|
<td>16.7</td>
|
||||||
|
<td>2.4</td>
|
||||||
|
<td>871</td>
|
||||||
|
<td>808</td>
|
||||||
|
<td>875</td>
|
||||||
|
<td>2.8</td>
|
||||||
|
<td>8.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Mini-Omni</td>
|
||||||
|
<td>1B</td>
|
||||||
|
<td>22.0</td>
|
||||||
|
<td>12.8</td>
|
||||||
|
<td>6.9</td>
|
||||||
|
<td>2.5</td>
|
||||||
|
<td>926</td>
|
||||||
|
<td>803</td>
|
||||||
|
<td>865</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
<td>10.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><u>61.0</u></td>
|
||||||
|
<td><u>40.0</u></td>
|
||||||
|
<td><u>40.2</u></td>
|
||||||
|
<td><u>5.1</u></td>
|
||||||
|
<td><u>1088</u></td>
|
||||||
|
<td><u>1163</u></td>
|
||||||
|
<td><u>1131</u></td>
|
||||||
|
<td><strong>4.2</strong></td>
|
||||||
|
<td>9.8</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
所有的结果都基于 <a href="https://github.com/OpenBMB/UltraEval-Audio" target="_blank">AudioEvals</a>。<br><br>
|
||||||
|
|
||||||
|
**端到端声音克隆能力。**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Task</th>
|
||||||
|
<th colspan="2">TTS</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Metric</th>
|
||||||
|
<th>SIMO↑</th>
|
||||||
|
<th>SIMO↑</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Dataset</th>
|
||||||
|
<th>Seed-TTS test-zh</th>
|
||||||
|
<th>Seed-TTS test-en</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">F5-TTS</td>
|
||||||
|
<td><strong>76</strong></td>
|
||||||
|
<td><strong>67</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">CosyVoice</td>
|
||||||
|
<td><u>75</u></td>
|
||||||
|
<td><u>64</u></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">FireRedTTS</td>
|
||||||
|
<td>63</td>
|
||||||
|
<td>46</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>57</td>
|
||||||
|
<td>47</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看多模态流式交互能力评测详细结果。</summary>
|
||||||
|
|
||||||
|
**多模态流式交互能力**: StreamingBench 分数
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Real-Time Video Understanding</th>
|
||||||
|
<th>Omni-Source Understanding</th>
|
||||||
|
<th>Contextual Understanding</th>
|
||||||
|
<th>Overall</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="7" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td><u>77.4</u></td>
|
||||||
|
<td><strong>67.8</strong></td>
|
||||||
|
<td><strong>51.1</strong></td>
|
||||||
|
<td><strong>70.3</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-202408</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>74.5</td>
|
||||||
|
<td>51.0</td>
|
||||||
|
<td><u>48.0</u></td>
|
||||||
|
<td>64.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude-3.5-Sonnet</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>74.0</td>
|
||||||
|
<td>41.4</td>
|
||||||
|
<td>37.8</td>
|
||||||
|
<td>59.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="9" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VILA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>61.5</td>
|
||||||
|
<td>37.5</td>
|
||||||
|
<td>26.7</td>
|
||||||
|
<td>49.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LongVA</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>63.1</td>
|
||||||
|
<td>35.9</td>
|
||||||
|
<td>30.2</td>
|
||||||
|
<td>50.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-Next-Video-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>69.8</td>
|
||||||
|
<td>41.7</td>
|
||||||
|
<td>34.3</td>
|
||||||
|
<td>56.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>71.2</td>
|
||||||
|
<td>40.7</td>
|
||||||
|
<td>33.1</td>
|
||||||
|
<td>57.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>70.1</td>
|
||||||
|
<td>42.7</td>
|
||||||
|
<td>34.1</td>
|
||||||
|
<td>57.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VITA-1.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>70.9</td>
|
||||||
|
<td>40.8</td>
|
||||||
|
<td>35.8</td>
|
||||||
|
<td>57.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-OneVision-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>74.3</td>
|
||||||
|
<td>40.8</td>
|
||||||
|
<td>31.0</td>
|
||||||
|
<td>58.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternLM-XC2.5-OL-7B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>75.4</td>
|
||||||
|
<td>46.2</td>
|
||||||
|
<td>33.6</td>
|
||||||
|
<td>60.8</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>72.4</td>
|
||||||
|
<td>40.2</td>
|
||||||
|
<td>33.4</td>
|
||||||
|
<td>57.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>79.9</strong></td>
|
||||||
|
<td><u>53.4</u></td>
|
||||||
|
<td>38.5</td>
|
||||||
|
<td><u>66.0</u></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
### 典型示例 <!-- omit in toc -->
|
||||||
|
|
||||||
|
以下为 MiniCPM-o 2.6 的 iPad Pro 实机演示和 web demo 演示样例:
|
||||||
|
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<a href="https://www.youtube.com/watch?v=vRIMbxJzStY&t=2s"><img src="./assets/minicpmo2_6/2dot6_o_demo_video_img.png" width=70%></a>
|
||||||
|
</div>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="assets/minicpmo2_6/minicpmo2_6_math_intersect.png" alt="math" style="margin-bottom: 5px;">
|
||||||
|
<img src="assets/minicpmo2_6/minicpmo2_6_diagram_train_NN.png" alt="diagram" style="margin-bottom: 5px;">
|
||||||
|
<img src="assets/minicpmo2_6/minicpmo2_6_multi-image_bike.png" alt="bike" style="margin-bottom: 5px;">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
953
docs/minicpm_v2dot6_en.md
Normal file
@@ -0,0 +1,953 @@
|
|||||||
|
## MiniCPM-V 2.6
|
||||||
|
|
||||||
|
> Archived at: 2025-01-13
|
||||||
|
|
||||||
|
**MiniCPM-V 2.6** is the latest and most capable model in the MiniCPM-V series. The model is built on SigLip-400M and Qwen2-7B with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-Llama3-V 2.5, and introduces new features for multi-image and video understanding. Notable features of MiniCPM-V 2.6 include:
|
||||||
|
|
||||||
|
- 🔥 **Leading Performance.**
|
||||||
|
MiniCPM-V 2.6 achieves an average score of 65.2 on the latest version of OpenCompass, a comprehensive evaluation over 8 popular benchmarks. **With only 8B parameters, it surpasses widely used proprietary models like GPT-4o mini, GPT-4V, Gemini 1.5 Pro, and Claude 3.5 Sonnet** for single image understanding.
|
||||||
|
|
||||||
|
- 🖼️ **Multi Image Understanding and In-context Learning.** MiniCPM-V 2.6 can also perform **conversation and reasoning over multiple images**. It achieves **state-of-the-art performance** on popular multi-image benchmarks such as Mantis-Eval, BLINK, Mathverse mv and Sciverse mv, and also shows promising in-context learning capability.
|
||||||
|
|
||||||
|
- 🎬 **Video Understanding.** MiniCPM-V 2.6 can also **accept video inputs**, performing conversation and providing dense captions for spatial-temporal information. It outperforms **GPT-4V, Claude 3.5 Sonnet and LLaVA-NeXT-Video-34B** on Video-MME with/without subtitles.
|
||||||
|
|
||||||
|
- 💪 **Strong OCR Capability and Others.**
|
||||||
|
MiniCPM-V 2.6 can process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344). It achieves **state-of-the-art performance on OCRBench, surpassing proprietary models such as GPT-4o, GPT-4V, and Gemini 1.5 Pro**.
|
||||||
|
Based on the latest [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) and [VisCPM](https://github.com/OpenBMB/VisCPM) techniques, it features **trustworthy behaviors**, with significantly lower hallucination rates than GPT-4o and GPT-4V on Object HalBench, and supports **multilingual capabilities** on English, Chinese, German, French, Italian, Korean, etc.
|
||||||
|
|
||||||
|
|
||||||
|
- 🚀 **Superior Efficiency.**
|
||||||
|
In addition to its friendly size, MiniCPM-V 2.6 also shows **state-of-the-art token density** (i.e., number of pixels encoded into each visual token). **It produces only 640 tokens when processing a 1.8M pixel image, which is 75% fewer than most models**. This directly improves the inference speed, first-token latency, memory usage, and power consumption. As a result, MiniCPM-V 2.6 can efficiently support **real-time video understanding** on end-side devices such as iPad.
|
||||||
|
|
||||||
|
- 💫 **Easy Usage.**
|
||||||
|
MiniCPM-V 2.6 can be easily used in various ways: (1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpmv-main/examples/llava/README-minicpmv2.6.md) and [ollama](https://github.com/OpenBMB/ollama/blob/minicpm-v2.6/examples/minicpm-v2.6/README.md) support for efficient CPU inference on local devices, (2) [int4](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4) and [GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) format quantized models in 16 sizes, (3) [vLLM](#inference-with-vllm) support for high-throughput and memory-efficient inference, (4) fine-tuning on new domains and tasks, (5) quick local WebUI demo setup with [Gradio](#chat-with-our-demo-on-gradio), and (6) online web [demo](http://120.92.209.146:8887/).
|
||||||
|
|
||||||
|
### Evaluation <!-- omit in toc -->
|
||||||
|
<div align="center">
|
||||||
|
<img src="../assets/radar_final.png" width="66%" />
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view single image results on OpenCompass, MME, MMVet, OCRBench, MMMU, MathVista, MMB, AI2D, TextVQA, DocVQA, HallusionBench, Object HalBench. </summary>
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Token Density<sup>+</sup></th>
|
||||||
|
<th>OpenCompass</th>
|
||||||
|
<th>MME</th>
|
||||||
|
<th>MMVet</th>
|
||||||
|
<th>OCRBench</th>
|
||||||
|
<th>MMMU val</th>
|
||||||
|
<th>MathVista mini</th>
|
||||||
|
<th>MMB1.1 test</th>
|
||||||
|
<th>AI2D</th>
|
||||||
|
<th>TextVQA val</th>
|
||||||
|
<th>DocVQA test</th>
|
||||||
|
<th>HallusionBench</th>
|
||||||
|
<th>Object HalBench</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="15" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td>69.9</td>
|
||||||
|
<td>2328.7</td>
|
||||||
|
<td>69.1</td>
|
||||||
|
<td>736</td>
|
||||||
|
<td>69.2</td>
|
||||||
|
<td>61.3</td>
|
||||||
|
<td>82.2</td>
|
||||||
|
<td>84.6</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>92.8</td>
|
||||||
|
<td>55.0</td>
|
||||||
|
<td>17.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude 3.5 Sonnet</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>750</td>
|
||||||
|
<td>67.9</td>
|
||||||
|
<td>1920.0</td>
|
||||||
|
<td>66.0</td>
|
||||||
|
<td>788</td>
|
||||||
|
<td>65.9</td>
|
||||||
|
<td>61.6</td>
|
||||||
|
<td>78.5</td>
|
||||||
|
<td>80.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>95.2</td>
|
||||||
|
<td>49.9</td>
|
||||||
|
<td>13.8</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>64.4</td>
|
||||||
|
<td>2110.6</td>
|
||||||
|
<td>64.0</td>
|
||||||
|
<td>754</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td>57.7</td>
|
||||||
|
<td>73.9</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
<td>73.5</td>
|
||||||
|
<td>86.5</td>
|
||||||
|
<td>45.6</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o mini</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td>64.1</td>
|
||||||
|
<td>2003.4</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
<td>785</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
<td>76.0</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>46.1</td>
|
||||||
|
<td>12.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td>63.5</td>
|
||||||
|
<td>2070.2</td>
|
||||||
|
<td>67.5</td>
|
||||||
|
<td>656</td>
|
||||||
|
<td>61.7</td>
|
||||||
|
<td>54.7</td>
|
||||||
|
<td>79.8</td>
|
||||||
|
<td>78.6</td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>87.2</td>
|
||||||
|
<td>43.9</td>
|
||||||
|
<td>14.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Step-1V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>59.5</td>
|
||||||
|
<td>2206.4</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
<td>625</td>
|
||||||
|
<td>49.9</td>
|
||||||
|
<td>44.8</td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>79.2</td>
|
||||||
|
<td>71.6</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>48.4</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen-VL-Max</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td>2281.7</td>
|
||||||
|
<td>61.8</td>
|
||||||
|
<td>684</td>
|
||||||
|
<td>52.0</td>
|
||||||
|
<td>43.4</td>
|
||||||
|
<td>74.6</td>
|
||||||
|
<td>75.7</td>
|
||||||
|
<td>79.5</td>
|
||||||
|
<td>93.1</td>
|
||||||
|
<td>41.2</td>
|
||||||
|
<td>13.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="15" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-Yi-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>157</td>
|
||||||
|
<td>55.0</td>
|
||||||
|
<td>2006.5</td>
|
||||||
|
<td>50.7</td>
|
||||||
|
<td>574</td>
|
||||||
|
<td>48.8</td>
|
||||||
|
<td>40.4</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>78.9</td>
|
||||||
|
<td>69.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>34.8</td>
|
||||||
|
<td>12.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Mini-Gemini-HD-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>157</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>2141.0</td>
|
||||||
|
<td>59.3</td>
|
||||||
|
<td>518</td>
|
||||||
|
<td>48.0</td>
|
||||||
|
<td>43.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>80.5</td>
|
||||||
|
<td>74.1</td>
|
||||||
|
<td>78.9</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Cambrian-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>1820</td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td>2049.9</td>
|
||||||
|
<td>53.2</td>
|
||||||
|
<td>591</td>
|
||||||
|
<td>50.4</td>
|
||||||
|
<td>50.3</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>79.5</td>
|
||||||
|
<td>76.7</td>
|
||||||
|
<td>75.5</td>
|
||||||
|
<td>41.6</td>
|
||||||
|
<td>14.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4V-9B</td>
|
||||||
|
<td>13B</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>2018.8</td>
|
||||||
|
<td>58.0</td>
|
||||||
|
<td>776</td>
|
||||||
|
<td>46.9</td>
|
||||||
|
<td>51.1</td>
|
||||||
|
<td>67.9</td>
|
||||||
|
<td>71.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>45.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>706</td>
|
||||||
|
<td>64.1</td>
|
||||||
|
<td>2215.1</td>
|
||||||
|
<td>54.3</td>
|
||||||
|
<td>794</td>
|
||||||
|
<td><strong>51.2</strong></td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td><strong>79.4</strong></td>
|
||||||
|
<td><strong>83.6</strong></td>
|
||||||
|
<td>77.4</td>
|
||||||
|
<td><strong>91.6</strong></td>
|
||||||
|
<td>45.0</td>
|
||||||
|
<td>21.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-Llama-V 2.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>1882</td>
|
||||||
|
<td>58.8</td>
|
||||||
|
<td>2024.6</td>
|
||||||
|
<td>52.8</td>
|
||||||
|
<td>725</td>
|
||||||
|
<td>45.8</td>
|
||||||
|
<td>54.3</td>
|
||||||
|
<td>72.0</td>
|
||||||
|
<td>78.4</td>
|
||||||
|
<td>76.6</td>
|
||||||
|
<td>84.8</td>
|
||||||
|
<td>42.4</td>
|
||||||
|
<td>10.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr style="background-color: #e6f2ff;">
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>2822</strong></td>
|
||||||
|
<td><strong>65.2</strong></td>
|
||||||
|
<td><strong>2348.4</strong>*</td>
|
||||||
|
<td><strong>60.0</strong></td>
|
||||||
|
<td><strong>852</strong>*</td>
|
||||||
|
<td>49.8*</td>
|
||||||
|
<td><strong>60.6</strong></td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>82.1</td>
|
||||||
|
<td><strong>80.1</strong></td>
|
||||||
|
<td>90.8</td>
|
||||||
|
<td><strong>48.1</strong>*</td>
|
||||||
|
<td><strong>8.2</strong></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
* We evaluate this benchmark using chain-of-thought prompting. Specifically, for MME, we used this technique only for the Cognition set.
|
||||||
|
|
||||||
|
<sup>+</sup> Token Density: number of pixels encoded into each visual token at maximum resolution, i.e., # pixels at maximum resolution / # visual tokens.
|
||||||
|
|
||||||
|
Note: For proprietary models, we calculate token density based on the image encoding charging strategy defined in the official API documentation, which provides an upper-bound estimation.
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view multi-image results on Mantis Eval, BLINK, Mathverse mv, Sciverse mv, MIRB.</summary>
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Mantis Eval</th>
|
||||||
|
<th>BLINK val</th>
|
||||||
|
<th>Mathverse mv</th>
|
||||||
|
<th>Sciverse mv</th>
|
||||||
|
<th>MIRB</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="7" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>62.7</td>
|
||||||
|
<td>54.6</td>
|
||||||
|
<td>60.3</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
<td>53.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-Interleave-14B</td>
|
||||||
|
<td>14B</td>
|
||||||
|
<td>66.4</td>
|
||||||
|
<td>52.6</td>
|
||||||
|
<td>32.7</td>
|
||||||
|
<td>30.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="7" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Emu2-Chat</td>
|
||||||
|
<td>37B</td>
|
||||||
|
<td>37.8</td>
|
||||||
|
<td>36.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>27.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">CogVLM</td>
|
||||||
|
<td>17B</td>
|
||||||
|
<td>45.2</td>
|
||||||
|
<td>41.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VPG-C</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
<td>43.1</td>
|
||||||
|
<td>24.3</td>
|
||||||
|
<td>23.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VILA 8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>51.2</td>
|
||||||
|
<td>39.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>36.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternLM-XComposer-2.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>53.1*</td>
|
||||||
|
<td>48.9</td>
|
||||||
|
<td>32.1*</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>42.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>59.0*</td>
|
||||||
|
<td>50.9</td>
|
||||||
|
<td>30.5*</td>
|
||||||
|
<td>34.4*</td>
|
||||||
|
<td><strong>56.9*</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr style="background-color: #e6f2ff;">
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>69.1</strong></td>
|
||||||
|
<td><strong>53.0</strong></td>
|
||||||
|
<td><strong>84.9</strong></td>
|
||||||
|
<td><strong>74.9</strong></td>
|
||||||
|
<td>53.8</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
* We evaluate the officially released checkpoint by ourselves.
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view video results on Video-MME and Video-ChatGPT.</summary>
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th colspan="2">Video-MME</th>
|
||||||
|
<th colspan="5">Video-ChatGPT</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left"></th>
|
||||||
|
<th></th>
|
||||||
|
<th>w/o subs</th>
|
||||||
|
<th>w subs</th>
|
||||||
|
<th>Correctness</th>
|
||||||
|
<th>Detail</th>
|
||||||
|
<th>Context</th>
|
||||||
|
<th>Temporal</th>
|
||||||
|
<th>Consistency</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="9" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude 3.5 Sonnet</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>62.9</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>59.9</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="9" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-7B</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.39</td>
|
||||||
|
<td>3.29</td>
|
||||||
|
<td>3.92</td>
|
||||||
|
<td>2.60</td>
|
||||||
|
<td>3.12</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.29</td>
|
||||||
|
<td>3.23</td>
|
||||||
|
<td>3.83</td>
|
||||||
|
<td>2.51</td>
|
||||||
|
<td>3.47</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">CogVLM2-Video</td>
|
||||||
|
<td>12B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.49</td>
|
||||||
|
<td><strong>3.46</strong></td>
|
||||||
|
<td>3.23</td>
|
||||||
|
<td><strong>2.98</strong></td>
|
||||||
|
<td><strong>3.64</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LongVA</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
<td>54.3</td>
|
||||||
|
<td>3.05</td>
|
||||||
|
<td>3.09</td>
|
||||||
|
<td>3.77</td>
|
||||||
|
<td>2.44</td>
|
||||||
|
<td><strong>3.64</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>54.0</td>
|
||||||
|
<td>56.9</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternLM-XComposer-2.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>55.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-Video</td>
|
||||||
|
<td>32B</td>
|
||||||
|
<td>60.2</td>
|
||||||
|
<td>63.0</td>
|
||||||
|
<td>3.48</td>
|
||||||
|
<td>3.37</td>
|
||||||
|
<td><strong>3.95</strong></td>
|
||||||
|
<td>2.64</td>
|
||||||
|
<td>3.28</td>
|
||||||
|
</tr>
|
||||||
|
<tr style="background-color: #e6f2ff;">
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>60.9</strong></td>
|
||||||
|
<td><strong>63.6</strong></td>
|
||||||
|
<td><strong>3.59</strong></td>
|
||||||
|
<td>3.28</td>
|
||||||
|
<td>3.93</td>
|
||||||
|
<td>2.73</td>
|
||||||
|
<td>3.62</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view few-shot results on TextVQA, VizWiz, VQAv2, OK-VQA.</summary>
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Shot</th>
|
||||||
|
<th>TextVQA val</th>
|
||||||
|
<th>VizWiz test-dev</th>
|
||||||
|
<th>VQAv2 test-dev</th>
|
||||||
|
<th>OK-VQA val</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">Flamingo</td>
|
||||||
|
<td rowspan="3">80B</td>
|
||||||
|
<td>0*</td>
|
||||||
|
<td>35.0</td>
|
||||||
|
<td>31.6</td>
|
||||||
|
<td>56.3</td>
|
||||||
|
<td>40.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>36.5</td>
|
||||||
|
<td>39.6</td>
|
||||||
|
<td>63.1</td>
|
||||||
|
<td><strong>57.4</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>37.3</td>
|
||||||
|
<td>44.8</td>
|
||||||
|
<td>65.6</td>
|
||||||
|
<td>57.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">IDEFICS</td>
|
||||||
|
<td rowspan="3">80B</td>
|
||||||
|
<td>0*</td>
|
||||||
|
<td>30.9</td>
|
||||||
|
<td>36.0</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>45.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>34.3</td>
|
||||||
|
<td>40.4</td>
|
||||||
|
<td>63.6</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>35.7</td>
|
||||||
|
<td>46.1</td>
|
||||||
|
<td>64.8</td>
|
||||||
|
<td>55.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">OmniCorpus</td>
|
||||||
|
<td rowspan="3">7B</td>
|
||||||
|
<td>0*</td>
|
||||||
|
<td>43.0</td>
|
||||||
|
<td>49.8</td>
|
||||||
|
<td>63.2</td>
|
||||||
|
<td>45.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>45.4</td>
|
||||||
|
<td>51.3</td>
|
||||||
|
<td>64.5</td>
|
||||||
|
<td>46.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>45.6</td>
|
||||||
|
<td>52.2</td>
|
||||||
|
<td>64.7</td>
|
||||||
|
<td>46.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">Emu2</td>
|
||||||
|
<td rowspan="3">37B</td>
|
||||||
|
<td>0</td>
|
||||||
|
<td>26.4</td>
|
||||||
|
<td>40.4</td>
|
||||||
|
<td>33.5</td>
|
||||||
|
<td>26.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>48.2</td>
|
||||||
|
<td>54.6</td>
|
||||||
|
<td>67.0</td>
|
||||||
|
<td>53.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>49.3</td>
|
||||||
|
<td>54.7</td>
|
||||||
|
<td>67.8</td>
|
||||||
|
<td>54.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="2">MM1</td>
|
||||||
|
<td rowspan="2">30B</td>
|
||||||
|
<td>0</td>
|
||||||
|
<td>26.2</td>
|
||||||
|
<td>40.4</td>
|
||||||
|
<td>48.9</td>
|
||||||
|
<td>26.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>49.3</td>
|
||||||
|
<td>54.7</td>
|
||||||
|
<td><strong>70.9</strong></td>
|
||||||
|
<td>54.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr style="background-color: #e6f2ff;">
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">MiniCPM-V 2.6<sup>+</sup></td>
|
||||||
|
<td rowspan="3">8B</td>
|
||||||
|
<td>0</td>
|
||||||
|
<td>43.9</td>
|
||||||
|
<td>33.8</td>
|
||||||
|
<td>45.4</td>
|
||||||
|
<td>23.9</td>
|
||||||
|
</tr>
|
||||||
|
<tr style="background-color: #e6f2ff;">
|
||||||
|
<td>4</td>
|
||||||
|
<td>63.6</td>
|
||||||
|
<td>60.5</td>
|
||||||
|
<td>65.5</td>
|
||||||
|
<td>50.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr style="background-color: #e6f2ff;">
|
||||||
|
<td>8</td>
|
||||||
|
<td><strong>64.6</strong></td>
|
||||||
|
<td><strong>63.4</strong></td>
|
||||||
|
<td>68.2</td>
|
||||||
|
<td>51.4</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
* denotes zero image shot and two additional text shots following Flamingo.
|
||||||
|
|
||||||
|
<sup>+</sup> We evaluate the pretraining ckpt without SFT.
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### Examples <!-- omit in toc -->
|
||||||
|
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv2_6/multi_img-bike.png" alt="Bike" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/multi_img-menu.png" alt="Menu" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/multi_img-code.png" alt="Code" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/ICL-Mem.png" alt="Mem" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/multiling-medal.png" alt="medal" style="margin-bottom: 10px;">
|
||||||
|
</div>
|
||||||
|
<details>
|
||||||
|
<summary>Click to view more cases.</summary>
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv2_6/ICL-elec.png" alt="elec" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/multiling-olympic.png" alt="Menu" style="margin-bottom: 10px;">
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
We deploy MiniCPM-V 2.6 on end devices. The demo video is the raw screen recording on an iPad Pro without editing.
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/gif_cases/ai.gif" width=32%/>
|
||||||
|
|
||||||
|
<img src="../assets/gif_cases/beer.gif" width=32%/>
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/gif_cases/ticket.gif" width=32%/>
|
||||||
|
|
||||||
|
<img src="../assets/gif_cases/wfh.gif" width=32%/>
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<video src="https://github.com/user-attachments/assets/21f4b818-ede1-4822-920e-91281725c830" width="360" /> </video>
|
||||||
|
<!-- <video src="https://github.com/user-attachments/assets/c835f757-206b-4d9c-8e36-70d67b453628" width="360" /> </video> -->
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Multi-turn Conversation
|
||||||
|
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="../assets/airplane.jpeg" width="500px">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from transformers import AutoModel, AutoTokenizer
|
||||||
|
|
||||||
|
torch.manual_seed(0)
|
||||||
|
|
||||||
|
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
|
||||||
|
attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
|
||||||
|
model = model.eval().cuda()
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
|
||||||
|
|
||||||
|
image = Image.open('./assets/airplane.jpeg').convert('RGB')
|
||||||
|
|
||||||
|
# First round chat
|
||||||
|
question = "Tell me the model of this aircraft."
|
||||||
|
msgs = [{'role': 'user', 'content': [image, question]}]
|
||||||
|
|
||||||
|
answer = model.chat(
|
||||||
|
image=None,
|
||||||
|
msgs=msgs,
|
||||||
|
tokenizer=tokenizer
|
||||||
|
)
|
||||||
|
print(answer)
|
||||||
|
|
||||||
|
# Second round chat
|
||||||
|
# pass history context of multi-turn conversation
|
||||||
|
msgs.append({"role": "assistant", "content": [answer]})
|
||||||
|
msgs.append({"role": "user", "content": ["Introduce something about Airbus A380."]})
|
||||||
|
|
||||||
|
answer = model.chat(
|
||||||
|
image=None,
|
||||||
|
msgs=msgs,
|
||||||
|
tokenizer=tokenizer
|
||||||
|
)
|
||||||
|
print(answer)
|
||||||
|
```
|
||||||
|
|
||||||
|
You could get the following output:
|
||||||
|
|
||||||
|
```
|
||||||
|
"The aircraft in the image is an Airbus A380, which can be identified by its large size, double-deck structure, and the distinctive shape of its wings and engines. The A380 is a wide-body aircraft known for being the world's largest passenger airliner, designed for long-haul flights. It has four engines, which are characteristic of large commercial aircraft. The registration number on the aircraft can also provide specific information about the model if looked up in an aviation database."
|
||||||
|
|
||||||
|
"The Airbus A380 is a double-deck, wide-body, four-engine jet airliner made by Airbus. It is the world's largest passenger airliner and is known for its long-haul capabilities. The aircraft was developed to improve efficiency and comfort for passengers traveling over long distances. It has two full-length passenger decks, which can accommodate more passengers than a typical single-aisle airplane. The A380 has been operated by airlines such as Lufthansa, Singapore Airlines, and Emirates, among others. It is widely recognized for its unique design and significant impact on the aviation industry."
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Multi-image Understanding
|
||||||
|
<details>
|
||||||
|
<summary> Click to view Python example of MiniCPM-V 2.6 multi-image understanding </summary>
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from transformers import AutoModel, AutoTokenizer
|
||||||
|
|
||||||
|
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
|
||||||
|
attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
|
||||||
|
model = model.eval().cuda()
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
|
||||||
|
|
||||||
|
image1 = Image.open('image1.jpg').convert('RGB')
|
||||||
|
image2 = Image.open('image2.jpg').convert('RGB')
|
||||||
|
question = 'Compare image 1 and image 2, tell me about the differences between image 1 and image 2.'
|
||||||
|
|
||||||
|
msgs = [{'role': 'user', 'content': [image1, image2, question]}]
|
||||||
|
|
||||||
|
answer = model.chat(
|
||||||
|
image=None,
|
||||||
|
msgs=msgs,
|
||||||
|
tokenizer=tokenizer
|
||||||
|
)
|
||||||
|
print(answer)
|
||||||
|
```
|
||||||
|
</details>
|
||||||
|
|
||||||
|
#### Few-shot In-Context-Learning
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary> Click to view Python example of MiniCPM-V 2.6 few-shot in-context-learning example </summary>
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from transformers import AutoModel, AutoTokenizer
|
||||||
|
|
||||||
|
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
|
||||||
|
attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
|
||||||
|
model = model.eval().cuda()
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
|
||||||
|
|
||||||
|
question = "production date"
|
||||||
|
image1 = Image.open('example1.jpg').convert('RGB')
|
||||||
|
answer1 = "2023.08.04"
|
||||||
|
image2 = Image.open('example2.jpg').convert('RGB')
|
||||||
|
answer2 = "2007.04.24"
|
||||||
|
image_test = Image.open('test.jpg').convert('RGB')
|
||||||
|
|
||||||
|
msgs = [
|
||||||
|
{'role': 'user', 'content': [image1, question]}, {'role': 'assistant', 'content': [answer1]},
|
||||||
|
{'role': 'user', 'content': [image2, question]}, {'role': 'assistant', 'content': [answer2]},
|
||||||
|
{'role': 'user', 'content': [image_test, question]}
|
||||||
|
]
|
||||||
|
|
||||||
|
answer = model.chat(
|
||||||
|
image=None,
|
||||||
|
msgs=msgs,
|
||||||
|
tokenizer=tokenizer
|
||||||
|
)
|
||||||
|
print(answer)
|
||||||
|
```
|
||||||
|
</details>
|
||||||
|
|
||||||
|
#### Video understanding
|
||||||
|
<details>
|
||||||
|
<summary> Click to view Python example of MiniCPM-V 2.6 video understanding </summary>
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from transformers import AutoModel, AutoTokenizer
|
||||||
|
from decord import VideoReader, cpu # pip install decord
|
||||||
|
|
||||||
|
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
|
||||||
|
attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
|
||||||
|
model = model.eval().cuda()
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
|
||||||
|
|
||||||
|
MAX_NUM_FRAMES=64 # if cuda OOM set a smaller number
|
||||||
|
|
||||||
|
def encode_video(video_path):
|
||||||
|
def uniform_sample(l, n):
|
||||||
|
gap = len(l) / n
|
||||||
|
idxs = [int(i * gap + gap / 2) for i in range(n)]
|
||||||
|
return [l[i] for i in idxs]
|
||||||
|
|
||||||
|
vr = VideoReader(video_path, ctx=cpu(0))
|
||||||
|
sample_fps = round(vr.get_avg_fps() / 1) # FPS
|
||||||
|
frame_idx = [i for i in range(0, len(vr), sample_fps)]
|
||||||
|
if len(frame_idx) > MAX_NUM_FRAMES:
|
||||||
|
frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
|
||||||
|
frames = vr.get_batch(frame_idx).asnumpy()
|
||||||
|
frames = [Image.fromarray(v.astype('uint8')) for v in frames]
|
||||||
|
print('num frames:', len(frames))
|
||||||
|
return frames
|
||||||
|
|
||||||
|
video_path="video_test.mp4"
|
||||||
|
frames = encode_video(video_path)
|
||||||
|
question = "Describe the video"
|
||||||
|
msgs = [
|
||||||
|
{'role': 'user', 'content': frames + [question]},
|
||||||
|
]
|
||||||
|
|
||||||
|
# Set decode params for video
|
||||||
|
params = {}
|
||||||
|
params["use_image_id"] = False
|
||||||
|
params["max_slice_nums"] = 2 # use 1 if cuda OOM and video resolution larger than 448*448
|
||||||
|
|
||||||
|
answer = model.chat(
|
||||||
|
image=None,
|
||||||
|
msgs=msgs,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**params
|
||||||
|
)
|
||||||
|
print(answer)
|
||||||
|
```
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### Model Zoo
|
||||||
|
|
||||||
|
| Model | Device | Memory |          Description | Download |
|
||||||
|
|:-----------|:--:|:-----------:|:-------------------|:---------------:|
|
||||||
|
| MiniCPM-V 2.6| GPU | 17 GB | Strong end-side multimodal performance for single image, multi-image and video understanding. | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6) [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6) |
|
||||||
|
| MiniCPM-V 2.6 gguf | CPU | 6 GB | The gguf version, lower memory usage and faster inference. | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6-gguf) |
|
||||||
|
| MiniCPM-V 2.6 int4 | GPU | 7 GB | The int4 quantized version, lower GPU memory usage. | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4) [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6-int4) |
|
||||||
773
docs/minicpm_v2dot6_zh.md
Normal file
@@ -0,0 +1,773 @@
|
|||||||
|
## MiniCPM-V 2.6
|
||||||
|
|
||||||
|
> Archived at: 2025-08-25
|
||||||
|
|
||||||
|
**MiniCPM-V 2.6** 是 MiniCPM-V 系列中最新、性能最佳的模型。该模型基于 SigLip-400M 和 Qwen2-7B 构建,共 8B 参数。与 MiniCPM-Llama3-V 2.5 相比,MiniCPM-V 2.6 性能提升显著,并引入了多图和视频理解的新功能。MiniCPM-V 2.6 的主要特点包括:
|
||||||
|
|
||||||
|
|
||||||
|
- 🔥 **领先的性能。**
|
||||||
|
MiniCPM-V 2.6 在最新版本 OpenCompass 榜单上(综合 8 个主流多模态评测基准)平均得分 65.2,**以8B量级的大小在单图理解方面超越了 GPT-4o mini、GPT-4V、Gemini 1.5 Pro 和 Claude 3.5 Sonnet 等主流商用闭源多模态大模型**。
|
||||||
|
|
||||||
|
- 🖼️ **多图理解和上下文学习。**
|
||||||
|
MiniCPM-V 2.6 还支持**多图对话和推理**。它在 Mantis-Eval、BLINK、Mathverse mv 和 Sciverse mv 等主流多图评测基准中取得了**最佳水平**,并展现出了优秀的上下文学习能力。
|
||||||
|
|
||||||
|
- 🎬 **视频理解。**
|
||||||
|
MiniCPM-V 2.6 还可以**接受视频输入**,进行对话和提供涵盖时序和空间信息的详细视频描述。模型在 有/无字幕 评测场景下的 Video-MME 表现均超过了 **GPT-4V、Claude 3.5 Sonnet 和 LLaVA-NeXT-Video-34B**等商用闭源模型。
|
||||||
|
|
||||||
|
- 💪 **强大的 OCR 能力及其他功能。**
|
||||||
|
MiniCPM-V 2.6 可以处理任意长宽比的图像,像素数可达 180 万(如 1344x1344)。在 OCRBench 上取得**最佳水平,超过 GPT-4o、GPT-4V 和 Gemini 1.5 Pro 等商用闭源模型**。基于最新的 [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) 和 [VisCPM](https://github.com/OpenBMB/VisCPM) 技术,其具备了**可信的多模态行为**,在 Object HalBench 上的幻觉率显著低于 GPT-4o 和 GPT-4V,并支持英语、中文、德语、法语、意大利语、韩语等**多种语言**。
|
||||||
|
|
||||||
|
- 🚀 **卓越的效率。**
|
||||||
|
除了对个人用户友好的模型大小,MiniCPM-V 2.6 还表现出**最先进的视觉 token 密度**(即每个视觉 token 编码的像素数量)。它**仅需 640 个 token 即可处理 180 万像素图像,比大多数模型少 75%**。这一特性优化了模型的推理速度、首 token 延迟、内存占用和功耗。因此,MiniCPM-V 2.6 可以支持 iPad 等终端设备上的高效**实时视频理解**。
|
||||||
|
|
||||||
|
- 💫 **易于使用。**
|
||||||
|
MiniCPM-V 2.6 可以通过多种方式轻松使用:(1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpmv-main/examples/llava/README-minicpmv2.6.md) 和 [ollama](https://github.com/OpenBMB/ollama/blob/minicpm-v2.6/examples/minicpm-v2.6/README.md) 支持在本地设备上进行高效的 CPU 推理,(2) [int4](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4) 和 [GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) 格式的量化模型,有 16 种尺寸,(3) [vLLM](#vllm-部署-) 支持高吞吐量和内存高效的推理,(4) 针对新领域和任务进行微调,(5) 使用 [Gradio](#本地-webui-demo-) 快速设置本地 WebUI 演示,(6) 在线[demo](http://120.92.209.146:8887/)即可体验。
|
||||||
|
|
||||||
|
### 性能评估 <!-- omit in toc -->
|
||||||
|
<div align="center">
|
||||||
|
<img src=assets/radar_final.png width=90% />
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看 OpenCompass, MME, MMVet, OCRBench, MMMU, MathVista, MMB, AI2D, TextVQA, DocVQA, HallusionBench, Object HalBench 上的单图评测结果详情。 </summary>
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Token Density<sup>+</sup></th>
|
||||||
|
<th>OpenCompass</th>
|
||||||
|
<th>MME</th>
|
||||||
|
<th>MMVet</th>
|
||||||
|
<th>OCRBench</th>
|
||||||
|
<th>MMMU val</th>
|
||||||
|
<th>MathVista mini</th>
|
||||||
|
<th>MMB1.1 test</th>
|
||||||
|
<th>AI2D</th>
|
||||||
|
<th>TextVQA val</th>
|
||||||
|
<th>DocVQA test</th>
|
||||||
|
<th>HallusionBench</th>
|
||||||
|
<th>Object HalBench</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="15" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td>69.9</td>
|
||||||
|
<td>2328.7</td>
|
||||||
|
<td>69.1</td>
|
||||||
|
<td>736</td>
|
||||||
|
<td>69.2</td>
|
||||||
|
<td>61.3</td>
|
||||||
|
<td>82.2</td>
|
||||||
|
<td>84.6</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>92.8</td>
|
||||||
|
<td>55.0</td>
|
||||||
|
<td>17.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude 3.5 Sonnet</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>750</td>
|
||||||
|
<td>67.9</td>
|
||||||
|
<td>1920.0</td>
|
||||||
|
<td>66.0</td>
|
||||||
|
<td>788</td>
|
||||||
|
<td>65.9</td>
|
||||||
|
<td>61.6</td>
|
||||||
|
<td>78.5</td>
|
||||||
|
<td>80.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>95.2</td>
|
||||||
|
<td>49.9</td>
|
||||||
|
<td>13.8</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>64.4</td>
|
||||||
|
<td>2110.6</td>
|
||||||
|
<td>64.0</td>
|
||||||
|
<td>754</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td>57.7</td>
|
||||||
|
<td>73.9</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
<td>73.5</td>
|
||||||
|
<td>86.5</td>
|
||||||
|
<td>45.6</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o mini</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td>64.1</td>
|
||||||
|
<td>2003.4</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
<td>785</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
<td>76.0</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>46.1</td>
|
||||||
|
<td>12.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>1088</td>
|
||||||
|
<td>63.5</td>
|
||||||
|
<td>2070.2</td>
|
||||||
|
<td>67.5</td>
|
||||||
|
<td>656</td>
|
||||||
|
<td>61.7</td>
|
||||||
|
<td>54.7</td>
|
||||||
|
<td>79.8</td>
|
||||||
|
<td>78.6</td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>87.2</td>
|
||||||
|
<td>43.9</td>
|
||||||
|
<td>14.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Step-1V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>59.5</td>
|
||||||
|
<td>2206.4</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
<td>625</td>
|
||||||
|
<td>49.9</td>
|
||||||
|
<td>44.8</td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>79.2</td>
|
||||||
|
<td>71.6</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>48.4</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen-VL-Max</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td>2281.7</td>
|
||||||
|
<td>61.8</td>
|
||||||
|
<td>684</td>
|
||||||
|
<td>52.0</td>
|
||||||
|
<td>43.4</td>
|
||||||
|
<td>74.6</td>
|
||||||
|
<td>75.7</td>
|
||||||
|
<td>79.5</td>
|
||||||
|
<td>93.1</td>
|
||||||
|
<td>41.2</td>
|
||||||
|
<td>13.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="15" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-Yi-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>157</td>
|
||||||
|
<td>55.0</td>
|
||||||
|
<td>2006.5</td>
|
||||||
|
<td>50.7</td>
|
||||||
|
<td>574</td>
|
||||||
|
<td>48.8</td>
|
||||||
|
<td>40.4</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>78.9</td>
|
||||||
|
<td>69.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>34.8</td>
|
||||||
|
<td>12.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Mini-Gemini-HD-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>157</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>2141</td>
|
||||||
|
<td>59.3</td>
|
||||||
|
<td>518</td>
|
||||||
|
<td>48.0</td>
|
||||||
|
<td>43.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>80.5</td>
|
||||||
|
<td>74.1</td>
|
||||||
|
<td>78.9</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Cambrian-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>1820</td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td>2049.9</td>
|
||||||
|
<td>53.2</td>
|
||||||
|
<td>591</td>
|
||||||
|
<td>50.4</td>
|
||||||
|
<td>50.3</td>
|
||||||
|
<td>77.8</td>
|
||||||
|
<td>79.5</td>
|
||||||
|
<td>76.7</td>
|
||||||
|
<td>75.5</td>
|
||||||
|
<td>41.6</td>
|
||||||
|
<td>14.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4V-9B</td>
|
||||||
|
<td>13B</td>
|
||||||
|
<td>784</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>2018.8</td>
|
||||||
|
<td>58.0</td>
|
||||||
|
<td>776</td>
|
||||||
|
<td>46.9</td>
|
||||||
|
<td>51.1</td>
|
||||||
|
<td>67.9</td>
|
||||||
|
<td>71.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>45.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>706</td>
|
||||||
|
<td>64.1</td>
|
||||||
|
<td>2215.1</td>
|
||||||
|
<td>54.3</td>
|
||||||
|
<td>794</td>
|
||||||
|
<td><strong>51.2</strong></td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td><strong>79.4</strong></td>
|
||||||
|
<td><strong>83.6</strong></td>
|
||||||
|
<td>77.4</td>
|
||||||
|
<td><strong>91.6</strong></td>
|
||||||
|
<td>45.0</td>
|
||||||
|
<td>21.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-Llama-V 2.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>1882</td>
|
||||||
|
<td>58.8</td>
|
||||||
|
<td>2024.6</td>
|
||||||
|
<td>52.8</td>
|
||||||
|
<td>725</td>
|
||||||
|
<td>45.8</td>
|
||||||
|
<td>54.3</td>
|
||||||
|
<td>72.0</td>
|
||||||
|
<td>78.4</td>
|
||||||
|
<td>76.6</td>
|
||||||
|
<td>84.8</td>
|
||||||
|
<td>42.4</td>
|
||||||
|
<td>10.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>2822</strong></td>
|
||||||
|
<td><strong>65.2</strong></td>
|
||||||
|
<td><strong>2348.4</strong>*</td>
|
||||||
|
<td><strong>60.0</strong></td>
|
||||||
|
<td><strong>852</strong>*</td>
|
||||||
|
<td>49.8*</td>
|
||||||
|
<td><strong>60.6</strong></td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>82.1</td>
|
||||||
|
<td><strong>80.1</strong></td>
|
||||||
|
<td>90.8</td>
|
||||||
|
<td><strong>48.1</strong>*</td>
|
||||||
|
<td><strong>8.2</strong></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
* 我们使用思维链提示词来评估这些基准。
|
||||||
|
|
||||||
|
<sup>+</sup> Token Density:每个视觉 token 在最大分辨率下编码的像素数,即最大分辨率下的像素数 / 视觉 token 数。
|
||||||
|
|
||||||
|
注意:闭源模型的 Token Density 由 API 收费方式估算得到。
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看 Mantis Eval, BLINK, Mathverse mv, Sciverse mv, MIRB 上的多图评测结果详情。</summary>
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Mantis Eval</th>
|
||||||
|
<th>BLINK val</th>
|
||||||
|
<th>Mathverse mv</th>
|
||||||
|
<th>Sciverse mv</th>
|
||||||
|
<th>MIRB</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="7" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>62.7</td>
|
||||||
|
<td>54.6</td>
|
||||||
|
<td>60.3</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
<td>53.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-Interleave-14B</td>
|
||||||
|
<td>14B</td>
|
||||||
|
<td>66.4</td>
|
||||||
|
<td>52.6</td>
|
||||||
|
<td>32.7</td>
|
||||||
|
<td>30.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="7" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Emu2-Chat</td>
|
||||||
|
<td>37B</td>
|
||||||
|
<td>37.8</td>
|
||||||
|
<td>36.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>27.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">CogVLM</td>
|
||||||
|
<td>17B</td>
|
||||||
|
<td>45.2</td>
|
||||||
|
<td>41.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VPG-C</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
<td>43.1</td>
|
||||||
|
<td>24.3</td>
|
||||||
|
<td>23.1</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">VILA 8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>51.2</td>
|
||||||
|
<td>39.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>36.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternLM-XComposer-2.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>53.1*</td>
|
||||||
|
<td>48.9</td>
|
||||||
|
<td>32.1*</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>42.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>59.0*</td>
|
||||||
|
<td>50.9</td>
|
||||||
|
<td>30.5*</td>
|
||||||
|
<td>34.4*</td>
|
||||||
|
<td><strong>56.9*</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>69.1</strong></td>
|
||||||
|
<td><strong>53.0</strong></td>
|
||||||
|
<td><strong>84.9</strong></td>
|
||||||
|
<td><strong>74.9</strong></td>
|
||||||
|
<td>53.8</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
* 正式开源模型权重的评测结果。
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看 Video-MME 和 Video-ChatGPT 上的视频评测结果详情。</summary>
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th colspan="2">Video-MME</th>
|
||||||
|
<th colspan="5">Video-ChatGPT</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th align="left"></th>
|
||||||
|
<th></th>
|
||||||
|
<th>w/o subs</th>
|
||||||
|
<th>w subs</th>
|
||||||
|
<th>Correctness</th>
|
||||||
|
<th>Detail</th>
|
||||||
|
<th>Context</th>
|
||||||
|
<th>Temporal</th>
|
||||||
|
<th>Consistency</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="9" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude 3.5 Sonnet</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>62.9</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4V</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>59.9</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="9" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-7B</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.39</td>
|
||||||
|
<td>3.29</td>
|
||||||
|
<td>3.92</td>
|
||||||
|
<td>2.60</td>
|
||||||
|
<td>3.12</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-34B</td>
|
||||||
|
<td>34B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.29</td>
|
||||||
|
<td>3.23</td>
|
||||||
|
<td>3.83</td>
|
||||||
|
<td>2.51</td>
|
||||||
|
<td>3.47</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">CogVLM2-Video</td>
|
||||||
|
<td>12B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>3.49</td>
|
||||||
|
<td><strong>3.46</strong></td>
|
||||||
|
<td>3.23</td>
|
||||||
|
<td><strong>2.98</strong></td>
|
||||||
|
<td><strong>3.64</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LongVA</td>
|
||||||
|
<td>7B</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
<td>54.3</td>
|
||||||
|
<td>3.05</td>
|
||||||
|
<td>3.09</td>
|
||||||
|
<td>3.77</td>
|
||||||
|
<td>2.44</td>
|
||||||
|
<td><strong>3.64</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2-8B</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>54.0</td>
|
||||||
|
<td>56.9</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternLM-XComposer-2.5</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td>55.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">LLaVA-NeXT-Video</td>
|
||||||
|
<td>32B</td>
|
||||||
|
<td>60.2</td>
|
||||||
|
<td>63.0</td>
|
||||||
|
<td>3.48</td>
|
||||||
|
<td>3.37</td>
|
||||||
|
<td><strong>3.95</strong></td>
|
||||||
|
<td>2.64</td>
|
||||||
|
<td>3.28</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
|
||||||
|
<td>8B</td>
|
||||||
|
<td><strong>60.9</strong></td>
|
||||||
|
<td><strong>63.6</strong></td>
|
||||||
|
<td><strong>3.59</strong></td>
|
||||||
|
<td>3.28</td>
|
||||||
|
<td>3.93</td>
|
||||||
|
<td>2.73</td>
|
||||||
|
<td>3.62</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看 TextVQA, VizWiz, VQAv2, OK-VQA上的少样本评测结果详情。</summary>
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Shot</th>
|
||||||
|
<th>TextVQA val</th>
|
||||||
|
<th>VizWiz test-dev</th>
|
||||||
|
<th>VQAv2 test-dev</th>
|
||||||
|
<th>OK-VQA val</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">Flamingo</td>
|
||||||
|
<td rowspan="3">80B</td>
|
||||||
|
<td>0*</td>
|
||||||
|
<td>35.0</td>
|
||||||
|
<td>31.6</td>
|
||||||
|
<td>56.3</td>
|
||||||
|
<td>40.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>36.5</td>
|
||||||
|
<td>39.6</td>
|
||||||
|
<td>63.1</td>
|
||||||
|
<td><strong>57.4</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>37.3</td>
|
||||||
|
<td>44.8</td>
|
||||||
|
<td>65.6</td>
|
||||||
|
<td>57.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">IDEFICS</td>
|
||||||
|
<td rowspan="3">80B</td>
|
||||||
|
<td>0*</td>
|
||||||
|
<td>30.9</td>
|
||||||
|
<td>36.0</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>45.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>34.3</td>
|
||||||
|
<td>40.4</td>
|
||||||
|
<td>63.6</td>
|
||||||
|
<td>52.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>35.7</td>
|
||||||
|
<td>46.1</td>
|
||||||
|
<td>64.8</td>
|
||||||
|
<td>55.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">OmniCorpus</td>
|
||||||
|
<td rowspan="3">7B</td>
|
||||||
|
<td>0*</td>
|
||||||
|
<td>43.0</td>
|
||||||
|
<td>49.8</td>
|
||||||
|
<td>63.2</td>
|
||||||
|
<td>45.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>45.4</td>
|
||||||
|
<td>51.3</td>
|
||||||
|
<td>64.5</td>
|
||||||
|
<td>46.5</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>45.6</td>
|
||||||
|
<td>52.2</td>
|
||||||
|
<td>64.7</td>
|
||||||
|
<td>46.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">Emu2</td>
|
||||||
|
<td rowspan="3">37B</td>
|
||||||
|
<td>0</td>
|
||||||
|
<td>26.4</td>
|
||||||
|
<td>40.4</td>
|
||||||
|
<td>33.5</td>
|
||||||
|
<td>26.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>48.2</td>
|
||||||
|
<td>54.6</td>
|
||||||
|
<td>67.0</td>
|
||||||
|
<td>53.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>49.3</td>
|
||||||
|
<td>54.7</td>
|
||||||
|
<td>67.8</td>
|
||||||
|
<td>54.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="2">MM1</td>
|
||||||
|
<td rowspan="2">30B</td>
|
||||||
|
<td>0</td>
|
||||||
|
<td>26.2</td>
|
||||||
|
<td>40.4</td>
|
||||||
|
<td>48.9</td>
|
||||||
|
<td>26.7</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td>49.3</td>
|
||||||
|
<td>54.7</td>
|
||||||
|
<td><strong>70.9</strong></td>
|
||||||
|
<td>54.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td align="left" nowrap="nowrap" rowspan="3">MiniCPM-V 2.6<sup>+</sup></td>
|
||||||
|
<td rowspan="3">8B</td>
|
||||||
|
<td>0</td>
|
||||||
|
<td>43.9</td>
|
||||||
|
<td>33.8</td>
|
||||||
|
<td>45.4</td>
|
||||||
|
<td>23.9</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>4</td>
|
||||||
|
<td>63.6</td>
|
||||||
|
<td>60.5</td>
|
||||||
|
<td>65.5</td>
|
||||||
|
<td>50.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>8</td>
|
||||||
|
<td><strong>64.6</strong></td>
|
||||||
|
<td><strong>63.4</strong></td>
|
||||||
|
<td>68.2</td>
|
||||||
|
<td>51.4</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
* 使用 Flamingo 方式 zero image shot 和 two additional text shots 评估零样本性能。
|
||||||
|
|
||||||
|
<sup>+</sup> 我们在没有进行监督微调 (SFT) 的情况下评估预训练的模型权重 (ckpt)。
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### 典型示例 <!-- omit in toc -->
|
||||||
|
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv2_6/multi_img-bike.png" alt="Bike" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/multi_img-menu.png" alt="Menu" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/multi_img-code.png" alt="Code" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/ICL-Mem.png" alt="Mem" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/multiling-medal.png" alt="medal" style="margin-bottom: 10px;">
|
||||||
|
</div>
|
||||||
|
<details>
|
||||||
|
<summary>点击查看更多示例。</summary>
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv2_6/ICL-elec.png" alt="elec" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv2_6/multiling-olympic.png" alt="Menu" style="margin-bottom: 10px;">
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
我们将 MiniCPM-V 2.6 部署在iPad Pro上,并录制了以下演示视频。
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/gif_cases/ai.gif" width=32%/>
|
||||||
|
|
||||||
|
<img src="../assets/gif_cases/beer.gif" width=32%/>
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<video src="https://github.com/user-attachments/assets/21f4b818-ede1-4822-920e-91281725c830" width="360" /> </video>
|
||||||
|
<!-- <video src="https://github.com/user-attachments/assets/c835f757-206b-4d9c-8e36-70d67b453628" width="360" /> </video> -->
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### 模型库
|
||||||
|
|
||||||
|
| 模型 | 设备 | 资源 |          简介 | 下载链接 |
|
||||||
|
|:--------------|:-:|:----------:|:-------------------|:---------------:|
|
||||||
|
| MiniCPM-V 2.6| GPU | 17 GB | 提供出色的端侧单图、多图、视频理解能力。 | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6) [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6) |
|
||||||
|
| MiniCPM-V 2.6 gguf | CPU | 6 GB | gguf 版本,更低的内存占用和更高的推理效率。 | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6-gguf) |
|
||||||
|
| MiniCPM-V 2.6 int4 | GPU | 7 GB | int4量化版,更低显存占用。 | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4) [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6-int4) |
|
||||||
556
docs/minicpm_v4_en.md
Normal file
@@ -0,0 +1,556 @@
|
|||||||
|
## MiniCPM-V 4.0
|
||||||
|
|
||||||
|
> Archive at: 2025-08-25
|
||||||
|
|
||||||
|
**MiniCPM-V 4.0** is the latest efficient model in the MiniCPM-V series. The model is built based on SigLIP2-400M and MiniCPM4-3B with a total of 4.1B parameters. It inherits the strong single-image, multi-image and video understanding performance of MiniCPM-V 2.6 with largely improved efficiency. Notable features of MiniCPM-V 4.0 include:
|
||||||
|
|
||||||
|
- 🔥 **Leading Visual Capability.**
|
||||||
|
With only 4.1B parameters, MiniCPM-V 4.0 achieves an average score of 69.0 on OpenCompass, a comprehensive evaluation of 8 popular benchmarks, **outperforming GPT-4.1-mini-20250414, MiniCPM-V 2.6 (8.1B params, OpenCompass 65.2) and Qwen2.5-VL-3B-Instruct (3.8B params, OpenCompass 64.5)**. It also shows good performance in multi-image understanding and video understanding.
|
||||||
|
|
||||||
|
- 🚀 **Superior Efficiency.**
|
||||||
|
  Designed for on-device deployment, MiniCPM-V 4.0 runs smoothly on end devices. For example, it delivers **less than 2s first token delay and more than 17 token/s decoding on iPhone 16 Pro Max**, without heating problems. It also shows superior throughput under concurrent requests.
|
||||||
|
|
||||||
|
- 💫 **Easy Usage.**
|
||||||
|
MiniCPM-V 4.0 can be easily used in various ways including **llama.cpp, Ollama, vLLM, SGLang, LLaMA-Factory and local web demo** etc. We also open-source iOS App that can run on iPhone and iPad. Get started easily with our well-structured [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-CookBook), featuring detailed instructions and practical examples.
|
||||||
|
|
||||||
|
### Evaluation <!-- omit in toc -->
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view single image results on OpenCompass. </summary>
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th nowrap="nowrap" align="left">model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Opencompass</th>
|
||||||
|
<th>OCRBench</th>
|
||||||
|
<th>MathVista</th>
|
||||||
|
<th>HallusionBench</th>
|
||||||
|
<th>MMMU</th>
|
||||||
|
<th>MMVet</th>
|
||||||
|
<th>MMBench V1.1</th>
|
||||||
|
<th>MMStar</th>
|
||||||
|
<th>AI2D</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4v-20240409</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>63.5</td>
|
||||||
|
<td>656</td>
|
||||||
|
<td>55.2</td>
|
||||||
|
<td>43.9</td>
|
||||||
|
<td>61.7</td>
|
||||||
|
<td>67.5</td>
|
||||||
|
<td>79.8</td>
|
||||||
|
<td>56.0</td>
|
||||||
|
<td>78.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini-1.5-Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>64.5</td>
|
||||||
|
<td>754</td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td>45.6</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td>64.0</td>
|
||||||
|
<td>73.9</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4.1-mini-20250414</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>68.9</td>
|
||||||
|
<td>840</td>
|
||||||
|
<td>70.9</td>
|
||||||
|
<td>49.3</td>
|
||||||
|
<td>55.0</td>
|
||||||
|
<td>74.3</td>
|
||||||
|
<td>80.9</td>
|
||||||
|
<td>60.9</td>
|
||||||
|
<td>76.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude 3.5 Sonnet-20241022</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>70.6</td>
|
||||||
|
<td>798</td>
|
||||||
|
<td>65.3</td>
|
||||||
|
<td>55.5</td>
|
||||||
|
<td>66.4</td>
|
||||||
|
<td>70.1</td>
|
||||||
|
<td>81.7</td>
|
||||||
|
<td>65.1</td>
|
||||||
|
<td>81.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-3B-Instruct</td>
|
||||||
|
<td>3.8B</td>
|
||||||
|
<td>64.5</td>
|
||||||
|
<td>828</td>
|
||||||
|
<td>61.2</td>
|
||||||
|
<td>46.6</td>
|
||||||
|
<td>51.2</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>76.8</td>
|
||||||
|
<td>56.3</td>
|
||||||
|
<td>81.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-4B</td>
|
||||||
|
<td>3.7B</td>
|
||||||
|
<td>65.1</td>
|
||||||
|
<td>820</td>
|
||||||
|
<td>60.8</td>
|
||||||
|
<td>46.6</td>
|
||||||
|
<td>51.8</td>
|
||||||
|
<td>61.5</td>
|
||||||
|
<td>78.2</td>
|
||||||
|
<td>58.7</td>
|
||||||
|
<td>81.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-7B-Instruct</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>70.9</td>
|
||||||
|
<td>888</td>
|
||||||
|
<td>68.1</td>
|
||||||
|
<td>51.9</td>
|
||||||
|
<td>58.0</td>
|
||||||
|
<td>69.7</td>
|
||||||
|
<td>82.2</td>
|
||||||
|
<td>64.1</td>
|
||||||
|
<td>84.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>68.1</td>
|
||||||
|
<td>821</td>
|
||||||
|
<td>64.5</td>
|
||||||
|
<td>49.0</td>
|
||||||
|
<td>56.2</td>
|
||||||
|
<td>62.8</td>
|
||||||
|
<td>82.5</td>
|
||||||
|
<td>63.2</td>
|
||||||
|
<td>84.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-2.6</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>65.2</td>
|
||||||
|
<td>852</td>
|
||||||
|
<td>60.8</td>
|
||||||
|
<td>48.1</td>
|
||||||
|
<td>49.8</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>57.5</td>
|
||||||
|
<td>82.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o-2.6</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td>70.2</td>
|
||||||
|
<td>889</td>
|
||||||
|
<td>73.3</td>
|
||||||
|
<td>51.1</td>
|
||||||
|
<td>50.9</td>
|
||||||
|
<td>67.2</td>
|
||||||
|
<td>80.6</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
<td>86.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-4.0</td>
|
||||||
|
<td>4.1B</td>
|
||||||
|
<td>69.0</td>
|
||||||
|
<td>894</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
<td>50.8</td>
|
||||||
|
<td>51.2</td>
|
||||||
|
<td>68.0</td>
|
||||||
|
<td>79.7</td>
|
||||||
|
<td>62.8</td>
|
||||||
|
<td>82.9</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view single image results on ChartQA, MME, RealWorldQA, TextVQA, DocVQA, MathVision, DynaMath, WeMath, Object HalBench and MM Halbench. </summary>
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th nowrap="nowrap" align="left">model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>ChartQA</th>
|
||||||
|
<th>MME</th>
|
||||||
|
<th>RealWorldQA</th>
|
||||||
|
<th>TextVQA</th>
|
||||||
|
<th>DocVQA</th>
|
||||||
|
<th>MathVision</th>
|
||||||
|
<th>DynaMath</th>
|
||||||
|
<th>WeMath</th>
|
||||||
|
<th colspan="2">Obj Hal</th>
|
||||||
|
<th colspan="2">MM Hal</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td>CHAIRs↓</td>
|
||||||
|
<td>CHAIRi↓</td>
|
||||||
|
<td nowrap="nowrap">score avg@3↑</td>
|
||||||
|
<td nowrap="nowrap">hall rate avg@3↓</td>
|
||||||
|
</tr>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="14" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4v-20240409</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>78.5</td>
|
||||||
|
<td>1927</td>
|
||||||
|
<td>61.4</td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>88.4</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini-1.5-Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>87.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>67.5</td>
|
||||||
|
<td>78.8</td>
|
||||||
|
<td>93.1</td>
|
||||||
|
<td>41.0</td>
|
||||||
|
<td>31.5</td>
|
||||||
|
<td>50.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4.1-mini-20250414</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>45.3</td>
|
||||||
|
<td>47.7</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude 3.5 Sonnet-20241022</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>90.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>60.1</td>
|
||||||
|
<td>74.1</td>
|
||||||
|
<td>95.2</td>
|
||||||
|
<td>35.6</td>
|
||||||
|
<td>35.7</td>
|
||||||
|
<td>44.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="14" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-3B-Instruct</td>
|
||||||
|
<td>3.8B</td>
|
||||||
|
<td>84.0</td>
|
||||||
|
<td>2157</td>
|
||||||
|
<td>65.4</td>
|
||||||
|
<td>79.3</td>
|
||||||
|
<td>93.9</td>
|
||||||
|
<td>21.9</td>
|
||||||
|
<td>13.2</td>
|
||||||
|
<td>22.9</td>
|
||||||
|
<td>18.3</td>
|
||||||
|
<td>10.8</td>
|
||||||
|
<td>3.9 </td>
|
||||||
|
<td>33.3 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-4B</td>
|
||||||
|
<td>3.7B</td>
|
||||||
|
<td>84.0</td>
|
||||||
|
<td>2338</td>
|
||||||
|
<td>64.3</td>
|
||||||
|
<td>76.8</td>
|
||||||
|
<td>91.6</td>
|
||||||
|
<td>18.4</td>
|
||||||
|
<td>15.2</td>
|
||||||
|
<td>21.2</td>
|
||||||
|
<td>13.7</td>
|
||||||
|
<td>8.7</td>
|
||||||
|
<td>3.2 </td>
|
||||||
|
<td>46.5 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-7B-Instruct</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>87.3</td>
|
||||||
|
<td>2347</td>
|
||||||
|
<td>68.5</td>
|
||||||
|
<td>84.9</td>
|
||||||
|
<td>95.7</td>
|
||||||
|
<td>25.4</td>
|
||||||
|
<td>21.8</td>
|
||||||
|
<td>36.2</td>
|
||||||
|
<td>13.3</td>
|
||||||
|
<td>7.9</td>
|
||||||
|
<td>4.1 </td>
|
||||||
|
<td>31.6 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>84.8</td>
|
||||||
|
<td>2344</td>
|
||||||
|
<td>70.1</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
<td>93.0</td>
|
||||||
|
<td>17.0</td>
|
||||||
|
<td>9.4</td>
|
||||||
|
<td>23.5</td>
|
||||||
|
<td>18.3</td>
|
||||||
|
<td>11.6</td>
|
||||||
|
<td>3.6 </td>
|
||||||
|
<td>37.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-2.6</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>79.4</td>
|
||||||
|
<td>2348</td>
|
||||||
|
<td>65.0</td>
|
||||||
|
<td>80.1</td>
|
||||||
|
<td>90.8</td>
|
||||||
|
<td>17.5</td>
|
||||||
|
<td>9.0</td>
|
||||||
|
<td>20.4</td>
|
||||||
|
<td>7.3</td>
|
||||||
|
<td>4.7</td>
|
||||||
|
<td>4.0 </td>
|
||||||
|
<td>29.9 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o-2.6</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td>86.9</td>
|
||||||
|
<td>2372</td>
|
||||||
|
<td>68.1</td>
|
||||||
|
<td>82.0</td>
|
||||||
|
<td>93.5</td>
|
||||||
|
<td>21.7</td>
|
||||||
|
<td>10.4</td>
|
||||||
|
<td>25.2</td>
|
||||||
|
<td>6.3</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
<td>4.1 </td>
|
||||||
|
<td>31.3 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-4.0</td>
|
||||||
|
<td>4.1B</td>
|
||||||
|
<td>84.4</td>
|
||||||
|
<td>2298</td>
|
||||||
|
<td>68.5</td>
|
||||||
|
<td>80.8</td>
|
||||||
|
<td>92.9</td>
|
||||||
|
<td>20.7</td>
|
||||||
|
<td>14.2</td>
|
||||||
|
<td>32.7</td>
|
||||||
|
<td>6.3</td>
|
||||||
|
<td>3.5</td>
|
||||||
|
<td>4.1 </td>
|
||||||
|
<td>29.2 </td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view multi-image and video understanding results on Mantis, Blink and Video-MME. </summary>
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th nowrap="nowrap" align="left">model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Mantis</th>
|
||||||
|
<th>Blink</th>
|
||||||
|
<th nowrap="nowrap" colspan="2" >Video-MME</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td>wo subs</td>
|
||||||
|
<td>w subs</td>
|
||||||
|
</tr>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="6" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4v-20240409</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>62.7</td>
|
||||||
|
<td>54.6</td>
|
||||||
|
<td>59.9</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini-1.5-Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>75.0</td>
|
||||||
|
<td>81.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-20240513</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>68.0</td>
|
||||||
|
<td>71.9</td>
|
||||||
|
<td>77.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="6" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-3B-Instruct</td>
|
||||||
|
<td>3.8B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>47.6</td>
|
||||||
|
<td>61.5</td>
|
||||||
|
<td>67.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-4B</td>
|
||||||
|
<td>3.7B</td>
|
||||||
|
<td>62.7</td>
|
||||||
|
<td>50.8</td>
|
||||||
|
<td>62.3</td>
|
||||||
|
<td>63.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-7B-Instruct</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>56.4</td>
|
||||||
|
<td>65.1</td>
|
||||||
|
<td>71.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>67.7</td>
|
||||||
|
<td>54.8</td>
|
||||||
|
<td>64.2</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-2.6</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>69.1</td>
|
||||||
|
<td>53.0</td>
|
||||||
|
<td>60.9</td>
|
||||||
|
<td>63.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o-2.6</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td>71.9</td>
|
||||||
|
<td>56.7</td>
|
||||||
|
<td>63.9</td>
|
||||||
|
<td>69.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-4.0</td>
|
||||||
|
<td>4.1B</td>
|
||||||
|
<td>71.4</td>
|
||||||
|
<td>54.0</td>
|
||||||
|
<td>61.2</td>
|
||||||
|
<td>65.8</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv4/minicpm-v-4-case.png" alt="math" style="margin-bottom: 5px;">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
We deploy MiniCPM-V 4.0 on iPhone 16 Pro Max with the [iOS demo](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/ios_demo/ios.md). The demo video is the raw screen recording without any editing.
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/minicpmv4/iphone_en.gif" width=45%/>
|
||||||
|
|
||||||
|
<img src="../assets/minicpmv4/iphone_en_information_extraction.gif" width=45%/>
|
||||||
|
</p>
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/minicpmv4/iphone_cn.gif" width=45%/>
|
||||||
|
|
||||||
|
<img src="../assets/minicpmv4/iphone_cn_funny_points.gif" width=45%/>
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
|
||||||
557
docs/minicpm_v4_zh.md
Normal file
@@ -0,0 +1,557 @@
|
|||||||
|
## MiniCPM-V 4.0
|
||||||
|
|
||||||
|
> Archive at: 2025-08-25
|
||||||
|
|
||||||
|
MiniCPM-V 4.0 是 MiniCPM-V 系列中的最新模型。该模型基于 SigLIP2-400M 和 MiniCPM4-3B 构建,参数总量为 4.1B。它延续了 MiniCPM-V 2.6 在单图、多图和视频理解方面的强大能力,同时大幅提升了推理效率。MiniCPM-V 4.0 的主要特点包括:
|
||||||
|
|
||||||
|
- 🔥 **领先的视觉能力。**
|
||||||
|
MiniCPM-V 4.0 在 OpenCompass 上获得了平均 69.0 的高分,超越了 MiniCPM-V 2.6(8.1B,得分 65.2)、 Qwen2.5-VL-3B-Instruct(3.8B,得分 64.5)和**广泛使用的闭源模型 GPT-4.1-mini-20250414**。在多图理解与视频理解任务上,MiniCPM-V 4.0 也表现出色。
|
||||||
|
|
||||||
|
- 🚀 **卓越的效率。**
|
||||||
|
MiniCPM-V 4.0 专为端侧设备优化,**可在 iPhone 16 Pro Max 上流畅运行,首 token 延迟低至 2 秒,解码速度达 17.9 tokens/s**,且无发热问题。MiniCPM-V 4.0 在并发请求场景下表现出领先的吞吐率指标。
|
||||||
|
|
||||||
|
- 💫 **易于使用。**
|
||||||
|
MiniCPM-V 4.0 支持多种推理方式,包括 **llama.cpp、Ollama、vLLM、SGLang、LLaMA-Factory 及本地 Web Demo 等**。我们还开源了可以在 iPhone 和 iPad 运行的 iOS App。欢迎参考我们开源的 **结构清晰的[使用手册](https://github.com/OpenSQZ/MiniCPM-V-CookBook)** 玩转 MiniCPM-V 4.0,其中涵盖了详细的部署指南和真实示例。
|
||||||
|
|
||||||
|
|
||||||
|
### 性能评估 <!-- omit in toc -->
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看在OpenCompass上的单图理解能力的评测结果。</summary>
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th nowrap="nowrap" align="left">model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Opencompass</th>
|
||||||
|
<th>OCRBench</th>
|
||||||
|
<th>MathVista</th>
|
||||||
|
<th>HallusionBench</th>
|
||||||
|
<th>MMMU</th>
|
||||||
|
<th>MMVet</th>
|
||||||
|
<th>MMBench V1.1</th>
|
||||||
|
<th>MMStar</th>
|
||||||
|
<th>AI2D</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4v-20240409</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>63.5</td>
|
||||||
|
<td>656</td>
|
||||||
|
<td>55.2</td>
|
||||||
|
<td>43.9</td>
|
||||||
|
<td>61.7</td>
|
||||||
|
<td>67.5</td>
|
||||||
|
<td>79.8</td>
|
||||||
|
<td>56.0</td>
|
||||||
|
<td>78.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini-1.5-Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>64.5</td>
|
||||||
|
<td>754</td>
|
||||||
|
<td>58.3</td>
|
||||||
|
<td>45.6</td>
|
||||||
|
<td>60.6</td>
|
||||||
|
<td>64.0</td>
|
||||||
|
<td>73.9</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4.1-mini-20250414</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>68.9</td>
|
||||||
|
<td>840</td>
|
||||||
|
<td>70.9</td>
|
||||||
|
<td>49.3</td>
|
||||||
|
<td>55.0</td>
|
||||||
|
<td>74.3</td>
|
||||||
|
<td>80.9</td>
|
||||||
|
<td>60.9</td>
|
||||||
|
<td>76.0</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude 3.5 Sonnet-20241022</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>70.6</td>
|
||||||
|
<td>798</td>
|
||||||
|
<td>65.3</td>
|
||||||
|
<td>55.5</td>
|
||||||
|
<td>66.4</td>
|
||||||
|
<td>70.1</td>
|
||||||
|
<td>81.7</td>
|
||||||
|
<td>65.1</td>
|
||||||
|
<td>81.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="11" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-3B-Instruct</td>
|
||||||
|
<td>3.8B</td>
|
||||||
|
<td>64.5</td>
|
||||||
|
<td>828</td>
|
||||||
|
<td>61.2</td>
|
||||||
|
<td>46.6</td>
|
||||||
|
<td>51.2</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>76.8</td>
|
||||||
|
<td>56.3</td>
|
||||||
|
<td>81.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-4B</td>
|
||||||
|
<td>3.7B</td>
|
||||||
|
<td>65.1</td>
|
||||||
|
<td>820</td>
|
||||||
|
<td>60.8</td>
|
||||||
|
<td>46.6</td>
|
||||||
|
<td>51.8</td>
|
||||||
|
<td>61.5</td>
|
||||||
|
<td>78.2</td>
|
||||||
|
<td>58.7</td>
|
||||||
|
<td>81.4</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-7B-Instruct</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>70.9</td>
|
||||||
|
<td>888</td>
|
||||||
|
<td>68.1</td>
|
||||||
|
<td>51.9</td>
|
||||||
|
<td>58.0</td>
|
||||||
|
<td>69.7</td>
|
||||||
|
<td>82.2</td>
|
||||||
|
<td>64.1</td>
|
||||||
|
<td>84.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>68.1</td>
|
||||||
|
<td>821</td>
|
||||||
|
<td>64.5</td>
|
||||||
|
<td>49.0</td>
|
||||||
|
<td>56.2</td>
|
||||||
|
<td>62.8</td>
|
||||||
|
<td>82.5</td>
|
||||||
|
<td>63.2</td>
|
||||||
|
<td>84.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-2.6</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>65.2</td>
|
||||||
|
<td>852</td>
|
||||||
|
<td>60.8</td>
|
||||||
|
<td>48.1</td>
|
||||||
|
<td>49.8</td>
|
||||||
|
<td>60.0</td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>57.5</td>
|
||||||
|
<td>82.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o-2.6</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td>70.2</td>
|
||||||
|
<td>889</td>
|
||||||
|
<td>73.3</td>
|
||||||
|
<td>51.1</td>
|
||||||
|
<td>50.9</td>
|
||||||
|
<td>67.2</td>
|
||||||
|
<td>80.6</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
<td>86.1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-4.0</td>
|
||||||
|
<td>4.1B</td>
|
||||||
|
<td>69.0</td>
|
||||||
|
<td>894</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
<td>50.8</td>
|
||||||
|
<td>51.2</td>
|
||||||
|
<td>68.0</td>
|
||||||
|
<td>79.7</td>
|
||||||
|
<td>62.8</td>
|
||||||
|
<td>82.9</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看在图表理解、文档理解、数学推理、幻觉等领域的评测结果。 </summary>
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th nowrap="nowrap" align="left">model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>ChartQA</th>
|
||||||
|
<th>MME</th>
|
||||||
|
<th>RealWorldQA</th>
|
||||||
|
<th>TextVQA</th>
|
||||||
|
<th>DocVQA</th>
|
||||||
|
<th>MathVision</th>
|
||||||
|
<th>DynaMath</th>
|
||||||
|
<th>WeMath</th>
|
||||||
|
<th colspan="2">Obj Hal</th>
|
||||||
|
<th colspan="2">MM Hal</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td>CHAIRs↓</td>
|
||||||
|
<td>CHAIRi↓</td>
|
||||||
|
<td nowrap="nowrap">score avg@3↑</td>
|
||||||
|
<td nowrap="nowrap">hall rate avg@3↓</td>
|
||||||
|
</tr>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="14" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4v-20240409</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>78.5</td>
|
||||||
|
<td>1927</td>
|
||||||
|
<td>61.4</td>
|
||||||
|
<td>78.0</td>
|
||||||
|
<td>88.4</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini-1.5-Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>87.2</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>67.5</td>
|
||||||
|
<td>78.8</td>
|
||||||
|
<td>93.1</td>
|
||||||
|
<td>41.0</td>
|
||||||
|
<td>31.5</td>
|
||||||
|
<td>50.5</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4.1-mini-20250414</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>45.3</td>
|
||||||
|
<td>47.7</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Claude 3.5 Sonnet-20241022</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>90.8</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>60.1</td>
|
||||||
|
<td>74.1</td>
|
||||||
|
<td>95.2</td>
|
||||||
|
<td>35.6</td>
|
||||||
|
<td>35.7</td>
|
||||||
|
<td>44.0</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="14" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-3B-Instruct</td>
|
||||||
|
<td>3.8B</td>
|
||||||
|
<td>84.0</td>
|
||||||
|
<td>2157</td>
|
||||||
|
<td>65.4</td>
|
||||||
|
<td>79.3</td>
|
||||||
|
<td>93.9</td>
|
||||||
|
<td>21.9</td>
|
||||||
|
<td>13.2</td>
|
||||||
|
<td>22.9</td>
|
||||||
|
<td>18.3</td>
|
||||||
|
<td>10.8</td>
|
||||||
|
<td>3.9 </td>
|
||||||
|
<td>33.3 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-4B</td>
|
||||||
|
<td>3.7B</td>
|
||||||
|
<td>84.0</td>
|
||||||
|
<td>2338</td>
|
||||||
|
<td>64.3</td>
|
||||||
|
<td>76.8</td>
|
||||||
|
<td>91.6</td>
|
||||||
|
<td>18.4</td>
|
||||||
|
<td>15.2</td>
|
||||||
|
<td>21.2</td>
|
||||||
|
<td>13.7</td>
|
||||||
|
<td>8.7</td>
|
||||||
|
<td>3.2 </td>
|
||||||
|
<td>46.5 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-7B-Instruct</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>87.3</td>
|
||||||
|
<td>2347</td>
|
||||||
|
<td>68.5</td>
|
||||||
|
<td>84.9</td>
|
||||||
|
<td>95.7</td>
|
||||||
|
<td>25.4</td>
|
||||||
|
<td>21.8</td>
|
||||||
|
<td>36.2</td>
|
||||||
|
<td>13.3</td>
|
||||||
|
<td>7.9</td>
|
||||||
|
<td>4.1 </td>
|
||||||
|
<td>31.6 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>84.8</td>
|
||||||
|
<td>2344</td>
|
||||||
|
<td>70.1</td>
|
||||||
|
<td>79.1</td>
|
||||||
|
<td>93.0</td>
|
||||||
|
<td>17.0</td>
|
||||||
|
<td>9.4</td>
|
||||||
|
<td>23.5</td>
|
||||||
|
<td>18.3</td>
|
||||||
|
<td>11.6</td>
|
||||||
|
<td>3.6 </td>
|
||||||
|
<td>37.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-2.6</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>79.4</td>
|
||||||
|
<td>2348</td>
|
||||||
|
<td>65.0</td>
|
||||||
|
<td>80.1</td>
|
||||||
|
<td>90.8</td>
|
||||||
|
<td>17.5</td>
|
||||||
|
<td>9.0</td>
|
||||||
|
<td>20.4</td>
|
||||||
|
<td>7.3</td>
|
||||||
|
<td>4.7</td>
|
||||||
|
<td>4.0 </td>
|
||||||
|
<td>29.9 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o-2.6</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td>86.9</td>
|
||||||
|
<td>2372</td>
|
||||||
|
<td>68.1</td>
|
||||||
|
<td>82.0</td>
|
||||||
|
<td>93.5</td>
|
||||||
|
<td>21.7</td>
|
||||||
|
<td>10.4</td>
|
||||||
|
<td>25.2</td>
|
||||||
|
<td>6.3</td>
|
||||||
|
<td>3.4</td>
|
||||||
|
<td>4.1 </td>
|
||||||
|
<td>31.3 </td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-4.0</td>
|
||||||
|
<td>4.1B</td>
|
||||||
|
<td>84.4</td>
|
||||||
|
<td>2298</td>
|
||||||
|
<td>68.5</td>
|
||||||
|
<td>80.8</td>
|
||||||
|
<td>92.9</td>
|
||||||
|
<td>20.7</td>
|
||||||
|
<td>14.2</td>
|
||||||
|
<td>32.7</td>
|
||||||
|
<td>6.3</td>
|
||||||
|
<td>3.5</td>
|
||||||
|
<td>4.1 </td>
|
||||||
|
<td>29.2 </td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看多图和视频理解能力的评测结果。 </summary>
|
||||||
|
<div align="center">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th nowrap="nowrap" align="left">model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Mantis</th>
|
||||||
|
<th>Blink</th>
|
||||||
|
<th nowrap="nowrap" colspan="2" >Video-MME</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td></td>
|
||||||
|
<td>wo subs</td>
|
||||||
|
<td>w subs</td>
|
||||||
|
</tr>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td colspan="6" align="left"><strong>Proprietary</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4v-20240409</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>62.7</td>
|
||||||
|
<td>54.6</td>
|
||||||
|
<td>59.9</td>
|
||||||
|
<td>63.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Gemini-1.5-Pro</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>59.1</td>
|
||||||
|
<td>75.0</td>
|
||||||
|
<td>81.3</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GPT-4o-20240513</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>68.0</td>
|
||||||
|
<td>71.9</td>
|
||||||
|
<td>77.2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="6" align="left"><strong>Open-source</strong></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-3B-Instruct</td>
|
||||||
|
<td>3.8B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>47.6</td>
|
||||||
|
<td>61.5</td>
|
||||||
|
<td>67.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-4B</td>
|
||||||
|
<td>3.7B</td>
|
||||||
|
<td>62.7</td>
|
||||||
|
<td>50.8</td>
|
||||||
|
<td>62.3</td>
|
||||||
|
<td>63.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-7B-Instruct</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>-</td>
|
||||||
|
<td>56.4</td>
|
||||||
|
<td>65.1</td>
|
||||||
|
<td>71.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">InternVL2.5-8B</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>67.7</td>
|
||||||
|
<td>54.8</td>
|
||||||
|
<td>64.2</td>
|
||||||
|
<td>66.9</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-2.6</td>
|
||||||
|
<td>8.1B</td>
|
||||||
|
<td>69.1</td>
|
||||||
|
<td>53.0</td>
|
||||||
|
<td>60.9</td>
|
||||||
|
<td>63.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-o-2.6</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td>71.9</td>
|
||||||
|
<td>56.7</td>
|
||||||
|
<td>63.9</td>
|
||||||
|
<td>69.6</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V-4.0</td>
|
||||||
|
<td>4.1B</td>
|
||||||
|
<td>71.4</td>
|
||||||
|
<td>54.0</td>
|
||||||
|
<td>61.2</td>
|
||||||
|
<td>65.8</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### 典型示例
|
||||||
|
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv4/minicpm-v-4-case.png" alt="math" style="margin-bottom: 5px;">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
我们在 iPhone 16 Pro Max 上部署了 MiniCPM-V 4.0 [iOS demo](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/ios_demo/ios.md),并录制了以下演示录屏,视频未经加速等任何编辑:
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/minicpmv4/iphone_en.gif" width=45%/>
|
||||||
|
|
||||||
|
<img src="../assets/minicpmv4/iphone_en_information_extraction.gif" width=45%/>
|
||||||
|
</p>
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/minicpmv4/iphone_cn.gif" width=45%/>
|
||||||
|
|
||||||
|
<img src="../assets/minicpmv4/iphone_cn_funny_points.gif" width=45%/>
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
158
docs/minicpm_v4dot5_en.md
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
## MiniCPM-V 4.5
|
||||||
|
|
||||||
|
> Archive at: 2026-02-03
|
||||||
|
|
||||||
|
**MiniCPM-V 4.5** is the latest and most capable model in the MiniCPM-V series. The model is built on Qwen3-8B and SigLIP2-400M with a total of 8B parameters. It exhibits a significant performance improvement over previous MiniCPM-V and MiniCPM-o models, and introduces new useful features. Notable features of MiniCPM-V 4.5 include:
|
||||||
|
|
||||||
|
- 🔥 **State-of-the-art Vision-Language Capability.**
|
||||||
|
MiniCPM-V 4.5 achieves an average score of 77.0 on OpenCompass, a comprehensive evaluation of 8 popular benchmarks. **With only 8B parameters, it surpasses widely used proprietary models like GPT-4o-latest, Gemini-2.0 Pro, and strong open-source models like Qwen2.5-VL 72B** for vision-language capabilities, making it the most performant MLLM under 30B parameters.
|
||||||
|
|
||||||
|
- 🎬 **Efficient High-FPS and Long Video Understanding.** Powered by a new unified 3D-Resampler over images and videos, MiniCPM-V 4.5 can now achieve 96x compression rate for video tokens, where 6 448x448 video frames can be jointly compressed into 64 video tokens (normally 1,536 tokens for most MLLMs). This means that the model can perceive significantly more video frames without increasing the LLM inference cost. This brings state-of-the-art high-FPS (up to 10FPS) video understanding and long video understanding capabilities on Video-MME, LVBench, MLVU, MotionBench, FavorBench, etc., efficiently.
|
||||||
|
|
||||||
|
- ⚙️ **Controllable Hybrid Fast/Deep Thinking.** MiniCPM-V 4.5 supports both fast thinking for efficient frequent usage with competitive performance, and deep thinking for more complex problem solving. To cover efficiency and performance trade-offs in different user scenarios, this fast/deep thinking mode can be switched in a highly controlled fashion.
|
||||||
|
|
||||||
|
- 💪 **Strong OCR, Document Parsing and Others.**
|
||||||
|
Based on [LLaVA-UHD](https://arxiv.org/pdf/2403.11703) architecture, MiniCPM-V 4.5 can process high-resolution images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344), using 4x fewer visual tokens than most MLLMs. The model achieves **leading performance on OCRBench, surpassing proprietary models such as GPT-4o-latest and Gemini 2.5**. It also achieves state-of-the-art performance for PDF document parsing capability on OmniDocBench among general MLLMs. Based on the latest [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) and [VisCPM](https://github.com/OpenBMB/VisCPM) techniques, it features **trustworthy behaviors**, outperforming GPT-4o-latest on MMHal-Bench, and supports **multilingual capabilities** in more than 30 languages.
|
||||||
|
|
||||||
|
|
||||||
|
- 💫 **Easy Usage.**
|
||||||
|
MiniCPM-V 4.5 can be easily used in various ways: (1) [llama.cpp](https://github.com/tc-mb/llama.cpp/blob/Support-MiniCPM-V-4.5/docs/multimodal/minicpmv4.5.md) and [ollama](https://github.com/tc-mb/ollama/tree/MIniCPM-V) support for efficient CPU inference on local devices, (2) [int4](https://huggingface.co/openbmb/MiniCPM-V-4_5-int4), [GGUF](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) and [AWQ](https://github.com/tc-mb/AutoAWQ) format quantized models in 16 sizes, (3) [SGLang](https://github.com/tc-mb/sglang/tree/main) and [vLLM](#efficient-inference-with-llamacpp-ollama-vllm) support for high-throughput and memory-efficient inference, (4) fine-tuning on new domains and tasks with [Transformers](https://github.com/tc-mb/transformers/tree/main) and [LLaMA-Factory](./docs/llamafactory_train_and_infer.md), (5) quick [local WebUI demo](#chat-with-our-demo-on-gradio), (6) optimized [local iOS app](https://github.com/tc-mb/MiniCPM-o-demo-iOS) on iPhone and iPad, and (7) online web demo on [server](http://101.126.42.235:30910/). See our [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-CookBook) for full usage!
|
||||||
|
|
||||||
|
|
||||||
|
### Key Techniques <!-- omit in toc -->
|
||||||
|
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="../assets/minicpm-v-4dot5-framework.png" , width=100%>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
- **Architecture: Unified 3D-Resampler for High-density Video Compression.** MiniCPM-V 4.5 introduces a 3D-Resampler that overcomes the performance-efficiency trade-off in video understanding. By grouping and jointly compressing up to 6 consecutive video frames into just 64 tokens (the same token count used for a single image in MiniCPM-V series), MiniCPM-V 4.5 achieves a 96× compression rate for video tokens. This allows the model to process more video frames without additional LLM computational cost, enabling high-FPS video and long video understanding. The architecture supports unified encoding for images, multi-image inputs, and videos, ensuring seamless capability and knowledge transfer.
|
||||||
|
|
||||||
|
- **Pre-training: Unified Learning for OCR and Knowledge from Documents.** Existing MLLMs learn OCR capability and knowledge from documents in isolated training approaches. We observe that the essential difference between these two training approaches is the visibility of the text in images. By dynamically corrupting text regions in documents with varying noise levels and asking the model to reconstruct the text, the model learns to adaptively and properly switch between accurate text recognition (when text is visible) and multimodal context-based knowledge reasoning (when text is heavily obscured). This eliminates reliance on error-prone document parsers in knowledge learning from documents, and prevents hallucinations from over-augmented OCR data, resulting in top-tier OCR and multimodal knowledge performance with minimal engineering overhead.
|
||||||
|
|
||||||
|
- **Post-training: Hybrid Fast/Deep Thinking with Multimodal RL.** MiniCPM-V 4.5 offers a balanced reasoning experience through two switchable modes: fast thinking for efficient daily use and deep thinking for complex tasks. Using a new hybrid reinforcement learning method, the model jointly optimizes both modes, significantly enhancing fast-mode performance without compromising deep-mode capability. Incorporated with [RLPR](https://github.com/OpenBMB/RLPR) and [RLAIF-V](https://github.com/RLHF-V/RLAIF-V), it generalizes robust reasoning skills from broad multimodal data while effectively reducing hallucinations.
|
||||||
|
|
||||||
|
### Evaluation <!-- omit in toc -->
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="./assets/radar_minicpm_v45.png", width=60%>
|
||||||
|
</div>
|
||||||
|
<div align="center">
|
||||||
|
<img src="./assets/minicpmv_4_5_evaluation_result.png" , width=80%>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
### Inference Efficiency
|
||||||
|
|
||||||
|
|
||||||
|
**OpenCompass**
|
||||||
|
<div align="left">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Avg Score ↑</th>
|
||||||
|
<th>Total Inference Time ↓</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4.1V-9B-Thinking</td>
|
||||||
|
<td>10.3B</td>
|
||||||
|
<td>76.6</td>
|
||||||
|
<td>17.5h</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiMo-VL-7B-RL</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>76.4</td>
|
||||||
|
<td>11h</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 4.5</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td><b>77.0</b></td>
|
||||||
|
<td><b>7.5h</b></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
**Video-MME**
|
||||||
|
|
||||||
|
<div align="left">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Avg Score ↑</th>
|
||||||
|
<th>Total Inference Time ↓</th>
|
||||||
|
<th>GPU Mem ↓</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-7B-Instruct</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>71.6</td>
|
||||||
|
<td>3h</td>
|
||||||
|
<td>60G</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4.1V-9B-Thinking</td>
|
||||||
|
<td>10.3B</td>
|
||||||
|
<td><b>73.6</b></td>
|
||||||
|
<td>2.63h</td>
|
||||||
|
<td>32G</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 4.5</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td>73.5</td>
|
||||||
|
<td><b>0.26h</b></td>
|
||||||
|
<td><b>28G</b></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
Both Video-MME and OpenCompass were evaluated using 8×A100 GPUs for inference. The reported inference time of Video-MME includes full model-side computation, and excludes the external cost of video frame extraction (dependent on specific frame extraction tools) for fair comparison.
|
||||||
|
|
||||||
|
|
||||||
|
### Examples <!-- omit in toc -->
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<a href="https://www.youtube.com/watch?v=Cn23FujYMMU"><img src="../assets/minicpmv4_5/MiniCPM-V 4.5-8.26_img.jpeg", width=70%></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv4_5/en_case1.png" alt="en_case1" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv4_5/en_case2.png" alt="en_case2" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv4_5/en_case3.jpeg" alt="en_case3" style="margin-bottom: 5px;">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Click to view more cases.</summary>
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv4_5/zh_extra.jpeg" alt="zh_extra" style="margin-bottom: 5px;">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
We deploy MiniCPM-V 4.5 on iPad M4 with [iOS demo](https://github.com/tc-mb/MiniCPM-o-demo-iOS). The demo video is the raw screen recording without any editing.
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/minicpmv4_5/v45_en_handwriting.gif" width=45%/>
|
||||||
|
|
||||||
|
<img src="../assets/minicpmv4_5/v45_en_cot.gif" width=45%/>
|
||||||
|
</p>
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/minicpmv4_5/v45_cn_handwriting.gif" width=45%/>
|
||||||
|
|
||||||
|
<img src="../assets/minicpmv4_5/v45_cn_travel.gif" width=45%/>
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
|
|
||||||
156
docs/minicpm_v4dot5_zh.md
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
## MiniCPM-V 4.5
|
||||||
|
|
||||||
|
> Archive at: 2026-02-03
|
||||||
|
|
||||||
|
**MiniCPM-V 4.5** 是 MiniCPM-V 系列中最新、最强大的模型。该模型基于 Qwen3-8B 与 SigLIP2-400M 构建,总参数量为 8B。其在性能上较前代 MiniCPM-V 与 MiniCPM-o 有显著提升,并引入了一系列全新的实用特性。其主要亮点包括:
|
||||||
|
|
||||||
|
|
||||||
|
- 🔥 **领先的视觉理解能力**
|
||||||
|
MiniCPM-V 4.5 在 OpenCompass 综合评测(涵盖 8 个主流评测基准)中取得了 77.0 的高分。**在仅 8B 参数的情况下超越了广泛使用的闭源模型(如 GPT-4o-latest、Gemini-2.0 Pro)以及强大的开源模型(如 Qwen2.5-VL 72B)**,成为 30B 参数规模以下最强的多模态大模型。
|
||||||
|
|
||||||
|
- 🎬 **高效的高帧率与长视频理解**
|
||||||
|
借助全新的图像-视频统一 3D-Resampler,MiniCPM-V 4.5 能够实现 96 倍视频 token 压缩率,即将 6 帧 448x448 视频帧联合压缩为 64 个 token(大多数多模态大模型需约 1536 个 token)。这意味着模型在语言模型推理成本不增加的情况下,可以感知显著更多的视频帧,从而实现业界领先的 高帧率(最高 10FPS)视频理解与长视频理解,并在 Video-MME、LVBench、MLVU、MotionBench、FavorBench 等基准上高效率地展现出色性能。
|
||||||
|
|
||||||
|
- ⚙️ **可控的快思考 / 深思考模式**
|
||||||
|
MiniCPM-V 4.5 同时支持 快思考(用于高频高效推理,性能具竞争力)与 深思考(用于复杂问题求解)。用户可根据不同场景对效率与性能的权衡,自由切换两种模式,实现高度可控的推理过程。
|
||||||
|
|
||||||
|
- 💪 **优秀的 OCR、文档解析与多语言能力**
|
||||||
|
基于 [LLaVA-UHD](https://arxiv.org/pdf/2403.11703) 架构,MiniCPM-V 4.5 能处理任意长宽比、最高达 180 万像素(如 1344x1344) 的高分辨率图像,同时使用的视觉 token 数仅为多数 MLLM 的 1/4。其在 OCRBench 上取得超越 GPT-4o-latest 与 Gemini 2.5 等闭源模型的性能,并在 OmniDocBench 上展现了业界顶尖的 PDF 文档解析能力。借助最新的 [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) 和 [VisCPM](https://github.com/OpenBMB/VisCPM) 技术,模型在可靠性上表现优异,在 MMHal-Bench 上超越 GPT-4o-latest,并支持 30+ 种语言的多语言能力。
|
||||||
|
|
||||||
|
- 💫 **便捷易用的部署方式**
|
||||||
|
MiniCPM-V 4.5 提供丰富灵活的使用方式:(1) [llama.cpp](https://github.com/tc-mb/llama.cpp/blob/master/docs/multimodal/minicpmo4.5.md) 与 [ollama](https://github.com/tc-mb/ollama/tree/MIniCPM-V) 支持本地 CPU 高效推理;(2) 提供 [int4](https://huggingface.co/openbmb/MiniCPM-V-4_5-int4)、[GGUF](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf)、[AWQ](https://github.com/tc-mb/AutoAWQ) 等 16 种规格的量化模型;(3)兼容 SGLang 与 [vLLM](#efficient-inference-with-llamacpp-ollama-vllm) (4) 借助 [Transformers](https://github.com/tc-mb/transformers/tree/main) 与 [LLaMA-Factory](./docs/llamafactory_train_and_infer.md) 在新领域与任务上进行微调;(5) 快速启动本地 [WebUI demo](#chat-with-our-demo-on-gradio);(6) 优化适配的 [iOS 本地应用](https://github.com/tc-mb/MiniCPM-o-demo-iOS),可在 iPhone 与 iPad 上高效运行;(7) 在线 [Web demo](http://101.126.42.235:30910/) 体验。更多使用方式请见 [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-CookBook)。
|
||||||
|
|
||||||
|
### 技术亮点 <!-- omit in toc -->
|
||||||
|
|
||||||
|
- **架构:图像-视频统一的高密度视觉压缩 3D-Resampler**。 MiniCPM-V 4.5 在架构上引入了 3D-Resampler,成功突破了视频理解任务中性能与效率难以兼得的瓶颈。该方法能够将多达 6 帧连续视频帧压缩为仅 64 个 token(与 MiniCPM-V 系列中单张图像所用的 token 数相同),从而实现 96× 的视频 token 压缩率。这使得模型在语言模型计算成本不增加的情况下,可以处理更多的视频帧,从而实现高帧率视频理解和长视频理解。该架构统一支持单图、多图和视频的编码处理,确保了能力与知识的无缝迁移。
|
||||||
|
|
||||||
|
- **学习机制:OCR与文档知识的统一学习**。现有多模态大模型一般在不同训练阶段分别单独训练 OCR 能力与文档知识。我们发现这两个训练过程的本质差异在于图像中文本的可见性。通过动态对文档文本区域施加不同强度的噪声干扰,并要求模型重建文本,使其学会自适应地在准确文本识别(当文本清晰时)与基于多模态上下文的知识推理(当文本严重遮挡时)之间切换。这种方法使得 MiniCPM-V 在文档知识学习中摆脱了对高错误率的文档解析器的依赖,同时避免了过度增强的 OCR 数据产生的幻觉问题,以最小工程开销实现了顶尖的 OCR 与多模态知识处理性能。
|
||||||
|
|
||||||
|
- **后训练优化:基于多模态强化学习的混合快思考/深度思考模式**。 MiniCPM-V 4.5 通过两种可切换推理模式提供均衡的体验:面向高效日常应用的快速思考模式,以及处理复杂任务的深度思考模式。采用新颖的混合强化学习方法,模型可联合优化两种模式,在保持深度模式能力的同时显著提升快速模式性能。结合 [RLPR](https://github.com/OpenBMB/RLPR) 和 [RLAIF-V](https://github.com/RLHF-V/RLAIF-V) 技术,该模型可以从海量多模态数据中泛化出强大的推理能力,并有效减少幻觉现象。
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="../assets/minicpm-v-4dot5-framework.png" , width=80%>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
### 性能评估 <!-- omit in toc -->
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="../assets/radar_minicpm_v45.png", width=80%>
|
||||||
|
</div>
|
||||||
|
<div align="center">
|
||||||
|
<img src="../assets/minicpmv_4_5_evaluation_result.png" , width=80%>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
### 推理效率 <!-- omit in toc -->
|
||||||
|
|
||||||
|
|
||||||
|
**OpenCompass**
|
||||||
|
<div align="left">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Avg Score ↑</th>
|
||||||
|
<th>Total Inference Time ↓</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4.1V-9B-Thinking</td>
|
||||||
|
<td>10.3B</td>
|
||||||
|
<td>76.6</td>
|
||||||
|
<td>17.5h</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiMo-VL-7B-RL</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>76.4</td>
|
||||||
|
<td>11h</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 4.5</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td><b>77.0</b></td>
|
||||||
|
<td><b>7.5h</b></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
**Video-MME**
|
||||||
|
|
||||||
|
<div align="left">
|
||||||
|
<table style="margin: 0px auto;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th align="left">Model</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Avg Score ↑</th>
|
||||||
|
<th>Total Inference Time ↓</th>
|
||||||
|
<th>GPU Mem ↓</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody align="center">
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">Qwen2.5-VL-7B-Instruct</td>
|
||||||
|
<td>8.3B</td>
|
||||||
|
<td>71.6</td>
|
||||||
|
<td>3h</td>
|
||||||
|
<td>60G</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">GLM-4.1V-9B-Thinking</td>
|
||||||
|
<td>10.3B</td>
|
||||||
|
<td><b>73.6</b></td>
|
||||||
|
<td>2.63h</td>
|
||||||
|
<td>32G</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td nowrap="nowrap" align="left">MiniCPM-V 4.5</td>
|
||||||
|
<td>8.7B</td>
|
||||||
|
<td>73.5</td>
|
||||||
|
<td><b>0.26h</b></td>
|
||||||
|
<td><b>28G</b></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
|
||||||
|
OpenCompass 和 Video-MME 均采用 A100*8卡 推理,其中 Video-MME 的推理时间未统计视频抽帧时间
|
||||||
|
|
||||||
|
### 典型示例 <!-- omit in toc -->
|
||||||
|
<div align="center">
|
||||||
|
<a href="https://www.youtube.com/watch?v=Cn23FujYMMU"><img src="../assets/minicpmv4_5/MiniCPM-V 4.5-8.26_img.jpeg", width=70%></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv4_5/zh_case1.jpeg" alt="zh_case1" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv4_5/zh_case2.jpeg" alt="zh_case2" style="margin-bottom: 5px;">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>点击查看更多示例</summary>
|
||||||
|
<div style="display: flex; flex-direction: column; align-items: center;">
|
||||||
|
<img src="../assets/minicpmv4_5/en_extra.jpg" alt="en_extra" style="margin-bottom: 5px;">
|
||||||
|
<img src="../assets/minicpmv4_5/en_case3.jpeg" alt="en_extra" style="margin-bottom: 5px;">
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
我们使用 [iOS demo](https://github.com/tc-mb/MiniCPM-o-demo-iOS) 将 MiniCPM-V 4.5 部署在 iPad M4 ,并录制以下演示录屏,视频未经任何编辑。
|
||||||
|
|
||||||
|
<table align="center">
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/minicpmv4_5/v45_en_handwriting.gif" width=45%/>
|
||||||
|
|
||||||
|
<img src="../assets/minicpmv4_5/v45_en_cot.gif" width=45%/>
|
||||||
|
</p>
|
||||||
|
<p align="center">
|
||||||
|
<img src="../assets/minicpmv4_5/v45_cn_handwriting.gif" width=45%/>
|
||||||
|
|
||||||
|
<img src="../assets/minicpmv4_5/v45_cn_travel.gif" width=45%/>
|
||||||
|
</p>
|
||||||
|
</table>
|
||||||
|
|
||||||
@@ -69,6 +69,7 @@ class SupervisedDataset(Dataset):
|
|||||||
batch_vision=self.batch_vision,
|
batch_vision=self.batch_vision,
|
||||||
max_length=self.max_length
|
max_length=self.max_length
|
||||||
)
|
)
|
||||||
|
|
||||||
ret = dict(
|
ret = dict(
|
||||||
input_ids=ret["input_ids"],
|
input_ids=ret["input_ids"],
|
||||||
position_ids=ret["position_ids"],
|
position_ids=ret["position_ids"],
|
||||||
@@ -283,20 +284,30 @@ def conversation_to_ids_qwen2(conversation, tokenizer):
|
|||||||
chat.append({"role":prefix, "content":message})
|
chat.append({"role":prefix, "content":message})
|
||||||
raw_msg += prefix + message
|
raw_msg += prefix + message
|
||||||
assert set([i['role'] for i in chat]) & set(['assistant'])
|
assert set([i['role'] for i in chat]) & set(['assistant'])
|
||||||
|
if '<think>' in chat[-1]['content'] and '</think>' in chat[-1]['content']:
|
||||||
|
enable_thinking = True
|
||||||
|
else:
|
||||||
|
enable_thinking = False
|
||||||
|
|
||||||
ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
|
ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False, enable_thinking=enable_thinking)
|
||||||
input_ids = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=False)
|
input_ids = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=False, enable_thinking=enable_thinking)
|
||||||
input_ids = np.array(input_ids)
|
input_ids = np.array(input_ids)
|
||||||
|
if "<think>\n\n</think>\n\n" in ret:
|
||||||
|
offset = 4
|
||||||
|
else:
|
||||||
|
offset = 0
|
||||||
start_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_start|>'))[0]
|
start_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_start|>'))[0]
|
||||||
assistant_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('assistant'))[0]
|
assistant_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('assistant'))[0]
|
||||||
end_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_end|>'))[0]
|
end_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_end|>'))[0]
|
||||||
|
|
||||||
context = np.ones_like(input_ids, dtype=np.int8)
|
context = np.ones_like(input_ids, dtype=np.int8)
|
||||||
|
|
||||||
for assistant_idx in assistant_idxs:
|
for i, assistant_idx in enumerate(assistant_idxs):
|
||||||
if assistant_idx-1 in set(start_idxs):
|
if assistant_idx-1 in set(start_idxs):
|
||||||
st = assistant_idx + 1
|
if i == len(assistant_idxs) -1:
|
||||||
|
st = assistant_idx + 2 + offset
|
||||||
|
else:
|
||||||
|
st = assistant_idx + 2
|
||||||
for end_idx in end_idxs:
|
for end_idx in end_idxs:
|
||||||
if end_idx > st:
|
if end_idx > st:
|
||||||
context[st: end_idx + 1] = 0
|
context[st: end_idx + 1] = 0
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ torchrun $DISTRIBUTED_ARGS finetune.py \
|
|||||||
--per_device_train_batch_size 1 \
|
--per_device_train_batch_size 1 \
|
||||||
--per_device_eval_batch_size 1 \
|
--per_device_eval_batch_size 1 \
|
||||||
--gradient_accumulation_steps 1 \
|
--gradient_accumulation_steps 1 \
|
||||||
--evaluation_strategy "steps" \
|
--eval_strategy "steps" \
|
||||||
--save_strategy "steps" \
|
--save_strategy "steps" \
|
||||||
--save_steps 1000 \
|
--save_steps 1000 \
|
||||||
--save_total_limit 10 \
|
--save_total_limit 10 \
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# MiniCPM-V Finetuning
|
# MiniCPM-V & o Finetuning
|
||||||
|
|
||||||
|
|
||||||
We offer the official scripts for easy finetuning of the pretrained **MiniCPM-o-2_6**, **MiniCPM-V-2_6**, **MiniCPM-Llama3-V 2.5** and **MiniCPM-V 2.0** on downstream tasks. Our finetune scripts use transformers Trainer and DeepSpeed by default.
|
We offer the official scripts for easy finetuning of the pretrained **MiniCPM-V 4.0**, **MiniCPM-o 2.6**, **MiniCPM-V 2.6**, **MiniCPM-Llama3-V 2.5** and **MiniCPM-V 2.0** on downstream tasks. Our finetune scripts use transformers Trainer and DeepSpeed by default.
|
||||||
|
|
||||||
### Data preparation
|
### Data preparation
|
||||||
|
|
||||||
@@ -96,11 +96,10 @@ If the total token count exceeds `max_length`, truncation will be applied. For m
|
|||||||
Full-parameter finetuning requires updating all parameters of LLM in the whole training process. Please specify the correct MODEL path, DATA path and LLM_TYPE in the shell scripts.
|
Full-parameter finetuning requires updating all parameters of LLM in the whole training process. Please specify the correct MODEL path, DATA path and LLM_TYPE in the shell scripts.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
MODEL="MiniCPM-o-2_6" # or "openbmb/MiniCPM-V-2_6", openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2
|
MODEL="MiniCPM-o-2_6" # or "openbmb/MiniCPM-V-2_6", "openbmb/MiniCPM-Llama3-V-2_5", "openbmb/MiniCPM-V-2"
|
||||||
DATA="path/to/trainging_data" # json file
|
DATA="path/to/training_data.json"
|
||||||
EVAL_DATA="path/to/test_data" # json file
|
EVAL_DATA="path/to/test_data.json"
|
||||||
LLM_TYPE="qwen" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3",
|
LLM_TYPE="qwen" # llama for MiniCPM-V-4, minicpm for MiniCPM-V-2, llama3 for MiniCPM-Llama3-V-2_5, qwen for MiniCPM-o-2_6/MiniCPM-V-2_6
|
||||||
# if use openbmb/MiniCPM-o-2_6 or openbmb/MiniCPM-V-2_6, please set LLM_TYPE=qwen
|
|
||||||
```
|
```
|
||||||
|
|
||||||
To launch your training, run the following script:
|
To launch your training, run the following script:
|
||||||
|
|||||||
@@ -35,8 +35,8 @@ http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_st
|
|||||||
decord
|
decord
|
||||||
aiosignal
|
aiosignal
|
||||||
tensorboard
|
tensorboard
|
||||||
deepspeed==0.12.3
|
deepspeed
|
||||||
transformers==4.44.2
|
transformers==4.51.2
|
||||||
librosa==0.9.0
|
librosa==0.9.0
|
||||||
soundfile==0.12.1
|
soundfile==0.12.1
|
||||||
vector-quantize-pytorch==1.18.5
|
vector-quantize-pytorch==1.18.5
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from transformers.trainer_pt_utils import nested_detach
|
|||||||
from transformers.utils import is_sagemaker_mp_enabled
|
from transformers.utils import is_sagemaker_mp_enabled
|
||||||
from transformers.trainer import *
|
from transformers.trainer import *
|
||||||
from transformers.integrations import is_deepspeed_zero3_enabled
|
from transformers.integrations import is_deepspeed_zero3_enabled
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
class CPMTrainer(Trainer):
|
class CPMTrainer(Trainer):
|
||||||
def compute_loss(self, model, inputs, return_outputs=False):
|
def compute_loss(self, model, inputs, return_outputs=False):
|
||||||
@@ -170,7 +170,7 @@ class CPMTrainer(Trainer):
|
|||||||
|
|
||||||
return (loss, logits, labels)
|
return (loss, logits, labels)
|
||||||
|
|
||||||
def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
|
def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch=None) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
Perform a training step on a batch of inputs.
|
Perform a training step on a batch of inputs.
|
||||||
|
|
||||||
@@ -190,7 +190,6 @@ class CPMTrainer(Trainer):
|
|||||||
"""
|
"""
|
||||||
model.train()
|
model.train()
|
||||||
inputs = self._prepare_inputs(inputs)
|
inputs = self._prepare_inputs(inputs)
|
||||||
|
|
||||||
if is_sagemaker_mp_enabled():
|
if is_sagemaker_mp_enabled():
|
||||||
loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
|
loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
|
||||||
return loss_mb.reduce_mean().detach().to(self.args.device)
|
return loss_mb.reduce_mean().detach().to(self.args.device)
|
||||||
|
|||||||
@@ -1,81 +0,0 @@
|
|||||||
"""
|
|
||||||
the script will use bitandbytes to quantize the MiniCPM-Llama3-V-2_5 model.
|
|
||||||
the be quantized model can be finetuned by MiniCPM-Llama3-V-2_5 or not.
|
|
||||||
you only need to set the model_path 、save_path and run bash code
|
|
||||||
|
|
||||||
cd MiniCPM-V
|
|
||||||
python quantize/bnb_quantize.py
|
|
||||||
|
|
||||||
you will get the quantized model in save_path、quantized_model test time and gpu usage
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
|
|
||||||
from PIL import Image
|
|
||||||
import time
|
|
||||||
import torch
|
|
||||||
import GPUtil
|
|
||||||
import os
|
|
||||||
|
|
||||||
assert torch.cuda.is_available(),"CUDA is not available, but this code requires a GPU."
|
|
||||||
|
|
||||||
device = 'cuda' # Select GPU to use
|
|
||||||
model_path = '/root/ld/ld_model_pretrained/MiniCPM-Llama3-V-2_5' # Model download path
|
|
||||||
save_path = '/root/ld/ld_model_pretrain/MiniCPM-Llama3-V-2_5_int4' # Quantized model save path
|
|
||||||
image_path = './assets/airplane.jpeg'
|
|
||||||
|
|
||||||
|
|
||||||
# Create a configuration object to specify quantization parameters
|
|
||||||
quantization_config = BitsAndBytesConfig(
|
|
||||||
load_in_4bit=True, # Whether to perform 4-bit quantization
|
|
||||||
load_in_8bit=False, # Whether to perform 8-bit quantization
|
|
||||||
bnb_4bit_compute_dtype=torch.float16, # Computation precision setting
|
|
||||||
bnb_4bit_quant_storage=torch.uint8, # Storage format for quantized weights
|
|
||||||
bnb_4bit_quant_type="nf4", # Quantization format, here using normally distributed int4
|
|
||||||
bnb_4bit_use_double_quant=True, # Whether to use double quantization, i.e., quantizing zeropoint and scaling parameters
|
|
||||||
llm_int8_enable_fp32_cpu_offload=False, # Whether LLM uses int8, with fp32 parameters stored on the CPU
|
|
||||||
llm_int8_has_fp16_weight=False, # Whether mixed precision is enabled
|
|
||||||
llm_int8_skip_modules=["out_proj", "kv_proj", "lm_head"], # Modules not to be quantized
|
|
||||||
llm_int8_threshold=6.0 # Outlier value in the llm.int8() algorithm, distinguishing whether to perform quantization based on this value
|
|
||||||
)
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
|
||||||
model = AutoModel.from_pretrained(
|
|
||||||
model_path,
|
|
||||||
device_map=device, # Allocate model to device
|
|
||||||
quantization_config=quantization_config,
|
|
||||||
trust_remote_code=True
|
|
||||||
)
|
|
||||||
|
|
||||||
gpu_usage = GPUtil.getGPUs()[0].memoryUsed
|
|
||||||
start=time.time()
|
|
||||||
response = model.chat(
|
|
||||||
image=Image.open(image_path).convert("RGB"),
|
|
||||||
msgs=[
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "What is in this picture?"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
tokenizer=tokenizer
|
|
||||||
) # 模型推理
|
|
||||||
print('Output after quantization:',response)
|
|
||||||
print('Inference time after quantization:',time.time()-start)
|
|
||||||
print(f"GPU memory usage after quantization: {round(gpu_usage/1024,2)}GB")
|
|
||||||
|
|
||||||
"""
|
|
||||||
Expected output:
|
|
||||||
|
|
||||||
Output after quantization: This picture contains specific parts of an airplane, including wings, engines, and tail sections. These components are key parts of large commercial aircraft.
|
|
||||||
The wings support lift during flight, while the engines provide thrust to move the plane forward. The tail section is typically used for stabilizing flight and plays a role in airline branding.
|
|
||||||
The design and color of the airplane indicate that it belongs to Air China, likely a passenger aircraft due to its large size and twin-engine configuration.
|
|
||||||
There are no markings or insignia on the airplane indicating the specific model or registration number; such information may require additional context or a clearer perspective to discern.
|
|
||||||
Inference time after quantization: 8.583992719650269 seconds
|
|
||||||
GPU memory usage after quantization: 6.41 GB
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Save the model and tokenizer
|
|
||||||
os.makedirs(save_path, exist_ok=True)
|
|
||||||
model.save_pretrained(save_path, safe_serialization=True)
|
|
||||||
tokenizer.save_pretrained(save_path)
|
|
||||||