Modify eval_mm for MiniCPM-o 2.6

2026-02-04 09:49:20 +08:00 · 2025-01-21 15:34:54 +08:00
parent ec68cefc17
commit d8f382e157
82 changed files with 14279 additions and 843 deletions
--- a/eval_mm/vlmevalkit/scripts/run_inference.sh
+++ b/eval_mm/vlmevalkit/scripts/run_inference.sh
@@ -5,27 +5,37 @@ export OMP_NUM_THREADS=1
 export timestamp=`date +"%Y%m%d%H%M%S"`
 export OLD_VERSION='False'
 export PYTHONPATH=$(dirname $SELF_DIR):$PYTHONPATH
+export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

 # gpu consumed
 # fp16 17-18G
 # int4 7-8G

 # model to be used
-# Example: MODELNAME=MiniCPM_V_2_6
+# Example: MODELNAME=MiniCPM-o-2_6
 MODELNAME=$1
 # datasets to be tested
-# Example: DATALIST="MMMU_DEV_VAL MathVista_MINI MMVet MMBench_DEV_EN_V11 MMBench_DEV_CN_V11 MMStar HallusionBench AI2D_TEST"
+# Example: DATALIST=MMMU_DEV_VAL
 DATALIST=$2
-# test mode, all or infer
-MODE=$3

-echo "Starting inference with model $MODELNAME on datasets $DATALIST"
 # run on multi gpus with torchrun command
 # remember to run twice, the first run may fail
-torchrun --nproc_per_node=8 run.py --data $DATALIST --model $MODELNAME --mode $MODE
-torchrun --nproc_per_node=8 run.py --data $DATALIST --model $MODELNAME --mode $MODE
-# run on single gpu with python command
-# python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE
-# python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE
+for DATASET in $DATALIST; do
+    echo "Starting inference with model $MODELNAME on dataset $DATASET"
+    torchrun --master_port 29500 --nproc_per_node=8 run.py --data $DATASET --model $MODELNAME --mode infer --reuse
+    torchrun --master_port 29501 --nproc_per_node=8 run.py --data $DATASET --model $MODELNAME --mode infer --reuse

-ls
+    # for benchmarks which require gpt for scoring, you need to specify OPENAI_API_BASE and OPENAI_API_KEY in .env file
+    if [[ "$DATASET" == *"MMBench_TEST"*]]; then
+        echo "Skipping evaluation for dataset $DATASET"
+    else
+        echo "Starting evaluation with model $MODELNAME on datasets $DATASET"
+        python run.py --data $DATASET --model $MODELNAME --nproc 16 --verbose
+    fi
+done
+
+# run on single gpu with python command
+# python run.py --data $DATALIST --model $MODELNAME --verbose --mode infer
+# python run.py --data $DATALIST --model $MODELNAME --verbose --mode infer
+# echo "Starting evaluation with model $MODELNAME on datasets $DATASET"
+# python run.py --data $DATASET --model $MODELNAME --nproc 16 --verbose