Modify eval_mm for MiniCPM-V 2.6

2026-02-05 18:29:18 +08:00 · 2024-08-30 18:18:22 +00:00
parent ab1141ee45
commit 59224808a1
69 changed files with 8231 additions and 1818 deletions
--- a/eval_mm/vlmevalkit/vlmeval/smp/vlm.py
+++ b/eval_mm/vlmevalkit/vlmeval/smp/vlm.py
@@ -7,10 +7,53 @@ from uuid import uuid4
 import os.path as osp
 import base64
 from PIL import Image
-from .file import load, dump
+import sys
+
 Image.MAX_IMAGE_PIXELS = 1e9


+def rescale_img(img, tgt=None):
+    """Resize `img` to `tgt` = (width, -1) or (-1, height), keeping its aspect ratio.
+
+    Raises AssertionError unless `tgt` is a 2-tuple with exactly one -1 entry."""
+    assert isinstance(tgt, tuple) and len(tgt) == 2 and tgt.count(-1) == 1, tgt
+    w, h = img.size
+    if tgt[0] != -1:  # width pinned; derived height is clamped to >= 1 pixel
+        return img.resize((tgt[0], max(1, int(tgt[0] / w * h))))
+    return img.resize((max(1, int(tgt[1] / h * w)), tgt[1]))
+
+
+def concat_images_vlmeval(images, target_size=-1, mode='h', return_image=False):
+    """Concatenate image files side by side (mode='h') or top to bottom (mode='v').
+
+    `images` is a list of paths handed to `Image.open`. If `target_size` != -1, each
+    image is first rescaled to that height ('h') or width ('v'). Returns the merged
+    RGB image when `return_image` is true; otherwise saves it under /tmp as
+    `<md5 of the newline-joined paths, as computed by .file.md5>.jpg` and returns that path.
+    """
+    from .file import md5
+
+    assert mode in ('h', 'v'), mode
+    ims = [Image.open(im) for im in images]
+    if target_size != -1:
+        tgt = (-1, target_size) if mode == 'h' else (target_size, -1)
+        ims = [rescale_img(im, tgt) for im in ims]
+
+    ws, hs = [x.width for x in ims], [x.height for x in ims]
+    horizontal = mode == 'h'
+    dst = Image.new('RGB', (sum(ws), max(hs)) if horizontal else (max(ws), sum(hs)))
+    offset = 0  # running x (mode 'h') or y (mode 'v') position of the next image
+    for im in ims:
+        # The box must be an (x, y) tuple; vertical stacking advances y by each height.
+        dst.paste(im, (offset, 0) if horizontal else (0, offset))
+        offset += im.width if horizontal else im.height
+    if return_image:
+        return dst
+    tgt = osp.join('/tmp', md5('\n'.join(images)) + '.jpg')
+    dst.save(tgt)
+    return tgt
+
+
 def mmqa_display(question, target_size=512):
    question = {k.lower(): v for k, v in question.items()}
    keys = list(question.keys())
@@ -41,14 +84,12 @@ def encode_image_to_base64(img, target_size=-1):
    # else, will set the max_size ot (target_size, target_size)
    if img.mode in ('RGBA', 'P'):
        img = img.convert('RGB')
-    tmp = osp.join('/tmp', str(uuid4()) + '.jpg')
    if target_size > 0:
        img.thumbnail((target_size, target_size))
-    img.save(tmp)
-    with open(tmp, 'rb') as image_file:
-        image_data = image_file.read()
+    img_buffer = io.BytesIO()
+    img.save(img_buffer, format='JPEG')
+    image_data = img_buffer.getvalue()
    ret = base64.b64encode(image_data).decode('utf-8')
-    os.remove(tmp)
    return ret


@@ -110,6 +151,7 @@ def circular_pred(df, extract_func=None):
        extract_func = lambda x: x  # noqa: E731
    df = df.sort_values('index')
    from vlmeval.utils import can_infer_option
+
    shift = int(1e6)

    choices = [extract_func(x) for x in df['prediction']]
@@ -118,9 +160,12 @@ def circular_pred(df, extract_func=None):
    valid_map = {i: True for i in pred_map if i < 1e6}
    for i in df['index']:
        if i >= shift and pred_map[i] and pred_map[i - shift]:
-            if (
-                pred_map[i] not in list(string.ascii_uppercase) or  # noqa: W504
-                pred_map[i - shift] not in list(string.ascii_uppercase)
+            if pred_map[i] not in list(
+                string.ascii_uppercase
+            ) or pred_map[  # noqa: W504
+                i - shift
+            ] not in list(
+                string.ascii_uppercase
            ):

                valid_map[i % shift] = False