Modify eval_mm for MiniCPM-V 2.6

2026-02-05 18:29:18 +08:00 · 2024-08-30 18:18:22 +00:00
parent ab1141ee45
commit 59224808a1
69 changed files with 8231 additions and 1818 deletions
--- a/eval_mm/vlmevalkit/vlmeval/vlm/base.py
+++ b/eval_mm/vlmevalkit/vlmeval/vlm/base.py
@@ -1,12 +1,15 @@
 from ..smp import *
-from ..utils.dataset_config import img_root_map
+from ..dataset import img_root_map
 from abc import abstractmethod


 class BaseModel:

    INTERLEAVE = False
-    allowed_types = ['text', 'image']
+    allowed_types = ['text', 'image', 'video']
+
+    def __init__(self):
+        self.dump_image_func = None

    def use_custom_prompt(self, dataset):
        """Whether to use custom prompt for the given dataset.
@@ -33,34 +36,11 @@ class BaseModel:
        """
        raise NotImplementedError

+    def set_dump_image(self, dump_image_func):
+        self.dump_image_func = dump_image_func
+
    def dump_image(self, line, dataset):
-        """Dump the image(s) of the input line to the corresponding dataset folder.
-
-        Args:
-            line (line of pd.DataFrame): The raw input line.
-            dataset (str): The name of the dataset.
-
-        Returns:
-            str | list[str]: The paths of the dumped images.
-        """
-        ROOT = LMUDataRoot()
-        assert isinstance(dataset, str)
-        img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
-        os.makedirs(img_root, exist_ok=True)
-        if isinstance(line['image'], list):
-            tgt_path = []
-            assert 'image_path' in line
-            for img, im_name in zip(line['image'], line['image_path']):
-                path = osp.join(img_root, im_name)
-                if not read_ok(path):
-                    decode_base64_to_image_file(img, path)
-                tgt_path.append(path)
-        else:
-            tgt_path = osp.join(img_root, f"{line['index']}.jpg")
-            if not read_ok(tgt_path):
-                decode_base64_to_image_file(line['image'], tgt_path)
-            tgt_path = [tgt_path]
-        return tgt_path
+        return self.dump_image_func(line)

    @abstractmethod
    def generate_inner(self, message, dataset=None):
@@ -134,7 +114,25 @@ class BaseModel:
            assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
        return self.generate_inner(message, dataset)

-    def message_to_promptimg(self, message):
+    def chat(self, messages, dataset=None):
+        """The main function for multi-turn chatting. Will call `chat_inner` with the preprocessed input messages."""
+        assert hasattr(self, 'chat_inner'), 'The API model should has the `chat_inner` method. '
+        for msg in messages:
+            assert isinstance(msg, dict) and 'role' in msg and 'content' in msg, msg
+            assert self.check_content(msg['content']) in ['str', 'dict', 'liststr', 'listdict'], msg
+            msg['content'] = self.preproc_content(msg['content'])
+
+        while len(messages):
+            try:
+                return self.chat_inner(messages, dataset=dataset)
+            except:
+                messages = messages[1:]
+                while len(messages) and messages[0]['role'] != 'user':
+                    messages = messages[1:]
+                continue
+        return 'Chat Mode: Failed with all possible conversation turns.'
+
+    def message_to_promptimg(self, message, dataset=None):
        assert not self.INTERLEAVE
        model_name = self.__class__.__name__
        warnings.warn(
@@ -146,5 +144,24 @@ class BaseModel:
            image = None
        else:
            prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
-            image = [x['value'] for x in message if x['type'] == 'image'][0]
+            images = [x['value'] for x in message if x['type'] == 'image']
+            if 'BLINK' == dataset:
+                image = concat_images_vlmeval(images, target_size=512)
+            else:
+                image = images[0]
        return prompt, image
+
+    def message_to_promptvideo(self, message):
+        if self.VIDEO_LLM:
+            num_videos = len([x for x in message if x['type'] == 'video'])
+            if num_videos == 0:
+                prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+                video = None
+            else:
+                prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+                video = [x['value'] for x in message if x['type'] == 'video'][0]
+            return prompt, video
+        else:
+            import sys
+            warnings.warn('Model does not support video input.')
+            sys.exit(-1)