mirror of
https://github.com/OpenBMB/MiniCPM-V.git
synced 2026-02-05 18:29:18 +08:00
Modify eval_mm for MiniCPM-V 2.6
This commit is contained in:
@@ -1,12 +1,15 @@
|
||||
from ..smp import *
|
||||
from ..utils.dataset_config import img_root_map
|
||||
from ..dataset import img_root_map
|
||||
from abc import abstractmethod
|
||||
|
||||
|
||||
class BaseModel:
|
||||
|
||||
INTERLEAVE = False
|
||||
allowed_types = ['text', 'image']
|
||||
allowed_types = ['text', 'image', 'video']
|
||||
|
||||
def __init__(self):
|
||||
self.dump_image_func = None
|
||||
|
||||
def use_custom_prompt(self, dataset):
|
||||
"""Whether to use custom prompt for the given dataset.
|
||||
@@ -33,34 +36,11 @@ class BaseModel:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def set_dump_image(self, dump_image_func):
|
||||
self.dump_image_func = dump_image_func
|
||||
|
||||
def dump_image(self, line, dataset):
|
||||
"""Dump the image(s) of the input line to the corresponding dataset folder.
|
||||
|
||||
Args:
|
||||
line (line of pd.DataFrame): The raw input line.
|
||||
dataset (str): The name of the dataset.
|
||||
|
||||
Returns:
|
||||
str | list[str]: The paths of the dumped images.
|
||||
"""
|
||||
ROOT = LMUDataRoot()
|
||||
assert isinstance(dataset, str)
|
||||
img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
|
||||
os.makedirs(img_root, exist_ok=True)
|
||||
if isinstance(line['image'], list):
|
||||
tgt_path = []
|
||||
assert 'image_path' in line
|
||||
for img, im_name in zip(line['image'], line['image_path']):
|
||||
path = osp.join(img_root, im_name)
|
||||
if not read_ok(path):
|
||||
decode_base64_to_image_file(img, path)
|
||||
tgt_path.append(path)
|
||||
else:
|
||||
tgt_path = osp.join(img_root, f"{line['index']}.jpg")
|
||||
if not read_ok(tgt_path):
|
||||
decode_base64_to_image_file(line['image'], tgt_path)
|
||||
tgt_path = [tgt_path]
|
||||
return tgt_path
|
||||
return self.dump_image_func(line)
|
||||
|
||||
@abstractmethod
|
||||
def generate_inner(self, message, dataset=None):
|
||||
@@ -134,7 +114,25 @@ class BaseModel:
|
||||
assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
|
||||
return self.generate_inner(message, dataset)
|
||||
|
||||
def message_to_promptimg(self, message):
|
||||
def chat(self, messages, dataset=None):
|
||||
"""The main function for multi-turn chatting. Will call `chat_inner` with the preprocessed input messages."""
|
||||
assert hasattr(self, 'chat_inner'), 'The API model should has the `chat_inner` method. '
|
||||
for msg in messages:
|
||||
assert isinstance(msg, dict) and 'role' in msg and 'content' in msg, msg
|
||||
assert self.check_content(msg['content']) in ['str', 'dict', 'liststr', 'listdict'], msg
|
||||
msg['content'] = self.preproc_content(msg['content'])
|
||||
|
||||
while len(messages):
|
||||
try:
|
||||
return self.chat_inner(messages, dataset=dataset)
|
||||
except:
|
||||
messages = messages[1:]
|
||||
while len(messages) and messages[0]['role'] != 'user':
|
||||
messages = messages[1:]
|
||||
continue
|
||||
return 'Chat Mode: Failed with all possible conversation turns.'
|
||||
|
||||
def message_to_promptimg(self, message, dataset=None):
|
||||
assert not self.INTERLEAVE
|
||||
model_name = self.__class__.__name__
|
||||
warnings.warn(
|
||||
@@ -146,5 +144,24 @@ class BaseModel:
|
||||
image = None
|
||||
else:
|
||||
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
|
||||
image = [x['value'] for x in message if x['type'] == 'image'][0]
|
||||
images = [x['value'] for x in message if x['type'] == 'image']
|
||||
if 'BLINK' == dataset:
|
||||
image = concat_images_vlmeval(images, target_size=512)
|
||||
else:
|
||||
image = images[0]
|
||||
return prompt, image
|
||||
|
||||
def message_to_promptvideo(self, message):
|
||||
if self.VIDEO_LLM:
|
||||
num_videos = len([x for x in message if x['type'] == 'video'])
|
||||
if num_videos == 0:
|
||||
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
|
||||
video = None
|
||||
else:
|
||||
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
|
||||
video = [x['value'] for x in message if x['type'] == 'video'][0]
|
||||
return prompt, video
|
||||
else:
|
||||
import sys
|
||||
warnings.warn('Model does not support video input.')
|
||||
sys.exit(-1)
|
||||
|
||||
Reference in New Issue
Block a user