from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
from ..utils import track_progress_rich


class ImageMTDataset(ImageBaseDataset):
    """Base class for multi-turn (MT) image dialogue datasets."""

    TYPE = 'MT'

    def build_prompt(self, line):
        """Build a multi-turn dialogue from a data record.

        Each question may contain one or more '<ImageHere>' placeholders,
        which are replaced (in order) by the corresponding images of the
        record. Returns a list of interleaved user / assistant messages.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        questions = toliststr(line['question'])
        if 'answer' in line:
            answers = toliststr(line['answer'])
        else:
            answers = [''] * len(questions)
        assert len(questions) == len(answers)

        dlgs, pics_number = [], 0
        for i in range(len(questions)):
            q, a = questions[i], answers[i]
            if '<ImageHere>' in q:
                # Interleave the text segments with the images they reference.
                content = []
                tag_number = q.count('<ImageHere>')
                images = tgt_path[pics_number: pics_number + tag_number]
                pics_number += tag_number
                q_split = q.split('<ImageHere>')
                for j in range(tag_number):
                    qsp, im = q_split[j], images[j]
                    if qsp != '':
                        content.append(dict(type='text', value=qsp))
                    content.append(dict(type='image', value=im))
                if q_split[-1] != '':
                    content.append(dict(type='text', value=q_split[-1]))
            else:
                content = [dict(type='text', value=q)]
            dlgs.append(dict(role='user', content=content))
            assert '<ImageHere>' not in a, 'We currently do not support images in the answer.'
            content = [dict(type='text', value=a)]
            dlgs.append(dict(role='assistant', content=content))
        return dlgs
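

# Example (illustrative, not taken from the dataset): for a record whose first
# question is
#   'Compare <ImageHere> and <ImageHere>. Which photo is brighter?'
# with images ['img_0.jpg', 'img_1.jpg'], build_prompt yields roughly:
#   [
#       {'role': 'user', 'content': [
#           {'type': 'text', 'value': 'Compare '},
#           {'type': 'image', 'value': 'img_0.jpg'},
#           {'type': 'text', 'value': ' and '},
#           {'type': 'image', 'value': 'img_1.jpg'},
#           {'type': 'text', 'value': '. Which photo is brighter?'},
#       ]},
#       {'role': 'assistant', 'content': [{'type': 'text', 'value': '<reference answer or empty string>'}]},
#   ]
# followed by one user/assistant pair per remaining turn.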


class MMDUDataset(ImageMTDataset):
    """Multi-turn MMDU benchmark, scored by an LLM judge along seven dimensions."""

    DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
    DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
    DIMS = [
        'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
        'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
    ]

    def calculate_metric(self, ans):
        """Aggregate per-turn judge scores (0-10) into per-dimension averages on a 0-100 scale."""
        score_sum = defaultdict(lambda: 0)   # sum of parsed scores per dimension
        tot = defaultdict(lambda: 0)         # number of judged turns per dimension
        valid = defaultdict(lambda: 0)       # number of turns with a parseable score
        for key in ans:
            res = ans[key]['res']
            assert isinstance(res, pd.DataFrame)
            lt = len(res)
            for i in range(lt):
                line = res.iloc[i]
                for k in self.DIMS:
                    tot[k] += 1
                    if k in line and line[k] is not None:
                        try:
                            score = int(line[k])
                            score = np.clip(score, 0, 10)
                            score_sum[k] += score
                            valid[k] += 1
                        except Exception as e:
                            print(f'Failed to parse the score: {str(e)}')
        # 'all' averages over every judged turn (unparseable scores count as 0);
        # 'valid' averages only over turns with a parseable score.
        sp1 = {'set': 'all'}
        sp1.update({k: score_sum[k] / tot[k] * 10 for k in self.DIMS})
        sp2 = {'set': 'valid'}
        sp2.update({k: score_sum[k] / valid[k] * 10 for k in self.DIMS})

        return pd.DataFrame([sp1, sp2])
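
    # Worked example (hypothetical numbers): if 'Creativity' was judged on 5
    # turns with parsed scores [8, 6, 7] and 2 unparseable entries, then
    #   score_sum = 21, tot = 5, valid = 3
    #   'all'   row: 21 / 5 * 10 = 42.0
    #   'valid' row: 21 / 3 * 10 = 70.0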

    def evaluate(self, eval_file, **judge_kwargs):
        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.pop('model', 'gpt-4o')

        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)

        data = load(eval_file)
        judge_model = build_judge(model=model, **judge_kwargs)

        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        tups = [(judge_model, line) for line in lines]
        indices = [line['index'] for line in lines]

        # Resume from a previous (partial) judging run if a cache file exists.
        ans = {}
        if osp.exists(tmp_file):
            ans = load(tmp_file)

        tups = [x for x, i in zip(tups, indices) if i not in ans]
        indices = [i for i in indices if i not in ans]

        from .utils.mmdu import mmdu_score

        if len(indices):
            track_progress_rich(
                mmdu_score,
                tups,
                nproc=nproc,
                chunksize=nproc,
                keys=indices,
                save=tmp_file,
            )
            ans = load(tmp_file)
            # Every pending index should now have a judged result in the cache.
            for k in indices:
                assert k in ans

        metric = self.calculate_metric(ans)
        dump(metric, score_file)
        return metric
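

# Minimal usage sketch (assumptions: a VLMEvalKit-style environment with judge
# API credentials configured, and an existing prediction file such as
# 'MiniCPM-V_MMDU.xlsx' produced by the inference stage; both names are
# illustrative, not taken from this file).
if __name__ == '__main__':
    dataset = MMDUDataset('MMDU')
    # Interleaved multi-turn dialogue for the first record.
    print(dataset.build_prompt(0))
    # Judge the saved predictions and print the per-dimension score table.
    print(dataset.evaluate('MiniCPM-V_MMDU.xlsx', model='gpt-4o', nproc=4))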