MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py
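
"""MIA-Bench evaluation for VLMEvalKit.

Builds a grading prompt for every prediction, sends it together with the image
to a judge model (GPT-4o by default), parses the per-component scores from the
judge's reply, and reports the average score per component type.
"""
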
import json
import os
import pandas as pd
from .image_base import ImageBaseDataset
from ..smp import *
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich


def generate_prompt(d):
    question = d['question']
    weights = eval(d['component_weight'])
    components = eval(d['components'])
    num_of_component = int(d['num_of_component'])
    response = d['prediction']

    if num_of_component == 1:
        components = f"The first component is: '{components[0]}'. "
        score = f"The first component is worth: {weights[0]} scores. "
    elif num_of_component == 2:
        components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
        score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
    elif num_of_component == 3:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}'. "
        )
        score = (
            "The first, second, and third component is each worth "
            f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
        )
    elif num_of_component == 4:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
        )
        score = (
            "The first, second, third, and fourth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
        )
    elif num_of_component == 5:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
            f"and the fifth component is '{components[4]}'. "
        )
        score = (
            "The first, second, third, fourth, and fifth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
        )

    return (
        "Here is an instruction for a multimodal LLM: '"
        f"{question}"
        "'. You need to grade if the response from the model follows each component of the instruction. "
        f"{components}"
        "The response is: '"
        f"{response}"
        "'. You need to score the response and be strict. The total score ranges from 0 to 10, "
        "depending on if the response follows the instruction. "
        f"{score}"
        "List scores of each component, and the total score in one sentence in this format: "
        "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )


def process_rawscore(component_type, raw_score):
    """Parse the judge's reply into per-component score ratios plus the total score."""
    # The required format packs all scores into the first sentence, e.g.
    # "score of component 1: x/2, ..., total score: z/10."
    first_sentence = raw_score.split('.')[0].split(',')
    score_dict = {}
    for i in range(len(first_sentence) - 1):
        score_ = first_sentence[i].split(':')[1][1:].split('/')
        score = int(score_[0]) / int(score_[1])
        score_dict[component_type[i]] = score
    # The last comma-separated chunk holds the total score.
    total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/')
    total_score = int(total_score_[0]) / int(total_score_[1])
    score_dict['total_score'] = total_score
    return score_dict
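

# Illustration (hypothetical component names and judge reply), following the format
# requested by generate_prompt above:
#   process_rawscore(
#       ['visual perception', 'OCR'],
#       'score of component 1: 2/2, score of component 2: 6/8, total score: 8/10. The response ...'
#   )
#   -> {'visual perception': 1.0, 'OCR': 0.75, 'total_score': 0.8}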


def get_score_dict(data, score_raw):
    """Aggregate the raw judge replies into average scores per component type."""
    cat_score_dict = {}
    for i in range(len(data)):
        try:
            # 'component_type' is a stringified list such as "['a', 'b']";
            # strip the surrounding "['" / "']" and split on "', '".
            cmp = data['component_type'][i][2:-2]
            cmp_list = cmp.split('\', \'')
            score_dict = process_rawscore(cmp_list, score_raw[i])
            for key, val in score_dict.items():
                if key not in cat_score_dict.keys():
                    cat_score_dict[key] = [val]
                else:
                    cat_score_dict[key].append(val)
        except:
            # Skip samples whose judge reply does not match the expected format.
            pass
    cat_score_dict_average = {}
    for key, val in cat_score_dict.items():
        cat_score_dict_average[key] = sum(val) / len(val)
    return cat_score_dict_average


class MIABench(ImageBaseDataset):
    TYPE = 'VQA'

    DATASET_URL = {
        'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
    }
    DATASET_MD5 = {
        'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
    }

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Grade the predictions in ``eval_file`` with the judge model and return per-component averages."""
        judge_name = judge_kwargs.pop('model', 'gpt-4o')
        model = build_judge(model=judge_name, **judge_kwargs)

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 4)  # noqa: F841

        if not osp.exists(storage):
            data = load(eval_file)
            num_samples = len(data)
            lines = [data.loc[i] for i in range(num_samples)]
            prompts = [generate_prompt(line) for line in lines]
            # Recover the base64-encoded images from the original TSV by index.
            org_data = MIABench('MIA-Bench').data
            img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
            image_b64 = [img_map[idx] for idx in data['index']]
            indices = list(data['index'])
            mm_messages = [
                dict(message=[
                    dict(type='text', value=prompt),
                    dict(type='image', value=f'data:image/jpeg;base64,{b64}')
                ])
                for prompt, b64 in zip(prompts, image_b64)
            ]

            # Resume from the temporary pickle if a previous run was interrupted.
            res = {}
            if osp.exists(tmp_file):
                res = load(tmp_file)
            jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
            job_keys = list(jobs.keys())
            job_vals = [jobs[k] for k in job_keys]

            resps = track_progress_rich(
                model.generate,
                job_vals,
                nproc=nproc,
                chunksize=nproc,
                keys=job_keys,
                save=tmp_file,
            )
            for k, resp in zip(job_keys, resps):
                res[k] = resp
            data['score_raw'] = [res[idx] for idx in indices]
            dump(data, storage)

        goresult = load(storage)
        results = get_score_dict(goresult, goresult['score_raw'])
        result_pth = storage.replace('.xlsx', '_score.csv')
        results_pd = pd.DataFrame.from_dict(list(results.items()))
        dump(results_pd, result_pth)
        return results
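
For reference, a minimal usage sketch, assuming that `MIABench` is importable from `vlmeval.dataset` (as other VLMEvalKit dataset classes are), that judge API credentials are configured, and that a prediction file with the columns read above (`index`, `question`, `components`, `component_weight`, `num_of_component`, `component_type`, `prediction`) has already been produced by an inference run; the file name is illustrative.

from vlmeval.dataset import MIABench  # assumed export path

scores = MIABench('MIA-Bench').evaluate(
    'MiniCPM-o_MIA-Bench.xlsx',  # hypothetical prediction file
    model='gpt-4o',              # judge model passed to build_judge
    nproc=4,                     # parallel judge requests
)
print(scores)  # {'<component type>': <average>, ..., 'total_score': <average>}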