mirror of https://github.com/OpenBMB/MiniCPM-V.git (synced 2026-02-05 18:29:18 +08:00)
Modify eval_mm for MiniCPM-o 2.6
167 eval_mm/vlmevalkit/vlmeval/dataset/miabench.py Normal file
@@ -0,0 +1,167 @@
import json
import os

import pandas as pd

from .image_base import ImageBaseDataset
from ..smp import *
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich


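# Build the per-sample grading prompt for the judge model. Each MIA-Bench
# instruction decomposes into 1-5 weighted components; the prompt enumerates
# them with their weights and pins the judge's reply to a fixed, parseable format.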
def generate_prompt(d):
    question = d['question']
    # component_weight / components are stored as Python-literal strings in the
    # TSV, so they are eval'ed here; this assumes trusted dataset content.
    weights = eval(d['component_weight'])
    components = eval(d['components'])
    num_of_component = int(d['num_of_component'])
    response = d['prediction']

    if num_of_component == 1:
        components = f"The first component is: '{components[0]}'. "
        score = f"The first component is worth: {weights[0]} scores. "
    elif num_of_component == 2:
        components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
        score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
    elif num_of_component == 3:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}'. "
        )
        score = (
            "The first, second, and third component is each worth "
            f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
        )
    elif num_of_component == 4:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
        )
        score = (
            "The first, second, third, and fourth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
        )
    elif num_of_component == 5:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
            f"and the fifth component is '{components[4]}'. "
        )
        score = (
            "The first, second, third, fourth, and fifth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
        )
    else:
        raise ValueError(f'Unsupported num_of_component: {num_of_component}')

    return (
        "Here is an instruction for a multimodal LLM: '"
        f"{question}"
        "'. You need to grade if the response from the model follows each component of the instruction. "
        f"{components}"
        "The response is: '"
        f"{response}"
        "'. You need to score the response and be strict. The total score ranges from 0 to 10, "
        "depending on if the response follows the instruction. "
        f"{score}"
        "List scores of each component, and the total score in one sentence in this format: "
        "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )
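
# Illustrative example (not part of the original commit): for a hypothetical
# 2-component row such as
#   d = {'question': 'Describe the image in one French sentence.',
#        'component_weight': '[2, 8]',
#        'components': "['mention the main object', 'answer in French']",
#        'num_of_component': 2,
#        'prediction': 'Le chat dort sur le canape.'}
# generate_prompt(d) yields a grading instruction that embeds both components,
# the response, and the weights, and fixes the reply to the format
# "score of component 1: x/2, score of component 2: y/8, total score: z/10"
# that process_rawscore() below depends on.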


def process_rawscore(component_type, raw_score):
    # The judge's first sentence follows the fixed format requested above, e.g.
    # "score of component 1: 2/2, score of component 2: 6/8, total score: 8/10".
    first_sentence = raw_score.split('.')[0].split(',')
    score_dict = {}
    # All clauses but the last are per-component scores.
    for i in range(len(first_sentence) - 1):
        score_ = first_sentence[i].split(':')[1][1:].split('/')
        score = int(score_[0]) / int(score_[1])
        score_dict[component_type[i]] = score
    # The last clause always carries the total score; indexing it directly also
    # covers single-clause replies, where the loop above never runs.
    total_score_ = first_sentence[-1].split(':')[1][1:].split('/')
    total_score = int(total_score_[0]) / int(total_score_[1])
    score_dict['total_score'] = total_score
    return score_dict
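
# Worked example (illustrative, not from the commit): with
#   component_type = ['mention the main object', 'answer in French']
#   raw_score = 'score of component 1: 2/2, score of component 2: 6/8, total score: 8/10. The response ...'
# process_rawscore returns
#   {'mention the main object': 1.0, 'answer in French': 0.75, 'total_score': 0.8}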


def get_score_dict(data, score_raw):
    cat_score_dict = {}
    for i in range(len(data)):
        try:
            # component_type is stored as the string form of a list, e.g.
            # "['genre', 'description']"; strip the surrounding "['" / "']"
            # and split on the inner "', '" to recover the list.
            cmp = data['component_type'][i][2:-2]
            cmp_list = cmp.split('\', \'')
            score_dict = process_rawscore(cmp_list, score_raw[i])
            for key, val in score_dict.items():
                if key not in cat_score_dict:
                    cat_score_dict[key] = [val]
                else:
                    cat_score_dict[key].append(val)
        except Exception:
            # Skip rows whose judge reply does not match the expected format.
            pass
    cat_score_dict_average = {}
    for key, val in cat_score_dict.items():
        cat_score_dict_average[key] = sum(val) / len(val)
    return cat_score_dict_average
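
# Sketch of the output (hypothetical numbers): after all rows are parsed,
# get_score_dict returns the mean score per component category plus the
# overall mean, e.g.
#   {'genre': 0.91, 'description': 0.78, 'total_score': 0.84}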


class MIABench(ImageBaseDataset):
    TYPE = 'VQA'

    DATASET_URL = {
        'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
    }
    DATASET_MD5 = {
        'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
    }

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        judge_name = judge_kwargs.pop('model', 'gpt-4o')
        nproc = judge_kwargs.pop('nproc', 4)

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')

        if not osp.exists(storage):
            data = load(eval_file)
            num_samples = len(data)
            lines = [data.loc[i] for i in range(num_samples)]
            prompts = [generate_prompt(line) for line in lines]
            # The prediction file carries no images, so fetch them from the
            # original TSV and pair each prompt with its base64-encoded image.
            org_data = MIABench('MIA-Bench').data
            img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
            image_b64 = [img_map[idx] for idx in data['index']]
            indices = list(data['index'])
            mm_messages = [
                dict(message=[
                    dict(type='text', value=prompt),
                    dict(type='image', value=f'data:image/jpeg;base64,{b64}')
                ])
                for prompt, b64 in zip(prompts, image_b64)
            ]

            # Resume from the .pkl cache and only submit unfinished indices.
            res = {}
            if osp.exists(tmp_file):
                res = load(tmp_file)

            jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
            job_keys = list(jobs.keys())
            job_vals = [jobs[k] for k in job_keys]

            resps = track_progress_rich(
                model.generate,
                job_vals,
                nproc=nproc,
                chunksize=nproc,
                keys=job_keys,
                save=tmp_file,
            )
            for k, resp in zip(job_keys, resps):
                res[k] = resp
            data['score_raw'] = [res[idx] for idx in indices]
            dump(data, storage)

        goresult = load(storage)
        results = get_score_dict(goresult, goresult['score_raw'])
        result_pth = storage.replace('.xlsx', '_score.csv')
        results_pd = pd.DataFrame.from_dict(list(results.items()))
        dump(results_pd, result_pth)

        return results
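
# Usage sketch (not part of the commit; the file name is hypothetical, and
# load/dump from smp handle the xlsx/pkl/csv I/O above):
#   results = MIABench('MIA-Bench').evaluate(
#       'MiniCPM-o-2_6_MIA-Bench.xlsx', model='gpt-4o', nproc=4)
#   # writes <eval_file>_gpt-4o.xlsx with raw judge replies and
#   # <eval_file>_gpt-4o_score.csv with per-category averages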