MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/vl_rewardbench.py
from ast import literal_eval
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
from ..utils import track_progress_rich
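
# Evaluation flow: build_prompt() shows the evaluated model the image, the question
# and two candidate answers (order possibly swapped); the model's free-form judgement
# is stored in the 'prediction' column; a parser LLM then extracts "Answer 1" or
# "Answer 2" via LLM_PARSE_ANSWER_PROMPT, and get_score() compares that choice with
# the human preference recorded in 'human_ranking'.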

LLM_PARSE_ANSWER_PROMPT = '''
You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
Return the Answer X ONLY. e.g., Answer 1 or Answer 2.
Judgement: {judgement}
'''

PROMPT_TEMPLATE = '''\
You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions.
Please analyze the following image and question, then determine which of the two provided answers is better.
Question: {query}
Answer 1: {answer_0}
Answer 2: {answer_1}
Please evaluate both answers based on the following criteria:
1. Accuracy: How well does the answer align with the visual information in the image?
2. Completeness: Does the answer fully address all aspects of the question?
3. Clarity: Is the answer easy to understand and well-articulated?
4. Relevance: Does the answer directly relate to the question and the image?
After your evaluation, please:
1. Explain your reasoning for each criterion.
2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2).\
For example: Overall Judgment: Answer X is better.
Your response should be structured and detailed, \
demonstrating your understanding of both the visual and textual elements of the task.'''
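

# Ground-truth mapping (worked example, assuming a 0 in 'human_ranking' marks the
# human-preferred response): with human_ranking == [0, 1] and random_number == 0 the
# answers keep their original order, so gt_ans = index(0) + 1 = 1 ("Answer 1");
# with random_number == 1, build_prompt() reversed the answers, so the preferred
# response is shown as "Answer 2" and gt_ans = index(1) + 1 = 2.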
def get_score(line, parsed_response, random_number):
    gt_ans = line['human_ranking'].index(0 if random_number == 0 else 1) + 1

    if 'Answer 1'.lower() in parsed_response.lower():
        pred = 1
    elif 'Answer 2'.lower() in parsed_response.lower():
        pred = 2
    else:  # failed
        pred = 'None'  # random.choice([1, 2])

    if pred == gt_ans:
        return 1.0
    else:
        return 0.0


def VLRewardBench_eval_answer(model, line):
    response = toliststr(line['response'])
    random_number = sum(len(res) for res in response) % 2

    prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line['prediction'])
    messages = [dict(type='text', value=prompt)]

    resp = model.generate(messages)
    score = get_score(line, resp, random_number)
    if score is None:
        return 'Unknown'
    return score


class VLRewardBench(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'VL-RewardBench': 'https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv'
    }
    DATASET_MD5 = {'VL-RewardBench': '1d2676f4ab4a5f755019ec0af2b28189'}

    # Given one data record, return the built prompt (a multi-modal message); can be overridden
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        tgt_path = self.dump_image(line)  # save the image locally
        question = line['question']

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]

        response = toliststr(line['response'])
        # Deterministic pseudo-random swap: reverse the answer order when the total
        # response length is odd (helps avoid position bias of the evaluated model)
        random_number = sum(len(res) for res in response) % 2
        if random_number == 1:
            response = response[::-1]

        query_prompt = PROMPT_TEMPLATE.format(
            query=question, answer_0=response[0], answer_1=response[1]
        )
        msgs = msgs + [dict(type='text', value=query_prompt)]
        return msgs
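
    # The returned interleaved message looks like (sketch; path is illustrative):
    #   [dict(type='image', value='/path/to/local_image.jpg'),
    #    dict(type='text', value=<PROMPT_TEMPLATE filled with the question and both answers>)]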

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        suffix = eval_file.split('.')[-1]
        model = judge_kwargs['model']
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            raw_data = VLRewardBench('VL-RewardBench').data
            data = load(eval_file)
            data['prediction'] = [str(x) for x in data['prediction']]
            data['human_ranking'] = [literal_eval(x) for x in raw_data['answer']]

            judge_kwargs['temperature'] = 0
            judge_kwargs['timeout'] = 60
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), (
                'VLRewardBench evaluation requires a working OPENAI API\n'
                + DEBUG_MESSAGE
            )

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    VLRewardBench_eval_answer,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                for k, v in zip(indices, new_results):
                    ans[k] = v

            data['score'] = [ans[idx] for idx in data['index']]
            # data.pop('image')
            dump(data, storage)
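
        # Judge outputs are cached incrementally in `tmp_file` (a pickle keyed by
        # 'index'), so an interrupted evaluation resumes without re-querying the
        # judge; the merged per-sample scores are persisted to `storage` above.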

        data = load(storage)
        lt = len(data)

        category_scores = defaultdict(lambda: 0)
        category_cnt = defaultdict(lambda: 0)
        scores = defaultdict(lambda: 0)
        for i in range(lt):
            item = data.iloc[i]
            category_scores[item['category']] += item['score']
            category_cnt[item['category']] += 1

        # calculate the average score for each category
        for k, v in category_scores.items():
            scores[k] = v / category_cnt[k]
        # calculate category macro accuracy (average across categories)
        scores['Macro Accuracy'] = sum(scores.values()) / len(scores)
        # calculate the total average score
        scores['Overall Consistency'] = sum(category_scores.values()) / lt

        scores = {k: [v] for k, v in scores.items()}
        scores = pd.DataFrame(scores)
        dump(scores, score_file)
        return scores
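

# Illustrative usage (a sketch: the prediction-file name and judge model below are
# placeholders; in practice the VLMEvalKit runner drives inference and evaluation):
#
#     # the prediction file produced by the inference step carries the original
#     # dataset fields plus the evaluated model's judgements in 'prediction'
#     scores = VLRewardBench.evaluate('Model_VL-RewardBench.xlsx', model='gpt-4o-mini')
#     print(scores)  # one-row DataFrame: per-category accuracy, Macro Accuracy, Overall Consistency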