Mirror of https://github.com/OpenBMB/MiniCPM-V.git (synced 2026-02-05 18:29:18 +08:00)
Modify eval_mm for MiniCPM-o 2.6
254 lines added: eval_mm/vlmevalkit/vlmeval/dataset/utils/tempcompass.py (normal file)
@@ -0,0 +1,254 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
from PIL import Image, ImageOps
import numpy as np

sys_prompt = "You are an AI assistant for question answering."

system_prompt_multi_choice = (
    "You will receive a multi-choice question, the ground-truth answer and the prediction from a question answering (QA) model. "  # noqa
    "Your task is to determine whether the QA model prediction is correct, based on the question and ground-truth answer. "
    "If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"."
)

system_prompt_caption_matching = (
    "You will receive a caption matching question, the ground-truth answer and the prediction from a question answering (QA) model. "  # noqa
    "Your task is to determine whether the QA model prediction is correct, based on the question and ground-truth answer. "
    "If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"."
)

system_prompt_captioning = """
You will receive a video description and a multi-choice question. Your task is to choose the correct answer and briefly explain the reason why you choose the answer. \
If none of the choice candidates are correct or the video description lacks enough information to answer the question, just answer "None of the choices are correct". \
Please organize your response in this format:
```
Reasoning: [Your reason to obtain the answer]
Answer: [Your answer]
```

Here are some examples of video description, multi-choice question and the expected answer:
```
Video Description: A person is playing football.
Multi-Choice Question:
What is the person doing in the video?
A. cooking
B. playing football
C. playing basketball
D. reading a book
Reasoning: The video description mentions that the person is playing football.
Answer: B. playing football

Video Description: A bird is flying clockwise.
Multi-Choice Question:
In which direction is the bird flying?
A. backward
B. counter-clockwise
C. clockwise
D. downward
Reasoning: The video description mentions that the bird is flying clockwise.
Answer: C. clockwise

Video Description: An air balloon is inflating.
Multi-Choice Question:
What is happening to the air balloon?
A. exploding
B. getting smaller
C. flying
Reasoning: The video description mentions that the air balloon is inflating, while none of the choices can be explained as inflating.
Answer: None of the choices are correct
```
"""  # noqa

system_prompt_YorN = """
You will receive a Yes/No question, the ground-truth answer and the prediction from a question answering (QA) model. \
Your task is to determine whether the QA model prediction is correct, based on the question and ground-truth answer. \
If the prediction is correct, respond "Correct". If the prediction is incorrect, respond "Incorrect".
"""  # noqa
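
# Illustration added in this write-up (not part of the upstream file): once
# `evaluate_tempcompass_mcq` below fills its judge template with a hypothetical record,
# the judge model receives a prompt shaped roughly like this:
#
#   You will receive a multi-choice question, the ground-truth answer ... respond "Incorrect".
#   Multi-Choice Question:
#   What is the person doing in the video?
#   A. cooking
#   B. playing football
#   Ground-Truth Answer: B. playing football
#   Model Prediction: The person is playing football, so the answer is B.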

def eval_rule_caption_matching(line):
    # Determine whether the video llm output is correct, based on word matching rules
    video_llm_output = line['prediction']
    answer = line['answer']
    option_strs = eval(line['candidates'])  # complete option strings
    option_sents = [opt.split(': ')[1] for opt in option_strs]  # option sentence
    # option index, e.g., Sentence A, Caption A, Option 1
    option_inds = [opt.split(': ')[0] for opt in option_strs] + [opt.split(': ')[0].replace('Sentence ', '').replace('Option ', '').replace('Caption ', '') for opt in option_strs]  # noqa
    video_llm_pred = None
    for option_str in option_strs:
        if option_str == video_llm_output:
            video_llm_pred = option_str
    for option_sent in option_sents:
        if option_sent == video_llm_output or (') ' in video_llm_output and option_sent == video_llm_output.split(') ')[1]):  # noqa
            video_llm_pred = option_sent
    for option_ind in option_inds:
        if option_ind == video_llm_output or option_ind == video_llm_output.replace('.', ''):  # noqa
            video_llm_pred = option_ind

    if video_llm_pred is None:
        return "fail"
    else:
        return 1 if video_llm_pred == answer or video_llm_pred == answer.split(":")[0] or video_llm_pred == answer.split(": ")[1] or video_llm_pred == answer.split(": ")[0].split()[1] else 0  # noqa
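
# Illustrative only (hypothetical record, not upstream code): how the rule matcher
# handles a bare option letter for a caption-matching item.
#   line = {
#       'prediction': 'B',
#       'answer': 'B: A bird flying clockwise.',
#       'candidates': "['A: A bird flying counter-clockwise.', 'B: A bird flying clockwise.']",
#   }
#   eval_rule_caption_matching(line)  # -> 1 ('B' matches an option index and the answer prefix)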

def eval_rule_multi_choice(line):
    if line['prediction'] == line['answer']:
        return 1
    elif line['prediction'] in ['A', 'B', 'C', 'D']:
        return 1 if line['prediction'] == line['answer'][0] else 0
    elif any(line['prediction'].startswith(prefix) for prefix in ['A.', 'B.', 'C.', 'D.']):
        return 1 if line['prediction'].split('.')[0] == line['answer'][0] else 0
    elif any(line['prediction'].startswith(prefix) for prefix in ['A)', 'B)', 'C)', 'D)']):
        return 1 if line['prediction'].split(')')[0] == line['answer'][0] else 0
    else:
        return "fail"
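
# Illustrative only (hypothetical records, not upstream code): the matcher accepts bare
# letters or 'A.'/'A)' prefixes and compares them to the first character of the answer.
#   eval_rule_multi_choice({'prediction': 'B. playing football', 'answer': 'B. playing football'})  # -> 1
#   eval_rule_multi_choice({'prediction': 'C', 'answer': 'B. playing football'})                    # -> 0
#   eval_rule_multi_choice({'prediction': 'I think the person cooks.', 'answer': 'B. playing football'})  # -> "fail"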

def eval_rule_YorN(video_llm_output):
    # Extract the yes/no prediction from the original video llm output
    video_llm_output = video_llm_output.lower()
    if video_llm_output.startswith("yes"):
        return "yes"
    elif video_llm_output.startswith("no"):
        return "no"
    else:
        return False
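
# Illustrative only (hypothetical outputs, not upstream code): the extractor keys off the prefix.
#   eval_rule_YorN("Yes, the balloon is inflating.")  # -> "yes"
#   eval_rule_YorN("The balloon is inflating.")       # -> False (no rule match, judge model needed)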

def llm_output_to_rating(llm_output):
    if not ('Correct' in llm_output or 'Incorrect' in llm_output):
        print(f"Warning: LLM output is not in the correct format: {llm_output}")
        rating = 0
        return rating
    if llm_output.startswith('Correct'):
        rating = 1
    elif llm_output.startswith('Incorrect'):
        rating = 0
    elif ('Correct' in llm_output) and ('Incorrect' not in llm_output):
        rating = 1
    elif 'Incorrect' in llm_output:
        rating = 0
    return rating
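
# Illustrative only (hypothetical judge replies, not upstream code):
#   llm_output_to_rating("Correct. The prediction matches the answer.")  # -> 1
#   llm_output_to_rating("Incorrect, the prediction picks option A.")    # -> 0
#   llm_output_to_rating("I cannot judge this.")                         # -> 0, with a format warning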

def parse_llm_output(llm_output, gt_answer):
    if llm_output == "invalid_request_error" or not llm_output:
        eval_result = {"rating": -1, "chatgpt-answer": None, "chatgpt-reasoning": None}
        return eval_result

    eval_result = {}
    lines = llm_output.split("\n")

    for line in lines:
        line = line.strip()
        if "Reasoning" in line:
            eval_result['chatgpt-reasoning'] = line.replace("Reasoning:", "").strip()
        if "Answer" in line:
            eval_result['chatgpt-answer'] = line.replace("Answer:", "").strip()

    if "chatgpt-answer" not in eval_result:
        eval_result['chatgpt-answer'] = llm_output
    if "chatgpt-reasoning" not in eval_result:
        eval_result['chatgpt-reasoning'] = None

    # Check if the chatgpt answer is the ground-truth answer
    # calculate the number of 'A.', 'B.', 'C.', 'D.' in chatgpt-answer
    answer_counts = sum(eval_result['chatgpt-answer'].count(prefix) for prefix in ['A.', 'B.', 'C.', 'D.'])  # noqa
    if eval_result['chatgpt-answer'].split(". ")[0] == gt_answer.split(". ")[0] and answer_counts == 1:
        eval_result['rating'] = 1
    else:
        eval_result['rating'] = 0
    return eval_result
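
# Illustrative only (hypothetical judge reply, not upstream code): the parser expects the
# "Reasoning:/Answer:" layout requested by `system_prompt_captioning` above.
#   reply = "Reasoning: The description says the person plays football.\nAnswer: B. playing football"
#   parse_llm_output(reply, gt_answer="B. playing football")
#   # -> {'chatgpt-reasoning': 'The description says the person plays football.',
#   #     'chatgpt-answer': 'B. playing football', 'rating': 1}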

def evaluate_tempcompass_mcq(model, line):
    eval_rules_dict = {
        'caption_matching': eval_rule_caption_matching,
        'multi-choice': eval_rule_multi_choice
    }
    gpt_eval_prompt = {
        'multi-choice': '{}\nMulti-Choice Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}',
        'caption_matching': '{}\nCaption Matching Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}'
    }
    base_prompt = {
        'multi-choice': system_prompt_multi_choice,
        'caption_matching': system_prompt_caption_matching
    }
    eval_result = {
        "question": line['question'],
        "answer": line['answer'],
        "prediction": line['prediction'],
        "task_type": line['task_type'],
        "candidates": line['candidates'],
        "match_success": True
    }
    result = eval_rules_dict[line['task_type']](line)
    if result == "fail":
        eval_result['match_success'] = False
        if model is None:
            eval_result['rating'] = 0
        else:
            prompt_template = gpt_eval_prompt[line['task_type']]
            prompt = prompt_template.format(base_prompt[line['task_type']], line['question'], line['answer'], line['prediction'])  # noqa
            llm_output = model.generate(prompt)
            result = llm_output_to_rating(llm_output)
            eval_result['chatgpt-response'] = llm_output
            eval_result['rating'] = result
    else:
        eval_result['rating'] = result

    return eval_result
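
# Usage sketch (hypothetical objects, not upstream code): `model` is any judge wrapper
# exposing `generate(prompt) -> str` (None disables the LLM fallback), and `line` is one
# dataset row with 'question', 'answer', 'prediction', 'task_type' and 'candidates'.
#   res = evaluate_tempcompass_mcq(judge_model, line)
#   res['rating']         # 1 or 0
#   res['match_success']  # False when the rule matcher failed and the judge was consulted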

def evaluate_tempcompass_captioning(model, line):
    prompt = (
        f"{system_prompt_captioning}\n"
        f"Video Description:{line['prediction']}\n"
        f"Multi-Choice Question:\n{line['mc_question']}\n"
    )
    if model is not None:
        llm_output = model.generate(prompt)
        eval_result = parse_llm_output(llm_output, gt_answer=line['mc_answer'])
        return eval_result
    else:
        raise ValueError("Model is None; the TempCompass captioning task does not support exact matching")  # noqa
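
# Note added for clarity (not upstream code): the captioning track is scored by asking the
# judge to answer `line['mc_question']` from the generated description alone, so a judge
# model is mandatory here; passing model=None raises the ValueError above.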

def evaluate_tempcompass_YorN(model, line):
    prompt = (
        f"{system_prompt_YorN}\n"
        f"Yes/No Question:\n{line['question']}\n"
        f"Ground-Truth Answer: {line['answer']}\n"
        f"Model Prediction: {line['prediction']}"
    )
    result = eval_rule_YorN(line['prediction'])
    eval_result = {
        "question": line['question'],
        "answer": line['answer'],
        "prediction": line['prediction'],
        "match_success": True
    }
    if result:
        eval_result['rating'] = 1 if result == line['answer'] else 0
    elif model is None:
        eval_result['match_success'] = False
        eval_result['rating'] = 0
    else:
        eval_result['match_success'] = False
        llm_output = model.generate(prompt)
        result = llm_output_to_rating(llm_output)
        eval_result['chatgpt-response'] = llm_output
        eval_result['rating'] = result
    return eval_result
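
# Illustrative only (hypothetical record, not upstream code): the rule path is tried first,
# and the judge prompt assembled above is only sent when no yes/no prefix is found.
#   line = {'question': 'Is the bird flying clockwise?', 'answer': 'yes',
#           'prediction': 'Yes, it circles clockwise.'}
#   evaluate_tempcompass_YorN(None, line)['rating']  # -> 1 (rule match, no judge call)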

def get_dimension_rating(score_file):
    data = load(score_file)
    result_dict = {}
    for idx, item in data.iterrows():
        dict_key = item['dim'] + '. ' + item['task_type']
        if dict_key not in result_dict:
            result_dict[dict_key] = [0, 0]
        result_dict[dict_key][0] += int(item['score'])
        result_dict[dict_key][1] += 1
    return result_dict
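
# Usage sketch (hypothetical file name, not upstream code): each value is [summed score, count],
# so per-dimension accuracy is a simple division.
#   ratings = get_dimension_rating('TempCompass_MCQ_score.xlsx')
#   acc = {k: v[0] / v[1] for k, v in ratings.items() if v[1] > 0}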