Mirror of https://github.com/OpenBMB/MiniCPM-V.git, synced 2026-02-05 18:29:18 +08:00
Modify eval_mm for MiniCPM-o 2.6
eval_mm/vlmevalkit/vlmeval/dataset/utils/naturalbench.py (new file, 145 additions)
@@ -0,0 +1,145 @@
import re


def extract_answer(output_string, task_type="yes_no"):
    """
    Extracts the answer from the output string based on the task type.

    Parameters:
        output_string (str): The output string.
        task_type (str): The type of task. Must be either "yes_no" or "multiple_choice".

    Returns:
        int:
            1 if "yes" or "A",
            0 if "no" or "B",
            -1 if no relevant answer is found.
        Raises a ValueError if an unsupported task_type is provided.
    """

    def find_word_position(string, word):
        pattern = r'\b' + re.escape(word) + r'\b'
        match = re.search(pattern, string, re.IGNORECASE)
        if match:
            return match.start()
        return -1

    if task_type not in ["yes_no", "multiple_choice"]:
        raise ValueError(f"Task type {task_type} not supported. Must be 'yes_no' or 'multiple_choice'.")

    if task_type == "yes_no":
        position_yes_and_a = find_word_position(output_string, "yes")
        position_no_and_b = find_word_position(output_string, "no")
    elif task_type == "multiple_choice":
        position_yes_and_a = find_word_position(output_string, "A")
        position_no_and_b = find_word_position(output_string, "B")

    if position_yes_and_a == -1 and position_no_and_b == -1:
        print(f"No answer found in the output string: {output_string}.")
        return -1
    elif position_yes_and_a != -1 and position_no_and_b != -1:
        return 1 if position_yes_and_a < position_no_and_b else 0
    else:
        return 0 if position_yes_and_a == -1 else 1


def get_scores(scores):
    """
    Calculate various scores based on the given results.

    Args:
        scores (dict or list): A dictionary or list containing results where each result can be:
            - dict: {id: {"q0_i0": 1 or 0, "q0_i1": 1 or 0, "q1_i0": 1 or 0, "q1_i1": 1 or 0}, ...}
            - list: [[q0_i0 (1 or 0), q0_i1 (1 or 0), q1_i0 (1 or 0), q1_i1 (1 or 0)], ...]

            The keys "q0_i0", "q0_i1", "q1_i0", "q1_i1" represent combinations of questions and images:
                - "q0_i0" means question_0 on image_0
                - "q0_i1" means question_0 on image_1
                - "q1_i0" means question_1 on image_0
                - "q1_i1" means question_1 on image_1

    Returns:
        dict: A dictionary containing the calculated scores:
            - 'Q_Acc': Average question score
            - 'I_Acc': Average image score
            - 'Acc': Average binary VQA score
            - 'G_Acc': Average group score
    """
    Q_Acc = 0.0
    I_Acc = 0.0
    Acc = 0.0
    G_Acc = 0.0

    num_samples = len(scores)

    def calculate_image_score(result):
        image_correct = 0
        if isinstance(result, dict):
            if result["q0_i0"] == 1.0 and result["q1_i0"] == 0.0:
                image_correct += 1
            if result["q1_i1"] == 1.0 and result["q0_i1"] == 0.0:
                image_correct += 1
        elif isinstance(result, list):
            if result[0] == 1.0 and result[2] == 0.0:
                image_correct += 1
            if result[3] == 1.0 and result[1] == 0.0:
                image_correct += 1
        return image_correct

    def calculate_question_score(result):
        text_correct = 0
        if isinstance(result, dict):
            if result["q0_i0"] == 1.0 and result["q0_i1"] == 0.0:
                text_correct += 1
            if result["q1_i1"] == 1.0 and result["q1_i0"] == 0.0:
                text_correct += 1
        else:
            if result[0] == 1.0 and result[1] == 0.0:
                text_correct += 1
            if result[3] == 1.0 and result[2] == 0.0:
                text_correct += 1
        return text_correct

    def calculate_binary_score(result):
        binary_score_correct = 0
        if isinstance(result, dict):
            binary_score_correct += 1 if result["q0_i0"] == 1.0 else 0
            binary_score_correct += 1 if result["q0_i1"] == 0.0 else 0
            binary_score_correct += 1 if result["q1_i0"] == 0.0 else 0
            binary_score_correct += 1 if result["q1_i1"] == 1.0 else 0
        else:
            binary_score_correct += 1 if result[0] == 1.0 else 0
            binary_score_correct += 1 if result[1] == 0.0 else 0
            binary_score_correct += 1 if result[2] == 0.0 else 0
            binary_score_correct += 1 if result[3] == 1.0 else 0

        return binary_score_correct

    def calculate_group(result):
        group_correct = 0
        if calculate_question_score(result) == 2 and calculate_image_score(result) == 2:
            group_correct += 1

        return group_correct

    if isinstance(scores, dict):
        for _, result in scores.items():
            Q_Acc += calculate_question_score(result)
            I_Acc += calculate_image_score(result)
            Acc += calculate_binary_score(result)
            G_Acc += calculate_group(result)
    else:
        for result in scores:
            Q_Acc += calculate_question_score(result)
            I_Acc += calculate_image_score(result)
            Acc += calculate_binary_score(result)
            G_Acc += calculate_group(result)

    results = {
        'Q_Acc': Q_Acc / float(num_samples * 2),
        'I_Acc': I_Acc / float(num_samples * 2),
        'Acc': Acc / float(num_samples * 4),
        'G_Acc': G_Acc / num_samples
    }

    return results
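
For reference, a minimal usage sketch of the two helpers above (not part of the committed file). The import path is inferred from the file location shown in this diff, and the sample inputs and expected score values are illustrative assumptions:

from vlmeval.dataset.utils.naturalbench import extract_answer, get_scores

# Map raw model outputs to 1 / 0 / -1.
assert extract_answer("Yes, there is a dog in the image.", task_type="yes_no") == 1
assert extract_answer("No, it is not.", task_type="yes_no") == 0
assert extract_answer("The answer is B.", task_type="multiple_choice") == 0

# Each NaturalBench sample pairs two questions with two images. Each entry below is
# extract_answer's output for one (question, image) pair, ordered
# [q0_i0, q0_i1, q1_i0, q1_i1]; calculate_binary_score implies the expected
# answer pattern is [1, 0, 0, 1].
samples = [
    [1, 0, 0, 1],  # all four answers match the expected pattern
    [1, 1, 0, 1],  # q0_i1 answered 1 instead of 0
]
results = get_scores(samples)
# Q_Acc = (2 + 1) / (2 * 2) = 0.75   -> questions answered correctly on both images
# I_Acc = (2 + 1) / (2 * 2) = 0.75   -> images answered correctly for both questions
# Acc   = (4 + 3) / (2 * 4) = 0.875  -> per-answer accuracy
# G_Acc = 1 / 2 = 0.5                -> samples with all four answers correct
print(results)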