Modify eval_mm for MiniCPM-o 2.6

Poppy Xu
2025-01-21 15:34:54 +08:00
parent ec68cefc17
commit d8f382e157
82 changed files with 14279 additions and 843 deletions

View File

@@ -5,5 +5,5 @@ from .vqa_eval import levenshtein_distance
__all__ = [
'build_judge', 'extract_answer_from_item', 'prefetch_answer',
'levenshtein_distance', 'DEBUG_MESSAGE'
'levenshtein_distance', 'DEBUG_MESSAGE',
]

View File

@@ -0,0 +1,59 @@
# CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy
## Introduction
Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information.
## Running Scripts
Once the environment is ready, execute the following script from the root directory of VLMEvalKit
to perform inference and evaluation tasks in batch.
```shell
MODEL_NAME="QwenVLMax"
OUTPUT_DIR="/your/path/to/output_dir"
SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_scene_ocr
python run.py --data CCOCR_MultiSceneOcr_Cord CCOCR_MultiSceneOcr_Funsd CCOCR_MultiSceneOcr_Iam CCOCR_MultiSceneOcr_ZhDoc CCOCR_MultiSceneOcr_ZhHandwriting CCOCR_MultiSceneOcr_Hieragent CCOCR_MultiSceneOcr_Ic15 CCOCR_MultiSceneOcr_Inversetext CCOCR_MultiSceneOcr_Totaltext CCOCR_MultiSceneOcr_ZhScene CCOCR_MultiSceneOcr_UgcLaion CCOCR_MultiSceneOcr_ZhDense CCOCR_MultiSceneOcr_ZhVertical --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_lan_ocr
python run.py --data CCOCR_MultiLanOcr_Arabic CCOCR_MultiLanOcr_French CCOCR_MultiLanOcr_German CCOCR_MultiLanOcr_Italian CCOCR_MultiLanOcr_Japanese CCOCR_MultiLanOcr_Korean CCOCR_MultiLanOcr_Portuguese CCOCR_MultiLanOcr_Russian CCOCR_MultiLanOcr_Spanish CCOCR_MultiLanOcr_Vietnamese --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
SUB_OUTPUT_DIR=${OUTPUT_DIR}/doc_parsing
python run.py --data CCOCR_DocParsing_DocPhotoChn CCOCR_DocParsing_DocPhotoEng CCOCR_DocParsing_DocScanChn CCOCR_DocParsing_DocScanEng CCOCR_DocParsing_TablePhotoChn CCOCR_DocParsing_TablePhotoEng CCOCR_DocParsing_TableScanChn CCOCR_DocParsing_TableScanEng CCOCR_DocParsing_MolecularHandwriting CCOCR_DocParsing_FormulaHandwriting --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
SUB_OUTPUT_DIR=${OUTPUT_DIR}/kie
python run.py --data CCOCR_Kie_Sroie2019Word CCOCR_Kie_Cord CCOCR_Kie_EphoieScut CCOCR_Kie_Poie CCOCR_Kie_ColdSibr CCOCR_Kie_ColdCell --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
```
## Example Output
The evaluation results will be saved in `${SUB_OUTPUT_DIR}/summary.md`. For example, for the KIE subset,
the output is as follows:
| exp_name(f1_score) | COLD_CELL | COLD_SIBR | CORD | EPHOIE_SCUT | POIE | sroie2019_word | summary |
|:-------------------|------------:|------------:|-------:|--------------:|-------:|-----------------:|----------:|
| QwenVLMax | 81.01 | 72.46 | 69.33 | 71.2 | 60.85 | 76.37 | 71.87 |
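
The `summary.md` file is written by the `common.py` entry point invoked in the script above. If you prefer to aggregate an existing output directory from Python rather than the CLI, a minimal sketch is shown below (it assumes VLMEvalKit is importable and that `run.py` has already produced one sub-folder per experiment containing a `status.json` under `${OUTPUT_DIR}/kie`):

```python
# Illustrative sketch: rebuild summary.md for one subset from Python.
# Mirrors the CLI call `python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}`.
from vlmeval.dataset.utils.ccocr_evaluator.common import summary_multi_exp

summary_path = summary_multi_exp("/your/path/to/output_dir/kie", is_weighted_sum=False)
print("summary written to:", summary_path)
```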
## Citation
If you find our work helpful, please consider citing our paper.
```bibtex
@misc{yang2024ccocr,
title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin},
year={2024},
eprint={2412.02210},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2412.02210},
}
```
## Contact Us
If you have any questions, feel free to email us at wpf272043@alibaba-inc.com or xixing.tj@alibaba-inc.com.

View File

@@ -0,0 +1,12 @@
from .kie_evaluator import KieEvaluator
from .doc_parsing_evaluator import ParsingEvaluator
from .ocr_evaluator import OcrEvaluator
from .common import summary
evaluator_map_info = {
"kie": KieEvaluator("kie"),
"doc_parsing": ParsingEvaluator("doc_parsing"),
"multi_lan_ocr": OcrEvaluator("multi_lan_ocr"),
"multi_scene_ocr": OcrEvaluator("multi_scene_ocr")
}
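
For reference, each entry in `evaluator_map_info` is a `BaseMetric` instance (defined in `common.py` below), so it can be called directly with a prediction dict and a ground-truth dict. A hedged sketch follows; the file name and `dataset` tag are placeholders, and real callers pass the config from the benchmark index:

```python
# Illustrative only; values are placeholders, not benchmark data.
from vlmeval.dataset.utils.ccocr_evaluator import evaluator_map_info

preds = {"img_001": "HELLO WORLD"}   # {file_name: model response}
gts = {"img_001": "hello world"}     # {file_name: ground-truth transcription}
meta_info, eval_info = evaluator_map_info["multi_scene_ocr"](preds, gts, dataset="Ic15")
print(eval_info["summary"])          # macro/micro F1 plus response_success_ratio
```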

View File

@@ -0,0 +1,222 @@
import os
import json
import time
import sys
from abc import abstractmethod
from tabulate import tabulate
def pick_response_text(json_path):
"""
"""
try:
with open(json_path, "r") as f:
json_data = json.load(f)
except Exception as e:
print("--> file error: msg: {}, path: {}".format(e, json_path))
return None
for required_key in ["model_name", "response"]:
if required_key not in json_data:
print("--> required key not exists, name: {}, path: {}".format(required_key, json_path))
return None
model_name = json_data["model_name"]
model_response = json_data["response"]
response_text = None
if model_name.startswith("gpt") or model_name.startswith("o1"):
response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501
elif model_name.startswith("local_"):
response_text = model_response
else:
if model_name.startswith("claude"):
content_list = model_response.get("content", None)
elif model_name.startswith("gemini"):
content_list = model_response.get("candidates", [{}])[0].get("content", {}).get("parts", None)
elif model_name.startswith("qwen"):
content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501
else:
raise NotImplementedError("The pick_response_text NOT implemented for model: {}".format(model_name))
if isinstance(content_list, list) and len(content_list) > 0:
response_text = content_list[0].get("text", None)
if response_text is None:
print("--> [error][{}] text pick error, path: {}".format(model_name, json_path))
return response_text
def load_response_from_dir(res_dir):
"""
"""
response_info = {}
for file_name in os.listdir(res_dir):
file_path = os.path.abspath(os.path.join(res_dir, file_name))
if not file_name.endswith(".json"):
print("--> skip: result file should be a json: but got: {}".format(file_path))
continue
response_text = pick_response_text(file_path)
if response_text is None:
continue
file_name_wo_ext, ext = os.path.splitext(file_name)
response_info[file_name_wo_ext] = response_text
return response_info
class BaseMetric(object):
""" BaseMetric """
""" OCRMetric """
def __init__(self, group_name, **kwargs):
self.group_name = group_name
self.kwargs = kwargs
def response_post_func(self, response_text, **kwargs):
return response_text
@abstractmethod
# Given the predictions and ground truth, return the evaluation results as a dictionary.
# The result must contain a 'summary' key; only metric info used for the summary should
# be placed in that dict. For example:
# {
#     "summary": {
#         "f1-score": 99.99,
#         "metric_name": "metric_value"
#     },
#     "your other info": "xxx"
# }
def evaluate(self, response_info, gt_info, normalize_func=None, **kwargs):
pass
def __call__(self, pdt_res_dir, gt_info, with_response_ratio=True, **kwargs):
if isinstance(pdt_res_dir, dict):
raw_response_info = pdt_res_dir
elif os.path.exists(pdt_res_dir) and os.path.isdir(pdt_res_dir):
raw_response_info = load_response_from_dir(pdt_res_dir)
else:
raise ValueError("invalid input: a response dict or an existing folder is required, but got {}".format(pdt_res_dir))
post_error_list, response_info = [], {}
response_error_list = list(gt_info.keys() - raw_response_info.keys())
for file_name, single_pdt_str in raw_response_info.items():
single_pdt_str = self.response_post_func(single_pdt_str, **kwargs)
if single_pdt_str is None:
post_error_list.append(file_name)
continue
response_info[file_name] = single_pdt_str
meta_info = {
"gt_total_num": len(gt_info), "pdt_total_num": len(response_info),
"post_error_list": post_error_list, "response_error_list": response_error_list,
}
eval_info = self.evaluate(response_info, gt_info, **kwargs)
# add response_success_ratio
if "summary" in eval_info and with_response_ratio:
success_ratio = (len(response_info) + len(post_error_list)) / (len(gt_info) + 1e-9)
eval_info["summary"].update({"response_success_ratio": success_ratio})
return meta_info, eval_info
def summary(index_path, exp_dir_base, is_weighted_sum=False):
"""
"""
with open(index_path, "r") as f:
data_list = json.load(f)
all_data_info = {}
for data_info_item in data_list:
data_name = data_info_item["dataset"]
if not data_info_item.get("release", True):
continue
all_data_info[data_name] = data_info_item
dataset_list = list(all_data_info.keys())
summary_path = summary_multi_exp(exp_dir_base, dataset_list, is_weighted_sum=is_weighted_sum)
return summary_path
def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False):
"""
"""
if dataset_list is None:
all_dataset_name = []
for exp_name in os.listdir(exp_dir_base):
dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
if not os.path.exists(dir_status_path):
continue
with open(dir_status_path, "r") as f:
data_status_info = json.load(f)
all_dataset_name.extend(data_status_info.keys())
dataset_list = sorted(set(all_dataset_name))
# summary main code
all_evaluate_info = {}
for exp_name in os.listdir(exp_dir_base):
dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
if not os.path.exists(dir_status_path):
print("--> skip: status.json not exist: {}".format(dir_status_path))
continue
with open(dir_status_path, "r") as f:
all_status_info = json.load(f)
for data_name in dataset_list:
total_num = all_status_info.get(data_name, {}).get("config", {}).get("num", "-1")
summary_info = all_status_info.get(data_name, {}).get("evaluation", {}).get("summary", {})
for metric_name, metric_value in summary_info.items():
if metric_name not in all_evaluate_info:
all_evaluate_info[metric_name] = {}
if exp_name not in all_evaluate_info[metric_name]:
all_evaluate_info[metric_name][exp_name] = {}
all_evaluate_info[metric_name][exp_name][data_name] = (metric_value, total_num)
all_table_md = []
for metric_name, metric_info in all_evaluate_info.items():
formatted_time = time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time()))
summary_line_list = []
summary_key_name = "summary(weighted)" if is_weighted_sum else "summary"
summary_head = [f"exp_name({metric_name}_{formatted_time})"] + dataset_list + [summary_key_name]
for exp_name, data_eval_info in metric_info.items():
summary_line = [exp_name, ]
all_metric_value = 0
is_summary_valid, all_total_num, all_weighted_metric = True, 0, 0
for data_name in dataset_list:
metric_value, total_num = data_eval_info.get(data_name, ("-1", "-1"))
summary_line.append("{:.2f}".format(float(metric_value) * 100))
if str(metric_value) == "-1":
is_summary_valid = False
continue
all_total_num += float(total_num)
all_weighted_metric += float(total_num) * float(metric_value)
all_metric_value += float(metric_value)
summary_value_valid = ((all_weighted_metric / (all_total_num + 1e-9)) * 100) if is_weighted_sum \
else (all_metric_value / (len(dataset_list) + 1e-9) * 100)
summary_value = "-" if not is_summary_valid else "{:.2f}".format(summary_value_valid)
summary_line.append(summary_value)
summary_line_list.append(summary_line)
md_table_info = tabulate(summary_line_list, headers=summary_head, tablefmt='pipe')
all_table_md.append(md_table_info)
print("\n\n".join(all_table_md))
summary_path = os.path.abspath(os.path.join(exp_dir_base, "summary.md"))
with open(summary_path, "w") as f:
f.write("\n\n".join(all_table_md))
return summary_path
if __name__ == '__main__':
if len(sys.argv) != 2:
print("Usage: python {} exp_base_dir".format(__file__))
sys.exit(-1)
else:
print('--> info: {}'.format(sys.argv))
exp_base_dir = sys.argv[1]
summary_path = summary_multi_exp(exp_base_dir, dataset_list=None, is_weighted_sum=False)
print("--> info: summary saved at : {}".format(summary_path))
print("happy coding.")

View File

@@ -0,0 +1,256 @@
import nltk
import re
from tqdm import tqdm
from collections import deque
from apted.helpers import Tree
from apted import APTED, Config
# local import
from .common import BaseMetric
# LaTeX commands to strip from predictions before scoring
patterns = [
r'\\documentclass\{.*?\}',
r'\\usepackage\[.*?\]\{.*?\}',
r'\\usepackage\{.*?\}',
r'\\geometry\{.*?\}',
r'\\begin\{document\}',
r'\\end\{document\}',
r'\\noindent'
]
class TableTree(Tree):
"""
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
# License: Apache 2.0 License.
"""
def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
self.tag = tag
self.colspan = colspan
self.rowspan = rowspan
self.content = content
self.children = list(children)
def bracket(self):
"""Show tree using brackets notation"""
if self.tag == "td":
result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % (
self.tag,
self.colspan,
self.rowspan,
self.content,
)
else:
result = '"tag": %s' % self.tag
for child in self.children:
result += child.bracket()
return "{{{}}}".format(result)
class CustomConfig(Config):
"""
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
# License: Apache 2.0 License.
"""
def rename(self, node1, node2):
"""Compares attributes of trees"""
# print(node1.tag)
if (
(node1.tag != node2.tag)
or (node1.colspan != node2.colspan)
or (node1.rowspan != node2.rowspan)
):
return 1.0
if node1.tag == "td":
if node1.content or node2.content:
return nltk.edit_distance(node1.content, node2.content) / max(len(node1.content), len(node2.content))
return 0.0
class TEDS(object):
"""Tree Edit Distance basead Similarity
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
# License: Apache 2.0 License.
"""
def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
assert isinstance(n_jobs, int) and (
n_jobs >= 1
), "n_jobs must be an integer greather than 1"
self.structure_only = structure_only
self.n_jobs = n_jobs
self.ignore_nodes = ignore_nodes
self.__tokens__ = []
def tokenize(self, node):
"""Tokenizes table cells"""
self.__tokens__.append("<%s>" % node.tag)
if node.text is not None:
self.__tokens__ += list(node.text)
for n in node.getchildren():
self.tokenize(n)
if node.tag != "unk":
self.__tokens__.append("</%s>" % node.tag)
if node.tag != "td" and node.tail is not None:
self.__tokens__ += list(node.tail)
def load_html_tree(self, node, parent=None):
"""Converts HTML tree to the format required by apted"""
global __tokens__
if node.tag == "td":
if self.structure_only:
cell = []
else:
self.__tokens__ = []
self.tokenize(node)
cell = self.__tokens__[1:-1].copy()
new_node = TableTree(
node.tag,
int(node.attrib.get("colspan", "1")),
int(node.attrib.get("rowspan", "1")),
cell,
*deque(),
)
else:
new_node = TableTree(node.tag, None, None, None, *deque())
if parent is not None:
parent.children.append(new_node)
if node.tag != "td":
for n in node.getchildren():
self.load_html_tree(n, new_node)
if parent is None:
return new_node
def evaluate(self, pred, true):
"""Computes TEDS score between the prediction and the ground truth of a
given sample
"""
# try_import("lxml")
from lxml import etree, html
if (not pred) or (not true):
return 0.0
parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
pred = html.fromstring(pred, parser=parser)
true = html.fromstring(true, parser=parser)
if pred.xpath("body/table") and true.xpath("body/table"):
pred = pred.xpath("body/table")[0]
true = true.xpath("body/table")[0]
if self.ignore_nodes:
etree.strip_tags(pred, *self.ignore_nodes)
etree.strip_tags(true, *self.ignore_nodes)
n_nodes_pred = len(pred.xpath(".//*"))
n_nodes_true = len(true.xpath(".//*"))
n_nodes = max(n_nodes_pred, n_nodes_true)
tree_pred = self.load_html_tree(pred)
tree_true = self.load_html_tree(true)
distance = APTED(
tree_pred, tree_true, CustomConfig()
).compute_edit_distance()
return 1.0 - (float(distance) / n_nodes)
else:
return 0.0
class ParsingEvaluator(BaseMetric):
def response_post_func(self, response_text, **kwargs):
return response_text
def evaluate(self, response_info, gt_info, **kwargs):
op = kwargs['op']
if op == 'doc':
score = self.eval_doc(response_info, gt_info)
elif op == 'table':
score = self.eval_table(response_info, gt_info)
elif op in ['molecular', "formula"]:
score = self.eval_formula(response_info, gt_info, op_name=op)
else:
raise ValueError(f'doc parsing unsupported op: {op}')
# summary info
eval_info = {"summary": {"score": score}}
return eval_info
def eval_doc(self, response_info, gt_info):
results = []
for img_name, gt in tqdm(gt_info.items()):
if img_name not in response_info:
results.append(0)
continue
pred = response_info[img_name]
for pattern in patterns:
pred = re.sub(pattern, '', pred)
try:
pred = pred.split('```')[1]
except IndexError:
pass
pred = pred.replace('```latex', '')
pred = pred.replace('```', '')
pred = pred.replace(' ', '').replace('\n', '')
gt = gt.replace(' ', '').replace('\n', '')
edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))
results.append(1 - edit_dist)
score = sum(results) / len(results)
return score
def eval_table(self, response_info, gt_info):
teds = TEDS(structure_only=False, n_jobs=1)
results = []
for img_name, gt in tqdm(gt_info.items()):
if img_name not in response_info:
results.append(0)
continue
pred = response_info[img_name]
for pattern in patterns:
pred = re.sub(pattern, '', pred)
try:
pred = pred.split('```html')[1]
except IndexError:
pass
pred = pred.replace('```', '')
pred = pred.replace(' ', '').replace('\n', '').replace('，', ',')
gt = gt.replace(' ', '').replace('\n', '')
pred_html = '<html><body>{}</body></html>'.format(pred)
gt_html = '<html><body>{}</body></html>'.format(gt)
results.append(teds.evaluate(pred_html, gt_html))
score = sum(results) / len(results)
return score
def eval_formula(self, response_info, gt_info, op_name='formula'):
results = []
for img_name, gt in tqdm(gt_info.items()):
if img_name not in response_info:
results.append(0)
continue
pred = response_info[img_name]
if op_name == 'formula':
pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "") # noqa: E501
gt = gt.replace(" ", "")
elif op_name == 'molecular':
pred = pred.replace("\n", "").replace(" ", "").replace("<smiles>", "").replace("</smiles>", "")
gt = gt.replace(" ", "")
edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))
results.append(1 - edit_dist)
score = sum(results) / len(results)
return score
if __name__ == '__main__':
pass
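
A quick toy check of the TEDS metric defined above (assuming `lxml`, `apted`, and `nltk` are installed, which this evaluator already requires); the tables are invented purely for illustration:

```python
# Toy tables: one of the three table nodes differs, so the score is roughly 1 - 1/3.
pred_html = "<html><body><table><tr><td>cat</td><td>1</td></tr></table></body></html>"
gt_html = "<html><body><table><tr><td>cat</td><td>2</td></tr></table></body></html>"
teds = TEDS(structure_only=False, n_jobs=1)
print(teds.evaluate(pred_html, gt_html))  # ~0.67; identical tables would score 1.0
```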

View File

@@ -0,0 +1,385 @@
"""
Donut
Copyright (c) 2022-present NAVER Corp.
MIT License
"""
import json
import os
import sys
import re
import time
from typing import Any, Dict, List, Tuple, Union
import zss
from zss import Node
from collections import Counter
from nltk import edit_distance
# local import
from .common import BaseMetric
def flatten(data: dict):
"""
Convert Dictionary into Non-nested Dictionary
Example:
input(dict)
{
"menu": [
{"name" : ["cake"], "count" : ["2"]},
{"name" : ["juice"], "count" : ["1"]},
]
}
output(list)
[
("menu.name", "cake"),
("menu.count", "2"),
("menu.name", "juice"),
("menu.count", "1"),
]
"""
flatten_data = list()
def _flatten(value, key=""):
if type(value) is dict:
for child_key, child_value in value.items():
_flatten(child_value, f"{key}.{child_key}" if key else child_key)
elif type(value) is list:
for value_item in value:
_flatten(value_item, key)
else:
flatten_data.append((key, value))
_flatten(data)
return flatten_data
def update_cost(node1: Node, node2: Node):
"""
Update cost for tree edit distance.
If both are leaf nodes, the cost is the string edit distance between the two labels (the special token '<leaf>' is ignored).
If exactly one of them is a leaf node, the cost is the length of the string in the leaf node + 1.
If neither is a leaf node, the cost is 0 if label1 equals label2, otherwise 1.
"""
label1 = node1.label
label2 = node2.label
label1_leaf = "<leaf>" in label1
label2_leaf = "<leaf>" in label2
if label1_leaf and label2_leaf:
return edit_distance(label1.replace("<leaf>", ""), label2.replace("<leaf>", ""))
elif not label1_leaf and label2_leaf:
return 1 + len(label2.replace("<leaf>", ""))
elif label1_leaf and not label2_leaf:
return 1 + len(label1.replace("<leaf>", ""))
else:
return int(label1 != label2)
def insert_and_remove_cost(node: Node):
"""
Insert and remove cost for tree edit distance.
If the node is a leaf, the cost is the length of its label (without the '<leaf>' token); otherwise 1.
"""
label = node.label
if "<leaf>" in label:
return len(label.replace("<leaf>", ""))
else:
return 1
def normalize_dict(data: Union[Dict, List, Any]):
"""
Recursively normalize a nested dict/list: sort dict keys, drop empty values, and stringify leaves.
"""
# if not data:
# return {}
if isinstance(data, dict):
new_data = dict()
for key in sorted(data.keys(), key=lambda k: (len(k), k)):
value = normalize_dict(data[key])
if value:
if not isinstance(value, list):
value = [value]
new_data[key] = value
elif isinstance(data, list):
if all(isinstance(item, dict) for item in data):
new_data = []
for item in data:
item = normalize_dict(item)
if item:
new_data.append(item)
else:
new_data = [str(item).strip() for item in data if type(item) in {str, int, float} and str(item).strip()]
else:
new_data = [str(data).strip()]
return new_data
def cal_f1_all(preds, answers):
"""
Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives,
false negatives and false positives
"""
metric_info, error_info = {}, {}
total_tp, total_fn_or_fp = 0, 0
for file_name, answer in answers.items():
sample_error_info = {"fp": [], "fn": [], "tp": []}
pred = preds.get(file_name, {})
pred, answer = flatten(normalize_dict(pred)), flatten(normalize_dict(answer))
for field in pred:
field_name = field[0]
if field_name not in metric_info:
metric_info[field_name] = {"total_tp": 0, "total_fn_or_fp": 0}
if field in answer:
total_tp += 1
metric_info[field_name]["total_tp"] += 1
sample_error_info["tp"].append(field)
answer.remove(field)
else:
total_fn_or_fp += 1
metric_info[field_name]["total_fn_or_fp"] += 1
sample_error_info["fp"].append(field)
total_fn_or_fp += len(answer)
for field in answer:
field_name = field[0]
if field_name not in metric_info:
metric_info[field_name] = {"total_tp": 0, "total_fn_or_fp": 0}
metric_info[field_name]["total_fn_or_fp"] += 1
sample_error_info["fn"].append(field)
sample_error_num = sum([len(v) for k, v in sample_error_info.items() if k != "tp"])
if sample_error_num > 0:
sample_error_info["error_num"] = sample_error_num
error_class_list = ["counter_" + x[0] for x in (sample_error_info["fn"] + sample_error_info["fp"])]
counter = Counter(error_class_list)
sample_error_info["error_info"] = dict(counter)
error_info[file_name] = sample_error_info
# summary
for field_name, field_info in metric_info.items():
field_tp, field_fn_or_fp = field_info["total_tp"], field_info["total_fn_or_fp"]
metric_info[field_name]["acc"] = field_tp / (field_tp + field_fn_or_fp / 2 + 1e-6)
print("donut_evaluator: total_tp: {}, total_fn_or_fp: {}, ptd_num: {}, gt_num: {}".format(total_tp, total_fn_or_fp,
len(preds), len(answers)))
error_info = {k: v for k, v in
sorted(error_info.items(), key=lambda item: item[1].get("error_num", 0), reverse=True)}
metric_info = {k: v for k, v in
sorted(metric_info.items(), key=lambda item: item[1].get("total_fn_or_fp", 0), reverse=True)}
return total_tp / (total_tp + total_fn_or_fp / 2 + 1e-6), metric_info, error_info
def construct_tree_from_dict(data: Union[Dict, List], node_name: str = None):
"""
Convert Dictionary into Tree
Example:
input(dict)
{
"menu": [
{"name" : ["cake"], "count" : ["2"]},
{"name" : ["juice"], "count" : ["1"]},
]
}
output(tree)
<root>
|
menu
/ \
<subtree> <subtree>
/ | | \
name count name count
/ | | \
<leaf>cake <leaf>2 <leaf>juice <leaf>1
"""
if node_name is None:
node_name = "<root>"
node = Node(node_name)
if isinstance(data, dict):
for key, value in data.items():
kid_node = construct_tree_from_dict(value, key)
node.addkid(kid_node)
elif isinstance(data, list):
if all(isinstance(item, dict) for item in data):
for item in data:
kid_node = construct_tree_from_dict(
item,
"<subtree>",
)
node.addkid(kid_node)
else:
for item in data:
node.addkid(Node(f"<leaf>{item}"))
else:
raise Exception(data, node_name)
return node
def cal_acc(pred: dict, answer: dict):
"""
Calculate normalized tree edit distance(nTED) based accuracy.
1) Construct tree from dict,
2) Get tree distance with insert/remove/update cost,
3) Divide distance with GT tree size (i.e., nTED),
4) Calculate nTED based accuracy. (= max(1 - nTED, 0 ).
"""
pred = construct_tree_from_dict(normalize_dict(pred))
answer = construct_tree_from_dict(normalize_dict(answer))
val1 = zss.distance(
pred,
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
val2 = zss.distance(
construct_tree_from_dict(normalize_dict({})),
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
return max(0, 1 - val1 / val2)
def cal_acc_all(pred_info, answer_info):
acc_info, error_info = {}, {}
for file_name, answer in answer_info.items():
# if file_name not in pred_info:
# print("---> error: pdt not found: {}".format(file_name))
# continue
pred = pred_info.get(file_name, {})
acc = cal_acc(pred, answer)
acc_info[file_name] = acc
if acc < 1.0:
error_info[file_name] = {"acc": acc, "pred": pred, "answer": answer}
error_info = {k: v for k, v in sorted(error_info.items(), key=lambda item: item[1].get("acc", 0))}
acc_average = sum(list(acc_info.values())) / (len(acc_info) + 1e-6)
return acc_average, error_info
def normalize_values_of_nested_dict(d, normalize_func):
"""
"""
if isinstance(d, dict):
return {k: normalize_values_of_nested_dict(v, normalize_func) for k, v in d.items()}
elif isinstance(d, list):
return [normalize_values_of_nested_dict(x, normalize_func) if isinstance(x, dict) else x for x in d]
elif isinstance(d, str):
return normalize_func(d)
else:
return d
def eval_donut(pdt_info, gt_info, normalize_func=None, data_name=None):
"""
"""
if normalize_func is not None:
print("--> info: normalize_func executed.")
pdt_info = normalize_values_of_nested_dict(pdt_info, normalize_func)
gt_info = normalize_values_of_nested_dict(gt_info, normalize_func)
f1_score, class_eval_info, error_info = cal_f1_all(pdt_info, gt_info)
acc_average, acc_error_info = cal_acc_all(pdt_info, gt_info)
eval_info = {"f1_score": f1_score, "acc": acc_average, "class_f1_score": class_eval_info,
"f1_error_info": error_info, "acc_error_info": acc_error_info}
print(data_name, "f1_score", f1_score, "acc", acc_average)
return eval_info
def post_process_to_json(qwen_info_str, file_name=None):
try:
if "```json" in qwen_info_str:
if "```" not in qwen_info_str:
qwen_info_str += "```"
qwen_info_group = re.search(r'```json(.*?)```', qwen_info_str, re.DOTALL)
json_str = qwen_info_group.group(1).strip().replace("\n", "")
else:
json_str = qwen_info_str.strip().replace("\n", "")
json_data = json.loads(json_str)
return json_data
except Exception as err: # noqa: F841
return None
def fullwidth_to_halfwidth(text):
# Convert full-width characters to their half-width equivalents
result = ''
for char in text:
code_point = ord(char)
# Convert the full-width space directly
if code_point == 0x3000:
code_point = 0x0020
# Convert other full-width characters (except space) to half-width
elif 0xFF01 <= code_point <= 0xFF5E:
code_point -= 0xFEE0
result += chr(code_point)
result = result.replace("", ",")
return result
def remove_unnecessary_spaces(text):
# Remove spaces between Chinese characters
text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
# Remove spaces between Chinese characters and English letters/digits
text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[a-zA-Z0-9])', '', text)
text = re.sub(r'(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fff])', '', text)
# Remove unnecessary spaces before punctuation and keep a single space after it
text = re.sub(r'(?<![0-9])\s*([,.!?:;])\s*', r'\1 ', text)  # punctuation not preceded by a digit
# Add a space between digits and English letters
text = re.sub(r'(?<=[0-9])(?=[a-zA-Z])', ' ', text)
text = re.sub(r'(?<=[a-zA-Z])(?=[0-9])', ' ', text)
text = re.sub(r'\s+', ' ', text)
return text
class KieEvaluator(BaseMetric):
def response_post_func(self, response_text, **kwargs):
response_text = post_process_to_json(response_text, file_name=kwargs.get('file_name', None))
return response_text
def normalize_func(self, text, **kwargs):
halfwidth_text = fullwidth_to_halfwidth(str(text))
cleaned_text = remove_unnecessary_spaces(halfwidth_text)
return cleaned_text
def evaluate(self, response_info, gt_info, **kwargs):
"""
response_info: dict: {"file_name_1": response_1, "file_name_2": response_2}
gt_info: dict: {"file_name_1": gt_1, "file_name_2": gt_2}
kwargs: dataset index config: {'dataset': 'kie_benchmark_POIE', 'group': 'kie', 'op': 'poie', 'num': 250}
"""
# gt should be a dict for kie task, fix for VLMEvalKit
for image_name, label_content in gt_info.items():
if isinstance(label_content, str):
gt_info[image_name] = json.loads(label_content)
response_info = normalize_values_of_nested_dict(response_info, self.normalize_func)
gt_info = normalize_values_of_nested_dict(gt_info, self.normalize_func)
f1_score, class_eval_info, error_info = cal_f1_all(response_info, gt_info)
acc_average, acc_error_info = cal_acc_all(response_info, gt_info)
# summary info
summary_info = {"f1_score": f1_score, "acc": acc_average}
eval_info = {"summary": summary_info, "class_f1_score": class_eval_info,
"f1_error_info": error_info, "acc_error_info": acc_error_info}
return eval_info
if __name__ == '__main__':
pass
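
To make the field-level F1 above concrete, here is a toy run of `cal_f1_all`; the receipt fields are invented for illustration only:

```python
# One field matches ("menu.name"), one does not ("menu.count"), so F1 is about 0.5.
preds = {"receipt_001": {"menu": [{"name": ["cake"], "count": ["2"]}]}}
answers = {"receipt_001": {"menu": [{"name": ["cake"], "count": ["3"]}]}}
f1, class_info, error_info = cal_f1_all(preds, answers)
print(round(f1, 2))              # 0.5
print(class_info["menu.count"])  # {'total_tp': 0, 'total_fn_or_fp': 2, 'acc': 0.0}
```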

View File

@@ -0,0 +1,106 @@
import os
import sys
import json
import re
from collections import Counter
# local import
from .common import BaseMetric
def token_normalize(token_text, is_lower=False, is_alphanum_only=False):
"""
"""
if is_lower:
token_text = token_text.lower()
if is_alphanum_only:
token_text = re.sub('[^A-Za-z0-9]+', '', token_text)
return token_text
def text_normalize_and_tokenize(text, is_keep_blank=True, is_lower=True, is_alphanum_only=False):
text = text.replace("\t", " ").replace("\n", " ").replace("###", "").replace("***", "")
text = re.sub(r'\s+', ' ', text)
if not is_keep_blank:
text = text.replace(" ", "")
text_tokens = text.split(" ") if is_keep_blank else list(text)
text_token_normalized = [token_normalize(t, is_lower, is_alphanum_only) for t in text_tokens]
text_token_normalized = [x for x in text_token_normalized if len(x) > 0]
return text_token_normalized
def evaluate_single_sample(gts, preds):
right_num = 0
gt_counter_info = dict(Counter(gts))
pdt_counter_info = dict(Counter(preds))
for gt_token, gt_count in gt_counter_info.items():
pred_count = pdt_counter_info.get(gt_token, 0)
right_num += min(gt_count, pred_count)
return right_num
def calculate_metrics(response_info, gt_info, is_verbose=False):
"""
"""
macro_recall_list, macro_precision_list, macro_f1_list = [], [], []
total_gt_num, total_pred_num, total_right_num = 0, 0, 0
for file_name, fullbox_gts in gt_info.items():
fullbox_preds = response_info.get(file_name, [])
right_num = evaluate_single_sample(fullbox_gts, fullbox_preds)
total_right_num += right_num
total_gt_num += len(fullbox_gts)
total_pred_num += len(fullbox_preds)
macro_recall = right_num / (len(fullbox_gts) + 1e-9)
macro_precision = right_num / (len(fullbox_preds) + 1e-9)
macro_f1 = 2 * macro_recall * macro_precision / (macro_recall + macro_precision + 1e-9)
macro_recall_list.append(macro_recall)
macro_precision_list.append(macro_precision)
macro_f1_list.append(macro_f1)
# macro
final_macro_recall = sum(macro_recall_list) / (len(macro_recall_list) + 1e-9)
final_macro_precision = sum(macro_precision_list) / (len(macro_precision_list) + 1e-9)
final_macro_f1 = sum(macro_f1_list) / (len(macro_f1_list) + 1e-9)
# micro
recall_acc = total_right_num / (total_gt_num + 1e-9)
preci_acc = total_right_num / (total_pred_num + 1e-9)
hmean = 2 * recall_acc * preci_acc / (recall_acc + preci_acc + 1e-9)
vbs_eval_result = {
'macro_recall': final_macro_recall, 'macro_precision': final_macro_precision, 'macro_f1_score': final_macro_f1,
'micro_recall': recall_acc, 'micro_precision': preci_acc, 'micro_f1_score': hmean
}
eval_result = vbs_eval_result if is_verbose else {'macro_f1_score': final_macro_f1, 'micro_f1_score': hmean}
return eval_result
class OcrEvaluator(BaseMetric):
def response_post_func(self, response_text, **kwargs):
return response_text
def evaluate(self, response_info, gt_info, **kwargs):
# dataset-specific tokenization settings are hard-coded here
dataset_name = kwargs['dataset']
is_word_level, is_lower, is_alphanum_only = True, True, False
if dataset_name in ["Arabic", "Japanese", "Korean"] or "zh" in dataset_name:
is_word_level = False
if "multi_scene_ocr" in self.group_name and is_word_level:
is_alphanum_only = True
eval_config = {"word_level": is_word_level, "alphanum_only": is_alphanum_only, "lowercase": is_lower}
image_pdt_info, image_gt_info = {}, {}
for file_name, gt_src in gt_info.items():
pred_src = response_info.get(file_name, "")
pdt_token_list = text_normalize_and_tokenize(
str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
gt_token_list = text_normalize_and_tokenize(
str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
image_pdt_info[file_name] = pdt_token_list
image_gt_info[file_name] = gt_token_list
eval_result = calculate_metrics(image_pdt_info, image_gt_info, is_verbose=False)
return {"summary": eval_result, "metric_config": eval_config}
if __name__ == '__main__':
pass
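
A toy end-to-end check of the tokenization and F1 computation above; the strings are invented for illustration:

```python
# Two of the three ground-truth tokens are recovered, so precision is 1.0 and recall is 2/3.
gt_tokens = text_normalize_and_tokenize("Hello World 2024")
pdt_tokens = text_normalize_and_tokenize("hello world")
print(calculate_metrics({"img_1": pdt_tokens}, {"img_1": gt_tokens}))  # macro/micro F1 ≈ 0.8
```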

View File

@@ -0,0 +1,682 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
import pandas as pd
import numpy as np
import re
FAIL_MSG = "Failed to obtain answer via API."
frame_tmpl = "frame-{}-of-{}.jpg"
sys_prompt_open_eval_step_1 = (
"You will be provided with a question, a model's prediction, and the ground "
"truth answer for this question.\n"
"Your task is to judge whether the model's prediction is correct based on the "
"meaning of the two texts.\n"
"In most cases, this can be done by determining if the meaning of the model's "
"prediction is consistent with, or contains, the ground truth answer. However, "
"in some cases where the two texts differ, it may represent different "
"descriptions of the same visual scene, in which case visual information is "
"needed for further judgment.\n"
"Therefore, I hope you:\n"
"- Output 0, if the model's prediction and the ground truth answer are neither "
"consistent nor related by inclusion, with fundamentally different meanings.\n"
"- Output 1, if the meaning of the model's prediction and the ground truth "
"answer is consistent, or if the model's prediction meaningfully contains the "
"ground truth answer.\n"
"- Output 2, if the model's prediction and ground truth are not consistent or "
"inclusive, but may be different descriptions of the same visual scene, "
"requiring visual information for further judgment.\n"
"Only output the answer in the following format:\n\n"
'```json\n{"result": choice}\n```\n\n'
"The choice is either 0, 1, or 2 as specified above."
)
sys_prompt_open_eval_step_2 = (
"You will be provided with a question, a model's prediction, and the sampling "
"frames of the clue intervals related to this question.\n"
"Your task is to determine whether the model has answered the question "
"correctly based on the visual information provided.\n"
"Therefore, I hope you:\n"
"- Output 0, if the model's prediction does not correctly answer the question.\n"
"- Output 1, if the model's prediction correctly answers the question.\n"
"Only output the answer in the following format without output extra "
"explanation:\n\n"
'```json\n{"result": choice}\n```\n\n'
"The choice is either 0 or 1 as specified above."
)
FAIL_MSG = "Failed to obtain answer via API."
# '10-20', '20-30', '30-40', '40-50', '50-60'
DURATIONS = ["0 ~ 10", "10 ~ 20", "20 ~ 30", "30 ~ 40", "40 ~ 50", "50 ~ 60", "60+"]
DOMAINS = [
"Life Record",
"Music & TV show",
"Instruction & Knowledge",
"Driving",
"Embodied Expert",
"Humor/funny",
"Electonic/Social Gaming",
"Security & Health",
"Sports & Exercise",
"Special Scenes",
"Art & Culture",
"GUI",
"News",
"Animal & Pet",
]
SUB_CATEGORIES = [
"Time Cognition",
"Hallucination",
"Entity Perception",
"2D Spatial Perception",
"Time Perception",
"Scene Perception",
"Text Perception",
"Event Cognition",
"Entity Cognition",
"Text Cognition",
"Event Perception",
"Scene Cognition",
]
def get_dimention_rating_open_ended(data_path):
# Load the data
df = load(data_path)
df = df[df["score"] != -1]
# Convert seconds to minutes and bin into duration ranges
df["duration_minutes"] = df["duration"] / 60
df["duration_range"] = pd.cut(
df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
)
# Initialize the result dict
result = {
"overall": 0,
"duration": {k: 0 for k in DURATIONS},
"domain": {k: 0 for k in DOMAINS},
"sub_category": {k: 0 for k in SUB_CATEGORIES},
}
# Overall
result["overall"] = round(df["score"].mean(), 4)
# Duration
for dur in DURATIONS:
dur_scores = df[df["duration_range"] == dur]["score"]
result["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
# Domain
for domain in DOMAINS:
domain_scores = df[df["domain"] == domain]["score"]
result["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
# Sub-category
for sub_cat in SUB_CATEGORIES:
sub_cat_scores = df[df["sub_category"] == sub_cat]["score"]
result["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
return result
def get_dimention_rating_mcq_grouding(data_path):
# Load the data
df = load(data_path)
# df.loc[(df['task_mode'] == 'miou') & (df['score'] == -1), 'score'] = 0
df = df[df["score"] != -1]
# Convert seconds to minutes and bin into duration ranges
df["duration_minutes"] = df["duration"] / 60
df["duration_range"] = pd.cut(
df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
)
# Initialize the result dict
result = {
metric: {
"overall": 0,
"duration": {k: 0 for k in DURATIONS},
"domain": {k: 0 for k in DOMAINS},
"sub_category": {k: 0 for k in SUB_CATEGORIES},
}
for metric in ["long_acc", "clue_acc", "miou", "CRR", "acc@iou", "rec@iou"]
}
# Compute the basic metrics
for metric in ["long_acc", "clue_acc", "miou"]:
metric_df = df[df["task_mode"] == metric]
# Overall
result[metric]["overall"] = round(metric_df["score"].mean(), 4)
# Duration
for dur in DURATIONS:
dur_scores = metric_df[metric_df["duration_range"] == dur]["score"]
result[metric]["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
# Domain
for domain in DOMAINS:
domain_scores = metric_df[metric_df["domain"] == domain]["score"]
result[metric]["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
# Sub-category
for sub_cat in SUB_CATEGORIES:
sub_cat_scores = metric_df[metric_df["sub_category"] == sub_cat]["score"]
result[metric]["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
# Compute the composite metric CRR
def calculate_crr(scores):
long_acc = scores[scores["task_mode"] == "long_acc"]["score"].mean()
clue_acc = scores[scores["task_mode"] == "clue_acc"]["score"].mean()
return round(min(long_acc, clue_acc) / clue_acc, 4) if clue_acc != 0 else 0
# Overall CRR
result["CRR"]["overall"] = calculate_crr(df)
# Duration CRR
for dur in DURATIONS:
dur_df = df[df["duration_range"] == dur]
result["CRR"]["duration"][dur] = calculate_crr(dur_df)
# Domain CRR
for domain in DOMAINS:
domain_df = df[df["domain"] == domain]
result["CRR"]["domain"][domain] = calculate_crr(domain_df)
# Sub-category CRR
for sub_cat in SUB_CATEGORIES:
sub_cat_df = df[df["sub_category"] == sub_cat]
result["CRR"]["sub_category"][sub_cat] = calculate_crr(sub_cat_df)
# Compute acc@iou
def calculate_acc_at_iou_threshold(scores, threshold):
miou_qids = set(scores[scores["task_mode"] == "miou"]["qid"])
long_acc_qids = set(scores[scores["task_mode"] == "long_acc"]["qid"])
valid_qids = miou_qids & long_acc_qids
miou_positive = set(scores[(scores["task_mode"] == "miou") & (scores["score"] > threshold)]["qid"])
long_acc_positive = scores[
(scores["task_mode"] == "long_acc") & (scores["qid"].isin(miou_positive)) & (scores["score"] == 1)
]
acc_at_iou_threshold = len(long_acc_positive) / len(valid_qids) if len(valid_qids) > 0 else 0
return round(acc_at_iou_threshold, 4)
def calculate_acc_at_iou(scores):
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
acc_at_iou_values = [calculate_acc_at_iou_threshold(scores, threshold) for threshold in thresholds]
return round(sum(acc_at_iou_values) / len(acc_at_iou_values), 4)
# Overall acc@iou
result["acc@iou"]["overall"] = calculate_acc_at_iou(df)
# Duration acc@iou
for dur in DURATIONS:
dur_df = df[df["duration_range"] == dur]
result["acc@iou"]["duration"][dur] = calculate_acc_at_iou(dur_df)
# Domain acc@iou
for domain in DOMAINS:
domain_df = df[df["domain"] == domain]
result["acc@iou"]["domain"][domain] = calculate_acc_at_iou(domain_df)
# Sub-category acc@iou
for sub_cat in SUB_CATEGORIES:
sub_cat_df = df[df["sub_category"] == sub_cat]
result["acc@iou"]["sub_category"][sub_cat] = calculate_acc_at_iou(sub_cat_df)
# Compute rec@iou
def calculate_rec_at_iou_threshold(scores, threshold):
# Get all rows with task_mode == "miou"
miou_scores = scores[scores["task_mode"] == "miou"]
# Count samples whose miou score exceeds the threshold
miou_positive = miou_scores[miou_scores["score"] > threshold]
# Compute the ratio
rec_at_iou = len(miou_positive) / len(miou_scores) if len(miou_scores) > 0 else 0
return round(rec_at_iou, 4)
def calculate_rec_at_iou(scores):
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
rec_at_iou_values = [calculate_rec_at_iou_threshold(scores, threshold) for threshold in thresholds]
return round(sum(rec_at_iou_values) / len(rec_at_iou_values), 4)
# Overall rec@iou
result["rec@iou"]["overall"] = calculate_rec_at_iou(df)
# Duration rec@iou
for dur in DURATIONS:
dur_df = df[df["duration_range"] == dur]
result["rec@iou"]["duration"][dur] = calculate_rec_at_iou(dur_df)
# Domain rec@iou
for domain in DOMAINS:
domain_df = df[df["domain"] == domain]
result["rec@iou"]["domain"][domain] = calculate_rec_at_iou(domain_df)
# Sub-category rec@iou
for sub_cat in SUB_CATEGORIES:
sub_cat_df = df[df["sub_category"] == sub_cat]
result["rec@iou"]["sub_category"][sub_cat] = calculate_rec_at_iou(sub_cat_df)
return result
def milliseconds_to_seconds(milliseconds):
return milliseconds / 1000
def sample_frames_clue_average(clues_time_intervals, frame_num, fps):
# Compute the duration (in frames) of each clue interval
clues_frame_intervals = [(round(interval[0] * fps), round(interval[1] * fps)) for interval in clues_time_intervals]
clue_durations = [interval[1] - interval[0] for interval in clues_frame_intervals]
total_duration = sum(clue_durations)
# If frame_num is no less than the total number of frames, return all frames
if frame_num >= total_duration:
return [frame for interval in clues_frame_intervals for frame in range(interval[0], interval[1])]
frames_per_clue = [int(frame_num * (duration / total_duration)) for duration in clue_durations]
frame_indices = []
for i, (interval, num_frames) in enumerate(zip(clues_frame_intervals, frames_per_clue)):
num_frames = max(1, num_frames)
seg_size = (interval[1] - interval[0]) / num_frames
clue_frame_indices = [int(interval[0] + seg_size / 2 + seg_size * idx) for idx in range(num_frames)]
frame_indices.extend(clue_frame_indices)
return frame_indices
def merge_intervals(intervals):
"""
Merge overlapping intervals in a list.
Assumes each interval is a list [start, end].
"""
if not intervals:
return []
# Sort intervals by start time
intervals.sort(key=lambda x: x[0])
merged = [intervals[0]]
for current in intervals[1:]:
last_merged = merged[-1]
# Check if there is an overlap
if current[0] <= last_merged[1]:
# Merge the current interval with the last one
last_merged[1] = max(last_merged[1], current[1])
else:
# No overlap, add current interval
merged.append(current)
return merged
def calculate_intervals_iou(intervals1, intervals2):
"""
Calculate the IoU of two lists of intervals.
Each list contains intervals represented as [start, end].
"""
# Merge overlapping intervals in both lists
merged1 = merge_intervals(intervals1)
merged2 = merge_intervals(intervals2)
# Calculate total length of intervals for both lists
def total_length(merged_intervals):
return sum(end - start for start, end in merged_intervals)
length1 = total_length(merged1)
length2 = total_length(merged2)
# Calculate intersection length
intersection_length = 0
for interval1 in merged1:
for interval2 in merged2:
intersection_start = max(interval1[0], interval2[0])
intersection_end = min(interval1[1], interval2[1])
intersection_length += max(0, intersection_end - intersection_start)
# Calculate union length
union_length = length1 + length2 - intersection_length
# IoU is intersection divided by union
iou = intersection_length / union_length if union_length > 0 else 0
return iou
def post_process(response, right_answer, task_mode, duration):
result = -1
if response:
# Locate the ```json and ``` markers
json_start = response.find("```json")
json_end = response.find("```", json_start + len("```json"))
# If JSON content was found
if json_start != -1 and json_end != -1:
json_content = response[json_start + len("```json"):json_end].strip()
else:
json_content = ""
if json_content:
if task_mode in ["long_acc", "clue_acc"]:
json_content = re.sub(r"(?<=:\s)([A-Za-z_]\w*)", r'"\1"', json_content)
try:
model_result = json.loads(json_content)["result"]
if task_mode in ["long_acc", "clue_acc"]:
result = 1 if right_answer == model_result else 0
elif task_mode == "miou":
if not isinstance(model_result, list):
return -1
if not isinstance(model_result[0], list):
model_result = [model_result]
need_duration = all(interval[0] <= 1 and interval[1] <= 1 for interval in model_result)
if need_duration:
model_result = [[interval[0] * duration, interval[1] * duration] for interval in model_result]
right_answer = eval(right_answer)
result = calculate_intervals_iou(right_answer, model_result)
except Exception as e:
print(f"Error in parsing JSON: {e}, {json_content}")
if result == -1:
if task_mode in ["long_acc", "clue_acc"]:
# Check for uppercase letters A-H and treat them as the model's answer
matches = re.findall(r"\b[A-H]\b", response)
if matches:
result = 1 if right_answer in matches else 0
elif task_mode == "miou":
# Extract all real numbers and pair them into intervals
numbers = re.findall(r"-?\d+\.?\d*", response)
if len(numbers) < 2:
result = -1
else:
if len(numbers) % 2 != 0:
numbers = numbers[:-1]
model_result = [[float(numbers[i]), float(numbers[i + 1])] for i in range(0, len(numbers), 2)]
if type(right_answer) is str:
right_answer = eval(right_answer)
result = calculate_intervals_iou(right_answer, model_result)
return result
def get_timestampes(frame_indices, fps):
seconds = list(map(lambda x: str(round(x / fps, 4)), frame_indices))
timestamps = ", ".join(seconds)
return "A total of {frame_num} frames are sampled. Their corresponding timestamps are:\n\n{timestamps}\n\n".format(
frame_num=len(frame_indices), timestamps=timestamps
)
def post_process_open(response):
model_result = -1
if response and response != FAIL_MSG:
json_start = response.find("```json")
json_end = response.find("```", json_start + len("```json"))
# If JSON content was found
if json_start != -1 and json_end != -1:
json_content = response[json_start + len("```json"):json_end].strip()
else:
json_content = ""
if json_content:
try:
model_result = json.loads(json_content)["result"]
except Exception as e:
print(f"Error in parsing JSON: {e}, {json_content}")
if model_result == -1:
model_result = response
return model_result
def post_process_eval_open(response, step):
model_result = -1
if response and response != FAIL_MSG:
json_start = response.find("```json")
json_end = response.find("```", json_start + len("```json"))
if json_start != -1 and json_end != -1:
json_content = response[json_start + len("```json"):json_end].strip()
else:
json_content = ""
if json_content:
try:
model_result = json.loads(json_content)["result"]
except Exception as e:
print(f"Error in parsing JSON: {e}, {json_content}")
return -1
if model_result == -1:
if step == 1:
match = re.search(r"[012]", response)
if match:
model_result = int(match.group())
else:
match = re.search(r"[01]", response)
if match:
model_result = int(match.group())
return model_result
def eval_open_first(model, line):
user_prompt = ""
user_prompt += f"Question: {line['question']}\n\n"
user_prompt += f"The ground truth answer is '{line['answer']}'\n\n"
user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
result = model.generate(user_prompt)
return result
def save_step_1_steps(data, step_1_results):
# Process all results
data["step_1_result"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 1))
# Conditionally update step-2 results and scores
mask = data["step_1_result"].isin([-1, 0, 1])
data.loc[mask, "step_2_result"] = data.loc[mask, "step_1_result"]
data.loc[mask, "score"] = data.loc[mask, "step_1_result"]
return data
def eval_open_second(model, line, frame_paths):
user_prompt = ""
user_prompt += f"Question: {line['question']}\n\n"
user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
result = model.generate([user_prompt] + frame_paths)
return result
def save_step_2_steps(data, step_1_results):
# Process all results
data["score"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 2))
return data
def clue_frame_paths(clue_frame_root, qid, num_frames=8):
frame_root = osp.join(clue_frame_root, str(qid))
os.makedirs(frame_root, exist_ok=True)
return [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
def save_clue_video_frames(data_root, clue_frame_root, video, uid, clue_intervals=None, num_frames=8, fps=-1):
if not isinstance(uid, str):
uid = str(uid)
vid_path = osp.join(data_root, video)
vid = decord.VideoReader(vid_path)
vid_fps = vid.get_avg_fps()
if clue_intervals is not None:
# 1. Merge overlapping intervals
merged_intervals = merge_intervals(clue_intervals)
if num_frames > 0 and fps < 0:
# 2. Sample frames uniformly within the clue intervals
indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
frame_paths = clue_frame_paths(clue_frame_root, uid, len(indices))
# Save the frames
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths, indices, vid_fps
def get_chunk_number(filename):
try:
num = filename.split("chunk_")[1].split(".zip")[0]
return int(num)
except (IndexError, ValueError):
return float('inf')
def unzip_hf_zip(pth):
import zipfile
target_dir = pth
if os.path.exists(f"{target_dir}/cg_videos_720p") and os.path.exists(f"{target_dir}/cg_subtitles")\
and os.path.exists(f"{target_dir}/cg_clue_videos"):
print("all exists")
return
video_zip_files = [
os.path.join(target_dir, file)
for file in os.listdir(target_dir)
if file.endswith(".zip") and file.startswith("video")
]
video_zip_files = sorted(video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x)))
videos_temp_zip = os.path.join(target_dir, "videos_merged.zip")
print("Merging video files ...")
with open(videos_temp_zip, "wb") as outfile:
for video_zip_file in tqdm(video_zip_files, desc="Merging videos"):
with open(video_zip_file, "rb") as infile:
outfile.write(infile.read())
print("Extracting video files...")
try:
with zipfile.ZipFile(videos_temp_zip, "r") as zip_ref:
total_files = len(zip_ref.namelist())
for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
zip_ref.extract(file, target_dir)
print(f"Successfully extracted to {target_dir}")
except Exception as e:
print(f"Error during extraction: {e}")
finally:
if os.path.exists(videos_temp_zip):
os.remove(videos_temp_zip)
print("Cleaned up temporary video file")
clue_video_zip_files = [
os.path.join(target_dir, file)
for file in os.listdir(target_dir)
if file.endswith(".zip") and file.startswith("clue_video")
]
clue_video_zip_files = sorted(clue_video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x)))
clue_videos_temp_zip = os.path.join(target_dir, "clue_videos_merged.zip")
print("Merging clue video files ...")
with open(clue_videos_temp_zip, "wb") as outfile:
for clue_video_zip_file in tqdm(clue_video_zip_files, desc="Merging clue_videos"):
with open(clue_video_zip_file, "rb") as infile:
outfile.write(infile.read())
print("Extracting clue video files...")
try:
with zipfile.ZipFile(clue_videos_temp_zip, "r") as zip_ref:
total_files = len(zip_ref.namelist())
for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
zip_ref.extract(file, target_dir)
print(f"Successfully extracted to {target_dir}")
except Exception as e:
print(f"Error during extraction: {e}")
finally:
if os.path.exists(clue_videos_temp_zip):
os.remove(clue_videos_temp_zip)
print("Cleaned up temporary clue video file")
print("Extracting subtitle files ...")
subtitles_zip = os.path.join(target_dir, "subtitles.zip")
try:
with zipfile.ZipFile(subtitles_zip, "r") as zip_ref:
total_files = len(zip_ref.namelist())
for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
zip_ref.extract(file, target_dir)
print(f"Successfully extracted to {target_dir}")
except Exception as e:
print(f"Error during extraction: {e}")

View File

@@ -0,0 +1,13 @@
import json
import argparse
from collections import defaultdict
def is_correct(predict, answer):
# predict is the ground-truth answer; answer is the prediction
if len(answer) == 1:
return answer[0] == predict[0]
elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']:
return answer[0] == predict[0]
elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']:
return predict[4:].lower() in answer.lower()
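
A few example calls to clarify the branches above; argument order follows the comment (ground-truth choice first, prediction second) and the strings are invented:

```python
print(is_correct("B", "B"))                      # True: single-letter answers compared directly
print(is_correct("B. cat", "B) the cat"))        # True: leading option letters match
print(is_correct("(A) a red car", "a red car"))  # True: free-form prediction matched against the option text
```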

View File

@@ -0,0 +1,54 @@
from ...smp import *
import os
def report_acc_hrbench(df):
cycle_group = df.groupby('cycle_category')
result_dic = defaultdict(list)
avg_dic = defaultdict(int)
count = 0
for key, data_value in cycle_group:
count += 1
_, resp_dic = hrbench_score(data_value)
for task_type, accuracy in resp_dic.items():
result_dic['cycle'].append(key)
result_dic['type'].append(task_type)
result_dic['accuracy'].append(accuracy)
avg_dic[task_type] += accuracy
for task_type, accuracy in avg_dic.items():
result_dic['cycle'].append('Average')
result_dic['type'].append(task_type)
result_dic['accuracy'].append(accuracy / count)
result_pd = pd.DataFrame(result_dic)
return result_pd
def hrbench_score(data):
ret = defaultdict(list)
resp_dic = {}
category_list = set(data['category'])
score_dict = defaultdict(list)
for i in range(len(data)):
d = data.iloc[i]
category = d['category']
gpt_score = d['hit']
score_dict[category].append(gpt_score)
score_dict['all'].append(gpt_score)
all_acc = np.mean(score_dict['all'])
ret['type'].append('all')
ret['acc'].append(all_acc)
resp_dic['all'] = all_acc
for cate in category_list:
acc = np.mean(score_dict[cate])
ret['type'].append(cate)
ret['acc'].append(acc)
resp_dic[cate] = acc
return pd.DataFrame(ret), resp_dic
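
A toy illustration of the accuracy aggregation above; the rows below are fabricated purely to show the expected input columns (`cycle_category`, `category`, `hit`):

```python
import pandas as pd

demo = pd.DataFrame({
    "cycle_category": [0, 0, 1, 1],
    "category": ["FSP", "FCP", "FSP", "FCP"],
    "hit": [1, 0, 1, 1],
})
print(report_acc_hrbench(demo))  # per-cycle rows plus an 'Average' row for 'all', 'FSP', 'FCP'
```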

View File

@@ -1,11 +1,11 @@
import os
from ...api import OpenAIWrapper
from ...smp import load_env
INTERNAL = os.environ.get('INTERNAL', 0)
def build_judge(**kwargs):
from ...api import OpenAIWrapper, SiliconFlowAPI
model = kwargs.pop('model', None)
kwargs.pop('nproc', None)
load_env()
@@ -19,12 +19,20 @@ def build_judge(**kwargs):
'chatgpt-1106': 'gpt-3.5-turbo-1106',
'chatgpt-0125': 'gpt-3.5-turbo-0125',
'gpt-4o': 'gpt-4o-2024-05-13',
'gpt-4o-0806': 'gpt-4o-2024-08-06',
'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
'qwen-7b': 'Qwen/Qwen2.5-7B-Instruct',
'qwen-72b': 'Qwen/Qwen2.5-72B-Instruct',
'deepseek': 'deepseek-ai/DeepSeek-V2.5',
}
model_version = model_map[model]
else:
model_version = LOCAL_LLM
model = OpenAIWrapper(model_version, **kwargs)
if model in ['qwen-7b', 'qwen-72b', 'deepseek']:
model = SiliconFlowAPI(model_version, **kwargs)
else:
model = OpenAIWrapper(model_version, **kwargs)
return model
@@ -32,7 +40,7 @@ DEBUG_MESSAGE = """
To debug the OpenAI API, you can try the following scripts in python:
```python
from vlmeval.api import OpenAIWrapper
model = OpenAIWrapper('gpt-4-1106-preview', verbose=True)
model = OpenAIWrapper('gpt-4o', verbose=True)
msgs = [dict(type='text', value='Hello!')]
code, answer, resp = model.generate_inner(msgs)
print(code, answer, resp)

View File

@@ -0,0 +1,150 @@
import pandas as pd
# from colorama import Fore, Back, Style
from ...smp import *
FAIL_MSG = 'Failed to obtain answer via API.'
def build_prompt_logicvista(line):
question = line['question']
prediction = str(line['prediction'])
tmpl = (
"You are a information extractor that extracts multiple choice letter answer choices "
"from a paragraph that contains the answer choice and sometimes explaination of why that "
"choice is correct to the given question.\n"
"What letter did the following answer choose? If the answer did not select a letter answer choice, "
"first try to infer the answer based off the given choices.\n"
"If it does not seem like the given answer corresponds to an answer choice OR if there is no selected answer, please just respond with Z.\n"
"Make sure you answer with ONLY the letters chosen.\n"
'Example 1: \n'
'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
'Answer: <start>\na cute teddy bear\n<end>\nYour output: A\n'
'Example 2: \n'
'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
'Answer: <start>\nSpider\n<end>\nYour output: Z\n'
'Example 3: \n'
'Question: <start>\nWhich figure is a rotation of the object?\n<end>\n'
'Answer: <start>\nThe figure on the right, labeled "D," is a rotation of the object shown in the top left corner.\n<end>\nYour output: D\n'
'Example 4: \n'
'Question: <start>\nWhich of the boxes comes next in the sequence? Select from A-E\n<end>\n'
'Answer: <start>\nThe sequence of the boxes is A, B, C, D, E.\n<end>\nYour output: ABCDE\n'
'Example 5: \n'
'Question: <start>\n{}\n<end>\nAnswer: <start>\n{}\n<end>\nYour output: '
)
return tmpl.format(question, prediction)
def LogicVista_auxeval(model, line):
prompt = build_prompt_logicvista(line)
print(prompt)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
answer = line['answer'].split(", ")
for j in range(0, len(answer)):
answer[j] = answer[j].lower()
answer.sort()
answer = ''.join(answer)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
elif not res.isupper() or not res.isalpha():
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
hit = 0
extracted = [alpha.lower() for alpha in res]
extracted.sort()
extracted = ''.join(extracted)
if extracted == answer:
hit = 1
return dict(log=log, res=res, hit=hit)
log += 'All 5 retries failed.\n'
return dict(log=log, res='', hit=0)
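# Illustrative note on the matching above: multi-letter answers are normalized before
# comparison, e.g. a ground truth of "B, A" becomes 'ab', and a judge output of "BA"
# or "AB" also normalizes to 'ab', so either form counts as a hit.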
cat = ["diagram", "ocr", "patterns", "graphs", "tables", "3d shapes", "puzzles", "sequences", "physics"]
def evaluate_logicvista(file_path):
df = pd.read_excel(file_path)
tot = defaultdict(lambda: 0)
hit = defaultdict(lambda: 0)
acc = defaultdict(lambda: 0)
lt = len(df)
skill_list = []
df_tot = df
df_inductive = df[df["skill"].str.contains("inductive")]
df_deductive = df[df["skill"].str.contains("deductive")]
df_numerical = df[df["skill"].str.contains("numerical")]
df_spatial = df[df["skill"].str.contains("spatial")]
df_mechanical = df[df["skill"].str.contains("mechanical")]
tot_correct = df_tot["hit"].sum()
tot_acc = (tot_correct / df_tot.shape[0]) * 100
tot['Overall'] = df_tot.shape[0]
hit['Overall'] = tot_correct
acc['Overall'] = tot_acc
inductive_correct = df_inductive["hit"].sum()
inductive_acc = (inductive_correct / df_inductive.shape[0]) * 100
tot["inductive"] = df_inductive.shape[0]
hit["inductive"] = inductive_correct
acc["inductive"] = inductive_acc
deductive_correct = df_deductive["hit"].sum()
deductive_acc = (deductive_correct / df_deductive.shape[0]) * 100
tot["deductive"] = df_deductive.shape[0]
hit["deductive"] = deductive_correct
acc["deductive"] = deductive_acc
numerical_correct = df_numerical["hit"].sum()
numerical_acc = (numerical_correct / df_numerical.shape[0]) * 100
tot["numerical"] = df_numerical.shape[0]
hit["numerical"] = numerical_correct
acc["numerical"] = numerical_acc
spatial_correct = df_spatial["hit"].sum()
spatial_acc = (spatial_correct / df_spatial.shape[0]) * 100
tot["spatial"] = df_spatial.shape[0]
hit["spatial"] = spatial_correct
acc["spatial"] = spatial_acc
mechanical_correct = df_mechanical["hit"].sum()
mechanical_acc = (mechanical_correct / df_mechanical.shape[0]) * 100
tot["mechanical"] = df_mechanical.shape[0]
hit["mechanical"] = mechanical_correct
acc["mechanical"] = mechanical_acc
# capability dimension, the official data json does not contain 'capability' column, so it is now ignored
# for i in cat:
# curr = df[df["capability"].str.contains(i.replace(" ", ""))]
# correct = curr["hit"].sum()
# accuracy = (correct / curr.shape[0]) * 100
# tot[i] = curr.shape[0]
# hit[i] = correct
# acc[i] = accuracy
res = defaultdict(list)
for k in tot.keys():
res['Task&Skill'].append(k)
res['tot'].append(tot[k])
res['hit'].append(hit[k])
res['acc'].append(acc[k])
res = pd.DataFrame(res)
return res

View File

@@ -0,0 +1,80 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
import numpy as np
import re
FAIL_MSG = 'Failed to obtain answer via API.'
DURATIONS = [15, 60, 600, 3600]
TASK_CATEGORIES = [
"S2E", "S2O", "S2A",
"E2O", "O2E", "T2E",
"T2O", "T2A", "E3E",
"O3O", "SSS", "SOS",
"SAA", "T3E", "T3O",
"TOS", "TAA"
]
def get_dimension_rating(data_path):
data = load(data_path)
print(data.iloc[0])
duration_rating = {k: {} for k in DURATIONS}
for duration in DURATIONS + ['overall']:
duration_rating[duration] = {
'overall': '',
'question_category': {k: [] for k in TASK_CATEGORIES}
}
for i in range(len(data)):
task_ctg = data.iloc[i]['question_category']
duration = data.iloc[i]['duration_group']
duration_rating[duration]['question_category'][task_ctg].append(data.iloc[i]['score'])
duration_rating['overall']['question_category'][task_ctg].append(data.iloc[i]['score'])
for duration in DURATIONS + ['overall']:
overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["question_category"].values(), []) if x >= 0]):.3f}' # noqa: E501
duration_rating[duration]['overall'] = overall_res_dur
for task_ctg in TASK_CATEGORIES:
task_res_dur = f'{np.mean([x for x in duration_rating[duration]["question_category"][task_ctg] if x >= 0]):.3f}' # noqa: E501
duration_rating[duration]['question_category'][task_ctg] = task_res_dur
return duration_rating
def extract_option(model, input_item, dataset_name):
options = input_item['question'].split('\n')[1:]
for id, option in enumerate(options):
option_id = chr(ord('A') + id) + '.'
if option.find(option_id) >= 0:
input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n')
return extract_answer_from_item(model, input_item, dataset_name)['opt']
def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
'The best option is',
'The correct option is',
'Best answer:',
'Best option:',
'Answer:',
'Option:',
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCDE]', s):
return ''
matches = re.search(r'[ABCDE]', s)
if matches is None:
return ''
return matches[0]
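# Illustrative examples of the regex fallback above (not exhaustive):
#   extract_characters_regex('The best answer is (B) because ...')                      -> 'B'
#   extract_characters_regex('I cannot tell from the given video frames what happens')  -> ''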

View File

@@ -2,8 +2,9 @@ from ...smp import *
from ...utils import can_infer
try:
from latex2sympy2 import latex2sympy
except ImportError:
print('Please install latex2sympy2 by running "pip install latex2sympy2"')
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical('Please install latex2sympy2 by running "pip install latex2sympy2"')
FAIL_MSG = 'Failed to obtain answer via API.'

View File

@@ -0,0 +1,193 @@
from ...smp import *
from ...utils import can_infer
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_extract_ICE():
example_1 = """
1.
Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
Extracted Answer: (-2, 1)
""" # noqa
example_2 = """
2.
Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
Extracted Answer: D
""" # noqa
example_3 = """
3.
Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)
""" # noqa
example_4 = """
4.
Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
Extracted Answer: null
""" # noqa
example_5 = """
5.
Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
Extracted answer: 22.3
""" # noqa
example_6 = """
6.
Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
Extracted answer: f(x) = -x^2 - 2x + 1
""" # noqa
return [example_1, example_2, example_3, example_4, example_5, example_6]
def get_gpt4_score_ICE():
example_1 = """
[Question]: Write the set of numbers represented on the number line in interval notation.
[Standard Answer]: (-2,1]
[Model_answer] : Extracted Answer: \\((-2, 1)\\)
Judgement: 0
""" # noqa
example_2 = """
[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : B:2\u221a{{3}}
Judgement: 0
""" # noqa
example_3 = """
[Question]: Find the domain and range of the function f using interval notation.
[Standard Answer]: domain: [-4, 0) and range: (-3, 1]
[Model_answer] : Range: \\((-4, 1]\\)
Judgement: 0
""" # noqa
example_4 = """
[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : null
Judgement: 0
""" # noqa
return [example_1, example_2, example_3, example_4]
def build_mathverse_gpt4_extract_prompt(line):
task_description = """
I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. Directly output the extracted answer with no explanation.\n\n
""" # noqa
prediction = str(line['prediction'])
demo_prompt = task_description
examples = get_gpt4_extract_ICE()
for example in examples:
demo_prompt += example + '\n\n'
test_prompt = f"Model response: '{prediction}'\nExtracted Answer: "
full_prompt = f'{demo_prompt}7.\n{test_prompt}'
return full_prompt
def build_mathverse_gpt4_score_prompt(line):
task_description = """
Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
Please note that the two answers are considered consistent only when the [Model_answer] completely matches the [Standard Answer]. For non-multiple-choice questions, answers that express the same meaning in a different form, for example 0.5m and 50cm, are also considered consistent.
If they are consistent, Judgement is 1; if they are different, Judgement is 0.\n\n
""" # noqa
question_for_eval = line['question_for_eval']
extract = line['extract']
answer = line['answer']
demo_prompt = task_description
examples = get_gpt4_score_ICE()
for example in examples:
demo_prompt += example + '\n\n'
test_prompt = f"""
[Question]: {question_for_eval}
[Standard Answer]: {answer}
[Model_answer] : {extract}
Judgement:"""
full_prompt = f'{demo_prompt}{test_prompt}'
return full_prompt
def post_check_score(line, prefetch=False):
ans = str(line['answer']).strip()
response = str(line['extract']).strip()
if response == ans:
return response if prefetch else True
else:
return False
def MathVerse_auxeval_extract(model, line):
prompt = build_mathverse_gpt4_extract_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log_extract=log, extract=res)
log += 'All 5 retries failed.\n'
return dict(log_extract=log, extract='')
def MathVerse_auxeval_score(model, line):
prompt = build_mathverse_gpt4_score_prompt(line)
log = ''
retry = 5
if post_check_score(line, prefetch=True):
return dict(log_score='Prefetch succeed', score=True)
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res or res.strip() not in ['0', '1']:
log += f'Try {i}: output is {prediction}, res is {res}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log_score=log, score=int(res) == 1)
log += 'All 5 retries failed.\n'
return dict(log_score=log, score=False)
def MathVerse_acc(result_file):
df = load(result_file)
df['metadata'] = df['metadata'].apply(lambda x: x.replace("'", '"'))
df['metadata'] = df['metadata'].apply(json.loads)
df_metadata = pd.json_normalize(df['metadata'])
df = pd.concat([df.drop('metadata', axis=1), df_metadata], axis=1)
subset = list(set(df['problem_version']))
res = defaultdict(list)
for p in subset:
if p != 'Overall':
sub = df[df['problem_version'] == p]
else:
sub = cp.deepcopy(df)
res['split'].append(p)
# Overall Acc
res['Overall'].append(np.mean(sub['score']) * 100)
# Subject
subjects = set(df['subject'])
for k in subjects:
res[k].append(np.mean(sub[sub['subject'] == k]['score']) * 100)
# Subfield
subfields = set(df['subfield'])
for k in subfields:
res[k].append(np.mean(sub[sub['subfield'] == k]['score']) * 100)
return pd.DataFrame(res)

View File

@@ -0,0 +1,189 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
from PIL import Image, ImageOps
import numpy as np
FAIL_MSG = 'Failed to obtain answer via API.'
system_prompt_sub_scene = """
##TASK DESCRIPTION:
You are required to evaluate a respondent's answer based on a provided question, some scoring points, and the respondent's answer. You should provide two scores. The first is the accuracy score, which should range from 1 to 5. The second is the relevance score, which should also range from 1 to 5. Below are the criteria for each scoring category.
##ACCURACY Scoring Criteria:
Evaluate the respondent's answer against specific scoring points as follows:
Score 1: The response completely misses the scoring point.
Score 3: The response mentions content related to the scoring point but is not entirely correct.
Score 5: The response accurately addresses the scoring point.
Calculate the average score across all scoring points to determine the final accuracy score.
##RELEVANCE Scoring Criteria:
Assess how the respondent's answer relates to the original question:
Score 1: The response is completely off-topic from the question.
Score 2: The response is partially related to the question but contains a significant amount of irrelevant content.
Score 3: The response primarily addresses the question, but the respondent seems uncertain about their own answer.
Score 4: The response mostly addresses the question and the respondent appears confident in their answer.
Score 5: The response is fully focused on addressing the question with no irrelevant content and demonstrates complete certainty.
----
##INSTRUCTION:
1. Evaluate Accuracy: First, assess and score each scoring point based on the respondent's answer. Calculate the average of these scores to establish the final accuracy score. Provide a detailed rationale before assigning your score.
2. Evaluate RELEVANCE: Assess the relevance of the respondent's answer to the question. Note that when evaluating relevance, the correctness of the answer is not considered; focus solely on how relevant the answer is to the question. Provide a comprehensive rationale before assigning your score.
3. Output Scores in JSON Format: Present the scores in JSON format as follows:
{'score_accuracy': score_acc, 'score_relevance': score_rele, 'total_score': score_acc + score_rele}
""" # noqa
system_prompt_summary = """
##TASK DESCRIPTION:
You are required to evaluate the performance of the respondent in the video summarization task based on the standard answer and the respondent's answer. You should provide two scores. The first is the COMPLETENESS score, which should range from 1 to 5. The second is the RELIABILITY score, which should also range from 1 to 5. Below are the criteria for each scoring category:
##COMPLETENESS Scoring Criteria:
The completeness score focuses on whether the summary covers all key points and main information from the video.
Score 1: The summary hardly covers any of the main content or key points of the video.
Score 2: The summary covers some of the main content and key points but misses many.
Score 3: The summary covers most of the main content and key points.
Score 4: The summary is very comprehensive, covering most to nearly all of the main content and key points.
Score 5: The summary completely covers all the main content and key points of the video.
##RELIABILITY Scoring Criteria:
The reliability score evaluates the correctness and clarity of the video summary. It checks for factual errors, misleading statements, and contradictions with the video content. If the respondent's answer includes details that are not present in the standard answer, as long as these details do not conflict with the correct answer and are reasonable, points should not be deducted.
Score 1: Contains multiple factual errors and contradictions; presentation is confusing.
Score 2: Includes several errors and some contradictions; needs clearer presentation.
Score 3: Generally accurate with minor errors; minimal contradictions; reasonably clear presentation.
Score 4: Very accurate with negligible inaccuracies; no contradictions; clear and fluent presentation.
Score 5: Completely accurate with no errors or contradictions; presentation is clear and easy to understand.
----
##INSTRUCTION:
1. Evaluate COMPLETENESS: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
2. Evaluate RELIABILITY: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
3. Output Scores in JSON Format: Present the scores in JSON format as follows:
{'score_completeness': score_comp, 'score_reliability': score_reli, 'total_score': score_comp + score_reli}
""" # noqa
def check_ans_with_model(pred, gt, model, item, dataset_name='MLVU_MCQ'):
flag = False
index = gt.index("(") # noqa
index2 = gt.index(")") # noqa
gt_option = gt[index + 1: index2]
if ")" in pred:
index3 = pred.index(")")
pred = pred[index3 - 1: index3]
if pred == gt_option:
flag = True
elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']:
flag = True
return flag
def extract_scores_summary(text):
# Define the keys to locate in the text
keys = ["score_completeness", "score_reliability"]
scores = []
for key in keys:
# Find the index where each key starts
start_index = text.find(key)
if start_index == -1:
continue # Skip if key is not found
# Find the start of the number which is after the colon and space
start_number_index = text.find(":", start_index) + 2
end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
# Extract and convert the number to float
score = float(text[start_number_index:end_number_index])
scores.append(score)
return scores
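# Illustrative sketch of the parsing above: for a judge reply that ends with
#   {'score_completeness': 4, 'score_reliability': 5, 'total_score': 9}
# extract_scores_summary returns [4.0, 5.0], and check_ans_with_model_summary /
# MLVU_OpenEnded_extract then sum these to 9.0 as the item score (keys missing
# from the reply are simply skipped).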
def check_ans_with_model_summary(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
user_prompt = f"""
Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
Standard Answer: {gt}
Respondent's Answer: {pred}
""" # noqa
result = model.generate(user_prompt)
result = extract_scores_summary(result)
result = np.sum(result)
return result
def extract_scores_sub_scene(text):
# Define the keys to locate in the text
keys = ["score_accuracy", "score_relevance"]
scores = []
for key in keys:
# Find the index where each key starts
start_index = text.find(key)
if start_index == -1:
continue # Skip if key is not found
# Find the start of the number which is after the colon and space
start_number_index = text.find(":", start_index) + 2
end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
# Extract and convert the number to float
score = float(text[start_number_index:end_number_index])
scores.append(score)
return scores
def check_ans_with_model_sub_scene(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
user_prompt = f"""
Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
Question: {item['question']}
Scoring Points: {item['scoring_points']}
Respondent's Answer: {pred}
""" # noqa
result = model.generate(user_prompt)
result = extract_scores_sub_scene(result)
result = np.sum(result)
return result
def MLVU_OpenEnded_generate(model, line):
task_type = line['task_type']
if task_type == 'summary':
user_prompt = (
f"Please score the respondent's answer according to the steps in the Instructions. "
f"You must end with a JSON dict to store the scores.\n"
f"Standard Answer: {line['answer']}\n"
f"Respondent's Answer: {line['prediction']}\n"
)
elif task_type == 'sub_scene':
user_prompt = (
f"Please score the respondent's answer according to the steps in the Instructions. "
f"You must end with a JSON dict to store the scores.\n"
f"Question: {line['question']}\n"
f"Scoring Points: {line['scoring_points']}\n"
f"Respondent's Answer: {line['prediction']}\n"
)
else:
raise ValueError(f'MLVU does not have an open-ended task of type {task_type}!')
result = model.generate(user_prompt)
return result
def MLVU_OpenEnded_extract(gpt_generate_data, org_data):
extract_func = {
'sub_scene': extract_scores_sub_scene,
'summary': extract_scores_summary
}
for idx, item in org_data.iterrows():
func = extract_func[item['task_type']]
text = gpt_generate_data[idx]
org_data.loc[idx, 'score'] = np.sum(func(text))
return org_data
def get_dimension_rating(data_path):
data = load(data_path)
result_dict = {}
for idx, item in data.iterrows():
if item['task_type'] not in result_dict:
result_dict[item['task_type']] = [0,0]
result_dict[item['task_type']][0] += int(item['score'])
result_dict[item['task_type']][1] += 1
return result_dict

View File

@@ -118,7 +118,7 @@ def mmdu_score(model, line):
f'{",".join([x for x in DIMS if x not in result_dict])}'
)
except Exception as e:
print({e})
logging.warning(str(e))
all_result_dict.append({d: None for d in DIMS})
logs.append(str(e))

View File

@@ -0,0 +1,298 @@
import re
import json
def has_word(sentence, word):
pattern = r'\b' + re.escape(word) + r'\b'
match = re.search(pattern, sentence)
if match:
return True
else:
return False
class VQAEval:
def __init__(self):
self.contractions = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
self.manualMap = {
'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
'fourteen': 14, 'fifteen': 15, 'sixteen': 16,
'seventeen': 17, 'eighteen': 18, 'nineteen': 19,
'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90}
self.articles = ['a', 'an', 'the']
self.periodStrip = re.compile('(?!<=\\d)(\\.)(?!\\d)')
self.commaStrip = re.compile('(\\d)(\\,)(\\d)')
self.punct = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def evaluate(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
if isinstance(gt_answers, list):
for i in range(len(gt_answers)):
gt_answers[i] = str(gt_answers[i])
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1
return 0
else:
gt_answers = gt_answers.replace('\n', ' ')
gt_answers = gt_answers.replace('\t', ' ')
gt_answers = gt_answers.strip()
gt_answers = self.processPunctuation(gt_answers)
gt_answers = self.processDigitArticle(gt_answers)
if has_word(answer, gt_answers):
return 1
else:
return 0
def evaluate_MRR(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
assert isinstance(gt_answers, list)
for i in range(len(gt_answers)):
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1 / (i + 1)
return 0.0
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + ' ' in inText or ' ' + p in inText) or (
re.search(self.commaStrip, inText) is not None
):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = self.periodStrip.sub('', outText, re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = [str(text) for text in outText]
outText = ' '.join(outText)
return outText
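# Illustrative behaviour of the normalization above (assumes Python 3.7+ re.escape):
# punctuation is stripped, number words map to digits, and articles are dropped, so
#   VQAEval().evaluate('There are two dogs.', ['2 dogs'])  -> 1
# because the processed prediction 'there are 2 dogs' contains the processed GT '2 dogs'.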
def is_correct(answer, response):
# response_orig = response
response = response.strip('.')
if isinstance(answer, int):
if response.isdigit():
return int(int(response) == answer)
response = response.lower()
response = response.replace('the answer is', '')
response = response.replace('*', '') # parse **A**
if response.find('.') != -1:
response = response.split('.')[0]
response = response.replace(',', '')
response = response.strip()
response = response.strip()
if response == 'none':
return 0
if 'the camera is moving left' in response:
response = 'a'
elif 'the camera is moving right' in response:
response = 'b'
if len(response) != 1:
# print(f"Fail to parse {response_orig}")
return 0
return (ord(response) - ord('a')) == answer
if isinstance(answer, list):
try:
response = response.replace('json', '').replace('```', '').strip()
response = json.loads(response)
if isinstance(response, dict):
response = sum(list(response.values()), start=[])
except:
# print(f"Fail to parse {response_orig} Exception: {e}")
return 0
if not isinstance(response, (list, tuple)):
# print(f"Fail to parse {response_orig} Exception: not a list!")
return 0
match = 0
for res, ans in zip(response, answer):
match += res == ans
return match / len(answer)
return VQAEval().evaluate(response, answer)
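# Illustrative behaviour of is_correct for the three answer types it handles:
#   is_correct(2, 'C.')           -> True   (single letter mapped to a 0-based index)
#   is_correct(5, '5')            -> 1      (digit strings are compared directly)
#   is_correct([1, 0], '[1, 0]')  -> 1.0    (list answers are scored element-wise)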

View File

@@ -2,6 +2,7 @@ import pandas as pd
from ...utils import can_infer, track_progress_rich
from ...smp import *
import numpy as np
import re
MMB_abbrs = {
'coarse_perception': 'CP',
@@ -170,6 +171,31 @@ def build_prompt(question, options, prediction):
return tmpl.format(question, options, prediction)
def build_prompt_wemath(question, prediction):
tmpl = (
'You are an AI assistant who will help me to match '
'an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meanings of all options are significantly different from the answer, output Z. '
'You should output a single uppercase character in A, B, C, D, E, F, G (if they are valid options), and Z. \n'
'Example 1: \n'
'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
'Answer: <start>\na cute teddy bear\n<end>\nYour output: A\n'
'Example 2: \n'
'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
'Answer: <start>\nSpider\n<end>\nYour output: Z\n'
'Example 3: \n'
'Question: <start>\n{}\n<end>\nAnswer: <start>\n{}\n<end>\nYour output: '
)
question = question.replace(
("Regarding the format, please answer following the template below, and be sure to include two <> symbols:\n"
"<Thought process>: <<your thought process>> <Answer>: <<your option>>"),
'',
)
return tmpl.format(question, prediction)
def build_prompt_blink(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match an answer with several options of a single-choice question. '
@@ -241,6 +267,8 @@ def extract_answer_from_item(model, item, dataset_name=None):
if dataset_name == 'BLINK':
prompt = build_prompt_blink(item['question'], option_str, item['prediction'])
elif dataset_name == 'WeMath':
prompt = build_prompt_wemath(item['question'], item['prediction'])
elif cn_string(item['question']):
prompt = build_prompt_cn(item['question'], option_str, item['prediction'])
else:
@@ -359,9 +387,7 @@ def mcq_vanilla_eval(model, data, meta, nproc, result_file, dataset_name=None):
res = track_progress_rich(eval_vanilla, tups, nproc=nproc, chunksize=nproc, save=result_file, keys=keys)
result = load(result_file)
for k, v in zip(keys, res):
if k in result:
assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
else:
if k not in result:
result[k] = v
data['hit'] = [result[i]['hit'] for i in data['index']]
data['log'] = [result[i]['log'] for i in data['index']]
@@ -425,9 +451,7 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
keys=keys)
result = load(result_file)
for k, v in zip(keys, res):
if k in result:
assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
else:
if k not in result:
result[k] = v
tmp_pth = f'/tmp/{timestr()}.xlsx'
@@ -440,3 +464,95 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
data_main.pop('GT')
return data_main
def extract_characters_regex(s, choices=['(A)', '(B)', '(C)', '(D)', '(E)']):
if type(s) is dict:
s = ''
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
'The best option is',
'The correct option is',
'Best answer:',
'Best option:',
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCDE]', s):
return ''
matches = re.search(r'[ABCDE]', s)
if matches is None:
for choice in choices:
if s.lower() in choice.lower():
return choice[1]
return ''
return matches[0]
def get_dimension_rating(data_path):
TASKS = [
'Reasoning',
'Perception',
]
SUBTASKS = [
'Monitoring',
'Autonomous_Driving',
'OCR with Complex Context',
'Diagram and Table',
'Remote Sensing',
]
data = load(data_path)
results = {}
results['Overall'] = {}
for task in TASKS:
results[f'{task}'] = {}
for subtask in SUBTASKS:
results[f'{task}'][f'{subtask}'] = {}
for i in range(len(data)):
question = data.iloc[i]
Task = question['category'].split('/')[0]
Subtask = question['category'].split('/')[1]
Category = question['l2-category'].lower()
if 'attribute' in Category.lower():
Category = Category.split('/')[0] + '/attribute'
if question['score'] >= 0:
cnt = question['score']
if Category not in results[Task][Subtask].keys():
results[Task][Subtask][f'{Category}'] = {'true': cnt, 'false': 1 - cnt}
else:
results[Task][Subtask][f'{Category}']['true'] += cnt
results[Task][Subtask][f'{Category}']['false'] += 1 - cnt
sum_all, succ_all = 0, 0
for task, tasks_values in results.items():
cnt_task, sum_task = 0, 0
for substask, subtask_value in tasks_values.items():
cnt_subtask, sum_subtask = 0, 0
for category, category_dict in subtask_value.items():
cnt_subtask += category_dict['true']
sum_subtask += category_dict['false'] + category_dict['true']
acc = category_dict['true'] / (category_dict['false'] + category_dict['true'])
results[task][substask][category] = acc
if sum_subtask == 0:
acc_subtasks = 0
else:
acc_subtasks = cnt_subtask / sum_subtask
cnt_task += cnt_subtask
sum_task += sum_subtask
results[task][substask]['Avg'] = acc_subtasks
if sum_task == 0:
acc_task = 0
else:
acc_task = cnt_task / sum_task
succ_all += cnt_task
sum_all += sum_task
results[task]['Avg'] = acc_task
results['Overall'] = succ_all / sum_all
return results

View File

@@ -1,4 +1,5 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
from PIL import Image, ImageOps
import torchvision
import random
@@ -32,9 +33,9 @@ def get_dimension_rating(data_path):
def check_ans(pred, gt):
flag = False
pred_list = pred.lower().split(' ')
pred_list = pred.lower().strip().split(' ')
pred_option, _ = pred_list[0], ' '.join(pred_list[1:])
gt_list = gt.lower().split(' ')
gt_list = gt.lower().strip().split(' ')
gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
if gt_content[-1] == '.':
gt_content = gt_content[:-1]
@@ -47,6 +48,64 @@ def check_ans(pred, gt):
return flag
def check_ans_with_model(pred, gt, model, item, dataset_name='MVBench'):
flag = False
pred_list = pred.lower().strip().split(' ')
pred_option, _ = pred_list[0], ' '.join(pred_list[1:])
gt_list = gt.lower().strip().split(' ')
gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
if gt_content[-1] == '.':
gt_content = gt_content[:-1]
if pred_option.replace('.', '') in gt_option:
flag = True
elif gt_option in pred_option:
flag = True
elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']:
flag = True
return flag
def check_ans_advanced(pred, gt):
number_table = {
0: 'zero',
1: 'one',
2: 'two',
3: 'three',
4: 'four',
5: 'five',
6: 'six',
7: 'seven',
8: 'eight',
9: 'nine',
}
flag = False
pred_list = pred.lower().strip().split(' ')
pred_option, _ = pred_list[0], ' '.join(pred_list[1:])
gt_list = gt.lower().strip().split(' ')
gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
if gt_content[-1] == '.':
gt_content = gt_content[:-1]
try:
gt_content = number_table[int(gt_content.strip('. \n'))]
print(gt_content)
except:
pass
if pred_option.replace('.', '') in gt_option:
flag = True
elif gt_option in pred_option:
flag = True
elif gt_content.lower().strip('. \n') in pred.lower().strip('. \n'):
flag = True
return flag
class GroupRandomCrop(object):
def __init__(self, size):
if isinstance(size, numbers.Number):

View File

@@ -0,0 +1,145 @@
import re
def extract_answer(output_string, task_type="yes_no"):
"""
Extracts the answer from the output string based on the task type.
Parameters:
output_string (str): The output string.
task_type (str): The type of task. Must be either "yes_no" or "multiple_choice".
Returns:
int:
1 if "yes" or "A"
0 if "no" or "B"
-1 if no relevant answer is found.
Raises a ValueError if an unsupported task_type is provided.
"""
def find_word_position(string, word):
pattern = r'\b' + re.escape(word) + r'\b'
match = re.search(pattern, string, re.IGNORECASE)
if match:
return match.start()
return -1
if task_type not in ["yes_no", "multiple_choice"]:
raise ValueError(f"Task type {task_type} not supported. Must be 'yes_no' or 'multiple_choice'.")
if task_type == "yes_no":
position_yes_and_a = find_word_position(output_string, "yes")
position_no_and_b = find_word_position(output_string, "no")
elif task_type == "multiple_choice":
position_yes_and_a = find_word_position(output_string, "A")
position_no_and_b = find_word_position(output_string, "B")
if position_yes_and_a == -1 and position_no_and_b == -1:
print(f"No answer found in the output string: {output_string}.")
return -1
elif position_yes_and_a != -1 and position_no_and_b != -1:
return 1 if position_yes_and_a < position_no_and_b else 0
else:
return 0 if position_yes_and_a == -1 else 1
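# Illustrative usage: the earliest word-boundary match of "yes"/"A" versus "no"/"B"
# decides the label, e.g.
#   extract_answer('Yes, the dog is on the left.', 'yes_no')               -> 1
#   extract_answer('B. The statement does not match.', 'multiple_choice')  -> 0
#   extract_answer('I am not sure.', 'multiple_choice')                    -> -1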
def get_scores(scores):
"""
Calculate various scores based on the given results.
Args:
scores (dict or list): A dictionary or list containing results where each result can be:
- dict: {id: {"q0_i0": 1 or 0, "q0_i1": 1 or 0, "q1_i0": 1 or 0, "q1_i1": 1 or 0}, ...}
- list: [[q0_i0 (1 or 0), q0_i1 (1 or 0), q1_i0 (1 or 0), q1_i1 (1 or 0)], ...]
The keys "q0_i0", "q0_i1", "q1_i0", "q1_i1" represent combinations of questions and images:
- "q0_i0" means question_0 on image_0
- "q0_i1" means question_0 on image_1
- "q1_i0" means question_1 on image_0
- "q1_i1" means question_1 on image_1
Returns:
dict: A dictionary containing the calculated scores:
- 'Q_Acc': Average question score
- 'I_Acc': Average image score
- 'Acc': Average binary VQA score
- 'G_Acc': Average group score
"""
Q_Acc = 0.0
I_Acc = 0.0
Acc = 0.0
G_Acc = 0.0
num_samples = len(scores)
def calculate_image_score(result):
image_correct = 0
if isinstance(result, dict):
if result["q0_i0"] == 1.0 and result["q1_i0"] == 0.0:
image_correct += 1
if result["q1_i1"] == 1.0 and result["q0_i1"] == 0.0:
image_correct += 1
elif isinstance(result, list):
if result[0] == 1.0 and result[2] == 0.0:
image_correct += 1
if result[3] == 1.0 and result[1] == 0.0:
image_correct += 1
return image_correct
def calculate_question_score(result):
text_correct = 0
if isinstance(result, dict):
if result["q0_i0"] == 1.0 and result["q0_i1"] == 0.0:
text_correct += 1
if result["q1_i1"] == 1.0 and result["q1_i0"] == 0.0:
text_correct += 1
else:
if result[0] == 1.0 and result[1] == 0.0:
text_correct += 1
if result[3] == 1.0 and result[2] == 0.0:
text_correct += 1
return text_correct
def calculate_binary_score(result):
binary_score_correct = 0
if isinstance(result, dict):
binary_score_correct += 1 if result["q0_i0"] == 1.0 else 0
binary_score_correct += 1 if result["q0_i1"] == 0.0 else 0
binary_score_correct += 1 if result["q1_i0"] == 0.0 else 0
binary_score_correct += 1 if result["q1_i1"] == 1.0 else 0
else:
binary_score_correct += 1 if result[0] == 1.0 else 0
binary_score_correct += 1 if result[1] == 0.0 else 0
binary_score_correct += 1 if result[2] == 0.0 else 0
binary_score_correct += 1 if result[3] == 1.0 else 0
return binary_score_correct
def calculate_group(result):
group_correct = 0
if calculate_question_score(result) == 2 and calculate_image_score(result) == 2:
group_correct += 1
return group_correct
if isinstance(scores, dict):
for _, result in scores.items():
Q_Acc += calculate_question_score(result)
I_Acc += calculate_image_score(result)
Acc += calculate_binary_score(result)
G_Acc += calculate_group(result)
else:
for result in scores:
Q_Acc += calculate_question_score(result)
I_Acc += calculate_image_score(result)
Acc += calculate_binary_score(result)
G_Acc += calculate_group(result)
results = {
'Q_Acc': Q_Acc / float(num_samples * 2),
'I_Acc': I_Acc / float(num_samples * 2),
'Acc': Acc / float(num_samples * 4),
'G_Acc': G_Acc / num_samples
}
return results
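# Illustrative sketch: each entry holds the predicted labels (1 = "yes"/"A",
# 0 = "no"/"B") for [q0_i0, q0_i1, q1_i0, q1_i1], and the scorer rewards the
# designed pattern [1, 0, 0, 1], so for a single group:
#   get_scores([[1, 0, 0, 1]])  -> {'Q_Acc': 1.0, 'I_Acc': 1.0, 'Acc': 1.0, 'G_Acc': 1.0}
#   get_scores([[1, 1, 1, 1]])  -> {'Q_Acc': 0.0, 'I_Acc': 0.0, 'Acc': 0.5, 'G_Acc': 0.0}
# i.e. a model that always answers "yes" gets half the binary accuracy but a zero group score.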

View File

@@ -0,0 +1,532 @@
import re
import json
from math import isclose
import sympy as sp
from sympy import simplify, Eq, sympify, evalf, Pow
from sympy.parsing.latex import parse_latex
import antlr4
from decimal import Decimal, getcontext
from fractions import Fraction
import sys
import math
chinese_answer_type_dict = {
'Numerical': '数值',
'Expression': '表达式',
'Equation': '方程',
'Interval': '区间'
}
english_answer_type_dict = {
'Numerical': 'a numerical value',
'Expression': 'an expression',
'Equation': 'an equation',
'Interval': 'an interval'
}
def get_single_answer_type_text(answer_type, is_chinese):
if '-' in answer_type: # No need now
answer_type = answer_type[:answer_type.find('-')]
for t in ['Numerical', 'Expression', 'Equation', 'Interval']:
if t in answer_type:
if is_chinese:
return chinese_answer_type_dict[t]
else:
return english_answer_type_dict[t]
exit(f'Error parsing answer type {answer_type}!')
def get_answer_type_text(answer_type, is_chinese, multiple_answer):
# 'Tuple' has various meanings in different context, such as position or values of a series of variable,
# so it may lead to confusion to directly use 'tuple' in the prompt.
if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type):
full_answer_text = ''
else:
if not multiple_answer:
answer_text = get_single_answer_type_text(answer_type, is_chinese)
if is_chinese:
full_answer_text = f',答案类型为{answer_text}'
else:
full_answer_text = f"The answer of The problem should be {answer_text}. "
else:
if ',' not in answer_type: # Same answer type for all answers
answer_text = get_single_answer_type_text(answer_type, is_chinese)
if is_chinese:
full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
else:
full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
else:
answer_types = answer_type.split(',')
answer_types = [get_single_answer_type_text(t, is_chinese) for t in answer_types]
if len(set(answer_types)) == 1:
answer_text = answer_types[0]
if is_chinese:
full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
else:
full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
else:
if is_chinese:
answer_text = ''.join(answer_types)
full_answer_text = f',题目有多个答案,答案类型分别为{answer_text}'
else:
answer_text = ', '.join(answer_types)
full_answer_text = (
f'The problem has multiple answers, with the answers in order being {answer_text}. '
)
return full_answer_text
def make_input(prompt, question_content):
# diversified based on the vllm, which is not implemented temporarily
input = prompt + '\n' + question_content
return input
sys.set_int_max_str_digits(1000000)
# Set the precision for Decimal arithmetic
getcontext().prec = 50
class MathJudger:
def __init__(self):
self.special_signal_map = {
"\\left": "",
"\\right": "",
"": ":",
"": ",",
"$": "",
"\\approx": "=",
"\\simeq": "=",
"\\sim": "=",
"^\\prime": "'",
"^{\\prime}": "'",
"^\\circ": "",
"%": "",
}
self.pi = parse_latex("\\pi")
self.precision = 1e-8
def split_by_comma(self, expr: str):
in_bracket_num = 0
splitted_expr = []
start_idx = 0
for i, char in enumerate(expr):
if char == "(" or char == "[":
in_bracket_num += 1
elif char == ")" or char == "]":
in_bracket_num -= 1
elif char == "," and in_bracket_num == 0:
splitted_expr.append(expr[start_idx:i].strip())
start_idx = i + 1
if start_idx < len(expr):
splitted_expr.append(expr[start_idx:].strip())
return splitted_expr
def trans_plus_minus_sign(self, expr_list: list):
new_expr_list = []
for expr in expr_list:
if "\\pm" in expr:
new_expr_list.append(expr.replace("\\pm", "+"))
new_expr_list.append(expr.replace("\\pm", "-"))
else:
new_expr_list.append(expr)
return new_expr_list
def judge(self, expression1, expression2, precision=1e-8):
# (expression1 is assumed to be the ground truth)
precision = precision if isinstance(precision, list) else [precision]
try:
expression1, expression2 = self.preprocess(expression1, expression2)
except:
return False
if expression1 == expression2:
# print("原生相等")
return True
# 去除字符串中的中文字符,因为上面已经判断过了类似回答为"能"或"不能"的含有中文字符的回答情况
expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1)
expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2)
expression1 = self.split_by_comma(expression1)
expression2 = self.split_by_comma(expression2)
temp_list1 = self.trans_plus_minus_sign(expression1)
temp_list2 = self.trans_plus_minus_sign(expression2)
# Build the list of tolerance values
if len(precision) <= 1:
precision = precision * len(temp_list1)
if len(temp_list1) != len(temp_list2):
return False
# Check whether the elements of the two lists can be paired off as equal, which supports comparing multiple answers
idx = -1
while len(temp_list1) != 0:
idx = (idx + 1) % len(temp_list1)
item1 = temp_list1[idx]
self.precision = precision[idx]
# print(self.precision)
for item2 in temp_list2:
if self.is_equal(item1, item2):
temp_list1.remove(item1)
temp_list2.remove(item2)
precision.remove(self.precision)
break
else:
# If we didn't break from the inner loop, it means no match was found
return False
# If all elements are matched and removed, the lists can be paired
return True
def is_interval(self, epr):
return epr.startswith(("(", "[")) and epr.endswith((")", "]"))
# Before numerical evaluation, replace the symbolic pi from sympy with its numerical approximation
def sympy_sub_pi(self, expression_sympy):
return expression_sympy.subs(self.pi, math.pi)
# The first expression is assumed to be the ground truth
def is_equal(self, expression1, expression2):
if expression1 == expression2 and expression1 != "" and expression2 != "":
# print("原生等价")
return True
# 先判断是否是两个区间,是的话进行判断相等,不相等则返回 False
if self.is_interval(expression1) and self.is_interval(expression2):
try:
if self.interval_equal(expression1, expression2):
# print("区间等价")
return True
except:
return False
# Next, check numerical equality
try:
if self.numerical_equal(expression1, expression2):
# print("Numerically equivalent")
return True
except:
pass
# Next, check expression equality
try:
if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
# print("Expressions are equivalent")
return True
except:
pass
# Finally, check equation equality
try:
if self.equation_equal(expression1, expression2):
# print("Equations are equivalent")
return True
except:
pass
return False
# Check whether two numerical values are equal within the allowed tolerance
def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
"""
(expression1 is assumed to be the ground truth)
Function: decide whether two numerical values are equal within the allowed tolerance
Step 1: also cover the case where the reference value may be expressed as a percentage
Step 2: check equality against the tolerance (absolute difference, originally math.isclose)
"""
reference = float(expression1)
prediction = float(expression2)
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
# if isclose(item, prediction, abs_tol=self.precision, rel_tol=0):
if abs(item - prediction) <= self.precision * 1.01:
return True
return False
def expression_equal(self, exp1, exp2):
"""
(expression1 is assumed to be the ground truth)
Function: decide whether two expressions are mathematically equivalent
Step 1: extract the expression, since some models answer "x=1" instead of "1"
Step 2: use sympy to check the equivalence
"""
# Only keep the expression on the right of the equals sign; the left side is usually the quantity being solved for
def extract_expression(expression):
if "=" in expression:
expression = expression.split("=")[1]
return expression.strip()
exp1 = extract_expression(exp1)
exp2 = extract_expression(exp2)
exp_too_long = len(exp1) > 300 or len(exp2) > 300
# Convert the expressions into a form sympy can handle
expr1_sym = sympify(parse_latex(exp1))
expr2_sym = sympify(parse_latex(exp2))
if expr1_sym == expr2_sym:
return True
else:
expr1_sym = self.sympy_sub_pi(expr1_sym)
expr2_sym = self.sympy_sub_pi(expr2_sym)
# If the input expressions can be evaluated to concrete numbers, compare them numerically
if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or (
not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
return False
elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
try:
if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
print(
"These two number can not be calculated by current computer for: "
f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\""
)
return False
if exp_too_long:
print(f'Expression {exp1} or {exp2} is too long to compute. ')
return False
if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
return True
else:
return False
except:
return False
elif exp_too_long:
print(f'Expression {exp1} or {exp2} is too long to compute. ')
return False
else:
try:
simplified_expr = simplify(expr1_sym - expr2_sym)
num_value = simplified_expr.evalf()
return abs(num_value) < 1e-3
except:
return False
def equation_equal(self, expression1, expression2):
"""
(expression1 is assumed to be the ground truth)
Function: decide whether two equations are mathematically equivalent
Step 1: reduce each equation to a canonical form whose right-hand side is strictly 0, so that only the left-hand sides need to be compared
Step 2: use sympy to compute the quotient of the two left-hand sides; if that quotient (or its reciprocal) is a non-zero integer, the two equations are mathematically equivalent
"""
# Move the right-hand side of the equation to the left and return a sympy expression
def simplify_equation(latex_eq):
# Split the equation into its left-hand and right-hand sides
lhs, rhs = latex_eq.split('=')
# Parse the LaTeX expressions with parse_latex
lhs_expr = parse_latex(lhs)
rhs_expr = parse_latex(rhs)
# Build the equation object
equation = Eq(lhs_expr, rhs_expr)
# Simplify the equation: move the right-hand side to the left
simplified_eq = simplify(equation.lhs - equation.rhs)
return simplified_eq
expr1_sym = simplify_equation(expression1)
expr2_sym = simplify_equation(expression2)
division_result_1 = simplify(expr1_sym / expr2_sym)
division_result_2 = simplify(expr2_sym / expr1_sym)
# If the quotient of the two transformed expressions is a non-zero integer, the two equations can be shown to be equivalent
if (division_result_1.is_Integer and division_result_1 != 0) or (
division_result_2.is_Integer and division_result_2 != 0):
return True
else:
return False
def interval_equal(self, expression1, expression2):
# Function: decide whether two intervals are mathematically equivalent
# Step 1: simplify the interval expressions, removing decorative symbols such as "\left" and "\right" and any leading "x \in"
# Step 2: compare the bracket types on both ends and the mathematical expressions inside
def compare_two_interval(inter1, inter2):
# First compare the brackets on both ends; only continue if they match
if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
return False
inter1 = inter1.strip('[]()')
inter2 = inter2.strip('[]()')
# Split the interval into its left and right parts
items_1 = inter1.split(',')
items_2 = inter2.split(',')
for item_1, item_2 in zip(items_1, items_2):
if not self.expression_equal(item_1, item_2):
return False
return True
interval1 = expression1
interval2 = expression2
if interval1 == interval2:
return True
else:
inter_list1 = interval1.split("\\cup")
inter_list2 = interval2.split("\\cup")
if len(inter_list1) != len(inter_list2):
return False
else:
for inter1, inter2 in zip(inter_list1, inter_list2):
if not compare_two_interval(inter1, inter2):
return False
return True
def preprocess(self, expression1, expression2):
# Try to capture the content of \boxed{}; if there are several, join them with commas, and raise an error if the braces are malformed
def extract_boxed_content(latex_str):
# Find all \boxed{...} structures
boxed_matches = re.finditer(r'\\boxed{', latex_str)
results = ""
for match in boxed_matches:
start_index = match.end()
end_index = start_index
stack = 1
# Search from just after \boxed{ until the matching closing brace is found
while stack > 0 and end_index < len(latex_str):
if latex_str[end_index] == '{':
stack += 1
elif latex_str[end_index] == '}':
stack -= 1
end_index += 1
if stack == 0:
# Extract the content inside \boxed{}
content = latex_str[start_index:end_index - 1]
results += content + ","
else:
# If the braces are not properly closed, raise an error
raise ValueError("Mismatched braces in LaTeX string.")
# If no '\boxed{}' is matched, fall back to extracting all $...$ formula parts from the last non-empty line of the text
if results == "":
last_line_ans = latex_str.strip().split("\n")[-1]
dollar_pattern = r"\$(.*?)\$"
answers = re.findall(dollar_pattern, last_line_ans)
if answers:
for ans in answers:
results += ans + ","
else:
results = latex_str
return results
def sepcial_symbol_replace(expression):
if "\\in " in expression:
expression = expression.split("\\in ")[1]
# Replace special characters; they are purely cosmetic/decorative and do not affect LaTeX parsing
for signal in self.special_signal_map:
expression = expression.replace(signal, self.special_signal_map[signal])
expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。")
pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
expression = re.sub(pattern, r'\1', expression)
return expression
exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)
return exp1, exp2
def can_compute_power(self, expr):
"""
Check if the power expression can be computed.
Parameters:
expr (sympy expression): The expression to check.
Returns:
bool: True if the expression can be computed, False otherwise.
"""
# Check if the expression is a power expression
if isinstance(expr, Pow):
# Extract the base and the exponent
base, exp = expr.as_base_exp()
# Check if the base and the exponent are numbers
if base.is_number and exp.is_number:
# Set a threshold for the maximum size of the exponent
MAX_EXP = 1000 # This threshold can be adjusted based on the computing environment
# Check if the exponent is greater than the threshold
if abs(exp.evalf()) > MAX_EXP:
return False
else:
return True
else:
# If the base or the exponent is not a number, we cannot compute the power
return False
else:
# If the expression is not a power expression, return True as it is not the case we are checking for
return True
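# Illustrative usage of MathJudger (a sketch; assumes latex2sympy2/antlr4 and sympy are
# installed, as the imports above require). judge() first extracts \boxed{} content,
# strips decorative LaTeX, then tries interval, numerical, expression, and equation
# equality in turn, e.g.:
#   judger = MathJudger()
#   judger.judge('$\\boxed{\\frac{1}{2}}$', 'The answer is $\\boxed{0.5}$')  # -> True
#   judger.judge('$\\boxed{x^2+1}$', '$\\boxed{1+x^2}$')                     # -> True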
def extract_answer(is_chinese, model_output, is_deepseek=False):
# deepseekmath has special answering format
if str(model_output) == 'nan':
model_output = 'nan'
if is_deepseek:
if is_chinese:
matches = re.findall('## 解题答案(.*)', model_output)
else:
matches = re.findall('The answer is: (.*)', model_output)
# Check whether at least one match was found; if not, pass the whole output on to the \boxed{} extraction
if matches:
# If there are several matches, take the last one
model_answer = matches[-1].strip()
return model_answer
else:
return model_output
if is_chinese:
matches = re.findall('所以最终答案是(.*)', model_output)
else:
matches = re.findall('So the final answer is (.*)', model_output)
# Check whether at least one match was found; if not, pass the whole output on to the \boxed{} extraction
if matches:
# If there are several matches, take the last one
model_answer = matches[-1].strip()
return model_answer
else:
return model_output
def calculate_merged_accuracy(reference_dir, text_only):
pass

View File

@@ -0,0 +1,123 @@
from ...smp import *
from ...utils import can_infer
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_ICE_for_qspatial():
example_1 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, m), (2.2, cm), (3.12, meter), at the end.\n
Model response: **Object Identification**
* The object in question is a chair.
* The chair is not visible in the image.
**Conclusion**
The height of the chair cannot be determined from the provided image.\n
Extracted answer: (0, cm)
"""
example_2 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, inch), (1.2, cm), (3.0, feet), at the end.\n
Model response: **Step 1: Identify the stapler and the recycle bin in the image.**
The stapler is located on the wooden table, and the recycle bin is located on the floor.
**Step 2: Determine the distance between the stapler and the recycle bin.**
The stapler is 0.5 meters from the edge of the table, and the recycle bin is 1.5 meters from the edge of the table.
Therefore, the minimum distance between the stapler and the recycle bin is 1.5 - 0.5 = 1 meter.
**Answer:** 1 m\n
Extracted answer: (1, m)
"""
example_3 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, foot), (2, cm), (4.3, meter), at the end.\n
Model response: The mirror in the image is approximately 5 feet 4 inches tall.\n
Extracted answer: (64, inch)
"""
example_4 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (0.1, cm), (2.9, cm), (0.3, meter), at the end.\n
Model response: The minimum distance between the wooden chair and the chair near the camera in the image is 1.7 feet.\n
Extracted answer: (1.7, feet)
"""
example_5 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (5.1, cm), (0.9, cm), (55, mm), at the end.\n
Model response: The height of the painting's bottom edge from the floor is approximately 4.5 feet.\n
Extracted answer: (4.5, feet)
"""
return [example_1, example_2, example_3, example_4, example_5]
def list_to_dict(lst):
return {chr(65 + i): val for i, val in enumerate(lst)}
def post_check(line, prefetch=False):
res = None
ans = line['answer']
response = line['prediction'] if prefetch else line['res']
try:
if line['question_type'] == 'multi_choice':
ans = line['answer_option']
choices = list_to_dict(eval(line['choices']))
res = can_infer(response, choices)
if prefetch:
return res
else:
if line['answer_type'] == 'integer':
res = int(response)
ans = int(line['answer'])
elif line['answer_type'] == 'float':
res = float(response)
ans = float(line['answer'])
else:
res = str(response)
ans = str(ans)
except ValueError:
pass
if res == ans:
return res if prefetch else True
else:
return False
def build_qspatial_gpt4_prompt(line):
task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE_for_qspatial()
for example in examples:
prompt += example + '\n'
prompt += 'Model response: ' + prediction
prompt += '\nExtracted answer:'
return prompt
def QSpatial_auxeval(model, line):
prompt = build_qspatial_gpt4_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log=log, res=res)
log += 'All 5 retries failed.\n'
return dict(log=log, res='')

View File

@@ -0,0 +1,500 @@
"""
Copied from https://github.com/allenai/allennlp-semparse
Modified from https://github.com/naver-ai/tablevqabench
"""
import re
import unicodedata
import time
from abc import ABCMeta, abstractmethod
from math import isinf, isnan
# Vision Prompts
VWTQ_PROMPT = (
'You are asked to answer questions asked on an image.\n'
'You should answer the question with a single word.\n'
'Example: \n'
'Question: what was the only year mr. wu competed in the olympic games?\n'
'Answer: 2004\n'
'Question: which township in pope county, arkansas has the least amount of water area?\n'
'Answer: Freeman\n'
'If you have multiple answers, please separate them with || marks. Example: Apple||Banana||Tomato\n\n'
'Question: {question}\n'
'Answer:'
)
VTABFACT_PROMPT = (
'You are asked to answer whether the statement is True or False based on given image\n'
'You should only answer True or False.\n'
'Example: \n'
'Statement: the milwaukee buck win 6 game in the 2010 - 11 season\n'
'Answer: True\n'
'Statement: only the top team score above the average of 8.8\n'
'Answer: False\n\n'
'Statement: {question}\n'
'Answer:'
)
FINTABNETQA_PROMPT = (
'You are asked to answer questions asked on an image.\n'
'You should answer the question within a single word or a few words.\n'
'If units can be known, the answer should include units such as $, %, million, etc.\n'
'Example: \n'
'Question: What were the total financing originations for the fiscal year ended October 31, 2004?\n'
'Answer: $3,852 million\n'
'Question: What is the time period represented in the table?\n'
'Answer: October 31\n'
'Question: What was the percentage of net sales for selling, general and administrative expenses in 2006?\n'
'Answer: 34.2%\n'
'Question: {question}\n'
'Answer:'
)
def evaluate_tabfact(data, score_keys):
num_examples = 0
num_correct = 0
manual_check = 0
start_time = time.time()
for instance in data:
if instance['prediction'] is None:
instance['prediction'] = 'none'
pred = instance['prediction'].lower()
gt = instance['answer']
num_examples += 1
if 'true' in pred and 'false' in pred:
manual_check += 1
score = None
elif 'true' in pred and gt == '1':
num_correct += 1
score = 1
elif 'false' in pred and gt == '0':
num_correct += 1
score = 1
else:
score = 0
instance['scores'] = {score_keys[0]: score}
if manual_check > 0:
print(f'number of samples not properly parsed: {manual_check}')
end_time = time.time()
elapsed_time = end_time - start_time
Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100
meta = {
'evaluators': 'correctness',
'score_info': [score_keys[0]],
'evaluated_time': elapsed_time,
'total_num_sample': len(data),
'average_scores': [Accuracy],
}
return meta
def evaluate_wtq(data, score_keys):
num_examples = 0
num_correct = 0
start_time = time.time()
for instance in data:
pred = instance['prediction'].replace('||', '|')
gt = instance['answer']
original_strings = tsv_unescape_list(gt)
target_values = to_value_list(original_strings)
predicted_strings = tsv_unescape_list(pred)
predicted_values = to_value_list(predicted_strings)
correct = check_denotation(target_values, predicted_values)
num_examples += 1
score = 0
if correct:
num_correct += 1
score = 1
instance['scores'] = {score_keys[0]: score}
end_time = time.time()
elapsed_time = end_time - start_time
Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100
meta = {
'evaluators': 'correctness',
'score_info': [score_keys[0]],
'evaluated_time': elapsed_time,
'total_num_sample': len(data),
'average_scores': [Accuracy],
}
return meta
def evaluate_fintabnet(data, score_keys):
num_examples = 0
num_correct, _num_correct = 0, 0
start_time = time.time()
for instance in data:
pred, preds = fintabnet_normalize(instance['prediction'])
gt, gts = fintabnet_normalize(instance['answer'])
correct = 1 if gt == pred else 0
_correct = any(_pred == _gt for _pred in preds for _gt in gts)
num_examples += 1
score, _score = 0, 0
if correct:
num_correct += 1
score = 1
if _correct:
_num_correct += 1
_score = 1
instance['scores'] = {score_keys[0]: _score, 'exact_score': score}
end_time = time.time()
elapsed_time = end_time - start_time
Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100
_Accuracy = round((_num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100
meta = {
'evaluators': 'correctness',
'score_info': ['relieved_accuracy', score_keys[0]],
'evaluated_time': elapsed_time,
'total_num_sample': len(data),
'average_scores': [_Accuracy, Accuracy],
}
return meta
def fintabnet_normalize(s):
s = normalize(s)
remove_words = [
'dollar', 'gallons', 'square feet', 'shares', 'mbtu',
'mbpd', 'mbbls', 'mmbtu', 'unit', 'gwh', 'year', 'mmcf', 'mile', 'mboe'
]
# Data specific filtering using regular expressions
# Remove special characters: $, (, ), and commas
s = re.sub(r'[\$\(\),]', '', s)
# Replace "dollar" with empty string if it's not part of another word
pattern = r'\b(' + '|'.join(remove_words) + r')s?\b'
s = re.sub(pattern, '', s, flags=re.IGNORECASE)
# Unit conversion dictionary with regex patterns for flexibility
unit_conversion = {
r' \bthousand\b': 'e3',
r' \bmillion\b': 'e6',
r' \bbillion\b': 'e9',
r'\bthousand\b': 'e3',
r'\bmillion\b': 'e6',
r'\bbillion\b': 'e9',
r' ?%': 'e-2',
}
# Convert percentages to their decimal representation.
# Applying this after unit_conversion prevents "percent" from being processed
# in cases like "million %", which would be incorrect.
# s = re.sub(r' ?%', 'e-2', s)
# s_percent = re.sub(r' ?%', '', s_percent)
s_unit_free = s
# Iterate over unit_conversion and apply transformations
for pattern, value in unit_conversion.items():
s = re.sub(pattern, value, s)
s_unit_free = re.sub(pattern, '', s_unit_free)
# Attempt to convert to float
try:
return float(s), [float(s), float(s_unit_free)]
except ValueError:
# Return the original string and the error for debugging purposes
return s, [s, s_unit_free]
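# Example values traced through the normalization rules above:
#   fintabnet_normalize('$3,852 million') -> (3852000000.0, [3852000000.0, 3852.0])
#   fintabnet_normalize('34.2%')          -> (0.342, [0.342, 34.2])
#   fintabnet_normalize('October 31')     -> ('october 31', ['october 31', 'october 31'])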
def normalize(x):
if not isinstance(x, str):
x = x.decode('utf8', errors='ignore')
# Remove diacritics
x = ''.join(
c for c in unicodedata.normalize('NFKD', x) if unicodedata.category(c) != 'Mn'
)
# Normalize quotes and dashes
x = re.sub(r'[´`]', "'", x)
x = re.sub(r'[“”]', '"', x)
x = re.sub(r'[‐‑‒–—−]', '-', x)
while True:
old_x = x
# Remove citations
x = re.sub(r'((?<!^)\[[^\]]*\]|\[\d+\]|[•♦†‡*#+])*$', '', x.strip())
# Remove details in parenthesis
x = re.sub(r'(?<!^)( \([^)]*\))*$', '', x.strip())
# Remove outermost quotation mark
x = re.sub(r'^"([^"]*)"$', r'\1', x.strip())
if x == old_x:
break
# Remove final '.'
if x and x[-1] == '.':
x = x[:-1]
# Collapse whitespaces and convert to lower case
x = re.sub(r'\s+', ' ', x, flags=re.U).lower().strip()
return x
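# Illustrative inputs and outputs, traced through the rules above:
#   normalize('"The Beatles"')       -> 'the beatles'    (outer quotes stripped)
#   normalize('Paris (France) [1]')  -> 'paris'          (citation and parenthetical removed)
#   normalize('  3,852  Million. ')  -> '3,852 million'  (whitespace collapsed, final period dropped)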
# Value Types
class Value(metaclass=ABCMeta):
# Should be populated with the normalized string
_normalized = None
@abstractmethod
def match(self, other):
"""Return True if the value matches the other value.
Args:
other (Value)
Returns:
a boolean
"""
pass
@property
def normalized(self):
return self._normalized
class StringValue(Value):
def __init__(self, content):
assert isinstance(content, str)
self._normalized = normalize(content)
self._hash = hash(self._normalized)
def __eq__(self, other):
return isinstance(other, StringValue) and self.normalized == other.normalized
def __hash__(self):
return self._hash
def __str__(self):
return 'S' + str([self.normalized])
def __repr__(self):
return self.__str__()
def match(self, other):
assert isinstance(other, Value)
return self.normalized == other.normalized
class NumberValue(Value):
def __init__(self, amount, original_string=None):
assert isinstance(amount, (int, float))
if abs(amount - round(amount)) < 1e-6:
self._amount = int(amount)
else:
self._amount = float(amount)
if not original_string:
self._normalized = str(self._amount)
else:
self._normalized = normalize(original_string)
self._hash = hash(self._amount)
@property
def amount(self):
return self._amount
def __eq__(self, other):
return isinstance(other, NumberValue) and self.amount == other.amount
def __hash__(self):
return self._hash
def __str__(self):
return 'N({})'.format(self.amount) + str([self.normalized])
def __repr__(self):
return self.__str__()
def match(self, other):
assert isinstance(other, Value)
if self.normalized == other.normalized:
return True
if isinstance(other, NumberValue):
return abs(self.amount - other.amount) < 1e-6
return False
@staticmethod
def parse(text):
"""Try to parse into a number.
Return:
the number (int or float) if successful; otherwise None.
"""
try:
return int(text)
except ValueError:
try:
amount = float(text)
assert not isnan(amount) and not isinf(amount)
return amount
except ValueError:
return None
class DateValue(Value):
def __init__(self, year, month, day, original_string=None):
"""Create a new DateValue. Placeholders are marked as -1."""
assert isinstance(year, int)
assert isinstance(month, int) and (month == -1 or 1 <= month <= 12)
assert isinstance(day, int) and (day == -1 or 1 <= day <= 31)
assert not (year == month == day == -1)
self._year = year
self._month = month
self._day = day
if not original_string:
self._normalized = '{}-{}-{}'.format(
year if year != -1 else 'xx',
month if month != -1 else 'xx',
day if day != -1 else 'xx',
)
else:
self._normalized = normalize(original_string)
self._hash = hash((self._year, self._month, self._day))
@property
def ymd(self):
return (self._year, self._month, self._day)
def __eq__(self, other):
return isinstance(other, DateValue) and self.ymd == other.ymd
def __hash__(self):
return self._hash
def __str__(self):
return ('D(%d,%d,%d)' % (self._year, self._month, self._day)) + str(
[self._normalized]
)
__repr__ = __str__
def match(self, other):
assert isinstance(other, Value)
if self.normalized == other.normalized:
return True
if isinstance(other, DateValue):
return self.ymd == other.ymd
return False
@staticmethod
def parse(text):
"""Try to parse into a date.
Return:
tuple (year, month, date) if successful; otherwise None.
"""
try:
ymd = text.lower().split('-')
assert len(ymd) == 3
year = -1 if ymd[0] in ('xx', 'xxxx') else int(ymd[0])
month = -1 if ymd[1] == 'xx' else int(ymd[1])
day = -1 if ymd[2] == 'xx' else int(ymd[2])
assert not (year == month == day == -1)
assert month == -1 or 1 <= month <= 12
assert day == -1 or 1 <= day <= 31
return (year, month, day)
except (ValueError, AssertionError):
return None
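# Illustrative parses (placeholders written as 'xx'):
#   DateValue.parse('2008-04-xx') -> (2008, 4, -1)
#   DateValue.parse('xx-xx-05')   -> (-1, -1, 5)
#   DateValue.parse('April 2008') -> None  (not in year-month-day form)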
# Value Instantiation
def to_value(original_string, corenlp_value=None):
"""Convert the string to Value object.
Args:
original_string (basestring): Original string
corenlp_value (basestring): Optional value returned from CoreNLP
Returns:
Value
"""
if isinstance(original_string, Value):
# Already a Value
return original_string
if not corenlp_value:
corenlp_value = original_string
# Number?
amount = NumberValue.parse(corenlp_value)
if amount is not None:
return NumberValue(amount, original_string)
# Date?
ymd = DateValue.parse(corenlp_value)
if ymd is not None:
if ymd[1] == ymd[2] == -1:
return NumberValue(ymd[0], original_string)
else:
return DateValue(ymd[0], ymd[1], ymd[2], original_string)
# String.
return StringValue(original_string)
def to_value_list(original_strings, corenlp_values=None):
"""Convert a list of strings to a list of Values
Args:
original_strings (list[basestring])
corenlp_values (list[basestring or None])
Returns:
list[Value]
"""
assert isinstance(original_strings, (list, tuple, set))
if corenlp_values is not None:
assert isinstance(corenlp_values, (list, tuple, set))
assert len(original_strings) == len(corenlp_values)
return list(
set(to_value(x, y) for (x, y) in zip(original_strings, corenlp_values))
)
else:
return list(set(to_value(x) for x in original_strings))
# Check the Predicted Denotations
def check_denotation(target_values, predicted_values):
"""Return True if the predicted denotation is correct.
Args:
target_values (list[Value])
predicted_values (list[Value])
Returns:
bool
"""
# Check size
if len(target_values) != len(predicted_values):
return False
# Check items
for target in target_values:
if not any(target.match(pred) for pred in predicted_values):
return False
return True
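# Illustrative checks (hypothetical denotations; order within each list is irrelevant):
#   check_denotation(to_value_list(['2004']), to_value_list(['2004.0']))     -> True
#   check_denotation(to_value_list(['Freeman']), to_value_list(['freeman'])) -> True
#   check_denotation(to_value_list(['a', 'b']), to_value_list(['a']))        -> False  (size mismatch)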
# Batch Mode
def tsv_unescape(x):
"""Unescape strings in the TSV file.
Escaped characters include:
newline (0x0A) -> backslash + n
vertical bar (0x7C) -> backslash + p
backslash (0x5C) -> backslash + backslash
Args:
x (str or unicode)
Returns:
a unicode
"""
return x.replace(r'\n', '\n').replace(r'\p', '|').replace('\\\\', '\\')
def tsv_unescape_list(x):
"""Unescape a list in the TSV file.
List items are joined with vertical bars (0x7C)
Args:
x (str or unicode)
Returns:
a list of unicodes
"""
return [tsv_unescape(y) for y in x.split('|')]
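# Illustrative unescaping (raw strings show the escaped TSV form):
#   tsv_unescape(r'one\ntwo')  -> 'one' + '\n' + 'two'  (real newline)
#   tsv_unescape(r'a\pb')      -> 'a|b'
#   tsv_unescape_list('Apple|Banana|Tomato') -> ['Apple', 'Banana', 'Tomato']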

View File

@@ -0,0 +1,254 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
from PIL import Image, ImageOps
import numpy as np
sys_prompt = "You are an AI assistant for question answering."
system_prompt_multi_choice = (
"You will receive a multi-choice question, the ground-truth answer and the prediction from a question answering (QA) model. " # noqa
"Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. "
"If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"."
)
system_prompt_caption_matching = (
"You will receive a caption matching question, the ground-truth answer and the prediction from a question answering (QA) model. " # noqa
"Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. "
"If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"."
)
system_prompt_captioning = """
You will receive a video description and a multi-choice question. Your task is to choose the correct answer and briefly explain the reason why you choose the answer. \
If none of the choice candidates are correct or the video description lacks enough information to answer the question, just answer "None of the choices are correct". \
Please organize your response in this format:
```
Reasoning: [Your reason to obtain the answer]
Answer: [Your answer]
```
Here are some examples of video description, multi-choice question and the expected answer:
```
Video Description: A person is playing football.
Multi-Choice Question:
What is the person doing in the video?
A. cooking
B. playing football
C. playing basketball
D. reading book
Reasoning: The video description mentions that the person is playing football.
Answer: B. playing football
Video Description: A bird is flying clockwise.
Multi-Choice Question:
In which direction is the bird flying?
A. backward
B. counter-clockwise
C. clockwise
D. downward
Reasoning: The video description mentions that the bird is flying clockwise
Answer: C. clockwise
Video Description: An air balloon is inflating.
Multi-Choice Question:
What is happening to the air balloon?
A. exploding
B. getting smaller
C. flying
Reasoning: The video description mentions that the air balloon is inflating, while none of the choices can be explained as inflating.
Answer: None of the choices are correct
```
""" # noqa
system_prompt_YorN = """
You will receive a Yes/No question, the ground-truth answer and the prediction from a question answering (QA) model. \
Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. \
If the prediction is correct, respond "Correct". If the prediction is incorrect, respond "Incorrect".
""" # noqa
def eval_rule_caption_matching(line):
# Determine whether the video llm output is correct, based on word matching rules
video_llm_output = line['prediction']
answer = line['answer']
option_strs = eval(line['candidates']) # complete option strings
option_sents = [opt.split(': ')[1] for opt in option_strs] # option sentence
# option index, e.g., Sentence A, Caption A, Option 1
option_inds = [opt.split(': ')[0] for opt in option_strs] + [opt.split(': ')[0].replace('Sentence ', '').replace('Option ', '').replace('Caption ', '') for opt in option_strs] # noqa
video_llm_pred = None
for option_str in option_strs:
if option_str == video_llm_output:
video_llm_pred = option_str
for option_sent in option_sents:
if option_sent == video_llm_output or (') ' in video_llm_output and option_sent == video_llm_output.split(') ')[1]): # noqa
video_llm_pred = option_sent
for option_ind in option_inds:
if option_ind == video_llm_output or option_ind == video_llm_output.replace('.', ''): # noqa
video_llm_pred = option_ind
if video_llm_pred is None:
return "fail"
else:
return 1 if video_llm_pred == answer or video_llm_pred == answer.split(":")[0] or video_llm_pred == answer.split(": ")[1] or video_llm_pred == answer.split(": ")[0].split()[1] else 0 # noqa
def eval_rule_multi_choice(line):
if line['prediction'] == line['answer']:
return 1
elif line['prediction'] in ['A', 'B', 'C', 'D']:
return 1 if line['prediction'] == line['answer'][0] else 0
elif any(line['prediction'].startswith(prefix) for prefix in ['A.', 'B.', 'C.', 'D.']):
return 1 if line['prediction'].split('.')[0] == line['answer'][0] else 0
elif any(line['prediction'].startswith(prefix) for prefix in ['A)', 'B)', 'C)', 'D)']):
return 1 if line['prediction'].split(')')[0] == line['answer'][0] else 0
else:
return "fail"
def eval_rule_YorN(video_llm_output):
# Extract the yes/no prediction from the original video llm output
video_llm_output = video_llm_output.lower()
if video_llm_output.startswith("yes"):
return "yes"
elif video_llm_output.startswith("no"):
return "no"
else:
return False
def llm_output_to_rating(llm_output):
if not ('Correct' in llm_output or 'Incorrect' in llm_output):
print(f"Warning: LLM output is not in the correct format: {llm_output}")
rating = 0
return rating
if llm_output.startswith('Correct'):
rating = 1
elif llm_output.startswith('Incorrect'):
rating = 0
elif ('Correct' in llm_output) and ('Incorrect' not in llm_output):
rating = 1
elif 'Incorrect' in llm_output:
rating = 0
return rating
def parse_llm_output(llm_output, gt_answer):
if llm_output == "invalid_request_error" or not llm_output:
eval_result = {"rating": -1, "chatgpt-answer": None, "chatgpt-reasoning": None}
return eval_result
eval_result = {}
lines = llm_output.split("\n")
for line in lines:
line = line.strip()
if "Reasoning" in line:
eval_result['chatgpt-reasoning'] = line.replace("Reasoning:", "").strip()
if "Answer" in line:
eval_result['chatgpt-answer'] = line.replace("Answer:", "").strip()
if "chatgpt-answer" not in eval_result:
eval_result['chatgpt-answer'] = llm_output
if "chatgpt-reasoning" not in eval_result:
eval_result['chatgpt-reasoning'] = None
# Check if the chatgpt answer is the ground-truth answer
# calculate the number of 'A.', 'B.', 'C.', 'D.' in chatgpt-answer
answer_counts = sum(eval_result['chatgpt-answer'].count(prefix) for prefix in ['A.', 'B.', 'C.', 'D.']) # noqa
if eval_result['chatgpt-answer'].split(". ")[0] == gt_answer.split(". ")[0] and answer_counts == 1:
eval_result['rating'] = 1
else:
eval_result['rating'] = 0
return eval_result
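# Illustrative parse of a hypothetical judge output for the captioning task:
#   llm_output = 'Reasoning: The description mentions football.\nAnswer: B. playing football'
#   parse_llm_output(llm_output, gt_answer='B. playing football')
#   -> {'chatgpt-reasoning': 'The description mentions football.',
#       'chatgpt-answer': 'B. playing football', 'rating': 1}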
def evaluate_tempcompass_mcq(model, line):
eval_rules_dict = {
'caption_matching': eval_rule_caption_matching,
'multi-choice': eval_rule_multi_choice
}
gpt_eval_prompt = {
'multi-choice': '{}\nMulti-Choice Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}',
'caption_matching': '{}\nCaption Matching Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}'
}
base_prompt = {
'multi-choice': system_prompt_multi_choice,
'caption_matching': system_prompt_caption_matching
}
eval_result = {
"question": line['question'],
"answer": line['answer'],
"prediction": line['prediction'],
"task_type": line['task_type'],
"candidates": line['candidates'],
"match_success": True
}
result = eval_rules_dict[line['task_type']](line)
if result == "fail":
eval_result['match_success'] = False
if model is None:
eval_result['rating'] = 0
else:
prompt_template = gpt_eval_prompt[line['task_type']]
prompt = prompt_template.format(base_prompt[line['task_type']], line['question'], line['answer'], line['prediction']) # noqa
llm_output = model.generate(prompt)
result = llm_output_to_rating(llm_output)
eval_result['chatgpt-response'] = llm_output
eval_result['rating'] = result
else:
eval_result['rating'] = result
return eval_result
def evaluate_tempcompass_captioning(model, line):
prompt = (
f"{system_prompt_captioning}\n"
f"Video Description:{line['prediction']}\n"
f"Multi-Choice Question:\n{line['mc_question']}\n"
)
if model is not None:
llm_output = model.generate(prompt)
eval_result = parse_llm_output(llm_output, gt_answer=line['mc_answer'])
return eval_result
else:
raise ValueError("Model is None, TempCompass Captioning task not supported exact matching") # noqa
def evaluate_tempcompass_YorN(model, line):
prompt = (
f"{system_prompt_YorN}\n"
f"Yes/No Question:\n{line['question']}\n"
f"Ground-Truth Answer: {line['answer']}\n"
f"Model Prediction: {line['prediction']}"
)
result = eval_rule_YorN(line['prediction'])
eval_result = {
"question": line['question'],
"answer": line['answer'],
"prediction": line['prediction'],
"match_success": True
}
if result:
eval_result['rating'] = 1 if result == line['answer'] else 0
elif model is None:
eval_result['match_success'] = False
eval_result['rating'] = 0
else:
eval_result['match_success'] = False
llm_output = model.generate(prompt)
result = llm_output_to_rating(llm_output)
eval_result['chatgpt-response'] = llm_output
eval_result['rating'] = result
return eval_result
def get_dimension_rating(score_file):
data = load(score_file)
result_dict = {}
for idx, item in data.iterrows():
dict_key = item['dim'] + '. ' + item['task_type']
if dict_key not in result_dict:
result_dict[dict_key] = [0,0]
result_dict[dict_key][0] += int(item['score'])
result_dict[dict_key][1] += 1
return result_dict
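# The returned dict maps '<dim>. <task_type>' to [score_sum, sample_count];
# a per-key accuracy can then be computed as score_sum / sample_count.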

View File

@@ -1,4 +1,5 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
import numpy as np
import re
@@ -97,24 +98,33 @@ def get_dimension_rating(data_path):
for duration in DURATIONS + ['overall']:
overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["domain"].values(), []) if x >= 0]):.2f}'
overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["domain"].values(), []) if x >= 0]):.3f}'
duration_rating[duration]['overall'] = overall_res_dur
for domain in DOMAINS:
domain_res_dur = f'{np.mean([x for x in duration_rating[duration]["domain"][domain] if x >= 0]):.2f}'
domain_res_dur = f'{np.mean([x for x in duration_rating[duration]["domain"][domain] if x >= 0]):.3f}'
duration_rating[duration]['domain'][domain] = domain_res_dur
for sub_ctg in SUB_CATEGORIES:
sub_res_dur = f'{np.mean([x for x in duration_rating[duration]["sub_category"][sub_ctg] if x >= 0]):.2f}'
sub_res_dur = f'{np.mean([x for x in duration_rating[duration]["sub_category"][sub_ctg] if x >= 0]):.3f}'
duration_rating[duration]['sub_category'][sub_ctg] = sub_res_dur
for task_ctg in TASK_CATEGORIES:
task_res_dur = f'{np.mean([x for x in duration_rating[duration]["task_type"][task_ctg] if x >= 0]):.2f}'
task_res_dur = f'{np.mean([x for x in duration_rating[duration]["task_type"][task_ctg] if x >= 0]):.3f}'
duration_rating[duration]['task_type'][task_ctg] = task_res_dur
return duration_rating
def extract_option(model, input_item, dataset_name):
options = input_item['question'].split('\n')[1:]
for id, option in enumerate(options):
option_id = chr(ord('A') + id) + '.'
if option.find(option_id) >= 0:
input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n')
return extract_answer_from_item(model, input_item, dataset_name)['opt']
def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [

View File

@@ -0,0 +1,896 @@
# pylint: skip-file
import pandas as pd
import json
import numpy as np
import os
import argparse
# four_dimensional_metrics.py
# Function to evaluate steps
def evaluate_evaluate_steps(json, steps): # noqa
jokers = [json[[f'joker_{i}', f'knowledge concept_{i}']] for i in range(1, steps + 1)]
for i in range(steps):
jokers[i].rename(
columns={f'joker_{i + 1}': 'joker', f'knowledge concept_{i + 1}': 'knowledge_concept'},
inplace=True,
)
concatenated_steps = pd.concat(jokers, axis=0)
return concatenated_steps
# Function to load and process JSON data
def load_and_process_data(filepath):
df = pd.read_excel(filepath)
if 'hit' not in df.columns:
df['processed_answer'] = (
df['prediction']
.str.split('Answer')
.str[-1]
.str.strip()
.str.replace(r'[>><<:.]', '', regex=True)
.str.strip()
)
df['processed_answer'] = df['processed_answer'].apply(lambda x: x[0] if x and x[0] in 'ABCDEFGH' else None)
df['joker'] = df['processed_answer'] == df['answer']
else:
df['joker'] = df['hit'].astype(bool)
return df
# Function to process steps data and merge results
def evaluate_process_steps_data(df, steps):
steps_data = {f'{steps}steps_{i}': df[df['key'] == f'{steps}steps_{i}'] for i in range(1, steps + 1)}
steps_data[f'{steps}steps_multi'] = df[df['key'] == f'{steps}steps_multi']
for key, data in steps_data.items():
data.columns = [col + f'_{key.split("_")[-1]}' for col in data.columns]
merged_data = steps_data[f'{steps}steps_1']
for i in range(2, steps + 1):
merged_data = pd.merge(
merged_data, steps_data[f'{steps}steps_{i}'], left_on=f'ID_1', right_on=f'ID_{i}', how='left' # noqa
)
merged_data = pd.merge(
merged_data, steps_data[f'{steps}steps_multi'], left_on=f'ID_1', right_on='ID_multi', how='left' # noqa
)
return merged_data
# Function to calculate evaluation metrics
def evaluate_calculate_metrics(merged_2steps, merged_3steps):
metrics = {}
metrics['steps2_filtered_rows_1_loose'] = merged_2steps[
((merged_2steps['joker_1'] == False) & (merged_2steps['joker_2'] == False)) # noqa
& (merged_2steps['joker_multi'] == True) # noqa
]
metrics['steps2_filtered_rows_1_strict'] = merged_2steps[
((merged_2steps['joker_1'] == False) | (merged_2steps['joker_2'] == False)) # noqa
& (merged_2steps['joker_multi'] == True) # noqa
]
metrics['steps2_filtered_rows_2'] = merged_2steps[
((merged_2steps['joker_1'] == True) & (merged_2steps['joker_2'] == True)) # noqa
& (merged_2steps['joker_multi'] == False) # noqa
]
metrics['steps2_filtered_rows_3'] = merged_2steps[
((merged_2steps['joker_1'] == False) | (merged_2steps['joker_2'] == False)) # noqa
& (merged_2steps['joker_multi'] == False) # noqa
]
metrics['steps2_filtered_rows_4_loose'] = merged_2steps[
((merged_2steps['joker_1'] == True) | (merged_2steps['joker_2'] == True))
& (merged_2steps['joker_multi'] == True)
]
metrics['steps2_filtered_rows_4_strict'] = merged_2steps[
((merged_2steps['joker_1'] == True) & (merged_2steps['joker_2'] == True))
& (merged_2steps['joker_multi'] == True)
]
metrics['steps3_filtered_rows_1_loose'] = merged_3steps[
(
(merged_3steps['joker_1'] == False)
& (merged_3steps['joker_2'] == False)
& (merged_3steps['joker_3'] == False)
)
& (merged_3steps['joker_multi'] == True)
]
metrics['steps3_filtered_rows_1_strict'] = merged_3steps[
(
(merged_3steps['joker_1'] == False)
| (merged_3steps['joker_2'] == False)
| (merged_3steps['joker_3'] == False)
)
& (merged_3steps['joker_multi'] == True)
]
metrics['steps3_filtered_rows_2'] = merged_3steps[
((merged_3steps['joker_1'] == True) & (merged_3steps['joker_2'] == True) & (merged_3steps['joker_3'] == True))
& (merged_3steps['joker_multi'] == False)
]
metrics['steps3_filtered_rows_3'] = merged_3steps[
(
(merged_3steps['joker_1'] == False)
| (merged_3steps['joker_2'] == False)
| (merged_3steps['joker_3'] == False)
)
& (merged_3steps['joker_multi'] == False)
]
metrics['steps3_filtered_rows_4_loose'] = merged_3steps[
((merged_3steps['joker_1'] == True) | (merged_3steps['joker_2'] == True) | (merged_3steps['joker_3'] == True))
& (merged_3steps['joker_multi'] == True)
]
metrics['steps3_filtered_rows_4_strict'] = merged_3steps[
((merged_3steps['joker_1'] == True) & (merged_3steps['joker_2'] == True) & (merged_3steps['joker_3'] == True))
& (merged_3steps['joker_multi'] == True)
]
# metrics.to_csv("/Users/mac/Desktop/测试结果/error_anal/csv/gpt4o-0626.csv", index = False)
return metrics
# Function to compute evaluation rates and final scores
def evaluate_compute_final_scores(metrics, total_count):
total_counts = {
'InadequateGeneralization': len(metrics['steps2_filtered_rows_2']) + len(metrics['steps3_filtered_rows_2']),
'InsufficientKnowledge': len(metrics['steps2_filtered_rows_3']) + len(metrics['steps3_filtered_rows_3']),
'CompleteMastery_loose': len(metrics['steps2_filtered_rows_4_loose'])
+ len(metrics['steps3_filtered_rows_4_loose']),
'CompleteMastery_strict': len(metrics['steps2_filtered_rows_4_strict'])
+ len(metrics['steps3_filtered_rows_4_strict']),
'RoteMemorization_loose': len(metrics['steps2_filtered_rows_1_loose'])
+ len(metrics['steps3_filtered_rows_1_loose']),
'RoteMemorization_strict': len(metrics['steps2_filtered_rows_1_strict'])
+ len(metrics['steps3_filtered_rows_1_strict']),
}
rates = {
'InadequateGeneralization_rate': "{:.2%}".format(total_counts['InadequateGeneralization'] / total_count),
'InsufficientKnowledge_rate': "{:.2%}".format(total_counts['InsufficientKnowledge'] / total_count),
'CompleteMastery_loose_rate': "{:.2%}".format(total_counts['CompleteMastery_loose'] / total_count),
'CompleteMastery_strict_rate': "{:.2%}".format(total_counts['CompleteMastery_strict'] / total_count),
'RoteMemorization_loose_rate': "{:.2%}".format(
total_counts['RoteMemorization_loose']
/ (total_counts['CompleteMastery_loose'] + total_counts['RoteMemorization_loose'])
),
'RoteMemorization_strict_rate': "{:.2%}".format(
total_counts['RoteMemorization_strict']
/ (total_counts['CompleteMastery_strict'] + total_counts['RoteMemorization_strict'])
),
}
return total_counts, rates
# Function to update main results DataFrame
def evaluate_update_main_results_df(main_results_df, total_counts, rates):
final_score_loose = "{:.2%}".format(
(
525
- 0.5 * total_counts['InadequateGeneralization']
- total_counts['RoteMemorization_loose']
- total_counts['InsufficientKnowledge']
)
/ 525
)
final_score_strict = "{:.2%}".format(
(
525
- 0.5 * total_counts['InadequateGeneralization']
- total_counts['RoteMemorization_strict']
- total_counts['InsufficientKnowledge']
)
/ 525
)
new_row = {
# 'Model': model,
'Score (Strict)': final_score_strict,
'InsufficientKnowledge (Strict)': f"{rates['InsufficientKnowledge_rate']} ({total_counts['InsufficientKnowledge']})",
'InadequateGeneralization (Strict)': f"{rates['InadequateGeneralization_rate']} ({total_counts['InadequateGeneralization']})",
'CompleteMastery (Strict)': f"{rates['CompleteMastery_strict_rate']} ({total_counts['CompleteMastery_strict']})",
'RoteMemorization (Strict)': f"{rates['RoteMemorization_strict_rate']} ({total_counts['RoteMemorization_strict']})",
'Score (Loose)': final_score_loose,
'InsufficientKnowledge (Loose)': f"{rates['InsufficientKnowledge_rate']} ({total_counts['InsufficientKnowledge']})",
'InadequateGeneralization (Loose)': f"{rates['InadequateGeneralization_rate']} ({total_counts['InadequateGeneralization']})",
'CompleteMastery (Loose)': f"{rates['CompleteMastery_loose_rate']} ({total_counts['CompleteMastery_loose']})",
'RoteMemorization (Loose)': f"{rates['RoteMemorization_loose_rate']} ({total_counts['RoteMemorization_loose']})",
}
main_results_df = pd.concat([main_results_df, pd.DataFrame([new_row])], ignore_index=True)
return main_results_df
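# Worked example of the scoring rule above, with hypothetical counts over the
# fixed total of 525 grouped samples:
#   InadequateGeneralization = 50, RoteMemorization (strict) = 30, InsufficientKnowledge = 40
#   Score (Strict) = (525 - 0.5 * 50 - 30 - 40) / 525 = 430 / 525 ≈ 81.90%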
# Main function to evaluate models
def wemath_evaluate_models(output_json, main_results_csv_path=None):
main_results_df = pd.DataFrame(
columns=[
'Model',
'Score (Strict)',
'InsufficientKnowledge (Strict)',
'InadequateGeneralization (Strict)',
'CompleteMastery (Strict)',
'RoteMemorization (Strict)',
'Score (Loose)',
'InsufficientKnowledge (Loose)',
'InadequateGeneralization (Loose)',
'CompleteMastery (Loose)',
'RoteMemorization (Loose)',
]
)
# print(f"Evaluating model: {model_name}, JSON path: {output_json}")
data = load_and_process_data(output_json)
data_2steps = data[data['key'].str.contains('2steps')]
data_3steps = data[data['key'].str.contains('3steps')]
merged_2steps = evaluate_process_steps_data(data_2steps, 2)
merged_3steps = evaluate_process_steps_data(data_3steps, 3)
metrics = evaluate_calculate_metrics(merged_2steps, merged_3steps)
total_counts, rates = evaluate_compute_final_scores(metrics, total_count=525)
main_results_df = evaluate_update_main_results_df(main_results_df, total_counts, rates)
print(main_results_df.to_string(index=False))
if main_results_csv_path is not None:
main_results_df.to_csv(main_results_csv_path, index=False)
print("Evaluation completed and results saved to CSV.")
return main_results_df.to_dict()
### Accuracy.py
# Function to load knowledge structure nodes
def load_knowledge_structure_nodes(filepath):
# with open(filepath, "r") as file:
# nodes = json.load(file)
nodes = knowledge_structure_nodes
nodes = pd.DataFrame(nodes)
nodes['final_key'] = nodes['full node'].str.split('_').str[-1]
nodes['root_2'] = nodes['full node'].str.split('_').str[1]
return nodes
# Function to evaluate steps
def accuracy_evaluate_steps(json, steps, nodes):
jokers = [json[[f'joker_{i}', f'knowledge concept_{i}']] for i in range(1, steps + 1)]
for i in range(steps):
jokers[i] = pd.merge(
jokers[i],
nodes[['final_key', 'full node', 'root_2']],
left_on=f'knowledge concept_{i + 1}',
right_on='final_key',
how='left',
)
jokers[i].rename(
columns={f'joker_{i + 1}': 'joker', f'knowledge concept_{i + 1}': 'knowledge_concept'},
inplace=True,
)
concatenated_steps = pd.concat(jokers, axis=0)
return concatenated_steps
# Function to process steps data and merge results
def accuracy_process_steps_data(df, steps):
steps_data = {f'{steps}steps_{i}': df[df['key'] == f'{steps}steps_{i}'] for i in range(1, steps + 1)}
steps_data[f'{steps}steps_multi'] = df[df['key'] == f'{steps}steps_multi']
for key, data in steps_data.items():
data.columns = [col + f'_{key.split("_")[-1]}' for col in data.columns]
merged_data = steps_data[f'{steps}steps_1']
for i in range(2, steps + 1):
merged_data = pd.merge(
merged_data, steps_data[f'{steps}steps_{i}'], left_on=f'ID_1', right_on=f'ID_{i}', how='left'
)
merged_data = pd.merge(
merged_data, steps_data[f'{steps}steps_multi'], left_on=f'ID_1', right_on='ID_multi', how='left'
)
return merged_data
# Function to update main results DataFrame
def accuracy_update_main_results_df(nodes, main_results_df, concatenated_data, merged_2steps, merged_3steps):
One_step_acc = "{:.2%}".format(concatenated_data['joker'].mean())
Two_step_acc = "{:.2%}".format(merged_2steps['joker_multi'].mean())
Three_step_acc = "{:.2%}".format(merged_3steps['joker_multi'].mean())
new_row = {
# 'Model': model_name,
'One-step(S1)': One_step_acc,
'Two-step(S2)': Two_step_acc,
'Three-step(S3)': Three_step_acc,
}
# Calculate rates according to Nodes
nodes['final_rode'] = nodes['full node'].str.split('_').str[-1]
csv_final_score = concatenated_data.groupby('final_key')['joker'].mean()
csv_final_score = pd.merge(nodes, csv_final_score, left_on='final_rode', right_on='final_key', how='left')
new_row.update(csv_final_score.groupby('root2')['joker'].mean().apply(lambda x: "{:.2%}".format(x)).to_dict())
main_results_df = pd.concat([main_results_df, pd.DataFrame([new_row])], ignore_index=True)
return main_results_df
# Main function to evaluate models
def wemath_accuracy(output_json, main_results_csv_path=None):
# nodes = load_knowledge_structure_nodes(knowledge_structure_nodes_path)
nodes = knowledge_structure_nodes
nodes = pd.DataFrame(nodes)
nodes['final_key'] = nodes['full node'].str.split('_').str[-1]
nodes['root_2'] = nodes['full node'].str.split('_').str[1]
main_results_df = pd.DataFrame(
columns=[
'Model',
'One-step(S1)',
'Two-step(S2)',
'Three-step(S3)',
'Understanding and Conversion of Units',
'Angles and Length',
'Calculation of Plane Figures',
'Understanding of Plane Figures',
'Calculation of Solid Figures',
'Understanding of Solid Figures',
'Basic Transformations of Figures',
'Cutting and Combining of Figures',
'Direction',
'Position',
'Route Map',
'Correspondence of Coordinates and Positions',
]
)
# print(f"Evaluating model: {model_name}, JSON path: {output_json}")
data = load_and_process_data(output_json)
data_2steps = data[data['key'].str.contains('2steps')]
data_3steps = data[data['key'].str.contains('3steps')]
merged_2steps = accuracy_process_steps_data(data_2steps, 2)
merged_3steps = accuracy_process_steps_data(data_3steps, 3)
concatenated_data = pd.concat(
[accuracy_evaluate_steps(merged_2steps, 2, nodes), accuracy_evaluate_steps(merged_3steps, 3, nodes)],
axis=0,
)
main_results_df = accuracy_update_main_results_df(
nodes, main_results_df, concatenated_data, merged_2steps, merged_3steps
)
print(main_results_df.to_string(index=False))
if main_results_csv_path is not None:
main_results_df.to_csv(main_results_csv_path, index=False)
print("Evaluation completed and results saved to CSV.")
return main_results_df.to_dict()
knowledge_structure_nodes = [
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Understanding and Conversion of Units",
"root3": "Conversion Rates and Calculations Between Area Units",
"root4": None,
"full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Area Units",
},
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Understanding and Conversion of Units",
"root3": "Conversion Rates and Calculations Between Volume Units (Including Liters and Milliliters)",
"root4": None,
"full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Volume Units (Including Liters and Milliliters)",
},
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Understanding and Conversion of Units",
"root3": "Conversion Rates and Calculations Between Length Units",
"root4": None,
"full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Length Units",
},
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Angles and Length",
"root3": "Understanding Angles (Using a Protractor)",
"root4": None,
"full node": "Measurement_Angles and Length_Understanding Angles (Using a Protractor)",
},
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Angles and Length",
"root3": "Understanding Length (Using a Ruler)",
"root4": None,
"full node": "Measurement_Angles and Length_Understanding Length (Using a Ruler)",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Surface Area of Solid Figures",
"root4": "Surface Area of Cylinders",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Cylinders",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Surface Area of Solid Figures",
"root4": "Surface Area of Rectangular Cuboids",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Rectangular Cuboids",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Surface Area of Solid Figures",
"root4": "Surface Area of Cubes",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Cubes",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Volume of Solid Figures",
"root4": "Volume and Capacity of Cylinders",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cylinders",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Volume of Solid Figures",
"root4": "Volume and Capacity of Cones",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cones",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Volume of Solid Figures",
"root4": "Volume and Capacity of Rectangular Cuboids",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Rectangular Cuboids",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Volume of Solid Figures",
"root4": "Volume and Capacity of Cubes",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cubes",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Expanded View of Solids",
"root4": "Expanded View of Cylinders",
"full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Cylinders",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Expanded View of Solids",
"root4": "Expanded View of Rectangular Cuboids",
"full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Rectangular Cuboids",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Expanded View of Solids",
"root4": "Expanded View of Cubes",
"full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Cubes",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Cylinders and Cones",
"root4": "Properties of Cylinders",
"full node": "Solid Figures_Understanding of Solid Figures_Cylinders and Cones_Properties of Cylinders",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Cylinders and Cones",
"root4": "Properties of Cones",
"full node": "Solid Figures_Understanding of Solid Figures_Cylinders and Cones_Properties of Cones",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Rectangular Cuboids and Cubes",
"root4": "Properties and Understanding of Rectangular Cuboids",
"full node": "Solid Figures_Understanding of Solid Figures_Rectangular Cuboids and Cubes_Properties and Understanding of Rectangular Cuboids",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Rectangular Cuboids and Cubes",
"root4": "Properties and Understanding of Cubes",
"full node": "Solid Figures_Understanding of Solid Figures_Rectangular Cuboids and Cubes_Properties and Understanding of Cubes",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Observing Objects",
"root4": None,
"full node": "Solid Figures_Understanding of Solid Figures_Observing Objects",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Sum of Interior Angles of Polygons",
"root4": "Sum of Interior Angles of Other Polygons",
"full node": "Plane Figures_Calculation of Plane Figures_Sum of Interior Angles of Polygons_Sum of Interior Angles of Other Polygons",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Sum of Interior Angles of Polygons",
"root4": "Sum of Interior Angles of Triangles",
"full node": "Plane Figures_Calculation of Plane Figures_Sum of Interior Angles of Polygons_Sum of Interior Angles of Triangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation and Comparison of Angles",
"root4": None,
"full node": "Plane Figures_Calculation of Plane Figures_Calculation and Comparison of Angles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Parallelograms",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Parallelograms",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Triangles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Triangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Sectors",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Sectors",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Trapezoids",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Trapezoids",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Circles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Circles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Rectangles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Rectangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Squares",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Squares",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Parallelograms",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Parallelograms",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Triangles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Triangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Trapezoids",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Trapezoids",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Circumference of Circles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Circumference of Circles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Rectangles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Rectangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Squares",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Squares",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Parallelograms",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Parallelograms",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Triangles",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Triangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Trapezoids",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Trapezoids",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Rectangles",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Rectangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Squares",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Squares",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Classification and Understanding of Angles",
"root4": "Understanding Triangular Rulers",
"full node": "Plane Figures_Understanding of Plane Figures_Classification and Understanding of Angles_Understanding Triangular Rulers",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Classification and Understanding of Angles",
"root4": "Understanding and Representing Angles",
"full node": "Plane Figures_Understanding of Plane Figures_Classification and Understanding of Angles_Understanding and Representing Angles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Properties and Understanding of Line Segments",
"root4": "Distance Between Two Points",
"full node": "Plane Figures_Understanding of Plane Figures_Properties and Understanding of Line Segments_Distance Between Two Points",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Properties and Understanding of Line Segments",
"root4": "Understanding Line Segments, Lines, and Rays",
"full node": "Plane Figures_Understanding of Plane Figures_Properties and Understanding of Line Segments_Understanding Line Segments, Lines, and Rays",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Positional Relationships Between Line Segments",
"root4": "perpendicularity",
"full node": "Plane Figures_Understanding of Plane Figures_Positional Relationships Between Line Segments_perpendicularity",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Positional Relationships Between Line Segments",
"root4": "Parallel",
"full node": "Plane Figures_Understanding of Plane Figures_Positional Relationships Between Line Segments_Parallel",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Circles and Sectors",
"root4": "Understanding Sectors",
"full node": "Plane Figures_Understanding of Plane Figures_Circles and Sectors_Understanding Sectors",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Circles and Sectors",
"root4": "Understanding Circles",
"full node": "Plane Figures_Understanding of Plane Figures_Circles and Sectors_Understanding Circles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Observing Figures",
"root4": None,
"full node": "Plane Figures_Understanding of Plane Figures_Observing Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Basic Transformations of Figures",
"root3": "Axial Symmetry",
"root4": None,
"full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Axial Symmetry",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Basic Transformations of Figures",
"root3": "Translation",
"root4": None,
"full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Translation",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Basic Transformations of Figures",
"root3": "Rotation",
"root4": None,
"full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Rotation",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining and Dividing Solids",
"root4": None,
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining and Dividing Solids",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining Plane Figures",
"root4": "Division of Plane Figures",
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Division of Plane Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining Plane Figures",
"root4": "Combining Plane Figures",
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Combining Plane Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining Plane Figures",
"root4": "Tessellation of Figures",
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Tessellation of Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining Plane Figures",
"root4": "Folding Problems of Figures",
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Folding Problems of Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Direction",
"root3": "Southeast, Southwest, Northeast, Northwest Directions",
"root4": None,
"full node": "Position and Direction_Direction_Southeast, Southwest, Northeast, Northwest Directions",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Direction",
"root3": "Cardinal Directions (East, South, West, North)",
"root4": None,
"full node": "Position and Direction_Direction_Cardinal Directions (East, South, West, North)",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Route Map",
"root3": "Determining the Positions of Objects Based on Direction, Angle, and Distance",
"root4": None,
"full node": "Position and Direction_Route Map_Determining the Positions of Objects Based on Direction, Angle, and Distance",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Route Map",
"root3": "Describing Simple Routes Based on Direction and Distance",
"root4": None,
"full node": "Position and Direction_Route Map_Describing Simple Routes Based on Direction and Distance",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Correspondence of Coordinates and Positions",
"root3": "Representing Positions Using Ordered Pairs",
"root4": None,
"full node": "Position and Direction_Correspondence of Coordinates and Positions_Representing Positions Using Ordered Pairs",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Correspondence of Coordinates and Positions",
"root3": "Finding Positions Based on Ordered Pairs",
"root4": None,
"full node": "Position and Direction_Correspondence of Coordinates and Positions_Finding Positions Based on Ordered Pairs",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Position",
"root3": "Front-Back Position",
"root4": None,
"full node": "Position and Direction_Position_Front-Back Position",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Position",
"root3": "Up-Down Position",
"root4": None,
"full node": "Position and Direction_Position_Up-Down Position",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Position",
"root3": "Left-Right Position",
"root4": None,
"full node": "Position and Direction_Position_Left-Right Position",
},
]

View File

@@ -1,6 +1,47 @@
from ...smp import *
def AMBER_rating(data_file):
data = load(data_file)
stats = defaultdict(dict)
lt = len(data)
category_mapping = {
'discriminative-attribute-state': 'Attribute',
'discriminative-attribute-number': 'Attribute',
'discriminative-attribute-action': 'Attribute',
'discriminative-hallucination': 'Existence',
'discriminative-relation': 'Relation',
'relation': 'Relation'
}
for i in range(lt):
item = data.iloc[i]
category = item['category']
image_path = item['image_path']
score = item['score']
new_category = category_mapping.get(category, category)
if image_path not in stats[new_category]:
stats[new_category][image_path] = []
stats[new_category][image_path].append(score)
def acc(key):
res = stats[key]
values = []
for val in res.values():
values.extend(val)
return np.mean(values) * 100
scores = {}
for k in stats:
scores[k] = acc(k)
scores['Avg ACC'] = np.mean(list(scores.values()))
ret = d2df(scores)
return ret
def MME_rating(data_file):
data = load(data_file)
stats = defaultdict(dict)