Modify eval_mm for MiniCPM-o 2.6

Poppy Xu
2025-01-21 15:34:54 +08:00
parent ec68cefc17
commit d8f382e157
82 changed files with 14279 additions and 843 deletions

View File

@@ -5,5 +5,5 @@ from .vqa_eval import levenshtein_distance
__all__ = [
'build_judge', 'extract_answer_from_item', 'prefetch_answer',
'levenshtein_distance', 'DEBUG_MESSAGE'
'levenshtein_distance', 'DEBUG_MESSAGE',
]

View File

@@ -0,0 +1,59 @@
# CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy
## Introduction
Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information.
## Running Scripts
Once the environment is ready, execute the following script from the root directory of VLMEvalKit
to perform inference and evaluation tasks in batch.
```shell
MODEL_NAME="QwenVLMax"
OUTPUT_DIR="/your/path/to/output_dir"
SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_scene_ocr
python run.py --data CCOCR_MultiSceneOcr_Cord CCOCR_MultiSceneOcr_Funsd CCOCR_MultiSceneOcr_Iam CCOCR_MultiSceneOcr_ZhDoc CCOCR_MultiSceneOcr_ZhHandwriting CCOCR_MultiSceneOcr_Hieragent CCOCR_MultiSceneOcr_Ic15 CCOCR_MultiSceneOcr_Inversetext CCOCR_MultiSceneOcr_Totaltext CCOCR_MultiSceneOcr_ZhScene CCOCR_MultiSceneOcr_UgcLaion CCOCR_MultiSceneOcr_ZhDense CCOCR_MultiSceneOcr_ZhVertical --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_lan_ocr
python run.py --data CCOCR_MultiLanOcr_Arabic CCOCR_MultiLanOcr_French CCOCR_MultiLanOcr_German CCOCR_MultiLanOcr_Italian CCOCR_MultiLanOcr_Japanese CCOCR_MultiLanOcr_Korean CCOCR_MultiLanOcr_Portuguese CCOCR_MultiLanOcr_Russian CCOCR_MultiLanOcr_Spanish CCOCR_MultiLanOcr_Vietnamese --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
SUB_OUTPUT_DIR=${OUTPUT_DIR}/doc_parsing
python run.py --data CCOCR_DocParsing_DocPhotoChn CCOCR_DocParsing_DocPhotoEng CCOCR_DocParsing_DocScanChn CCOCR_DocParsing_DocScanEng CCOCR_DocParsing_TablePhotoChn CCOCR_DocParsing_TablePhotoEng CCOCR_DocParsing_TableScanChn CCOCR_DocParsing_TableScanEng CCOCR_DocParsing_MolecularHandwriting CCOCR_DocParsing_FormulaHandwriting --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
SUB_OUTPUT_DIR=${OUTPUT_DIR}/kie
python run.py --data CCOCR_Kie_Sroie2019Word CCOCR_Kie_Cord CCOCR_Kie_EphoieScut CCOCR_Kie_Poie CCOCR_Kie_ColdSibr CCOCR_Kie_ColdCell --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose
python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}
```
## Example Output
The evaluation results will be saved in `${SUB_OUTPUT_DIR}/summary.md`. For example, for the KIE subset,
the output is as follows:
| exp_name(f1_score) | COLD_CELL | COLD_SIBR | CORD | EPHOIE_SCUT | POIE | sroie2019_word | summary |
|:-------------------|------------:|------------:|-------:|--------------:|-------:|-----------------:|----------:|
| QwenVLMax | 81.01 | 72.46 | 69.33 | 71.2 | 60.85 | 76.37 | 71.87 |
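
The `summary.md` file is written by the `common.py` entry point invoked in the script above. If you prefer to aggregate an existing output directory from Python rather than the CLI, a minimal sketch is shown below (it assumes VLMEvalKit is importable and that `run.py` has already produced one sub-folder per experiment containing a `status.json` under `${OUTPUT_DIR}/kie`):

```python
# Illustrative sketch: rebuild summary.md for one subset from Python.
# Mirrors the CLI call `python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR}`.
from vlmeval.dataset.utils.ccocr_evaluator.common import summary_multi_exp

summary_path = summary_multi_exp("/your/path/to/output_dir/kie", is_weighted_sum=False)
print("summary written to:", summary_path)
```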
## Citation
If you find our work helpful, please consider citing our paper.
```bibtex
@misc{yang2024ccocr,
title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin},
year={2024},
eprint={2412.02210},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2412.02210},
}
```
## Contact Us
If you have any questions, feel free to email us at wpf272043@alibaba-inc.com or xixing.tj@alibaba-inc.com.

View File

@@ -0,0 +1,12 @@
from .kie_evaluator import KieEvaluator
from .doc_parsing_evaluator import ParsingEvaluator
from .ocr_evaluator import OcrEvaluator
from .common import summary
evaluator_map_info = {
"kie": KieEvaluator("kie"),
"doc_parsing": ParsingEvaluator("doc_parsing"),
"multi_lan_ocr": OcrEvaluator("multi_lan_ocr"),
"multi_scene_ocr": OcrEvaluator("multi_scene_ocr")
}
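
For reference, each entry in `evaluator_map_info` is a `BaseMetric` instance (defined in `common.py` below), so it can be called directly with a prediction dict and a ground-truth dict. A hedged sketch follows; the file name and `dataset` tag are placeholders, and real callers pass the config from the benchmark index:

```python
# Illustrative only; values are placeholders, not benchmark data.
from vlmeval.dataset.utils.ccocr_evaluator import evaluator_map_info

preds = {"img_001": "HELLO WORLD"}   # {file_name: model response}
gts = {"img_001": "hello world"}     # {file_name: ground-truth transcription}
meta_info, eval_info = evaluator_map_info["multi_scene_ocr"](preds, gts, dataset="Ic15")
print(eval_info["summary"])          # macro/micro F1 plus response_success_ratio
```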

View File

@@ -0,0 +1,222 @@
import os
import json
import time
import sys
from abc import abstractmethod
from tabulate import tabulate
def pick_response_text(json_path):
"""
"""
try:
with open(json_path, "r") as f:
json_data = json.load(f)
except Exception as e:
print("--> file error: msg: {}, path: {}".format(e, json_path))
return None
for required_key in ["model_name", "response"]:
if required_key not in json_data:
print("--> required key not exists, name: {}, path: {}".format(required_key, json_path))
return None
model_name = json_data["model_name"]
model_response = json_data["response"]
response_text = None
if model_name.startswith("gpt") or model_name.startswith("o1"):
response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501
elif model_name.startswith("local_"):
response_text = model_response
else:
if model_name.startswith("claude"):
content_list = model_response.get("content", None)
elif model_name.startswith("gemini"):
content_list = model_response.get("candidates", [{}])[0].get("content", {}).get("parts", None)
elif model_name.startswith("qwen"):
content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501
else:
raise NotImplementedError("The pick_response_text NOT implemented for model: {}".format(model_name))
if isinstance(content_list, list) and len(content_list) > 0:
response_text = content_list[0].get("text", None)
if response_text is None:
print("--> [error][{}] text pick error, path: {}".format(model_name, json_path))
return response_text
def load_response_from_dir(res_dir):
"""
"""
response_info = {}
for file_name in os.listdir(res_dir):
file_path = os.path.abspath(os.path.join(res_dir, file_name))
if not file_name.endswith(".json"):
print("--> skip: result file should be a json: but got: {}".format(file_path))
continue
response_text = pick_response_text(file_path)
if response_text is None:
continue
file_name_wo_ext, ext = os.path.splitext(file_name)
response_info[file_name_wo_ext] = response_text
return response_info
class BaseMetric(object):
""" BaseMetric """
""" OCRMetric """
def __init__(self, group_name, **kwargs):
self.group_name = group_name
self.kwargs = kwargs
def response_post_func(self, response_text, **kwargs):
return response_text
@abstractmethod
# Given the predictions and ground truth, return the evaluation results as a dictionary.
# The result must contain a 'summary' key; only metric info used for the summary should
# be placed in that dict. For example:
# {
#     "summary": {
#         "f1-score": 99.99,
#         "metric_name": "metric_value"
#     },
#     "your other info": "xxx"
# }
def evaluate(self, response_info, gt_info, normalize_func=None, **kwargs):
pass
def __call__(self, pdt_res_dir, gt_info, with_response_ratio=True, **kwargs):
if isinstance(pdt_res_dir, dict):
raw_response_info = pdt_res_dir
elif os.path.exists(pdt_res_dir) and os.path.isdir(pdt_res_dir):
raw_response_info = load_response_from_dir(pdt_res_dir)
else:
raise ValueError("invalid input: a response dict or an existing folder is required, but got {}".format(pdt_res_dir))
post_error_list, response_info = [], {}
response_error_list = list(gt_info.keys() - raw_response_info.keys())
for file_name, single_pdt_str in raw_response_info.items():
single_pdt_str = self.response_post_func(single_pdt_str, **kwargs)
if single_pdt_str is None:
post_error_list.append(file_name)
continue
response_info[file_name] = single_pdt_str
meta_info = {
"gt_total_num": len(gt_info), "pdt_total_num": len(response_info),
"post_error_list": post_error_list, "response_error_list": response_error_list,
}
eval_info = self.evaluate(response_info, gt_info, **kwargs)
# add response_success_ratio
if "summary" in eval_info and with_response_ratio:
success_ratio = (len(response_info) + len(post_error_list)) / (len(gt_info) + 1e-9)
eval_info["summary"].update({"response_success_ratio": success_ratio})
return meta_info, eval_info
def summary(index_path, exp_dir_base, is_weighted_sum=False):
"""
"""
with open(index_path, "r") as f:
data_list = json.load(f)
all_data_info = {}
for data_info_item in data_list:
data_name = data_info_item["dataset"]
if not data_info_item.get("release", True):
continue
all_data_info[data_name] = data_info_item
dataset_list = list(all_data_info.keys())
summary_path = summary_multi_exp(exp_dir_base, dataset_list, is_weighted_sum=is_weighted_sum)
return summary_path
def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False):
"""
"""
if dataset_list is None:
all_dataset_name = []
for exp_name in os.listdir(exp_dir_base):
dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
if not os.path.exists(dir_status_path):
continue
with open(dir_status_path, "r") as f:
data_status_info = json.load(f)
all_dataset_name.extend(data_status_info.keys())
dataset_list = sorted(set(all_dataset_name))
# summary main code
all_evaluate_info = {}
for exp_name in os.listdir(exp_dir_base):
dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
if not os.path.exists(dir_status_path):
print("--> skip: status.json not exist: {}".format(dir_status_path))
continue
with open(dir_status_path, "r") as f:
all_status_info = json.load(f)
for data_name in dataset_list:
total_num = all_status_info.get(data_name, {}).get("config", {}).get("num", "-1")
summary_info = all_status_info.get(data_name, {}).get("evaluation", {}).get("summary", {})
for metric_name, metric_value in summary_info.items():
if metric_name not in all_evaluate_info:
all_evaluate_info[metric_name] = {}
if exp_name not in all_evaluate_info[metric_name]:
all_evaluate_info[metric_name][exp_name] = {}
all_evaluate_info[metric_name][exp_name][data_name] = (metric_value, total_num)
all_table_md = []
for metric_name, metric_info in all_evaluate_info.items():
formatted_time = time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time()))
summary_line_list = []
summary_key_name = "summary(weighted)" if is_weighted_sum else "summary"
summary_head = [f"exp_name({metric_name}_{formatted_time})"] + dataset_list + [summary_key_name]
for exp_name, data_eval_info in metric_info.items():
summary_line = [exp_name, ]
all_metric_value = 0
is_summary_valid, all_total_num, all_weighted_metric = True, 0, 0
for data_name in dataset_list:
metric_value, total_num = data_eval_info.get(data_name, ("-1", "-1"))
summary_line.append("{:.2f}".format(float(metric_value) * 100))
if str(metric_value) == "-1":
is_summary_valid = False
continue
all_total_num += float(total_num)
all_weighted_metric += float(total_num) * float(metric_value)
all_metric_value += float(metric_value)
summary_value_valid = ((all_weighted_metric / (all_total_num + 1e-9)) * 100) if is_weighted_sum \
else (all_metric_value / (len(dataset_list) + 1e-9) * 100)
summary_value = "-" if not is_summary_valid else "{:.2f}".format(summary_value_valid)
summary_line.append(summary_value)
summary_line_list.append(summary_line)
md_table_info = tabulate(summary_line_list, headers=summary_head, tablefmt='pipe')
all_table_md.append(md_table_info)
print("\n\n".join(all_table_md))
summary_path = os.path.abspath(os.path.join(exp_dir_base, "summary.md"))
with open(summary_path, "w") as f:
f.write("\n\n".join(all_table_md))
return summary_path
if __name__ == '__main__':
if len(sys.argv) != 2:
print("Usage: python {} exp_base_dir".format(__file__))
sys.exit(-1)
else:
print('--> info: {}'.format(sys.argv))
exp_base_dir = sys.argv[1]
summary_path = summary_multi_exp(exp_base_dir, dataset_list=None, is_weighted_sum=False)
print("--> info: summary saved at : {}".format(summary_path))
print("happy coding.")

View File

@@ -0,0 +1,256 @@
import nltk
import re
from tqdm import tqdm
from collections import deque
from apted.helpers import Tree
from apted import APTED, Config
# local import
from .common import BaseMetric
# LaTeX commands to strip from predictions before scoring
patterns = [
r'\\documentclass\{.*?\}',
r'\\usepackage\[.*?\]\{.*?\}',
r'\\usepackage\{.*?\}',
r'\\geometry\{.*?\}',
r'\\begin\{document\}',
r'\\end\{document\}',
r'\\noindent'
]
class TableTree(Tree):
"""
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
# License: Apache 2.0 License.
"""
def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
self.tag = tag
self.colspan = colspan
self.rowspan = rowspan
self.content = content
self.children = list(children)
def bracket(self):
"""Show tree using brackets notation"""
if self.tag == "td":
result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % (
self.tag,
self.colspan,
self.rowspan,
self.content,
)
else:
result = '"tag": %s' % self.tag
for child in self.children:
result += child.bracket()
return "{{{}}}".format(result)
class CustomConfig(Config):
"""
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
# License: Apache 2.0 License.
"""
def rename(self, node1, node2):
"""Compares attributes of trees"""
# print(node1.tag)
if (
(node1.tag != node2.tag)
or (node1.colspan != node2.colspan)
or (node1.rowspan != node2.rowspan)
):
return 1.0
if node1.tag == "td":
if node1.content or node2.content:
return nltk.edit_distance(node1.content, node2.content) / max(len(node1.content), len(node2.content))
return 0.0
class TEDS(object):
"""Tree Edit Distance basead Similarity
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
# License: Apache 2.0 License.
"""
def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
assert isinstance(n_jobs, int) and (
n_jobs >= 1
), "n_jobs must be an integer greather than 1"
self.structure_only = structure_only
self.n_jobs = n_jobs
self.ignore_nodes = ignore_nodes
self.__tokens__ = []
def tokenize(self, node):
"""Tokenizes table cells"""
self.__tokens__.append("<%s>" % node.tag)
if node.text is not None:
self.__tokens__ += list(node.text)
for n in node.getchildren():
self.tokenize(n)
if node.tag != "unk":
self.__tokens__.append("</%s>" % node.tag)
if node.tag != "td" and node.tail is not None:
self.__tokens__ += list(node.tail)
def load_html_tree(self, node, parent=None):
"""Converts HTML tree to the format required by apted"""
global __tokens__
if node.tag == "td":
if self.structure_only:
cell = []
else:
self.__tokens__ = []
self.tokenize(node)
cell = self.__tokens__[1:-1].copy()
new_node = TableTree(
node.tag,
int(node.attrib.get("colspan", "1")),
int(node.attrib.get("rowspan", "1")),
cell,
*deque(),
)
else:
new_node = TableTree(node.tag, None, None, None, *deque())
if parent is not None:
parent.children.append(new_node)
if node.tag != "td":
for n in node.getchildren():
self.load_html_tree(n, new_node)
if parent is None:
return new_node
def evaluate(self, pred, true):
"""Computes TEDS score between the prediction and the ground truth of a
given sample
"""
# try_import("lxml")
from lxml import etree, html
if (not pred) or (not true):
return 0.0
parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
pred = html.fromstring(pred, parser=parser)
true = html.fromstring(true, parser=parser)
if pred.xpath("body/table") and true.xpath("body/table"):
pred = pred.xpath("body/table")[0]
true = true.xpath("body/table")[0]
if self.ignore_nodes:
etree.strip_tags(pred, *self.ignore_nodes)
etree.strip_tags(true, *self.ignore_nodes)
n_nodes_pred = len(pred.xpath(".//*"))
n_nodes_true = len(true.xpath(".//*"))
n_nodes = max(n_nodes_pred, n_nodes_true)
tree_pred = self.load_html_tree(pred)
tree_true = self.load_html_tree(true)
distance = APTED(
tree_pred, tree_true, CustomConfig()
).compute_edit_distance()
return 1.0 - (float(distance) / n_nodes)
else:
return 0.0
class ParsingEvaluator(BaseMetric):
def response_post_func(self, response_text, **kwargs):
return response_text
def evaluate(self, response_info, gt_info, **kwargs):
op = kwargs['op']
if op == 'doc':
score = self.eval_doc(response_info, gt_info)
elif op == 'table':
score = self.eval_table(response_info, gt_info)
elif op in ['molecular', "formula"]:
score = self.eval_formula(response_info, gt_info, op_name=op)
else:
raise ValueError(f'doc parsing unsupported op: {op}')
# summary info
eval_info = {"summary": {"score": score}}
return eval_info
def eval_doc(self, response_info, gt_info):
results = []
for img_name, gt in tqdm(gt_info.items()):
if img_name not in response_info:
results.append(0)
continue
pred = response_info[img_name]
for pattern in patterns:
pred = re.sub(pattern, '', pred)
try:
pred = pred.split('```')[1]
except IndexError:
pass
pred = pred.replace('```latex', '')
pred = pred.replace('```', '')
pred = pred.replace(' ', '').replace('\n', '')
gt = gt.replace(' ', '').replace('\n', '')
edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))
results.append(1 - edit_dist)
score = sum(results) / len(results)
return score
def eval_table(self, response_info, gt_info):
teds = TEDS(structure_only=False, n_jobs=1)
results = []
for img_name, gt in tqdm(gt_info.items()):
if img_name not in response_info:
results.append(0)
continue
pred = response_info[img_name]
for pattern in patterns:
pred = re.sub(pattern, '', pred)
try:
pred = pred.split('```html')[1]
except IndexError:
pass
pred = pred.replace('```', '')
pred = pred.replace(' ', '').replace('\n', '').replace('，', ',')
gt = gt.replace(' ', '').replace('\n', '')
pred_html = '<html><body>{}</body></html>'.format(pred)
gt_html = '<html><body>{}</body></html>'.format(gt)
results.append(teds.evaluate(pred_html, gt_html))
score = sum(results) / len(results)
return score
def eval_formula(self, response_info, gt_info, op_name='formula'):
results = []
for img_name, gt in tqdm(gt_info.items()):
if img_name not in response_info:
results.append(0)
continue
pred = response_info[img_name]
if op_name == 'formula':
pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "") # noqa: E501
gt = gt.replace(" ", "")
elif op_name == 'molecular':
pred = pred.replace("\n", "").replace(" ", "").replace("<smiles>", "").replace("</smiles>", "")
gt = gt.replace(" ", "")
edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))
results.append(1 - edit_dist)
score = sum(results) / len(results)
return score
if __name__ == '__main__':
pass
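
A quick toy check of the TEDS metric defined above (assuming `lxml`, `apted`, and `nltk` are installed, which this evaluator already requires); the tables are invented purely for illustration:

```python
# Toy tables: one of the three table nodes differs, so the score is roughly 1 - 1/3.
pred_html = "<html><body><table><tr><td>cat</td><td>1</td></tr></table></body></html>"
gt_html = "<html><body><table><tr><td>cat</td><td>2</td></tr></table></body></html>"
teds = TEDS(structure_only=False, n_jobs=1)
print(teds.evaluate(pred_html, gt_html))  # ~0.67; identical tables would score 1.0
```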

View File

@@ -0,0 +1,385 @@
"""
Donut
Copyright (c) 2022-present NAVER Corp.
MIT License
"""
import json
import os
import sys
import re
import time
from typing import Any, Dict, List, Tuple, Union
import zss
from zss import Node
from collections import Counter
from nltk import edit_distance
# local import
from .common import BaseMetric
def flatten(data: dict):
"""
Convert Dictionary into Non-nested Dictionary
Example:
input(dict)
{
"menu": [
{"name" : ["cake"], "count" : ["2"]},
{"name" : ["juice"], "count" : ["1"]},
]
}
output(list)
[
("menu.name", "cake"),
("menu.count", "2"),
("menu.name", "juice"),
("menu.count", "1"),
]
"""
flatten_data = list()
def _flatten(value, key=""):
if type(value) is dict:
for child_key, child_value in value.items():
_flatten(child_value, f"{key}.{child_key}" if key else child_key)
elif type(value) is list:
for value_item in value:
_flatten(value_item, key)
else:
flatten_data.append((key, value))
_flatten(data)
return flatten_data
def update_cost(node1: Node, node2: Node):
"""
Update cost for tree edit distance.
If both are leaf nodes, the cost is the string edit distance between the two labels (the special token '<leaf>' is ignored).
If exactly one of them is a leaf node, the cost is the length of the string in the leaf node + 1.
If neither is a leaf node, the cost is 0 if label1 equals label2, otherwise 1.
"""
label1 = node1.label
label2 = node2.label
label1_leaf = "<leaf>" in label1
label2_leaf = "<leaf>" in label2
if label1_leaf and label2_leaf:
return edit_distance(label1.replace("<leaf>", ""), label2.replace("<leaf>", ""))
elif not label1_leaf and label2_leaf:
return 1 + len(label2.replace("<leaf>", ""))
elif label1_leaf and not label2_leaf:
return 1 + len(label1.replace("<leaf>", ""))
else:
return int(label1 != label2)
def insert_and_remove_cost(node: Node):
"""
Insert and remove cost for tree edit distance.
If the node is a leaf, the cost is the length of its label (without the '<leaf>' token); otherwise 1.
"""
label = node.label
if "<leaf>" in label:
return len(label.replace("<leaf>", ""))
else:
return 1
def normalize_dict(data: Union[Dict, List, Any]):
"""
Recursively normalize a nested dict/list: sort dict keys, drop empty values, and stringify leaves.
"""
# if not data:
# return {}
if isinstance(data, dict):
new_data = dict()
for key in sorted(data.keys(), key=lambda k: (len(k), k)):
value = normalize_dict(data[key])
if value:
if not isinstance(value, list):
value = [value]
new_data[key] = value
elif isinstance(data, list):
if all(isinstance(item, dict) for item in data):
new_data = []
for item in data:
item = normalize_dict(item)
if item:
new_data.append(item)
else:
new_data = [str(item).strip() for item in data if type(item) in {str, int, float} and str(item).strip()]
else:
new_data = [str(data).strip()]
return new_data
def cal_f1_all(preds, answers):
"""
Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives,
false negatives and false positives
"""
metric_info, error_info = {}, {}
total_tp, total_fn_or_fp = 0, 0
for file_name, answer in answers.items():
sample_error_info = {"fp": [], "fn": [], "tp": []}
pred = preds.get(file_name, {})
pred, answer = flatten(normalize_dict(pred)), flatten(normalize_dict(answer))
for field in pred:
field_name = field[0]
if field_name not in metric_info:
metric_info[field_name] = {"total_tp": 0, "total_fn_or_fp": 0}
if field in answer:
total_tp += 1
metric_info[field_name]["total_tp"] += 1
sample_error_info["tp"].append(field)
answer.remove(field)
else:
total_fn_or_fp += 1
metric_info[field_name]["total_fn_or_fp"] += 1
sample_error_info["fp"].append(field)
total_fn_or_fp += len(answer)
for field in answer:
field_name = field[0]
if field_name not in metric_info:
metric_info[field_name] = {"total_tp": 0, "total_fn_or_fp": 0}
metric_info[field_name]["total_fn_or_fp"] += 1
sample_error_info["fn"].append(field)
sample_error_num = sum([len(v) for k, v in sample_error_info.items() if k != "tp"])
if sample_error_num > 0:
sample_error_info["error_num"] = sample_error_num
error_class_list = ["counter_" + x[0] for x in (sample_error_info["fn"] + sample_error_info["fp"])]
counter = Counter(error_class_list)
sample_error_info["error_info"] = dict(counter)
error_info[file_name] = sample_error_info
# summary
for field_name, field_info in metric_info.items():
field_tp, field_fn_or_fp = field_info["total_tp"], field_info["total_fn_or_fp"]
metric_info[field_name]["acc"] = field_tp / (field_tp + field_fn_or_fp / 2 + 1e-6)
print("donut_evaluator: total_tp: {}, total_fn_or_fp: {}, ptd_num: {}, gt_num: {}".format(total_tp, total_fn_or_fp,
len(preds), len(answers)))
error_info = {k: v for k, v in
sorted(error_info.items(), key=lambda item: item[1].get("error_num", 0), reverse=True)}
metric_info = {k: v for k, v in
sorted(metric_info.items(), key=lambda item: item[1].get("total_fn_or_fp", 0), reverse=True)}
return total_tp / (total_tp + total_fn_or_fp / 2 + 1e-6), metric_info, error_info
def construct_tree_from_dict(data: Union[Dict, List], node_name: str = None):
"""
Convert Dictionary into Tree
Example:
input(dict)
{
"menu": [
{"name" : ["cake"], "count" : ["2"]},
{"name" : ["juice"], "count" : ["1"]},
]
}
output(tree)
<root>
|
menu
/ \
<subtree> <subtree>
/ | | \
name count name count
/ | | \
<leaf>cake <leaf>2 <leaf>juice <leaf>1
"""
if node_name is None:
node_name = "<root>"
node = Node(node_name)
if isinstance(data, dict):
for key, value in data.items():
kid_node = construct_tree_from_dict(value, key)
node.addkid(kid_node)
elif isinstance(data, list):
if all(isinstance(item, dict) for item in data):
for item in data:
kid_node = construct_tree_from_dict(
item,
"<subtree>",
)
node.addkid(kid_node)
else:
for item in data:
node.addkid(Node(f"<leaf>{item}"))
else:
raise Exception(data, node_name)
return node
def cal_acc(pred: dict, answer: dict):
"""
Calculate normalized tree edit distance(nTED) based accuracy.
1) Construct tree from dict,
2) Get tree distance with insert/remove/update cost,
3) Divide distance with GT tree size (i.e., nTED),
4) Calculate nTED based accuracy. (= max(1 - nTED, 0 ).
"""
pred = construct_tree_from_dict(normalize_dict(pred))
answer = construct_tree_from_dict(normalize_dict(answer))
val1 = zss.distance(
pred,
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
val2 = zss.distance(
construct_tree_from_dict(normalize_dict({})),
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
return max(0, 1 - val1 / val2)
def cal_acc_all(pred_info, answer_info):
acc_info, error_info = {}, {}
for file_name, answer in answer_info.items():
# if file_name not in pred_info:
# print("---> error: pdt not found: {}".format(file_name))
# continue
pred = pred_info.get(file_name, {})
acc = cal_acc(pred, answer)
acc_info[file_name] = acc
if acc < 1.0:
error_info[file_name] = {"acc": acc, "pred": pred, "answer": answer}
error_info = {k: v for k, v in sorted(error_info.items(), key=lambda item: item[1].get("acc", 0))}
acc_average = sum(list(acc_info.values())) / (len(acc_info) + 1e-6)
return acc_average, error_info
def normalize_values_of_nested_dict(d, normalize_func):
"""
"""
if isinstance(d, dict):
return {k: normalize_values_of_nested_dict(v, normalize_func) for k, v in d.items()}
elif isinstance(d, list):
return [normalize_values_of_nested_dict(x, normalize_func) if isinstance(x, dict) else x for x in d]
elif isinstance(d, str):
return normalize_func(d)
else:
return d
def eval_donut(pdt_info, gt_info, normalize_func=None, data_name=None):
"""
"""
if normalize_func is not None:
print("--> info: normalize_func executed.")
pdt_info = normalize_values_of_nested_dict(pdt_info, normalize_func)
gt_info = normalize_values_of_nested_dict(gt_info, normalize_func)
f1_score, class_eval_info, error_info = cal_f1_all(pdt_info, gt_info)
acc_average, acc_error_info = cal_acc_all(pdt_info, gt_info)
eval_info = {"f1_score": f1_score, "acc": acc_average, "class_f1_score": class_eval_info,
"f1_error_info": error_info, "acc_error_info": acc_error_info}
print(data_name, "f1_score", f1_score, "acc", acc_average)
return eval_info
def post_process_to_json(qwen_info_str, file_name=None):
try:
if "```json" in qwen_info_str:
if "```" not in qwen_info_str:
qwen_info_str += "```"
qwen_info_group = re.search(r'```json(.*?)```', qwen_info_str, re.DOTALL)
json_str = qwen_info_group.group(1).strip().replace("\n", "")
else:
json_str = qwen_info_str.strip().replace("\n", "")
json_data = json.loads(json_str)
return json_data
except Exception as err: # noqa: F841
return None
def fullwidth_to_halfwidth(text):
# Convert full-width characters to their half-width equivalents
result = ''
for char in text:
code_point = ord(char)
# Convert the full-width space directly
if code_point == 0x3000:
code_point = 0x0020
# Convert other full-width characters (except space) to half-width
elif 0xFF01 <= code_point <= 0xFF5E:
code_point -= 0xFEE0
result += chr(code_point)
result = result.replace("", ",")
return result
def remove_unnecessary_spaces(text):
# Remove spaces between Chinese characters
text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
# Remove spaces between Chinese characters and English letters/digits
text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[a-zA-Z0-9])', '', text)
text = re.sub(r'(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fff])', '', text)
# Remove unnecessary spaces before punctuation and keep a single space after it
text = re.sub(r'(?<![0-9])\s*([,.!?:;])\s*', r'\1 ', text)  # punctuation not preceded by a digit
# Add a space between digits and English letters
text = re.sub(r'(?<=[0-9])(?=[a-zA-Z])', ' ', text)
text = re.sub(r'(?<=[a-zA-Z])(?=[0-9])', ' ', text)
text = re.sub(r'\s+', ' ', text)
return text
class KieEvaluator(BaseMetric):
def response_post_func(self, response_text, **kwargs):
response_text = post_process_to_json(response_text, file_name=kwargs.get('file_name', None))
return response_text
def normalize_func(self, text, **kwargs):
halfwidth_text = fullwidth_to_halfwidth(str(text))
cleaned_text = remove_unnecessary_spaces(halfwidth_text)
return cleaned_text
def evaluate(self, response_info, gt_info, **kwargs):
"""
response_info: dict: {"file_name_1": response_1, "file_name_2": response_2}
gt_info: dict: {"file_name_1": gt_1, "file_name_2": gt_2}
kwargs: dataset index config: {'dataset': 'kie_benchmark_POIE', 'group': 'kie', 'op': 'poie', 'num': 250}
"""
# gt should be a dict for kie task, fix for VLMEvalKit
for image_name, label_content in gt_info.items():
if isinstance(label_content, str):
gt_info[image_name] = json.loads(label_content)
response_info = normalize_values_of_nested_dict(response_info, self.normalize_func)
gt_info = normalize_values_of_nested_dict(gt_info, self.normalize_func)
f1_score, class_eval_info, error_info = cal_f1_all(response_info, gt_info)
acc_average, acc_error_info = cal_acc_all(response_info, gt_info)
# summary info
summary_info = {"f1_score": f1_score, "acc": acc_average}
eval_info = {"summary": summary_info, "class_f1_score": class_eval_info,
"f1_error_info": error_info, "acc_error_info": acc_error_info}
return eval_info
if __name__ == '__main__':
pass
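
To make the field-level F1 above concrete, here is a toy run of `cal_f1_all`; the receipt fields are invented for illustration only:

```python
# One field matches ("menu.name"), one does not ("menu.count"), so F1 is about 0.5.
preds = {"receipt_001": {"menu": [{"name": ["cake"], "count": ["2"]}]}}
answers = {"receipt_001": {"menu": [{"name": ["cake"], "count": ["3"]}]}}
f1, class_info, error_info = cal_f1_all(preds, answers)
print(round(f1, 2))              # 0.5
print(class_info["menu.count"])  # {'total_tp': 0, 'total_fn_or_fp': 2, 'acc': 0.0}
```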

View File

@@ -0,0 +1,106 @@
import os
import sys
import json
import re
from collections import Counter
# local import
from .common import BaseMetric
def token_normalize(token_text, is_lower=False, is_alphanum_only=False):
"""
"""
if is_lower:
token_text = token_text.lower()
if is_alphanum_only:
token_text = re.sub('[^A-Za-z0-9]+', '', token_text)
return token_text
def text_normalize_and_tokenize(text, is_keep_blank=True, is_lower=True, is_alphanum_only=False):
text = text.replace("\t", " ").replace("\n", " ").replace("###", "").replace("***", "")
text = re.sub(r'\s+', ' ', text)
if not is_keep_blank:
text = text.replace(" ", "")
text_tokens = text.split(" ") if is_keep_blank else list(text)
text_token_normalized = [token_normalize(t, is_lower, is_alphanum_only) for t in text_tokens]
text_token_normalized = [x for x in text_token_normalized if len(x) > 0]
return text_token_normalized
def evaluate_single_sample(gts, preds):
right_num = 0
gt_counter_info = dict(Counter(gts))
pdt_counter_info = dict(Counter(preds))
for gt_token, gt_count in gt_counter_info.items():
pred_count = pdt_counter_info.get(gt_token, 0)
right_num += min(gt_count, pred_count)
return right_num
def calculate_metrics(response_info, gt_info, is_verbose=False):
"""
"""
macro_recall_list, macro_precision_list, macro_f1_list = [], [], []
total_gt_num, total_pred_num, total_right_num = 0, 0, 0
for file_name, fullbox_gts in gt_info.items():
fullbox_preds = response_info.get(file_name, [])
right_num = evaluate_single_sample(fullbox_gts, fullbox_preds)
total_right_num += right_num
total_gt_num += len(fullbox_gts)
total_pred_num += len(fullbox_preds)
macro_recall = right_num / (len(fullbox_gts) + 1e-9)
macro_precision = right_num / (len(fullbox_preds) + 1e-9)
macro_f1 = 2 * macro_recall * macro_precision / (macro_recall + macro_precision + 1e-9)
macro_recall_list.append(macro_recall)
macro_precision_list.append(macro_precision)
macro_f1_list.append(macro_f1)
# macro
final_macro_recall = sum(macro_recall_list) / (len(macro_recall_list) + 1e-9)
final_macro_precision = sum(macro_precision_list) / (len(macro_precision_list) + 1e-9)
final_macro_f1 = sum(macro_f1_list) / (len(macro_f1_list) + 1e-9)
# micro
recall_acc = total_right_num / (total_gt_num + 1e-9)
preci_acc = total_right_num / (total_pred_num + 1e-9)
hmean = 2 * recall_acc * preci_acc / (recall_acc + preci_acc + 1e-9)
vbs_eval_result = {
'macro_recall': final_macro_recall, 'macro_precision': final_macro_precision, 'macro_f1_score': final_macro_f1,
'micro_recall': recall_acc, 'micro_precision': preci_acc, 'micro_f1_score': hmean
}
eval_result = vbs_eval_result if is_verbose else {'macro_f1_score': final_macro_f1, 'micro_f1_score': hmean}
return eval_result
class OcrEvaluator(BaseMetric):
def response_post_func(self, response_text, **kwargs):
return response_text
def evaluate(self, response_info, gt_info, **kwargs):
# dataset-specific tokenization settings are hard-coded here
dataset_name = kwargs['dataset']
is_word_level, is_lower, is_alphanum_only = True, True, False
if dataset_name in ["Arabic", "Japanese", "Korean"] or "zh" in dataset_name:
is_word_level = False
if "multi_scene_ocr" in self.group_name and is_word_level:
is_alphanum_only = True
eval_config = {"word_level": is_word_level, "alphanum_only": is_alphanum_only, "lowercase": is_lower}
image_pdt_info, image_gt_info = {}, {}
for file_name, gt_src in gt_info.items():
pred_src = response_info.get(file_name, "")
pdt_token_list = text_normalize_and_tokenize(
str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
gt_token_list = text_normalize_and_tokenize(
str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
image_pdt_info[file_name] = pdt_token_list
image_gt_info[file_name] = gt_token_list
eval_result = calculate_metrics(image_pdt_info, image_gt_info, is_verbose=False)
return {"summary": eval_result, "metric_config": eval_config}
if __name__ == '__main__':
pass
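
A toy end-to-end check of the tokenization and F1 computation above; the strings are invented for illustration:

```python
# Two of the three ground-truth tokens are recovered, so precision is 1.0 and recall is 2/3.
gt_tokens = text_normalize_and_tokenize("Hello World 2024")
pdt_tokens = text_normalize_and_tokenize("hello world")
print(calculate_metrics({"img_1": pdt_tokens}, {"img_1": gt_tokens}))  # macro/micro F1 ≈ 0.8
```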

View File

@@ -0,0 +1,682 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
import pandas as pd
import numpy as np
import re
FAIL_MSG = "Failed to obtain answer via API."
frame_tmpl = "frame-{}-of-{}.jpg"
sys_prompt_open_eval_step_1 = (
"You will be provided with a question, a model's prediction, and the ground "
"truth answer for this question.\n"
"Your task is to judge whether the model's prediction is correct based on the "
"meaning of the two texts.\n"
"In most cases, this can be done by determining if the meaning of the model's "
"prediction is consistent with, or contains, the ground truth answer. However, "
"in some cases where the two texts differ, it may represent different "
"descriptions of the same visual scene, in which case visual information is "
"needed for further judgment.\n"
"Therefore, I hope you:\n"
"- Output 0, if the model's prediction and the ground truth answer are neither "
"consistent nor related by inclusion, with fundamentally different meanings.\n"
"- Output 1, if the meaning of the model's prediction and the ground truth "
"answer is consistent, or if the model's prediction meaningfully contains the "
"ground truth answer.\n"
"- Output 2, if the model's prediction and ground truth are not consistent or "
"inclusive, but may be different descriptions of the same visual scene, "
"requiring visual information for further judgment.\n"
"Only output the answer in the following format:\n\n"
'```json\n{"result": choice}\n```\n\n'
"The choice is either 0, 1, or 2 as specified above."
)
sys_prompt_open_eval_step_2 = (
"You will be provided with a question, a model's prediction, and the sampling "
"frames of the clue intervals related to this question.\n"
"Your task is to determine whether the model has answered the question "
"correctly based on the visual information provided.\n"
"Therefore, I hope you:\n"
"- Output 0, if the model's prediction does not correctly answer the question.\n"
"- Output 1, if the model's prediction correctly answers the question.\n"
"Only output the answer in the following format without output extra "
"explanation:\n\n"
'```json\n{"result": choice}\n```\n\n'
"The choice is either 0 or 1 as specified above."
)
FAIL_MSG = "Failed to obtain answer via API."
# '10-20', '20-30', '30-40', '40-50', '50-60'
DURATIONS = ["0 ~ 10", "10 ~ 20", "20 ~ 30", "30 ~ 40", "40 ~ 50", "50 ~ 60", "60+"]
DOMAINS = [
"Life Record",
"Music & TV show",
"Instruction & Knowledge",
"Driving",
"Embodied Expert",
"Humor/funny",
"Electonic/Social Gaming",
"Security & Health",
"Sports & Exercise",
"Special Scenes",
"Art & Culture",
"GUI",
"News",
"Animal & Pet",
]
SUB_CATEGORIES = [
"Time Cognition",
"Hallucination",
"Entity Perception",
"2D Spatial Perception",
"Time Perception",
"Scene Perception",
"Text Perception",
"Event Cognition",
"Entity Cognition",
"Text Cognition",
"Event Perception",
"Scene Cognition",
]
def get_dimention_rating_open_ended(data_path):
# Load the data
df = load(data_path)
df = df[df["score"] != -1]
# Convert seconds to minutes and bin into duration ranges
df["duration_minutes"] = df["duration"] / 60
df["duration_range"] = pd.cut(
df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
)
# Initialize the result dict
result = {
"overall": 0,
"duration": {k: 0 for k in DURATIONS},
"domain": {k: 0 for k in DOMAINS},
"sub_category": {k: 0 for k in SUB_CATEGORIES},
}
# Overall
result["overall"] = round(df["score"].mean(), 4)
# Duration
for dur in DURATIONS:
dur_scores = df[df["duration_range"] == dur]["score"]
result["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
# Domain
for domain in DOMAINS:
domain_scores = df[df["domain"] == domain]["score"]
result["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
# Sub-category
for sub_cat in SUB_CATEGORIES:
sub_cat_scores = df[df["sub_category"] == sub_cat]["score"]
result["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
return result
def get_dimention_rating_mcq_grouding(data_path):
# Load the data
df = load(data_path)
# df.loc[(df['task_mode'] == 'miou') & (df['score'] == -1), 'score'] = 0
df = df[df["score"] != -1]
# Convert seconds to minutes and bin into duration ranges
df["duration_minutes"] = df["duration"] / 60
df["duration_range"] = pd.cut(
df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS
)
# Initialize the result dict
result = {
metric: {
"overall": 0,
"duration": {k: 0 for k in DURATIONS},
"domain": {k: 0 for k in DOMAINS},
"sub_category": {k: 0 for k in SUB_CATEGORIES},
}
for metric in ["long_acc", "clue_acc", "miou", "CRR", "acc@iou", "rec@iou"]
}
# Compute the basic metrics
for metric in ["long_acc", "clue_acc", "miou"]:
metric_df = df[df["task_mode"] == metric]
# Overall
result[metric]["overall"] = round(metric_df["score"].mean(), 4)
# Duration
for dur in DURATIONS:
dur_scores = metric_df[metric_df["duration_range"] == dur]["score"]
result[metric]["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0
# Domain
for domain in DOMAINS:
domain_scores = metric_df[metric_df["domain"] == domain]["score"]
result[metric]["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0
# Sub-category
for sub_cat in SUB_CATEGORIES:
sub_cat_scores = metric_df[metric_df["sub_category"] == sub_cat]["score"]
result[metric]["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0
# Compute the composite metric CRR
def calculate_crr(scores):
long_acc = scores[scores["task_mode"] == "long_acc"]["score"].mean()
clue_acc = scores[scores["task_mode"] == "clue_acc"]["score"].mean()
return round(min(long_acc, clue_acc) / clue_acc, 4) if clue_acc != 0 else 0
# Overall CRR
result["CRR"]["overall"] = calculate_crr(df)
# Duration CRR
for dur in DURATIONS:
dur_df = df[df["duration_range"] == dur]
result["CRR"]["duration"][dur] = calculate_crr(dur_df)
# Domain CRR
for domain in DOMAINS:
domain_df = df[df["domain"] == domain]
result["CRR"]["domain"][domain] = calculate_crr(domain_df)
# Sub-category CRR
for sub_cat in SUB_CATEGORIES:
sub_cat_df = df[df["sub_category"] == sub_cat]
result["CRR"]["sub_category"][sub_cat] = calculate_crr(sub_cat_df)
# Compute acc@iou
def calculate_acc_at_iou_threshold(scores, threshold):
miou_qids = set(scores[scores["task_mode"] == "miou"]["qid"])
long_acc_qids = set(scores[scores["task_mode"] == "long_acc"]["qid"])
valid_qids = miou_qids & long_acc_qids
miou_positive = set(scores[(scores["task_mode"] == "miou") & (scores["score"] > threshold)]["qid"])
long_acc_positive = scores[
(scores["task_mode"] == "long_acc") & (scores["qid"].isin(miou_positive)) & (scores["score"] == 1)
]
acc_at_iou_threshold = len(long_acc_positive) / len(valid_qids) if len(valid_qids) > 0 else 0
return round(acc_at_iou_threshold, 4)
def calculate_acc_at_iou(scores):
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
acc_at_iou_values = [calculate_acc_at_iou_threshold(scores, threshold) for threshold in thresholds]
return round(sum(acc_at_iou_values) / len(acc_at_iou_values), 4)
# Overall acc@iou
result["acc@iou"]["overall"] = calculate_acc_at_iou(df)
# Duration acc@iou
for dur in DURATIONS:
dur_df = df[df["duration_range"] == dur]
result["acc@iou"]["duration"][dur] = calculate_acc_at_iou(dur_df)
# Domain acc@iou
for domain in DOMAINS:
domain_df = df[df["domain"] == domain]
result["acc@iou"]["domain"][domain] = calculate_acc_at_iou(domain_df)
# Sub-category acc@iou
for sub_cat in SUB_CATEGORIES:
sub_cat_df = df[df["sub_category"] == sub_cat]
result["acc@iou"]["sub_category"][sub_cat] = calculate_acc_at_iou(sub_cat_df)
# Compute rec@iou
def calculate_rec_at_iou_threshold(scores, threshold):
# Get all rows with task_mode == "miou"
miou_scores = scores[scores["task_mode"] == "miou"]
# Count samples whose miou score exceeds the threshold
miou_positive = miou_scores[miou_scores["score"] > threshold]
# Compute the ratio
rec_at_iou = len(miou_positive) / len(miou_scores) if len(miou_scores) > 0 else 0
return round(rec_at_iou, 4)
def calculate_rec_at_iou(scores):
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
rec_at_iou_values = [calculate_rec_at_iou_threshold(scores, threshold) for threshold in thresholds]
return round(sum(rec_at_iou_values) / len(rec_at_iou_values), 4)
# Overall rec@iou
result["rec@iou"]["overall"] = calculate_rec_at_iou(df)
# Duration rec@iou
for dur in DURATIONS:
dur_df = df[df["duration_range"] == dur]
result["rec@iou"]["duration"][dur] = calculate_rec_at_iou(dur_df)
# Domain rec@iou
for domain in DOMAINS:
domain_df = df[df["domain"] == domain]
result["rec@iou"]["domain"][domain] = calculate_rec_at_iou(domain_df)
# Sub-category rec@iou
for sub_cat in SUB_CATEGORIES:
sub_cat_df = df[df["sub_category"] == sub_cat]
result["rec@iou"]["sub_category"][sub_cat] = calculate_rec_at_iou(sub_cat_df)
return result
def milliseconds_to_seconds(milliseconds):
return milliseconds / 1000
def sample_frames_clue_average(clues_time_intervals, frame_num, fps):
# Compute the duration (in frames) of each clue interval
clues_frame_intervals = [(round(interval[0] * fps), round(interval[1] * fps)) for interval in clues_time_intervals]
clue_durations = [interval[1] - interval[0] for interval in clues_frame_intervals]
total_duration = sum(clue_durations)
# If frame_num is no less than the total number of frames, return all frames
if frame_num >= total_duration:
return [frame for interval in clues_frame_intervals for frame in range(interval[0], interval[1])]
frames_per_clue = [int(frame_num * (duration / total_duration)) for duration in clue_durations]
frame_indices = []
for i, (interval, num_frames) in enumerate(zip(clues_frame_intervals, frames_per_clue)):
num_frames = max(1, num_frames)
seg_size = (interval[1] - interval[0]) / num_frames
clue_frame_indices = [int(interval[0] + seg_size / 2 + seg_size * idx) for idx in range(num_frames)]
frame_indices.extend(clue_frame_indices)
return frame_indices
def merge_intervals(intervals):
"""
Merge overlapping intervals in a list.
Assumes each interval is a list [start, end].
"""
if not intervals:
return []
# Sort intervals by start time
intervals.sort(key=lambda x: x[0])
merged = [intervals[0]]
for current in intervals[1:]:
last_merged = merged[-1]
# Check if there is an overlap
if current[0] <= last_merged[1]:
# Merge the current interval with the last one
last_merged[1] = max(last_merged[1], current[1])
else:
# No overlap, add current interval
merged.append(current)
return merged
def calculate_intervals_iou(intervals1, intervals2):
"""
Calculate the IoU of two lists of intervals.
Each list contains intervals represented as [start, end].
"""
# Merge overlapping intervals in both lists
merged1 = merge_intervals(intervals1)
merged2 = merge_intervals(intervals2)
# Calculate total length of intervals for both lists
def total_length(merged_intervals):
return sum(end - start for start, end in merged_intervals)
length1 = total_length(merged1)
length2 = total_length(merged2)
# Calculate intersection length
intersection_length = 0
for interval1 in merged1:
for interval2 in merged2:
intersection_start = max(interval1[0], interval2[0])
intersection_end = min(interval1[1], interval2[1])
intersection_length += max(0, intersection_end - intersection_start)
# Calculate union length
union_length = length1 + length2 - intersection_length
# IoU is intersection divided by union
iou = intersection_length / union_length if union_length > 0 else 0
return iou
def post_process(response, right_answer, task_mode, duration):
result = -1
if response:
# Locate the ```json and ``` markers
json_start = response.find("```json")
json_end = response.find("```", json_start + len("```json"))
# If JSON content was found
if json_start != -1 and json_end != -1:
json_content = response[json_start + len("```json"):json_end].strip()
else:
json_content = ""
if json_content:
if task_mode in ["long_acc", "clue_acc"]:
json_content = re.sub(r"(?<=:\s)([A-Za-z_]\w*)", r'"\1"', json_content)
try:
model_result = json.loads(json_content)["result"]
if task_mode in ["long_acc", "clue_acc"]:
result = 1 if right_answer == model_result else 0
elif task_mode == "miou":
if not isinstance(model_result, list):
return -1
if not isinstance(model_result[0], list):
model_result = [model_result]
need_duration = all(interval[0] <= 1 and interval[1] <= 1 for interval in model_result)
if need_duration:
model_result = [[interval[0] * duration, interval[1] * duration] for interval in model_result]
right_answer = eval(right_answer)
result = calculate_intervals_iou(right_answer, model_result)
except Exception as e:
print(f"Error in parsing JSON: {e}, {json_content}")
if result == -1:
if task_mode in ["long_acc", "clue_acc"]:
# Check for uppercase letters A-H and treat them as the model's answer
matches = re.findall(r"\b[A-H]\b", response)
if matches:
result = 1 if right_answer in matches else 0
elif task_mode == "miou":
# Extract all real numbers and pair them into intervals
numbers = re.findall(r"-?\d+\.?\d*", response)
if len(numbers) < 2:
result = -1
else:
if len(numbers) % 2 != 0:
numbers = numbers[:-1]
model_result = [[float(numbers[i]), float(numbers[i + 1])] for i in range(0, len(numbers), 2)]
if type(right_answer) is str:
right_answer = eval(right_answer)
result = calculate_intervals_iou(right_answer, model_result)
return result
def get_timestampes(frame_indices, fps):
seconds = list(map(lambda x: str(round(x / fps, 4)), frame_indices))
timestamps = ", ".join(seconds)
return "A total of {frame_num} frames are sampled. Their corresponding timestamps are:\n\n{timestamps}\n\n".format(
frame_num=len(frame_indices), timestamps=timestamps
)
def post_process_open(response):
model_result = -1
if response and response != FAIL_MSG:
json_start = response.find("```json")
json_end = response.find("```", json_start + len("```json"))
# If JSON content was found
if json_start != -1 and json_end != -1:
json_content = response[json_start + len("```json"):json_end].strip()
else:
json_content = ""
if json_content:
try:
model_result = json.loads(json_content)["result"]
except Exception as e:
print(f"Error in parsing JSON: {e}, {json_content}")
if model_result == -1:
model_result = response
return model_result
def post_process_eval_open(response, step):
model_result = -1
if response and response != FAIL_MSG:
json_start = response.find("```json")
json_end = response.find("```", json_start + len("```json"))
if json_start != -1 and json_end != -1:
json_content = response[json_start + len("```json"):json_end].strip()
else:
json_content = ""
if json_content:
try:
model_result = json.loads(json_content)["result"]
except Exception as e:
print(f"Error in parsing JSON: {e}, {json_content}")
return -1
if model_result == -1:
if step == 1:
match = re.search(r"[012]", response)
if match:
model_result = int(match.group())
else:
match = re.search(r"[01]", response)
if match:
model_result = int(match.group())
return model_result
def eval_open_first(model, line):
user_prompt = ""
user_prompt += f"Question: {line['question']}\n\n"
user_prompt += f"The ground truth answer is '{line['answer']}'\n\n"
user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
result = model.generate(user_prompt)
return result
def save_step_1_steps(data, step_1_results):
# Process all results
data["step_1_result"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 1))
# Conditionally update step-2 results and scores
mask = data["step_1_result"].isin([-1, 0, 1])
data.loc[mask, "step_2_result"] = data.loc[mask, "step_1_result"]
data.loc[mask, "score"] = data.loc[mask, "step_1_result"]
return data
def eval_open_second(model, line, frame_paths):
user_prompt = ""
user_prompt += f"Question: {line['question']}\n\n"
user_prompt += f"The model's prediction is '{line['model_result']}'\n\n"
result = model.generate([user_prompt] + frame_paths)
return result
def save_step_2_steps(data, step_1_results):
# Process all results
data["score"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 2))
return data
def clue_frame_paths(clue_frame_root, qid, num_frames=8):
frame_root = osp.join(clue_frame_root, str(qid))
os.makedirs(frame_root, exist_ok=True)
return [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
def save_clue_video_frames(data_root, clue_frame_root, video, uid, clue_intervals=None, num_frames=8, fps=-1):
if not isinstance(uid, str):
uid = str(uid)
vid_path = osp.join(data_root, video)
vid = decord.VideoReader(vid_path)
vid_fps = vid.get_avg_fps()
if clue_intervals is not None:
# 1. Merge overlapping intervals
merged_intervals = merge_intervals(clue_intervals)
if num_frames > 0 and fps < 0:
# 2. Sample frames uniformly within the clue intervals
indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
frame_paths = clue_frame_paths(clue_frame_root, uid, len(indices))
# Save the frames
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths, indices, vid_fps
def get_chunk_number(filename):
try:
num = filename.split("chunk_")[1].split(".zip")[0]
return int(num)
except (IndexError, ValueError):
return float('inf')
def unzip_hf_zip(pth):
import zipfile
target_dir = pth
if os.path.exists(f"{target_dir}/cg_videos_720p") and os.path.exists(f"{target_dir}/cg_subtitles")\
and os.path.exists(f"{target_dir}/cg_clue_videos"):
print("all exists")
return
video_zip_files = [
os.path.join(target_dir, file)
for file in os.listdir(target_dir)
if file.endswith(".zip") and file.startswith("video")
]
video_zip_files = sorted(video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x)))
videos_temp_zip = os.path.join(target_dir, "videos_merged.zip")
print("Merging video files ...")
with open(videos_temp_zip, "wb") as outfile:
for video_zip_file in tqdm(video_zip_files, desc="Merging videos"):
with open(video_zip_file, "rb") as infile:
outfile.write(infile.read())
print("Extracting video files...")
try:
with zipfile.ZipFile(videos_temp_zip, "r") as zip_ref:
total_files = len(zip_ref.namelist())
for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
zip_ref.extract(file, target_dir)
print(f"Successfully extracted to {target_dir}")
except Exception as e:
print(f"Error during extraction: {e}")
finally:
if os.path.exists(videos_temp_zip):
os.remove(videos_temp_zip)
print("Cleaned up temporary video file")
clue_video_zip_files = [
os.path.join(target_dir, file)
for file in os.listdir(target_dir)
if file.endswith(".zip") and file.startswith("clue_video")
]
clue_video_zip_files = sorted(clue_video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x)))
clue_videos_temp_zip = os.path.join(target_dir, "clue_videos_merged.zip")
print("Merging clue video files ...")
with open(clue_videos_temp_zip, "wb") as outfile:
for clue_video_zip_file in tqdm(clue_video_zip_files, desc="Merging clue_videos"):
with open(clue_video_zip_file, "rb") as infile:
outfile.write(infile.read())
print("Extracting clue video files...")
try:
with zipfile.ZipFile(clue_videos_temp_zip, "r") as zip_ref:
total_files = len(zip_ref.namelist())
for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
zip_ref.extract(file, target_dir)
print(f"Successfully extracted to {target_dir}")
except Exception as e:
print(f"Error during extraction: {e}")
finally:
if os.path.exists(clue_videos_temp_zip):
os.remove(clue_videos_temp_zip)
print("Cleaned up temporary clue video file")
print("Extracting subtitle files ...")
subtitles_zip = os.path.join(target_dir, "subtitles.zip")
try:
with zipfile.ZipFile(subtitles_zip, "r") as zip_ref:
total_files = len(zip_ref.namelist())
for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files):
zip_ref.extract(file, target_dir)
print(f"Successfully extracted to {target_dir}")
except Exception as e:
print(f"Error during extraction: {e}")

View File

@@ -0,0 +1,13 @@
import json
import argparse
from collections import defaultdict
def is_correct(predict, answer):
# predict is the ground-truth answer; answer is the prediction
if len(answer) == 1:
return answer[0] == predict[0]
elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']:
return answer[0] == predict[0]
elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']:
return predict[4:].lower() in answer.lower()
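
A few example calls to clarify the branches above; argument order follows the comment (ground-truth choice first, prediction second) and the strings are invented:

```python
print(is_correct("B", "B"))                      # True: single-letter answers compared directly
print(is_correct("B. cat", "B) the cat"))        # True: leading option letters match
print(is_correct("(A) a red car", "a red car"))  # True: free-form prediction matched against the option text
```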

View File

@@ -0,0 +1,54 @@
from ...smp import *
import os
def report_acc_hrbench(df):
cycle_group = df.groupby('cycle_category')
result_dic = defaultdict(list)
avg_dic = defaultdict(int)
count = 0
for key, data_value in cycle_group:
count += 1
_, resp_dic = hrbench_score(data_value)
for task_type, accuracy in resp_dic.items():
result_dic['cycle'].append(key)
result_dic['type'].append(task_type)
result_dic['accuracy'].append(accuracy)
avg_dic[task_type] += accuracy
for task_type, accuracy in avg_dic.items():
result_dic['cycle'].append('Average')
result_dic['type'].append(task_type)
result_dic['accuracy'].append(accuracy / count)
result_pd = pd.DataFrame(result_dic)
return result_pd
def hrbench_score(data):
ret = defaultdict(list)
resp_dic = {}
category_list = set(data['category'])
score_dict = defaultdict(list)
for i in range(len(data)):
d = data.iloc[i]
category = d['category']
gpt_score = d['hit']
score_dict[category].append(gpt_score)
score_dict['all'].append(gpt_score)
all_acc = np.mean(score_dict['all'])
ret['type'].append('all')
ret['acc'].append(all_acc)
resp_dic['all'] = all_acc
for cate in category_list:
acc = np.mean(score_dict[cate])
ret['type'].append(cate)
ret['acc'].append(acc)
resp_dic[cate] = acc
return pd.DataFrame(ret), resp_dic
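
A toy illustration of the accuracy aggregation above; the rows below are fabricated purely to show the expected input columns (`cycle_category`, `category`, `hit`):

```python
import pandas as pd

demo = pd.DataFrame({
    "cycle_category": [0, 0, 1, 1],
    "category": ["FSP", "FCP", "FSP", "FCP"],
    "hit": [1, 0, 1, 1],
})
print(report_acc_hrbench(demo))  # per-cycle rows plus an 'Average' row for 'all', 'FSP', 'FCP'
```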

View File

@@ -1,11 +1,11 @@
import os
from ...api import OpenAIWrapper
from ...smp import load_env
INTERNAL = os.environ.get('INTERNAL', 0)
def build_judge(**kwargs):
from ...api import OpenAIWrapper, SiliconFlowAPI
model = kwargs.pop('model', None)
kwargs.pop('nproc', None)
load_env()
@@ -19,12 +19,20 @@ def build_judge(**kwargs):
'chatgpt-1106': 'gpt-3.5-turbo-1106',
'chatgpt-0125': 'gpt-3.5-turbo-0125',
'gpt-4o': 'gpt-4o-2024-05-13',
'gpt-4o-0806': 'gpt-4o-2024-08-06',
'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
'qwen-7b': 'Qwen/Qwen2.5-7B-Instruct',
'qwen-72b': 'Qwen/Qwen2.5-72B-Instruct',
'deepseek': 'deepseek-ai/DeepSeek-V2.5',
}
model_version = model_map[model]
else:
model_version = LOCAL_LLM
model = OpenAIWrapper(model_version, **kwargs)
if model in ['qwen-7b', 'qwen-72b', 'deepseek']:
model = SiliconFlowAPI(model_version, **kwargs)
else:
model = OpenAIWrapper(model_version, **kwargs)
return model
@@ -32,7 +40,7 @@ DEBUG_MESSAGE = """
To debug the OpenAI API, you can try the following scripts in python:
```python
from vlmeval.api import OpenAIWrapper
model = OpenAIWrapper('gpt-4-1106-preview', verbose=True)
model = OpenAIWrapper('gpt-4o', verbose=True)
msgs = [dict(type='text', value='Hello!')]
code, answer, resp = model.generate_inner(msgs)
print(code, answer, resp)

View File

@@ -0,0 +1,150 @@
import pandas as pd
# from colorama import Fore, Back, Style
from ...smp import *
FAIL_MSG = 'Failed to obtain answer via API.'
def build_prompt_logicvista(line):
question = line['question']
prediction = str(line['prediction'])
tmpl = (
"You are a information extractor that extracts multiple choice letter answer choices "
"from a paragraph that contains the answer choice and sometimes explaination of why that "
"choice is correct to the given question.\n"
"What letter did the following answer choose? If the answer did not select a letter answer choice, "
"first try to infer the answer based off the given choices.\n"
"If it does not seem like the given answer corresponds to an answer choice OR if there is no selected answer, please just respond with Z.\n"
"Make sure you answer with ONLY the letters chosen.\n"
'Example 1: \n'
'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
'Answer: <start>\na cute teddy bear\n<end>\nYour output: A\n'
'Example 2: \n'
'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
'Answer: <start>\nSpider\n<end>\nYour output: Z\n'
'Example 3: \n'
'Question: <start>\nWhich figure is a rotation of the object?\n<end>\n'
'Answer: <start>\nThe figure on the right, labeled "D," is a rotation of the object shown in the top left corner.\n<end>\nYour output: D\n'
'Example 4: \n'
'Question: <start>\nWhich of the boxes comes next in the sequence? Select from A-E\n<end>\n'
'Answer: <start>\nThe sequence of the boxes is A, B, C, D, E.\n<end>\nYour output: ABCDE\n'
'Example 5: \n'
'Question: <start>\n{}\n<end>\nAnswer: <start>\n{}\n<end>\nYour output: '
)
return tmpl.format(question, prediction)
def LogicVista_auxeval(model, line):
prompt = build_prompt_logicvista(line)
print(prompt)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
answer = line['answer'].split(", ")
for j in range(0, len(answer)):
answer[j] = answer[j].lower()
answer.sort()
answer = ''.join(answer)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
elif not res.isupper() or not res.isalpha():
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
hit = 0
extracted = [alpha.lower() for alpha in res]
extracted.sort()
extracted = ''.join(extracted)
if extracted == answer:
hit = 1
return dict(log=log, res=res, hit=hit)
log += 'All 5 retries failed.\n'
return dict(log=log, res='', hit=0)
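# Illustrative note on the matching above: multi-letter answers are normalized before
# comparison, e.g. a ground truth of "B, A" becomes 'ab', and a judge output of "BA"
# or "AB" also normalizes to 'ab', so either form counts as a hit.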
cat = ["diagram", "ocr", "patterns", "graphs", "tables", "3d shapes", "puzzles", "sequences", "physics"]
def evaluate_logicvista(file_path):
df = pd.read_excel(file_path)
tot = defaultdict(lambda: 0)
hit = defaultdict(lambda: 0)
acc = defaultdict(lambda: 0)
lt = len(df)
skill_list = []
df_tot = df
df_inductive = df[df["skill"].str.contains("inductive")]
df_deductive = df[df["skill"].str.contains("deductive")]
df_numerical = df[df["skill"].str.contains("numerical")]
df_spatial = df[df["skill"].str.contains("spatial")]
df_mechanical = df[df["skill"].str.contains("mechanical")]
tot_correct = df_tot["hit"].sum()
tot_acc = (tot_correct / df_tot.shape[0]) * 100
tot['Overall'] = df_tot.shape[0]
hit['Overall'] = tot_correct
acc['Overall'] = tot_acc
inductive_correct = df_inductive["hit"].sum()
inductive_acc = (inductive_correct / df_inductive.shape[0]) * 100
tot["inductive"] = df_inductive.shape[0]
hit["inductive"] = inductive_correct
acc["inductive"] = inductive_acc
deductive_correct = df_deductive["hit"].sum()
deductive_acc = (deductive_correct / df_deductive.shape[0]) * 100
tot["deductive"] = df_deductive.shape[0]
hit["deductive"] = deductive_correct
acc["deductive"] = deductive_acc
numerical_correct = df_numerical["hit"].sum()
numerical_acc = (numerical_correct / df_numerical.shape[0]) * 100
tot["numerical"] = df_numerical.shape[0]
hit["numerical"] = numerical_correct
acc["numerical"] = numerical_acc
spatial_correct = df_spatial["hit"].sum()
spatial_acc = (spatial_correct / df_spatial.shape[0]) * 100
tot["spatial"] = df_spatial.shape[0]
hit["spatial"] = spatial_correct
acc["spatial"] = spatial_acc
mechanical_correct = df_mechanical["hit"].sum()
mechanical_acc = (mechanical_correct / df_mechanical.shape[0]) * 100
tot["mechanical"] = df_mechanical.shape[0]
hit["mechanical"] = mechanical_correct
acc["mechanical"] = mechanical_acc
# capability dimension, the official data json does not contain 'capability' column, so it is now ignored
# for i in cat:
# curr = df[df["capability"].str.contains(i.replace(" ", ""))]
# correct = curr["hit"].sum()
# accuracy = (correct / curr.shape[0]) * 100
# tot[i] = curr.shape[0]
# hit[i] = correct
# acc[i] = accuracy
res = defaultdict(list)
for k in tot.keys():
res['Task&Skill'].append(k)
res['tot'].append(tot[k])
res['hit'].append(hit[k])
res['acc'].append(acc[k])
res = pd.DataFrame(res)
return res

View File

@@ -0,0 +1,80 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
import numpy as np
import re
FAIL_MSG = 'Failed to obtain answer via API.'
DURATIONS = [15, 60, 600, 3600]
TASK_CATEGORIES = [
"S2E", "S2O", "S2A",
"E2O", "O2E", "T2E",
"T2O", "T2A", "E3E",
"O3O", "SSS", "SOS",
"SAA", "T3E", "T3O",
"TOS", "TAA"
]
def get_dimension_rating(data_path):
data = load(data_path)
print(data.iloc[0])
duration_rating = {k: {} for k in DURATIONS}
for duration in DURATIONS + ['overall']:
duration_rating[duration] = {
'overall': '',
'question_category': {k: [] for k in TASK_CATEGORIES}
}
for i in range(len(data)):
task_ctg = data.iloc[i]['question_category']
duration = data.iloc[i]['duration_group']
duration_rating[duration]['question_category'][task_ctg].append(data.iloc[i]['score'])
duration_rating['overall']['question_category'][task_ctg].append(data.iloc[i]['score'])
for duration in DURATIONS + ['overall']:
overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["question_category"].values(), []) if x >= 0]):.3f}' # noqa: E501
duration_rating[duration]['overall'] = overall_res_dur
for task_ctg in TASK_CATEGORIES:
task_res_dur = f'{np.mean([x for x in duration_rating[duration]["question_category"][task_ctg] if x >= 0]):.3f}' # noqa: E501
duration_rating[duration]['question_category'][task_ctg] = task_res_dur
return duration_rating
def extract_option(model, input_item, dataset_name):
options = input_item['question'].split('\n')[1:]
for id, option in enumerate(options):
option_id = chr(ord('A') + id) + '.'
if option.find(option_id) >= 0:
input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n')
return extract_answer_from_item(model, input_item, dataset_name)['opt']
def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
'The best option is',
'The correct option is',
'Best answer:',
'Best option:',
'Answer:',
'Option:',
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCDE]', s):
return ''
matches = re.search(r'[ABCDE]', s)
if matches is None:
return ''
return matches[0]
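# Illustrative examples of the regex fallback above (not exhaustive):
#   extract_characters_regex('The best answer is (B) because ...')                      -> 'B'
#   extract_characters_regex('I cannot tell from the given video frames what happens')  -> ''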

View File

@@ -2,8 +2,9 @@ from ...smp import *
from ...utils import can_infer
try:
from latex2sympy2 import latex2sympy
except ImportError:
print('Please install latex2sympy2 by running "pip install latex2sympy2"')
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical('Please install latex2sympy2 by running "pip install latex2sympy2"')
FAIL_MSG = 'Failed to obtain answer via API.'

View File

@@ -0,0 +1,193 @@
from ...smp import *
from ...utils import can_infer
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_extract_ICE():
example_1 = """
1.
Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
Extracted Answer: (-2, 1)
""" # noqa
example_2 = """
2.
Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
Extracted Answer: D
""" # noqa
example_3 = """
3.
Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)
""" # noqa
example_4 = """
4.
Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
Extracted Answer: null
""" # noqa
example_5 = """
5.
Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
Extracted answer: 22.3
""" # noqa
example_6 = """
6.
Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
Extracted answer: f(x) = -x^2 - 2x + 1
""" # noqa
return [example_1, example_2, example_3, example_4, example_5, example_6]
def get_gpt4_score_ICE():
example_1 = """
[Question]: Write the set of numbers represented on the number line in interval notation.
[Standard Answer]: (-2,1]
[Model_answer] : Extracted Answer: \\((-2, 1)\\)
Judgement: 0
""" # noqa
example_2 = """
[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : B:2\u221a{{3}}
Judgement: 0
""" # noqa
example_3 = """
[Question]: Find the domain and range of the function f using interval notation.
[Standard Answer]: domain: [-4, 0) and range: (-3, 1]
[Model_answer] : Range: \\((-4, 1]\\)
Judgement: 0
""" # noqa
example_4 = """
[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : null
Judgement: 0
""" # noqa
return [example_1, example_2, example_3, example_4]
def build_mathverse_gpt4_extract_prompt(line):
task_description = """
I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. Directly output the extracted answer with no explanation.\n\n
""" # noqa
prediction = str(line['prediction'])
demo_prompt = task_description
examples = get_gpt4_extract_ICE()
for example in examples:
demo_prompt += example + '\n\n'
test_prompt = f"Model response: '{prediction}'\nExtracted Answer: "
full_prompt = f'{demo_prompt}7.\n{test_prompt}'
return full_prompt
def build_mathverse_gpt4_score_prompt(line):
task_description = """
Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
Please note that the two answers are considered consistent only when the [Model_answer] completely matches the [Standard Answer]. For non-multiple-choice questions, answers that express the same meaning in a different form, for example 0.5m and 50cm, are also considered consistent.
If they are consistent, Judgement is 1; if they are different, Judgement is 0.\n\n
""" # noqa
question_for_eval = line['question_for_eval']
extract = line['extract']
answer = line['answer']
demo_prompt = task_description
examples = get_gpt4_score_ICE()
for example in examples:
demo_prompt += example + '\n\n'
test_prompt = f"""
[Question]: {question_for_eval}
[Standard Answer]: {answer}
[Model_answer] : {extract}
Judgement:"""
full_prompt = f'{demo_prompt}{test_prompt}'
return full_prompt
def post_check_score(line, prefetch=False):
ans = str(line['answer']).strip()
response = str(line['extract']).strip()
if response == ans:
return response if prefetch else True
else:
return False
def MathVerse_auxeval_extract(model, line):
prompt = build_mathverse_gpt4_extract_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log_extract=log, extract=res)
log += 'All 5 retries failed.\n'
return dict(log_extract=log, extract='')
def MathVerse_auxeval_score(model, line):
prompt = build_mathverse_gpt4_score_prompt(line)
log = ''
retry = 5
if post_check_score(line, prefetch=True):
return dict(log_score='Prefetch succeed', score=True)
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res or res.strip() not in ['0', '1']:
log += f'Try {i}: output is {prediction}, res is {res}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log_score=log, score=int(res) == 1)
log += 'All 5 retries failed.\n'
return dict(log_score=log, score=False)
def MathVerse_acc(result_file):
df = load(result_file)
df['metadata'] = df['metadata'].apply(lambda x: x.replace("'", '"'))
df['metadata'] = df['metadata'].apply(json.loads)
df_metadata = pd.json_normalize(df['metadata'])
df = pd.concat([df.drop('metadata', axis=1), df_metadata], axis=1)
subset = list(set(df['problem_version']))
res = defaultdict(list)
for p in subset:
if p != 'Overall':
sub = df[df['problem_version'] == p]
else:
sub = cp.deepcopy(df)
res['split'].append(p)
# Overall Acc
res['Overall'].append(np.mean(sub['score']) * 100)
# Subject
subjects = set(df['subject'])
for k in subjects:
res[k].append(np.mean(sub[sub['subject'] == k]['score']) * 100)
# Subfield
subfields = set(df['subfield'])
for k in subfields:
res[k].append(np.mean(sub[sub['subfield'] == k]['score']) * 100)
return pd.DataFrame(res)

View File

@@ -0,0 +1,189 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
from PIL import Image, ImageOps
import numpy as np
FAIL_MSG = 'Failed to obtain answer via API.'
system_prompt_sub_scene = """
##TASK DESCRIPTION:
You are required to evaluate a respondent's answer based on a provided question, some scoring points, and the respondent's answer. You should provide two scores. The first is the accuracy score, which should range from 1 to 5. The second is the relevance score, which should also range from 1 to 5. Below are the criteria for each scoring category.
##ACCURACY Scoring Criteria:
Evaluate the respondent's answer against specific scoring points as follows:
Score 1: The response completely misses the scoring point.
Score 3: The response mentions content related to the scoring point but is not entirely correct.
Score 5: The response accurately addresses the scoring point.
Calculate the average score across all scoring points to determine the final accuracy score.
##RELEVANCE Scoring Criteria:
Assess how the respondent's answer relates to the original question:
Score 1: The response is completely off-topic from the question.
Score 2: The response is partially related to the question but contains a significant amount of irrelevant content.
Score 3: The response primarily addresses the question, but the respondent seems uncertain about their own answer.
Score 4: The response mostly addresses the question and the respondent appears confident in their answer.
Score 5: The response is fully focused on addressing the question with no irrelevant content and demonstrates complete certainty.
----
##INSTRUCTION:
1. Evaluate Accuracy: First, assess and score each scoring point based on the respondent's answer. Calculate the average of these scores to establish the final accuracy score. Provide a detailed rationale before assigning your score.
2. Evaluate RELEVANCE: Assess the relevance of the respondent's answer to the question. Note that when evaluating relevance, the correctness of the answer is not considered; focus solely on how relevant the answer is to the question. Provide a comprehensive rationale before assigning your score.
3. Output Scores in JSON Format: Present the scores in JSON format as follows:
{'score_accuracy': score_acc, 'score_relevance': score_rele, 'total_score': score_acc + score_rele}
""" # noqa
system_prompt_summary = """
##TASK DESCRIPTION:
You are required to evaluate the performance of the respondent in the video summarization task based on the standard answer and the respondent's answer. You should provide two scores. The first is the COMPLETENESS score, which should range from 1 to 5. The second is the RELIABILITY score, which should also range from 1 to 5. Below are the criteria for each scoring category:
##COMPLETENESS Scoring Criteria:
The completeness score focuses on whether the summary covers all key points and main information from the video.
Score 1: The summary hardly covers any of the main content or key points of the video.
Score 2: The summary covers some of the main content and key points but misses many.
Score 3: The summary covers most of the main content and key points.
Score 4: The summary is very comprehensive, covering most to nearly all of the main content and key points.
Score 5: The summary completely covers all the main content and key points of the video.
##RELIABILITY Scoring Criteria:
The reliability score evaluates the correctness and clarity of the video summary. It checks for factual errors, misleading statements, and contradictions with the video content. If the respondent's answer includes details that are not present in the standard answer, as long as these details do not conflict with the correct answer and are reasonable, points should not be deducted.
Score 1: Contains multiple factual errors and contradictions; presentation is confusing.
Score 2: Includes several errors and some contradictions; needs clearer presentation.
Score 3: Generally accurate with minor errors; minimal contradictions; reasonably clear presentation.
Score 4: Very accurate with negligible inaccuracies; no contradictions; clear and fluent presentation.
Score 5: Completely accurate with no errors or contradictions; presentation is clear and easy to understand.
----
##INSTRUCTION:
1. Evaluate COMPLETENESS: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
2. Evaluate RELIABILITY: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
3. Output Scores in JSON Format: Present the scores in JSON format as follows:
{'score_completeness': score_comp, 'score_reliability': score_reli, 'total_score': score_comp + score_reli}
""" # noqa
def check_ans_with_model(pred, gt, model, item, dataset_name='MLVU_MCQ'):
flag = False
index = gt.index("(") # noqa
index2 = gt.index(")") # noqa
gt_option = gt[index + 1: index2]
if ")" in pred:
index3 = pred.index(")")
pred = pred[index3 - 1: index3]
if pred == gt_option:
flag = True
elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']:
flag = True
return flag
def extract_scores_summary(text):
# Define the keys to locate in the text
keys = ["score_completeness", "score_reliability"]
scores = []
for key in keys:
# Find the index where each key starts
start_index = text.find(key)
if start_index == -1:
continue # Skip if key is not found
# Find the start of the number which is after the colon and space
start_number_index = text.find(":", start_index) + 2
end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
# Extract and convert the number to float
score = float(text[start_number_index:end_number_index])
scores.append(score)
return scores
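# Illustrative sketch of the parsing above: for a judge reply that ends with
#   {'score_completeness': 4, 'score_reliability': 5, 'total_score': 9}
# extract_scores_summary returns [4.0, 5.0], and check_ans_with_model_summary /
# MLVU_OpenEnded_extract then sum these to 9.0 as the item score (keys missing
# from the reply are simply skipped).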
def check_ans_with_model_summary(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
user_prompt = f"""
Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
Standard Answer: {gt}
Respondent's Answer: {pred}
""" # noqa
result = model.generate(user_prompt)
result = extract_scores_summary(result)
result = np.sum(result)
return result
def extract_scores_sub_scene(text):
# Define the keys to locate in the text
keys = ["score_accuracy", "score_relevance"]
scores = []
for key in keys:
# Find the index where each key starts
start_index = text.find(key)
if start_index == -1:
continue # Skip if key is not found
# Find the start of the number which is after the colon and space
start_number_index = text.find(":", start_index) + 2
end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
# Extract and convert the number to float
score = float(text[start_number_index:end_number_index])
scores.append(score)
return scores
def check_ans_with_model_sub_scene(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
user_prompt = f"""
Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
Question: {item['question']}
Scoring Points: {item['scoring_points']}
Respondent's Answer: {pred}
""" # noqa
result = model.generate(user_prompt)
result = extract_scores_sub_scene(result)
result = np.sum(result)
return result
def MLVU_OpenEnded_generate(model, line):
task_type = line['task_type']
if task_type == 'summary':
user_prompt = (
f"Please score the respondent's answer according to the steps in the Instructions. "
f"You must end with a JSON dict to store the scores.\n"
f"Standard Answer: {line['answer']}\n"
f"Respondent's Answer: {line['prediction']}\n"
)
elif task_type == 'sub_scene':
user_prompt = (
f"Please score the respondent's answer according to the steps in the Instructions. "
f"You must end with a JSON dict to store the scores.\n"
f"Question: {line['question']}\n"
f"Scoring Points: {line['scoring_points']}\n"
f"Respondent's Answer: {line['prediction']}\n"
)
else:
raise ValueError(f'MLVU does not have an open-ended task of type {task_type}!')
result = model.generate(user_prompt)
return result
def MLVU_OpenEnded_extract(gpt_generate_data, org_data):
extract_func = {
'sub_scene': extract_scores_sub_scene,
'summary': extract_scores_summary
}
for idx, item in org_data.iterrows():
func = extract_func[item['task_type']]
text = gpt_generate_data[idx]
org_data.loc[idx, 'score'] = np.sum(func(text))
return org_data
def get_dimension_rating(data_path):
data = load(data_path)
result_dict = {}
for idx, item in data.iterrows():
if item['task_type'] not in result_dict:
result_dict[item['task_type']] = [0,0]
result_dict[item['task_type']][0] += int(item['score'])
result_dict[item['task_type']][1] += 1
return result_dict

View File

@@ -118,7 +118,7 @@ def mmdu_score(model, line):
f'{",".join([x for x in DIMS if x not in result_dict])}'
)
except Exception as e:
print({e})
logging.warning(str(e))
all_result_dict.append({d: None for d in DIMS})
logs.append(str(e))

View File

@@ -0,0 +1,298 @@
import re
import json
def has_word(sentence, word):
pattern = r'\b' + re.escape(word) + r'\b'
match = re.search(pattern, sentence)
if match:
return True
else:
return False
class VQAEval:
def __init__(self):
self.contractions = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
self.manualMap = {
'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
'fourteen': 14, 'fifteen': 15, 'sixteen': 16,
'seventeen': 17, 'eighteen': 18, 'nineteen': 19,
'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90}
self.articles = ['a', 'an', 'the']
self.periodStrip = re.compile('(?!<=\\d)(\\.)(?!\\d)')
self.commaStrip = re.compile('(\\d)(\\,)(\\d)')
self.punct = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def evaluate(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
if isinstance(gt_answers, list):
for i in range(len(gt_answers)):
gt_answers[i] = str(gt_answers[i])
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1
return 0
else:
gt_answers = gt_answers.replace('\n', ' ')
gt_answers = gt_answers.replace('\t', ' ')
gt_answers = gt_answers.strip()
gt_answers = self.processPunctuation(gt_answers)
gt_answers = self.processDigitArticle(gt_answers)
if has_word(answer, gt_answers):
return 1
else:
return 0
def evaluate_MRR(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
assert isinstance(gt_answers, list)
for i in range(len(gt_answers)):
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1 / (i + 1)
return 0.0
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + ' ' in inText or ' ' + p in inText) or (
re.search(self.commaStrip, inText) is not None
):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = self.periodStrip.sub('', outText, re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = [str(text) for text in outText]
outText = ' '.join(outText)
return outText
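# Illustrative behaviour of the normalization above (assumes Python 3.7+ re.escape):
# punctuation is stripped, number words map to digits, and articles are dropped, so
#   VQAEval().evaluate('There are two dogs.', ['2 dogs'])  -> 1
# because the processed prediction 'there are 2 dogs' contains the processed GT '2 dogs'.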
def is_correct(answer, response):
# response_orig = response
response = response.strip('.')
if isinstance(answer, int):
if response.isdigit():
return int(int(response) == answer)
response = response.lower()
response = response.replace('the answer is', '')
response = response.replace('*', '') # parse **A**
if response.find('.') != -1:
response = response.split('.')[0]
response = response.replace(',', '')
response = response.strip()
response = response.strip()
if response == 'none':
return 0
if 'the camera is moving left' in response:
response = 'a'
elif 'the camera is moving right' in response:
response = 'b'
if len(response) != 1:
# print(f"Fail to parse {response_orig}")
return 0
return (ord(response) - ord('a')) == answer
if isinstance(answer, list):
try:
response = response.replace('json', '').replace('```', '').strip()
response = json.loads(response)
if isinstance(response, dict):
response = sum(list(response.values()), start=[])
except:
# print(f"Fail to parse {response_orig} Exception: {e}")
return 0
if not isinstance(response, (list, tuple)):
# print(f"Fail to parse {response_orig} Exception: not a list!")
return 0
match = 0
for res, ans in zip(response, answer):
match += res == ans
return match / len(answer)
return VQAEval().evaluate(response, answer)
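# Illustrative behaviour of is_correct for the three answer types it handles:
#   is_correct(2, 'C.')           -> True   (single letter mapped to a 0-based index)
#   is_correct(5, '5')            -> 1      (digit strings are compared directly)
#   is_correct([1, 0], '[1, 0]')  -> 1.0    (list answers are scored element-wise)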

View File

@@ -2,6 +2,7 @@ import pandas as pd
from ...utils import can_infer, track_progress_rich
from ...smp import *
import numpy as np
import re
MMB_abbrs = {
'coarse_perception': 'CP',
@@ -170,6 +171,31 @@ def build_prompt(question, options, prediction):
return tmpl.format(question, options, prediction)
def build_prompt_wemath(question, prediction):
tmpl = (
'You are an AI assistant who will help me to match '
'an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meanings of all options are significantly different from the answer, output Z. '
'You should output a single uppercase character in A, B, C, D, E, F, G (if they are valid options), and Z. \n'
'Example 1: \n'
'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
'Answer: <start>\na cute teddy bear\n<end>\nYour output: A\n'
'Example 2: \n'
'Question: <start>\nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n<end>\n'
'Answer: <start>\nSpider\n<end>\nYour output: Z\n'
'Example 3: \n'
'Question: <start>\n{}\n<end>\nAnswer: <start>\n{}\n<end>\nYour output: '
)
question = question.replace(
("Regarding the format, please answer following the template below, and be sure to include two <> symbols:\n"
"<Thought process>: <<your thought process>> <Answer>: <<your option>>"),
'',
)
return tmpl.format(question, prediction)
def build_prompt_blink(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match an answer with several options of a single-choice question. '
@@ -241,6 +267,8 @@ def extract_answer_from_item(model, item, dataset_name=None):
if dataset_name == 'BLINK':
prompt = build_prompt_blink(item['question'], option_str, item['prediction'])
elif dataset_name == 'WeMath':
prompt = build_prompt_wemath(item['question'], item['prediction'])
elif cn_string(item['question']):
prompt = build_prompt_cn(item['question'], option_str, item['prediction'])
else:
@@ -359,9 +387,7 @@ def mcq_vanilla_eval(model, data, meta, nproc, result_file, dataset_name=None):
res = track_progress_rich(eval_vanilla, tups, nproc=nproc, chunksize=nproc, save=result_file, keys=keys)
result = load(result_file)
for k, v in zip(keys, res):
if k in result:
assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
else:
if k not in result:
result[k] = v
data['hit'] = [result[i]['hit'] for i in data['index']]
data['log'] = [result[i]['log'] for i in data['index']]
@@ -425,9 +451,7 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
keys=keys)
result = load(result_file)
for k, v in zip(keys, res):
if k in result:
assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
else:
if k not in result:
result[k] = v
tmp_pth = f'/tmp/{timestr()}.xlsx'
@@ -440,3 +464,95 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
data_main.pop('GT')
return data_main
def extract_characters_regex(s, choices=['(A)', '(B)', '(C)', '(D)', '(E)']):
if type(s) is dict:
s = ''
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
'The best option is',
'The correct option is',
'Best answer:',
'Best option:',
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCDE]', s):
return ''
matches = re.search(r'[ABCDE]', s)
if matches is None:
for choice in choices:
if s.lower() in choice.lower():
return choice[1]
return ''
return matches[0]
def get_dimension_rating(data_path):
TASKS = [
'Reasoning',
'Perception',
]
SUBTASKS = [
'Monitoring',
'Autonomous_Driving',
'OCR with Complex Context',
'Diagram and Table',
'Remote Sensing',
]
data = load(data_path)
results = {}
results['Overall'] = {}
for task in TASKS:
results[f'{task}'] = {}
for subtask in SUBTASKS:
results[f'{task}'][f'{subtask}'] = {}
for i in range(len(data)):
question = data.iloc[i]
Task = question['category'].split('/')[0]
Subtask = question['category'].split('/')[1]
Category = question['l2-category'].lower()
if 'attribute' in Category.lower():
Category = Category.split('/')[0] + '/attribute'
if question['score'] >= 0:
cnt = question['score']
if Category not in results[Task][Subtask].keys():
results[Task][Subtask][f'{Category}'] = {'true': cnt, 'false': 1 - cnt}
else:
results[Task][Subtask][f'{Category}']['true'] += cnt
results[Task][Subtask][f'{Category}']['false'] += 1 - cnt
sum_all, succ_all = 0, 0
for task, tasks_values in results.items():
cnt_task, sum_task = 0, 0
for substask, subtask_value in tasks_values.items():
cnt_subtask, sum_subtask = 0, 0
for category, category_dict in subtask_value.items():
cnt_subtask += category_dict['true']
sum_subtask += category_dict['false'] + category_dict['true']
acc = category_dict['true'] / (category_dict['false'] + category_dict['true'])
results[task][substask][category] = acc
if sum_subtask == 0:
acc_subtasks = 0
else:
acc_subtasks = cnt_subtask / sum_subtask
cnt_task += cnt_subtask
sum_task += sum_subtask
results[task][substask]['Avg'] = acc_subtasks
if sum_task == 0:
acc_task = 0
else:
acc_task = cnt_task / sum_task
succ_all += cnt_task
sum_all += sum_task
results[task]['Avg'] = acc_task
results['Overall'] = succ_all / sum_all
return results

View File

@@ -1,4 +1,5 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
from PIL import Image, ImageOps
import torchvision
import random
@@ -32,9 +33,9 @@ def get_dimension_rating(data_path):
def check_ans(pred, gt):
flag = False
pred_list = pred.lower().split(' ')
pred_list = pred.lower().strip().split(' ')
pred_option, _ = pred_list[0], ' '.join(pred_list[1:])
gt_list = gt.lower().split(' ')
gt_list = gt.lower().strip().split(' ')
gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
if gt_content[-1] == '.':
gt_content = gt_content[:-1]
@@ -47,6 +48,64 @@ def check_ans(pred, gt):
return flag
def check_ans_with_model(pred, gt, model, item, dataset_name='MVBench'):
flag = False
pred_list = pred.lower().strip().split(' ')
pred_option, _ = pred_list[0], ' '.join(pred_list[1:])
gt_list = gt.lower().strip().split(' ')
gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
if gt_content[-1] == '.':
gt_content = gt_content[:-1]
if pred_option.replace('.', '') in gt_option:
flag = True
elif gt_option in pred_option:
flag = True
elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']:
flag = True
return flag
def check_ans_advanced(pred, gt):
number_table = {
0: 'zero',
1: 'one',
2: 'two',
3: 'three',
4: 'four',
5: 'five',
6: 'six',
7: 'seven',
8: 'eight',
9: 'nine',
}
flag = False
pred_list = pred.lower().strip().split(' ')
pred_option, _ = pred_list[0], ' '.join(pred_list[1:])
gt_list = gt.lower().strip().split(' ')
gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
if gt_content[-1] == '.':
gt_content = gt_content[:-1]
try:
gt_content = number_table[int(gt_content.strip('. \n'))]
print(gt_content)
except:
pass
if pred_option.replace('.', '') in gt_option:
flag = True
elif gt_option in pred_option:
flag = True
elif gt_content.lower().strip('. \n') in pred.lower().strip('. \n'):
flag = True
return flag
class GroupRandomCrop(object):
def __init__(self, size):
if isinstance(size, numbers.Number):

View File

@@ -0,0 +1,145 @@
import re
def extract_answer(output_string, task_type="yes_no"):
"""
Extracts the answer from the output string based on the task type.
Parameters:
output_string (str): The output string.
task_type (str): The type of task. Must be either "yes_no" or "multiple_choice".
Returns:
int:
1 if "yes" or "A"
0 if "no" or "B"
-1 if no relevant answer is found.
Raises a ValueError if an unsupported task_type is provided.
"""
def find_word_position(string, word):
pattern = r'\b' + re.escape(word) + r'\b'
match = re.search(pattern, string, re.IGNORECASE)
if match:
return match.start()
return -1
if task_type not in ["yes_no", "multiple_choice"]:
raise ValueError(f"Task type {task_type} not supported. Must be 'yes_no' or 'multiple_choice'.")
if task_type == "yes_no":
position_yes_and_a = find_word_position(output_string, "yes")
position_no_and_b = find_word_position(output_string, "no")
elif task_type == "multiple_choice":
position_yes_and_a = find_word_position(output_string, "A")
position_no_and_b = find_word_position(output_string, "B")
if position_yes_and_a == -1 and position_no_and_b == -1:
print(f"No answer found in the output string: {output_string}.")
return -1
elif position_yes_and_a != -1 and position_no_and_b != -1:
return 1 if position_yes_and_a < position_no_and_b else 0
else:
return 0 if position_yes_and_a == -1 else 1
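# Illustrative usage: the earliest word-boundary match of "yes"/"A" versus "no"/"B"
# decides the label, e.g.
#   extract_answer('Yes, the dog is on the left.', 'yes_no')               -> 1
#   extract_answer('B. The statement does not match.', 'multiple_choice')  -> 0
#   extract_answer('I am not sure.', 'multiple_choice')                    -> -1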
def get_scores(scores):
"""
Calculate various scores based on the given results.
Args:
scores (dict or list): A dictionary or list containing results where each result can be:
- dict: {id: {"q0_i0": 1 or 0, "q0_i1": 1 or 0, "q1_i0": 1 or 0, "q1_i1": 1 or 0}, ...}
- list: [[q0_i0 (1 or 0), q0_i1 (1 or 0), q1_i0 (1 or 0), q1_i1 (1 or 0)], ...]
The keys "q0_i0", "q0_i1", "q1_i0", "q1_i1" represent combinations of questions and images:
- "q0_i0" means question_0 on image_0
- "q0_i1" means question_0 on image_1
- "q1_i0" means question_1 on image_0
- "q1_i1" means question_1 on image_1
Returns:
dict: A dictionary containing the calculated scores:
- 'Q_Acc': Average question score
- 'I_Acc': Average image score
- 'Acc': Average binary VQA score
- 'G_Acc': Average group score
"""
Q_Acc = 0.0
I_Acc = 0.0
Acc = 0.0
G_Acc = 0.0
num_samples = len(scores)
def calculate_image_score(result):
image_correct = 0
if isinstance(result, dict):
if result["q0_i0"] == 1.0 and result["q1_i0"] == 0.0:
image_correct += 1
if result["q1_i1"] == 1.0 and result["q0_i1"] == 0.0:
image_correct += 1
elif isinstance(result, list):
if result[0] == 1.0 and result[2] == 0.0:
image_correct += 1
if result[3] == 1.0 and result[1] == 0.0:
image_correct += 1
return image_correct
def calculate_question_score(result):
text_correct = 0
if isinstance(result, dict):
if result["q0_i0"] == 1.0 and result["q0_i1"] == 0.0:
text_correct += 1
if result["q1_i1"] == 1.0 and result["q1_i0"] == 0.0:
text_correct += 1
else:
if result[0] == 1.0 and result[1] == 0.0:
text_correct += 1
if result[3] == 1.0 and result[2] == 0.0:
text_correct += 1
return text_correct
def calculate_binary_score(result):
binary_score_correct = 0
if isinstance(result, dict):
binary_score_correct += 1 if result["q0_i0"] == 1.0 else 0
binary_score_correct += 1 if result["q0_i1"] == 0.0 else 0
binary_score_correct += 1 if result["q1_i0"] == 0.0 else 0
binary_score_correct += 1 if result["q1_i1"] == 1.0 else 0
else:
binary_score_correct += 1 if result[0] == 1.0 else 0
binary_score_correct += 1 if result[1] == 0.0 else 0
binary_score_correct += 1 if result[2] == 0.0 else 0
binary_score_correct += 1 if result[3] == 1.0 else 0
return binary_score_correct
def calculate_group(result):
group_correct = 0
if calculate_question_score(result) == 2 and calculate_image_score(result) == 2:
group_correct += 1
return group_correct
if isinstance(scores, dict):
for _, result in scores.items():
Q_Acc += calculate_question_score(result)
I_Acc += calculate_image_score(result)
Acc += calculate_binary_score(result)
G_Acc += calculate_group(result)
else:
for result in scores:
Q_Acc += calculate_question_score(result)
I_Acc += calculate_image_score(result)
Acc += calculate_binary_score(result)
G_Acc += calculate_group(result)
results = {
'Q_Acc': Q_Acc / float(num_samples * 2),
'I_Acc': I_Acc / float(num_samples * 2),
'Acc': Acc / float(num_samples * 4),
'G_Acc': G_Acc / num_samples
}
return results
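# Illustrative sketch: each entry holds the predicted labels (1 = "yes"/"A",
# 0 = "no"/"B") for [q0_i0, q0_i1, q1_i0, q1_i1], and the scorer rewards the
# designed pattern [1, 0, 0, 1], so for a single group:
#   get_scores([[1, 0, 0, 1]])  -> {'Q_Acc': 1.0, 'I_Acc': 1.0, 'Acc': 1.0, 'G_Acc': 1.0}
#   get_scores([[1, 1, 1, 1]])  -> {'Q_Acc': 0.0, 'I_Acc': 0.0, 'Acc': 0.5, 'G_Acc': 0.0}
# i.e. a model that always answers "yes" gets half the binary accuracy but a zero group score.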

View File

@@ -0,0 +1,532 @@
import re
import json
from math import isclose
import sympy as sp
from sympy import simplify, Eq, sympify, evalf, Pow
from sympy.parsing.latex import parse_latex
import antlr4
from decimal import Decimal, getcontext
from fractions import Fraction
import sys
import math
chinese_answer_type_dict = {
'Numerical': '数值',
'Expression': '表达式',
'Equation': '方程',
'Interval': '区间'
}
english_answer_type_dict = {
'Numerical': 'a numerical value',
'Expression': 'an expression',
'Equation': 'an equation',
'Interval': 'an interval'
}
def get_single_answer_type_text(answer_type, is_chinese):
if '-' in answer_type: # No need now
answer_type = answer_type[:answer_type.find('-')]
for t in ['Numerical', 'Expression', 'Equation', 'Interval']:
if t in answer_type:
if is_chinese:
return chinese_answer_type_dict[t]
else:
return english_answer_type_dict[t]
exit(f'Error parsing answer type {answer_type}!')
def get_answer_type_text(answer_type, is_chinese, multiple_answer):
# 'Tuple' has various meanings in different context, such as position or values of a series of variable,
# so it may lead to confusion to directly use 'tuple' in the prompt.
if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type):
full_answer_text = ''
else:
if not multiple_answer:
answer_text = get_single_answer_type_text(answer_type, is_chinese)
if is_chinese:
full_answer_text = f',答案类型为{answer_text}'
else:
full_answer_text = f"The answer of The problem should be {answer_text}. "
else:
if ',' not in answer_type: # Same answer type for all answers
answer_text = get_single_answer_type_text(answer_type, is_chinese)
if is_chinese:
full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
else:
full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
else:
answer_types = answer_type.split(',')
answer_types = [get_single_answer_type_text(t, is_chinese) for t in answer_types]
if len(set(answer_types)) == 1:
answer_text = answer_types[0]
if is_chinese:
full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
else:
full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
else:
if is_chinese:
answer_text = ''.join(answer_types)
full_answer_text = f',题目有多个答案,答案类型分别为{answer_text}'
else:
answer_text = ', '.join(answer_types)
full_answer_text = (
f'The problem has multiple answers, with the answers in order being {answer_text}. '
)
return full_answer_text
def make_input(prompt, question_content):
# diversified based on the vllm, which is not implemented temporarily
input = prompt + '\n' + question_content
return input
sys.set_int_max_str_digits(1000000)
# Set the precision for Decimal arithmetic
getcontext().prec = 50
class MathJudger:
def __init__(self):
self.special_signal_map = {
"\\left": "",
"\\right": "",
"": ":",
"": ",",
"$": "",
"\\approx": "=",
"\\simeq": "=",
"\\sim": "=",
"^\\prime": "'",
"^{\\prime}": "'",
"^\\circ": "",
"%": "",
}
self.pi = parse_latex("\\pi")
self.precision = 1e-8
def split_by_comma(self, expr: str):
in_bracket_num = 0
splitted_expr = []
start_idx = 0
for i, char in enumerate(expr):
if char == "(" or char == "[":
in_bracket_num += 1
elif char == ")" or char == "]":
in_bracket_num -= 1
elif char == "," and in_bracket_num == 0:
splitted_expr.append(expr[start_idx:i].strip())
start_idx = i + 1
if start_idx < len(expr):
splitted_expr.append(expr[start_idx:].strip())
return splitted_expr
def trans_plus_minus_sign(self, expr_list: list):
new_expr_list = []
for expr in expr_list:
if "\\pm" in expr:
new_expr_list.append(expr.replace("\\pm", "+"))
new_expr_list.append(expr.replace("\\pm", "-"))
else:
new_expr_list.append(expr)
return new_expr_list
def judge(self, expression1, expression2, precision=1e-8):
# (expression1 is assumed to be the ground truth)
precision = precision if isinstance(precision, list) else [precision]
try:
expression1, expression2 = self.preprocess(expression1, expression2)
except:
return False
if expression1 == expression2:
# print("原生相等")
return True
# 去除字符串中的中文字符,因为上面已经判断过了类似回答为"能"或"不能"的含有中文字符的回答情况
expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1)
expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2)
expression1 = self.split_by_comma(expression1)
expression2 = self.split_by_comma(expression2)
temp_list1 = self.trans_plus_minus_sign(expression1)
temp_list2 = self.trans_plus_minus_sign(expression2)
# Build the list of tolerance values
if len(precision) <= 1:
precision = precision * len(temp_list1)
if len(temp_list1) != len(temp_list2):
return False
# Check whether the elements of the two lists can be paired off as equal, which supports comparing multiple answers
idx = -1
while len(temp_list1) != 0:
idx = (idx + 1) % len(temp_list1)
item1 = temp_list1[idx]
self.precision = precision[idx]
# print(self.precision)
for item2 in temp_list2:
if self.is_equal(item1, item2):
temp_list1.remove(item1)
temp_list2.remove(item2)
precision.remove(self.precision)
break
else:
# If we didn't break from the inner loop, it means no match was found
return False
# If all elements are matched and removed, the lists can be paired
return True
def is_interval(self, epr):
return epr.startswith(("(", "[")) and epr.endswith((")", "]"))
# Before numerical evaluation, replace the symbolic pi from sympy with its numerical approximation
def sympy_sub_pi(self, expression_sympy):
return expression_sympy.subs(self.pi, math.pi)
# The first expression is assumed to be the ground truth
def is_equal(self, expression1, expression2):
if expression1 == expression2 and expression1 != "" and expression2 != "":
# print("原生等价")
return True
# 先判断是否是两个区间,是的话进行判断相等,不相等则返回 False
if self.is_interval(expression1) and self.is_interval(expression2):
try:
if self.interval_equal(expression1, expression2):
# print("区间等价")
return True
except:
return False
# Next, check numerical equality
try:
if self.numerical_equal(expression1, expression2):
# print("Numerically equivalent")
return True
except:
pass
# Next, check expression equality
try:
if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
# print("Expressions are equivalent")
return True
except:
pass
# Finally, check equation equality
try:
if self.equation_equal(expression1, expression2):
# print("Equations are equivalent")
return True
except:
pass
return False
# Check whether two numerical values are equal within the allowed tolerance
def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
"""
(expression1 is assumed to be the ground truth)
Function: decide whether two numerical values are equal within the allowed tolerance
Step 1: also cover the case where the reference value may be expressed as a percentage
Step 2: check equality against the tolerance (absolute difference, originally math.isclose)
"""
reference = float(expression1)
prediction = float(expression2)
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
# if isclose(item, prediction, abs_tol=self.precision, rel_tol=0):
if abs(item - prediction) <= self.precision * 1.01:
return True
return False
def expression_equal(self, exp1, exp2):
"""
(expression1 is assumed to be the ground truth)
Function: decide whether two expressions are mathematically equivalent
Step 1: extract the expression, since some models answer "x=1" instead of "1"
Step 2: use sympy to check the equivalence
"""
# Only keep the expression on the right of the equals sign; the left side is usually the quantity being solved for
def extract_expression(expression):
if "=" in expression:
expression = expression.split("=")[1]
return expression.strip()
exp1 = extract_expression(exp1)
exp2 = extract_expression(exp2)
exp_too_long = len(exp1) > 300 or len(exp2) > 300
# Convert the expressions into a form sympy can handle
expr1_sym = sympify(parse_latex(exp1))
expr2_sym = sympify(parse_latex(exp2))
if expr1_sym == expr2_sym:
return True
else:
expr1_sym = self.sympy_sub_pi(expr1_sym)
expr2_sym = self.sympy_sub_pi(expr2_sym)
# If the input expressions can be evaluated to concrete numbers, compare them numerically
if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or (
not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
return False
elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
try:
if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
print(
"These two number can not be calculated by current computer for: "
f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\""
)
return False
if exp_too_long:
print(f'Expression {exp1} or {exp2} is too long to compute. ')
return False
if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
return True
else:
return False
except:
return False
elif exp_too_long:
print(f'Expression {exp1} or {exp2} is too long to compute. ')
return False
else:
try:
simplified_expr = simplify(expr1_sym - expr2_sym)
num_value = simplified_expr.evalf()
return abs(num_value) < 1e-3
except:
return False
def equation_equal(self, expression1, expression2):
"""
(expression1 is assumed to be the ground truth)
Function: decide whether two equations are mathematically equivalent
Step 1: reduce each equation to a canonical form whose right-hand side is strictly 0, so that only the left-hand sides need to be compared
Step 2: use sympy to compute the quotient of the two left-hand sides; if that quotient (or its reciprocal) is a non-zero integer, the two equations are mathematically equivalent
"""
# Move the right-hand side of the equation to the left and return a sympy expression
def simplify_equation(latex_eq):
# Split the equation into its left-hand and right-hand sides
lhs, rhs = latex_eq.split('=')
# Parse the LaTeX expressions with parse_latex
lhs_expr = parse_latex(lhs)
rhs_expr = parse_latex(rhs)
# Build the equation object
equation = Eq(lhs_expr, rhs_expr)
# Simplify the equation: move the right-hand side to the left
simplified_eq = simplify(equation.lhs - equation.rhs)
return simplified_eq
expr1_sym = simplify_equation(expression1)
expr2_sym = simplify_equation(expression2)
division_result_1 = simplify(expr1_sym / expr2_sym)
division_result_2 = simplify(expr2_sym / expr1_sym)
# If the quotient of the two transformed expressions is a non-zero integer, the two equations can be shown to be equivalent
if (division_result_1.is_Integer and division_result_1 != 0) or (
division_result_2.is_Integer and division_result_2 != 0):
return True
else:
return False
def interval_equal(self, expression1, expression2):
# Function: decide whether two intervals are mathematically equivalent
# Step 1: simplify the interval expressions, removing decorative symbols such as "\left" and "\right" and any leading "x \in"
# Step 2: compare the bracket types on both ends and the mathematical expressions inside
def compare_two_interval(inter1, inter2):
# First compare the brackets on both ends; only continue if they match
if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
return False
inter1 = inter1.strip('[]()')
inter2 = inter2.strip('[]()')
# Split the interval into its left and right parts
items_1 = inter1.split(',')
items_2 = inter2.split(',')
for item_1, item_2 in zip(items_1, items_2):
if not self.expression_equal(item_1, item_2):
return False
return True
interval1 = expression1
interval2 = expression2
if interval1 == interval2:
return True
else:
inter_list1 = interval1.split("\\cup")
inter_list2 = interval2.split("\\cup")
if len(inter_list1) != len(inter_list2):
return False
else:
for inter1, inter2 in zip(inter_list1, inter_list2):
if not compare_two_interval(inter1, inter2):
return False
return True
def preprocess(self, expression1, expression2):
# Try to capture the content of \boxed{}; if there are several, join them with commas, and raise an error if the braces are malformed
def extract_boxed_content(latex_str):
# Find all \boxed{...} structures
boxed_matches = re.finditer(r'\\boxed{', latex_str)
results = ""
for match in boxed_matches:
start_index = match.end()
end_index = start_index
stack = 1
# Search from just after \boxed{ until the matching closing brace is found
while stack > 0 and end_index < len(latex_str):
if latex_str[end_index] == '{':
stack += 1
elif latex_str[end_index] == '}':
stack -= 1
end_index += 1
if stack == 0:
# Extract the content inside \boxed{}
content = latex_str[start_index:end_index - 1]
results += content + ","
else:
# If the braces are not properly closed, raise an error
raise ValueError("Mismatched braces in LaTeX string.")
# If no '\boxed{}' is matched, fall back to extracting all $...$ formula parts from the last non-empty line of the text
if results == "":
last_line_ans = latex_str.strip().split("\n")[-1]
dollar_pattern = r"\$(.*?)\$"
answers = re.findall(dollar_pattern, last_line_ans)
if answers:
for ans in answers:
results += ans + ","
else:
results = latex_str
return results
def sepcial_symbol_replace(expression):
if "\\in " in expression:
expression = expression.split("\\in ")[1]
# Replace special characters; they are purely cosmetic/decorative and do not affect LaTeX parsing
for signal in self.special_signal_map:
expression = expression.replace(signal, self.special_signal_map[signal])
expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。")
pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
expression = re.sub(pattern, r'\1', expression)
return expression
exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)
return exp1, exp2
def can_compute_power(self, expr):
"""
Check if the power expression can be computed.
Parameters:
expr (sympy expression): The expression to check.
Returns:
bool: True if the expression can be computed, False otherwise.
"""
# Check if the expression is a power expression
if isinstance(expr, Pow):
# Extract the base and the exponent
base, exp = expr.as_base_exp()
# Check if the base and the exponent are numbers
if base.is_number and exp.is_number:
# Set a threshold for the maximum size of the exponent
MAX_EXP = 1000 # This threshold can be adjusted based on the computing environment
# Check if the exponent is greater than the threshold
if abs(exp.evalf()) > MAX_EXP:
return False
else:
return True
else:
# If the base or the exponent is not a number, we cannot compute the power
return False
else:
# If the expression is not a power expression, return True as it is not the case we are checking for
return True
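# Illustrative usage of MathJudger (a sketch; assumes latex2sympy2/antlr4 and sympy are
# installed, as the imports above require). judge() first extracts \boxed{} content,
# strips decorative LaTeX, then tries interval, numerical, expression, and equation
# equality in turn, e.g.:
#   judger = MathJudger()
#   judger.judge('$\\boxed{\\frac{1}{2}}$', 'The answer is $\\boxed{0.5}$')  # -> True
#   judger.judge('$\\boxed{x^2+1}$', '$\\boxed{1+x^2}$')                     # -> True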
def extract_answer(is_chinese, model_output, is_deepseek=False):
# deepseekmath has special answering format
if str(model_output) == 'nan':
model_output = 'nan'
if is_deepseek:
if is_chinese:
matches = re.findall('## 解题答案(.*)', model_output)
else:
matches = re.findall('The answer is: (.*)', model_output)
# Check whether at least one match was found; if not, pass the whole output on to the \boxed{} extraction
if matches:
# If there are several matches, take the last one
model_answer = matches[-1].strip()
return model_answer
else:
return model_output
if is_chinese:
matches = re.findall('所以最终答案是(.*)', model_output)
else:
matches = re.findall('So the final answer is (.*)', model_output)
# Check whether at least one match was found; if not, pass the whole output on to the \boxed{} extraction
if matches:
# If there are several matches, take the last one
model_answer = matches[-1].strip()
return model_answer
else:
return model_output
def calculate_merged_accuracy(reference_dir, text_only):
pass

View File

@@ -0,0 +1,123 @@
from ...smp import *
from ...utils import can_infer
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_ICE_for_qspatial():
example_1 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, m), (2.2, cm), (3.12, meter), at the end.\n
Model response: **Object Identification**
* The object in question is a chair.
* The chair is not visible in the image.
**Conclusion**
The height of the chair cannot be determined from the provided image.\n
Extracted answer: (0, cm)
"""
example_2 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, inch), (1.2, cm), (3.0, feet), at the end.\n
Model response: **Step 1: Identify the stapler and the recycle bin in the image.**
The stapler is located on the wooden table, and the recycle bin is located on the floor.
**Step 2: Determine the distance between the stapler and the recycle bin.**
The stapler is 0.5 meters from the edge of the table, and the recycle bin is 1.5 meters from the edge of the table.
Therefore, the minimum distance between the stapler and the recycle bin is 1.5 - 0.5 = 1 meter.
**Answer:** 1 m\n
Extracted answer: (1, m)
"""
example_3 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, foot), (2, cm), (4.3, meter), at the end.\n
Model response: The mirror in the image is approximately 5 feet 4 inches tall.\n
Extracted answer: (64, inch)
"""
example_4 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (0.1, cm), (2.9, cm), (0.3, meter), at the end.\n
Model response: The minimum distance between the wooden chair and the chair near the camera in the image is 1.7 feet.\n
Extracted answer: (1.7, feet)
"""
example_5 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (5.1, cm), (0.9, cm), (55, mm), at the end.\n
Model response: The height of the painting's bottom edge from the floor is approximately 4.5 feet.\n
Extracted answer: (4.5, feet)
"""
return [example_1, example_2, example_3, example_4, example_5]
def list_to_dict(lst):
return {chr(65 + i): val for i, val in enumerate(lst)}
def post_check(line, prefetch=False):
res = None
ans = line['answer']
response = line['prediction'] if prefetch else line['res']
try:
if line['question_type'] == 'multi_choice':
ans = line['answer_option']
choices = list_to_dict(eval(line['choices']))
res = can_infer(response, choices)
if prefetch:
return res
else:
if line['answer_type'] == 'integer':
res = int(response)
ans = int(line['answer'])
elif line['answer_type'] == 'float':
res = float(response)
ans = float(line['answer'])
else:
res = str(response)
ans = str(ans)
except ValueError:
pass
if res == ans:
return res if prefetch else True
else:
return False
def build_qspatial_gpt4_prompt(line):
task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE_for_qspatial()
for example in examples:
prompt += example + '\n'
prompt += 'Model response: ' + prediction
prompt += '\nExtracted answer:'
return prompt
def QSpatial_auxeval(model, line):
prompt = build_qspatial_gpt4_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log=log, res=res)
log += 'All 5 retries failed.\n'
return dict(log=log, res='')

View File

@@ -0,0 +1,500 @@
"""
Copied from https://github.com/allenai/allennlp-semparse
Modified from https://github.com/naver-ai/tablevqabench
"""
import re
import unicodedata
import time
from abc import ABCMeta, abstractmethod
from math import isinf, isnan
# Vision Prompts
VWTQ_PROMPT = (
'You are asked to answer questions asked on an image.\n'
'You should answer the question with a single word.\n'
'Example: \n'
'Question: what was the only year mr. wu competed in the olympic games?\n'
'Answer: 2004\n'
'Question: which township in pope county, arkansas has the least amount of water area?\n'
'Answer: Freeman\n'
'If you have multiple answers, please separate them with || marks. Example: Apple||Banana||Tomato\n\n'
'Question: {question}\n'
'Answer:'
)
VTABFACT_PROMPT = (
'You are asked to answer whether the statement is True or False based on given image\n'
'You should only answer True or False.\n'
'Example: \n'
'Statement: the milwaukee buck win 6 game in the 2010 - 11 season\n'
'Answer: True\n'
'Statement: only the top team score above the average of 8.8\n'
'Answer: False\n\n'
'Statement: {question}\n'
'Answer:'
)
FINTABNETQA_PROMPT = (
'You are asked to answer questions asked on an image.\n'
'You should answer the question within a single word or a few words.\n'
'If units can be known, the answer should include units such as $, %, million, etc.\n'
'Example: \n'
'Question: What were the total financing originations for the fiscal year ended October 31, 2004?\n'
'Answer: $3,852 million\n'
'Question: What is the time period represented in the table?\n'
'Answer: October 31\n'
'Question: What was the percentage of net sales for selling, general and administrative expenses in 2006?\n'
'Answer: 34.2%\n'
'Question: {question}\n'
'Answer:'
)
def evaluate_tabfact(data, score_keys):
num_examples = 0
num_correct = 0
manual_check = 0
start_time = time.time()
for instance in data:
if instance['prediction'] is None:
instance['prediction'] = 'none'
pred = instance['prediction'].lower()
gt = instance['answer']
num_examples += 1
if 'true' in pred and 'false' in pred:
manual_check += 1
score = None
elif 'true' in pred and gt == '1':
num_correct += 1
score = 1
elif 'false' in pred and gt == '0':
num_correct += 1
score = 1
else:
score = 0
instance['scores'] = {score_keys[0]: score}
if manual_check > 0:
print(f'number of samples not properly parsed: {manual_check}')
end_time = time.time()
elapsed_time = end_time - start_time
Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100
meta = {
'evaluators': 'correctness',
'score_info': [score_keys[0]],
'evaluated_time': elapsed_time,
'total_num_sample': len(data),
'average_scores': [Accuracy],
}
return meta
def evaluate_wtq(data, score_keys):
num_examples = 0
num_correct = 0
start_time = time.time()
for instance in data:
pred = instance['prediction'].replace('||', '|')
gt = instance['answer']
original_strings = tsv_unescape_list(gt)
target_values = to_value_list(original_strings)
predicted_strings = tsv_unescape_list(pred)
predicted_values = to_value_list(predicted_strings)
correct = check_denotation(target_values, predicted_values)
num_examples += 1
score = 0
if correct:
num_correct += 1
score = 1
instance['scores'] = {score_keys[0]: score}
end_time = time.time()
elapsed_time = end_time - start_time
Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100
meta = {
'evaluators': 'correctness',
'score_info': [score_keys[0]],
'evaluated_time': elapsed_time,
'total_num_sample': len(data),
'average_scores': [Accuracy],
}
return meta
def evaluate_fintabnet(data, score_keys):
num_examples = 0
num_correct, _num_correct = 0, 0
start_time = time.time()
for instance in data:
pred, preds = fintabnet_normalize(instance['prediction'])
gt, gts = fintabnet_normalize(instance['answer'])
correct = 1 if gt == pred else 0
_correct = any(_pred == _gt for _pred in preds for _gt in gts)
num_examples += 1
score, _score = 0, 0
if correct:
num_correct += 1
score = 1
if _correct:
_num_correct += 1
_score = 1
instance['scores'] = {score_keys[0]: _score, 'exact_score': score}
end_time = time.time()
elapsed_time = end_time - start_time
Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100
_Accuracy = round((_num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100
meta = {
'evaluators': 'correctness',
'score_info': ['relieved_accuracy', score_keys[0]],
'evaluated_time': elapsed_time,
'total_num_sample': len(data),
'average_scores': [_Accuracy, Accuracy],
}
return meta
def fintabnet_normalize(s):
s = normalize(s)
remove_words = [
'dollar', 'gallons', 'square feet', 'shares', 'mbtu',
'mbpd', 'mbbls', 'mmbtu', 'unit', 'gwh', 'year', 'mmcf', 'mile', 'mboe'
]
# Data specific filtering using regular expressions
# Remove special characters: $, (, ), and commas
s = re.sub(r'[\$\(\),]', '', s)
# Replace "dollar" with empty string if it's not part of another word
pattern = r'\b(' + '|'.join(remove_words) + r')s?\b'
s = re.sub(pattern, '', s, flags=re.IGNORECASE)
# Unit conversion dictionary with regex patterns for flexibility
unit_conversion = {
r' \bthousand\b': 'e3',
r' \bmillion\b': 'e6',
r' \bbillion\b': 'e9',
r'\bthousand\b': 'e3',
r'\bmillion\b': 'e6',
r'\bbillion\b': 'e9',
r' ?%': 'e-2',
}
# Convert percentages to their decimal representation.
# Applying this after unit_conversion prevents "percent" from being processed
# in cases like "million %", which would be incorrect.
# s = re.sub(r' ?%', 'e-2', s)
# s_percent = re.sub(r' ?%', '', s_percent)
s_unit_free = s
# Iterate over unit_conversion and apply transformations
for pattern, value in unit_conversion.items():
s = re.sub(pattern, value, s)
s_unit_free = re.sub(pattern, '', s_unit_free)
# Attempt to convert to float
try:
return float(s), [float(s), float(s_unit_free)]
except ValueError:
# Return the original string and the error for debugging purposes
return s, [s, s_unit_free]
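# Example values traced through the normalization rules above:
#   fintabnet_normalize('$3,852 million') -> (3852000000.0, [3852000000.0, 3852.0])
#   fintabnet_normalize('34.2%')          -> (0.342, [0.342, 34.2])
#   fintabnet_normalize('October 31')     -> ('october 31', ['october 31', 'october 31'])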
def normalize(x):
if not isinstance(x, str):
x = x.decode('utf8', errors='ignore')
# Remove diacritics
x = ''.join(
c for c in unicodedata.normalize('NFKD', x) if unicodedata.category(c) != 'Mn'
)
# Normalize quotes and dashes
x = re.sub(r'[´`]', "'", x)
x = re.sub(r'[“”]', '"', x)
x = re.sub(r'[‐‑‒–—−]', '-', x)
while True:
old_x = x
# Remove citations
x = re.sub(r'((?<!^)\[[^\]]*\]|\[\d+\]|[•♦†‡*#+])*$', '', x.strip())
# Remove details in parenthesis
x = re.sub(r'(?<!^)( \([^)]*\))*$', '', x.strip())
# Remove outermost quotation mark
x = re.sub(r'^"([^"]*)"$', r'\1', x.strip())
if x == old_x:
break
# Remove final '.'
if x and x[-1] == '.':
x = x[:-1]
# Collapse whitespaces and convert to lower case
x = re.sub(r'\s+', ' ', x, flags=re.U).lower().strip()
return x
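# Illustrative inputs and outputs, traced through the rules above:
#   normalize('"The Beatles"')       -> 'the beatles'    (outer quotes stripped)
#   normalize('Paris (France) [1]')  -> 'paris'          (citation and parenthetical removed)
#   normalize('  3,852  Million. ')  -> '3,852 million'  (whitespace collapsed, final period dropped)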
# Value Types
class Value(metaclass=ABCMeta):
# Should be populated with the normalized string
_normalized = None
@abstractmethod
def match(self, other):
"""Return True if the value matches the other value.
Args:
other (Value)
Returns:
a boolean
"""
pass
@property
def normalized(self):
return self._normalized
class StringValue(Value):
def __init__(self, content):
assert isinstance(content, str)
self._normalized = normalize(content)
self._hash = hash(self._normalized)
def __eq__(self, other):
return isinstance(other, StringValue) and self.normalized == other.normalized
def __hash__(self):
return self._hash
def __str__(self):
return 'S' + str([self.normalized])
def __repr__(self):
return self.__str__()
def match(self, other):
assert isinstance(other, Value)
return self.normalized == other.normalized
class NumberValue(Value):
def __init__(self, amount, original_string=None):
assert isinstance(amount, (int, float))
if abs(amount - round(amount)) < 1e-6:
self._amount = int(amount)
else:
self._amount = float(amount)
if not original_string:
self._normalized = str(self._amount)
else:
self._normalized = normalize(original_string)
self._hash = hash(self._amount)
@property
def amount(self):
return self._amount
def __eq__(self, other):
return isinstance(other, NumberValue) and self.amount == other.amount
def __hash__(self):
return self._hash
def __str__(self):
return 'N({})'.format(self.amount) + str([self.normalized])
def __repr__(self):
return self.__str__()
def match(self, other):
assert isinstance(other, Value)
if self.normalized == other.normalized:
return True
if isinstance(other, NumberValue):
return abs(self.amount - other.amount) < 1e-6
return False
@staticmethod
def parse(text):
"""Try to parse into a number.
Return:
the number (int or float) if successful; otherwise None.
"""
try:
return int(text)
except ValueError:
try:
amount = float(text)
assert not isnan(amount) and not isinf(amount)
return amount
except ValueError:
return None
class DateValue(Value):
def __init__(self, year, month, day, original_string=None):
"""Create a new DateValue. Placeholders are marked as -1."""
assert isinstance(year, int)
assert isinstance(month, int) and (month == -1 or 1 <= month <= 12)
assert isinstance(day, int) and (day == -1 or 1 <= day <= 31)
assert not (year == month == day == -1)
self._year = year
self._month = month
self._day = day
if not original_string:
self._normalized = '{}-{}-{}'.format(
year if year != -1 else 'xx',
month if month != -1 else 'xx',
day if day != -1 else 'xx',
)
else:
self._normalized = normalize(original_string)
self._hash = hash((self._year, self._month, self._day))
@property
def ymd(self):
return (self._year, self._month, self._day)
def __eq__(self, other):
return isinstance(other, DateValue) and self.ymd == other.ymd
def __hash__(self):
return self._hash
def __str__(self):
return ('D(%d,%d,%d)' % (self._year, self._month, self._day)) + str(
[self._normalized]
)
__repr__ = __str__
def match(self, other):
assert isinstance(other, Value)
if self.normalized == other.normalized:
return True
if isinstance(other, DateValue):
return self.ymd == other.ymd
return False
@staticmethod
def parse(text):
"""Try to parse into a date.
Return:
tuple (year, month, date) if successful; otherwise None.
"""
try:
ymd = text.lower().split('-')
assert len(ymd) == 3
year = -1 if ymd[0] in ('xx', 'xxxx') else int(ymd[0])
month = -1 if ymd[1] == 'xx' else int(ymd[1])
day = -1 if ymd[2] == 'xx' else int(ymd[2])
assert not (year == month == day == -1)
assert month == -1 or 1 <= month <= 12
assert day == -1 or 1 <= day <= 31
return (year, month, day)
except (ValueError, AssertionError):
return None
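# Illustrative parses (placeholders written as 'xx'):
#   DateValue.parse('2008-04-xx') -> (2008, 4, -1)
#   DateValue.parse('xx-xx-05')   -> (-1, -1, 5)
#   DateValue.parse('April 2008') -> None  (not in year-month-day form)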
# Value Instantiation
def to_value(original_string, corenlp_value=None):
"""Convert the string to Value object.
Args:
original_string (basestring): Original string
corenlp_value (basestring): Optional value returned from CoreNLP
Returns:
Value
"""
if isinstance(original_string, Value):
# Already a Value
return original_string
if not corenlp_value:
corenlp_value = original_string
# Number?
amount = NumberValue.parse(corenlp_value)
if amount is not None:
return NumberValue(amount, original_string)
# Date?
ymd = DateValue.parse(corenlp_value)
if ymd is not None:
if ymd[1] == ymd[2] == -1:
return NumberValue(ymd[0], original_string)
else:
return DateValue(ymd[0], ymd[1], ymd[2], original_string)
# String.
return StringValue(original_string)
def to_value_list(original_strings, corenlp_values=None):
"""Convert a list of strings to a list of Values
Args:
original_strings (list[basestring])
corenlp_values (list[basestring or None])
Returns:
list[Value]
"""
assert isinstance(original_strings, (list, tuple, set))
if corenlp_values is not None:
assert isinstance(corenlp_values, (list, tuple, set))
assert len(original_strings) == len(corenlp_values)
return list(
set(to_value(x, y) for (x, y) in zip(original_strings, corenlp_values))
)
else:
return list(set(to_value(x) for x in original_strings))
# Check the Predicted Denotations
def check_denotation(target_values, predicted_values):
"""Return True if the predicted denotation is correct.
Args:
target_values (list[Value])
predicted_values (list[Value])
Returns:
bool
"""
# Check size
if len(target_values) != len(predicted_values):
return False
# Check items
for target in target_values:
if not any(target.match(pred) for pred in predicted_values):
return False
return True
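# Illustrative checks (hypothetical denotations; order within each list is irrelevant):
#   check_denotation(to_value_list(['2004']), to_value_list(['2004.0']))     -> True
#   check_denotation(to_value_list(['Freeman']), to_value_list(['freeman'])) -> True
#   check_denotation(to_value_list(['a', 'b']), to_value_list(['a']))        -> False  (size mismatch)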
# Batch Mode
def tsv_unescape(x):
"""Unescape strings in the TSV file.
Escaped characters include:
newline (0x0A) -> backslash + n
vertical bar (0x7C) -> backslash + p
backslash (0x5C) -> backslash + backslash
Args:
x (str or unicode)
Returns:
a unicode
"""
return x.replace(r'\n', '\n').replace(r'\p', '|').replace('\\\\', '\\')
def tsv_unescape_list(x):
"""Unescape a list in the TSV file.
List items are joined with vertical bars (0x7C)
Args:
x (str or unicode)
Returns:
a list of unicodes
"""
return [tsv_unescape(y) for y in x.split('|')]
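# Illustrative unescaping (raw strings show the escaped TSV form):
#   tsv_unescape(r'one\ntwo')  -> 'one' + '\n' + 'two'  (real newline)
#   tsv_unescape(r'a\pb')      -> 'a|b'
#   tsv_unescape_list('Apple|Banana|Tomato') -> ['Apple', 'Banana', 'Tomato']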

View File

@@ -0,0 +1,254 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
from PIL import Image, ImageOps
import numpy as np
sys_prompt = "You are an AI assistant for question answering."
system_prompt_multi_choice = (
"You will receive a multi-choice question, the ground-truth answer and the prediction from a question answering (QA) model. " # noqa
"Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. "
"If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"."
)
system_prompt_caption_matching = (
"You will receive a caption matching question, the ground-truth answer and the prediction from a question answering (QA) model. " # noqa
"Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. "
"If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"."
)
system_prompt_captioning = """
You will receive a video description and a multi-choice question. Your task is to choose the correct answer and briefly explain the reason why you choose the answer. \
If none of the choice candidates are correct or the video description lacks enough information to answer the question, just answer "None of the choices are correct". \
Please organize your response in this format:
```
Reasoning: [Your reason to obtain the answer]
Answer: [Your answer]
```
Here are some examples of video description, multi-choice question and the expected answer:
```
Video Description: A person is playing football.
Multi-Choice Question:
What is the person doing in the video?
A. cooking
B. playing football
C. playing basketball
D. reading book
Reasoning: The video description mentions that the person is playing football.
Answer: B. playing football
Video Description: A bird is flying clockwise.
Multi-Choice Question:
In which direction is the bird flying?
A. backward
B. counter-clockwise
C. clockwise
D. downward
Reasoning: The video description mentions that the bird is flying clockwise
Answer: C. clockwise
Video Description: An air balloon is inflating.
Multi-Choice Question:
What is happening to the air balloon?
A. exploding
B. getting smaller
C. flying
Reasoning: The video description mentions that the air balloon is inflating, while none of the choices can be explained as inflating.
Answer: None of the choices are correct
```
""" # noqa
system_prompt_YorN = """
You will receive a Yes/No question, the ground-truth answer and the prediction from a question answering (QA) model. \
Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. \
If the prediction is correct, respond "Correct". If the prediction is incorrect, respond "Incorrect".
""" # noqa
def eval_rule_caption_matching(line):
# Determine whether the video llm output is correct, based on word matching rules
video_llm_output = line['prediction']
answer = line['answer']
option_strs = eval(line['candidates']) # complete option strings
option_sents = [opt.split(': ')[1] for opt in option_strs] # option sentence
# option index, e.g., Sentence A, Caption A, Option 1
option_inds = [opt.split(': ')[0] for opt in option_strs] + [opt.split(': ')[0].replace('Sentence ', '').replace('Option ', '').replace('Caption ', '') for opt in option_strs] # noqa
video_llm_pred = None
for option_str in option_strs:
if option_str == video_llm_output:
video_llm_pred = option_str
for option_sent in option_sents:
if option_sent == video_llm_output or (') ' in video_llm_output and option_sent == video_llm_output.split(') ')[1]): # noqa
video_llm_pred = option_sent
for option_ind in option_inds:
if option_ind == video_llm_output or option_ind == video_llm_output.replace('.', ''): # noqa
video_llm_pred = option_ind
if video_llm_pred is None:
return "fail"
else:
return 1 if video_llm_pred == answer or video_llm_pred == answer.split(":")[0] or video_llm_pred == answer.split(": ")[1] or video_llm_pred == answer.split(": ")[0].split()[1] else 0 # noqa
def eval_rule_multi_choice(line):
if line['prediction'] == line['answer']:
return 1
elif line['prediction'] in ['A', 'B', 'C', 'D']:
return 1 if line['prediction'] == line['answer'][0] else 0
elif any(line['prediction'].startswith(prefix) for prefix in ['A.', 'B.', 'C.', 'D.']):
return 1 if line['prediction'].split('.')[0] == line['answer'][0] else 0
elif any(line['prediction'].startswith(prefix) for prefix in ['A)', 'B)', 'C)', 'D)']):
return 1 if line['prediction'].split(')')[0] == line['answer'][0] else 0
else:
return "fail"
def eval_rule_YorN(video_llm_output):
# Extract the yes/no prediction from the original video llm output
video_llm_output = video_llm_output.lower()
if video_llm_output.startswith("yes"):
return "yes"
elif video_llm_output.startswith("no"):
return "no"
else:
return False
def llm_output_to_rating(llm_output):
if not ('Correct' in llm_output or 'Incorrect' in llm_output):
print(f"Warning: LLM output is not in the correct format: {llm_output}")
rating = 0
return rating
if llm_output.startswith('Correct'):
rating = 1
elif llm_output.startswith('Incorrect'):
rating = 0
elif ('Correct' in llm_output) and ('Incorrect' not in llm_output):
rating = 1
elif 'Incorrect' in llm_output:
rating = 0
return rating
def parse_llm_output(llm_output, gt_answer):
if llm_output == "invalid_request_error" or not llm_output:
eval_result = {"rating": -1, "chatgpt-answer": None, "chatgpt-reasoning": None}
return eval_result
eval_result = {}
lines = llm_output.split("\n")
for line in lines:
line = line.strip()
if "Reasoning" in line:
eval_result['chatgpt-reasoning'] = line.replace("Reasoning:", "").strip()
if "Answer" in line:
eval_result['chatgpt-answer'] = line.replace("Answer:", "").strip()
if "chatgpt-answer" not in eval_result:
eval_result['chatgpt-answer'] = llm_output
if "chatgpt-reasoning" not in eval_result:
eval_result['chatgpt-reasoning'] = None
# Check if the chatgpt answer is the ground-truth answer
# calculate the number of 'A.', 'B.', 'C.', 'D.' in chatgpt-answer
answer_counts = sum(eval_result['chatgpt-answer'].count(prefix) for prefix in ['A.', 'B.', 'C.', 'D.']) # noqa
if eval_result['chatgpt-answer'].split(". ")[0] == gt_answer.split(". ")[0] and answer_counts == 1:
eval_result['rating'] = 1
else:
eval_result['rating'] = 0
return eval_result
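# Illustrative parse of a hypothetical judge output for the captioning task:
#   llm_output = 'Reasoning: The description mentions football.\nAnswer: B. playing football'
#   parse_llm_output(llm_output, gt_answer='B. playing football')
#   -> {'chatgpt-reasoning': 'The description mentions football.',
#       'chatgpt-answer': 'B. playing football', 'rating': 1}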
def evaluate_tempcompass_mcq(model, line):
eval_rules_dict = {
'caption_matching': eval_rule_caption_matching,
'multi-choice': eval_rule_multi_choice
}
gpt_eval_prompt = {
'multi-choice': '{}\nMulti-Choice Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}',
'caption_matching': '{}\nCaption Matching Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}'
}
base_prompt = {
'multi-choice': system_prompt_multi_choice,
'caption_matching': system_prompt_caption_matching
}
eval_result = {
"question": line['question'],
"answer": line['answer'],
"prediction": line['prediction'],
"task_type": line['task_type'],
"candidates": line['candidates'],
"match_success": True
}
result = eval_rules_dict[line['task_type']](line)
if result == "fail":
eval_result['match_success'] = False
if model is None:
eval_result['rating'] = 0
else:
prompt_template = gpt_eval_prompt[line['task_type']]
prompt = prompt_template.format(base_prompt[line['task_type']], line['question'], line['answer'], line['prediction']) # noqa
llm_output = model.generate(prompt)
result = llm_output_to_rating(llm_output)
eval_result['chatgpt-response'] = llm_output
eval_result['rating'] = result
else:
eval_result['rating'] = result
return eval_result
def evaluate_tempcompass_captioning(model, line):
prompt = (
f"{system_prompt_captioning}\n"
f"Video Description:{line['prediction']}\n"
f"Multi-Choice Question:\n{line['mc_question']}\n"
)
if model is not None:
llm_output = model.generate(prompt)
eval_result = parse_llm_output(llm_output, gt_answer=line['mc_answer'])
return eval_result
else:
raise ValueError("Model is None, TempCompass Captioning task not supported exact matching") # noqa
def evaluate_tempcompass_YorN(model, line):
prompt = (
f"{system_prompt_YorN}\n"
f"Yes/No Question:\n{line['question']}\n"
f"Ground-Truth Answer: {line['answer']}\n"
f"Model Prediction: {line['prediction']}"
)
result = eval_rule_YorN(line['prediction'])
eval_result = {
"question": line['question'],
"answer": line['answer'],
"prediction": line['prediction'],
"match_success": True
}
if result:
eval_result['rating'] = 1 if result == line['answer'] else 0
elif model is None:
eval_result['match_success'] = False
eval_result['rating'] = 0
else:
eval_result['match_success'] = False
llm_output = model.generate(prompt)
result = llm_output_to_rating(llm_output)
eval_result['chatgpt-response'] = llm_output
eval_result['rating'] = result
return eval_result
def get_dimension_rating(score_file):
data = load(score_file)
result_dict = {}
for idx, item in data.iterrows():
dict_key = item['dim'] + '. ' + item['task_type']
if dict_key not in result_dict:
result_dict[dict_key] = [0,0]
result_dict[dict_key][0] += int(item['score'])
result_dict[dict_key][1] += 1
return result_dict
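# The returned dict maps '<dim>. <task_type>' to [score_sum, sample_count];
# a per-key accuracy can then be computed as score_sum / sample_count.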

View File

@@ -1,4 +1,5 @@
from ...smp import *
from .multiple_choice import extract_answer_from_item
import numpy as np
import re
@@ -97,24 +98,33 @@ def get_dimension_rating(data_path):
for duration in DURATIONS + ['overall']:
overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["domain"].values(), []) if x >= 0]):.2f}'
overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["domain"].values(), []) if x >= 0]):.3f}'
duration_rating[duration]['overall'] = overall_res_dur
for domain in DOMAINS:
domain_res_dur = f'{np.mean([x for x in duration_rating[duration]["domain"][domain] if x >= 0]):.2f}'
domain_res_dur = f'{np.mean([x for x in duration_rating[duration]["domain"][domain] if x >= 0]):.3f}'
duration_rating[duration]['domain'][domain] = domain_res_dur
for sub_ctg in SUB_CATEGORIES:
sub_res_dur = f'{np.mean([x for x in duration_rating[duration]["sub_category"][sub_ctg] if x >= 0]):.2f}'
sub_res_dur = f'{np.mean([x for x in duration_rating[duration]["sub_category"][sub_ctg] if x >= 0]):.3f}'
duration_rating[duration]['sub_category'][sub_ctg] = sub_res_dur
for task_ctg in TASK_CATEGORIES:
task_res_dur = f'{np.mean([x for x in duration_rating[duration]["task_type"][task_ctg] if x >= 0]):.2f}'
task_res_dur = f'{np.mean([x for x in duration_rating[duration]["task_type"][task_ctg] if x >= 0]):.3f}'
duration_rating[duration]['task_type'][task_ctg] = task_res_dur
return duration_rating
def extract_option(model, input_item, dataset_name):
options = input_item['question'].split('\n')[1:]
for id, option in enumerate(options):
option_id = chr(ord('A') + id) + '.'
if option.find(option_id) >= 0:
input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n')
return extract_answer_from_item(model, input_item, dataset_name)['opt']
def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [

View File

@@ -0,0 +1,896 @@
# pylint: skip-file
import pandas as pd
import json
import numpy as np
import os
import argparse
# four_dimensional_metrics.py
# Function to evaluate steps
def evaluate_evaluate_steps(json, steps): # noqa
jokers = [json[[f'joker_{i}', f'knowledge concept_{i}']] for i in range(1, steps + 1)]
for i in range(steps):
jokers[i].rename(
columns={f'joker_{i + 1}': 'joker', f'knowledge concept_{i + 1}': 'knowledge_concept'},
inplace=True,
)
concatenated_steps = pd.concat(jokers, axis=0)
return concatenated_steps
# Function to load and process JSON data
def load_and_process_data(filepath):
df = pd.read_excel(filepath)
if 'hit' not in df.columns:
df['processed_answer'] = (
df['prediction']
.str.split('Answer')
.str[-1]
.str.strip()
.str.replace(r'[>><<:.]', '', regex=True)
.str.strip()
)
df['processed_answer'] = df['processed_answer'].apply(lambda x: x[0] if x and x[0] in 'ABCDEFGH' else None)
df['joker'] = df['processed_answer'] == df['answer']
else:
df['joker'] = df['hit'].astype(bool)
return df
# Function to process steps data and merge results
def evaluate_process_steps_data(df, steps):
steps_data = {f'{steps}steps_{i}': df[df['key'] == f'{steps}steps_{i}'] for i in range(1, steps + 1)}
steps_data[f'{steps}steps_multi'] = df[df['key'] == f'{steps}steps_multi']
for key, data in steps_data.items():
data.columns = [col + f'_{key.split("_")[-1]}' for col in data.columns]
merged_data = steps_data[f'{steps}steps_1']
for i in range(2, steps + 1):
merged_data = pd.merge(
merged_data, steps_data[f'{steps}steps_{i}'], left_on=f'ID_1', right_on=f'ID_{i}', how='left' # noqa
)
merged_data = pd.merge(
merged_data, steps_data[f'{steps}steps_multi'], left_on=f'ID_1', right_on='ID_multi', how='left' # noqa
)
return merged_data
# Function to calculate evaluation metrics
def evaluate_calculate_metrics(merged_2steps, merged_3steps):
metrics = {}
metrics['steps2_filtered_rows_1_loose'] = merged_2steps[
((merged_2steps['joker_1'] == False) & (merged_2steps['joker_2'] == False)) # noqa
& (merged_2steps['joker_multi'] == True) # noqa
]
metrics['steps2_filtered_rows_1_strict'] = merged_2steps[
((merged_2steps['joker_1'] == False) | (merged_2steps['joker_2'] == False)) # noqa
& (merged_2steps['joker_multi'] == True) # noqa
]
metrics['steps2_filtered_rows_2'] = merged_2steps[
((merged_2steps['joker_1'] == True) & (merged_2steps['joker_2'] == True)) # noqa
& (merged_2steps['joker_multi'] == False) # noqa
]
metrics['steps2_filtered_rows_3'] = merged_2steps[
((merged_2steps['joker_1'] == False) | (merged_2steps['joker_2'] == False)) # noqa
& (merged_2steps['joker_multi'] == False) # noqa
]
metrics['steps2_filtered_rows_4_loose'] = merged_2steps[
((merged_2steps['joker_1'] == True) | (merged_2steps['joker_2'] == True))
& (merged_2steps['joker_multi'] == True)
]
metrics['steps2_filtered_rows_4_strict'] = merged_2steps[
((merged_2steps['joker_1'] == True) & (merged_2steps['joker_2'] == True))
& (merged_2steps['joker_multi'] == True)
]
metrics['steps3_filtered_rows_1_loose'] = merged_3steps[
(
(merged_3steps['joker_1'] == False)
& (merged_3steps['joker_2'] == False)
& (merged_3steps['joker_3'] == False)
)
& (merged_3steps['joker_multi'] == True)
]
metrics['steps3_filtered_rows_1_strict'] = merged_3steps[
(
(merged_3steps['joker_1'] == False)
| (merged_3steps['joker_2'] == False)
| (merged_3steps['joker_3'] == False)
)
& (merged_3steps['joker_multi'] == True)
]
metrics['steps3_filtered_rows_2'] = merged_3steps[
((merged_3steps['joker_1'] == True) & (merged_3steps['joker_2'] == True) & (merged_3steps['joker_3'] == True))
& (merged_3steps['joker_multi'] == False)
]
metrics['steps3_filtered_rows_3'] = merged_3steps[
(
(merged_3steps['joker_1'] == False)
| (merged_3steps['joker_2'] == False)
| (merged_3steps['joker_3'] == False)
)
& (merged_3steps['joker_multi'] == False)
]
metrics['steps3_filtered_rows_4_loose'] = merged_3steps[
((merged_3steps['joker_1'] == True) | (merged_3steps['joker_2'] == True) | (merged_3steps['joker_3'] == True))
& (merged_3steps['joker_multi'] == True)
]
metrics['steps3_filtered_rows_4_strict'] = merged_3steps[
((merged_3steps['joker_1'] == True) & (merged_3steps['joker_2'] == True) & (merged_3steps['joker_3'] == True))
& (merged_3steps['joker_multi'] == True)
]
# metrics.to_csv("/Users/mac/Desktop/测试结果/error_anal/csv/gpt4o-0626.csv", index = False)
return metrics
# Function to compute evaluation rates and final scores
def evaluate_compute_final_scores(metrics, total_count):
total_counts = {
'InadequateGeneralization': len(metrics['steps2_filtered_rows_2']) + len(metrics['steps3_filtered_rows_2']),
'InsufficientKnowledge': len(metrics['steps2_filtered_rows_3']) + len(metrics['steps3_filtered_rows_3']),
'CompleteMastery_loose': len(metrics['steps2_filtered_rows_4_loose'])
+ len(metrics['steps3_filtered_rows_4_loose']),
'CompleteMastery_strict': len(metrics['steps2_filtered_rows_4_strict'])
+ len(metrics['steps3_filtered_rows_4_strict']),
'RoteMemorization_loose': len(metrics['steps2_filtered_rows_1_loose'])
+ len(metrics['steps3_filtered_rows_1_loose']),
'RoteMemorization_strict': len(metrics['steps2_filtered_rows_1_strict'])
+ len(metrics['steps3_filtered_rows_1_strict']),
}
rates = {
'InadequateGeneralization_rate': "{:.2%}".format(total_counts['InadequateGeneralization'] / total_count),
'InsufficientKnowledge_rate': "{:.2%}".format(total_counts['InsufficientKnowledge'] / total_count),
'CompleteMastery_loose_rate': "{:.2%}".format(total_counts['CompleteMastery_loose'] / total_count),
'CompleteMastery_strict_rate': "{:.2%}".format(total_counts['CompleteMastery_strict'] / total_count),
'RoteMemorization_loose_rate': "{:.2%}".format(
total_counts['RoteMemorization_loose']
/ (total_counts['CompleteMastery_loose'] + total_counts['RoteMemorization_loose'])
),
'RoteMemorization_strict_rate': "{:.2%}".format(
total_counts['RoteMemorization_strict']
/ (total_counts['CompleteMastery_strict'] + total_counts['RoteMemorization_strict'])
),
}
return total_counts, rates
# Function to update main results DataFrame
def evaluate_update_main_results_df(main_results_df, total_counts, rates):
final_score_loose = "{:.2%}".format(
(
525
- 0.5 * total_counts['InadequateGeneralization']
- total_counts['RoteMemorization_loose']
- total_counts['InsufficientKnowledge']
)
/ 525
)
final_score_strict = "{:.2%}".format(
(
525
- 0.5 * total_counts['InadequateGeneralization']
- total_counts['RoteMemorization_strict']
- total_counts['InsufficientKnowledge']
)
/ 525
)
new_row = {
# 'Model': model,
'Score (Strict)': final_score_strict,
'InsufficientKnowledge (Strict)': f"{rates['InsufficientKnowledge_rate']} ({total_counts['InsufficientKnowledge']})",
'InadequateGeneralization (Strict)': f"{rates['InadequateGeneralization_rate']} ({total_counts['InadequateGeneralization']})",
'CompleteMastery (Strict)': f"{rates['CompleteMastery_strict_rate']} ({total_counts['CompleteMastery_strict']})",
'RoteMemorization (Strict)': f"{rates['RoteMemorization_strict_rate']} ({total_counts['RoteMemorization_strict']})",
'Score (Loose)': final_score_loose,
'InsufficientKnowledge (Loose)': f"{rates['InsufficientKnowledge_rate']} ({total_counts['InsufficientKnowledge']})",
'InadequateGeneralization (Loose)': f"{rates['InadequateGeneralization_rate']} ({total_counts['InadequateGeneralization']})",
'CompleteMastery (Loose)': f"{rates['CompleteMastery_loose_rate']} ({total_counts['CompleteMastery_loose']})",
'RoteMemorization (Loose)': f"{rates['RoteMemorization_loose_rate']} ({total_counts['RoteMemorization_loose']})",
}
main_results_df = pd.concat([main_results_df, pd.DataFrame([new_row])], ignore_index=True)
return main_results_df
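# Worked example of the scoring rule above, with hypothetical counts over the
# fixed total of 525 grouped samples:
#   InadequateGeneralization = 50, RoteMemorization (strict) = 30, InsufficientKnowledge = 40
#   Score (Strict) = (525 - 0.5 * 50 - 30 - 40) / 525 = 430 / 525 ≈ 81.90%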
# Main function to evaluate models
def wemath_evaluate_models(output_json, main_results_csv_path=None):
main_results_df = pd.DataFrame(
columns=[
'Model',
'Score (Strict)',
'InsufficientKnowledge (Strict)',
'InadequateGeneralization (Strict)',
'CompleteMastery (Strict)',
'RoteMemorization (Strict)',
'Score (Loose)',
'InsufficientKnowledge (Loose)',
'InadequateGeneralization (Loose)',
'CompleteMastery (Loose)',
'RoteMemorization (Loose)',
]
)
# print(f"Evaluating model: {model_name}, JSON path: {output_json}")
data = load_and_process_data(output_json)
data_2steps = data[data['key'].str.contains('2steps')]
data_3steps = data[data['key'].str.contains('3steps')]
merged_2steps = evaluate_process_steps_data(data_2steps, 2)
merged_3steps = evaluate_process_steps_data(data_3steps, 3)
metrics = evaluate_calculate_metrics(merged_2steps, merged_3steps)
total_counts, rates = evaluate_compute_final_scores(metrics, total_count=525)
main_results_df = evaluate_update_main_results_df(main_results_df, total_counts, rates)
print(main_results_df.to_string(index=False))
if main_results_csv_path is not None:
main_results_df.to_csv(main_results_csv_path, index=False)
print("Evaluation completed and results saved to CSV.")
return main_results_df.to_dict()
### Accuracy.py
# Function to load knowledge structure nodes
def load_knowledge_structure_nodes(filepath):
# with open(filepath, "r") as file:
# nodes = json.load(file)
nodes = knowledge_structure_nodes
nodes = pd.DataFrame(nodes)
nodes['final_key'] = nodes['full node'].str.split('_').str[-1]
nodes['root_2'] = nodes['full node'].str.split('_').str[1]
return nodes
# Function to evaluate steps
def accuracy_evaluate_steps(json, steps, nodes):
jokers = [json[[f'joker_{i}', f'knowledge concept_{i}']] for i in range(1, steps + 1)]
for i in range(steps):
jokers[i] = pd.merge(
jokers[i],
nodes[['final_key', 'full node', 'root_2']],
left_on=f'knowledge concept_{i + 1}',
right_on='final_key',
how='left',
)
jokers[i].rename(
columns={f'joker_{i + 1}': 'joker', f'knowledge concept_{i + 1}': 'knowledge_concept'},
inplace=True,
)
concatenated_steps = pd.concat(jokers, axis=0)
return concatenated_steps
# Function to process steps data and merge results
def accuracy_process_steps_data(df, steps):
steps_data = {f'{steps}steps_{i}': df[df['key'] == f'{steps}steps_{i}'] for i in range(1, steps + 1)}
steps_data[f'{steps}steps_multi'] = df[df['key'] == f'{steps}steps_multi']
for key, data in steps_data.items():
data.columns = [col + f'_{key.split("_")[-1]}' for col in data.columns]
merged_data = steps_data[f'{steps}steps_1']
for i in range(2, steps + 1):
merged_data = pd.merge(
merged_data, steps_data[f'{steps}steps_{i}'], left_on=f'ID_1', right_on=f'ID_{i}', how='left'
)
merged_data = pd.merge(
merged_data, steps_data[f'{steps}steps_multi'], left_on=f'ID_1', right_on='ID_multi', how='left'
)
return merged_data
# Function to update main results DataFrame
def accuracy_update_main_results_df(nodes, main_results_df, concatenated_data, merged_2steps, merged_3steps):
One_step_acc = "{:.2%}".format(concatenated_data['joker'].mean())
Two_step_acc = "{:.2%}".format(merged_2steps['joker_multi'].mean())
Three_step_acc = "{:.2%}".format(merged_3steps['joker_multi'].mean())
new_row = {
# 'Model': model_name,
'One-step(S1)': One_step_acc,
'Two-step(S2)': Two_step_acc,
'Three-step(S3)': Three_step_acc,
}
# Calculate rates according to Nodes
nodes['final_rode'] = nodes['full node'].str.split('_').str[-1]
csv_final_score = concatenated_data.groupby('final_key')['joker'].mean()
csv_final_score = pd.merge(nodes, csv_final_score, left_on='final_rode', right_on='final_key', how='left')
new_row.update(csv_final_score.groupby('root2')['joker'].mean().apply(lambda x: "{:.2%}".format(x)).to_dict())
main_results_df = pd.concat([main_results_df, pd.DataFrame([new_row])], ignore_index=True)
return main_results_df
# Main function to evaluate models
def wemath_accuracy(output_json, main_results_csv_path=None):
# nodes = load_knowledge_structure_nodes(knowledge_structure_nodes_path)
nodes = knowledge_structure_nodes
nodes = pd.DataFrame(nodes)
nodes['final_key'] = nodes['full node'].str.split('_').str[-1]
nodes['root_2'] = nodes['full node'].str.split('_').str[1]
main_results_df = pd.DataFrame(
columns=[
'Model',
'One-step(S1)',
'Two-step(S2)',
'Three-step(S3)',
'Understanding and Conversion of Units',
'Angles and Length',
'Calculation of Plane Figures',
'Understanding of Plane Figures',
'Calculation of Solid Figures',
'Understanding of Solid Figures',
'Basic Transformations of Figures',
'Cutting and Combining of Figures',
'Direction',
'Position',
'Route Map',
'Correspondence of Coordinates and Positions',
]
)
# print(f"Evaluating model: {model_name}, JSON path: {output_json}")
data = load_and_process_data(output_json)
data_2steps = data[data['key'].str.contains('2steps')]
data_3steps = data[data['key'].str.contains('3steps')]
merged_2steps = accuracy_process_steps_data(data_2steps, 2)
merged_3steps = accuracy_process_steps_data(data_3steps, 3)
concatenated_data = pd.concat(
[accuracy_evaluate_steps(merged_2steps, 2, nodes), accuracy_evaluate_steps(merged_3steps, 3, nodes)],
axis=0,
)
main_results_df = accuracy_update_main_results_df(
nodes, main_results_df, concatenated_data, merged_2steps, merged_3steps
)
print(main_results_df.to_string(index=False))
if main_results_csv_path is not None:
main_results_df.to_csv(main_results_csv_path, index=False)
print("Evaluation completed and results saved to CSV.")
return main_results_df.to_dict()
knowledge_structure_nodes = [
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Understanding and Conversion of Units",
"root3": "Conversion Rates and Calculations Between Area Units",
"root4": None,
"full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Area Units",
},
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Understanding and Conversion of Units",
"root3": "Conversion Rates and Calculations Between Volume Units (Including Liters and Milliliters)",
"root4": None,
"full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Volume Units (Including Liters and Milliliters)",
},
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Understanding and Conversion of Units",
"root3": "Conversion Rates and Calculations Between Length Units",
"root4": None,
"full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Length Units",
},
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Angles and Length",
"root3": "Understanding Angles (Using a Protractor)",
"root4": None,
"full node": "Measurement_Angles and Length_Understanding Angles (Using a Protractor)",
},
{
"root0": "Geometry and Figures",
"root1": "Measurement",
"root2": "Angles and Length",
"root3": "Understanding Length (Using a Ruler)",
"root4": None,
"full node": "Measurement_Angles and Length_Understanding Length (Using a Ruler)",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Surface Area of Solid Figures",
"root4": "Surface Area of Cylinders",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Cylinders",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Surface Area of Solid Figures",
"root4": "Surface Area of Rectangular Cuboids",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Rectangular Cuboids",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Surface Area of Solid Figures",
"root4": "Surface Area of Cubes",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Cubes",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Volume of Solid Figures",
"root4": "Volume and Capacity of Cylinders",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cylinders",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Volume of Solid Figures",
"root4": "Volume and Capacity of Cones",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cones",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Volume of Solid Figures",
"root4": "Volume and Capacity of Rectangular Cuboids",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Rectangular Cuboids",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Calculation of Solid Figures",
"root3": "Calculation of Volume of Solid Figures",
"root4": "Volume and Capacity of Cubes",
"full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cubes",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Expanded View of Solids",
"root4": "Expanded View of Cylinders",
"full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Cylinders",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Expanded View of Solids",
"root4": "Expanded View of Rectangular Cuboids",
"full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Rectangular Cuboids",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Expanded View of Solids",
"root4": "Expanded View of Cubes",
"full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Cubes",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Cylinders and Cones",
"root4": "Properties of Cylinders",
"full node": "Solid Figures_Understanding of Solid Figures_Cylinders and Cones_Properties of Cylinders",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Cylinders and Cones",
"root4": "Properties of Cones",
"full node": "Solid Figures_Understanding of Solid Figures_Cylinders and Cones_Properties of Cones",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Rectangular Cuboids and Cubes",
"root4": "Properties and Understanding of Rectangular Cuboids",
"full node": "Solid Figures_Understanding of Solid Figures_Rectangular Cuboids and Cubes_Properties and Understanding of Rectangular Cuboids",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Rectangular Cuboids and Cubes",
"root4": "Properties and Understanding of Cubes",
"full node": "Solid Figures_Understanding of Solid Figures_Rectangular Cuboids and Cubes_Properties and Understanding of Cubes",
},
{
"root0": "Geometry and Figures",
"root1": "Solid Figures",
"root2": "Understanding of Solid Figures",
"root3": "Observing Objects",
"root4": None,
"full node": "Solid Figures_Understanding of Solid Figures_Observing Objects",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Sum of Interior Angles of Polygons",
"root4": "Sum of Interior Angles of Other Polygons",
"full node": "Plane Figures_Calculation of Plane Figures_Sum of Interior Angles of Polygons_Sum of Interior Angles of Other Polygons",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Sum of Interior Angles of Polygons",
"root4": "Sum of Interior Angles of Triangles",
"full node": "Plane Figures_Calculation of Plane Figures_Sum of Interior Angles of Polygons_Sum of Interior Angles of Triangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation and Comparison of Angles",
"root4": None,
"full node": "Plane Figures_Calculation of Plane Figures_Calculation and Comparison of Angles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Parallelograms",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Parallelograms",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Triangles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Triangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Sectors",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Sectors",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Trapezoids",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Trapezoids",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Circles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Circles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Rectangles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Rectangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Areas",
"root4": "Area of Squares",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Squares",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Parallelograms",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Parallelograms",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Triangles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Triangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Trapezoids",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Trapezoids",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Circumference of Circles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Circumference of Circles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Rectangles",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Rectangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Calculation of Plane Figures",
"root3": "Calculation of Perimeters",
"root4": "Perimeter of Squares",
"full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Squares",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Parallelograms",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Parallelograms",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Triangles",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Triangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Trapezoids",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Trapezoids",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Rectangles",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Rectangles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Polygons",
"root4": "Properties and Understanding of Squares",
"full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Squares",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Classification and Understanding of Angles",
"root4": "Understanding Triangular Rulers",
"full node": "Plane Figures_Understanding of Plane Figures_Classification and Understanding of Angles_Understanding Triangular Rulers",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Classification and Understanding of Angles",
"root4": "Understanding and Representing Angles",
"full node": "Plane Figures_Understanding of Plane Figures_Classification and Understanding of Angles_Understanding and Representing Angles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Properties and Understanding of Line Segments",
"root4": "Distance Between Two Points",
"full node": "Plane Figures_Understanding of Plane Figures_Properties and Understanding of Line Segments_Distance Between Two Points",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Properties and Understanding of Line Segments",
"root4": "Understanding Line Segments, Lines, and Rays",
"full node": "Plane Figures_Understanding of Plane Figures_Properties and Understanding of Line Segments_Understanding Line Segments, Lines, and Rays",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Positional Relationships Between Line Segments",
"root4": "perpendicularity",
"full node": "Plane Figures_Understanding of Plane Figures_Positional Relationships Between Line Segments_perpendicularity",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Positional Relationships Between Line Segments",
"root4": "Parallel",
"full node": "Plane Figures_Understanding of Plane Figures_Positional Relationships Between Line Segments_Parallel",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Circles and Sectors",
"root4": "Understanding Sectors",
"full node": "Plane Figures_Understanding of Plane Figures_Circles and Sectors_Understanding Sectors",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Circles and Sectors",
"root4": "Understanding Circles",
"full node": "Plane Figures_Understanding of Plane Figures_Circles and Sectors_Understanding Circles",
},
{
"root0": "Geometry and Figures",
"root1": "Plane Figures",
"root2": "Understanding of Plane Figures",
"root3": "Observing Figures",
"root4": None,
"full node": "Plane Figures_Understanding of Plane Figures_Observing Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Basic Transformations of Figures",
"root3": "Axial Symmetry",
"root4": None,
"full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Axial Symmetry",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Basic Transformations of Figures",
"root3": "Translation",
"root4": None,
"full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Translation",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Basic Transformations of Figures",
"root3": "Rotation",
"root4": None,
"full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Rotation",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining and Dividing Solids",
"root4": None,
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining and Dividing Solids",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining Plane Figures",
"root4": "Division of Plane Figures",
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Division of Plane Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining Plane Figures",
"root4": "Combining Plane Figures",
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Combining Plane Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining Plane Figures",
"root4": "Tessellation of Figures",
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Tessellation of Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Transformation and Motion of Figures",
"root2": "Cutting and Combining of Figures",
"root3": "Combining Plane Figures",
"root4": "Folding Problems of Figures",
"full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Folding Problems of Figures",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Direction",
"root3": "Southeast, Southwest, Northeast, Northwest Directions",
"root4": None,
"full node": "Position and Direction_Direction_Southeast, Southwest, Northeast, Northwest Directions",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Direction",
"root3": "Cardinal Directions (East, South, West, North)",
"root4": None,
"full node": "Position and Direction_Direction_Cardinal Directions (East, South, West, North)",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Route Map",
"root3": "Determining the Positions of Objects Based on Direction, Angle, and Distance",
"root4": None,
"full node": "Position and Direction_Route Map_Determining the Positions of Objects Based on Direction, Angle, and Distance",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Route Map",
"root3": "Describing Simple Routes Based on Direction and Distance",
"root4": None,
"full node": "Position and Direction_Route Map_Describing Simple Routes Based on Direction and Distance",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Correspondence of Coordinates and Positions",
"root3": "Representing Positions Using Ordered Pairs",
"root4": None,
"full node": "Position and Direction_Correspondence of Coordinates and Positions_Representing Positions Using Ordered Pairs",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Correspondence of Coordinates and Positions",
"root3": "Finding Positions Based on Ordered Pairs",
"root4": None,
"full node": "Position and Direction_Correspondence of Coordinates and Positions_Finding Positions Based on Ordered Pairs",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Position",
"root3": "Front-Back Position",
"root4": None,
"full node": "Position and Direction_Position_Front-Back Position",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Position",
"root3": "Up-Down Position",
"root4": None,
"full node": "Position and Direction_Position_Up-Down Position",
},
{
"root0": "Geometry and Figures",
"root1": "Position and Direction",
"root2": "Position",
"root3": "Left-Right Position",
"root4": None,
"full node": "Position and Direction_Position_Left-Right Position",
},
]

View File

@@ -1,6 +1,47 @@
from ...smp import *
def AMBER_rating(data_file):
data = load(data_file)
stats = defaultdict(dict)
lt = len(data)
category_mapping = {
'discriminative-attribute-state': 'Attribute',
'discriminative-attribute-number': 'Attribute',
'discriminative-attribute-action': 'Attribute',
'discriminative-hallucination': 'Existence',
'discriminative-relation': 'Relation',
'relation': 'Relation'
}
for i in range(lt):
item = data.iloc[i]
category = item['category']
image_path = item['image_path']
score = item['score']
new_category = category_mapping.get(category, category)
if image_path not in stats[new_category]:
stats[new_category][image_path] = []
stats[new_category][image_path].append(score)
def acc(key):
res = stats[key]
values = []
for val in res.values():
values.extend(val)
return np.mean(values) * 100
scores = {}
for k in stats:
scores[k] = acc(k)
scores['Avg ACC'] = np.mean(list(scores.values()))
ret = d2df(scores)
return ret
def MME_rating(data_file):
data = load(data_file)
stats = defaultdict(dict)