Modify eval_mm for MiniCPM-V 2.6

2026-02-05 18:29:18 +08:00 · 2024-08-30 18:18:22 +00:00
parent ab1141ee45
commit 59224808a1
69 changed files with 8231 additions and 1818 deletions
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py
@@ -0,0 +1,9 @@
+from .judge_util import build_judge, DEBUG_MESSAGE
+from .multiple_choice import extract_answer_from_item, prefetch_answer
+from .vqa_eval import levenshtein_distance
+
+
+# Names re-exported by this package (all of them are imported above).
+__all__ = [
+    'build_judge',
+    'extract_answer_from_item',
+    'prefetch_answer',
+    'levenshtein_distance',
+    'DEBUG_MESSAGE',
+]
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py
@@ -0,0 +1,41 @@
+import os
+from ...api import OpenAIWrapper
+from ...smp import load_env
+
+# Value of the INTERNAL environment variable.
+# NOTE(review): this is the raw string when the variable is set and the int 0
+# otherwise, so a value such as '0' is still truthy — confirm how importers test it.
+INTERNAL = os.environ.get('INTERNAL', 0)
+
+
+def build_judge(**kwargs):
+    """Create the judge model wrapper used by the evaluation utilities.
+
+    Keyword Args:
+        model: Short judge alias (a key of the alias table below). It is ignored
+            when the ``LOCAL_LLM`` environment variable is set.
+        nproc: Accepted and discarded; it is not forwarded to the wrapper.
+        **kwargs: Every other keyword is forwarded unchanged to ``OpenAIWrapper``.
+
+    Returns:
+        An ``OpenAIWrapper`` constructed with the resolved model version.
+
+    Raises:
+        KeyError: If ``LOCAL_LLM`` is unset and ``model`` is missing or is not a
+            known alias.
+    """
+    model = kwargs.pop('model', None)
+    kwargs.pop('nproc', None)
+    # load_env() runs before LOCAL_LLM is read, presumably so that values it
+    # loads are visible in os.environ — confirm against its implementation.
+    load_env()
+    local_llm = os.environ.get('LOCAL_LLM', None)
+    if local_llm is not None:
+        # A locally served model overrides whatever alias was requested.
+        model_version = local_llm
+    else:
+        model_map = {
+            'gpt-4-turbo': 'gpt-4-1106-preview',
+            'gpt-4-0613': 'gpt-4-0613',
+            'gpt-4-0125': 'gpt-4-0125-preview',
+            'gpt-4-0409': 'gpt-4-turbo-2024-04-09',
+            'chatgpt-1106': 'gpt-3.5-turbo-1106',
+            'chatgpt-0125': 'gpt-3.5-turbo-0125',
+            'gpt-4o': 'gpt-4o-2024-05-13',
+            'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
+        }
+        if model not in model_map:
+            # Same exception type as the plain dict lookup, but with a message
+            # that tells the user what went wrong and how to fix it.
+            raise KeyError(
+                f'Unknown judge model {model!r}; expected one of {sorted(model_map)} '
+                'or set the LOCAL_LLM environment variable.'
+            )
+        model_version = model_map[model]
+    return OpenAIWrapper(model_version, **kwargs)
+
+
+# Troubleshooting hint for judge (OpenAI API) failures; re-exported by the package.
+DEBUG_MESSAGE = """
+To debug the OpenAI API, you can try the following scripts in python:
+```python
+from vlmeval.api import OpenAIWrapper
+model = OpenAIWrapper('gpt-4-1106-preview', verbose=True)
+msgs = [dict(type='text', value='Hello!')]
+code, answer, resp = model.generate_inner(msgs)
+print(code, answer, resp)
+```
+You can see the specific error if the API call fails.
+"""
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/llavabench.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/llavabench.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pandas as pd
+from ...smp import *
+
+# The three LLaVA-Bench categories share one judge instruction; it is spelled out
+# once here and attached to every category entry below.
+_LLAVA_BENCH_PROMPT = (
+    'We would like to request your feedback on the performance of two AI assistants '
+    'in response to the user question displayed above. '
+    'The user asks the question on observing an image. '
+    'For your reference, the visual content in the image is represented with '
+    'a few sentences describing the image. \n'
+    'Please rate the helpfulness, relevance, accuracy, level of details of their responses. '
+    'Each assistant receives an overall score on a scale of 1 to 10, '
+    'where a higher score indicates better overall performance.\n'
+    'Please first output a single line containing only two values indicating '
+    'the scores for Assistant 1 and 2, respectively. '
+    'The two scores are separated by a space.\n'
+    'In the subsequent line, please provide a comprehensive explanation of your evaluation, '
+    'avoiding any potential bias and ensuring that the order in which the responses '
+    'were presented does not affect your judgment.'
+)
+
+# Judge role and instruction, keyed by 'llava_bench_<category>'.
+rule_dict = {
+    'llava_bench_' + category: {'role': 'Assistant', 'prompt': _LLAVA_BENCH_PROMPT}
+    for category in ('conv', 'detail', 'complex')
+}
+
+
+def get_eval(judge, content):
+    """Send ``content`` to ``judge.generate`` and return its reply unchanged."""
+    review = judge.generate(content)
+    return review
+
+
+def parse_score(review):
+    """Extract the two leading scores from a judge review.
+
+    The first line of ``review`` must hold exactly two numbers separated by
+    whitespace and/or commas (for example ``"8 9"`` or ``"8, 9"``).
+
+    Args:
+        review: Full text returned by the judge.
+
+    Returns:
+        ``[score_1, score_2]`` as floats, or ``[-1, -1]`` when the first line
+        cannot be parsed (the failure is logged through the 'Evaluation' logger).
+    """
+    try:
+        first_line = review.split('\n')[0]
+        # split() without an argument collapses runs of whitespace, so "8, 9"
+        # (which becomes "8  9" after the comma replacement) and trailing
+        # blanks still yield exactly two fields.
+        sp = first_line.replace(',', ' ').split()
+        if len(sp) == 2:
+            return [float(sp[0]), float(sp[1])]
+        # The message is pre-formatted: passing extra positional args without a
+        # matching placeholder makes the logging call itself fail.
+        get_logger('Evaluation').error(f'error {review}')
+        return [-1, -1]
+    except Exception as e:
+        # Best effort by design: any parsing problem maps to the sentinel pair.
+        get_logger('Evaluation').error(f'{e} error {review}')
+        return [-1, -1]
+
+
+def build_prompt(line):
+    """Assemble the judge prompt comparing the GPT-4 answer with the prediction.
+
+    ``line`` supplies ``caption``, ``question``, ``gpt4_ans``, ``prediction`` and
+    ``category``; the category selects the role/instruction pair from ``rule_dict``.
+    The GPT-4 answer is presented as number 1 and the prediction as number 2.
+    """
+    caption, query = line['caption'], line['question']
+    reference, candidate = line['gpt4_ans'], line['prediction']
+    rule = rule_dict['llava_bench_' + line['category']]
+    role, instruction = rule['role'], rule['prompt']
+
+    sections = [
+        f'[Context]\n{caption}\n\n',
+        f'[Question]\n{query}\n\n',
+        f'[{role} 1]\n{reference}\n\n[End of {role} 1]\n\n',
+        f'[{role} 2]\n{candidate}\n\n[End of {role} 2]\n\n',
+        f'[System]\n{instruction}\n\n',
+    ]
+    return ''.join(sections)
+
+
+def LLaVABench_atomeval(model, prompt):
+    """Query the judge with ``prompt`` and return the score pair parsed from its review."""
+    return parse_score(get_eval(model, prompt))
+
+
+def LLaVABench_score(data):
+    """Summarise LLaVA-Bench scores overall and per category.
+
+    Args:
+        data: DataFrame with ``category``, ``score`` and ``gpt4_score`` columns.
+
+    Returns:
+        DataFrame with one row per split ('overall' first, then each category in
+        sorted order) and the columns ``split``, ``Relative Score (main)``
+        (mean score / mean GPT-4 score * 100), ``VLM Score`` and ``GPT4 Score``
+        (the respective means * 10).
+    """
+    # Sort the categories: iterating a bare set gives a row order that can
+    # change from one run to the next.
+    cates = ['overall'] + sorted(set(data['category']), key=str)
+    ret = {'split': [], 'Relative Score (main)': [], 'VLM Score': [], 'GPT4 Score': []}
+
+    for c in cates:
+        sub = data if c == 'overall' else data[data['category'] == c]
+        vlm_mean = np.mean(sub['score'])
+        gpt4_mean = np.mean(sub['gpt4_score'])
+        ret['split'].append(c)
+        ret['Relative Score (main)'].append(vlm_mean / gpt4_mean * 100)
+        ret['VLM Score'].append(vlm_mean * 10)
+        ret['GPT4 Score'].append(gpt4_mean * 10)
+    return pd.DataFrame(ret)
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py
@@ -0,0 +1,170 @@
+from ...smp import *
+from ...utils import can_infer
+try:
+    from latex2sympy2 import latex2sympy
+except ImportError:
+    print('Please install latex2sympy2 by running "pip install latex2sympy2"')
+
+# Substring looked for in the judge's reply; a reply containing it counts as a
+# failed attempt (see MATH_V_auxeval).
+FAIL_MSG = 'Failed to obtain answer via API.'
+
+
+def is_equal(asw: str, gt_asw: str) -> bool:
+    """Decide whether a predicted answer matches the ground truth.
+
+    Three checks are tried in order: case-insensitive string equality, numeric
+    closeness (within 1e-6) after evaluating both as Python expressions, and
+    the same closeness test after parsing both with ``latex2sympy``.
+
+    Args:
+        asw: Predicted answer. Non-string values are stringified after a warning.
+        gt_asw: Ground-truth answer, handled the same way.
+
+    Returns:
+        True if any check succeeds, otherwise False. Never raises for
+        unparsable input.
+    """
+    if not isinstance(asw, str) or not isinstance(gt_asw, str):
+        print('Warning: input is not string')
+        print(asw, gt_asw)
+    asw = str(asw).lower().strip()
+    gt_asw = str(gt_asw).lower().strip()
+    if gt_asw == asw:
+        return True
+    # SECURITY NOTE(review): eval() below runs on dataset text and model output.
+    # That is unsafe for untrusted input and should be replaced by a restricted
+    # arithmetic parser; it is kept here so accepted answers do not change.
+    # SystemExit is caught on purpose: evaluated text such as "exit()" must not
+    # end the evaluation run. KeyboardInterrupt is deliberately not swallowed.
+    try:
+        a = eval(gt_asw)
+        b = eval(asw)
+        if abs(a - b) < 1e-6:
+            return True
+    except (Exception, SystemExit):
+        pass
+    try:
+        # latex2sympy is missing when the optional import at the top of the
+        # module failed; the resulting NameError is absorbed below.
+        a = latex2sympy(gt_asw)
+        b = latex2sympy(asw)
+        if abs(eval(str(a)) - eval(str(b))) < 1e-6:
+            return True
+        if abs(a - b) < 1e-6:
+            return True
+    except (Exception, SystemExit):
+        pass
+    return False
+
+
+def get_gpt4_ICE():
+    """Return the five in-context examples used to show the judge how to extract answers.
+
+    Each example is a hint / question / model response / extracted answer block;
+    the last one is a multiple-choice case answered with an option letter.
+    """
+    ice_missing_number = """
+Hint: Please answer the question and provide the final answer at the end.\n
+Question: Which number is missing?\n
+Model response: The number missing in the sequence is 14.\n
+Extracted answer: 14
+"""
+
+    ice_fraction = """
+Hint: Please answer the question and provide the final answer at the end.\n
+Question: What is the fraction of females facing the camera?\n
+Model response: The fraction of females facing the camera is 0.6,
+which means that six out of ten females in the group are facing the camera.\n
+Extracted answer: 0.6
+"""
+
+    ice_money = """
+Hint: Please answer the question and provide the final answer at the end.\n
+Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
+Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
+Extracted answer: 1.45
+"""
+
+    ice_year_range = """
+Hint: Please answer the question and provide the final answer at the end.\n
+Question: Between which two years does the line graph saw its maximum peak?\n
+Model response: The line graph saw its maximum peak between 2007 and 2008.\n
+Extracted answer: [2007, 2008]
+"""
+
+    ice_choice = """
+Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
+Question: What fraction of the shape is blue?\n
+Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
+Model response: The correct answer is (B) 8/11.\n
+Extracted answer: B
+"""
+
+    demos = []
+    demos.extend((ice_missing_number, ice_fraction, ice_money, ice_year_range, ice_choice))
+    return demos
+
+
+def build_mathv_gpt4_prompt(line):
+    """Build the judge prompt that asks for the final answer to be extracted.
+
+    The prompt is the task description, the in-context examples from
+    ``get_gpt4_ICE``, then the sample's question and model response, ending
+    with an open ``Extracted answer:`` field for the judge to complete.
+
+    Args:
+        line: Record providing ``question`` (str) and ``prediction``.
+
+    Returns:
+        The complete prompt string.
+    """
+    task_description = """
+Please read the following example.
+Then extract the answer from the model response and type it at the end of the prompt.\n
+"""
+    question = line['question']
+    prediction = str(line['prediction'])
+    parts = [task_description]
+    parts.extend(example + '\n' for example in get_gpt4_ICE())
+    parts.append(question + '\n')
+    # Use the same label spelling as the in-context examples and end the
+    # response with a line break, so the prediction does not run straight
+    # into the 'Extracted answer:' field.
+    parts.append('Model response: ' + prediction + '\n')
+    parts.append('Extracted answer:')
+    return ''.join(parts)
+
+
+def list_to_dict(lst):
+    """Label the items of ``lst`` with consecutive letters starting at 'A'."""
+    labelled = {}
+    for offset, item in enumerate(lst):
+        labelled[chr(ord('A') + offset)] = item
+    return labelled
+
+
+def post_check(line, prefetch=False):
+    """Check one MathVision record against its ground-truth answer.
+
+    Args:
+        line: Record with ``answer``, ``choices`` (string-encoded list) and
+            ``prediction`` (read when ``prefetch`` is True) or ``res`` (read
+            otherwise).
+        prefetch: When True, try to obtain the answer directly from the raw
+            prediction instead of judging the extracted ``res``.
+
+    Returns:
+        With ``prefetch=True``: for multiple-choice records, whatever
+        ``can_infer`` returns; for free-form records, the response string if it
+        equals the answer, else False.
+        With ``prefetch=False``: True if the response matches the answer, else
+        False.
+    """
+    res = None
+    ans = line['answer']
+    response = line['prediction'] if prefetch else line['res']
+    try:
+        # NOTE(security): eval() on dataset text; acceptable only for trusted
+        # benchmark files. Evaluated once and reused below.
+        choice_list = eval(line['choices'])
+        if len(choice_list) > 0:
+            # Multiple choice: map the response onto an option letter.
+            res = can_infer(response, list_to_dict(choice_list))
+            if prefetch:
+                return res
+        else:
+            # Free form: compare the textual forms.
+            res = str(response)
+            ans = str(ans)
+    except ValueError:
+        pass
+
+    if is_equal(res, ans):
+        return res if prefetch else True
+    return False
+
+
+def MATH_V_auxeval(model, line):
+    """Obtain the extracted answer for one MathVision record.
+
+    The answer is first sought directly in the raw prediction ("prefetch");
+    only if that fails is the judge model asked, up to ``retry`` times.
+
+    Args:
+        model: Judge exposing ``generate(prompt, temperature=...)``.
+        line: Record consumed by ``post_check`` and ``build_mathv_gpt4_prompt``.
+
+    Returns:
+        dict with ``log`` (status text; exactly 'Prefetch succeed' on the
+        shortcut path) and ``res`` (the extracted answer, or '' when every
+        attempt failed).
+    """
+    retry = 5
+    # Run the prefetch check once and reuse its result.
+    res = post_check(line, prefetch=True)
+    if res:
+        return dict(log='Prefetch succeed', res=res)
+
+    # The judge prompt is only needed once the shortcut has failed.
+    prompt = build_mathv_gpt4_prompt(line)
+    prediction = line['prediction']
+    log = ''
+    for i in range(retry):
+        # Temperature rises by 0.5 per attempt, starting from 0.
+        res = model.generate(prompt, temperature=i * 0.5)
+
+        if FAIL_MSG in res:
+            log += f'Try {i}: output is {prediction}, failed to parse.\n'
+        else:
+            log += 'Succeed'
+            return dict(log=log, res=res)
+    log += f'All {retry} retries failed.\n'
+    return dict(log=log, res='')
+
+
+def MATH_V_acc(result_file):
+    """Tabulate MathVision accuracy overall and per category.
+
+    Loads ``result_file`` and counts, for 'Overall' and for each ``category``,
+    the number of records, how many were solved by prefetch (``log`` equals
+    'Prefetch succeed') and how many pass ``post_check``. Returns a DataFrame
+    sorted by ``Subject`` with the counts plus ``prefetch_rate`` and ``acc`` in
+    percent.
+    """
+    records = load(result_file)
+    tot = defaultdict(int)
+    fetch = defaultdict(int)
+    hit = defaultdict(int)
+
+    for idx in range(len(records)):
+        row = records.iloc[idx]
+        groups = ('Overall', row['category'])
+        for name in groups:
+            tot[name] += 1
+        if row['log'] == 'Prefetch succeed':
+            for name in groups:
+                fetch[name] += 1
+        if post_check(row, prefetch=False):
+            for name in groups:
+                hit[name] += 1
+
+    res = defaultdict(list)
+    for subject, total in tot.items():
+        n_fetch, n_hit = fetch[subject], hit[subject]
+        res['Subject'].append(subject)
+        res['tot'].append(total)
+        res['prefetch'].append(n_fetch)
+        res['hit'].append(n_hit)
+        res['prefetch_rate'].append(n_fetch / total * 100)
+        res['acc'].append(n_hit / total * 100)
+    table = pd.DataFrame(res)
+    return table.sort_values('Subject', ignore_index=True)
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathvista.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathvista.py
@@ -0,0 +1,164 @@
+from ...smp import *
+from ...utils import can_infer
+
+
+# Substring looked for in the judge's reply; a reply containing it counts as a
+# failed attempt (see MathVista_auxeval).
+FAIL_MSG = 'Failed to obtain answer via API.'
+
+
+def get_gpt4_ICE():
+    """Return the five in-context examples used to show the judge how to extract answers.
+
+    The examples cover, in order, an integer answer, a one-decimal float, a
+    two-decimal float, a Python list and a multiple-choice option letter.
+    """
+    ice_integer = """
+Hint: Please answer the question requiring an integer answer and provide the final value,
+e.g., 1, 2, 3, at the end.\n
+Question: Which number is missing?\n
+Model response: The number missing in the sequence is 14.\n
+Extracted answer: 14
+"""
+
+    ice_float_one_decimal = """
+Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value,
+e.g., 1.2, 1.3, 1.4, at the end.\n
+Question: What is the fraction of females facing the camera?\n
+Model response: The fraction of females facing the camera is 0.6,
+which means that six out of ten females in the group are facing the camera.\n
+Extracted answer: 0.6
+"""
+
+    ice_float_two_decimals = """
+Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value,
+e.g., 1.23, 1.34, 1.45, at the end.\n
+Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
+Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
+Extracted answer: 1.45
+"""
+
+    ice_list = """
+Hint: Please answer the question requiring a Python list as an answer and provide the final list,
+e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.\n
+Question: Between which two years does the line graph saw its maximum peak?\n
+Model response: The line graph saw its maximum peak between 2007 and 2008.\n
+Extracted answer: [2007, 2008]
+"""
+
+    ice_choice = """
+Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
+Question: What fraction of the shape is blue?\n
+Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
+Model response: The correct answer is (B) 8/11.\n
+Extracted answer: B
+"""
+
+    demos = []
+    demos.extend((ice_integer, ice_float_one_decimal, ice_float_two_decimals, ice_list, ice_choice))
+    return demos
+
+
+def build_mathvista_gpt4_prompt(line):
+    """Build the judge prompt that asks for the final answer to be extracted.
+
+    The prompt is the task description, the in-context examples from
+    ``get_gpt4_ICE``, then the sample's question and model response, ending
+    with an open ``Extracted answer:`` field for the judge to complete.
+
+    Args:
+        line: Record providing ``question`` (str) and ``prediction``.
+
+    Returns:
+        The complete prompt string.
+    """
+    task_description = """
+Please read the following example.
+Then extract the answer from the model response and type it at the end of the prompt.\n
+"""
+    question = line['question']
+    prediction = str(line['prediction'])
+    parts = [task_description]
+    parts.extend(example + '\n' for example in get_gpt4_ICE())
+    parts.append(question + '\n')
+    # Use the same label spelling as the in-context examples and end the
+    # response with a line break, so the prediction does not run straight
+    # into the 'Extracted answer:' field.
+    parts.append('Model response: ' + prediction + '\n')
+    parts.append('Extracted answer:')
+    return ''.join(parts)
+
+
+def list_to_dict(lst):
+    """Label the items of ``lst`` with consecutive letters starting at 'A'."""
+    labelled = {}
+    for offset, item in enumerate(lst):
+        labelled[chr(ord('A') + offset)] = item
+    return labelled
+
+
+def post_check(line, prefetch=False):
+    """Check one MathVista record against its ground-truth answer.
+
+    Args:
+        line: Record with ``answer``, ``question_type``, ``answer_type`` and
+            ``prediction`` (read when ``prefetch`` is True) or ``res`` (read
+            otherwise); multiple-choice records also need ``answer_option``
+            and ``choices`` (string-encoded list).
+        prefetch: When True, try to obtain the answer directly from the raw
+            prediction instead of judging the extracted ``res``.
+
+    Returns:
+        With ``prefetch=True``: for multiple-choice records, whatever
+        ``can_infer`` returns; otherwise the converted response if it equals
+        the answer, else False.
+        With ``prefetch=False``: True if the response matches the answer, else
+        False.
+    """
+    res = None
+    ans = line['answer']
+    response = line['prediction'] if prefetch else line['res']
+    try:
+        if line['question_type'] == 'multi_choice':
+            ans = line['answer_option']
+            # NOTE(security): eval() on dataset text; acceptable only for
+            # trusted benchmark files.
+            choices = list_to_dict(eval(line['choices']))
+            res = can_infer(response, choices)
+            if prefetch:
+                return res
+        else:
+            if line['answer_type'] == 'integer':
+                res = int(response)
+                ans = int(line['answer'])
+            elif line['answer_type'] == 'float':
+                res = float(response)
+                ans = float(line['answer'])
+            else:
+                # Compare the response text itself; stringifying the still-unset
+                # ``res`` would always give 'None' and never match.
+                res = str(response)
+                ans = str(ans)
+    except ValueError:
+        # A response that is not a valid int/float leaves ``res`` as None, which
+        # then fails the comparison below.
+        pass
+
+    if res == ans:
+        return res if prefetch else True
+    return False
+
+
+def MathVista_auxeval(model, line):
+    """Obtain the extracted answer for one MathVista record.
+
+    The answer is first sought directly in the raw prediction ("prefetch");
+    only if that fails is the judge model asked, up to ``retry`` times.
+
+    Args:
+        model: Judge exposing ``generate(prompt, temperature=...)``.
+        line: Record consumed by ``post_check`` and ``build_mathvista_gpt4_prompt``.
+
+    Returns:
+        dict with ``log`` (status text; exactly 'Prefetch succeed' on the
+        shortcut path) and ``res`` (the extracted answer, or '' when every
+        attempt failed).
+    """
+    retry = 5
+    # Run the prefetch check once and reuse its result.
+    res = post_check(line, prefetch=True)
+    if res:
+        return dict(log='Prefetch succeed', res=res)
+
+    # The judge prompt is only needed once the shortcut has failed.
+    prompt = build_mathvista_gpt4_prompt(line)
+    prediction = line['prediction']
+    log = ''
+    for i in range(retry):
+        # Temperature rises by 0.5 per attempt, starting from 0.
+        res = model.generate(prompt, temperature=i * 0.5)
+
+        if FAIL_MSG in res:
+            log += f'Try {i}: output is {prediction}, failed to parse.\n'
+        else:
+            log += 'Succeed'
+            return dict(log=log, res=res)
+    log += f'All {retry} retries failed.\n'
+    return dict(log=log, res='')
+
+
+def MathVista_acc(result_file):
+    """Tabulate MathVista accuracy overall, per task and per skill.
+
+    Args:
+        result_file: Path passed to ``load``; the table must provide ``task``,
+            ``skills``, ``log`` and the fields read by ``post_check``.
+
+    Returns:
+        DataFrame with one row per group ('Overall', each task, each skill) and
+        the columns ``Task&Skill``, ``tot``, ``prefetch``, ``hit``,
+        ``prefetch_rate`` and ``acc`` (rates in percent).
+    """
+    data = load(result_file)
+    tot = defaultdict(lambda: 0)
+    fetch = defaultdict(lambda: 0)
+    hit = defaultdict(lambda: 0)
+    lt = len(data)
+    for i in range(lt):
+        item = data.iloc[i]
+        cate = item['task']
+        tot['Overall'] += 1
+        try:
+            # NOTE(security): eval() on dataset text; trusted benchmark files only.
+            skills = eval(item['skills'])
+        except (SyntaxError, NameError):
+            # Not a list literal: treat the raw value as one skill name. A bare
+            # single word raises NameError rather than SyntaxError.
+            skills = [item['skills']]
+        for skill in skills:
+            tot[skill] += 1
+        tot[cate] += 1
+        if item['log'] == 'Prefetch succeed':
+            fetch['Overall'] += 1
+            fetch[cate] += 1
+            for skill in skills:
+                fetch[skill] += 1
+        if post_check(item, prefetch=False):
+            hit['Overall'] += 1
+            hit[cate] += 1
+            for skill in skills:
+                hit[skill] += 1
+
+    res = defaultdict(list)
+    for k in tot.keys():
+        res['Task&Skill'].append(k)
+        res['tot'].append(tot[k])
+        res['prefetch'].append(fetch[k])
+        res['hit'].append(hit[k])
+        res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
+        res['acc'].append(hit[k] / tot[k] * 100)
+    res = pd.DataFrame(res)
+    return res
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmbench_video.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmbench_video.py
@@ -0,0 +1,70 @@
+from ...smp import *
+import numpy as np
+
+# NOTE(review): not referenced in the visible part of this module; presumably the
+# failure marker returned by the judge API wrapper — confirm against callers.
+FAIL_MSG = 'Failed to obtain answer via API.'
+
+# Judge instruction text: rate a candidate answer from 0 to 3 by its semantic
+# similarity to the ground-truth answer.
+# NOTE(review): not referenced in the visible part of this module; presumably
+# passed to the judge as its system prompt by the caller — confirm.
+system_prompt = """
+As an AI assistant, your task is to evaluate a candidate answer in comparison to a given correct answer.
+The question itself, the correct 'groundtruth' answer, and the candidate answer will be provided to you.
+Your assessment should range from 0 to 3, \
+based solely on the semantic similarity between the groundtruth and the candidate answer, \
+disregarding any grammatical differences.
+A rating of 0 suggests no similarity, implying the candidate answer is entirely incorrect.
+A rating of 1 suggests low similarity, meaning the candidate answer is largely incorrect.
+A rating of 2 suggests high similarity, meaning the candidate answer is largely correct.
+Lastly, a rating of 3 indicates complete similarity, which means the candidate answer is entirely correct.
+Your response should be a single integer from 0, 1, 2, or 3.
+"""
+
+# Fine-grained ("L3") dimensions grouped under their coarse capability code.
+MMV_DIMENSIONS = {
+    'CP': ['Video Topic', 'Video Emotion', 'Video Scene', 'Video Style'],
+    'FP-S': ['OCR', 'Object Recognition', 'Attribute Recognition', 'Event Recognition', 'Human Motion', 'Counting'],
+    'FP-C': ['Spatial Relationship', 'Human-object Interaction', 'Human Interaction'],
+    'HL': ['Hallucination'],
+    'LR': ['Structuralized Image-Text Understanding', 'Mathematical Calculation'],
+    'AR': ['Physical Property', 'Function Reasoning', 'Identity Reasoning'],
+    'RR': ['Natural Relation', 'Physical Relation', 'Social Relation'],
+    'CSR': ['Common Sense Reasoning'],
+    'TR': ['Counterfactual Reasoning', 'Causal Reasoning', 'Future Prediction'],
+}
+# Flat list of every fine-grained dimension, in declaration order. It is built
+# before the aggregate groups are added so that it holds leaf dimensions only.
+L3_DIMS = [dim for dims in MMV_DIMENSIONS.values() for dim in dims]
+
+# Aggregate groups. Comprehensions are used so that no loop variables are left
+# behind as public module attributes.
+_PERCEPTION_KEYS = ('CP', 'FP-C', 'FP-S', 'HL')
+_REASONING_KEYS = ('LR', 'AR', 'RR', 'CSR', 'TR')
+MMV_DIMENSIONS['Perception'] = [dim for key in _PERCEPTION_KEYS for dim in MMV_DIMENSIONS[key]]
+MMV_DIMENSIONS['Reasoning'] = [dim for key in _REASONING_KEYS for dim in MMV_DIMENSIONS[key]]
+# 'Overall' is every perception dimension followed by every reasoning dimension.
+MMV_DIMENSIONS['Overall'] = MMV_DIMENSIONS['Perception'] + MMV_DIMENSIONS['Reasoning']
+
+
+def get_dimension_rating(data_path):
+    """Average the judge scores per coarse group and per fine-grained dimension.
+
+    Each record's ``dimensions`` field (a string-encoded list) decides which
+    fine dimensions, and which coarse groups containing any of them, receive the
+    record's ``score``. Two means are reported per bucket, formatted to two
+    decimals: ``*_all`` counts negative scores as 0, ``*_valid`` leaves them out.
+    """
+    table = load(data_path)
+    coarse_rating = {name: [] for name in MMV_DIMENSIONS}
+    fine_rating = {name: [] for name in L3_DIMS}
+
+    for idx in range(len(table)):
+        tags = eval(table.iloc[idx]['dimensions'])
+
+        for tag in tags:
+            fine_rating[tag].append(table.iloc[idx]['score'])
+
+        for group, members in MMV_DIMENSIONS.items():
+            if np.any([tag in members for tag in tags]):
+                coarse_rating[group].append(table.iloc[idx]['score'])
+
+    def _mean_clipped(scores):
+        # Negative scores are counted as zero.
+        return f'{np.mean([max(s, 0) for s in scores]):.2f}'
+
+    def _mean_valid(scores):
+        # Negative scores are left out of the mean entirely.
+        return f'{np.mean([s for s in scores if s >= 0]):.2f}'
+
+    coarse_all = {name: _mean_clipped(scores) for name, scores in coarse_rating.items()}
+    coarse_valid = {name: _mean_valid(scores) for name, scores in coarse_rating.items()}
+    fine_all = {name: _mean_clipped(scores) for name, scores in fine_rating.items()}
+    fine_valid = {name: _mean_valid(scores) for name, scores in fine_rating.items()}
+    return dict(coarse_all=coarse_all, coarse_valid=coarse_valid, fine_all=fine_all, fine_valid=fine_valid)
+
+
+def build_prompt(item):
+    """Format the question, ground-truth answer and candidate answer into the judge query."""
+    fields = (item['question'], item['answer'], item['prediction'])
+    tmpl = 'Question: {}\nGroundtruth answer: {}\nCandidate answer: {}\nYour response: '
+    return tmpl.format(*fields)
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmdu.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmdu.py
@@ -0,0 +1,126 @@
+from ...smp import *
+
+# Judge instructions prepended to every per-turn prompt built in mmdu_score:
+# they define the scoring rubric and ask for the scores as a trailing dictionary.
+meta_prompt = """
+You are an assistant skilled at evaluating the quality of creative text.
+Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to \
+the user question displayed below. You'll need to assess the response on the following dimensions: \
+Creativity, Richness, Visual Perception, Logical Coherence, Answer Accuracy and Image Relationship Understanding. \
+We will provide you with a creative question and the AI model's response and a reference answer for your evaluation. \
+As you begin your assessment, follow this process:
+1. Evaluate the AI model's answers on different dimensions, pointing out its strengths or weaknesses \
+in each dimension and assigning a score of 1 to 10 for each.
+2. Finally, based on the assessments across dimensions, \
+provide an overall score of 1 to 10 for the AI model's response.
+3. Your scoring should be as stringent as possible and follow the scoring rules below:
+In general, the higher the quality of the model's response and its strict adherence to user needs, \
+the higher the score. Responses that do not meet user needs will receive lower scores.
+Scoring rules:
+Creativity:
+Scores 1-2 when there is no innovation or uniqueness in the content.
+Scores 3-4 when providing partially original content but with low creative quality.
+Scores 5-6 when mostly creative but lacks significant novelty, with moderate quality.
+Scores 7-8 when having novelty and high-quality content.
+Scores 9-10 when highly novel and of exceptional quality compared to the reference answer.
+Richness:
+Scores 1-2 when lacking depth and breadth, with very limited information.
+Scores 3-4 when limited in depth and breadth, with fewer explanations and examples, showing low diversity.
+Scores 5-6 when limited in depth and breadth but provides basic necessary information.
+Scores 7-8 when providing depth and useful additional information.
+Scores 9-10 when providing exceptional depth, breadth, and high diversity compared to the reference answer.
+Visual Perception:
+Scores 1-2 when the description of the visual information in the image contains errors or \
+is significantly inconsistent with the content of the image.
+Scores 3-4 When the description of the visual information in the image reflects only a small amount \
+of the image's information and contains some errors.
+Scores 5-6 when the description of the visual information in the image includes the basic information \
+of the image but contains minimal information.
+Scores 7-8 when the description of the visual information in the image matches the image well and is rich in content, \
+providing a substantial amount of information about the image.
+Scores 9-10 when the description of the visual information in the image not only matches the image \
+but also is more detailed and informative compared to the reference answer, providing more information about the image.
+Logical Coherence:
+Scores 1-2 when entirely incoherent, lacking any logic, and not matching the question or known information.
+Scores 3-4 when somewhat coherent but with many logical errors or inconsistencies.
+Scores 5-6 when mostly coherent, with few errors, but may struggle to maintain complete coherence in complex situations.
+Scores 7-8 when excellent logical handling, very few errors.
+Scores 9-10 when flawless logic, impeccable in handling complexity, \
+and significantly higher logical coherence compared to the reference answer.
+Answer Accuracy:
+Scores 1-2 when the answer is significantly inconsistent with the question or contains obvious errors.
+Scores 3-4 when the answer is partially correct but contains some errors or is incomplete.
+Scores 5-6 when the answer is basically correct but lacks details or is not sufficiently detailed.
+Scores 7-8 when the answer is accurate and detailed, fully corresponding to the question.
+Scores 9-10 when the answer is not only accurate and detailed but also provides additional useful information, \
+exceeding expectations.
+Image Relationship Understanding:
+Scores 1-2 when there are significant errors or confusion in distinguishing and describing different images, \
+unable to correctly identify and relate the content of the images.
+Scores 3-4 when the description of different images reflects only minimal distinguishing information, \
+contains some errors and confusion, and fails to clearly differentiate and relate the images.
+Scores 5-6 when the description of different images includes basic distinguishing information, \
+is able to correctly identify and relate the images in a basic manner, \
+but the information provided is minimal and lacks detail.
+Scores 7-8 when the description of different images is accurate and detailed, \
+clearly distinguishing and relating the images, \
+with rich content that points out the main commonalities and differences between the images.
+Scores 9-10 when the description of different images is not only accurate and detailed but also \
+provides richer information and analysis, clearly distinguishing and relating the images, \
+more comprehensively pointing out the commonalities and differences \
+between the images compared to the reference answer.
+Overall Score:
+Scores 1-2 when irrelevant to the question, factually incorrect, or generates harmful content.
+Scores 3-4 when no serious errors, mostly harmless, but of low quality and does not meet requirements.
+Scores 5-6 when basically meeting requirements but performing poorly in some dimensions, with moderate quality.
+Scores 7-8 when performing well in all dimensions.
+Scores 9-10 when fully addressing user questions and all requirements, significantly surpassing the reference answer.
+Please remember, you must evaluate and explain before scoring. After your explanation for each dimension, \
+add the score for that dimension. Finally, at the end of your response, \
+in the format of the dictionary (including brackets), return all your scoring results, \
+ensuring your scores are integers:
+{'Dimension One': Score, 'Dimension Two': Score, ..., 'Overall Score': Score}, \
+for example: {'Creativity': 9, 'Richness': 6, ..., 'Overall Score': 7}.\n
+"""
+# Section markers that mmdu_score places around the question, the reference
+# answer and the model's answer when it assembles a per-turn judge prompt.
+question_begin_prompt = '[Question]'
+reference_begin_prompt = '[The Start of Reference Answer]'
+reference_end_prompt = '[The End of Reference Answer]'
+answers_begin_prompt = '[The Start of Assistant’s Answer]'
+answers_end_prompt = '[The End of Assistant’s Answer]'
+
+
+def mmdu_score(model, line):
+    """Score every turn of one multi-turn MMDU sample with the judge model.
+
+    Args:
+        model: Judge exposing ``generate(prompt)``.
+        line: Record whose ``question``, ``answer`` and ``prediction`` fields are
+            string-encoded sequences with one entry per turn.
+
+    Returns:
+        dict with ``res`` (DataFrame built from one score dictionary per turn;
+        a failed turn contributes None for every dimension) and ``log`` (one
+        status line per turn, joined by newlines).
+    """
+    # NOTE(security): eval() is applied to dataset/prediction text here and to
+    # the judge's reply below. That is unsafe for untrusted input;
+    # ast.literal_eval would be the safer parser if the inputs are plain
+    # literals — confirm before switching.
+    question = eval(line['question'])
+    gt = eval(line['answer'])
+    prediction = eval(line['prediction'])
+
+    DIMS = [
+        'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
+        'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
+    ]
+
+    all_result_dict = []
+    logs = []
+    for j in range(len(question)):
+        try:
+            prompt = meta_prompt + question_begin_prompt + '\n' + question[j] + '\n\n' + \
+                reference_begin_prompt + '\n' + gt[j] + '\n' + reference_end_prompt + '\n\n' + \
+                answers_begin_prompt + '\n' + prediction[j] + '\n' + answers_end_prompt
+            response = model.generate(prompt)
+            # Take the text from the first '{' to the last '}' as the score dictionary.
+            start_index = response.find('{')
+            end_index = response.rfind('}') + 1
+            dictionary_str = response[start_index: end_index]
+            result_dict = eval(dictionary_str)
+            if not isinstance(result_dict, dict):
+                # E.g. "{7}" evaluates to a set; route it to the failure path
+                # instead of letting it break the DataFrame construction below.
+                raise ValueError(f'Judge reply is not a score dictionary: {dictionary_str}')
+            all_result_dict.append(result_dict)
+            missing = [x for x in DIMS if x not in result_dict]
+            if not missing:
+                logs.append('Succeed')
+            else:
+                logs.append(
+                    f'Following Dims are not in results of turn {j}: '
+                    f'{",".join(missing)}'
+                )
+        except Exception as e:
+            # Best effort per turn: report the problem and keep a placeholder row
+            # so the remaining turns are still scored.
+            print(e)
+            all_result_dict.append({d: None for d in DIMS})
+            logs.append(str(e))
+
+    df = pd.DataFrame(all_result_dict)
+    return dict(res=df, log='\n'.join(logs))
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmvet.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmvet.py
@@ -0,0 +1,106 @@
+from ...smp import *
+
+
+def build_mmvet_gpt4_prompt(line):
+    """Build the few-shot grading prompt for one MMVet record.
+
+    The fixed instruction table is followed by a final row made of the record's
+    question, ground truth and prediction, with the score cell left empty for
+    the judge to fill in.
+
+    Args:
+        line: Mapping with ``question``, ``answer`` and ``prediction`` entries.
+
+    Returns:
+        str: The complete prompt text.
+    """
+    question = line['question']
+    answer_text = str(line['answer'])
+    prediction_text = str(line['prediction'])
+    few_shot_table = """
+Compare the ground truth and prediction from AI models, to give a correctness score for the prediction.
+<AND> in the ground truth means it is totally right
+only when all elements in the ground truth are present in the prediction,
+and <OR> means it is totally right when any one element in the ground truth is present in the prediction.
+The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
+Just complete the last space of the correctness score.
+
+Question | Ground truth | Prediction | Correctness
+--- | --- | --- | ---
+What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0
+What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5
+What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5
+What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5
+What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0
+Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
+Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
+while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
+because the names of these countries do not accurately represent their landscapes. |
+The meme talks about Iceland and Greenland. It's pointing out that despite their names,
+Iceland is not very icy and Greenland isn't very green. | 0.4
+Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
+Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
+while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
+because the names of these countries do not accurately represent their landscapes. |
+The meme is using humor to point out the misleading nature of Iceland's and Greenland's names.
+Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow.
+The text 'This is why I have trust issues' is a playful way to suggest
+that these contradictions can lead to distrust or confusion.
+The humor in this meme is derived from the unexpected contrast between the names of the countries
+and their actual physical characteristics. | 1.0
+"""
+    # Pad the logical operators with spaces so they read like the table examples.
+    spaced_answer = answer_text.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> ')
+    # The trailing empty cell leaves the score column open for the judge.
+    final_row = ' | '.join([question, spaced_answer, prediction_text, ''])
+    return few_shot_table + '\n' + final_row
+
+
+def MMVet_auxeval(model, line):
+    """Obtain a correctness score in [0, 1] for one MMVet record from the judge.
+
+    The judge is queried up to five times, raising the temperature by 0.5 on
+    each attempt, until its reply parses as a number between 0 and 1.
+
+    Args:
+        model: Judge object exposing ``generate(prompt, temperature=...)``.
+        line: Record passed on to ``build_mmvet_gpt4_prompt``.
+
+    Returns:
+        dict: ``log`` describes every attempt; ``score`` is the parsed score, or
+        ``0.0`` when all attempts failed.
+    """
+    def float_cvt(s):
+        # TypeError covers non-string replies such as None.
+        try:
+            return float(s)
+        except (TypeError, ValueError):
+            return None
+
+    prompt = build_mmvet_gpt4_prompt(line)
+    log = ''
+    retry = 5
+    for i in range(retry):
+        output = model.generate(prompt, temperature=i * 0.5)
+        score = float_cvt(output)
+        if score is None:
+            log += f'Try {i}: output is {output}, failed to parse.\n'
+        elif not 0 <= score <= 1:
+            # Written as a positive range test so that NaN (which float() accepts
+            # for the reply "nan") is rejected as well.
+            log += f'Try {i}: output is {output}, invalid score: {score}.\n'
+        else:
+            log += 'Succeed'
+            return dict(log=log, score=score)
+    log += f'All {retry} retries failed.\n'
+    return dict(log=log, score=0.0)
+
+
+def MMVet_acc(result_file):
+    """Aggregate MMVet judge scores into two accuracy tables.
+
+    Args:
+        result_file: Path handed to ``load``; the loaded table must provide a
+            ``category`` and a ``score`` column.
+
+    Returns:
+        tuple: ``(res, res2)`` DataFrames with ``Category``, ``tot`` and ``acc``
+        columns. ``res`` has one row per capability plus ``Overall``; ``res2``
+        has one row per distinct category string (commas replaced by
+        underscores) plus ``Overall``. ``acc`` is the mean score times 100, or
+        NaN for a row without samples.
+    """
+    data = load(result_file)
+    # Defined before the loop so it also exists when the table is empty.
+    cate_list = ['rec', 'ocr', 'know', 'gen', 'spat', 'math']
+    cate2_list = []
+    tot = defaultdict(lambda: 0)
+    score = defaultdict(lambda: 0)
+
+    for i in range(len(data)):
+        item = data.iloc[i]
+        cate = item['category']
+        cate2 = cate.replace(',', '_')
+        if cate2 not in cate2_list:
+            cate2_list.append(cate2)
+        grade = float(item['score'])
+        # A record counts towards every capability named in its category string.
+        for capa in cate_list:
+            if capa in cate:
+                tot[capa] += 1
+                score[capa] += grade
+        tot['Overall'] += 1
+        tot[cate2] += 1
+        score['Overall'] += grade
+        score[cate2] += grade
+
+    def _accuracy(key):
+        # A capability without samples has no defined accuracy; report NaN
+        # instead of dividing by zero.
+        return score[key] / tot[key] * 100 if tot[key] else float('nan')
+
+    def _table(names):
+        table = defaultdict(list)
+        for name in names:
+            table['Category'].append(name)
+            table['tot'].append(tot[name])
+            table['acc'].append(_accuracy(name))
+        return pd.DataFrame(table)
+
+    res = _table(cate_list + ['Overall'])
+    res2 = _table(cate2_list + ['Overall'])
+    return res, res2
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/multiple_choice.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/multiple_choice.py
@@ -0,0 +1,442 @@
+import pandas as pd
+from ...utils import can_infer, track_progress_rich
+from ...smp import *
+import numpy as np
+
+# Short column labels for ability names; report_acc looks every ability up here
+# and falls back to the full name when there is no entry.
+MMB_abbrs = {
+    'coarse_perception': 'CP',
+    'finegrained_perception (instance-level)': 'FP-S',
+    'finegrained_perception (cross-instance)': 'FP-C',
+    'logic_reasoning': 'LR',
+    'relation_reasoning': 'RR',
+    'attribute_reasoning': 'AR'
+}
+
+# Short column labels used by report_acc_MMT: every abbreviation is pre-seeded
+# as a result column, and l2-category names are translated through this table
+# (falling back to the full name when there is no entry).
+MMT_abbrs = {
+    'visual_recognition': 'VR',
+    'localization': 'Loc',
+    'ocr': 'OCR',
+    'counting': 'Count',
+    'hallucination': 'HLN',
+    'image_retrieval': 'IR',
+    'threed': '3D',
+    'visual_captioning': 'VC',
+    'visual_grounding': 'VG',
+    'doc_understanding': 'DU',
+    'action_recognition': 'AR',
+    'pixel_level_perception': 'PLP',
+    'image-to-image_translation': 'I2IT',
+    'relation_reasoning': 'RR',
+    'intelligence_quotient_test': 'IQT',
+    'emotion': 'Emo',
+    'visual_illusion': 'VI',
+    'meme_understanding': 'MemU',
+    'visual_prompt_understanding': 'VPU',
+    'anomaly_detection': 'AND',
+    'keypoint_detection': 'KD',
+    'visual_commonsense_reasoning': 'VCR',
+    'image_evaluation_judgement': 'IEJ',
+    'multiple_image_analysis': 'MIA',
+    'cross_image_matching': 'CIM',
+    'temporal_understanding': 'TU',
+    # NOTE(review): 'VP' does not follow the initials of 'visual_code' - confirm it is intended.
+    'visual_code': 'VP',
+    'medical_understanding': 'MedU',
+    'autonomous_driving': 'AUD',
+    'discipline_knowledge_reasoning': 'DKR',
+    'embodied_ai': 'EA',
+    'gui_navigation': 'GN'
+}
+
+
+def MMMU_preproc(data):
+    """Turn MMMU open questions into two-option multiple-choice questions.
+
+    A row whose ``A`` option is missing gets its own answer as option ``A`` and
+    the text ``'Other Answers'`` as option ``B``. The ``A`` and ``B`` columns of
+    ``data`` are overwritten in place and the same frame is returned.
+    """
+    logger = get_logger('Evaluation')
+    col_a, col_b, answers = list(data['A']), list(data['B']), list(data['answer'])
+    cnt = 0
+    for pos, (option_a, answer) in enumerate(zip(col_a, answers)):
+        if not pd.isna(option_a):
+            continue
+        col_a[pos] = answer
+        col_b[pos] = 'Other Answers'
+        cnt += 1
+    logger.info(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones. ')
+    data['A'] = col_a
+    data['B'] = col_b
+    return data
+
+
+def report_acc(df):
+    """Summarise the mean ``hit`` per split, overall and per ability.
+
+    When ``df`` has no ``split`` column, one filled with ``'none'`` is added to
+    it in place. The result has a ``split`` column, an ``Overall`` column and
+    one column per value of ``l2-category`` and ``category`` (when those
+    columns exist), labelled through ``MMB_abbrs`` where an entry exists.
+
+    Returns:
+        pandas.DataFrame: One row per split.
+    """
+    res = defaultdict(list)
+
+    if 'split' not in df:
+        df['split'] = ['none'] * len(df)
+        res['split'] = ['none']
+    else:
+        res['split'] = list(set(df['split']))
+
+    def _per_split_mean(frame):
+        # Mean hit of `frame` restricted to each reported split, in split order.
+        return [np.mean(frame[frame['split'] == sp]['hit']) for sp in res['split']]
+
+    res['Overall'] = _per_split_mean(df)
+    for group in ('l2-category', 'category'):
+        if group not in df:
+            continue
+        for ab in sorted(set(df[group])):
+            ab_name = MMB_abbrs.get(ab, ab)
+            res[ab_name] = _per_split_mean(df[df[group] == ab])
+    return pd.DataFrame(res)
+
+
+def report_acc_MMT(df):
+    """Build the MMT accuracy table: one row per split plus a final 'ALL' row.
+
+    Columns are 'split', 'Overall', one per 'category' value and one per
+    'l2-category' value (labelled through MMT_abbrs where an entry exists).
+    When df has no 'split' column, one filled with 'none' is added in place.
+    """
+    # assert group in [None, 'category', 'l2-category']
+    res = defaultdict(list)
+    # Pre-seed the columns so that 'split', 'Overall' and the MMT abbreviations
+    # come first in the resulting table.
+    # NOTE(review): an abbreviation that is never filled below stays an empty
+    # list, which presumably makes pd.DataFrame(res) fail on unequal lengths -
+    # confirm that every MMT l2-category is always present.
+    res['split'] = list()
+    res['Overall'] = list()
+    for _, name in MMT_abbrs.items():
+        res[name] = list()
+
+    if 'split' in df:
+        splits = list(set(df['split']))
+        res['split'] = splits
+
+    else:
+        df['split'] = ['none'] * len(df)
+        res['split'] = ['none']
+
+    for group in [None, 'category', 'l2-category']:
+        if group is None:
+            # Per-split mean hit, followed by the mean over all rows.
+            res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
+            res['Overall'].extend([np.mean(df['hit'])])
+        elif group not in df:
+            continue
+        elif group == 'category':
+            abilities = list(set(df[group]))
+            abilities.sort()
+            for ab in abilities:
+                ab_name = ab
+                sub_df = df[df[group] == ab]
+                # Per-split mean hit of this category, then its mean over all rows.
+                res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
+                res[ab_name].extend([np.mean(sub_df['hit'])])
+        else:
+            abilities = list(set(df[group]))
+            abilities.sort()
+            for ab in abilities:
+                # An l2-category score is the unweighted average of the
+                # accuracies of the categories that occur under it.
+                sub_task_name_list = df[df['l2-category'] == ab]['category'].unique()
+                sub_task_acc = []
+                for sub_task_name in sub_task_name_list:
+                    # Selected by category name only, not restricted to this l2-category.
+                    sub_df = df[df['category'] == sub_task_name]
+                    sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']])
+
+                # Average the sub-task accuracies position by position (one position per split).
+                new_acc = []
+                for i in range(len(sub_task_acc[0])):
+                    new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc]))
+                ab_name = MMT_abbrs[ab] if ab in MMT_abbrs else ab
+                res[ab_name] = new_acc
+
+                # Same average again over all rows; this becomes the 'ALL' entry.
+                sub_task_acc = []
+                for sub_task_name in sub_task_name_list:
+                    sub_df = df[df['category'] == sub_task_name]
+                    sub_task_acc.append([np.mean(sub_df['hit'])])
+                new_acc = []
+                for i in range(len(sub_task_acc[0])):
+                    new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc]))
+
+                res[ab_name].extend(new_acc)
+
+    # Label for the extra all-rows entry appended to every filled column above.
+    res['split'].append('ALL')
+    return pd.DataFrame(res)
+
+
+def build_prompt(question, options, prediction):
+    """Return the English answer-matching prompt for a single-choice question.
+
+    The prompt holds the task instructions, two worked examples and a third
+    example filled with ``question``, ``options`` and ``prediction`` whose
+    output is left open for the judge.
+    """
+    instructions = (
+        'You are an AI assistant who will help me to match '
+        'an answer with several options of a single-choice question. '
+        'You are provided with a question, several options, and an answer, '
+        'and you need to find which option is most similar to the answer. '
+        'If the meaning of all options are significantly different from the answer, output Z. '
+        'Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
+    )
+    worked_examples = (
+        'Example 1: \n'
+        'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
+        'Answer: a cute teddy bear\nYour output: A\n'
+        'Example 2: \n'
+        'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
+        'Answer: Spider\nYour output: Z\n'
+    )
+    open_example = (
+        'Example 3: \n'
+        'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
+    )
+    template = instructions + worked_examples + open_example
+    return template.format(question, options, prediction)
+
+
+def build_prompt_blink(question, options, prediction):
+    """Return the answer-matching prompt variant used for the BLINK dataset.
+
+    Compared with ``build_prompt`` the instructions also cover refusals, three
+    worked examples are given, and a ``(Z) Failed`` option line is added after
+    ``options`` in the final, open example.
+    """
+    instructions = (
+        'You are an AI assistant who will help me to match an answer with several options of a single-choice question. '
+        'You are provided with a question, several options, and an answer, '
+        'and you need to find which option is most similar to the answer. '
+        "If the answer says things like refuse to answer, I'm sorry cannot help, etc., output Z."
+        'If the meaning of all options are significantly different from the answer, '
+        'or the answer does not select any option, output Z. '
+        'Your should output one of the choices, A, B, C, D (if they are valid options), or Z.\n'
+    )
+    example_1 = (
+        'Example 1: \n'
+        'Question: Which point is closer to the camera?\nSelect from the following choices.\n'
+        'Options: A. Point A\nB. Point B\n(Z) Failed\n'
+        'Answer: Point B, where the child is sitting, is closer to the camera.\nYour output: (B)\n'
+    )
+    example_2 = (
+        'Example 2: \n'
+        'Question: Which point is closer to the camera?\nSelect from the following choices.\n'
+        'Options: (A) Point A\n(B) Point B\n(Z) Failed\n'
+        "Answer: I'm sorry, but I can't assist with that request.\nYour output: (Z)\n"
+    )
+    example_3 = (
+        'Example 3: \n'
+        'Question: Which point is corresponding to the reference point?\nSelect from the following choices.\n'
+        'Options: (A) Point A\n(B) Point B\n(Z) Failed\n'
+        'Answer:The reference point (REF) on the first image is at the tip of the pot, '
+        'which is the part used to Poke if the pots were used for that action. Looking at the second image, '
+        'we need to find the part of the object that would correspond to poking.\n'
+        "(A) Point A is at the tip of the spoon's handle, which is not used for poking.\n"
+        '(B) Point B is at the bottom of the spoon, which is not used for poking.\n'
+        '(C) Point C is on the side of the pspoonot, which is not used for poking.\n'
+        '(D) Point D is at the tip of the spoon, which is not used for poking.\n'
+        '\nTherefore, there is no correct answer in the choices\nYour output: (Z)\n'
+    )
+    open_example = (
+        'Example 4: \n'
+        'Question: {}?\nOptions: {}\n(Z) Failed\nAnswer: {}\nYour output: '
+    )
+    template = ''.join([instructions, example_1, example_2, example_3, open_example])
+    return template.format(question, options, prediction)
+
+
+def build_prompt_cn(question, options, prediction):
+    """Return the Chinese answer-matching prompt for a single-choice question.
+
+    Mirrors ``build_prompt``: instructions, two worked examples and a third
+    example filled with ``question``, ``options`` and ``prediction`` whose
+    output is left open for the judge.
+    """
+    instructions = (
+        '你是一个帮助我匹配答案与单选题中多个选项的 AI 助手。'
+        '你会被提供：一个问题，多个选项，一个答案。你的任务是找到与答案意义最相近的选项。'
+        '如果所有选项的意义都与答案显著不同，则输出 Z。'
+        '你应该输出一个单个的大写字母，例如 A, B, C, D（如果它们是有效选项），或 Z。'
+    )
+    worked_examples = (
+        '例 1:'
+        '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 一只可爱的泰迪熊\n输出: A\n'
+        '例 2: \n'
+        '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 蜘蛛\n输出: Z\n'
+    )
+    open_example = (
+        '例 3: \n'
+        '问题: {}?\n选项: {}\n答案: {}\n输出: '
+    )
+    template = instructions + worked_examples + open_example
+    return template.format(question, options, prediction)
+
+
+def build_choices(item):
+    """Collect the option texts of a record, keyed by their uppercase letter.
+
+    Only letters that are present in ``item`` with a non-missing value are
+    kept; the result follows alphabetical letter order.
+    """
+    return {
+        letter: item[letter]
+        for letter in string.ascii_uppercase
+        if letter in item and not pd.isna(item[letter])
+    }
+
+
+def prefetch_answer(item):
+    """Try to infer the chosen option from the raw prediction without a judge.
+
+    Returns whatever ``can_infer`` returns for the record's prediction and its
+    available choices.
+    """
+    candidates = build_choices(item)
+    raw_prediction = item['prediction']
+    return can_infer(raw_prediction, candidates)
+
+
+def extract_answer_from_item(model, item, dataset_name=None):
+    """Map a free-form prediction onto one option letter.
+
+    The rule-based ``can_infer`` is tried first. If it fails and a judge model
+    is available, the judge is asked up to three times; if that fails as well, a
+    random option is returned.
+
+    Args:
+        model: Judge exposing ``generate(prompt)``, or ``None`` to disable
+            judge-based matching.
+        item: Record with ``question``, ``prediction`` and the option columns.
+        dataset_name: ``'BLINK'`` selects the BLINK prompt variant.
+
+    Returns:
+        dict: ``opt`` is the selected option letter (``'Z'`` when no judge is
+        available), ``log`` is the text the decision was based on.
+    """
+    logger = get_logger('Evaluation')
+    choices = build_choices(item)
+
+    # Cheap rule-based matching first; the judge prompt is only built when needed.
+    ret = can_infer(item['prediction'], choices)
+    if ret:
+        return dict(opt=ret, log=item['prediction'])
+    if model is None:
+        return dict(opt='Z', log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
+
+    option_str = build_option_str(choices)
+    if dataset_name == 'BLINK':
+        prompt = build_prompt_blink(item['question'], option_str, item['prediction'])
+    elif cn_string(item['question']):
+        prompt = build_prompt_cn(item['question'], option_str, item['prediction'])
+    else:
+        prompt = build_prompt(item['question'], option_str, item['prediction'])
+
+    retry = 3
+    for _ in range(retry):
+        ans = model.generate(prompt)
+        if 'Failed to obtain answer via API' in ans:
+            logger.warning('GPT API failed to answer. ')
+            continue
+        ret = can_infer(ans, choices)
+        if ret:
+            return dict(opt=ret, log=ans)
+        logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
+
+    # All attempts failed: fall back to a random option. The parentheses matter:
+    # without them the conditional applies to the whole sum and yields an empty
+    # list (and a crash in rd.choice) whenever 'Z' is itself a choice.
+    options = list(choices) + ([] if 'Z' in choices else ['Z'])
+    return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ')
+
+
+# For Circular Evaluation
+def prefetch_circular_group(sub_data, verbose=False):
+    """Rule-based evaluation of one circular group (all rotations of a question).
+
+    Args:
+        sub_data: DataFrame with the rows of one group; each row needs ``GT``,
+            ``prediction`` and the option columns.
+        verbose: When true, also return the ground-truth and prediction lists.
+
+    Returns:
+        ``dict(hit=0, log=...)`` as soon as one rotation is inferred to a wrong
+        option, ``dict(hit=1, log=...)`` when every rotation is inferred
+        correctly, and ``None`` when at least one rotation could not be inferred.
+        With ``verbose=True`` the return value is always the tuple
+        ``(result, GT, PRED)``; after an early mismatch the two lists only cover
+        the rotations inspected so far.
+    """
+    GT, PRED = [], []
+    for i in range(len(sub_data)):
+        item = sub_data.iloc[i]
+        GT.append(item['GT'])
+        PRED.append(prefetch_answer(item))
+        if PRED[-1] and (GT[-1] != PRED[-1]):
+            log = (
+                f'Failed in Prefetching Rolling {i}: Answer is {GT[-1]}, '
+                f"Prediction is {item['prediction']}, Pre-fetched is {PRED[-1]}. "
+            )
+            res = dict(hit=0, log=log)
+            # Keep the verbose return shape so callers can always unpack three values.
+            return (res, GT, PRED) if verbose else res
+
+    all_matched = all(g == p for g, p in zip(GT, PRED))
+    res = dict(hit=1, log='Succeed During Pre-fetching') if all_matched else None
+    return (res, GT, PRED) if verbose else res
+
+
+def eval_vanilla(model, item, dataset_name=None):
+    """Score one record: hit is 1 when the extracted option equals ``item['GT']``.
+
+    Returns:
+        dict: ``hit`` (0 or 1) and ``log`` with the matching log of the extraction.
+    """
+    extracted = extract_answer_from_item(model, item, dataset_name=dataset_name)
+    chosen, match_log = extracted['opt'], extracted['log']
+    hit = 1 if chosen == item['GT'] else 0
+    return dict(hit=hit, log=f'Match Log: {match_log}. ')
+
+
+# For Circular Evaluation
+def eval_circular_group(model, sub_data, dataset_name=None):
+    """Score one circular group; it is a hit only if every rotation is correct.
+
+    The rule-based prefetch decides first. Rotations it could not resolve are
+    then passed to ``extract_answer_from_item`` one by one, and evaluation stops
+    at the first wrong rotation.
+
+    Returns:
+        dict: ``hit`` (0 or 1) and ``log`` describing the rotations inspected.
+    """
+    prefetched, GT, PRED = prefetch_circular_group(sub_data, verbose=True)
+    if prefetched is not None:
+        return prefetched
+
+    log = ''
+    for i in range(len(sub_data)):
+        if PRED[i]:
+            # Already resolved (and correct) during the prefetch.
+            log += f'Rolling {i} Matched.\n'
+            continue
+
+        row = sub_data.iloc[i]
+        extracted = extract_answer_from_item(model, row, dataset_name=dataset_name)
+        PRED[i], match_log = extracted['opt'], extracted['log']
+        if PRED[i] == GT[i]:
+            log += (
+                f"Rolling {i}: Answer is {GT[i]}, Prediction is {row['prediction']}, "
+                f'Pre-fetched is {PRED[i]}.\n'
+            )
+            continue
+
+        log += (
+            f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {row['prediction']}; "
+            f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n'
+        )
+        return dict(hit=0, log=log)
+
+    return dict(hit=1, log=log)
+
+
+# data, meta are pd.DataFrame, result_file is a path
+def mcq_vanilla_eval(model, data, meta, nproc, result_file, dataset_name=None):
+    """Evaluate multiple-choice predictions record by record.
+
+    Args:
+        model: Judge model used for answer matching, or ``None``.
+        data: DataFrame with the predictions; needs an ``index`` column.
+        meta: DataFrame with the ``index`` and ``answer`` columns of the ground truth.
+        nproc: Worker count handed to ``track_progress_rich`` (also used as chunk size).
+        result_file: Path of the cache of per-index results; records already
+            present in it are not evaluated again.
+        dataset_name: Optional dataset name; names containing ``'MMMU'`` trigger
+            the MMMU pre-processing.
+
+    Returns:
+        pandas.DataFrame: The rows of ``data`` that have a ground truth, with
+        ``hit`` and ``log`` columns added.
+    """
+    result = {}
+    if osp.exists(result_file):
+        result = load(result_file)
+    answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
+
+    # dataset_name defaults to None, so guard the substring test.
+    if dataset_name is not None and 'MMMU' in dataset_name:
+        data = MMMU_preproc(data)
+        # Answers that are not a single option letter belong to re-formulated open
+        # questions, whose correct option is 'A' after MMMU_preproc.
+        answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()}
+
+    # Work on a copy of the filtered rows so the column assignments below do not
+    # target a slice of the caller's frame.
+    data = data[data['index'].isin(answer_map)].copy()
+    data['GT'] = [answer_map[idx] for idx in data['index']]
+
+    # Only records without a cached result need to be evaluated.
+    items = []
+    for i in range(len(data)):
+        item = data.iloc[i]
+        if item['index'] not in result:
+            items.append(item)
+
+    tups = [dict(model=model, item=x, dataset_name=dataset_name) for x in items]
+    keys = [x['index'] for x in items]
+    if len(tups):
+        res = track_progress_rich(eval_vanilla, tups, nproc=nproc, chunksize=nproc, save=result_file, keys=keys)
+        result = load(result_file)
+        for k, v in zip(keys, res):
+            if k in result:
+                assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
+            else:
+                result[k] = v
+    data['hit'] = [result[i]['hit'] for i in data['index']]
+    data['log'] = [result[i]['log'] for i in data['index']]
+    if 'GT' in data:
+        data.pop('GT')
+    return data
+
+
+# data, meta are pd.DataFrame, result_file is a path
+def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
+    """Evaluate multiple-choice predictions with circular (rotated-option) groups.
+
+    Rows whose index is below 1e6 are the main questions; a row belongs to the
+    group of the main question given by ``index % 1e6``. A group is a hit only
+    if all of its rows are answered correctly.
+
+    Args:
+        model: Judge model used for answer matching, or ``None`` for exact
+            matching only (unresolved groups then count as misses).
+        data: DataFrame with the predictions; needs an integer ``index`` column.
+        meta: DataFrame with the ``index`` and ``answer`` columns of the ground truth.
+        nproc: Worker count handed to ``track_progress_rich`` (also used as chunk size).
+        result_file: Path of the cache of per-group results; groups already
+            present in it are not evaluated again.
+        dataset_name: Optional dataset name forwarded to the group evaluation.
+
+    Returns:
+        pandas.DataFrame: The main-question rows with ``hit`` and ``log`` columns.
+    """
+    import tempfile
+
+    group_span = int(1e6)
+
+    result = {}
+    if osp.exists(result_file):
+        result = load(result_file)
+    # Build Answer Map
+    answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
+
+    for idx in list(meta['index']) + list(data['index']):
+        assert istype(idx, int)
+
+    # Only keep those lines in the meta data. Work on a copy so the column
+    # assignment below does not target a slice of the caller's frame.
+    data = data[data['index'].isin(answer_map)].copy()
+    data['GT'] = [answer_map[idx] for idx in data['index']]
+    data_main = data[data['index'] < group_span]
+
+    # Collect the groups that have no cached result yet.
+    data_groups = []
+    for i in range(len(data_main)):
+        idx = data_main.iloc[i]['index']
+        if idx not in result:
+            sub_data = data[data['index'] % group_span == idx]
+            data_groups.append(sub_data)
+
+    if len(data_groups):
+        # Resolve as many groups as possible with rule-based matching first.
+        prefetched = [prefetch_circular_group(g, verbose=False) for g in data_groups]
+        remain = []
+        for dg, pf in zip(data_groups, prefetched):
+            if pf is not None:
+                # Integer modulus keeps the key an integer like the index itself
+                # (the float form `% 1e6` produced float keys).
+                result[dg.iloc[0]['index'] % group_span] = pf
+            else:
+                remain.append(dg)
+        dump(result, result_file)
+
+        tups = [dict(model=model, sub_data=x, dataset_name=dataset_name) for x in remain]
+        keys = [x.iloc[0]['index'] % group_span for x in remain]
+
+        if len(tups) == 0:
+            pass
+        elif model is None:
+            logger = get_logger('Evaluation')
+            logger.warning('Exact Matching mode, will not do GPT-based answer matching. ')
+            for k in keys:
+                result[k] = dict(
+                    hit=0, log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
+        else:
+            res = track_progress_rich(
+                eval_circular_group,
+                tups,
+                nproc=nproc,
+                chunksize=nproc,
+                save=result_file,
+                keys=keys)
+            result = load(result_file)
+            for k, v in zip(keys, res):
+                if k in result:
+                    assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
+                else:
+                    result[k] = v
+
+    # Round-trip the main rows through an xlsx file (presumably to obtain a fresh
+    # frame with normalised values - TODO confirm). A private temporary directory
+    # replaces the hard-coded /tmp path: it is portable, cannot collide with a
+    # concurrent run and is removed afterwards.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_pth = osp.join(tmp_dir, f'{timestr()}.xlsx')
+        dump(data_main, tmp_pth)
+        data_main = load(tmp_pth)
+    indices = data_main['index']
+    data_main['hit'] = [result[i]['hit'] for i in indices]
+    data_main['log'] = [result[i]['log'] for i in indices]
+    if 'GT' in data_main:
+        data_main.pop('GT')
+
+    return data_main
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mvbench.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mvbench.py
@@ -0,0 +1,450 @@
+from ...smp import *
+from PIL import Image, ImageOps
+import torchvision
+import random
+import numbers
+import math
+import torch
+
+
+def get_dimension_rating(data_path):
+    """Tally correct answers per task type for the results loaded from ``data_path``.
+
+    Returns:
+        dict: Maps every ``task_type`` value, plus the key ``'overall'``, to
+        ``[correct, total, 'xx.xx%']``.
+    """
+    data = load(data_path)
+    result_board = {}
+    for _, row in data.iterrows():
+        tally = result_board.setdefault(row['task_type'], [0, 0])
+        tally[1] += 1
+        if row['score']:
+            tally[0] += 1
+
+    correct = total = 0
+    for tally in result_board.values():
+        correct += tally[0]
+        total += tally[1]
+        tally.append(f'{tally[0] / tally[1] * 100:.2f}%')
+
+    result_board['overall'] = [correct, total, f'{correct / total * 100:.2f}%']
+    return result_board
+
+
+def check_ans(pred, gt):
+    """Return True when the option label of ``pred`` matches that of ``gt``.
+
+    Only the first space-separated token of each string (lower-cased) is
+    compared: the prediction label with its dots removed must be contained in
+    the ground-truth label, or the ground-truth label must be contained in the
+    raw prediction label. An empty label on either side never matches.
+
+    Args:
+        pred: Predicted answer text, e.g. ``'(A) a dog.'`` or ``'A.'``.
+        gt: Ground-truth answer text, e.g. ``'(A) a dog.'``.
+    """
+    pred_option = pred.lower().split(' ')[0]
+    gt_option = gt.lower().split(' ')[0]
+    pred_label = pred_option.replace('.', '')
+
+    # An empty string is a substring of everything, so an empty prediction (or
+    # one made of dots only) would otherwise be counted as correct.
+    if not pred_label or not gt_option:
+        return False
+
+    return pred_label in gt_option or gt_option in pred_option
+
+
+class GroupRandomCrop(object):
+    """Crop every image of a group at the same randomly chosen position.
+
+    ``size`` is either a number (square crop) or a ``(height, width)`` pair.
+    """
+
+    def __init__(self, size):
+        is_scalar = isinstance(size, numbers.Number)
+        self.size = (int(size), int(size)) if is_scalar else size
+
+    def __call__(self, img_group):
+        width, height = img_group[0].size
+        crop_h, crop_w = self.size
+
+        # One offset is drawn per call and shared by the whole group.
+        left = random.randint(0, width - crop_w)
+        top = random.randint(0, height - crop_h)
+        box = (left, top, left + crop_w, top + crop_h)
+        already_sized = (width == crop_w and height == crop_h)
+
+        cropped = []
+        for img in img_group:
+            assert (img.size[0] == width and img.size[1] == height)
+            # Images that already have the target size are passed through as is.
+            cropped.append(img if already_sized else img.crop(box))
+        return cropped
+
+
+class MultiGroupRandomCrop(object):
+    """Produce ``groups`` randomly positioned crops of an image group.
+
+    For every repetition a new offset is drawn and applied to all images, so the
+    output holds ``groups * len(img_group)`` images. ``size`` is either a number
+    (square crop) or a ``(height, width)`` pair.
+    """
+
+    def __init__(self, size, groups=1):
+        is_scalar = isinstance(size, numbers.Number)
+        self.size = (int(size), int(size)) if is_scalar else size
+        self.groups = groups
+
+    def __call__(self, img_group):
+        width, height = img_group[0].size
+        crop_h, crop_w = self.size
+        already_sized = (width == crop_w and height == crop_h)
+
+        cropped = []
+        for _ in range(self.groups):
+            # A fresh offset per repetition, shared by every image of the group.
+            left = random.randint(0, width - crop_w)
+            top = random.randint(0, height - crop_h)
+            box = (left, top, left + crop_w, top + crop_h)
+
+            for img in img_group:
+                assert (img.size[0] == width and img.size[1] == height)
+                cropped.append(img if already_sized else img.crop(box))
+
+        return cropped
+
+
+class GroupCenterCrop(object):
+    """Apply one shared torchvision ``CenterCrop(size)`` to every image of a group."""
+
+    def __init__(self, size):
+        self.worker = torchvision.transforms.CenterCrop(size)
+
+    def __call__(self, img_group):
+        return list(map(self.worker, img_group))
+
+
+class GroupRandomHorizontalFlip(object):
+    """Flip the whole image group horizontally with a probability of 0.5.
+
+    When constructed with ``is_flow=True``, every second image (starting with
+    the first) is additionally inverted after flipping. The ``is_flow``
+    argument of ``__call__`` is not read; only the constructor flag is used.
+    """
+
+    def __init__(self, is_flow=False):
+        self.is_flow = is_flow
+
+    def __call__(self, img_group, is_flow=False):
+        if random.random() >= 0.5:
+            # No flip: the input group is handed back unchanged.
+            return img_group
+
+        flipped = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
+        if self.is_flow:
+            # invert flow pixel values when flipping
+            for i in range(0, len(flipped), 2):
+                flipped[i] = ImageOps.invert(flipped[i])
+        return flipped
+
+
+class GroupNormalize(object):
+    """Normalise a tensor in place along its first dimension.
+
+    ``mean`` and ``std`` are lists that are repeated to cover the first
+    dimension; slice ``k`` becomes ``(slice - mean[k]) / std[k]``.
+    """
+
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, tensor):
+        n_slices = tensor.size()[0]
+        rep_mean = self.mean * (n_slices // len(self.mean))
+        rep_std = self.std * (n_slices // len(self.std))
+
+        # TODO: make efficient
+        for channel, (m, s) in zip(tensor, zip(rep_mean, rep_std)):
+            channel.sub_(m)
+            channel.div_(s)
+
+        return tensor
+
+
+class GroupScale(object):
+    """Resize every image of a group with one shared torchvision ``Resize``.
+
+    size: forwarded to ``torchvision.transforms.Resize``; as noted by the
+        original authors it is the size of the smaller edge, e.g. if
+        height > width the image is rescaled to (size * height / width, size)
+    interpolation: Default: PIL.Image.BILINEAR
+    """
+
+    def __init__(self, size, interpolation=Image.BILINEAR):
+        self.worker = torchvision.transforms.Resize(size, interpolation)
+
+    def __call__(self, img_group):
+        return list(map(self.worker, img_group))
+
+
+class GroupOverSample(object):
+    """Crop an image group at the five fixed offsets, optionally adding flips.
+
+    The offsets come from ``GroupMultiScaleCrop.fill_fix_offset`` (four corners
+    and the centre). For every offset the crops of all images are emitted,
+    followed by their horizontally flipped versions when ``flip`` is true.
+    ``crop_size`` is an int (square) or a ``(width, height)`` pair; when
+    ``scale_size`` is given the group is first resized with ``GroupScale``.
+    """
+
+    def __init__(self, crop_size, scale_size=None, flip=True):
+        if isinstance(crop_size, int):
+            crop_size = (crop_size, crop_size)
+        self.crop_size = crop_size
+        self.scale_worker = GroupScale(scale_size) if scale_size is not None else None
+        self.flip = flip
+
+    def __call__(self, img_group):
+        if self.scale_worker is not None:
+            img_group = self.scale_worker(img_group)
+
+        image_w, image_h = img_group[0].size
+        crop_w, crop_h = self.crop_size
+        offsets = GroupMultiScaleCrop.fill_fix_offset(
+            False, image_w, image_h, crop_w, crop_h)
+
+        oversample_group = []
+        for o_w, o_h in offsets:
+            box = (o_w, o_h, o_w + crop_w, o_h + crop_h)
+            normal_group, flip_group = [], []
+            for i, img in enumerate(img_group):
+                crop = img.crop(box)
+                normal_group.append(crop)
+
+                mirrored = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
+                # Single-channel images at even positions are inverted after flipping.
+                if img.mode == 'L' and i % 2 == 0:
+                    mirrored = ImageOps.invert(mirrored)
+                flip_group.append(mirrored)
+
+            oversample_group += normal_group
+            if self.flip:
+                oversample_group += flip_group
+        return oversample_group
+
+
+class GroupFullResSample(object):
+    """Crop an image group at three offsets on the vertical mid-line.
+
+    The offsets are left, right and centre, in that order. For every offset the
+    crops of all images are emitted, followed by their horizontally flipped
+    versions when ``flip`` is true. ``crop_size`` is an int (square) or a
+    ``(width, height)`` pair; when ``scale_size`` is given the group is first
+    resized with ``GroupScale``.
+    """
+
+    def __init__(self, crop_size, scale_size=None, flip=True):
+        if isinstance(crop_size, int):
+            crop_size = (crop_size, crop_size)
+        self.crop_size = crop_size
+        self.scale_worker = GroupScale(scale_size) if scale_size is not None else None
+        self.flip = flip
+
+    def __call__(self, img_group):
+        if self.scale_worker is not None:
+            img_group = self.scale_worker(img_group)
+
+        image_w, image_h = img_group[0].size
+        crop_w, crop_h = self.crop_size
+
+        w_step = (image_w - crop_w) // 4
+        h_step = (image_h - crop_h) // 4
+        mid_row = 2 * h_step
+        offsets = [
+            (0 * w_step, mid_row),  # left
+            (4 * w_step, mid_row),  # right
+            (2 * w_step, mid_row),  # center
+        ]
+
+        oversample_group = []
+        for o_w, o_h in offsets:
+            box = (o_w, o_h, o_w + crop_w, o_h + crop_h)
+            normal_group, flip_group = [], []
+            for i, img in enumerate(img_group):
+                crop = img.crop(box)
+                normal_group.append(crop)
+                if not self.flip:
+                    continue
+
+                mirrored = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
+                # Single-channel images at even positions are inverted after flipping.
+                if img.mode == 'L' and i % 2 == 0:
+                    mirrored = ImageOps.invert(mirrored)
+                flip_group.append(mirrored)
+
+            oversample_group += normal_group
+            oversample_group += flip_group
+        return oversample_group
+
+
+class GroupMultiScaleCrop(object):
+    """Crop an image group at a random scale and position, then resize it.
+
+    A crop width and height are picked from ``scales`` times the shorter image
+    side (the two may differ by at most ``max_distort`` scale steps). The crop is
+    placed either at a random fixed offset (``fix_crop``) or uniformly at random,
+    and every crop is resized to ``input_size`` (an int or ``[width, height]``).
+    """
+
+    def __init__(self, input_size, scales=None, max_distort=1,
+                 fix_crop=True, more_fix_crop=True):
+        if scales is None:
+            scales = [1, .875, .75, .66]
+        self.scales = scales
+        self.max_distort = max_distort
+        self.fix_crop = fix_crop
+        self.more_fix_crop = more_fix_crop
+        if isinstance(input_size, int):
+            input_size = [input_size, input_size]
+        self.input_size = input_size
+        self.interpolation = Image.BILINEAR
+
+    def __call__(self, img_group):
+        crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(img_group[0].size)
+        box = (offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)
+        target = (self.input_size[0], self.input_size[1])
+
+        crop_img_group = [img.crop(box) for img in img_group]
+        return [img.resize(target, self.interpolation) for img in crop_img_group]
+
+    def _sample_crop_size(self, im_size):
+        """Draw ``(crop_w, crop_h, w_offset, h_offset)`` for an image of ``im_size``."""
+        image_w, image_h = im_size[0], im_size[1]
+        target_w, target_h = self.input_size[0], self.input_size[1]
+
+        # find a crop size
+        base_size = min(image_w, image_h)
+        crop_sizes = [int(base_size * x) for x in self.scales]
+        # Candidates within 3 pixels of the target size snap to the target.
+        crop_h = [target_h if abs(x - target_h) < 3 else x for x in crop_sizes]
+        crop_w = [target_w if abs(x - target_w) < 3 else x for x in crop_sizes]
+
+        pairs = [
+            (w, h)
+            for i, h in enumerate(crop_h)
+            for j, w in enumerate(crop_w)
+            if abs(i - j) <= self.max_distort
+        ]
+
+        chosen_w, chosen_h = random.choice(pairs)
+        if self.fix_crop:
+            w_offset, h_offset = self._sample_fix_offset(
+                image_w, image_h, chosen_w, chosen_h)
+        else:
+            w_offset = random.randint(0, image_w - chosen_w)
+            h_offset = random.randint(0, image_h - chosen_h)
+
+        return chosen_w, chosen_h, w_offset, h_offset
+
+    def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
+        """Pick one of the fixed offsets at random."""
+        candidates = self.fill_fix_offset(
+            self.more_fix_crop, image_w, image_h, crop_w, crop_h)
+        return random.choice(candidates)
+
+    @staticmethod
+    def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
+        """List the fixed ``(x, y)`` crop offsets on a 5 x 5 grid of positions.
+
+        Returns the four corners and the centre; with ``more_fix_crop`` the four
+        edge centres and the four quarter points are appended.
+        """
+        w_step = (image_w - crop_w) // 4
+        h_step = (image_h - crop_h) // 4
+
+        ret = [
+            (0, 0),  # upper left
+            (4 * w_step, 0),  # upper right
+            (0, 4 * h_step),  # lower left
+            (4 * w_step, 4 * h_step),  # lower right
+            (2 * w_step, 2 * h_step),  # center
+        ]
+
+        if more_fix_crop:
+            ret += [
+                (0, 2 * h_step),  # center left
+                (4 * w_step, 2 * h_step),  # center right
+                (2 * w_step, 4 * h_step),  # lower center
+                (2 * w_step, 0 * h_step),  # upper center
+                (1 * w_step, 1 * h_step),  # upper left quarter
+                (3 * w_step, 1 * h_step),  # upper right quarter
+                (1 * w_step, 3 * h_step),  # lower left quarter
+                (3 * w_step, 3 * h_step),  # lower right quarter
+            ]
+
+        return ret
+
+
+class GroupRandomSizedCrop(object):
+    """Crop a random window from every image of a group and resize it.
+
+    The window covers 8% to 100% of the first image's area with an aspect
+    ratio between 3/4 and 4/3, and is resized to ``size`` x ``size``. If no
+    window fits after 10 attempts, the group is instead passed through
+    ``GroupScale`` followed by ``GroupRandomCrop`` (both defined elsewhere).
+
+    size: side length of the square output
+    interpolation: Default: PIL.Image.BILINEAR
+    """
+
+    def __init__(self, size, interpolation=Image.BILINEAR):
+        self.size = size
+        self.interpolation = interpolation
+
+    def __call__(self, img_group):
+        img_w, img_h = img_group[0].size
+        for _ in range(10):
+            target_area = random.uniform(0.08, 1.0) * (img_w * img_h)
+            aspect_ratio = random.uniform(3. / 4, 4. / 3)
+
+            w = int(round(math.sqrt(target_area * aspect_ratio)))
+            h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+            # Swap orientation half of the time.
+            if random.random() < 0.5:
+                w, h = h, w
+
+            if w <= img_w and h <= img_h:
+                x1 = random.randint(0, img_w - w)
+                y1 = random.randint(0, img_h - h)
+                break
+        else:
+            # Fallback: no attempt produced a window that fits inside the image.
+            scale = GroupScale(self.size, interpolation=self.interpolation)
+            crop = GroupRandomCrop(self.size)
+            return crop(scale(img_group))
+
+        box = (x1, y1, x1 + w, y1 + h)
+        out_group = []
+        for img in img_group:
+            img = img.crop(box)
+            assert (img.size == (w, h))
+            out_group.append(
+                img.resize((self.size, self.size), self.interpolation))
+        return out_group
+
+
+class ConvertDataFormat(object):
+    """Rearrange a stacked-frame tensor for non-2D models.
+
+    For ``model_type == '2D'`` the input is returned untouched. Otherwise a
+    tensor of shape (T*3, H, W) is viewed as (T, 3, H, W) and permuted to
+    (3, T, H, W).
+    """
+
+    def __init__(self, model_type):
+        self.model_type = model_type
+
+    def __call__(self, images):
+        if self.model_type == '2D':
+            return images
+        stacked_channels, height, width = images.shape
+        num_frames = stacked_channels // 3
+        # Split the stacked axis into (frame, channel), then put channels first.
+        return images.view(num_frames, 3, height, width).permute(1, 0, 2, 3)
+
+
+class Stack(object):
+    """Concatenate a group of images along axis 2 (the channel axis).
+
+    The mode of the first image decides the handling: 'L' images get a
+    trailing axis added before concatenation; 'RGB' images are concatenated
+    as they are, or with their channel order reversed when ``roll`` is True.
+    Any other mode yields ``None``.
+    """
+
+    def __init__(self, roll=False):
+        self.roll = roll
+
+    def __call__(self, img_group):
+        mode = img_group[0].mode
+        if mode == 'L':
+            planes = [np.expand_dims(img, 2) for img in img_group]
+        elif mode == 'RGB' and self.roll:
+            # Reverse the last axis of every frame (RGB -> BGR).
+            planes = [np.array(img)[:, :, ::-1] for img in img_group]
+        elif mode == 'RGB':
+            planes = img_group
+        else:
+            return None
+        return np.concatenate(planes, axis=2)
+
+
+class ToTorchFormatTensor(object):
+    """Convert a PIL.Image or an H x W x C numpy.ndarray to a C x H x W float tensor.
+
+    Values are divided by 255 when ``div`` is True (the default); otherwise
+    they are only cast to float.
+    """
+
+    def __init__(self, div=True):
+        self.div = div
+
+    def __call__(self, pic):
+        if isinstance(pic, np.ndarray):
+            # numpy input is already laid out as H x W x C
+            tensor = torch.from_numpy(pic).permute(2, 0, 1).contiguous()
+        else:
+            # PIL input: read the raw bytes; one channel per character of the mode
+            raw = torch.ByteTensor(
+                torch.ByteStorage.from_buffer(pic.tobytes()))
+            height, width, channels = pic.size[1], pic.size[0], len(pic.mode)
+            # HWC -> CHW
+            tensor = raw.view(height, width, channels).permute(2, 0, 1).contiguous()
+        tensor = tensor.float()
+        return tensor.div(255) if self.div else tensor
+
+
+class IdentityTransform(object):
+    """No-op transform: calling the instance returns its argument unchanged."""
+
+    def __call__(self, data):
+        return data
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/ocrbench.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ocrbench.py
@@ -0,0 +1,65 @@
+from ...smp import *
+
+
+def OCRBench_eval(eval_file):
+    """Score an OCRBench prediction file and dump the per-task totals as JSON.
+
+    Every row of the table loaded from ``eval_file`` must provide
+    ``prediction``, ``answer`` (the text of a Python literal holding the
+    accepted answers) and ``category``. A row scores one point when any
+    normalized answer is a substring of the normalized prediction.
+
+    The totals are written next to the input, with ``.xlsx`` replaced by
+    ``_score.json`` in the path, and are also logged.
+    """
+    math_category = 'Handwritten Mathematical Expression Recognition'
+    recognition_categories = [
+        'Regular Text Recognition',
+        'Irregular Text Recognition',
+        'Artistic Text Recognition',
+        'Handwriting Recognition',
+        'Digit String Recognition',
+        'Non-Semantic Text Recognition',
+    ]
+    other_categories = [
+        'Scene Text-centric VQA',
+        'Doc-oriented VQA',
+        'Key Information Extraction',
+        math_category,
+    ]
+    OCRBench_score = {name: 0 for name in recognition_categories + other_categories}
+
+    def _normalize(text, category):
+        # Math expressions are compared case-sensitively with all spaces
+        # removed; everything else is compared lower-cased.
+        if category == math_category:
+            return text.strip().replace('\n', ' ').replace(' ', '')
+        return text.lower().strip().replace('\n', ' ')
+
+    logger = get_logger('Evaluation')
+
+    data = load(eval_file)
+    lines = [data.iloc[i] for i in range(len(data))]
+    for line in tqdm(lines):
+        predict = str(line['prediction'])
+        # NOTE(review): eval() executes whatever text is stored in the answer
+        # column; only run this on trusted benchmark files.
+        answers = eval(line['answer'])
+        category = line['category']
+        predict = _normalize(predict, category)
+        if any(_normalize(answer, category) in predict for answer in answers):
+            OCRBench_score[category] += 1
+
+    # The six recognition tasks are reported as one combined entry.
+    final_score_dict = {
+        'Text Recognition': sum(OCRBench_score[name] for name in recognition_categories),
+    }
+    for name in other_categories:
+        final_score_dict[name] = OCRBench_score[name]
+    total = sum(final_score_dict.values())
+    final_score_dict['Final Score'] = total
+    final_score_dict['Final Score Norm'] = float(total) / 10
+
+    score_pth = eval_file.replace('.xlsx', '_score.json')
+    dump(final_score_dict, score_pth)
+    logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
+    logger.info('Score: ')
+    for key, value in final_score_dict.items():
+        logger.info('{}:{}'.format(key, value))
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/videomme.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/videomme.py
@@ -0,0 +1,140 @@
+from ...smp import *
+import numpy as np
+import re
+
+# Message text for a failed API answer (not referenced elsewhere in this module).
+FAIL_MSG = 'Failed to obtain answer via API.'
+
+# Duration buckets; get_dimension_rating uses them (plus 'overall') as the
+# top-level keys of its result and expects them in the 'duration' column.
+DURATIONS = [
+    'short',
+    'medium',
+    'long',
+]
+
+# Values expected in the 'domain' column of the scored table.
+DOMAINS = [
+    'Knowledge',
+    'Film & Television',
+    'Sports Competition',
+    'Artistic Performance',
+    'Life Record',
+    'Multilingual'
+]
+
+# Values expected in the 'sub_category' column of the scored table.
+SUB_CATEGORIES = [
+    'Humanity & History',
+    'Literature & Art',
+    'Biology & Medicine',
+    'Finance & Commerce',
+    'Astronomy',
+    'Geography',
+    'Law',
+    'Life Tip',
+    'Technology',
+    'Animation',
+    'Movie & TV Show',
+    'Documentary',
+    'News Report',
+    'Esports',
+    'Basketball',
+    'Football',
+    'Athletics',
+    'Other Sports',
+    'Stage Play',
+    'Magic Show',
+    'Variety Show',
+    'Acrobatics',
+    'Handicraft',
+    'Food',
+    'Fashion',
+    'Daily Life',
+    'Travel',
+    'Pet & Animal',
+    'Exercise',
+    'Multilingual'
+]
+
+# Values expected in the 'task_type' column of the scored table.
+TASK_CATEGORIES = [
+    'Temporal Perception',
+    'Spatial Perception',
+    'Attribute Perception',
+    'Action Recognition',
+    'Object Recognition',
+    'OCR Problems',
+    'Counting Problem',
+    'Temporal Reasoning',
+    'Spatial Reasoning',
+    'Action Reasoning',
+    'Object Reasoning',
+    'Information Synopsis',
+]
+
+
+def get_dimension_rating(data_path):
+    """Aggregate per-sample scores into mean ratings per duration bucket.
+
+    Loads the table at ``data_path`` with ``load`` and reads the columns
+    ``duration``, ``domain``, ``sub_category``, ``task_type`` and ``score``
+    of every row. Negative scores are excluded from every mean.
+
+    Args:
+        data_path: path of the scored result file.
+
+    Returns:
+        dict: one entry per name in ``DURATIONS`` plus ``'overall'``. Each
+        entry holds ``'overall'`` (mean over all its samples) and the dicts
+        ``'domain'``, ``'sub_category'`` and ``'task_type'``. Every mean is a
+        string with two decimals, or ``'nan'`` when no valid score exists.
+
+    Raises:
+        KeyError: if a row carries a duration, domain, sub-category or task
+            type that is not listed in the module constants.
+    """
+    data = load(data_path)
+
+    def _mean_str(scores):
+        # Format the mean of the non-negative scores. An empty selection is
+        # reported as 'nan' directly: np.mean([]) returns the same value but
+        # emits RuntimeWarnings for every unused category.
+        valid = [x for x in scores if x >= 0]
+        return f'{np.mean(valid):.2f}' if valid else 'nan'
+
+    duration_rating = {
+        duration: {
+            'overall': '',
+            'domain': {k: [] for k in DOMAINS},
+            'sub_category': {k: [] for k in SUB_CATEGORIES},
+            'task_type': {k: [] for k in TASK_CATEGORIES},
+        }
+        for duration in DURATIONS + ['overall']
+    }
+
+    for i in range(len(data)):
+        row = data.iloc[i]
+        score = row['score']
+        # Every sample counts once for its own duration and once for 'overall'.
+        for bucket in (duration_rating[row['duration']], duration_rating['overall']):
+            bucket['domain'][row['domain']].append(score)
+            bucket['sub_category'][row['sub_category']].append(score)
+            bucket['task_type'][row['task_type']].append(score)
+
+    for duration in DURATIONS + ['overall']:
+        bucket = duration_rating[duration]
+
+        # Each sample appears in exactly one domain list, so flattening the
+        # domain lists enumerates all samples of this bucket. This must run
+        # before the lists are replaced by their formatted means below.
+        all_scores = [x for scores in bucket['domain'].values() for x in scores]
+        bucket['overall'] = _mean_str(all_scores)
+
+        for dimension in ('domain', 'sub_category', 'task_type'):
+            for name, scores in bucket[dimension].items():
+                bucket[dimension][name] = _mean_str(scores)
+
+    return duration_rating
+
+
+def extract_characters_regex(s):
+    """Extract the chosen option letter (A, B, C or D) from a model response.
+
+    Known answer lead-ins such as 'The best answer is' or 'Best option:' are
+    removed first, so that capital letters inside them (e.g. the 'B' of
+    'Best') are not mistaken for the answer. The first remaining capital
+    A-D is returned.
+
+    Args:
+        s: raw response text.
+
+    Returns:
+        str: 'A', 'B', 'C' or 'D', or '' when no such letter is found.
+    """
+    s = s.strip()
+    # Every entry needs its own trailing comma: adjacent string literals are
+    # silently concatenated, which previously fused two pairs of prefixes
+    # into strings that never matched.
+    answer_prefixes = [
+        'The best answer is',
+        'The correct answer is',
+        'The answer is',
+        'The answer',
+        'The best option is',
+        'The correct option is',
+        'Best answer:',
+        'Best option:',
+        'Answer:',
+        'Option:',
+    ]
+    for answer_prefix in answer_prefixes:
+        s = s.replace(answer_prefix, '')
+
+    matches = re.search(r'[ABCD]', s)
+    if matches is None:
+        return ''
+    return matches[0]
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/vqa_eval.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/vqa_eval.py
@@ -0,0 +1,285 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Partly adopted from https://github.com/GT-Vision-Lab/VQA
+# Copyright (c) 2014, Aishwarya Agrawal
+
+from ...smp import *
+from typing import Optional
+
+
+def _process_digit_article(inText):
+    """Normalize number words, articles and contractions in an answer.
+
+    The text is lower-cased and split on whitespace. Number words ('none',
+    'zero' ... 'ten') become digits, the articles 'a', 'an' and 'the' are
+    dropped, and words found in the contraction table are replaced by their
+    mapped spelling. Returns the surviving words joined by single spaces.
+    """
+    articles = ('a', 'an', 'the')
+    number_words = {
+        'none': '0',
+        'zero': '0',
+        'one': '1',
+        'two': '2',
+        'three': '3',
+        'four': '4',
+        'five': '5',
+        'six': '6',
+        'seven': '7',
+        'eight': '8',
+        'nine': '9',
+        'ten': '10',
+    }
+    # Table kept verbatim from the reference VQA evaluation code. Keys with
+    # capital letters cannot match here because the input is lower-cased.
+    contractions = {
+        'aint': "ain't",
+        'arent': "aren't",
+        'cant': "can't",
+        'couldve': "could've",
+        'couldnt': "couldn't",
+        "couldn'tve": "couldn't've",
+        "couldnt've": "couldn't've",
+        'didnt': "didn't",
+        'doesnt': "doesn't",
+        'dont': "don't",
+        'hadnt': "hadn't",
+        "hadnt've": "hadn't've",
+        "hadn'tve": "hadn't've",
+        'hasnt': "hasn't",
+        'havent': "haven't",
+        'hed': "he'd",
+        "hed've": "he'd've",
+        "he'dve": "he'd've",
+        'hes': "he's",
+        'howd': "how'd",
+        'howll': "how'll",
+        'hows': "how's",
+        "Id've": "I'd've",
+        "I'dve": "I'd've",
+        'Im': "I'm",
+        'Ive': "I've",
+        'isnt': "isn't",
+        'itd': "it'd",
+        "itd've": "it'd've",
+        "it'dve": "it'd've",
+        'itll': "it'll",
+        "let's": "let's",
+        'maam': "ma'am",
+        'mightnt': "mightn't",
+        "mightnt've": "mightn't've",
+        "mightn'tve": "mightn't've",
+        'mightve': "might've",
+        'mustnt': "mustn't",
+        'mustve': "must've",
+        'neednt': "needn't",
+        'notve': "not've",
+        'oclock': "o'clock",
+        'oughtnt': "oughtn't",
+        "ow's'at": "'ow's'at",
+        "'ows'at": "'ow's'at",
+        "'ow'sat": "'ow's'at",
+        'shant': "shan't",
+        "shed've": "she'd've",
+        "she'dve": "she'd've",
+        "she's": "she's",
+        'shouldve': "should've",
+        'shouldnt': "shouldn't",
+        "shouldnt've": "shouldn't've",
+        "shouldn'tve": "shouldn't've",
+        "somebody'd": 'somebodyd',
+        "somebodyd've": "somebody'd've",
+        "somebody'dve": "somebody'd've",
+        'somebodyll': "somebody'll",
+        'somebodys': "somebody's",
+        'someoned': "someone'd",
+        "someoned've": "someone'd've",
+        "someone'dve": "someone'd've",
+        'someonell': "someone'll",
+        'someones': "someone's",
+        'somethingd': "something'd",
+        "somethingd've": "something'd've",
+        "something'dve": "something'd've",
+        'somethingll': "something'll",
+        'thats': "that's",
+        'thered': "there'd",
+        "thered've": "there'd've",
+        "there'dve": "there'd've",
+        'therere': "there're",
+        'theres': "there's",
+        'theyd': "they'd",
+        "theyd've": "they'd've",
+        "they'dve": "they'd've",
+        'theyll': "they'll",
+        'theyre': "they're",
+        'theyve': "they've",
+        'twas': "'twas",
+        'wasnt': "wasn't",
+        "wed've": "we'd've",
+        "we'dve": "we'd've",
+        'weve': "we've",
+        'werent': "weren't",
+        'whatll': "what'll",
+        'whatre': "what're",
+        'whats': "what's",
+        'whatve': "what've",
+        'whens': "when's",
+        'whered': "where'd",
+        'wheres': "where's",
+        'whereve': "where've",
+        'whod': "who'd",
+        "whod've": "who'd've",
+        "who'dve": "who'd've",
+        'wholl': "who'll",
+        'whos': "who's",
+        'whove': "who've",
+        'whyll': "why'll",
+        'whyre': "why're",
+        'whys': "why's",
+        'wont': "won't",
+        'wouldve': "would've",
+        'wouldnt': "wouldn't",
+        "wouldnt've": "wouldn't've",
+        "wouldn'tve": "wouldn't've",
+        'yall': "y'all",
+        "yall'll": "y'all'll",
+        "y'allll": "y'all'll",
+        "yall'd've": "y'all'd've",
+        "y'alld've": "y'all'd've",
+        "y'all'dve": "y'all'd've",
+        'youd': "you'd",
+        "youd've": "you'd've",
+        "you'dve": "you'd've",
+        'youll': "you'll",
+        'youre': "you're",
+        'youve': "you've",
+    }
+    # Map number words first, so the article filter sees the mapped token.
+    tokens = [number_words.get(tok, tok) for tok in inText.lower().split()]
+    kept = [tok for tok in tokens if tok not in articles]
+    return ' '.join(contractions.get(tok, tok) for tok in kept)
+
+
+def hit_calculate(result, dataset_name, anls_threshold=0.5):
+    """Reduce each sample's ``'match'`` list to a single hit score.
+
+    The reduction depends on which name ``dataset_name`` contains:
+    TextVQA uses the mean; DocVQA / InfoVQA use ``1 - min(match)``, set to
+    0.0 when that value is below ``anls_threshold``; ChartQA / OCRVQA use
+    the max; anything else uses the mean (vqa_score).
+
+    Returns:
+        list: one score per entry of ``result``.
+    """
+    def _anls_hit(match):
+        similarity = 1 - np.min(match)
+        return 0.0 if similarity < anls_threshold else similarity
+
+    if listinstr(['TextVQA'], dataset_name):
+        reducer = np.mean
+    elif listinstr(['DocVQA', 'InfoVQA'], dataset_name):
+        reducer = _anls_hit
+    elif listinstr(['ChartQA', 'OCRVQA'], dataset_name):
+        reducer = np.max
+    else:  # default using vqa_score to calculate score
+        reducer = np.mean
+    return [reducer(x['match']) for x in result]
+
+
+# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
+def relaxed_correctness(target: str,
+                        prediction: str,
+                        max_relative_change: float = 0.05) -> bool:
+    """Compare a prediction with a target, tolerating small numeric error.
+
+    Both values are converted to ``str`` first. When both parse as numbers
+    (a trailing '%' divides the value by 100) and the target is non-zero,
+    the prediction is correct if its relative difference to the target is
+    at most ``max_relative_change``. In every other case the two strings
+    must be equal ignoring case.
+
+    This is the relaxed accuracy of ChartQA
+    (https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1).
+
+    Args:
+      target: Target string.
+      prediction: Predicted string.
+      max_relative_change: Maximum relative change.
+
+    Returns:
+      Whether the prediction was correct given the specified tolerance.
+    """
+
+    def _to_float(text: str) -> Optional[float]:
+        is_percent = text.endswith('%')
+        try:
+            value = float(text.rstrip('%') if is_percent else text)
+        except ValueError:
+            return None
+        # Convert percentages to fractions.
+        return value / 100.0 if is_percent else value
+
+    target = str(target)
+    prediction = str(prediction)
+    target_value = _to_float(target)
+    predicted_value = _to_float(prediction)
+
+    # A zero target cannot act as the denominator, so it also falls back to
+    # the exact string comparison.
+    if predicted_value is None or not target_value:
+        return prediction.lower() == target.lower()
+
+    relative_change = abs(predicted_value - target_value) / abs(target_value)
+    return relative_change <= max_relative_change
+
+
+def levenshtein_distance(s1, s2):
+    """Return the edit distance between two sequences.
+
+    Computed row by row, keeping only the previous row; the shorter input
+    is used for the row direction so the rows stay as short as possible.
+    """
+    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)
+
+    prev_row = range(len(shorter) + 1)
+    for row_idx, ch_long in enumerate(longer, start=1):
+        cur_row = [row_idx]
+        for col, ch_short in enumerate(shorter):
+            if ch_short == ch_long:
+                # Equal characters extend the diagonal at no cost.
+                cost = prev_row[col]
+            else:
+                # Cheapest of substitution, deletion and insertion.
+                cost = 1 + min(prev_row[col], prev_row[col + 1], cur_row[-1])
+            cur_row.append(cost)
+        prev_row = cur_row
+    return prev_row[-1]
+
+
+def anls_compute(groundtruth, prediction):
+    """Return the normalized edit distance between an answer and a prediction.
+
+    The distance is measured on the lower-cased, whitespace-collapsed
+    strings and divided by the longer of the two upper-cased raw strings.
+    0.0 means identical; 0.0 is also returned when both inputs are empty.
+    """
+    def _canonical(text):
+        return ' '.join(text.strip().lower().split())
+
+    dist = levenshtein_distance(_canonical(groundtruth), _canonical(prediction))
+    # The denominator deliberately uses the raw inputs, not the canonical forms.
+    length = max(len(groundtruth.upper()), len(prediction.upper()))
+    if length == 0:
+        return 0.0
+    return float(dist) / float(length)
+
+
+def process_answer(answer):
+    """Normalize an answer string before it is compared for the VQA score.
+
+    Newlines and tabs become spaces and the ends are stripped; the text is
+    then passed through ``process_punctuation`` and ``_process_digit_article``.
+    """
+    flattened = answer.replace('\n', ' ').replace('\t', ' ').strip()
+    return _process_digit_article(process_punctuation(flattened))
+
+
+def process_line(line, method='vqa_score'):
+    """Compare one prediction with its ground-truth answer(s).
+
+    Args:
+        line: mapping with the keys ``'answer'`` and ``'prediction'``.
+            ``'answer'`` is either a single answer or text that evaluates
+            to a list of answers (as decided by ``istype``).
+        method: scoring scheme.
+            ``'vqa_score'``: leave-one-out VQA accuracy on normalized
+            answers; each entry is min(1, matches among the other answers / 3).
+            ``'anls'``: normalized edit distance per answer (lower is better).
+            ``'relaxed_accuracy'``: numeric answers may deviate by up to 5%
+            from the ground truth, other answers must match ignoring case.
+            ``'accuracy'``: 1.0 / 0.0 for a case-insensitive exact match.
+            Any other value: exact match on normalized answers.
+
+    Returns:
+        dict: ``'gt'`` (answers as compared), ``'pred'`` (prediction as
+        compared) and ``'match'`` (one entry per ground-truth answer).
+    """
+    ret = {}
+    if istype(line['answer'], list):
+        # NOTE(review): eval() executes whatever text is stored in the answer
+        # column; only run this on trusted benchmark files.
+        answers = eval(line['answer'])
+    else:
+        answers = [line['answer']]
+    if method == 'vqa_score':
+        ret['gt'] = [process_answer(x) for x in answers]
+        ret['pred'] = process_answer(line['prediction'])
+        ret['match'] = []
+        for current_idx, gtAnsDatum in enumerate(ret['gt']):
+            # Score against every annotator except the current one.
+            otherGTAns = [
+                item for ret_gt_idx, item in enumerate(ret['gt'])
+                if ret_gt_idx != current_idx
+            ]
+            matchingAns = [
+                item for item in otherGTAns if item == ret['pred']
+            ]
+            acc = min(1, float(len(matchingAns)) / 3)
+            ret['match'].append(acc)
+    elif method == 'anls':
+        ret['gt'] = answers
+        ret['pred'] = line['prediction']
+        ret['match'] = [anls_compute(x, ret['pred']) for x in ret['gt']]
+    elif method == 'relaxed_accuracy':
+        ret['gt'] = answers
+        ret['pred'] = line['prediction'].strip()
+        # relaxed_correctness(target, prediction): the ground truth must be
+        # the target, because the tolerance is relative to the target value.
+        ret['match'] = [relaxed_correctness(x, ret['pred']) for x in ret['gt']]
+    elif method == 'accuracy':
+        ret['gt'] = answers
+        ret['pred'] = line['prediction'].strip()
+        ret['match'] = [(1.0 if (x.strip().lower() == ret['pred'].strip().lower()) else 0.0) for x in ret['gt']]
+    else:  # default using vqa_score to calculate score
+        ret['gt'] = [process_answer(x) for x in answers]
+        ret['pred'] = process_answer(line['prediction'])
+        ret['match'] = [x == ret['pred'] for x in ret['gt']]
+
+    return ret
--- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/yorn.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/yorn.py
@@ -0,0 +1,203 @@
+from ...smp import *
+
+
+def MME_rating(data_file):
+    """Compute MME scores per category and per super-category.
+
+    Each category score is the per-question accuracy plus the per-image
+    accuracy, both in percent. An image counts as correct when the product
+    of its first two scores is non-zero. The 'perception' and 'reasoning'
+    entries are the sums of their member categories.
+
+    Reads the columns ``category``, ``image_path`` and ``score`` and returns
+    the result of ``d2df`` applied to the score dict.
+    """
+    data = load(data_file)
+
+    # category -> image_path -> scores of the questions asked on that image
+    stats = defaultdict(dict)
+    for i in range(len(data)):
+        item = data.iloc[i]
+        category = item['category']
+        image_path = item['image_path']
+        score = item['score']
+        stats[category].setdefault(image_path, []).append(score)
+
+    def _question_acc(per_image):
+        flat = []
+        for image_scores in per_image.values():
+            flat.extend(image_scores)
+        return np.mean(flat) * 100
+
+    def _image_acc(per_image):
+        return np.mean([s[0] * s[1] for s in per_image.values()]) * 100
+
+    scores = {
+        category: _question_acc(per_image) + _image_acc(per_image)
+        for category, per_image in stats.items()
+    }
+
+    super_cates = dict(
+        perception=[
+            'OCR', 'artwork', 'celebrity', 'color', 'count', 'existence',
+            'landmark', 'position', 'posters', 'scene'
+        ],
+        reasoning=['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation']
+    )
+
+    # Super-category totals come first, followed by the individual categories.
+    ret = {sc: sum(scores[c] for c in cate_list) for sc, cate_list in super_cates.items()}
+    ret.update(scores)
+    return d2df(ret)
+
+
+def Hallusion_rating(data_file):
+    """Compute the aAcc / fAcc / qAcc table of a scored result file.
+
+    aAcc is the mean score over all rows. fAcc and qAcc group the rows by
+    (l2-category, set_id, figure_id) and (l2-category, set_id, question_id)
+    respectively and count a group only when all its scores are truthy.
+    The ids are fields 3, 4 and 5 of the '_'-separated ``index`` column.
+
+    Returns:
+        pd.DataFrame: one 'Overall' row, then one row per value of the
+        ``category`` and ``l2-category`` columns when they are present.
+    """
+    def _grouped_acc(frame, id_column):
+        groups = defaultdict(list)
+        for i in range(len(frame)):
+            line = frame.iloc[i]
+            groups[f"{line['l2-category']}_{line['set_id']}_{line[id_column]}"].append(line['score'])
+        return np.mean([np.all(scores) for scores in groups.values()]) * 100
+
+    data = load(data_file)
+    index_fields = [x.split('_') for x in data['index']]
+    data['set_id'] = [fields[3] for fields in index_fields]
+    data['figure_id'] = [fields[4] for fields in index_fields]
+    data['question_id'] = [fields[5] for fields in index_fields]
+
+    res = dict(split=[], aAcc=[], fAcc=[], qAcc=[])
+
+    def _add_row(name, frame):
+        res['split'].append(name)
+        res['aAcc'].append(np.mean(frame['score']) * 100)
+        res['fAcc'].append(_grouped_acc(frame, 'figure_id'))
+        res['qAcc'].append(_grouped_acc(frame, 'question_id'))
+
+    _add_row('Overall', data)
+    for column in ('category', 'l2-category'):
+        if column in data:
+            for c in list(set(data[column])):
+                _add_row(c, data[data[column] == c])
+
+    return pd.DataFrame(res)
+
+
+def POPE_rating(data_file):
+    """Compute F1, accuracy, precision and recall of a scored result file.
+
+    The ``category`` column may hold several comma-separated categories;
+    such rows are counted once per category. 'Yes' is the positive class
+    for both the ``answer`` and the ``extracted`` column.
+
+    Returns:
+        pd.DataFrame: one 'Overall' row and one row per category, with the
+        F1 score in the ``Overall`` column. All values are percentages.
+    """
+    def _as_binary(labels):
+        return np.array([1 if label == 'Yes' else 0 for label in labels])
+
+    def _f1_precision_recall(y_true, y_pred):
+        tp = sum((y_true == 1) & (y_pred == 1))
+        fp = sum((y_true == 0) & (y_pred == 1))
+        fn = sum((y_true == 1) & (y_pred == 0))
+
+        predicted_pos = tp + fp
+        actual_pos = tp + fn
+        precision = tp / predicted_pos if predicted_pos != 0 else 0
+        recall = tp / actual_pos if actual_pos != 0 else 0
+        denom = precision + recall
+        f1_score = 2 * (precision * recall) / denom if denom != 0 else 0
+        return f1_score, precision, recall
+
+    data = load(data_file)
+    # One row per (sample, category) pair.
+    data = data.assign(category=data['category'].str.split(',')).explode('category')
+    data['index'] = range(len(data))
+    res = dict(split=[], Overall=[], acc=[], precision=[], recall=[])
+
+    def _add_row(name, frame):
+        f1_score, precision, recall = _f1_precision_recall(
+            _as_binary(frame['answer']), _as_binary(frame['extracted']))
+        res['split'].append(name)
+        res['Overall'].append(f1_score * 100)
+        res['acc'].append(np.mean(frame['score']) * 100)
+        res['precision'].append(precision * 100)
+        res['recall'].append(recall * 100)
+
+    _add_row('Overall', data)
+    if 'category' in data:
+        for c in [c for c in list(set(data['category'])) if not pd.isna(c)]:
+            _add_row(c, data[data['category'] == c])
+
+    return pd.DataFrame(res)
+
+
+def default_rating(data_file):
+    """Compute the mean score (in percent) overall and per category.
+
+    The 'Overall' entry comes first, followed by the sorted non-NaN values
+    of the ``category`` column and then of the ``l2-category`` column, for
+    whichever of the two columns exists. The dict is returned through ``d2df``.
+    """
+    data = load(data_file)
+    res = {'Overall': np.mean(data['score']) * 100}
+    for column in ('category', 'l2-category'):
+        if column not in data:
+            continue
+        for c in sorted(c for c in set(data[column]) if not pd.isna(c)):
+            res[c] = np.mean(data[data[column] == c]['score']) * 100
+    return d2df(res)
+
+
+def YOrN_match_prompt(line):
+    """Build the judge prompt that maps a free-form answer to Yes / No / Unknown.
+
+    The prompt consists of the instructions, two worked examples and a third
+    example filled with ``line['question']`` (a '?' is appended to it) and
+    ``line['prediction']``, ending right where the judge should answer.
+    """
+    instructions = (
+        'You are an AI assistant who will help me to match an answer with two options of a question. '
+        'The options are only Yes / No. '
+        'You are provided with a question and an answer, '
+        'and you need to find which option (Yes / No) is most similar to the answer. '
+        'If the meaning of all options are significantly different from the answer, output Unknown. '
+        'Your should output a single word among the following 3 choices: Yes, No, Unknown.\n'
+    )
+    examples = (
+        'Example 1: \n'
+        "Question: Is the word in this image 'Hello'?\nAnswer: The word in this image is 'Hello'.\nYour output: Yes\n"
+        'Example 2: \n'
+        "Question: Is the word in this image 'Hello'?\n"
+        "Answer: The word in this image is not 'Hello'.\nYour output: No\n"
+        'Example 3: \n'
+    )
+    query = 'Question: {}?\nAnswer: {}\nYour output: '
+    return (instructions + examples + query).format(line['question'], line['prediction'])
+
+
+def YOrN_Extraction(output):
+    """Classify a response as 'Yes', 'No' or 'Unknown'.
+
+    The response is lower-cased, passed through ``process_punctuation`` and
+    split into words. It counts as 'Yes' or 'No' only when exactly one of
+    the two words occurs; otherwise the result is 'Unknown'.
+    """
+    words = process_punctuation(output.lower()).split()
+    has_yes = 'yes' in words
+    has_no = 'no' in words
+    if has_yes == has_no:
+        # Both or neither present: ambiguous.
+        return 'Unknown'
+    return 'Yes' if has_yes else 'No'
+
+
+def YOrN_auxeval(model, line):
+    """Ask a judge model to map ``line['prediction']`` to Yes / No.
+
+    The judge is queried up to 5 times, with the temperature raised by 0.5
+    on every attempt (starting at 0). The first reply that
+    ``YOrN_Extraction`` resolves to 'Yes' or 'No' is returned; if none
+    does, the result is 'Unknown'.
+    """
+    prompt = YOrN_match_prompt(line)
+    for attempt in range(5):
+        reply = model.generate(prompt, temperature=0.5 * attempt)
+        verdict = YOrN_Extraction(reply)
+        if verdict != 'Unknown':
+            return verdict
+    return 'Unknown'