Add eval_mm dir

2026-02-05 18:29:18 +08:00 · 2024-05-28 01:21:34 +08:00
parent 7e12387362
commit 65f5567a3a
49 changed files with 5610 additions and 0 deletions
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/OCRBench.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/OCRBench.py
@@ -0,0 +1,65 @@
+from vlmeval.smp import *
+
+
+def OCRBench_eval(eval_file):
+    """Score OCRBench predictions by substring matching and dump the result as JSON.
+
+    Every row of the table loaded from ``eval_file`` provides a ``prediction``, an
+    ``answer`` (a string holding a Python list of accepted answers) and a ``category``.
+    A row scores one point for its category when any accepted answer occurs inside the
+    prediction. Matching is case-insensitive, except for handwritten mathematical
+    expressions, which are compared case-sensitively with spaces and newlines removed.
+
+    Args:
+        eval_file: Path of the prediction file, passed to ``load``.
+
+    Returns:
+        None. The scores are written to ``<eval_file without extension>_score.json``
+        and logged.
+    """
+    # Local import keeps the function self-contained whatever the star import exposes.
+    import os.path as osp
+
+    formula_category = 'Handwritten Mathematical Expression Recognition'
+    text_recognition_categories = [
+        'Regular Text Recognition',
+        'Irregular Text Recognition',
+        'Artistic Text Recognition',
+        'Handwriting Recognition',
+        'Digit String Recognition',
+        'Non-Semantic Text Recognition',
+    ]
+    OCRBench_score = {
+        'Regular Text Recognition': 0,
+        'Irregular Text Recognition': 0,
+        'Artistic Text Recognition': 0,
+        'Handwriting Recognition': 0,
+        'Digit String Recognition': 0,
+        'Non-Semantic Text Recognition': 0,
+        'Scene Text-centric VQA': 0,
+        'Doc-oriented VQA': 0,
+        'Key Information Extraction': 0,
+        'Handwritten Mathematical Expression Recognition': 0
+    }
+
+    logger = get_logger('Evaluation')
+
+    data = load(eval_file)
+    for i in tqdm(range(len(data))):
+        line = data.iloc[i]
+        predict = str(line['prediction'])
+        # NOTE(review): eval() executes whatever the answer column contains; only run
+        # this on trusted benchmark files (ast.literal_eval would be the safe choice
+        # if the column always holds a plain list literal -- confirm before switching).
+        answers = eval(line['answer'])
+        category = line['category']
+        # The prediction is normalised once per row instead of once per answer.
+        if category == formula_category:
+            predict = predict.strip().replace('\n', ' ').replace(' ', '')
+            hit = any(
+                answer.strip().replace('\n', ' ').replace(' ', '') in predict
+                for answer in answers
+            )
+        else:
+            predict = predict.lower().strip().replace('\n', ' ')
+            hit = any(
+                answer.lower().strip().replace('\n', ' ') in predict
+                for answer in answers
+            )
+        if hit:
+            OCRBench_score[category] += 1
+
+    final_score_dict = {}
+    final_score_dict['Text Recognition'] = sum(
+        OCRBench_score[category] for category in text_recognition_categories)
+    final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
+    final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
+    final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
+    final_score_dict[formula_category] = OCRBench_score[formula_category]
+    final_score_dict['Final Score'] = (
+        final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
+        + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
+        + final_score_dict[formula_category]
+    )
+    final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10
+    # Derive the output path from the file stem: replacing the literal '.xlsx' left the
+    # path unchanged for any other extension, making the prediction file the dump target.
+    score_pth = osp.splitext(eval_file)[0] + '_score.json'
+    dump(final_score_dict, score_pth)
+    logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
+    logger.info('Score: ')
+    for key, value in final_score_dict.items():
+        logger.info('{}:{}'.format(key, value))
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/init.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/init.py
@@ -0,0 +1,9 @@
+from .yes_or_no import default_rating, MME_rating, YOrN_eval
+from .mmvet_eval import MMVet_eval
+from .multiple_choice import multiple_choice_eval
+from .coco_eval import COCO_eval
+from .vqa_eval import VQAEval
+from .mathvista_eval import MathVista_eval
+from .llavabench import LLaVABench_eval
+from .misc import build_judge
+from .OCRBench import OCRBench_eval
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/coco_eval.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/coco_eval.py
@@ -0,0 +1,74 @@
+from vlmeval.smp import *
+from pycocoevalcap.bleu.bleu import Bleu
+from pycocoevalcap.rouge.rouge import Rouge
+from pycocoevalcap.cider.cider import Cider
+
+
+class COCO_Caption_Scorer():
+    """Compute BLEU-1..4, ROUGE-L and CIDEr for predicted captions against references."""
+
+    def __init__(self, ref, gt):
+        """Store the predictions (``ref``) and ground truths (``gt``) and create the scorers."""
+        self.ref = ref
+        self.gt = gt
+        print('setting up scorers...')
+        # METEOR and SPICE are not enabled: they need java version 11.0.16+.
+        self.scorers = [
+            (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
+            (Rouge(), 'ROUGE_L'),
+            (Cider(), 'CIDEr'),
+        ]
+
+    def compute_scores(self):
+        """Run every scorer, print the results and return them scaled by 100.
+
+        Returns:
+            dict: ``'Bleu'`` maps to the list of BLEU scores; every other metric
+            maps its name to a single number.
+        """
+        results = {}
+        for metric, labels in self.scorers:
+            print('computing %s score...' % (metric.method()))
+            overall, per_sample = metric.compute_score(self.gt, self.ref)
+            if isinstance(labels, list):
+                # A list of labels means the scorer returned one overall value per label.
+                for value, _, label in zip(overall, per_sample, labels):
+                    print('%s: %0.3f' % (label, value * 100))
+                results['Bleu'] = [value * 100 for value in overall]
+            else:
+                print('%s: %0.3f' % (labels, overall * 100))
+                results[labels] = overall * 100
+
+        print('*****DONE*****')
+        for name, value in results.items():
+            print('{}:{}'.format(name, value))
+        return results
+
+
+def COCO_eval(eval_file, nproc=4, verbose=False):
+    """Score caption predictions with ``COCO_Caption_Scorer`` and dump the metrics as JSON.
+
+    Args:
+        eval_file: Path of the prediction file, passed to ``load``. Each row provides a
+            ``prediction`` and an ``answer`` (a string that evaluates to the references).
+        nproc: Accepted for interface compatibility; not used by this function.
+        verbose: Accepted for interface compatibility; not used by this function.
+
+    Returns:
+        None. The metrics are written to ``<eval_file without extension>_score.json``
+        and logged.
+    """
+    # Local import keeps the function self-contained whatever the star import exposes.
+    import os.path as osp
+
+    logger = get_logger('Evaluation')
+
+    data = load(eval_file)
+
+    ref = {}
+    gt = {}
+    for i in range(len(data)):
+        line = data.iloc[i]
+        key = str(i)
+        ref[key] = [str(line['prediction'])]
+        # NOTE(review): eval() executes whatever the answer column contains; only run
+        # this on trusted benchmark files.
+        gt[key] = eval(line['answer'])
+
+    scorer = COCO_Caption_Scorer(ref, gt)
+    coco_caption_score_dict = scorer.compute_scores()
+
+    # Derive the output path from the file stem: replacing the literal '.xlsx' left the
+    # path unchanged for any other extension, making the prediction file the dump target.
+    score_pth = osp.splitext(eval_file)[0] + '_score.json'
+    dump(coco_caption_score_dict, score_pth)
+    logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
+    logger.info('Score: ')
+    for key, value in coco_caption_score_dict.items():
+        logger.info('{}:{}'.format(key, value))
+
+
+def parse_args():
+    """Parse the command-line options of the COCO caption evaluation script.
+
+    Returns:
+        argparse.Namespace with ``data`` (str), ``nproc`` (int, default 4) and
+        ``verbose`` (bool, default False).
+    """
+    parser = argparse.ArgumentParser(description='Inference LLM Answers. ')
+    # --data is mandatory: without it ``None`` reached COCO_eval and failed inside load().
+    parser.add_argument(
+        '--data', type=str, required=True,
+        help='The question set for inference, in excel / tsv / json format. ')
+    parser.add_argument('--nproc', type=int, default=4)
+    parser.add_argument('--verbose', action='store_true')
+    args = parser.parse_args()
+    return args
+
+
+# Script entry point: evaluate the prediction file given on the command line.
+if __name__ == '__main__':
+    args = parse_args()
+    COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose)
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/llavabench.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/llavabench.py
@@ -0,0 +1,120 @@
+import argparse
+import numpy as np
+import pandas as pd
+import os.path as osp
+from vlmeval.evaluate.misc import build_judge
+from vlmeval.smp import *
+from vlmeval.utils import track_progress_rich
+
+# Judge instructions per LLaVA-Bench category. build_prompt() looks an entry up under
+# 'llava_bench_' + <category> and uses its 'role' to label the two answers and its
+# 'prompt' as the [System] section of the request.
+rule_dict = {
+    'llava_bench_conv': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'},  # noqa: E501
+    'llava_bench_detail': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'},  # noqa: E501
+    'llava_bench_complex': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}  # noqa: E501
+}
+
+
+def get_eval(judge, content):
+    """Ask ``judge`` to generate a review for ``content`` and return what it produces."""
+    review = judge.generate(content)
+    return review
+
+
+def parse_score(review):
+    """Extract the two scores from the first line of a judge review.
+
+    The first line is expected to hold exactly two numbers separated by whitespace
+    and/or commas (``"8 9"``, ``"8,9"``, ``"8, 9"``).
+
+    Args:
+        review: Text returned by the judge.
+
+    Returns:
+        ``[score_1, score_2]`` as floats, or ``[-1, -1]`` when the first line does not
+        hold exactly two numbers or ``review`` cannot be processed at all.
+    """
+    try:
+        first_line = review.split('\n')[0]
+        # split() without a separator collapses whitespace runs; split(' ') produced
+        # empty fields for "8, 9" and therefore rejected a perfectly valid pair.
+        sp = first_line.replace(',', ' ').split()
+        if len(sp) == 2:
+            return [float(sp[0]), float(sp[1])]
+        # The review goes through a %s placeholder: passing it as a bare extra
+        # argument made the logging call itself fail while formatting.
+        get_logger('Evaluation').error('Failed to parse a score pair from review: %s', review)
+    except Exception as e:
+        # Best effort by design: any malformed review maps to the sentinel pair.
+        get_logger('Evaluation').error('%s while parsing review: %s', e, review)
+    return [-1, -1]
+
+
+def build_prompt(line):
+    """Assemble the judge request comparing the reference answer with the prediction.
+
+    ``line`` supplies ``caption``, ``question``, ``gpt4_ans`` (shown as answer 1),
+    ``prediction`` (shown as answer 2) and ``category``, which selects the rule in
+    ``rule_dict``.
+    """
+    caption = line['caption']
+    question = line['question']
+    reference = line['gpt4_ans']
+    candidate = line['prediction']
+    rule = rule_dict['llava_bench_' + line['category']]
+    role = rule['role']
+    instructions = rule['prompt']
+
+    sections = [
+        '[Context]\n{}\n\n'.format(caption),
+        '[Question]\n{}\n\n'.format(question),
+        '[{0} 1]\n{1}\n\n[End of {0} 1]\n\n'.format(role, reference),
+        '[{0} 2]\n{1}\n\n[End of {0} 2]\n\n'.format(role, candidate),
+        '[System]\n{}\n\n'.format(instructions),
+    ]
+    return ''.join(sections)
+
+
+def LLaVABench_atomeval(model, prompt):
+    """Judge a single prompt with ``model`` and return the parsed score pair."""
+    return parse_score(get_eval(model, prompt))
+
+
+def LLaVABench_score(data):
+    """Summarise judge scores overall and per category.
+
+    Args:
+        data: DataFrame with ``category``, ``score`` and ``gpt4_score`` columns.
+
+    Returns:
+        DataFrame with one row per split (``'overall'`` first) holding the score
+        relative to the reference answers (in percent) and both mean scores times 10.
+    """
+    splits = ['overall']
+    splits.extend(set(data['category']))
+    table = defaultdict(list)
+
+    for name in splits:
+        subset = data if name == 'overall' else data[data['category'] == name]
+        vlm_mean = np.mean(subset['score'])
+        gpt4_mean = np.mean(subset['gpt4_score'])
+        table['split'].append(name)
+        table['Relative Score (main)'].append(vlm_mean / gpt4_mean * 100)
+        table['VLM Score'].append(vlm_mean * 10)
+        table['GPT4 Score'].append(gpt4_mean * 10)
+    return pd.DataFrame(table)
+
+
+def LLaVABench_eval(eval_file, **judge_kwargs):
+    """Judge LLaVA-Bench predictions and write the per-category score table.
+
+    The judged records are stored next to ``eval_file`` (``_openai_result`` inserted
+    before the extension); when that file already exists the judge is not called again.
+
+    Args:
+        eval_file: Path of the prediction file, passed to ``load``.
+        **judge_kwargs: Options forwarded to ``build_judge``; ``nproc`` (default 4) is
+            removed first and used as the worker count.
+
+    Returns:
+        The score table produced by ``LLaVABench_score``, rounded to one decimal. It is
+        also printed and dumped to ``<eval_file stem>_score.csv``.
+    """
+    suffix = '.' + eval_file.split('.')[-1]
+    record_file = eval_file.replace(suffix, '_openai_result' + suffix)
+    score_file = eval_file.replace(suffix, '_score.csv')
+    nproc = judge_kwargs.pop('nproc', 4)
+
+    if not osp.exists(record_file):
+        data = load(eval_file)
+        model = build_judge(
+            temperature=0.2,
+            system_prompt='You are a helpful and precise assistant for checking the quality of the answer.',
+            **judge_kwargs)
+        tups = [(model, build_prompt(data.iloc[i])) for i in range(len(data))]
+        scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc)
+        # Each result is a pair: the judge's score for the reference answer first,
+        # the score for the prediction second.
+        data['gpt4_score'] = [pair[0] for pair in scores]
+        data['score'] = [pair[1] for pair in scores]
+        dump(data, record_file)
+
+    ret = LLaVABench_score(load(record_file)).round(1)
+    print(ret)
+    dump(ret, score_file)
+    return ret
+
+
+def parse_args():
+    """Parse the command-line options of the LLaVABench evaluation script."""
+    judge_choices = ['gpt-4-0613', 'gpt-4-turbo', 'chatgpt-1106', 'chatgpt-0613', 'gpt-4-0314']
+    parser = argparse.ArgumentParser(description='LLaVABench Evaluation. ')
+    parser.add_argument(
+        'data',
+        type=str,
+        help='The question set for inference, in excel / tsv / json format. ')
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='gpt-4-turbo',
+        choices=judge_choices,
+        help='The LLM (GPT) used for inference. ')
+    parser.add_argument('--nproc', type=int, default=4)
+    parser.add_argument('--verbose', action='store_true')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    load_env()
+    args = parse_args()
+    judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose)
+    # Judge credentials are optional: each is forwarded only when its environment
+    # variable is set to a non-empty value.
+    for env_name, kwarg_name in (('OPENAI_API_KEY_JUDGE', 'key'), ('OPENAI_API_BASE_JUDGE', 'api_base')):
+        env_value = os.environ.get(env_name)
+        if env_value:
+            judge_kwargs[kwarg_name] = env_value
+
+    LLaVABench_eval(eval_file=args.data, **judge_kwargs)
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/mathvista_eval.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/mathvista_eval.py
@@ -0,0 +1,240 @@
+from vlmeval.evaluate.misc import build_judge
+from vlmeval.smp import *
+from vlmeval.utils import track_progress_rich
+from vlmeval.utils.matching_util import can_infer
+
+
+def get_gpt4_ICE():
+    """Return the five in-context examples used in the answer-extraction prompt.
+
+    Each example is a text block made of a hint, a question, a model response and the
+    answer extracted from that response (integer, one- and two-decimal floats, a
+    Python list, and a multiple-choice option letter).
+    """
+    example_1 = """
+Hint: Please answer the question requiring an integer answer and provide the final value,
+e.g., 1, 2, 3, at the end.\n
+Question: Which number is missing?\n
+Model response: The number missing in the sequence is 14.\n
+Extracted answer: 14
+"""
+
+    example_2 = """
+Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value,
+e.g., 1.2, 1.3, 1.4, at the end.\n
+Question: What is the fraction of females facing the camera?\n
+Model response: The fraction of females facing the camera is 0.6,
+which means that six out of ten females in the group are facing the camera.\n
+Extracted answer: 0.6
+"""
+
+    example_3 = """
+Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value,
+e.g., 1.23, 1.34, 1.45, at the end.\n
+Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
+Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
+Extracted answer: 1.45
+"""
+
+    example_4 = """
+Hint: Please answer the question requiring a Python list as an answer and provide the final list,
+e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.\n
+Question: Between which two years does the line graph saw its maximum peak?\n
+Model response: The line graph saw its maximum peak between 2007 and 2008.\n
+Extracted answer: [2007, 2008]
+"""
+
+    example_5 = """
+Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
+Question: What fraction of the shape is blue?\n
+Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
+Model response: The correct answer is (B) 8/11.\n
+Extracted answer: B
+"""
+
+    return [example_1, example_2, example_3, example_4, example_5]
+
+
+def build_mathvista_gpt4_prompt(line):
+    """Build the answer-extraction prompt for one MathVista record.
+
+    The prompt is the task description, the in-context examples from
+    ``get_gpt4_ICE`` and finally the record's own question and model response,
+    ending with an open ``Extracted answer:`` field for the judge to complete.
+
+    Args:
+        line: Record providing ``question`` and ``prediction``.
+
+    Returns:
+        The prompt string.
+    """
+    task_description = """
+Please read the following example.
+Then extract the answer from the model response and type it at the end of the prompt.\n
+"""
+    question = line['question']
+    prediction = str(line['prediction'])
+    prompt = task_description
+    examples = get_gpt4_ICE()
+    for example in examples:
+        prompt += example + '\n'
+    prompt += question + '\n'
+    # Use the label the examples use ("Model response", previously misspelled) and put
+    # the answer field on its own line; it used to be glued to the end of the prediction.
+    prompt += 'Model response: ' + prediction + '\n'
+    prompt += 'Extracted answer:'
+    return prompt
+
+
+def list_to_dict(lst):
+    """Label the items of ``lst`` with consecutive letters starting at ``'A'``."""
+    labelled = {}
+    for position, item in enumerate(lst):
+        labelled[chr(ord('A') + position)] = item
+    return labelled
+
+
+def post_check(line, prefetch=False):
+    """Compare a response with the ground truth of one MathVista record.
+
+    Args:
+        line: Record providing ``answer``, ``question_type``, ``answer_type`` and, for
+            multiple-choice questions, ``answer_option`` and ``choices``. The response
+            is read from ``prediction`` when ``prefetch`` is true, else from ``res``.
+        prefetch: When true, the raw prediction is checked and the parsed answer is
+            returned on success, so the caller can skip the judge.
+
+    Returns:
+        With ``prefetch=False``: ``True`` on a match, else ``False``.
+        With ``prefetch=True``: the parsed answer on a match, else ``False``; for
+        multiple-choice questions the result of ``can_infer`` is returned as is.
+    """
+    res = None
+    ans = line['answer']
+    response = line['prediction'] if prefetch else line['res']
+    try:
+        if line['question_type'] == 'multi_choice':
+            ans = line['answer_option']
+            # NOTE(review): eval() executes whatever the choices column contains; only
+            # run this on trusted benchmark files.
+            choices = list_to_dict(eval(line['choices']))
+            res = can_infer(response, choices)
+            if prefetch:
+                return res
+        else:
+            if line['answer_type'] == 'integer':
+                res = int(response)
+                ans = int(line['answer'])
+            elif line['answer_type'] == 'float':
+                res = float(response)
+                ans = float(line['answer'])
+            else:
+                # Compare the response text itself. The previous code stringified
+                # ``res``, which is still None here, so free-text answers never matched.
+                res = str(response)
+                ans = str(ans)
+    except ValueError:
+        # A response that cannot be parsed as a number simply does not match.
+        pass
+
+    if res == ans:
+        return res if prefetch else True
+    return False
+
+
+def MathVista_auxeval(model, line):
+    """Extract the answer for one record, asking the judge only when needed.
+
+    If the raw prediction already passes ``post_check`` it is used directly; otherwise
+    the judge is queried up to five times with the temperature raised by 0.5 per try.
+
+    Returns:
+        dict with ``log`` (what happened) and ``res`` (the extracted answer, or ``''``
+        when every try returned ``None``).
+    """
+    prompt = build_mathvista_gpt4_prompt(line)
+    prefetched = post_check(line, prefetch=True)
+    if prefetched:
+        return dict(log='Prefetch succeed', res=prefetched)
+
+    prediction = line['prediction']
+    failures = []
+    for attempt in range(5):
+        answer = model.generate(prompt, temperature=attempt * 0.5)
+        if answer is not None:
+            return dict(log=''.join(failures) + 'Succeed', res=answer)
+        failures.append(f'Try {attempt}: output is {prediction}, failed to parse.\n')
+    return dict(log=''.join(failures) + 'All 5 retries failed.\n', res='')
+
+
+def MathVista_acc(result_file):
+    """Compute prefetch rate and accuracy overall, per task and per skill.
+
+    Args:
+        result_file: Path of the judged result file, passed to ``load``. Each row
+            provides ``task``, ``skills``, ``log`` and the fields ``post_check`` reads.
+
+    Returns:
+        DataFrame with one row per key (``'Overall'``, every skill, every task) and the
+        columns ``Task&Skill``, ``tot``, ``prefetch``, ``hit``, ``prefetch_rate`` and
+        ``acc`` (both rates in percent).
+    """
+    data = load(result_file)
+    tot = defaultdict(int)
+    fetch = defaultdict(int)
+    hit = defaultdict(int)
+    for i in range(len(data)):
+        item = data.iloc[i]
+        cate = item['task']
+        tot['Overall'] += 1
+        try:
+            # NOTE(review): eval() executes whatever the skills column contains; only
+            # run this on trusted benchmark files.
+            skills = eval(item['skills'])
+        except (SyntaxError, NameError):
+            # The cell holds a plain skill name rather than a list literal. A
+            # multi-word name is a SyntaxError, a single bare word a NameError.
+            skills = [item['skills']]
+        for skill in skills:
+            tot[skill] += 1
+        tot[cate] += 1
+        if item['log'] == 'Prefetch succeed':
+            fetch['Overall'] += 1
+            fetch[cate] += 1
+            for skill in skills:
+                fetch[skill] += 1
+        if post_check(item, prefetch=False):
+            hit['Overall'] += 1
+            hit[cate] += 1
+            for skill in skills:
+                hit[skill] += 1
+
+    res = defaultdict(list)
+    # Every key in ``tot`` was counted at least once, so the divisions are safe.
+    for k in tot.keys():
+        res['Task&Skill'].append(k)
+        res['tot'].append(tot[k])
+        res['prefetch'].append(fetch[k])
+        res['hit'].append(hit[k])
+        res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
+        res['acc'].append(hit[k] / tot[k] * 100)
+    res = pd.DataFrame(res)
+    return res
+
+
+def MathVista_eval(eval_file, **judge_kwargs):
+    """Run the judge-assisted MathVista evaluation and write the accuracy table.
+
+    Args:
+        eval_file: Path of the prediction file, passed to ``load``.
+        **judge_kwargs: Must contain ``model`` (used in the output file names and
+            forwarded to ``build_judge``); ``nproc`` (default 4) is removed and used as
+            the worker count; the rest is forwarded to ``build_judge``.
+
+    Returns:
+        None. The judged records go to ``<stem>_<model>.xlsx`` and the accuracy table
+        to ``<stem>_<model>_score.csv``.
+    """
+    logger = get_logger('Evaluation')
+    model = judge_kwargs['model']
+
+    suffix = eval_file.split('.')[-1]
+    # storage holds the final judged records, tmp_file the per-index partial results.
+    storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+    tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+    nproc = judge_kwargs.pop('nproc', 4)
+
+    if osp.exists(storage):
+        logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MathVista_eval. ')
+    else:
+        data = load(eval_file)
+        # From here on ``model`` is the judge object, no longer the model name.
+        model = build_judge(max_tokens=128, **judge_kwargs)
+        lt = len(data)
+        lines = [data.iloc[i] for i in range(lt)]
+        tups = [(model, line) for line in lines]
+        indices = [line['index'] for line in lines]
+
+        # Resume support: records already present in tmp_file are not judged again.
+        ans = {}
+        if osp.exists(tmp_file):
+            ans = load(tmp_file)
+        tups = [x for x, i in zip(tups, indices) if i not in ans]
+        indices = [i for i in indices if i not in ans]
+
+        if len(indices):
+            # presumably track_progress_rich stores each result in tmp_file under its
+            # key; the asserts below check the reloaded file against the returned list.
+            new_results = track_progress_rich(
+                MathVista_auxeval, tups, nproc=nproc, chunksize=nproc,
+                keys=indices, save=tmp_file)
+            ans = load(tmp_file)
+            for k, v in zip(indices, new_results):
+                assert k in ans
+                assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
+
+        # Attach the extracted answer and the log to every record, matched by index.
+        log_map, res_map = {}, {}
+        all_inds = [line['index'] for line in lines]
+        for k in all_inds:
+            log_map[k] = ans[k]['log']
+            res_map[k] = ans[k]['res']
+        data['res'] = [res_map[idx] for idx in data['index']]
+        data['log'] = [log_map[idx] for idx in data['index']]
+        dump(data, storage)
+
+    score = MathVista_acc(storage)
+    score_pth = storage.replace('.xlsx', '_score.csv')
+
+    dump(score, score_pth)
+    logger.info(f'MathVista_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
+    logger.info('Score: ')
+    logger.info(score)
+
+
+def parse_args():
+    """Parse the command-line options of the MathVista evaluation script."""
+    judge_choices = ['gpt-4-0613', 'gpt-4-turbo', 'chatgpt-1106', 'chatgpt-0613']
+    parser = argparse.ArgumentParser(description='Inference LLM Answers. ')
+    parser.add_argument(
+        'data', type=str, help='The question set for inference, in excel / tsv / json format. ')
+    parser.add_argument(
+        '--model', type=str, default='gpt-4-turbo', choices=judge_choices,
+        help='The LLM (GPT) used for inference. ')
+    parser.add_argument('--nproc', type=int, default=4)
+    parser.add_argument('--verbose', action='store_true')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    load_env()
+    args = parse_args()
+    judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose)
+    # Judge credentials are optional: each is forwarded only when its environment
+    # variable is set to a non-empty value.
+    for env_name, kwarg_name in (('OPENAI_API_KEY_JUDGE', 'key'), ('OPENAI_API_BASE_JUDGE', 'api_base')):
+        env_value = os.environ.get(env_name)
+        if env_value:
+            judge_kwargs[kwarg_name] = env_value
+    MathVista_eval(eval_file=args.data, **judge_kwargs)
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/misc.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/misc.py
@@ -0,0 +1,29 @@
+import os
+from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal
+from vlmeval.smp import load_env
+
+# Switch read by build_judge() to choose OpenAIWrapperInternal over OpenAIWrapper.
+# NOTE(review): when the variable is set the value is a string, and any non-empty
+# string -- including '0' -- is truthy; only an unset or empty variable disables it.
+INTERNAL = os.environ.get('INTERNAL', 0)
+
+
+def build_judge(**kwargs):
+    """Create the judge model wrapper.
+
+    ``model`` is taken out of ``kwargs`` and translated to a concrete API model name,
+    unless the ``LOCAL_LLM`` environment variable is set, in which case its value is
+    used instead. All remaining keyword arguments go to the wrapper constructor.
+
+    Raises:
+        KeyError: ``LOCAL_LLM`` is unset and ``model`` is not a known alias.
+    """
+    requested = kwargs.pop('model', None)
+    load_env()
+    local_llm = os.environ.get('LOCAL_LLM', None)
+    if local_llm is not None:
+        model_version = local_llm
+    else:
+        aliases = {
+            'gpt-4-turbo': 'gpt-4-1106-preview',
+            'gpt-4-0613': 'gpt-4-0613',
+            'gpt-4-0314': 'gpt-4-0314',
+            'gpt-4-0125': 'gpt-4-0125-preview',
+            'chatgpt-1106': 'gpt-3.5-turbo-1106',
+            'chatgpt-0613': 'gpt-3.5-turbo-0613',
+            'chatgpt-0125': 'gpt-3.5-turbo-0125'
+        }
+        model_version = aliases[requested]
+
+    wrapper_cls = OpenAIWrapperInternal if INTERNAL else OpenAIWrapper
+    return wrapper_cls(model_version, **kwargs)
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/mmvet_eval.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/mmvet_eval.py
@@ -0,0 +1,191 @@
+from vlmeval.evaluate.misc import build_judge
+from vlmeval.smp import *
+from vlmeval.utils import track_progress_rich
+
+
+def build_mmvet_gpt4_prompt(line):
+    """Build the few-shot grading prompt for one MM-Vet record.
+
+    The prompt explains the scoring rules, lists graded examples as table rows and
+    ends with a row for this record whose correctness cell is left empty for the judge.
+    ``line`` supplies ``question``, ``answer`` and ``prediction``.
+    """
+    question = line['question']
+    gt = str(line['answer'])
+    prediction = str(line['prediction'])
+    prompt = """
+Compare the ground truth and prediction from AI models, to give a correctness score for the prediction.
+<AND> in the ground truth means it is totally right
+only when all elements in the ground truth are present in the prediction,
+and <OR> means it is totally right when any one element in the ground truth is present in the prediction.
+The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
+Just complete the last space of the correctness score.
+
+Question | Ground truth | Prediction | Correctness
+--- | --- | --- | ---
+What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0
+What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5
+What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5
+What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5
+What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0
+Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
+Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
+while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
+because the names of these countries do not accurately represent their landscapes. |
+The meme talks about Iceland and Greenland. It's pointing out that despite their names,
+Iceland is not very icy and Greenland isn't very green. | 0.4
+Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
+Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
+while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
+because the names of these countries do not accurately represent their landscapes. |
+The meme is using humor to point out the misleading nature of Iceland's and Greenland's names.
+Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow.
+The text 'This is why I have trust issues' is a playful way to suggest
+that these contradictions can lead to distrust or confusion.
+The humor in this meme is derived from the unexpected contrast between the names of the countries
+and their actual physical characteristics. | 1.0
+"""
+    # Final table row: the <AND>/<OR> markers are padded with spaces and the trailing
+    # empty element leaves the correctness cell open.
+    gpt4_prompt = prompt + '\n' + ' | '.join(
+        [question, gt.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> '), prediction, ''])
+    return gpt4_prompt
+
+
+def MMVet_auxeval(model, line):
+    """Ask the judge for a correctness score in [0, 1] for one MM-Vet record.
+
+    The judge is queried up to five times with the temperature raised by 0.5 per try,
+    until its output parses as a number within range.
+
+    Returns:
+        dict with ``log`` (what happened on each try) and ``score`` (the parsed score,
+        or ``0.0`` when every try failed).
+    """
+    def float_cvt(s):
+        # TypeError covers a non-string output such as None, which float() rejects
+        # with TypeError rather than ValueError.
+        try:
+            return float(s)
+        except (TypeError, ValueError):
+            return None
+
+    prompt = build_mmvet_gpt4_prompt(line)
+    log = ''
+    retry = 5
+    for i in range(retry):
+        output = model.generate(prompt, temperature=i * 0.5)
+        score = float_cvt(output)
+        if score is None:
+            log += f'Try {i}: output is {output}, failed to parse.\n'
+        elif not 0 <= score <= 1:
+            # Written as a positive range test so NaN is rejected too: NaN compares
+            # False against both bounds and slipped through ``< 0 or > 1``.
+            log += f'Try {i}: output is {output}, invalid score: {score}.\n'
+        else:
+            log += 'Succeed'
+            return dict(log=log, score=score)
+    log += 'All 5 retries failed.\n'
+    return dict(log=log, score=0.0)
+
+
+def MMVet_acc(result_file):
+    """Aggregate MM-Vet judge scores per capability and per capability combination.
+
+    Args:
+        result_file: Path of the judged result file, passed to ``load``. Each row
+            provides ``category`` (comma-separated capabilities) and ``score``.
+
+    Returns:
+        Two DataFrames with the columns ``Category``, ``tot`` and ``acc`` (mean score
+        in percent): one row per capability plus ``'Overall'``, and one row per
+        capability combination (commas replaced by ``_``) plus ``'Overall'``.
+        ``acc`` is NaN for a key that has no rows.
+    """
+    data = load(result_file)
+    tot = defaultdict(lambda: 0)
+    score = defaultdict(lambda: 0)
+    # Defined before the loop so it also exists when the result file has no rows.
+    cate_list = ['rec', 'ocr', 'know', 'gen', 'spat', 'math']
+    cate2_list = []
+    for i in range(len(data)):
+        item = data.iloc[i]
+        cate = item['category']
+        cate2 = cate.replace(',', '_')
+        if cate2 not in cate2_list:
+            cate2_list.append(cate2)
+        grade = float(item['score'])
+        for capa in cate_list:
+            if capa in cate:
+                tot[capa] += 1
+                score[capa] += grade
+        tot['Overall'] += 1
+        tot[cate2] += 1
+        score['Overall'] += grade
+        score[cate2] += grade
+
+    def _summarize(keys):
+        # Build one result table; a key without rows gets NaN instead of dividing by zero.
+        table = defaultdict(list)
+        for key in keys:
+            table['Category'].append(key)
+            table['tot'].append(tot[key])
+            table['acc'].append(score[key] / tot[key] * 100 if tot[key] else float('nan'))
+        return pd.DataFrame(table)
+
+    res = _summarize(cate_list + ['Overall'])
+    res2 = _summarize(cate2_list + ['Overall'])
+    return res, res2
+
+
+def MMVet_eval(eval_file, **judge_kwargs):
+    """Run the judge-based MM-Vet evaluation and write both score tables.
+
+    Args:
+        eval_file: Path of the prediction file, passed to ``load``.
+        **judge_kwargs: Must contain ``model`` (used in the output file names and
+            forwarded to ``build_judge``); ``nproc`` (default 4) is removed and used as
+            the worker count; the rest is forwarded to ``build_judge``.
+
+    Returns:
+        None. The judged records go to ``<stem>_<model>.xlsx`` and the tables to
+        ``<stem>_<model>_score.csv`` and ``<stem>_<model>_score_fine.csv``.
+    """
+    logger = get_logger('Evaluation')
+
+    suffix = eval_file.split('.')[-1]
+    model = judge_kwargs['model']
+    # storage holds the final judged records, tmp_file the per-index partial results.
+    storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+    tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
+    nproc = judge_kwargs.pop('nproc', 4)
+    if osp.exists(storage):
+        logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMVet_eval. ')
+    else:
+        data = load(eval_file)
+        # From here on ``model`` is the judge object, no longer the model name.
+        model = build_judge(max_tokens=3, **judge_kwargs)
+
+        lt = len(data)
+        lines = [data.iloc[i] for i in range(lt)]
+        tups = [(model, line) for line in lines]
+        indices = [line['index'] for line in lines]
+
+        # Resume support: records already present in tmp_file are not judged again.
+        ans = {}
+        if osp.exists(tmp_file):
+            ans = load(tmp_file)
+        tups = [x for x, i in zip(tups, indices) if i not in ans]
+        indices = [i for i in indices if i not in ans]
+
+        if len(indices):
+            # presumably track_progress_rich stores each result in tmp_file under its
+            # key; the asserts below check the reloaded file against the returned list.
+            new_results = track_progress_rich(
+                MMVet_auxeval, tups, nproc=nproc, chunksize=nproc,
+                keys=indices, save=tmp_file)
+            ans = load(tmp_file)
+            for k, v in zip(indices, new_results):
+                assert k in ans
+                assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
+
+        # Attach the score and the log to every record, matched by index.
+        log_map, score_map = {}, {}
+        all_inds = [line['index'] for line in lines]
+        for k in all_inds:
+            log_map[k] = ans[k]['log']
+            score_map[k] = ans[k]['score']
+        data['score'] = [score_map[idx] for idx in data['index']]
+        data['log'] = [log_map[idx] for idx in data['index']]
+        dump(data, storage)
+
+    score, score_fine = MMVet_acc(storage)
+    score_pth = storage.replace('.xlsx', '_score.csv')
+    score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
+
+    dump(score, score_pth)
+    dump(score_fine, score_fine_pth)
+    logger.info(
+        f'MMVet_eval successfully finished evaluating {eval_file}, '
+        f'results saved in {score_pth} and {score_fine_pth}'
+    )
+    logger.info('Score: ')
+    logger.info(score)
+
+
+def parse_args():
+    """Parse the command-line options of the MM-Vet evaluation script."""
+    judge_choices = ['gpt-4-0613', 'gpt-4-turbo', 'chatgpt-1106', 'chatgpt-0613']
+    parser = argparse.ArgumentParser(description='Inference LLM Answers. ')
+    parser.add_argument(
+        'data', type=str, help='The question set for inference, in excel / tsv / json format. ')
+    parser.add_argument(
+        '--model', type=str, default='gpt-4-turbo', choices=judge_choices,
+        help='The LLM (GPT) used for inference. ')
+    parser.add_argument('--nproc', type=int, default=4)
+    parser.add_argument('--verbose', action='store_true')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    load_env()
+    args = parse_args()
+    judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose)
+    # Judge credentials are optional: each is forwarded only when its environment
+    # variable is set to a non-empty value.
+    for env_name, kwarg_name in (('OPENAI_API_KEY_JUDGE', 'key'), ('OPENAI_API_BASE_JUDGE', 'api_base')):
+        env_value = os.environ.get(env_name)
+        if env_value:
+            judge_kwargs[kwarg_name] = env_value
+    MMVet_eval(eval_file=args.data, **judge_kwargs)
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/multiple_choice.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/multiple_choice.py
@@ -0,0 +1,399 @@
+import os.path as osp
+import pandas as pd
+from tqdm import tqdm
+from vlmeval.evaluate.misc import build_judge
+from vlmeval.utils import can_infer, track_progress_rich, TSVDataset
+from vlmeval.smp import *
+import numpy as np
+
+# Flag read from the `INTERNAL` environment variable (0 when the variable is unset).
+# NOTE(review): a set variable yields a string, so any non-empty value (even '0') is truthy.
+INTERNAL = os.environ.get('INTERNAL', 0)
+
+# Short display names for ability categories; `report_acc` uses them as column names
+# and falls back to the raw category name for anything not listed here.
+abbrs = {
+    'coarse_perception': 'CP',
+    'finegrained_perception (instance-level)': 'FP-S',
+    'finegrained_perception (cross-instance)': 'FP-C',
+    'logic_reasoning': 'LR',
+    'relation_reasoning': 'RR',
+    'attribute_reasoning': 'AR'
+}
+
+
+def MMMU_preproc(data):
+    """Turn rows without an option `A` into two-option multiple-choice rows.
+
+    For every row whose `A` cell is missing, `A` is set to the row's `answer`
+    and `B` to the literal 'Other Answers'. The `A` and `B` columns of `data`
+    are overwritten in place and the same object is returned.
+
+    Args:
+        data: Table with `A`, `B` and `answer` columns.
+
+    Returns:
+        The input table with the rewritten `A` / `B` columns.
+    """
+    logger = get_logger('Evaluation')
+    col_a, col_b = list(data['A']), list(data['B'])
+    answers = list(data['answer'])
+
+    # Rows with no option A are the ones treated as open questions.
+    open_rows = [pos for pos, first_opt in enumerate(col_a) if pd.isna(first_opt)]
+    for pos in open_rows:
+        col_a[pos] = answers[pos]
+        col_b[pos] = 'Other Answers'
+    cnt = len(open_rows)
+
+    logger.info(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones. ')
+    data['A'] = col_a
+    data['B'] = col_b
+    return data
+
+
+def report_acc(df):
+    """Build an accuracy table with one row per split.
+
+    The table has a `split` column, an `Overall` column (mean of `hit` per
+    split) and one column per distinct value of `l2-category` and `category`
+    when those columns exist. Column names are abbreviated through `abbrs`
+    where an abbreviation is defined.
+
+    Note: when `df` has no `split` column, one filled with 'none' is added to
+    `df` itself.
+
+    Args:
+        df: Table with a `hit` column and optional `split`, `category`,
+            `l2-category` columns.
+
+    Returns:
+        pandas.DataFrame: the accuracy table.
+    """
+    table = defaultdict(list)
+
+    if 'split' not in df:
+        df['split'] = ['none'] * len(df)
+        table['split'] = ['none']
+    else:
+        table['split'] = list(set(df['split']))
+
+    def _mean_hit_per_split(frame):
+        # One value per entry of table['split'], in the same order.
+        return [np.mean(frame[frame['split'] == sp]['hit']) for sp in table['split']]
+
+    table['Overall'] = _mean_hit_per_split(df)
+    for group in ('l2-category', 'category'):
+        if group not in df:
+            continue
+        for ability in sorted(set(df[group])):
+            column = abbrs.get(ability, ability)
+            table[column] = _mean_hit_per_split(df[df[group] == ability])
+    return pd.DataFrame(table)
+
+
+def build_prompt(question, options, prediction):
+    """Compose the English prompt that asks a judge to map an answer onto an option letter.
+
+    Args:
+        question: Question text; a '?' is appended after it in the prompt.
+        options: Pre-formatted option string.
+        prediction: Free-form answer to be matched against the options.
+
+    Returns:
+        str: Fixed instructions and two worked examples, followed by the query
+        as 'Example 3', ending with 'Your output: '.
+    """
+    instructions = (
+        'You are an AI assistant who will help me to match '
+        'an answer with several options of a single-choice question. '
+        'You are provided with a question, several options, and an answer, '
+        'and you need to find which option is most similar to the answer. '
+        'If the meaning of all options are significantly different from the answer, output Z. '
+        'Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
+        'Example 1: \n'
+        'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
+        'Answer: a cute teddy bear\nYour output: A\n'
+        'Example 2: \n'
+        'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
+        'Answer: Spider\nYour output: Z\n'
+        'Example 3: \n'
+    )
+    # Only this last segment carries placeholders; the instructions above contain no braces.
+    query = 'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '.format(question, options, prediction)
+    return instructions + query
+
+
+def build_prompt_cn(question, options, prediction):
+    """Compose the Chinese prompt that asks a judge to map an answer onto an option letter.
+
+    Args:
+        question: Question text; a '?' is appended after it in the prompt.
+        options: Pre-formatted option string.
+        prediction: Free-form answer to be matched against the options.
+
+    Returns:
+        str: Fixed instructions and two worked examples, followed by the query
+        as the third example, ending with the output cue.
+    """
+    instructions = (
+        '你是一个帮助我匹配答案与单选题中多个选项的 AI 助手。'
+        '你会被提供：一个问题，多个选项，一个答案。你的任务是找到与答案意义最相近的选项。'
+        '如果所有选项的意义都与答案显著不同，则输出 Z。'
+        '你应该输出一个单个的大写字母，例如 A, B, C, D（如果它们是有效选项），或 Z。'
+        '例 1:'
+        '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 一只可爱的泰迪熊\n输出: A\n'
+        '例 2: \n'
+        '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 蜘蛛\n输出: Z\n'
+        '例 3: \n'
+    )
+    # Only this last segment carries placeholders; the instructions above contain no braces.
+    query = '问题: {}?\n选项: {}\n答案: {}\n输出: '.format(question, options, prediction)
+    return instructions + query
+
+
+def build_choices(item):
+    """Collect the option letters of `item` that have a value.
+
+    Args:
+        item: Mapping-like record; keys 'A'..'Z' are treated as option columns.
+
+    Returns:
+        dict: option letter -> option value, for every uppercase letter that is
+        present in `item` and whose value is not missing (`pd.isna`).
+    """
+    return {
+        letter: item[letter]
+        for letter in string.ascii_uppercase
+        if letter in item and not pd.isna(item[letter])
+    }
+
+
+def prefetch_answer(item):
+    """Try to read an option letter straight from `item['prediction']`, without a judge.
+
+    Returns:
+        Whatever `can_infer` returns for the prediction and the item's choices.
+    """
+    candidate_opts = build_choices(item)
+    raw_answer = item['prediction']
+    return can_infer(raw_answer, candidate_opts)
+
+
+def extract_answer_from_item(model, item):
+    """Map the free-form prediction of `item` onto one of its option letters.
+
+    The prediction is first matched directly with `can_infer`. If that fails,
+    the judge `model` is asked up to three times. If every attempt fails, an
+    option is drawn at random.
+
+    Args:
+        model: Judge exposing `generate(prompt) -> str`.
+        item: Record with `question`, `prediction` and the option columns.
+
+    Returns:
+        dict: `opt` is the chosen option, `log` is the text it was derived
+        from (or a note that the choice was random).
+    """
+    logger = get_logger('Evaluation')
+    choices = build_choices(item)
+
+    # Cheap path first: no prompt needs to be built when the prediction is already decidable.
+    ret = can_infer(item['prediction'], choices)
+    if ret:
+        return dict(opt=ret, log=item['prediction'])
+
+    option_str = build_option_str(choices)
+    if cn_string(item['question']):
+        prompt = build_prompt_cn(item['question'], option_str, item['prediction'])
+    else:
+        prompt = build_prompt(item['question'], option_str, item['prediction'])
+
+    for _ in range(3):
+        ans = model.generate(prompt)
+        if 'Failed to obtain answer via API' in ans:
+            logger.warning('GPT API failed to answer. ')
+            continue
+        ret = can_infer(ans, choices)
+        if ret:
+            return dict(opt=ret, log=ans)
+        logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
+
+    # Random fallback over the real options plus 'Z'. The previous one-line conditional
+    # bound as `(list(choices) + ['Z']) if ... else []`, leaving an empty list (and an
+    # IndexError in rd.choice) whenever 'Z' was itself one of the options.
+    options = list(choices)
+    if 'Z' not in choices:
+        options.append('Z')
+    return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ')
+
+
+def prefetch_sub_data(sub_data, answer_map, verbose=False):
+    """Try to settle a group of rows without a judge.
+
+    Each row's prediction is matched with `prefetch_answer` and compared with
+    the ground truth from `answer_map`.
+
+    Args:
+        sub_data: DataFrame-like group; rows are read through `.iloc` and need
+            `index` and `prediction`.
+        answer_map: Mapping from row index to ground-truth answer.
+        verbose: When True, also return the ground-truth and prediction lists.
+
+    Returns:
+        The outcome is `dict(hit=0, ...)` as soon as one row has a decidable
+        but wrong prediction, `dict(hit=1, ...)` when every row matches, and
+        None when at least one row is undecidable and none is wrong. With
+        `verbose=True` the return value is always `(outcome, GT, PRED)`; on an
+        early miss the lists cover only the rows inspected so far.
+    """
+    GT, PRED = [], []
+    for i in range(len(sub_data)):
+        item = sub_data.iloc[i]
+        GT.append(answer_map[item['index']])
+        PRED.append(prefetch_answer(item))
+        if PRED[-1] and (GT[-1] != PRED[-1]):
+            log = (
+                f'Failed in Prefetching Rolling {i}: Answer is {GT[-1]}, '
+                f"Prediction is {item['prediction']}, Pre-fetched is {PRED[-1]}. "
+            )
+            outcome = dict(hit=0, log=log)
+            # Keep the verbose return shape here too: callers unpack three values,
+            # and a bare dict would break that unpacking.
+            return (outcome, GT, PRED) if verbose else outcome
+
+    all_matched = all(g == p for g, p in zip(GT, PRED))
+    outcome = dict(hit=1, log='Succeed During Pre-fetching') if all_matched else None
+    return (outcome, GT, PRED) if verbose else outcome
+
+
+def eval_sub_data(model, sub_data, answer_map):
+    """Evaluate one group of rows, asking the judge only where prefetching is undecided.
+
+    Args:
+        model: Judge handed to `extract_answer_from_item`.
+        sub_data: DataFrame-like group of rows (read through `.iloc`).
+        answer_map: Mapping from row index to ground-truth answer.
+
+    Returns:
+        dict: `hit` is 1 only when every row of the group matches its ground
+        truth; `log` describes each row that was examined.
+    """
+    prefetched, GT, PRED = prefetch_sub_data(sub_data, answer_map, verbose=True)
+    if prefetched is not None:
+        return prefetched
+
+    log_parts = []
+    for i in range(len(sub_data)):
+        if PRED[i]:
+            log_parts.append(f'Rolling {i} Matched.\n')
+            continue
+
+        # Prefetching could not decide this row: fall back to the judge.
+        row = sub_data.iloc[i]
+        extracted = extract_answer_from_item(model, row)
+        PRED[i], match_log = extracted['opt'], extracted['log']
+        if PRED[i] == GT[i]:
+            log_parts.append(
+                f"Rolling {i}: Answer is {GT[i]}, Prediction is {row['prediction']}, "
+                f'Pre-fetched is {PRED[i]}.\n'
+            )
+            continue
+
+        # One wrong row fails the whole group; stop immediately.
+        log_parts.append(
+            f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {row['prediction']}; "
+            f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n'
+        )
+        return dict(hit=0, log=''.join(log_parts))
+
+    return dict(hit=1, log=''.join(log_parts))
+
+
+def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16):
+    """Evaluate every group in `data_groups` and persist the outcomes to `result_file`.
+
+    Args:
+        model: Judge handed to `eval_sub_data`; when None, groups that prefetching
+            cannot settle are recorded as misses.
+        data_groups: List of DataFrame-like groups (each is read through `.iloc`).
+        answer_map: Mapping from row index to ground-truth answer.
+        result: Dict of known outcomes; prefetch outcomes are added to it in place.
+        result_file: Path the outcome dict is dumped to.
+        nproc: Forwarded to `track_progress_rich` as both `nproc` and `chunksize`.
+
+    Returns:
+        None. After the judge run, the local name `result` is rebound to the dict
+        loaded from `result_file`, so judge outcomes reach the caller only through
+        that file, not through the dict passed in.
+    """
+    # First pass: settle whatever can be settled without the judge.
+    prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups]
+    remain = []
+    for dg, pf in zip(data_groups, prefetched):
+        if pf:
+            # `1e6` is a float literal, so these keys are floats; they hash and compare
+            # equal to the corresponding ints.
+            result[dg.iloc[0]['index'] % 1e6] = pf
+        else:
+            remain.append(dg)
+    dump(result, result_file)
+    tups = [(model, x, answer_map) for x in remain]
+    keys = [x.iloc[0]['index'] % 1e6 for x in remain]
+    if len(tups) == 0:
+        return
+
+    if model is None:
+        # No judge available: every undecided group counts as a miss.
+        logger = get_logger('Evaluation')
+        logger.warning('Exact Matching mode, will not do GPT-based answer matching. ')
+        for k in keys:
+            result[k] = dict(
+                hit=0, log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
+        dump(result, result_file)
+        return
+
+    # NOTE(review): `save=` / `keys=` presumably make track_progress_rich write partial
+    # results into result_file as it goes; confirm against its implementation.
+    res = track_progress_rich(
+        eval_sub_data,
+        tups,
+        nproc=nproc,
+        chunksize=nproc,
+        save=result_file,
+        keys=keys)
+    # Reload what is on disk and merge the returned outcomes, checking that entries
+    # already present agree with what was just computed.
+    result = load(result_file)
+    for k, v in zip(keys, res):
+        if k in result:
+            assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
+        else:
+            result[k] = v
+    dump(result, result_file)
+
+
+def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs):
+    """Evaluate multiple-choice predictions and report accuracy.
+
+    Rows whose `index` is below 1e6 are the main rows. Each main row is scored
+    together with every row whose `index` modulo 1e6 equals its index, and the
+    group counts as a hit only if all of its rows are answered correctly.
+
+    Files written next to `eval_file`: `*_{judge}_result.pkl` (per-group
+    outcomes, reused when it already exists), `*_{judge}_result.<suffix>`
+    (main rows with `hit` / `log` columns) and `*_acc.csv` (accuracy table).
+    For dataset names containing 'mmbench' or 'ccbench', `eval_file` itself is
+    rewritten with integer indices.
+
+    Args:
+        eval_file: Path of the prediction file; needs `index` and `prediction`.
+        dataset: Dataset name used to fetch metadata through `TSVDataset`.
+            With 'default', `eval_file` doubles as metadata and must contain
+            `index` and `answer`.
+        **judge_kwargs: Must contain `model` ('chatgpt-0613', 'exact_matching'
+            or 'gpt-4-0125'); `nproc` (default 4) sets the parallelism; what
+            remains is forwarded to `build_judge`.
+
+    Returns:
+        pandas.DataFrame: the accuracy table built by `report_acc`.
+
+    Raises:
+        AssertionError: on an unsupported judge model, missing metadata
+            columns, or a cached outcome whose `hit` is neither 0 nor 1.
+    """
+    # Function-scope import: the module itself does not import tempfile.
+    import tempfile
+
+    logger = get_logger('Evaluation')
+
+    # Map test-split dataset names onto the name used to look up metadata.
+    dataset_map = {
+        'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
+        'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
+    }
+    if dataset in dataset_map:
+        dataset = dataset_map[dataset]
+    nproc = judge_kwargs.pop('nproc', 4)
+
+    if listinstr(['mmbench', 'ccbench'], dataset.lower()):
+        # Normalise indices to int and write them back before anything else reads the file.
+        data = load(eval_file)
+        data['index'] = [int(x) for x in data['index']]
+        dump(data, eval_file)
+
+    # Fixed seed so the random fallback in answer extraction is reproducible.
+    rd.seed(2680)
+    suffix = eval_file.split('.')[-1]
+    model = judge_kwargs['model']
+    assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125']
+    name_str_map = {
+        'chatgpt-0613': 'openai',
+        'gpt-4-0125': 'gpt4'
+    }
+    name_str = name_str_map[model] if model in name_str_map else model
+
+    if model == 'exact_matching':
+        model = None
+    else:
+        if INTERNAL or gpt_key_set():
+            model = build_judge(**judge_kwargs)
+        else:
+            logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
+            model = None
+
+    logger.info(f'Evaluating {eval_file}')
+    result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
+    result = {}
+    if osp.exists(result_file):
+        # Resume from outcomes stored by an earlier run.
+        result = load(result_file)
+
+    data = load(eval_file)
+    data = data.sort_values(by='index')
+    data['prediction'] = [str(x) for x in data['prediction']]
+    # Lower-case every column name except the single-letter option columns.
+    # Iterate over a snapshot because the loop pops and re-inserts columns.
+    for k in list(data.keys()):
+        data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
+
+    if dataset != 'default':
+        meta = TSVDataset(dataset).data
+    else:
+        logger.warning('Dataset is not provided, try to use the original `eval_file` as meta data. ')
+        meta = load(eval_file)
+        assert 'index' in meta and 'answer' in meta, 'Essential columns missing in the eval_file.'
+
+    answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
+    cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None
+    l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None
+    split_map = {i: c for i, c in zip(meta['index'], meta['split'])} if 'split' in meta else None
+
+    # A metadata column that is entirely missing is treated as absent.
+    if cate_map is not None and np.all([pd.isna(x) for x in cate_map.values()]):
+        cate_map = None
+    if l2_cate_map is not None and np.all([pd.isna(x) for x in l2_cate_map.values()]):
+        l2_cate_map = None
+    if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]):
+        split_map = None
+
+    if listinstr(['MMMU'], dataset):
+        data = MMMU_preproc(data)
+        # Any answer that is not a single option letter becomes 'A', matching the
+        # rewrite MMMU_preproc applies to rows without an option A.
+        answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()}
+
+    data = data[data['index'].isin(answer_map)]
+    data_main = data[data['index'] < int(1e6)]
+    meta_idx_set = set(meta['index'])
+    data_main = data_main[data_main['index'].isin(meta_idx_set)]
+
+    lt = len(data_main)
+
+    # Collect the groups that still need evaluation; cached ones are only validated.
+    data_groups = []
+    for i in tqdm(range(lt)):
+        item_main = data_main.iloc[i]
+        idx = item_main['index']
+
+        if idx in result:
+            assert result[idx]['hit'] in [0, 1]
+            continue
+
+        sub_data = data[data['index'] % int(1e6) == idx]
+        data_groups.append(sub_data)
+
+    if len(data_groups):
+        eval_data_groups(
+            model=model,
+            data_groups=data_groups,
+            answer_map=answer_map,
+            nproc=nproc,
+            result=result,
+            result_file=result_file)
+
+    # Round-trip the main rows through an xlsx file (presumably to normalise the
+    # frame before new columns are attached). A private temporary directory
+    # replaces the hard-coded '/tmp' path: it is portable, cannot collide with a
+    # concurrent run, and is removed afterwards.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_pth = osp.join(tmp_dir, f'{timestr()}.xlsx')
+        dump(data_main, tmp_pth)
+        data_main = load(tmp_pth)
+
+    # The result file is the source of truth: eval_data_groups persists outcomes there.
+    res = load(result_file)
+    indices = data_main['index']
+
+    data_main['hit'] = [res[i]['hit'] for i in indices]
+    data_main['log'] = [res[i]['log'] for i in indices]
+
+    main_idx = data_main['index']
+    if cate_map is not None:
+        data_main['category'] = [cate_map[i] for i in main_idx]
+    if l2_cate_map is not None:
+        data_main['l2-category'] = [l2_cate_map[i] for i in main_idx]
+    if split_map is not None:
+        data_main['split'] = [split_map[i] for i in indices]
+
+    # Persist the per-item results, then reload them before computing accuracy.
+    dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
+    data_main = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
+
+    acc = report_acc(data_main)
+    score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+    dump(acc, score_file)
+    logger.info(f'multiple_choice_eval successfully finished evaluating {eval_file}, results saved in {score_file}')
+    logger.info('Score: ')
+    logger.info(acc)
+    return acc
+
+
+def parse_args():
+    """Read the command-line options of the multiple-choice evaluation script.
+
+    Returns:
+        argparse.Namespace: holds ``data`` (positional path), ``model``,
+        ``dataset``, ``nproc`` and ``verbose``.
+    """
+    judge_choices = ['chatgpt-0613', 'exact_matching', 'gpt-4-0125']
+    cli = argparse.ArgumentParser(description='Inference LLM Answers. ')
+    # Positional argument: the file to evaluate.
+    cli.add_argument('data', type=str, help='The question set for inference, in excel / tsv / json format. ')
+    cli.add_argument(
+        '--model', type=str, default='chatgpt-0613', choices=judge_choices,
+        help='The LLM (GPT) used for inference. ')
+    cli.add_argument('--dataset', type=str, default='default', help='The dataset to evaluate')
+    cli.add_argument('--nproc', type=int, default=6)
+    cli.add_argument('--verbose', action='store_true')
+    return cli.parse_args()
+
+
+if __name__ == '__main__':
+    # Script entry point: collect the judge settings from the CLI and the environment, then evaluate.
+    load_env()
+    args = parse_args()
+    judge_kwargs = {'model': args.model, 'nproc': args.nproc, 'verbose': args.verbose}
+    # An override is applied only when its variable is set to a non-empty value.
+    judge_key = os.environ.get('OPENAI_API_KEY_JUDGE')
+    if judge_key:
+        judge_kwargs['key'] = judge_key
+    judge_base = os.environ.get('OPENAI_API_BASE_JUDGE')
+    if judge_base:
+        judge_kwargs['api_base'] = judge_base
+    acc = multiple_choice_eval(eval_file=args.data, dataset=args.dataset, **judge_kwargs)
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/vqa_eval.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/vqa_eval.py
@@ -0,0 +1,340 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Partly adopted from https://github.com/GT-Vision-Lab/VQA
+# Copyright (c) 2014, Aishwarya Agrawal
+
+import re
+from vlmeval.smp import *
+from typing import Optional
+from functools import partial
+
+
+def _process_digit_article(inText):
+    """Normalise an answer string for VQA-style matching.
+
+    The text is lower-cased and split on whitespace. Then, in this order:
+    number words ('none', 'zero' .. 'ten') become digits, the articles
+    'a' / 'an' / 'the' are dropped, and apostrophe-less contractions are
+    replaced through the lookup table below.
+
+    Lookups happen on the lower-cased words, so table keys that contain
+    capital letters never match.
+
+    Args:
+        inText: Answer string.
+
+    Returns:
+        str: The normalised words joined by single spaces.
+    """
+    articles = ('a', 'an', 'the')
+    number_words = {
+        'none': '0',
+        'zero': '0',
+        'one': '1',
+        'two': '2',
+        'three': '3',
+        'four': '4',
+        'five': '5',
+        'six': '6',
+        'seven': '7',
+        'eight': '8',
+        'nine': '9',
+        'ten': '10',
+    }
+    contractions = {
+        'aint': "ain't",
+        'arent': "aren't",
+        'cant': "can't",
+        'couldve': "could've",
+        'couldnt': "couldn't",
+        "couldn'tve": "couldn't've",
+        "couldnt've": "couldn't've",
+        'didnt': "didn't",
+        'doesnt': "doesn't",
+        'dont': "don't",
+        'hadnt': "hadn't",
+        "hadnt've": "hadn't've",
+        "hadn'tve": "hadn't've",
+        'hasnt': "hasn't",
+        'havent': "haven't",
+        'hed': "he'd",
+        "hed've": "he'd've",
+        "he'dve": "he'd've",
+        'hes': "he's",
+        'howd': "how'd",
+        'howll': "how'll",
+        'hows': "how's",
+        "Id've": "I'd've",
+        "I'dve": "I'd've",
+        'Im': "I'm",
+        'Ive': "I've",
+        'isnt': "isn't",
+        'itd': "it'd",
+        "itd've": "it'd've",
+        "it'dve": "it'd've",
+        'itll': "it'll",
+        "let's": "let's",
+        'maam': "ma'am",
+        'mightnt': "mightn't",
+        "mightnt've": "mightn't've",
+        "mightn'tve": "mightn't've",
+        'mightve': "might've",
+        'mustnt': "mustn't",
+        'mustve': "must've",
+        'neednt': "needn't",
+        'notve': "not've",
+        'oclock': "o'clock",
+        'oughtnt': "oughtn't",
+        "ow's'at": "'ow's'at",
+        "'ows'at": "'ow's'at",
+        "'ow'sat": "'ow's'at",
+        'shant': "shan't",
+        "shed've": "she'd've",
+        "she'dve": "she'd've",
+        "she's": "she's",
+        'shouldve': "should've",
+        'shouldnt': "shouldn't",
+        "shouldnt've": "shouldn't've",
+        "shouldn'tve": "shouldn't've",
+        "somebody'd": 'somebodyd',
+        "somebodyd've": "somebody'd've",
+        "somebody'dve": "somebody'd've",
+        'somebodyll': "somebody'll",
+        'somebodys': "somebody's",
+        'someoned': "someone'd",
+        "someoned've": "someone'd've",
+        "someone'dve": "someone'd've",
+        'someonell': "someone'll",
+        'someones': "someone's",
+        'somethingd': "something'd",
+        "somethingd've": "something'd've",
+        "something'dve": "something'd've",
+        'somethingll': "something'll",
+        'thats': "that's",
+        'thered': "there'd",
+        "thered've": "there'd've",
+        "there'dve": "there'd've",
+        'therere': "there're",
+        'theres': "there's",
+        'theyd': "they'd",
+        "theyd've": "they'd've",
+        "they'dve": "they'd've",
+        'theyll': "they'll",
+        'theyre': "they're",
+        'theyve': "they've",
+        'twas': "'twas",
+        'wasnt': "wasn't",
+        "wed've": "we'd've",
+        "we'dve": "we'd've",
+        'weve': "we've",
+        'werent': "weren't",
+        'whatll': "what'll",
+        'whatre': "what're",
+        'whats': "what's",
+        'whatve': "what've",
+        'whens': "when's",
+        'whered': "where'd",
+        'wheres': "where's",
+        'whereve': "where've",
+        'whod': "who'd",
+        "whod've": "who'd've",
+        "who'dve": "who'd've",
+        'wholl': "who'll",
+        'whos': "who's",
+        'whove': "who've",
+        'whyll': "why'll",
+        'whyre': "why're",
+        'whys': "why's",
+        'wont': "won't",
+        'wouldve': "would've",
+        'wouldnt': "wouldn't",
+        "wouldnt've": "wouldn't've",
+        "wouldn'tve": "wouldn't've",
+        'yall': "y'all",
+        "yall'll": "y'all'll",
+        "y'allll": "y'all'll",
+        "yall'd've": "y'all'd've",
+        "y'alld've": "y'all'd've",
+        "y'all'dve": "y'all'd've",
+        'youd': "you'd",
+        "youd've": "you'd've",
+        "you'dve": "you'd've",
+        'youll': "you'll",
+        'youre': "you're",
+        'youve': "you've",
+    }
+    # Step 1: number words -> digits. Step 2: drop articles. Step 3: restore contractions.
+    tokens = [number_words.get(tok, tok) for tok in inText.lower().split()]
+    kept = [tok for tok in tokens if tok not in articles]
+    return ' '.join(contractions.get(tok, tok) for tok in kept)
+
+
+def hit_calculate(result, dataset_name, anls_threshold=0.5):
+    """Reduce each entry's `match` list to a single score.
+
+    The reduction depends on `dataset_name` (first match wins):
+      * TextVQA: mean of `match`.
+      * DocVQA / InfoVQA: `1 - min(match)`, set to 0.0 when it falls below
+        `anls_threshold`.
+      * ChartQA / OCRVQA: max of `match`.
+      * anything else: mean of `match`.
+
+    Args:
+        result: Iterable of dicts that each hold a `match` list.
+        dataset_name: Name used to select the reduction.
+        anls_threshold: Cut-off for the DocVQA / InfoVQA branch.
+
+    Returns:
+        list: one score per entry of `result`.
+    """
+    def _mean_match(entry):
+        return np.mean(entry['match'])
+
+    def _thresholded_similarity(entry):
+        similarity = 1 - np.min(entry['match'])
+        return 0.0 if similarity < anls_threshold else similarity
+
+    def _best_match(entry):
+        return np.max(entry['match'])
+
+    if listinstr(['TextVQA'], dataset_name):
+        reducer = _mean_match
+    elif listinstr(['DocVQA', 'InfoVQA'], dataset_name):
+        reducer = _thresholded_similarity
+    elif listinstr(['ChartQA', 'OCRVQA'], dataset_name):
+        reducer = _best_match
+    else:  # default using vqa_score to calculate score
+        reducer = _mean_match
+    return [reducer(entry) for entry in result]
+
+
+# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
+def relaxed_correctness(target: str,
+                        prediction: str,
+                        max_relative_change: float = 0.05) -> bool:
+    """Compare a prediction with a target, tolerating small numeric errors.
+
+    Both inputs are converted with `str`. When the prediction parses as a
+    number and the target parses as a non-zero number, the prediction is
+    accepted if its relative deviation from the target is at most
+    `max_relative_change`. A trailing '%' marks a percentage and the value is
+    divided by 100. In every other case the two strings must be equal,
+    ignoring case.
+
+    Args:
+      target: Target string.
+      prediction: Predicted string.
+      max_relative_change: Maximum relative change.
+
+    Returns:
+      Whether the prediction was correct given the specified tolerance.
+    """
+
+    def _parse_number(raw: str) -> Optional[float]:
+        is_percent = raw.endswith('%')
+        digits = raw.rstrip('%') if is_percent else raw
+        try:
+            value = float(digits)
+        except ValueError:
+            return None
+        # Convert percentages to fractions.
+        return value / 100.0 if is_percent else value
+
+    pred_text, gold_text = str(prediction), str(target)
+    pred_num = _parse_number(pred_text)
+    gold_num = _parse_number(gold_text)
+    # A target of zero (or a non-numeric side) cannot anchor a relative error.
+    if pred_num is None or not gold_num:
+        return pred_text.lower() == gold_text.lower()
+    return abs(pred_num - gold_num) / abs(gold_num) <= max_relative_change
+
+
+def levenshtein_distance(s1, s2):
+    """Return the edit distance between two sequences.
+
+    Only two rows of the dynamic-programming table are kept, and the shorter
+    input is used as the row so the rows stay as short as possible.
+
+    Args:
+        s1: First sequence.
+        s2: Second sequence.
+
+    Returns:
+        int: Number of single-element edits needed to turn one sequence into
+        the other.
+    """
+    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
+
+    prev_row = list(range(len(shorter) + 1))
+    for row_no, long_ch in enumerate(longer, start=1):
+        cur_row = [row_no]
+        for col, short_ch in enumerate(shorter):
+            if short_ch == long_ch:
+                cost = prev_row[col]
+            else:
+                # substitution, deletion, insertion
+                cost = 1 + min(prev_row[col], prev_row[col + 1], cur_row[col])
+            cur_row.append(cost)
+        prev_row = cur_row
+    return prev_row[-1]
+
+
+def anls_compute(groundtruth, prediction):
+    """Return the normalised edit distance between a ground truth and a prediction.
+
+    The distance is computed on lower-cased, whitespace-collapsed copies of
+    both strings. It is then divided by the longer length of the two inputs as
+    given (after upper-casing), not of the normalised copies.
+
+    Args:
+        groundtruth: Reference answer string.
+        prediction: Predicted answer string.
+
+    Returns:
+        float: distance / length, or 0.0 when both inputs are empty.
+    """
+    def _squash(text):
+        return ' '.join(text.strip().lower().split())
+
+    edit_dist = levenshtein_distance(_squash(groundtruth), _squash(prediction))
+    longest = max(len(groundtruth.upper()), len(prediction.upper()))
+    if longest == 0:
+        return 0.0
+    return float(edit_dist) / float(longest)
+
+
+def process_answer(answer):
+    """Normalise an answer string before VQA-score matching.
+
+    Newlines and tabs become spaces and the ends are stripped. The text then
+    goes through `process_punctuation` followed by `_process_digit_article`.
+
+    Args:
+        answer: Answer string.
+
+    Returns:
+        str: The normalised answer.
+    """
+    flattened = answer.replace('\n', ' ').replace('\t', ' ').strip()
+    return _process_digit_article(process_punctuation(flattened))
+
+
+def process_line(line, method='vqa_score'):
+    """Compare one prediction with its ground-truth answer(s).
+
+    Args:
+        line: Record with string fields `answer` and `prediction`. `answer` is
+            either a single answer or the textual form of a list of answers.
+        method: Matching rule:
+            'vqa_score': normalise both sides; each ground truth gets
+                min(1, n / 3), where n is the number of the *other* ground
+                truths equal to the prediction.
+            'anls': normalised edit distance per ground truth.
+            'relaxed_accuracy': `relaxed_correctness` per ground truth.
+            'accuracy': case-insensitive exact match after stripping.
+            anything else: normalise both sides and test equality.
+
+    Returns:
+        dict: `gt` (list of answers), `pred` (prediction) and `match` (one
+        value per ground truth).
+    """
+    ret = {}
+    # NOTE(review): presumably `istype` reports whether the string evaluates to a list.
+    if istype(line['answer'], list):
+        # NOTE(security): `eval` executes whatever the answer cell contains. Use this only
+        # on trusted result files; `ast.literal_eval` would be the safe choice if the cells
+        # are plain literals.
+        answers = eval(line['answer'])
+    else:
+        answers = [line['answer']]
+
+    if method == 'vqa_score':
+        ret['gt'] = [process_answer(x) for x in answers]
+        ret['pred'] = process_answer(line['prediction'])
+        ret['match'] = []
+        for current_idx in range(len(ret['gt'])):
+            # Leave-one-out: count agreement among the remaining ground truths.
+            others = [gt for gt_idx, gt in enumerate(ret['gt']) if gt_idx != current_idx]
+            n_agree = sum(1 for gt in others if gt == ret['pred'])
+            ret['match'].append(min(1, float(n_agree) / 3))
+    elif method == 'anls':
+        ret['gt'] = answers
+        ret['pred'] = line['prediction']
+        ret['match'] = [anls_compute(x, ret['pred']) for x in ret['gt']]
+    elif method == 'relaxed_accuracy':
+        ret['gt'] = answers
+        ret['pred'] = line['prediction'].strip()
+        # relaxed_correctness(target, prediction): the ground truth is the target, so the
+        # tolerance is measured relative to the gold value, not the prediction.
+        ret['match'] = [relaxed_correctness(x, ret['pred']) for x in ret['gt']]
+    elif method == 'accuracy':
+        ret['gt'] = answers
+        ret['pred'] = line['prediction'].strip()
+        ret['match'] = [(1.0 if (x.strip().lower() == ret['pred'].strip().lower()) else 0.0) for x in ret['gt']]
+    else:  # default using vqa_score to calculate score
+        ret['gt'] = [process_answer(x) for x in answers]
+        ret['pred'] = process_answer(line['prediction'])
+        ret['match'] = [x == ret['pred'] for x in ret['gt']]
+
+    return ret
+
+
+def VQAEval(eval_file, dataset_name, **kwargs):
+    """Score a VQA prediction file and write the accuracy table next to it.
+
+    Each row is matched in a worker pool with the rule selected by
+    `dataset_name`, and the matches are reduced with `hit_calculate`. The
+    table always has an `Overall` entry. It also has one entry per split when
+    the file has a `split` column, otherwise one per category when it has a
+    `category` column. Scores are percentages rounded to two decimals.
+
+    Args:
+        eval_file: Path of the prediction file; needs `answer` and `prediction`.
+        dataset_name: Dataset name; selects the matching rule and the reduction.
+        **kwargs: Accepted for call compatibility; not used.
+
+    Returns:
+        None. The table is written to `<eval_file stem>_acc.csv`.
+
+    Raises:
+        AssertionError: if `answer` or `prediction` is missing from the file.
+    """
+    logger = get_logger('Evaluation')
+    data = load(eval_file)
+    assert 'answer' in data and 'prediction' in data
+    data['prediction'] = [str(x) for x in data['prediction']]
+    data['answer'] = [str(x) for x in data['answer']]
+    lines = [data.iloc[i] for i in range(len(data))]
+
+    # Pick the per-line matching rule from the dataset name; the first match wins.
+    if listinstr(['TextVQA'], dataset_name):
+        method = 'vqa_score'
+    elif listinstr(['ChartQA'], dataset_name):
+        method = 'relaxed_accuracy'
+    elif listinstr(['OCRVQA'], dataset_name):
+        method = 'accuracy'
+    elif listinstr(['DocVQA', 'InfoVQA'], dataset_name):
+        method = 'anls'
+    else:  # default using vqa_score to calculate score
+        method = 'vqa_score'
+
+    # The context manager shuts the worker processes down even when a worker raises.
+    # `map` has returned every result by the time the block exits.
+    with mp.Pool(16) as pool:
+        res = pool.map(partial(process_line, method=method), lines)
+
+    def _percent(entries):
+        return np.mean(hit_calculate(entries, dataset_name)) * 100
+
+    ret = dict()
+    if 'split' in data:
+        for sp in set(data['split']):
+            ret[sp] = _percent([r for ln, r in zip(lines, res) if ln['split'] == sp])
+        ret['Overall'] = _percent(res)
+    else:
+        ret['Overall'] = _percent(res)
+        if 'category' in data:
+            for c in sorted(set(data['category'])):
+                ret[c] = _percent([r for ln, r in zip(lines, res) if ln['category'] == c])
+    ret = d2df(ret)
+    # `round` returns a new object, so the result has to be kept for the rounding to apply.
+    ret = ret.round(2)
+
+    suffix = eval_file.split('.')[-1]
+    result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+    # Write first, so the log message is only emitted once the file exists.
+    dump(ret, result_file)
+    logger.info(f'VQA Eval Finished. Saved to {result_file}. ')
+    logger.info(ret)
--- a/eval_mm/vlmevalkit/vlmeval/evaluate/yes_or_no.py
+++ b/eval_mm/vlmevalkit/vlmeval/evaluate/yes_or_no.py
@@ -0,0 +1,297 @@
+from vlmeval.evaluate.misc import build_judge
+from vlmeval.smp import *
+from vlmeval.utils import track_progress_rich
+
+INTERNAL = os.environ.get('INTERNAL', 0)  # NOTE(review): a set env var is a str, so even '0' is truthy
+
+
+def MME_rating(data_file):
+    """Aggregate MME scores from an evaluated result file.
+
+    Rows are grouped by 'category' and then by 'image_path'. A category's
+    score is its mean per-question score plus its mean per-image "plus"
+    score (product of the first two scores recorded for an image), each
+    scaled by 100. Returns d2df() of a dict holding the 'perception' and
+    'reasoning' sums followed by every per-category score.
+    """
+    table = load(data_file)
+    # category -> image_path -> scores in row order
+    grouped = defaultdict(dict)
+    for pos in range(len(table)):
+        row = table.iloc[pos]
+        grouped[row['category']].setdefault(row['image_path'], []).append(row['score'])
+
+    def acc(key, mode='normal'):
+        per_image = grouped[key].values()
+        if mode == 'normal':
+            values = [s for image_scores in per_image for s in image_scores]
+        elif mode == 'plus':
+            # Needs at least two scores per image; extra ones are ignored.
+            values = [image_scores[0] * image_scores[1] for image_scores in per_image]
+        else:
+            values = []
+        return np.mean(values) * 100
+
+    scores = {cate: acc(cate) + acc(cate, 'plus') for cate in grouped}
+
+    super_cates = dict(
+        perception=[
+            'OCR', 'artwork', 'celebrity', 'color', 'count', 'existence',
+            'landmark', 'position', 'posters', 'scene'
+        ],
+        reasoning=['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation']
+    )
+
+    # Every listed category must be present in the data, else KeyError.
+    ret = {
+        name: sum(scores[c] for c in members)
+        for name, members in super_cates.items()
+    }
+    ret.update(scores)
+    return d2df(ret)
+
+
+def Hallusion_rating(data_file):
+    """Compute HallusionBench aAcc / fAcc / qAcc tables.
+
+    'set_id', 'figure_id' and 'question_id' are taken from fields 3, 4 and 5
+    of each '_'-separated 'index' value. One row is produced for 'Overall',
+    then one per distinct 'category' and per distinct 'l2-category' (when
+    those columns exist). Returns a DataFrame with columns split, aAcc,
+    fAcc and qAcc, all accuracies in percent.
+    """
+    def grouped_all_correct(frame, id_column):
+        # Percentage of (l2-category, set_id, <id_column>) groups whose scores
+        # are all truthy.
+        buckets = defaultdict(list)
+        for pos in range(len(frame)):
+            row = frame.iloc[pos]
+            group_key = f"{row['l2-category']}_{row['set_id']}_{row[id_column]}"
+            buckets[group_key].append(row['score'])
+        return np.mean([np.all(scores) for scores in buckets.values()]) * 100
+
+    def calc_fAcc(data):
+        return grouped_all_correct(data, 'figure_id')
+
+    def calc_qAcc(data):
+        return grouped_all_correct(data, 'question_id')
+
+    def calc_aAcc(data):
+        return np.mean(data['score']) * 100
+
+    data = load(data_file)
+    index_fields = [idx.split('_') for idx in data['index']]
+    for field_no, column in ((3, 'set_id'), (4, 'figure_id'), (5, 'question_id')):
+        data[column] = [fields[field_no] for fields in index_fields]
+
+    def add_row(split_name, frame):
+        res['split'].append(split_name)
+        res['aAcc'].append(calc_aAcc(frame))
+        res['fAcc'].append(calc_fAcc(frame))
+        res['qAcc'].append(calc_qAcc(frame))
+
+    res = dict(split=[], aAcc=[], fAcc=[], qAcc=[])
+    add_row('Overall', data)
+
+    # Category order follows set iteration order, i.e. it is not sorted.
+    for column in ('category', 'l2-category'):
+        if column in data:
+            for c in set(data[column]):
+                add_row(c, data[data[column] == c])
+
+    ret = pd.DataFrame(res)
+    return ret
+
+
+def POPE_rating(data_file):
+    """Compute POPE F1 / accuracy / precision / recall, overall and per category.
+
+    Comma-separated 'category' values are split so that a row counts once for
+    each of its categories. 'Yes' is the positive class for both 'answer' and
+    'extracted'; any other value counts as negative. Returns a DataFrame
+    with columns split, Overall (F1), acc, precision and recall, in percent.
+    """
+    def cal_f1_score(y_true, y_pred):
+        actual_yes, predicted_yes = (y_true == 1), (y_pred == 1)
+        tp = sum(actual_yes & predicted_yes)
+        fp = sum((y_true == 0) & predicted_yes)
+        fn = sum(actual_yes & (y_pred == 0))
+        # Each ratio falls back to 0 when its denominator is 0.
+        precision = 0 if (tp + fp) == 0 else tp / (tp + fp)
+        recall = 0 if (tp + fn) == 0 else tp / (tp + fn)
+        total = precision + recall
+        f1_score = 0 if total == 0 else 2 * (precision * recall) / total
+        return f1_score, precision, recall
+
+    def add_row(split_name, frame):
+        y_true = np.array([1 if v == 'Yes' else 0 for v in frame['answer']])
+        y_pred = np.array([1 if v == 'Yes' else 0 for v in frame['extracted']])
+        f1_score, precision, recall = cal_f1_score(y_true, y_pred)
+        res['split'].append(split_name)
+        res['Overall'].append(f1_score * 100)
+        res['acc'].append(np.mean(frame['score']) * 100)
+        res['precision'].append(precision * 100)
+        res['recall'].append(recall * 100)
+
+    data = load(data_file)
+    data = data.assign(category=data['category'].str.split(',')).explode('category')
+    data['index'] = range(len(data))
+    res = dict(split=[], Overall=[], acc=[], precision=[], recall=[])
+    add_row('Overall', data)
+    if 'category' in data:
+        for c in [c for c in set(data['category']) if not pd.isna(c)]:
+            add_row(c, data[data['category'] == c])
+
+    return pd.DataFrame(res)
+
+
+def default_rating(data_file):
+    """Return overall and per-category mean 'score', in percent, via d2df().
+
+    Non-NaN labels of 'category' and then 'l2-category' (when the column is
+    present) each get one entry, visited in sorted order within a column.
+    A label occurring in both columns ends up with its 'l2-category' value,
+    because the later assignment overwrites the earlier one.
+    """
+    data = load(data_file)
+    res = {'Overall': np.mean(data['score']) * 100}
+
+    for column in ('category', 'l2-category'):
+        if column not in data:
+            continue
+        labels = [c for c in set(data[column]) if not pd.isna(c)]
+        for c in sorted(labels):
+            sub = data[data[column] == c]
+            res[c] = np.mean(sub['score']) * 100
+
+    return d2df(res)
+
+
+def YOrN_match_prompt(line):
+    """Build the Yes / No / Unknown matching prompt from line['question'] and line['prediction']."""
+    pieces = [
+        'You are an AI assistant who will help me to match an answer with two options of a question. '
+        'The options are only Yes / No. You are provided with a question and an answer, '
+        'and you need to find which option (Yes / No) is most similar to the answer. '
+        'If the meaning of all options are significantly different from the answer, output Unknown. '
+        'Your should output a single word among the following 3 choices: Yes, No, Unknown.\n',
+        # Two solved examples, then the open one to be completed by the judge.
+        "Example 1: \nQuestion: Is the word in this image 'Hello'?\n"
+        "Answer: The word in this image is 'Hello'.\nYour output: Yes\n",
+        "Example 2: \nQuestion: Is the word in this image 'Hello'?\n"
+        "Answer: The word in this image is not 'Hello'.\nYour output: No\n",
+        'Example 3: \n',
+        'Question: {}?\nAnswer: {}\nYour output: '.format(line['question'], line['prediction']),
+    ]
+    return ''.join(pieces)
+
+
+def YOrN_Extraction(output):
+    """Return 'Yes'/'No' if only that word is among the cleaned, lowercased tokens, else 'Unknown'."""
+    tokens = process_punctuation(output.lower()).split()
+    has_yes = 'yes' in tokens
+    has_no = 'no' in tokens
+    if has_yes == has_no:  # both words present, or neither: cannot decide
+        return 'Unknown'
+    return 'Yes' if has_yes else 'No'
+
+
+def YOrN_auxeval(model, line):
+    """Ask the judge up to 5 times (temperature 0, 0.5, ..., 2.0) and return its first Yes / No."""
+    prompt = YOrN_match_prompt(line)
+    for attempt in range(5):
+        reply = model.generate(prompt, temperature=0.5 * attempt)
+        verdict = YOrN_Extraction(reply)
+        if verdict != 'Unknown':
+            return verdict
+    return 'Unknown'
+
+
+def YOrN_eval(eval_file, dataset=None, **judge_kwargs):
+    """Score a Yes/No prediction file; return the rating table.
+
+    Predictions are mapped to Yes / No / Unknown by YOrN_Extraction; rows left
+    Unknown go to the judge model when one can be built. The extraction is
+    cached in the '_auxmatch.xlsx' file and reused if it already exists. The
+    rating function is chosen from `dataset` and its table is also dumped to
+    the '_score.csv' file. 'nproc' (default 4) is popped from judge_kwargs.
+    """
+    logger = get_logger('Evaluation')
+    data = load(eval_file)
+    data['prediction'] = [str(x) for x in data['prediction']]
+    storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
+    tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
+    nproc = judge_kwargs.pop('nproc', 4)
+
+    if osp.exists(storage):
+        logger.warning(f'GPT matching file {storage} already exists, will reuse it in YOrN_eval. ')
+    else:
+        ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
+        if osp.exists(tmp_file):
+            tmp = load(tmp_file)  # judge answers saved by track_progress_rich (save=tmp_file)
+            for k in tmp:
+                if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
+                    ans_map[k] = tmp[k]
+
+        data['extracted'] = [ans_map[x] for x in data['index']]
+        unknown = data[data['extracted'] == 'Unknown']
+
+        model = None
+        if INTERNAL or gpt_key_set():
+            model = build_judge(**judge_kwargs)
+        else:
+            logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
+
+        if model is not None:
+            indices = list(unknown['index'])
+            tups = [(model, unknown.iloc[i]) for i in range(len(unknown))]
+            if len(tups):
+                res = track_progress_rich(
+                    YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
+                ans_map.update(zip(indices, res))
+
+        data['extracted'] = [ans_map[x] for x in data['index']]
+        dump(data, storage)
+
+    data = load(storage)
+    data['score'] = (data['answer'] == data['extracted'])
+    dump(data, storage)
+
+    raters = dict(MME=MME_rating, Hallusion=Hallusion_rating, POPE=POPE_rating)
+    tag = next((k for k in raters if dataset is not None and listinstr([k], dataset)), None)
+    score = raters.get(tag, default_rating)(storage)
+
+    score_tgt = eval_file.replace('.xlsx', '_score.csv')
+    dump(score, score_tgt)
+
+    logger.info(f'YOrN_eval successfully finished evaluating {eval_file}, results saved in {score_tgt}')
+    logger.info('Score: ')
+    logger.info(score)
+    return score
+
+
+def parse_args():
+    """Parse the command line of the standalone evaluation script.
+
+    Takes positional `data` plus --model (only 'chatgpt-0613'), --nproc, --dataset, --verbose.
+    """
+    parser = argparse.ArgumentParser(description='Inference LLM Answers. ')
+    parser.add_argument('data', type=str, help='The question set for inference, in excel / tsv / json format. ')
+    parser.add_argument(
+        '--model', type=str, default='chatgpt-0613', choices=['chatgpt-0613'],
+        help='The LLM (GPT) used for inference. ')
+    parser.add_argument('--nproc', type=int, default=4)
+    parser.add_argument('--dataset', type=str, default=None)
+    parser.add_argument('--verbose', action='store_true')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    load_env()
+    args = parse_args()
+    judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose)
+    # Optional judge overrides: forwarded only when set to a non-empty value.
+    for env_name, kwarg in (('OPENAI_API_KEY_JUDGE', 'key'), ('OPENAI_API_BASE_JUDGE', 'api_base')):
+        if os.environ.get(env_name):
+            judge_kwargs[kwarg] = os.environ[env_name]
+    acc = YOrN_eval(eval_file=args.data, dataset=args.dataset, **judge_kwargs)