Modify eval_mm for MiniCPM-V 2.6

Haoyu Li
2024-08-30 18:18:22 +00:00
parent ab1141ee45
commit 59224808a1
69 changed files with 8231 additions and 1818 deletions


@@ -0,0 +1,186 @@
import warnings
from .image_base import img_root_map, ImageBaseDataset
from .image_caption import ImageCaptionDataset
from .image_yorn import ImageYORNDataset
from .image_mcq import ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset
from .image_mt import MMDUDataset
from .image_vqa import (
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, CustomVQADataset
)
from .vcr import VCRDataset
from .mmlongbench import MMLongBench
from .dude import DUDE
from .slidevqa import SlideVQA
from .mmbench_video import MMBenchVideo
from .text_mcq import CustomTextMCQDataset, TextMCQDataset
from .videomme import VideoMME
from .mvbench import MVBench, MVBench_MP4
from .utils import *
from ..smp import *
class ConcatDataset(ImageBaseDataset):
    # This dataset takes multiple dataset names as input and aggregates them into a single dataset.
# Each single dataset should not have a field named `SUB_DATASET`
DATASET_SETS = {
'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'],
'MTL_MMBench_DEV': [
'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en',
'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr'
]
}
def __init__(self, dataset):
datasets = self.DATASET_SETS[dataset]
self.dataset_map = {}
        # The name of the compilation
self.dataset_name = dataset
self.datasets = datasets
for dname in datasets:
dataset = build_dataset(dname)
assert dataset is not None, dataset
self.dataset_map[dname] = dataset
TYPES = [x.TYPE for x in self.dataset_map.values()]
MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
self.TYPE = TYPES[0]
self.MODALITY = MODALITIES[0]
data_all = []
for dname in datasets:
data = self.dataset_map[dname].data
data['SUB_DATASET'] = [dname] * len(data)
data_new = localize_df(data, dname, nproc=16)
data_all.append(data_new)
data = pd.concat(data_all)
data['original_index'] = data.pop('index')
data['index'] = np.arange(len(data))
self.data = data
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
idx = line['original_index']
dname = line['SUB_DATASET']
org_data = self.dataset_map[dname].data
org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
return self.dataset_map[dname].build_prompt(org_line)
def dump_image(self, line):
# Assert all images are pre-dumped
assert 'image' not in line
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_SETS)
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
data_sub = data_all[data_all['SUB_DATASET'] == dname]
data_sub.pop('index')
data_sub['index'] = data_sub.pop('original_index')
data_sub.pop('SUB_DATASET')
dump(data_sub, tgt)
# Then, evaluate each dataset separately
results_all = []
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
assert isinstance(res, pd.DataFrame)
res['DATASET'] = [dname] * len(res)
results_all.append(res)
result = pd.concat(results_all)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(result, score_file)
return result
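
For reference, a minimal sketch of how this aggregation class might be driven, assuming VLMEvalKit is installed, the class is exposed as vlmeval.dataset.ConcatDataset, and the constituent sub-dataset TSVs can be fetched:

# Illustrative sketch (not part of this commit): drive ConcatDataset directly.
# Assumes VLMEvalKit is installed and the sub-dataset TSVs are reachable.
from vlmeval.dataset import ConcatDataset

mmmb = ConcatDataset('MMMB')                # aggregates MMMB_ar ... MMMB_tr
print(mmmb.TYPE, mmmb.MODALITY, len(mmmb))  # shared TYPE/MODALITY, total rows
row = mmmb.data.iloc[0]
print(row['SUB_DATASET'], row['original_index'], row['index'])
msgs = mmmb.build_prompt(0)                 # delegates to the owning sub-dataset
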
# Add new supported dataset class here
IMAGE_DATASET = [
ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset,
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, GMAIMMBenchDataset
]
VIDEO_DATASET = [
MMBenchVideo, VideoMME, MVBench, MVBench_MP4
]
TEXT_DATASET = [
TextMCQDataset
]
CUSTOM_DATASET = [
CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset
]
DATASET_COLLECTION = [ConcatDataset]
DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION
SUPPORTED_DATASETS = []
for DATASET_CLS in DATASET_CLASSES:
SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets())
def DATASET_TYPE(dataset):
for cls in DATASET_CLASSES:
if dataset in cls.supported_datasets():
if hasattr(cls, 'TYPE'):
return cls.TYPE
# Have to add specific routine to handle ConcatDataset
if dataset in ConcatDataset.DATASET_SETS:
dataset_list = ConcatDataset.DATASET_SETS[dataset]
TYPES = [DATASET_TYPE(dname) for dname in dataset_list]
assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES)
return TYPES[0]
if 'openended' in dataset.lower():
return 'VQA'
warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as MCQ. ')
return 'MCQ'
def build_dataset(dataset_name, **kwargs):
for cls in DATASET_CLASSES:
if dataset_name in cls.supported_datasets():
return cls(dataset=dataset_name, **kwargs)
warnings.warn(f'Dataset {dataset_name} is not officially supported. ')
data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
if not osp.exists(data_file):
warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ')
return None
data = load(data_file)
if 'question' not in [x.lower() for x in data.columns]:
warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ')
return None
if 'A' in data and 'B' in data:
if 'image' in data or 'image_path' in data:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ')
return CustomMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ')
return CustomTextMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. ')
return CustomVQADataset(dataset=dataset_name, **kwargs)
__all__ = [
'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
] + [cls.__name__ for cls in DATASET_CLASSES]
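
The fallback branch of build_dataset above constructs unsupported datasets from a `<name>.tsv` placed under LMUDataRoot(). Below is a hedged sketch of a minimal TSV that would be routed to CustomMCQDataset (it has a `question` column, choice columns `A`/`B`, and an `image` column); the file name and contents are hypothetical.

# Illustrative sketch (not part of this commit): a minimal custom TSV that the
# fallback above would route to CustomMCQDataset. Name and values are hypothetical.
import pandas as pd

df = pd.DataFrame({
    'index': [0, 1],
    'image': ['<base64-encoded image>', '<base64-encoded image>'],
    'question': ['What animal is shown?', 'What color is the car?'],
    'A': ['cat', 'red'],
    'B': ['dog', 'blue'],
    'answer': ['A', 'B'],
})
df.to_csv('MyCustomMCQ.tsv', sep='\t', index=False)  # place under LMUDataRoot()
# build_dataset('MyCustomMCQ') would then return a CustomMCQDataset instance.
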


@@ -0,0 +1,210 @@
import math
from typing import List
from .utils.judge_util import build_judge
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
from ..smp import *
FAIL_MSG = 'Failed to obtain answer via API.'
def DUDE_acc(result_file):
data = load(result_file)
overall_score = 0.0
score_list = list()
for i in range(len(data)):
item = data.iloc[i]
if isinstance(item['answer'], float) and math.isnan(item['answer']):
item['answer'] = 'Not answerable'
item['answer'] = item['answer'].lower()
item['pred'] = item['pred'].lower()
score = anls_compute(item['answer'], item['pred'])
score_list.append(score)
overall_score += score
data['score'] = score_list
dump(data, result_file)
res = dict()
res['category'], res['num'], res['avg_score'] = ['anls'], [len(data)], [overall_score / len(data)]
res = pd.DataFrame(res)
return res
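
anls_compute is imported from .mmlongbench above. For reference, here is a self-contained sketch of the standard ANLS formulation (1 minus the normalized Levenshtein distance, zeroed below a 0.5 threshold); the helper actually used may differ in details such as thresholding or normalization.

# Illustrative sketch (not part of this commit): the standard ANLS metric,
# shown for reference only. The evaluation above uses anls_compute from .mmlongbench.
def normalized_levenshtein(s1: str, s2: str) -> float:
    """Edit distance divided by the length of the longer string."""
    if not s1 and not s2:
        return 0.0
    m, n = len(s1), len(s2)
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        cur = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            cur[j] = min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + cost)
        prev = cur
    return prev[n] / max(m, n)

def anls(answer: str, prediction: str, threshold: float = 0.5) -> float:
    """ANLS: 1 - NL(answer, prediction), zeroed out below the threshold."""
    nl = normalized_levenshtein(answer.lower().strip(), prediction.lower().strip())
    score = 1.0 - nl
    return score if score >= threshold else 0.0
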
class DUDE(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'DUDE': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE.tsv',
'DUDE_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE_MINI.tsv',
}
DATASET_MD5 = {
'DUDE': '130d860d08206e1e407cd77150c10d88',
'DUDE_MINI': 'e0c0d998114f0cca7516d12039d2b538',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'XComposer2d5': (1, -1),
'XComposer2_4KHD': (1, -1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
}
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on DUDE.".format(model_name))
super(DUDE, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
return load(data_path)
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
try:
import fitz
        except ImportError:
warnings.warn('Please use `pip install pymupdf` to parse PDF files.')
line = origin_line.copy()
if not isinstance(line['image_path'], List):
line['image_path'] = [line['image_path']]
line['image_path'] = line['image_path'][:self.max_pages]
skip_pdf_parse = True
for im_name in line['image_path']:
path = osp.join(self.img_root, im_name)
if not read_ok(path):
skip_pdf_parse = False
break
        # Just for compatibility with the zip loop below: zip(line['image'], line['image_path'])
if skip_pdf_parse:
line['image'] = line['image_path']
else:
pdf_data = base64.b64decode(line['image'])
pdf_file = io.BytesIO(pdf_data)
encoded_images = []
with fitz.open(stream=pdf_file, filetype='pdf') as doc:
doc = doc[:self.max_pages]
for page in doc:
image = page.get_pixmap(dpi=144)
image_file = io.BytesIO(image.tobytes(output='png'))
image = Image.open(image_file)
encoded_image = encode_image_to_base64(image)
encoded_images.append(encoded_image)
line['image'] = encoded_images
print('process {}'.format(line['doc_id']))
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = DUDE_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)
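
The non-API branch of dump_image above renders PDF pages with PyMuPDF. Below is a standalone sketch of that rendering path, assuming a recent PyMuPDF release that supports page.get_pixmap(dpi=...); the input file name is hypothetical.

# Illustrative sketch (not part of this commit): rendering PDF pages to PIL
# images with PyMuPDF, mirroring the branch above. 'sample.pdf' is hypothetical.
import io
import fitz  # pip install pymupdf
from PIL import Image

pages = []
with fitz.open('sample.pdf') as doc:
    for page in doc:
        pix = page.get_pixmap(dpi=144)                       # rasterize at 144 dpi
        pages.append(Image.open(io.BytesIO(pix.tobytes(output='png'))))
print(f'rendered {len(pages)} pages')
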


@@ -0,0 +1,165 @@
import pandas as pd
from abc import abstractmethod
from ..smp import *
def img_root_map(dataset):
if 'OCRVQA' in dataset:
return 'OCRVQA'
if 'COCO_VAL' == dataset:
return 'COCO'
if 'MMMU' in dataset:
return 'MMMU'
mmbench_root_map = {
'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
}
if dataset in mmbench_root_map:
return mmbench_root_map[dataset]
return dataset
class ImageBaseDataset:
MODALITY = 'IMAGE'
DATASET_URL = {}
DATASET_MD5 = {}
def __init__(self, dataset='MMBench', skip_noimg=True):
ROOT = LMUDataRoot()
# You can override this variable to save image files to a different directory
self.dataset_name = dataset
self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))
data = self.load_data(dataset)
self.skip_noimg = skip_noimg
if skip_noimg and 'image' in data:
data = data[~pd.isna(data['image'])]
data['index'] = [str(x) for x in data['index']]
self.meta_only = True
# The image field can store the base64 encoded image or another question index (for saving space)
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
self.meta_only = False
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
if np.all([istype(x, int) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
self.data = data
self.post_build(dataset)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return dict(self.data.iloc[idx])
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
update_flag = True
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def display(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
assert isinstance(line, pd.Series) or isinstance(line, dict)
mmqa_display(line)
# Return a list of dataset names that are supported by this class, can override
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_URL)
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
url = self.DATASET_URL[dataset]
file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
return self.prepare_tsv(url, file_md5)
    # Post-build hook, called after the dataset is built; can override
def post_build(self, dataset):
pass
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
pass
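
To show how the pieces above fit together, here is a hedged sketch of a minimal subclass. The dataset name, URL, and containment-based scoring rule are hypothetical; the sketch assumes it lives alongside ImageBaseDataset in this module, so load and pd come from the wildcard ..smp import at the top of the file.

# Illustrative sketch (not part of this commit): the minimal surface a new image
# dataset needs on top of ImageBaseDataset. URL and scoring rule are hypothetical.
class MyToyDataset(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'MyToy': 'https://example.com/MyToy.tsv'}   # hypothetical URL

    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)
        hit = [str(a).strip().lower() in str(p).strip().lower()
               for a, p in zip(data['answer'], data['prediction'])]
        return pd.DataFrame({'metric': ['contains_answer'],
                             'acc': [100 * sum(hit) / len(hit)]})
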


@@ -0,0 +1,75 @@
from .image_base import ImageBaseDataset
from ..smp import *
class COCO_Caption_Scorer():
def __init__(self, ref, gt):
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
self.ref = ref
self.gt = gt
print('setting up scorers...')
self.scorers = [
(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
(Rouge(), 'ROUGE_L'),
(Cider(), 'CIDEr'),
]
def compute_scores(self):
total_scores = {}
for scorer, method in self.scorers:
print('computing %s score...' % (scorer.method()))
score, scores = scorer.compute_score(self.gt, self.ref)
if isinstance(method, list):
for sc, scs, m in zip(score, scores, method):
print('%s: %0.3f' % (m, sc * 100))
total_scores['Bleu'] = [x * 100 for x in score]
else:
print('%s: %0.3f' % (method, score * 100))
total_scores[method] = score * 100
print('*****DONE*****')
for key, value in total_scores.items():
print('{}:{}'.format(key, value))
return total_scores
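
A toy usage of the scorer above, assuming pycocoevalcap is installed. Both dicts map a sample id to a list of strings; the values are hypothetical.

# Illustrative sketch (not part of this commit): scoring one toy prediction.
ref = {'0': ['a dog runs on the beach']}            # model predictions
gt = {'0': ['a dog is running along the beach',     # reference captions
            'a brown dog runs near the sea']}
scores = COCO_Caption_Scorer(ref, gt).compute_scores()
print(scores['Bleu'], scores['ROUGE_L'], scores['CIDEr'])
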
class ImageCaptionDataset(ImageBaseDataset):
TYPE = 'Caption'
DATASET_URL = {
'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
}
DATASET_MD5 = {
'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
}
def load_data(self, dataset):
data = super().load_data(dataset)
if 'question' not in data:
data['question'] = [(
'Please describe this image in general. Directly provide the description, '
'do not include prefix like "This image depicts". '
)] * len(data)
return data
# It returns a dictionary of scores
@classmethod
def evaluate(self, eval_file, **kwargs):
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
ref, gt = {}, {}
for i, line in enumerate(lines):
ref[str(i)] = [str(line['prediction'])]
gt[str(i)] = eval(line['answer'])
scorer = COCO_Caption_Scorer(ref, gt)
coco_caption_score_dict = scorer.compute_scores()
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(coco_caption_score_dict, score_pth)
return coco_caption_score_dict


@@ -0,0 +1,484 @@
import warnings
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
MMMB_URLS = {
'MMMB_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv',
'MMMB_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_cn.tsv',
'MMMB_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_en.tsv',
'MMMB_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_pt.tsv',
'MMMB_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ru.tsv',
'MMMB_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_tr.tsv',
}
MTL_MMBench_URLS = {
'MMBench_dev_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ar.tsv',
'MMBench_dev_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_cn.tsv',
'MMBench_dev_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_en.tsv',
'MMBench_dev_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_pt.tsv',
'MMBench_dev_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_tr.tsv',
'MMBench_dev_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ru.tsv',
}
MMMB_MD5 = {
'MMMB_ar': 'f3a18b6385f1d9701840aa42de27aead', 'MMMB_cn': '13ed82fa89730037292fcaa27f08f430',
'MMMB_en': '1cd781a71ec5a2983c090b84105d6a01', 'MMMB_pt': '548ea2b3bb2da991790386f0015d30d1',
'MMMB_ru': 'ce1cc8a0533425ab0d86b326ebfc2984', 'MMMB_tr': '0733739d43090327975294292bc5cd67'
}
MTL_MMBench_MD5 = {
'MMBench_dev_ar': '4271b4a0d0200e1a86380a878e0d64a4', 'MMBench_dev_cn': '2ed5135326fed02c8e51ea50dda8222f',
'MMBench_dev_en': 'd9ab776fc018b3d45785e9a5c23431c2', 'MMBench_dev_pt': '4ddfbcd27ef12444b908c03831cd0295',
'MMBench_dev_tr': '4fab39d501389d3d6cc90264bb708f11', 'MMBench_dev_ru': '5ba1171ff2e68f80637bf78349e402a5'
}
class ImageMCQDataset(ImageBaseDataset):
TYPE = 'MCQ'
DATASET_URL = {
# MMBench v1.0
'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN.tsv',
'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN.tsv',
'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN.tsv',
'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN.tsv',
'MMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench.tsv', # Internal Only
'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv', # Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN_V11.tsv',
'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN_V11.tsv',
'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN_V11.tsv',
'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN_V11.tsv',
'MMBench_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_V11.tsv', # Internal Only
'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv', # Internal Only
# SEEDBench Series
'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench_IMG.tsv',
'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench2_Plus.tsv',
# ScienceQA Series
'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_VAL.tsv',
'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_TEST.tsv',
# MMT-Bench
'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL_MI.tsv',
'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL.tsv',
'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL_MI.tsv',
'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL.tsv',
# AesBench
'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv',
'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv',
# Q-Bench1
'Q-Bench1_VAL': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv',
'Q-Bench1_TEST': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv',
# A-Bench
'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv',
'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv',
# Other Benchmarks
'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv',
'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv',
'AI2D_TEST_NO_MASK': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv',
'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv',
'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv',
'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv',
'TaskMeAnything_v1_imageqa_random': (
'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
),
'A-OKVQA': 'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv'
}
DATASET_MD5 = {
# MMBench v1.0
'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8',
'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528',
'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
'MMBench': '4115aea3383f3dd0083be6a633e0f820', # Internal Only
'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c', # Internal Only
'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only
# SEEDBench
'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
'SEEDBench2_Plus': 'e32d3216dc4f452b0fe497a52015d1fd',
# ScienceQA
'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3',
'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f',
# MMT-Bench
'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7',
'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f',
'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462',
'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0',
# AesBench
'AesBench_VAL': '3edb0c319e9187aa0b97fe7a11700a8c',
'AesBench_TEST': '58b1f7ba2cc32e1d68896d6ee716bbf8',
# Q-Bench1
'Q-Bench1_VAL': '837bdb6cd2da571713543462815187b7',
'Q-Bench1_TEST': '15e759bfd58c9d5f30b23a317d347153',
# A-Bench
'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1',
'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c',
# Other Benchmarks
'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac',
'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
'RealWorldQA': '92321028d2bc29040284b6674721e48f',
'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
'BLINK': '3b6649b6a662184ea046908e5506260e',
'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889'
}
DATASET_URL.update(MMMB_URLS)
DATASET_URL.update(MTL_MMBench_URLS)
DATASET_MD5.update(MMMB_MD5)
DATASET_MD5.update(MTL_MMBench_MD5)
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
# assert dataset is not None
dataset_map = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
}
dataset = self.dataset_name
if dataset in dataset_map:
dataset = dataset_map[dataset]
nproc = judge_kwargs.pop('nproc', 4)
circular = False
if listinstr(['mmbench', 'ccbench'], dataset.lower()):
data = load(eval_file)
data['index'] = [int(x) for x in data['index']]
dump(data, eval_file)
circular = True
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
if circular:
data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
else:
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
# May have different report acc functions for different datasets
if 'MMT' in dataset:
acc = report_acc_MMT(data)
else:
acc = report_acc(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
if dataset == 'AesBench_VAL':
warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \
larger than the VAL dataset and the leaderboard results are based on AesBench TEST.')
return acc
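
For orientation, the text part of the message assembled by build_prompt above looks roughly as follows for a hypothetical record with a hint and four options; the full return value is one image entry per image path followed by a single text entry.

# Illustrative sketch (not part of this commit): the MCQ prompt text built by
# ImageMCQDataset.build_prompt for a hypothetical record with hint and options A-D.
hint = 'The image shows a traffic scene.'
question = 'What color is the traffic light?'
options = {'A': 'Red', 'B': 'Green', 'C': 'Yellow', 'D': 'Off'}

prompt = f'Hint: {hint}\n' if hint else ''
prompt += f'Question: {question}\n'
prompt += 'Options:\n' + ''.join(f'{k}. {v}\n' for k, v in options.items())
prompt += 'Please select the correct answer from the options above. \n'
print(prompt)
# build_prompt returns [dict(type='image', value=...), ..., dict(type='text', value=prompt)].
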
class MMMUDataset(ImageMCQDataset):
DATASET_URL = {
'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
}
DATASET_MD5 = {
'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d',
'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
}
@staticmethod
def split_MMMU(msgs):
text, images = None, []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None
text = s['value']
text_segs = text.split('<image ')
if len(text_segs) == 1:
return msgs
segs = [dict(type='text', value=text_segs[0])]
for i, seg in enumerate(text_segs):
if i == 0:
continue
assert istype(seg[0], int) and seg[1] == '>'
image_idx = int(seg[0]) - 1
segs.append(dict(type='image', value=images[image_idx]))
segs.append(dict(type='text', value=seg[2:]))
return segs
def build_prompt(self, line):
msgs = super().build_prompt(line)
msgs = self.split_MMMU(msgs)
return msgs
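
A concrete trace of split_MMMU above on a hypothetical two-image message, showing how `<image k>` placeholders are turned into interleaved text/image segments.

# Illustrative sketch (not part of this commit): tracing MMMUDataset.split_MMMU
# on hypothetical inputs.
msgs = [
    dict(type='image', value='img1.jpg'),
    dict(type='image', value='img2.jpg'),
    dict(type='text', value='Compare <image 1> with <image 2>. Which is larger?'),
]
segs = MMMUDataset.split_MMMU(msgs)
# segs == [
#   {'type': 'text',  'value': 'Compare '},
#   {'type': 'image', 'value': 'img1.jpg'},
#   {'type': 'text',  'value': ' with '},
#   {'type': 'image', 'value': 'img2.jpg'},
#   {'type': 'text',  'value': '. Which is larger?'},
# ]
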
class MUIRDataset(ImageMCQDataset):
DATASET_URL = {
'MUIRBench': 'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
}
DATASET_MD5 = {
'MUIRBench': '2e5e6fd7699761b08a7cb3ab8c0c2ec8'
}
@staticmethod
def split_MUIR(msgs):
text, images = None, []
# Separate images and text from msgs
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None # Ensure only one text entry is expected
text = s['value']
# Split text by <image> tags
text_segs = text.split('<image>')
# Initialize the segments list
segs = []
# Iterate through the text segments and images
for i, seg in enumerate(text_segs):
# Append the image if this is not the first segment and there are still images left
if i > 0 and i - 1 < len(images):
segs.append(dict(type='image', value=images[i - 1]))
# Append the text segment (if it's non-empty)
if len(seg) > 0:
segs.append(dict(type='text', value=seg))
return segs
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
# options_prompt = ''
options_prompt = '\n'.join([f'{key}. {item}' for key, item in options.items()])
# for key, item in options.items():
# options_prompt += f'{key}. {item}\n'
prompt = ''
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
msgs = self.split_MUIR(msgs)
return msgs
class GMAIMMBenchDataset(ImageMCQDataset):
DATASET_URL = {
'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv'
}
DATASET_MD5 = {
'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324'
}
def report_acc_by_groups(self, df, group_column):
res = defaultdict(list)
# Check for the 'split' column
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
if group_column not in df:
raise ValueError(f"Column '{group_column}' not found in dataframe.")
abilities = list(set(df[group_column]))
abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities]
abilities.sort()
for ab in abilities:
ab_name = ab
sub_df = df[df[group_column] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
return pd.DataFrame(res)
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, mcq_vanilla_eval
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc(data)
for group_col in ['clinical vqa task', 'department', 'perceptual granularity']:
acc_grouped = self.report_acc_by_groups(data, group_col)
score_file_grouped = eval_file.replace(f'.{suffix}', f'_{group_col}_acc.csv')
dump(acc_grouped, score_file_grouped)
return acc
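
A small trace of report_acc_by_groups above on a hypothetical result frame. There is no `split` column, so the method files everything under the `none` split.

# Illustrative sketch (not part of this commit): grouped accuracy on toy data.
import pandas as pd

toy = pd.DataFrame({
    'hit': [1, 0, 1, 1],
    'department': ['Radiology', 'Radiology', 'Pathology', 'Pathology'],
})
# For any GMAIMMBenchDataset instance `dataset`,
# dataset.report_acc_by_groups(toy, 'department') returns roughly:
#   split  Overall  Pathology  Radiology
#   none      0.75        1.0        0.5
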
class CustomMCQDataset(ImageMCQDataset):
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)


@@ -0,0 +1,128 @@
from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
from ..utils import track_progress_rich
class ImageMTDataset(ImageBaseDataset):
TYPE = 'MT'
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
questions = toliststr(line['question'])
if 'answer' in line:
answers = toliststr(line['answer'])
else:
answers = [''] * len(questions)
assert len(questions) == len(answers)
dlgs, pics_number = [], 0
for i in range(len(questions)):
q, a = questions[i], answers[i]
if '<ImageHere>' in q:
content = []
tag_number = q.count('<ImageHere>')
images = tgt_path[pics_number: pics_number + tag_number]
pics_number += tag_number
q_split = q.split('<ImageHere>')
for i in range(tag_number):
qsp, im = q_split[i], images[i]
if qsp != '':
content.append(dict(type='text', value=qsp))
content.append(dict(type='image', value=im))
if q_split[-1] != '':
content.append(dict(type='text', value=q_split[-1]))
else:
content = [dict(type='text', value=q)]
dlgs.append(dict(role='user', content=content))
assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
content = [dict(type='text', value=a)]
dlgs.append(dict(role='assistant', content=content))
return dlgs
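
For orientation, the dialog structure this method produces for a hypothetical two-turn record (one `<ImageHere>` tag in the first question) is sketched below; all values are made up.

# Illustrative sketch (not part of this commit): the dialog built by
# ImageMTDataset.build_prompt for a hypothetical record with
#   questions = ['<ImageHere>What is in the picture?', 'And what color is it?']
#   answers   = ['A bicycle.', '']
#   image paths = ['0_0.jpg']
expected_dialog = [
    dict(role='user', content=[dict(type='image', value='0_0.jpg'),
                               dict(type='text', value='What is in the picture?')]),
    dict(role='assistant', content=[dict(type='text', value='A bicycle.')]),
    dict(role='user', content=[dict(type='text', value='And what color is it?')]),
    dict(role='assistant', content=[dict(type='text', value='')]),
]
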
class MMDUDataset(ImageMTDataset):
DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
DIMS = [
'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
]
def calculat_metric(self, ans):
all = defaultdict(lambda: 0)
tot = defaultdict(lambda: 0)
valid = defaultdict(lambda: 0)
for k in ans:
res = ans[k]['res']
assert isinstance(res, pd.DataFrame)
lt = len(res)
for i in range(lt):
line = res.iloc[i]
for k in self.DIMS:
tot[k] += 1
if k in line and line[k] is not None:
try:
score = int(line[k])
score = np.clip(score, 0, 10)
all[k] += score
valid[k] += 1
except Exception as e:
print(f'Failed to parse the score: {str(e)}')
sp1 = {'set': 'all'}
sp1.update({k: all[k] / tot[k] * 10 for k in self.DIMS})
sp2 = {'set': 'valid'}
sp2.update({k: all[k] / valid[k] * 10 for k in self.DIMS})
return pd.DataFrame([sp1, sp2])
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
nproc = judge_kwargs.pop('nproc', 4)
data = load(eval_file)
model = judge_kwargs.pop('model', 'gpt-4o')
judge_model = build_judge(model=model, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(judge_model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
from .utils.mmdu import mmdu_score
if len(indices):
new_results = track_progress_rich(
mmdu_score,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
metric = self.calculat_metric(ans)
dump(metric, score_file)
return metric


@@ -0,0 +1,433 @@
from functools import partial
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
from ..utils import track_progress_rich
class ImageVQADataset(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
}
DATASET_MD5 = {
'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
}
def build_prompt(self, line):
msgs = super().build_prompt(line)
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
return msgs
# It returns a DataFrame
def evaluate(self, eval_file, **judge_kwargs):
from .utils.vqa_eval import hit_calculate, process_line
data = load(eval_file)
dataset = self.dataset_name
assert 'answer' in data and 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
lt = len(data)
pool = mp.Pool(16)
lines = [data.iloc[i] for i in range(lt)]
if listinstr(['TextVQA'], dataset):
res = pool.map(partial(process_line, method='vqa_score'), lines)
elif listinstr(['ChartQA'], dataset):
res = pool.map(partial(process_line, method='relaxed_accuracy'), lines)
elif listinstr(['OCRVQA'], dataset):
res = pool.map(partial(process_line, method='accuracy'), lines)
elif listinstr(['DocVQA', 'InfoVQA'], dataset):
res = pool.map(partial(process_line, method='anls'), lines)
else: # default using vqa_score to calculate score
res = pool.map(process_line, lines)
hit = hit_calculate(res, dataset)
ret = dict()
if 'split' in data:
splits = set(data['split'])
for sp in splits:
sub = [r for l, r in zip(lines, res) if l['split'] == sp]
# [np.mean(x['match']) >= full_score_weight for x in sub]
hit = hit_calculate(sub, dataset)
ret[sp] = np.mean(hit) * 100
sub = [r for l, r in zip(lines, res)]
hit = hit_calculate(sub, dataset)
ret['Overall'] = np.mean(hit) * 100
else:
ret['Overall'] = np.mean(hit) * 100
if 'category' in data:
cates = list(set(data['category']))
cates.sort()
for c in cates:
sub = [r for l, r in zip(lines, res) if l['category'] == c]
# [np.mean(x['match']) >= full_score_weight for x in sub]
hit = hit_calculate(sub, dataset)
ret[c] = np.mean(hit) * 100
ret = d2df(ret)
        ret = ret.round(2)
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(ret, result_file)
return ret
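
The ChartQA branch above scores with process_line(method='relaxed_accuracy') from .utils.vqa_eval. For reference, here is a self-contained sketch of the commonly used relaxed-accuracy rule (numeric answers match within 5 % relative error, other answers need an exact match); the helper actually used may differ in details.

# Illustrative sketch (not part of this commit): a reference relaxed-accuracy rule.
def relaxed_accuracy(prediction: str, target: str, tol: float = 0.05) -> bool:
    """Numeric answers match within a relative tolerance, others need exact match."""
    try:
        p = float(str(prediction).strip().rstrip('%'))
        t = float(str(target).strip().rstrip('%'))
        if t == 0:
            return p == 0
        return abs(p - t) / abs(t) <= tol
    except ValueError:
        return str(prediction).strip().lower() == str(target).strip().lower()

assert relaxed_accuracy('10.2', '10')        # within 5 % relative error
assert not relaxed_accuracy('11', '10')      # 10 % off, rejected
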
class OCRBench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'
}
DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
OCRBench_score = {
'Regular Text Recognition': 0,
'Irregular Text Recognition': 0,
'Artistic Text Recognition': 0,
'Handwriting Recognition': 0,
'Digit String Recognition': 0,
'Non-Semantic Text Recognition': 0,
'Scene Text-centric VQA': 0,
'Doc-oriented VQA': 0,
'Key Information Extraction': 0,
'Handwritten Mathematical Expression Recognition': 0,
}
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
for i in tqdm(range(len(lines))):
line = lines[i]
predict = str(line['prediction'])
answers = eval(line['answer'])
category = line['category']
if category == 'Handwritten Mathematical Expression Recognition':
for j in range(len(answers)):
answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
predict = predict.strip().replace('\n', ' ').replace(' ', '')
if answer in predict:
OCRBench_score[category] += 1
break
else:
for j in range(len(answers)):
answer = answers[j].lower().strip().replace('\n', ' ')
predict = predict.lower().strip().replace('\n', ' ')
if answer in predict:
OCRBench_score[category] += 1
break
final_score_dict = {}
final_score_dict['Text Recognition'] = \
(OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
+ OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
+ OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'])
final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
final_score_dict['Handwritten Mathematical Expression Recognition'] = \
(OCRBench_score['Handwritten Mathematical Expression Recognition'])
final_score_dict['Final Score'] = \
(final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
+ final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
+ final_score_dict['Handwritten Mathematical Expression Recognition'])
final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10)
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
class MathVista(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
}
DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathvista import MathVista_auxeval, MathVista_acc
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MathVista_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
data['res'] = [ans[idx]['res'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score = MathVista_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
return score
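
MathVista, MathVision, MMVet, and the DUDE/MMDU evaluators in this commit all share the same resume pattern: per-sample judge outputs are checkpointed to a .pkl, and only indices missing from that cache are re-run. Below is a reduced, hedged sketch of the pattern; judge_one and the file names are hypothetical stand-ins for the real judge calls.

# Illustrative sketch (not part of this commit): the resume-from-pickle pattern
# used by the evaluators above, reduced to its core.
import os
import pickle

def judge_one(item):
    return {'res': len(item) % 2, 'log': 'ok'}    # placeholder for an API call

def evaluate_resumable(items, tmp_file='eval_tmp.pkl'):
    ans = {}
    if os.path.exists(tmp_file):
        with open(tmp_file, 'rb') as f:
            ans = pickle.load(f)                   # previously finished samples
    todo = {k: v for k, v in items.items() if k not in ans}
    for k, v in todo.items():
        ans[k] = judge_one(v)
        with open(tmp_file, 'wb') as f:            # checkpoint after each sample
            pickle.dump(ans, f)
    return ans

results = evaluate_resumable({0: 'foo', 1: 'barbaz'})
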
class MathVision(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
}
DATASET_MD5 = {
'MathVision': '93f6de14f7916e598aa1b7165589831e',
'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathv import MATH_V_auxeval, MATH_V_acc
if 'model' in judge_kwargs:
model = judge_kwargs['model']
else:
model = os.path.basename(os.environ.get('LOCAL_LLM'))
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MATH_V_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
data['res'] = [ans[idx]['res'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score = MATH_V_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
return score
class LLaVABench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.llavabench import (
build_prompt,
LLaVABench_atomeval,
LLaVABench_score,
)
suffix = '.' + eval_file.split('.')[-1]
record_file = eval_file.replace(suffix, '_openai_result' + suffix)
score_file = eval_file.replace(suffix, '_score.csv')
nproc = judge_kwargs.pop('nproc', 4)
system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'
if not osp.exists(record_file):
data = load(eval_file)
lines = [data.iloc[i] for i in range(len(data))]
model = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
assert model.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
prompts = [build_prompt(line) for line in lines]
tups = [(model, prompt) for prompt in prompts]
scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc)
data['gpt4_score'] = [x[0] for x in scores]
data['score'] = [x[1] for x in scores]
dump(data, record_file)
data = load(record_file)
ret = LLaVABench_score(data).round(1)
dump(ret, score_file)
return ret
class MMVet(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv'
}
DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmvet import MMVet_auxeval, MMVet_acc
suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=3, **judge_kwargs)
assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = load(tmp_file) if osp.exists(tmp_file) else {}
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MMVet_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
data['score'] = [ans[idx]['score'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score, score_fine = MMVet_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
dump(score, score_pth)
dump(score_fine, score_fine_pth)
return score
class MTVQADataset(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
data = load(eval_file)
assert 'answer' in data and 'prediction' in data and 'category' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
if 'split' in data:
assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '
lt = len(data)
category_scores = defaultdict(list)
for i in range(lt):
line = data.iloc[i]
ans = line['answer'].strip().lower().replace('.', '')
pred = line['prediction'].strip().lower().replace('.', '')
cate = line['category']
score = 1.0 if ans in pred else 0.0
category_scores[cate].append(score)
category_scores['Average'].append(score)
        # Calculate the average score for each category; scores are normalized to [0, 100]
category_averages = {category: np.mean(scores) * 100 for category, scores in category_scores.items()}
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.json')
dump(category_averages, result_file)
return category_averages
# MT-VQA adopts a custom prompt
def build_prompt(self, line):
msgs = super().build_prompt(line)
assert sum([x['type'] == 'text' for x in msgs]) == 1
for item in msgs:
if item['type'] == 'text':
item['value'] += '\nAnswer the question using a word or phrase in the language of the question.'
return msgs
class CustomVQADataset(ImageBaseDataset):
TYPE = 'VQA'
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def evaluate(self, eval_file, **judge_kwargs):
raise NotImplementedError
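
CustomVQADataset deliberately leaves evaluate unimplemented. Below is a hedged sketch of one way to supply a scoring rule in a subclass; the class name and the containment-based metric are hypothetical, while load, dump, and d2df come from the wildcard ..smp import at the top of this file.

# Illustrative sketch (not part of this commit): overriding the unimplemented
# evaluate above with a simple containment-based score.
class MyCustomVQADataset(CustomVQADataset):
    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)
        hit = [str(a).strip().lower() in str(p).strip().lower()
               for a, p in zip(data['answer'], data['prediction'])]
        ret = d2df({'Overall': 100 * sum(hit) / len(hit)})
        suffix = eval_file.split('.')[-1]
        dump(ret, eval_file.replace(f'.{suffix}', '_acc.csv'))
        return ret
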


@@ -0,0 +1,88 @@
from ..smp import *
from ..utils import *
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
class ImageYORNDataset(ImageBaseDataset):
TYPE = 'Y/N'
DATASET_URL = {
'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
}
DATASET_MD5 = {
'MME': 'b36b43c3f09801f5d368627fb92187c3',
'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
}
# It returns a dataframe
def evaluate(self, eval_file, **judge_kwargs):
from .utils.yorn import YOrN_Extraction, YOrN_auxeval
from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating
dataset = self.dataset_name
data = load(eval_file)
data['prediction'] = [str(x) for x in data['prediction']]
storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
if osp.exists(tmp_file):
tmp = load(tmp_file)
for k in tmp:
if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
ans_map[k] = tmp[k]
data['extracted'] = [ans_map[x] for x in data['index']]
unknown = data[data['extracted'] == 'Unknown']
model = judge_kwargs.get('model', 'exact_matching')
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
model = None
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
if model is not None:
lt = len(unknown)
lines = [unknown.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = list(unknown['index'])
if len(tups):
res = track_progress_rich(
YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
for k, v in zip(indices, res):
ans_map[k] = v
data['extracted'] = [ans_map[x] for x in data['index']]
dump(data, storage)
data = load(storage)
data['score'] = (data['answer'] == data['extracted'])
dump(data, storage)
if dataset is not None and listinstr(['MME'], dataset):
score = MME_rating(storage)
elif dataset is not None and listinstr(['Hallusion'], dataset):
score = Hallusion_rating(storage)
elif dataset is not None and listinstr(['POPE'], dataset):
score = POPE_rating(storage)
else:
score = default_rating(storage)
score_tgt = eval_file.replace('.xlsx', '_score.csv')
dump(score, score_tgt)
return score

View File

@@ -0,0 +1,252 @@
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
FAIL_MSG = 'Failed to obtain answer via API.'
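# Restore .mp4 files from the pickled video blobs shipped with the HF dataset: each pickle
# maps video names to raw bytes, which are written out under <pth>/video/.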
def unwrap_hf_pkl(pth, suffix='.mp4'):
base_dir = os.path.join(pth, 'video_pkl/')
target_dir = os.path.join(pth, 'video/')
pickle_files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]
pickle_files.sort()
if not os.path.exists(target_dir):
os.makedirs(target_dir, exist_ok=True)
for pickle_file in pickle_files:
with open(pickle_file, 'rb') as file:
video_data = pickle.load(file)
# For each video file in the pickle file, write its contents to a new mp4 file
for video_name, video_content in video_data.items():
output_path = os.path.join(target_dir, f'{video_name}{suffix}')
with open(output_path, 'wb') as output_file:
output_file.write(video_content)
print('Video files have been restored from the pickle files.')
else:
print('Video files already exist, skip restoring from the pickle files.')
class MMBenchVideo(VideoBaseDataset):
MD5 = '98f7df3eb1007fc375ea6fe88a98e2ff'
SYS = 'You are an AI assistant responsible for answering questions about videos.'
FRAMES_TMPL_PACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer / answers to the \
following question / questions about the video content.
If multiple questions are provided (with indices I1, I2, I3, ...), \
you should organize your answers in the following json format:
{{
'I1': 'Answer to Question I1',
'I2': 'Answer to Question I2',
...
}}
Otherwise, please directly reply with your response to the only question.
Even if the information in these separate frames is not enough to give an answer,
PLEASE GIVE A RESPONSE TO EACH OF THE QUESTIONS IN THE FORMAT DESCRIBED ABOVE.
"""
FRAMES_TMPL_NOPACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer to the question about the video content.
Please directly reply with your response to the only question.
"""
TYPE = 'VQA'
def __init__(self, dataset='MMBench-Video', pack=False):
super().__init__(dataset=dataset, pack=pack)
@classmethod
def supported_datasets(cls):
return ['MMBench-Video']
def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='nebulae09/MMBench-Video'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data['video_path']:
if not osp.exists(osp.join(pth, video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
unwrap_hf_pkl(dataset_path)
self.video_path = osp.join(dataset_path, 'video/')
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(data_file=data_file, root=osp.join(dataset_path, 'video'))
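# Pack mode: one request per video, with all of its questions serialized as a JSON dict
# so the model answers them in a single JSON response.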
def build_prompt_pack(self, line, num_frames):
if isinstance(line, int):
assert line < len(self)
video = self.videos[line]
elif isinstance(line, pd.Series):
video = line['video']
elif isinstance(line, str):
video = line
frames = self.save_video_frames(video, num_frames)
sub = self.data[self.data['video'] == video]
sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(num_frames)
message = [dict(type='text', value=sys_prompt)]
for im in frames:
message.append(dict(type='image', value=im))
nq = len(sub)
prompt = 'Questions: \n{}\nAnswers: \n'
qs = {int(sub.iloc[i]['index']): sub.iloc[i]['question'] for i in range(nq)}
prompt = prompt.format(json.dumps(qs))
message.append(dict(type='text', value=prompt))
return message
def build_prompt_nopack(self, line, num_frames, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
if video_llm:
question = line['question']
prefix, video_idx_path = os.path.split(line['video_path'])
message = [dict(type='text', value=question)]
message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path)))
return message
else:
frames = self.save_video_frames(line['video'], num_frames)
sys_prompt = self.FRAMES_TMPL_NOPACK.format(num_frames)
message = [dict(type='text', value=sys_prompt)]
for im in frames:
message.append(dict(type='image', value=im))
prompt = 'Question: {}\nAnswer: '.format(line['question'])
message.append(dict(type='text', value=prompt))
return message
def build_prompt(self, line, num_frames, video_llm):
if self.pack and not video_llm:
return self.build_prompt_pack(line, num_frames)
else:
return self.build_prompt_nopack(line, num_frames, video_llm)
@staticmethod
def remove_side_quote(s, syms=[',', '"', "'"]):
if np.all([x in syms for x in s]):
return ''
while s[0] in syms:
s = s[1:]
while s[-1] in syms:
s = s[:-1]
return s
@staticmethod
def robust_json_load(s):
try:
jsons = list(extract_json_objects(s))
assert len(jsons) == 1
return jsons[0]
except:
if '{' in s and s.find('{') == s.rfind('{'):
sub_str = s[s.find('{') + 1:].strip()
lines = sub_str.split('\n')
res = {}
for l in lines:
l = l.strip()
if ': ' in l:
key = l.split(': ')[0].strip()
val = l.split(': ')[1].strip()
key = MMBenchVideo.remove_side_quote(key)
val = MMBenchVideo.remove_side_quote(val)
if len(key) and len(val):
res[key] = val
return res
return None
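# Map the packed JSON responses back to per-question predictions; answers that cannot be
# parsed are counted in the parse / validity statistics.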
def load_pack_answers(self, data_raw):
vstats = defaultdict(lambda: 0)
data = defaultdict(lambda: {})
for k in data_raw:
ans = data_raw[k].strip()
if FAIL_MSG in ans:
vstats['GEN_FAIL'] += 1
continue
res = self.robust_json_load(ans)
if res is not None:
data[k] = res
vstats['PARSE_OK'] += 1
else:
vstats['PARSE_FAIL'] += 1
# return data
meta = cp.deepcopy(self.data)
lt = len(meta)
prediction = []
for i in range(lt):
line = meta.iloc[i]
vid = line['video']
idx = str(line['index'])
prediction.append(data[vid][idx] if idx in data[vid] else None)
meta['prediction'] = prediction
vstats['VALIDQ'] = len([x for x in prediction if x is not None])
vstats['INVALIDQ'] = len([x for x in prediction if x is None])
return meta, vstats
# It returns a dictionary
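# Scoring: each (question, prediction) pair is graded by an LLM judge (a working
# OpenAI-compatible API is required); unanswered or unparsable items get score -1,
# and the final ratings are aggregated per dimension.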
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
model = build_judge(system_prompt=system_prompt, **judge_kwargs)
assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if model.fail_msg not in v}
data = load(eval_file)
data_un = data[~data['index'].isin(res)]
data_un = data_un[~pd.isna(data_un['prediction'])]
lt = len(data_un)
prompts = [build_prompt(data_un.iloc[i]) for i in range(lt)]
indices = [data_un.iloc[i]['index'] for i in range(lt)]
if len(prompts):
_ = track_progress_rich(
model.generate,
prompts,
keys=indices,
save=tmp_file,
nproc=nproc,
chunksize=nproc
)
score_map = load(tmp_file)
data['score'] = [score_map[idx] if idx in score_map else -1 for idx in data['index']]
rejected = [x for x in score_map.values() if FAIL_MSG in x]
data['score'] = [int(x) if istype(x, int) else -1 for x in data['score']]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(score_map)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as 0 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating

View File

@@ -0,0 +1,582 @@
import re
import math
from urllib.request import urlopen
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms as transforms
from vlmeval.dataset.utils import build_judge, levenshtein_distance
from vlmeval.smp import *
from .image_base import ImageBaseDataset
FAIL_MSG = 'Failed to obtain answer via API.'
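# In-context examples for the GPT-4 answer extractor: each one pairs a free-form analysis with
# the answer extracted from it in the required 'Extracted answer / Answer format' layout.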
def get_gpt4_ICE():
example_1 = """
---
Question: List the primary questions asked about the services in this report.
Analysis: The primary questions asked about the services in the report for The Limes Residential Home are:\n\n
1. Is the service safe?\n
2. Is the service effective?\n
3. Is the service caring?\n
4. Is the service responsive?\n
5. Is the service well-led?
Extracted answer: [
'Is the service safe?',
'Is the service effective?',
'Is the service caring?',
'Is the service responsive?',
'Is the service well-led?'
]
Answer format: List\n
"""
example_2 = """
---
Question: How many regulations of the HSCA 2008 are breached in all according to this report?
Analysis: According to the report, the provider breached 10 Health and Social Care Act 2008 (Regulated Activities)
Regulations in total. Here are the specifics:\n\n1. Regulation 13: Safeguarding service users from abuse and
improper treatment\n2. Regulation 12: Safe care and treatment\n3. Regulation 18: Staffing\n4. Regulation 11:
Need for consent\n5. Regulation 10: Dignity and respect\n6. Regulation 9: Person-centred care\n7. Regulation 17:
Good governance\n8. Regulation 18 (CQC Registration Regulations 2009): Notification of other incidents\n9.
Regulation 18: Failure to maintain an accurate and up-to-date care plan\n10. Regulation 11: Failure to implement
the Mental Capacity Act 2005 code of practice effectively\n\nThese breaches involve issues concerning staffing,
safeguarding, medicines management, dignity and respect, consent, care planning, governance, and failure to
notify the CQC of incidents.
Extracted answer: 10
Answer format: Integer\n
"""
example_3 = """
---
Question: According to the survey, what is the percentage of Chinese who are paying more or
about the same attention to politics after Trump's election?
Analysis: The survey provided does not specify the percentage of Chinese individuals specifically who are paying
more or about the same attention to politics after Trump's election. The report focuses primarily on American
demographics and does not include specific details about the Chinese population in relation to this question. If
you need information about a different demographic or a summary of the findings from the American demographic,
I can certainly help with that!
Extracted answer: Not answerable
Answer format: String\n
"""
example_4 = """
---
Question: How many quotations from male respondent over 50 years old are included in this report?
Analysis: The image you've provided appears to be a screenshot of a document with multiple charts. However, the
text is too small and blurry to read accurately. If you can provide a clearer image or more context, I might be
able to help you with your question.
Extracted answer: Fail to answer
Answer format: String\n
"""
return [example_1, example_2, example_3, example_4]
def build_mmlongbench_gpt4_prompt(line):
task_description = """
Given the question and analysis, you are tasked to extract answers with required formats from the free-form analysis.
- Your extracted answers should be one of the following formats: (1) Integer, (2) Float, (3) String and (4) List.
If you find from the analysis that the question cannot be answered from the given documents, type "Not answerable".
Exception: If the analysis only tells you that it cannot read/understand the images or documents,
type "Fail to answer".
- Please make your response as concise as possible. Also note that your response should be formatted as below:
```
Extracted answer: [answer]
Answer format: [answer format]
```
Please read the following example, then extract the answer from the model response
and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example
prompt += '---\nQuestion: ' + question + '\n'
prompt += 'Analysis: ' + prediction
return prompt
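# Normalized Levenshtein Similarity for a single pair: 1 - d / max(len); similarities at or
# below the threshold are clipped to 0. E.g., anls_compute('cat', 'cats') = 1 - 1/4 = 0.75.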
def anls_compute(groundtruth, prediction, threshold=0.5):
dist = levenshtein_distance(groundtruth, prediction)
length = max(len(groundtruth.upper()), len(prediction.upper()))
value = 0.0 if length == 0 else float(dist) / float(length)
anls = 1.0 - value
if anls <= threshold:
anls = 0.0
return anls
def is_float_equal(reference, prediction, include_percentage: bool = False, is_close: bool = False) -> bool:
def get_precision(gt_ans: float) -> int:
precision = 3
if '.' in str(gt_ans):
precision = len(str(gt_ans).split('.')[-1])
return precision
reference = float(str(reference).strip().rstrip('%').strip())
try:
prediction = float(str(prediction).strip().rstrip('%').strip())
except:
return False
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
try:
if is_close:
if math.isclose(item, prediction, rel_tol=0.01):
return True
precision = max(min(get_precision(prediction), get_precision(item)), 2)
if round(prediction, precision) == round(item, precision):
return True
except Exception:
continue
return False
def get_clean_string(s):
s = str(s).lower().strip()
# strip common unit suffixes
if s.endswith('mile'):
s = s.rstrip('mile').strip()
if s.endswith('miles'):
s = s.rstrip('miles').strip()
if s.endswith('million'):
s = s.rstrip('million').strip()
# remove parenthesis
s = re.sub(r'\s*\([^)]*\)', '', s).strip()
# remove quotes
s = re.sub(r"^['\"]|['\"]$", '', s).strip()
s = s.strip().lstrip('$').strip()
s = s.strip().rstrip('%').strip()
return s
def is_exact_match(s):
flag = False
# Website
if 'https://' in s:
flag = True
# code file
if s.endswith('.py') or s.endswith('ipynb'):
flag = True
if s.startswith('page'):
flag = True
# telephone number
if re.fullmatch(r'\b\d+(-\d+|\s\d+)?\b', s):
flag = True
# time
if 'a.m.' in s or 'p.m.' in s:
flag = True
# YYYY-MM-DD
if re.fullmatch(r'\b\d{4}[-\s]\d{2}[-\s]\d{2}\b', s):
flag = True
# YYYY-MM
if re.fullmatch(r'\b\d{4}[-\s]\d{2}\b', s):
flag = True
# Email address
if re.fullmatch(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', s):
flag = True
return flag
def isfloat(num):
try:
float(num)
return True
except ValueError:
return False
def get_font():
try:
truetype_url = 'http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf'
ff = urlopen(truetype_url)
font = ImageFont.truetype(ff, size=40)
except:
print('Failed to download the font. Using the default one.')
font = ImageFont.load_default(size=40)
return font
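# Stack a batch of page images into a single canvas (vertically or horizontally depending on
# aspect ratio), labelling each page with an '<IMAGE i>' tag and drawing a separator line.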
def frame2img(img_path_list, font, save_path=None, idx_start=0):
imgs = [Image.open(img_path) for img_path in img_path_list]
new_imgs = []
for img in imgs:
w, h = img.size
scale = w / h
if w > h:
new_w = 560 * 2
new_h = int(560 * 2 / scale)
else:
new_w = int(560 * 2 * scale)
new_h = 560 * 2
img = transforms.functional.resize(img, [new_h, new_w],)
new_imgs.append(img)
imgs = new_imgs
new_w = 0
new_h = 0
pad = 40
if w > h:
for im in imgs:
w, h = im.size
new_w = max(new_w, w)
new_h += h + 10 + pad
new_img = Image.new('RGB', (new_w, new_h), 'white')
draw = ImageDraw.Draw(new_img)
curr_h = 0
for idx, im in enumerate(imgs):
w, h = im.size
new_img.paste(im, (0, pad + curr_h))
draw.text((0, curr_h), f'<IMAGE {idx+idx_start}>', font=font, fill='black')
if idx + 1 < len(imgs):
draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
curr_h += h + 10 + pad
else:
for im in imgs:
w, h = im.size
new_w += w + 10
new_h = max(new_h, h)
new_h += pad
new_img = Image.new('RGB', (new_w, new_h), 'white')
draw = ImageDraw.Draw(new_img)
curr_w = 0
for idx, im in enumerate(imgs):
w, h = im.size
new_img.paste(im, (curr_w, pad))
draw.text((curr_w, 0), f'<IMAGE {idx+idx_start}>', font=font, fill='black')
if idx + 1 < len(imgs):
draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
curr_w += w + 10
if save_path is not None:
new_img.save(save_path)
return new_img
def concat_images(image_list, max_concat=1, column_num=1):
concatenated_images = []
if column_num == -1:
MAX_COLUMN_NUM = 20
max_concat = 1
while len(image_list) / max_concat > MAX_COLUMN_NUM:
max_concat += 1
interval = max(math.ceil(len(image_list) / max_concat), 1)
for i in range(0, len(image_list), interval):
batch_images = image_list[i:i + interval]
concatenated_image = frame2img(batch_images, font=get_font(), idx_start=i)
concatenated_images.append(concatenated_image)
else:
interval = max(math.ceil(len(image_list) / max_concat), 1)
for i in range(0, len(image_list), interval):
batch_images = [Image.open(filename) for filename in image_list[i:i + interval]]
if column_num == 1:
total_height = batch_images[0].height * len(batch_images)
else:
total_height = batch_images[0].height * ((len(batch_images) - 1) // column_num + 1)
concatenated_image = Image.new('RGB', (batch_images[0].width * column_num, total_height), 'white')
x_offset, y_offset = 0, 0
for count, image in enumerate(batch_images):
concatenated_image.paste(image, (x_offset, y_offset))
x_offset += image.width
if (count + 1) % column_num == 0:
y_offset += image.height
x_offset = 0
concatenated_images.append(concatenated_image)
return concatenated_images
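# Type-aware scoring: exact match for Int, tolerant numeric match for Float, ANLS (or exact
# match for URL / date / ID-like strings) for Str, and element-wise comparison for List answers.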
def eval_score(gt, pred, answer_type):
if answer_type == 'Int':
try:
gt, pred = int(gt), int(float(pred))
except:
pred = ''
score = (gt == pred)
elif answer_type == 'Float':
try:
gt = float(get_clean_string(str(gt)))
pred = float(get_clean_string(str(pred)))
except:
pred = ''
score = is_float_equal(gt, pred, include_percentage=True, is_close=True)
elif answer_type == 'Str':
gt = get_clean_string(gt)
pred = get_clean_string(pred)
if is_exact_match(gt):
score = (gt == pred)
else:
score = anls_compute(gt, pred)
else:
if isinstance(gt, str) and gt.startswith('['):
gt = eval(gt)
if not isinstance(gt, list):
gt = [gt]
if isinstance(pred, str) and pred.startswith('['):
pred = eval(pred)
if not isinstance(pred, list):
pred = [pred]
print(len(gt), len(pred))
if len(gt) != len(pred):
score = 0.0
else:
gt = sorted([get_clean_string(a) for a in gt])
pred = sorted([get_clean_string(a) for a in pred])
print(gt, pred)
if isfloat(gt[0]) or is_exact_match(gt[0]):
score = ('-'.join(gt) == '-'.join(pred))
else:
score = min([anls_compute(gt_v, pred_v) for gt_v, pred_v in zip(gt, pred)])
return float(score)
def MMLongBench_auxeval(model, line):
prompt = build_mmlongbench_gpt4_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
try:
pred = res.split('Answer format:')[0].split('Extracted answer:')[1].strip()
except:
pred = ''
return dict(log=log, res=res, pred=pred)
log += 'All 5 retries failed.\n'
return dict(log=log, res='', pred='')
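# Answerability F1: recall averages the scores over questions whose ground truth is answerable,
# precision averages the scores over questions the model predicts as answerable.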
def get_f1(data):
gt_pos_data = data[data.apply(lambda k: k['answer'] != 'Not answerable', axis=1)]
pred_pos_data = data[data.apply(lambda k: k['pred'] != 'Not answerable', axis=1)]
recall = sum(gt_pos_data['score'].tolist()) / len(gt_pos_data)
precision = sum(pred_pos_data['score'].tolist()) / len(pred_pos_data)
return 2 * recall * precision / (recall + precision)
def MMLongBench_acc(result_file):
data = load(result_file)
overall_score = 0.0
score_list = list()
for i in range(len(data)):
item = data.iloc[i]
try:
score = eval_score(item['answer'], item['pred'], item['answer_format'])
except:
score = 0.0
score_list.append(score)
overall_score += score
data['score'] = score_list
dump(data, result_file)
data_chart = data[data.apply(lambda k: 'Chart' in eval(k['evidence_sources']), axis=1)]
data_table = data[data.apply(lambda k: 'Table' in eval(k['evidence_sources']), axis=1)]
data_image = data[data.apply(lambda k: 'Figure' in eval(k['evidence_sources']), axis=1)]
data_text = data[data.apply(lambda k: 'Pure-text (Plain-text)' in eval(k['evidence_sources']), axis=1)]
data_layout = data[data.apply(lambda k: 'Generalized-text (Layout)' in eval(k['evidence_sources']), axis=1)]
data_single = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 1, axis=1)]
data_multi = data[data.apply(lambda k: len(eval(k['evidence_pages'])) > 1, axis=1)]
data_unans = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 0, axis=1)]
res = dict()
res['category'] = [
'overall_f1', 'overall_acc', 'text', 'layout', 'table', 'chart',
'image', 'single-page', 'multi-page', 'unanswerable'
]
res['num'] = [
len(data), len(data), len(data_text), len(data_layout), len(data_table),
len(data_chart), len(data_image), len(data_single), len(data_multi), len(data_unans)
]
res['avg_score'] = [
get_f1(data),
overall_score / len(data),
sum(data_text['score'].tolist()) / len(data_text) if len(data_text) > 0 else 0.0,
sum(data_layout['score'].tolist()) / len(data_layout) if len(data_layout) > 0 else 0.0,
sum(data_table['score'].tolist()) / len(data_table) if len(data_table) > 0 else 0.0,
sum(data_chart['score'].tolist()) / len(data_chart) if len(data_chart) > 0 else 0.0,
sum(data_image['score'].tolist()) / len(data_image) if len(data_image) > 0 else 0.0,
sum(data_single['score'].tolist()) / len(data_single) if len(data_single) > 0 else 0.0,
sum(data_multi['score'].tolist()) / len(data_multi) if len(data_multi) > 0 else 0.0,
sum(data_unans['score'].tolist()) / len(data_unans) if len(data_unans) > 0 else 0.0,
]
res = pd.DataFrame(res)
return res
class MMLongBench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv',
}
DATASET_MD5 = {
'MMLongBench_DOC': '9b393e1f4c52718380d50586197eac9b',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
'XComposer2_4KHD': (1, 5),
'XComposer2d5': (1, -1),
}
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on MMLongBench_DOC.".format(model_name))
super(MMLongBench, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
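# Render each PDF into per-page images (at most max_pages) with PyMuPDF, then optionally
# concatenate the pages into a few large images according to the (concat_num, column_num)
# setting of the evaluated model.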
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
try:
import fitz
except:
warnings.warn('Please use `pip install pymupdf` to parse PDF files.')
line = origin_line.copy()
line['image_path'] = line['image_path'][:self.max_pages]
skip_pdf_parse = True
for im_name in line['image_path']:
path = osp.join(self.img_root, im_name)
if not read_ok(path):
skip_pdf_parse = False
break
# Just for compatibility with the zipped loop below: zip(line['image'], line['image_path'])
if skip_pdf_parse:
line['image'] = line['image_path']
else:
pdf_data = base64.b64decode(line['image'])
pdf_file = io.BytesIO(pdf_data)
encoded_images = []
with fitz.open(stream=pdf_file, filetype='pdf') as doc:
doc = doc[:self.max_pages]
for page in doc:
image = page.get_pixmap(dpi=144)
image_file = io.BytesIO(image.tobytes(output='png'))
image = Image.open(image_file)
encoded_image = encode_image_to_base64(image)
encoded_images.append(encoded_image)
line['image'] = encoded_images
print('process {}'.format(line['doc_id']))
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all_{}.jpg'.format(i)
for i in range(len(concatenated_images))
]
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = MMLongBench_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)

View File

@@ -0,0 +1,577 @@
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
import torchvision.transforms as T
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from decord import VideoReader, cpu
import imageio
import cv2
import zipfile
import os
import glob
from moviepy.editor import VideoFileClip, ImageSequenceClip
import moviepy.config_defaults
from .utils.mvbench import *
FAIL_MSG = 'Failed to obtain answer via API.'
moviepy.config_defaults.LOGGER_LEVEL = logging.CRITICAL + 1
class MVBench(VideoBaseDataset):
MD5 = 'ae2a2607e2f8618155709220c6e927a6'
SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
the detail and movement of objects, and the action and pose of persons. \
Based on your observations, select the best option that accurately addresses the question.
"""
TYPE = 'MCQ'
def __init__(self, dataset='MVBench', pack=False):
self.type_data_list = {
'Action Sequence': ('action_sequence.json',
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
'Action Prediction': ('action_prediction.json',
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
'Action Antonym': ('action_antonym.json',
'your_data_path/ssv2_video/', 'video', False),
'Fine-grained Action': ('fine_grained_action.json',
'your_data_path/Moments_in_Time_Raw/videos/', 'video', False),
'Unexpected Action': ('unexpected_action.json',
'your_data_path/FunQA_test/test/', 'video', False),
'Object Existence': ('object_existence.json',
'your_data_path/clevrer/video_validation/', 'video', False),
'Object Interaction': ('object_interaction.json',
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
'Object Shuffle': ('object_shuffle.json',
'your_data_path/perception/videos/', 'video', False),
'Moving Direction': ('moving_direction.json',
'your_data_path/clevrer/video_validation/', 'video', False),
'Action Localization': ('action_localization.json',
'your_data_path/sta/sta_video/', 'video', True), # has start & end
'Scene Transition': ('scene_transition.json',
'your_data_path/scene_qa/video/', 'video', False),
'Action Count': ('action_count.json',
'your_data_path/perception/videos/', 'video', False),
'Moving Count': ('moving_count.json',
'your_data_path/clevrer/video_validation/', 'video', False),
'Moving Attribute': ('moving_attribute.json',
'your_data_path/clevrer/video_validation/', 'video', False),
'State Change': ('state_change.json',
'your_data_path/perception/videos/', 'video', False),
'Fine-grained Pose': ('fine_grained_pose.json',
'your_data_path/nturgbd/', 'video', False),
'Character Order': ('character_order.json',
'your_data_path/perception/videos/', 'video', False),
'Egocentric Navigation': ('egocentric_navigation.json',
'your_data_path/vlnqa/', 'video', False),
'Episodic Reasoning': ('episodic_reasoning.json',
'your_data_path/tvqa/frames_fps3_hq/', 'frame', True), # has start & end, read frame
'Counterfactual Inference': ('counterfactual_inference.json',
'your_data_path/clevrer/video_validation/', 'video', False),
}
super().__init__(dataset=dataset, pack=pack)
@classmethod
def supported_datasets(cls):
return ['MVBench']
def prepare_dataset(self, dataset_name='MVBench', repo_id='OpenGVLab/MVBench'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
return False
return True
cache_path = get_cache_path(repo_id, branch='main')
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def unzip_hf_zip(pth):
pth = os.path.join(pth, 'video/')
for filename in os.listdir(pth):
if filename.endswith('.zip'):
# Build the full path of the zip file
zip_path = os.path.join(pth, filename)
# Extract the ZIP archive in place
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(pth)
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MD5:
return
json_data_dir = os.path.join(dataset_path, 'json')
self.data_list = []
for k, v in self.type_data_list.items():
with open(os.path.join(json_data_dir, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
self.data_list.append({
'task_type': k,
'prefix': v[1].replace('your_data_path', os.path.join(dataset_path, 'video')),
'data_type': v[2],
'bound': v[3],
'start': data['start'] if 'start' in data.keys() else None,
'end': data['end'] if 'end' in data.keys() else None,
'video': data['video'],
'question': data['question'],
'answer': data['answer'],
'candidates': data['candidates']
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
def move_files(pth):
# special for mvbench
src_folder = os.path.join(pth, 'video/data0613')
for subdir in os.listdir(src_folder):
subdir_path = os.path.join(src_folder, subdir)
if os.path.isdir(subdir_path):
for subsubdir in os.listdir(subdir_path):
subsubdir_path = os.path.join(subdir_path, subsubdir)
if os.path.isdir(subsubdir_path):
for item in os.listdir(subsubdir_path):
item_path = os.path.join(subsubdir_path, item)
target_folder = os.path.join(pth, 'video', subdir, subsubdir)
if not os.path.exists(os.path.join(target_folder, item)):
shutil.move(item_path, os.path.join(target_folder, item))
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
move_files(dataset_path)
unzip_hf_zip(dataset_path)
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
self.decord_method = {
'video': self.read_video,
'gif': self.read_gif,
'frame': self.read_frame,
}
self.nframe = 8
self.resolution = 224
self.frame_fps = 3
# transform
crop_size = self.resolution
scale_size = self.resolution
input_mean = [0.48145466, 0.4578275, 0.40821073]
input_std = [0.26862954, 0.26130258, 0.27577711]
self.transform = T.Compose([
GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC),
GroupCenterCrop(crop_size),
Stack(),
ToTorchFormatTensor(),
GroupNormalize(input_mean, input_std)
])
return dict(root=dataset_path, data_file=data_file)
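# Uniformly sample self.num_segments frame indices from [start, end] (or the whole clip when
# no bound is given), taking the midpoint of each equal-length segment.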
def get_index(self, bound, fps, max_frame, first_idx=0):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / self.num_segments
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(self.num_segments)
])
return frame_indices
def read_video(self, video_path, bound=None):
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
fps = float(vr.get_avg_fps())
images_group = list()
frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy())
images_group.append(img)
torch_imgs = self.transform(images_group)
return torch_imgs
def read_gif(self, video_path, bound=None, fps=25):
gif = imageio.get_reader(video_path)
max_frame = len(gif) - 1
images_group = list()
frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
for index, frame in enumerate(gif):
if index in frame_indices:
img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
img = Image.fromarray(img)
images_group.append(img)
torch_imgs = self.transform(images_group)
return torch_imgs
def read_frame(self, video_path, bound=None, fps=3):
max_frame = len(os.listdir(video_path))
images_group = list()
frame_indices = self.get_index(bound, fps, max_frame, first_idx=1) # frame_idx starts from 1
for frame_index in frame_indices:
img = Image.open(os.path.join(video_path, f'{frame_index:05d}.jpg'))
images_group.append(img)
torch_imgs = self.transform(images_group)
return torch_imgs
def save_video_frames(self, imgs, video_name, frames):
frame_paths = self.frame_paths(video_name, frames)
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
block_size = imgs.size(0) // frames
split_tensors = torch.split(imgs, block_size)
to_pil = transforms.ToPILImage()
images = [to_pil(arr) for arr in split_tensors]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
def qa_template(self, data):
question = f"Question: {data['question']}\n"
question += 'Options:\n'
answer = data['answer']
answer_idx = -1
for idx, c in enumerate(eval(data['candidates'])):
question += f"({chr(ord('A') + idx)}) {c}\n"
if c == answer:
answer_idx = idx
question = question.rstrip()
answer = f"({chr(ord('A') + answer_idx)}) {answer}"
return question, answer
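# For video LLMs: convert gif / webm inputs and frame folders into .mp4 with MoviePy, and cut
# the clip to the [start, end] bound when the sample specifies one.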
def load_into_video_and_process(self, line):
video_path = os.path.join(line['prefix'], line['video'])
if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']:
processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4')
if not os.path.exists(processed_video_path):
# using MoviePy to transform GIF, webm into mp4 format
gif_clip = VideoFileClip(video_path)
gif_clip.write_videofile(processed_video_path, codec='libx264')
gif_clip.close()
elif line['data_type'] in ['frame']:
input_images = os.path.join(video_path, '*.jpg')
processed_video_path = f'{video_path}.mp4'
if not os.path.exists(processed_video_path):
# using MoviePy to transform images into mp4
image_files = sorted(glob.glob(input_images))
image_clip = ImageSequenceClip(image_files, fps=self.frame_fps)
image_clip.write_videofile(processed_video_path, codec='libx264')
image_clip.close()
else:
processed_video_path = video_path
if line['bound']:
base_name, suffix = os.path.splitext(processed_video_path)
output_video_path = f'{base_name}_processed{suffix}'
if not os.path.exists(output_video_path):
video_clip = VideoFileClip(processed_video_path)
clip = video_clip.subclip(line['start'], min(line['end'], video_clip.duration))
clip.write_videofile(output_video_path)
clip.close()
else:
output_video_path = processed_video_path
return output_video_path
def build_prompt(self, line, num_frames, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = [dict(type='text', value=self.SYS)]
message.append(dict(type='text', value=question))
if video_llm:
new_video_path = self.load_into_video_and_process(line)
message.append(dict(type='video', value=new_video_path))
else:
bound = None
if line['bound']:
bound = (
line['start'],
line['end'],
)
video_path = os.path.join(line['prefix'], line['video'])
decord_method = self.decord_method[line['data_type']]
self.num_segments = num_frames if num_frames > 0 else self.nframe
torch_imgs = decord_method(video_path, bound)
img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
message.append(dict(type='text', value='\nOnly give the best option.'))
message.append(dict(type='text', value='Best option:('))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
answer_idx = -1
for id, c in enumerate(options):
if c == ans:
answer_idx = id
ans = f"({chr(ord('A') + answer_idx)}) {ans}"
if FAIL_MSG in pred:
data.loc[idx, 'score'] = -1
else:
data.loc[idx, 'score'] = int(check_ans(pred, ans))
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
class MVBench_MP4(VideoBaseDataset):
MP4_MD5 = '7b4608045347904c28c153015a7a2b6b'
SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
the detail and movement of objects, and the action and pose of persons. \
Based on your observations, select the best option that accurately addresses the question.
"""
TYPE = 'MCQ'
def __init__(self, dataset='MVBench_MP4', pack=False):
super().__init__(dataset=dataset, pack=pack)
@classmethod
def supported_datasets(cls):
return ['MVBench_MP4']
def prepare_dataset(self, dataset_name='MVBench_MP4', repo_id='OpenGVLab/MVBench'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MP4_MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
return False
return True
cache_path = get_cache_path(repo_id, branch='video')
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5:
return
json_data_path = os.path.join(dataset_path, 'test.json')
json_data = load(json_data_path)
root_data_dict = json_data['root']
self.data_list = []
for k, v in json_data['meta'].items():
for item in v:
self.data_list.append({
'task_type': k,
'prefix': root_data_dict[k],
'video': item['video'],
'question': item['question'],
'answer': item['answer'],
'candidates': item['candidates']
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video')
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
self.nframe = 8
self.resolution = 224
# transform
crop_size = self.resolution
scale_size = self.resolution
input_mean = [0.48145466, 0.4578275, 0.40821073]
input_std = [0.26862954, 0.26130258, 0.27577711]
self.transform = T.Compose([
GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC),
GroupCenterCrop(crop_size),
Stack(),
ToTorchFormatTensor(),
GroupNormalize(input_mean, input_std)
])
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = f"Question: {data['question']}\n"
question += 'Options:\n'
answer = data['answer']
answer_idx = -1
for idx, c in enumerate(eval(data['candidates'])):
question += f"({chr(ord('A') + idx)}) {c}\n"
if c == answer:
answer_idx = idx
question = question.rstrip()
answer = f"({chr(ord('A') + answer_idx)}) {answer}"
return question, answer
def get_index(self, max_frame):
seg_size = float(max_frame) / self.num_segments
frame_indices = np.array([
int((seg_size / 2) + np.round(seg_size * idx))
for idx in range(self.num_segments)
])
return frame_indices
def read_video(self, video_path, bound=None):
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
images_group = list()
frame_indices = self.get_index(max_frame)
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy())
images_group.append(img)
torch_imgs = self.transform(images_group)
return torch_imgs
def save_video_frames(self, imgs, video_name, frames):
frame_paths = self.frame_paths(video_name, frames)
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
block_size = imgs.size(0) // frames
split_tensors = torch.split(imgs, block_size)
to_pil = transforms.ToPILImage()
images = [to_pil(arr) for arr in split_tensors]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
def build_prompt(self, line, num_frames, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = [dict(type='text', value=self.SYS)]
message.append(dict(type='text', value=question))
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
self.num_segments = num_frames if num_frames > 0 else self.nframe
torch_imgs = self.read_video(video_path)
img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
message.append(dict(type='text', value='\nOnly give the best option.'))
message.append(dict(type='text', value='Best option:('))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
answer_idx = -1
for id, c in enumerate(options):
if c == ans:
answer_idx = id
ans = f"({chr(ord('A') + answer_idx)}) {ans}"
if FAIL_MSG in pred:
data.loc[idx, 'score'] = -1
else:
data.loc[idx, 'score'] = int(check_ans(pred, ans))
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating

View File

@@ -0,0 +1,189 @@
import re
import math
from typing import List
from vlmeval.dataset.utils.judge_util import build_judge
from vlmeval.smp import *
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
FAIL_MSG = 'Failed to obtain answer via API.'
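# Token-level F1 between ground truth and prediction, treating each as a bag of
# whitespace-separated tokens.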
def get_f1(gt, pred):
gt_bow, pred_bow = gt.strip().split(), pred.strip().split()
if not gt_bow or not pred_bow:
return 0.0
recall = len([pred_e for pred_e in pred_bow if pred_e in gt_bow]) / len(gt_bow)
precision = len([pred_e for pred_e in pred_bow if pred_e in gt_bow]) / len(pred_bow)
f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 1e-4 else 0.0
return f1
def SlideVQA_acc(result_file):
data = load(result_file)
anls_list, em_list, f1_list = list(), list(), list()
for i in range(len(data)):
item = data.iloc[i]
if isinstance(item['answer'], float) and math.isnan(item['answer']):
item['answer'] = 'Not answerable'
item['answer'] = re.sub('\n', '', item['answer']).lower()
item['pred'] = str(item['pred']).lower()
anls_score = anls_compute(item['answer'], item['pred'])
em_score = (item['answer'].strip() == item['pred'].strip())
f1_score = get_f1(item['answer'], item['pred'])
anls_list.append(anls_score)
em_list.append(em_score)
f1_list.append(f1_score)
print('---------------------')
print(item['answer'], item['pred'], anls_score, em_score, f1_score)
data['anls'] = anls_list
data['em'] = em_list
data['f1'] = f1_list
dump(data, result_file)
res = dict()
res['category'], res['num'] = ['anls', 'EM', 'F1'], [len(data), len(data), len(data)]
res['avg'] = [sum(anls_list) / len(data), sum(em_list) / len(data), sum(f1_list) / len(data)]
res = pd.DataFrame(res)
return res
class SlideVQA(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'SLIDEVQA_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA_MINI.tsv',
'SLIDEVQA': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA.tsv',
}
DATASET_MD5 = {
'SLIDEVQA_MINI': '6d9a8d8814fa5b7669deb2af3a3208eb',
'SLIDEVQA': '5e822c2f800e94c1e23badfd478326b6',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'XComposer2d5': (1, -1),
'XComposer2_4KHD': (1, -1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
}
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on SlideVQA.".format(model_name))
super(SlideVQA, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
line = origin_line.copy()
if not isinstance(line['image_path'], List):
line['image_path'] = [line['image_path']]
line['image_path'] = line['image_path'][:self.max_pages]
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in SlideVQA_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = SlideVQA_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'SlideVQA successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)

View File

@@ -0,0 +1,88 @@
from abc import abstractmethod
from ..smp import *
class TextBaseDataset:
MODALITY = 'TEXT'
DATASET_URL = {}
DATASET_MD5 = {}
def __init__(self, dataset='MMBench', **kwargs):
self.dataset_name = dataset
data = self.load_data(dataset)
data['index'] = [str(x) for x in data['index']]
if np.all([istype(x, int) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
self.data = data
self.post_build(dataset)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return dict(self.data.iloc[idx])
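# Download the dataset TSV into LMUDataRoot() (skipped when a file with the expected md5 already
# exists); TSVs over 1 GB are additionally localized so base64 images live on disk.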
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is missing or outdated, will download it now.')
download_file(url, data_path)
update_flag = True
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def dump_image(self, line):
return []
def display(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
assert isinstance(line, pd.Series) or isinstance(line, dict)
mmqa_display(line)
# Return a list of dataset names that are supported by this class, can override
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_URL)
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
url = self.DATASET_URL[dataset]
file_md5 = self.DATASET_MD5[dataset]
return self.prepare_tsv(url, file_md5)
# Post built hook, will be called after the dataset is built, can override
def post_build(self, dataset):
pass
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
question = line['question']
msgs = []
msgs.append(dict(type='text', value=question))
return msgs
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
pass

View File

@@ -0,0 +1,123 @@
from .text_base import TextBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
class TextMCQDataset(TextBaseDataset):
TYPE = 'MCQ'
DATASET_URL = {}
DATASET_MD5 = {}
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
msgs = []
msgs.append(dict(type='text', value=prompt))
return msgs
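# Evaluation: answers are matched exactly when possible; otherwise an optional LLM judge extracts
# the chosen option, and accuracy is reported with the dataset-appropriate report function.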
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
# assert dataset is not None
dataset_map = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
}
dataset = self.dataset_name
if dataset in dataset_map:
dataset = dataset_map[dataset]
nproc = judge_kwargs.pop('nproc', 4)
circular = False
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
if circular:
data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
else:
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# Dump and reload the per-sample judge results
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
# May have different report acc functions for different datasets
if 'MMT' in dataset:
acc = report_acc_MMT(data)
else:
acc = report_acc(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
return acc
class CustomTextMCQDataset(TextMCQDataset):
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
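For reference, here is a standalone reconstruction (hypothetical record, same column convention as above: options live in single-letter columns A, B, ...) of the prompt string that `TextMCQDataset.build_prompt` assembles:

```python
import string
import pandas as pd

line = pd.Series({'question': 'Which number is prime?',
                  'A': '4', 'B': '6', 'C': '7', 'D': '9',
                  'hint': 'Exactly one option is prime.'})
options = {c: line[c] for c in string.ascii_uppercase
           if c in line and not pd.isna(line[c])}
prompt = f"Hint: {line['hint']}\n" if not pd.isna(line['hint']) else ''
prompt += f"Question: {line['question']}\n"
prompt += 'Options:\n' + ''.join(f'{k}. {v}\n' for k, v in options.items())
prompt += 'Please select the correct answer from the options above. \n'
print(prompt)
```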

View File

@@ -0,0 +1,9 @@
from .judge_util import build_judge, DEBUG_MESSAGE
from .multiple_choice import extract_answer_from_item, prefetch_answer
from .vqa_eval import levenshtein_distance
__all__ = [
'build_judge', 'extract_answer_from_item', 'prefetch_answer',
'levenshtein_distance', 'DEBUG_MESSAGE'
]

View File

@@ -0,0 +1,41 @@
import os
from ...api import OpenAIWrapper
from ...smp import load_env
INTERNAL = os.environ.get('INTERNAL', 0)
def build_judge(**kwargs):
model = kwargs.pop('model', None)
kwargs.pop('nproc', None)
load_env()
LOCAL_LLM = os.environ.get('LOCAL_LLM', None)
if LOCAL_LLM is None:
model_map = {
'gpt-4-turbo': 'gpt-4-1106-preview',
'gpt-4-0613': 'gpt-4-0613',
'gpt-4-0125': 'gpt-4-0125-preview',
'gpt-4-0409': 'gpt-4-turbo-2024-04-09',
'chatgpt-1106': 'gpt-3.5-turbo-1106',
'chatgpt-0125': 'gpt-3.5-turbo-0125',
'gpt-4o': 'gpt-4o-2024-05-13',
'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
}
model_version = model_map[model]
else:
model_version = LOCAL_LLM
model = OpenAIWrapper(model_version, **kwargs)
return model
DEBUG_MESSAGE = """
To debug the OpenAI API, you can try the following script in Python:
```python
from vlmeval.api import OpenAIWrapper
model = OpenAIWrapper('gpt-4-1106-preview', verbose=True)
msgs = [dict(type='text', value='Hello!')]
code, answer, resp = model.generate_inner(msgs)
print(code, answer, resp)
```
You can see the specific error if the API call fails.
"""

View File

@@ -0,0 +1,65 @@
import numpy as np
import pandas as pd
from ...smp import *
rule_dict = {
'llava_bench_conv': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}, # noqa: E501
'llava_bench_detail': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}, # noqa: E501
'llava_bench_complex': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'} # noqa: E501
}
def get_eval(judge, content):
return judge.generate(content)
def parse_score(review):
logger = get_logger('Evaluation')
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
logger.error(f'Failed to parse two scores from review: {review}')
return [-1, -1]
except Exception as e:
logger.error(f'{e}: failed to parse review: {review}')
return [-1, -1]
def build_prompt(line):
cap_str = line['caption']
question = line['question']
ans1 = line['gpt4_ans']
ans2 = line['prediction']
category = 'llava_bench_' + line['category']
rule = rule_dict[category]
role, prompt = rule['role'], rule['prompt']
content = (f'[Context]\n{cap_str}\n\n'
f'[Question]\n{question}\n\n'
f'[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
return content
def LLaVABench_atomeval(model, prompt):
review = get_eval(model, prompt)
scores = parse_score(review)
return scores
def LLaVABench_score(data):
cates = ['overall'] + list(set(data['category']))
ret = defaultdict(list)
for c in cates:
ret['split'].append(c)
sub = data[data['category'] == c] if c != 'overall' else data
ret['Relative Score (main)'].append(np.mean(sub['score']) / np.mean(sub['gpt4_score']) * 100)
ret['VLM Score'].append(np.mean(sub['score']) * 10)
ret['GPT4 Score'].append(np.mean(sub['gpt4_score']) * 10)
return pd.DataFrame(ret)
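To make the parsing contract of `parse_score` concrete, here is a self-contained restatement of the same rule (without the logger): the first line of the judge's review must contain exactly two comma- or space-separated numbers, otherwise the sentinel `[-1, -1]` is returned and later counted as invalid.

```python
def first_line_scores(review: str):
    # Same policy as parse_score above, minus logging.
    tokens = review.split('\n')[0].replace(',', ' ').split()
    try:
        return [float(tokens[0]), float(tokens[1])] if len(tokens) == 2 else [-1, -1]
    except (ValueError, IndexError):
        return [-1, -1]

print(first_line_scores('8 9\nAssistant 2 is more detailed and accurate.'))  # [8.0, 9.0]
print(first_line_scores('Both answers look reasonable.'))                    # [-1, -1]
```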

View File

@@ -0,0 +1,170 @@
from ...smp import *
from ...utils import can_infer
try:
from latex2sympy2 import latex2sympy
except ImportError:
print('Please install latex2sympy2 by running "pip install latex2sympy2"')
FAIL_MSG = 'Failed to obtain answer via API.'
def is_equal(asw: str, gt_asw: str) -> bool:
if not isinstance(asw, str) or not isinstance(gt_asw, str):
print('Warning: input is not string')
print(asw, gt_asw)
asw = str(asw).lower().strip()
gt_asw = str(gt_asw).lower().strip()
if gt_asw == asw:
return True
try:
a = eval(gt_asw)
b = eval(asw)
if abs(a - b) < 1e-6:
return True
except:
pass
try:
a = latex2sympy(gt_asw)
b = latex2sympy(asw)
if abs(eval(str(a)) - eval(str(b))) < 1e-6:
return True
if abs(a - b) < 1e-6:
return True
except:
pass
return False
def get_gpt4_ICE():
example_1 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Which number is missing?\n
Model response: The number missing in the sequence is 14.\n
Extracted answer: 14
"""
example_2 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: What is the fraction of females facing the camera?\n
Model response: The fraction of females facing the camera is 0.6,
which means that six out of ten females in the group are facing the camera.\n
Extracted answer: 0.6
"""
example_3 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
Extracted answer: 1.45
"""
example_4 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Between which two years does the line graph saw its maximum peak?\n
Model response: The line graph saw its maximum peak between 2007 and 2008.\n
Extracted answer: [2007, 2008]
"""
example_5 = """
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
Question: What fraction of the shape is blue?\n
Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
Model response: The correct answer is (B) 8/11.\n
Extracted answer: B
"""
return [example_1, example_2, example_3, example_4, example_5]
def build_mathv_gpt4_prompt(line):
task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example + '\n'
prompt += question + '\n'
prompt += 'Model response: ' + prediction
prompt += 'Extracted answer:'
return prompt
def list_to_dict(lst):
return {chr(65 + i): val for i, val in enumerate(lst)}
def post_check(line, prefetch=False):
res = None
ans = line['answer']
response = line['prediction'] if prefetch else line['res']
try:
if len(eval(line['choices'])) > 0:
ans = line['answer']
choices = list_to_dict(eval(line['choices']))
res = can_infer(response, choices)
if prefetch:
return res
else:
res = str(response)
ans = str(ans)
except ValueError:
pass
if is_equal(res, ans):
return res if prefetch else True
else:
return False
def MATH_V_auxeval(model, line):
prompt = build_mathv_gpt4_prompt(line)
log = ''
retry = 5
if post_check(line, prefetch=True):
res = post_check(line, prefetch=True)
return dict(log='Prefetch succeed', res=res)
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log=log, res=res)
log += 'All 5 retries failed.\n'
return dict(log=log, res='')
def MATH_V_acc(result_file):
data = load(result_file)
tot = defaultdict(lambda: 0)
fetch = defaultdict(lambda: 0)
hit = defaultdict(lambda: 0)
lt = len(data)
for i in range(lt):
item = data.iloc[i]
cate = item['category']
tot['Overall'] += 1
tot[cate] += 1
if item['log'] == 'Prefetch succeed':
fetch['Overall'] += 1
fetch[cate] += 1
if post_check(item, prefetch=False):
hit['Overall'] += 1
hit[cate] += 1
res = defaultdict(list)
for k in tot.keys():
res['Subject'].append(k)
res['tot'].append(tot[k])
res['prefetch'].append(fetch[k])
res['hit'].append(hit[k])
res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
res['acc'].append(hit[k] / tot[k] * 100)
res = pd.DataFrame(res).sort_values('Subject', ignore_index=True)
return res
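A self-contained illustration of the matching policy that `is_equal` implements: exact lower-cased string match first, then numeric comparison with a 1e-6 tolerance (the symbolic branch via `latex2sympy` and the `eval` fallback are omitted here, so fraction strings like '1/2' are not resolved). The helper name is mine.

```python
def loosely_equal(pred, gt, tol=1e-6) -> bool:
    pred, gt = str(pred).lower().strip(), str(gt).lower().strip()
    if pred == gt:
        return True
    try:
        return abs(float(pred) - float(gt)) < tol
    except ValueError:
        return False

print(loosely_equal('3.0000000', '3'))  # True: numeric tolerance
print(loosely_equal('1/2', '0.5'))      # False: no eval()/symbolic fallback in this sketch
```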

View File

@@ -0,0 +1,164 @@
from ...smp import *
from ...utils import can_infer
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_ICE():
example_1 = """
Hint: Please answer the question requiring an integer answer and provide the final value,
e.g., 1, 2, 3, at the end.\n
Question: Which number is missing?\n
Model response: The number missing in the sequence is 14.\n
Extracted answer: 14
"""
example_2 = """
Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value,
e.g., 1.2, 1.3, 1.4, at the end.\n
Question: What is the fraction of females facing the camera?\n
Model response: The fraction of females facing the camera is 0.6,
which means that six out of ten females in the group are facing the camera.\n
Extracted answer: 0.6
"""
example_3 = """
Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value,
e.g., 1.23, 1.34, 1.45, at the end.\n
Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
Extracted answer: 1.45
"""
example_4 = """
Hint: Please answer the question requiring a Python list as an answer and provide the final list,
e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.\n
Question: Between which two years does the line graph saw its maximum peak?\n
Model response: The line graph saw its maximum peak between 2007 and 2008.\n
Extracted answer: [2007, 2008]
"""
example_5 = """
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
Question: What fraction of the shape is blue?\n
Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
Model response: The correct answer is (B) 8/11.\n
Extracted answer: B
"""
return [example_1, example_2, example_3, example_4, example_5]
def build_mathvista_gpt4_prompt(line):
task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example + '\n'
prompt += question + '\n'
prompt += 'Model response: ' + prediction
prompt += 'Extracted answer:'
return prompt
def list_to_dict(lst):
return {chr(65 + i): val for i, val in enumerate(lst)}
def post_check(line, prefetch=False):
res = None
ans = line['answer']
response = line['prediction'] if prefetch else line['res']
try:
if line['question_type'] == 'multi_choice':
ans = line['answer_option']
choices = list_to_dict(eval(line['choices']))
res = can_infer(response, choices)
if prefetch:
return res
else:
if line['answer_type'] == 'integer':
res = int(response)
ans = int(line['answer'])
elif line['answer_type'] == 'float':
res = float(response)
ans = float(line['answer'])
else:
res = str(response)
ans = str(ans)
except ValueError:
pass
if res == ans:
return res if prefetch else True
else:
return False
def MathVista_auxeval(model, line):
prompt = build_mathvista_gpt4_prompt(line)
log = ''
retry = 5
if post_check(line, prefetch=True):
res = post_check(line, prefetch=True)
return dict(log='Prefetch succeed', res=res)
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log=log, res=res)
log += 'All 5 retries failed.\n'
return dict(log=log, res='')
def MathVista_acc(result_file):
data = load(result_file)
tot = defaultdict(lambda: 0)
fetch = defaultdict(lambda: 0)
hit = defaultdict(lambda: 0)
lt = len(data)
skill_list = []
for i in range(lt):
item = data.iloc[i]
cate = item['task']
tot['Overall'] += 1
try:
skills = eval(item['skills'])
except SyntaxError:
skills = [item['skills']]
for skill in skills:
if skill not in skill_list:
skill_list.append(skill)
tot[skill] += 1
tot[cate] += 1
if item['log'] == 'Prefetch succeed':
fetch['Overall'] += 1
fetch[cate] += 1
for skill in skills:
fetch[skill] += 1
if post_check(item, prefetch=False):
hit['Overall'] += 1
hit[cate] += 1
for skill in skills:
hit[skill] += 1
res = defaultdict(list)
for k in tot.keys():
res['Task&Skill'].append(k)
res['tot'].append(tot[k])
res['prefetch'].append(fetch[k])
res['hit'].append(hit[k])
res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
res['acc'].append(hit[k] / tot[k] * 100)
res = pd.DataFrame(res)
return res
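The retry loop in `MathVista_auxeval` raises the sampling temperature on every attempt and gives up after five failures; below is a stripped-down version of that control flow with a stubbed judge (`FakeJudge` is purely illustrative).

```python
FAIL_MSG = 'Failed to obtain answer via API.'

class FakeJudge:
    # Stand-in for the OpenAI wrapper: fails twice, then answers.
    def __init__(self):
        self.calls = 0

    def generate(self, prompt, temperature=0.0):
        self.calls += 1
        return FAIL_MSG if self.calls < 3 else 'B'

def extract_with_retries(judge, prompt, retry=5):
    log = ''
    for i in range(retry):
        res = judge.generate(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            log += f'Try {i}: failed to parse.\n'
        else:
            return dict(log=log + 'Succeed', res=res)
    return dict(log=log + f'All {retry} retries failed.\n', res='')

print(extract_with_retries(FakeJudge(), 'dummy prompt'))  # succeeds on the third try
```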

View File

@@ -0,0 +1,70 @@
from ...smp import *
import numpy as np
FAIL_MSG = 'Failed to obtain answer via API.'
system_prompt = """
As an AI assistant, your task is to evaluate a candidate answer in comparison to a given correct answer.
The question itself, the correct 'groundtruth' answer, and the candidate answer will be provided to you.
Your assessment should range from 0 to 3, \
based solely on the semantic similarity between the groundtruth and the candidate answer, \
disregarding any grammatical differences.
A rating of 0 suggests no similarity, implying the candidate answer is entirely incorrect.
A rating of 1 suggests low similarity, meaning the candidate answer is largely incorrect.
A rating of 2 suggests high similarity, meaning the candidate answer is largely correct.
Lastly, a rating of 3 indicates complete similarity, which means the candidate answer is entirely correct.
Your response should be a single integer from 0, 1, 2, or 3.
"""
MMV_DIMENSIONS = {
'CP': ['Video Topic', 'Video Emotion', 'Video Scene', 'Video Style'],
'FP-S': ['OCR', 'Object Recognition', 'Attribute Recognition', 'Event Recognition', 'Human Motion', 'Counting'],
'FP-C': ['Spatial Relationship', 'Human-object Interaction', 'Human Interaction'],
'HL': ['Hallucination'],
'LR': ['Structuralized Image-Text Understanding', 'Mathematical Calculation'],
'AR': ['Physical Property', 'Function Reasoning', 'Identity Reasoning'],
'RR': ['Natural Relation', 'Physical Relation', 'Social Relation'],
'CSR': ['Common Sense Reasoning'],
'TR': ['Counterfactual Reasoning', 'Causal Reasoning', 'Future Prediction'],
}
L3_DIMS = []
for k, v in MMV_DIMENSIONS.items():
L3_DIMS.extend(v)
MMV_DIMENSIONS['Perception'] = []
MMV_DIMENSIONS['Reasoning'] = []
MMV_DIMENSIONS['Overall'] = []
for k in ['CP', 'FP-C', 'FP-S', 'HL']:
MMV_DIMENSIONS['Perception'].extend(MMV_DIMENSIONS[k])
MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
for k in ['LR', 'AR', 'RR', 'CSR', 'TR']:
MMV_DIMENSIONS['Reasoning'].extend(MMV_DIMENSIONS[k])
MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
def get_dimension_rating(data_path):
data = load(data_path)
coarse_rating = {k: [] for k in MMV_DIMENSIONS}
fine_rating = {k: [] for k in L3_DIMS}
for i in range(len(data)):
cate = data.iloc[i]['dimensions']
cates = eval(cate)
for c in cates:
fine_rating[c].append(data.iloc[i]['score'])
for d in MMV_DIMENSIONS:
if np.any([x in MMV_DIMENSIONS[d] for x in cates]):
coarse_rating[d].append(data.iloc[i]['score'])
coarse_all = {k: f'{np.mean([max(x, 0) for x in v]):.2f}' for k, v in coarse_rating.items()}
coarse_valid = {k: f'{np.mean([x for x in v if x >= 0]):.2f}' for k, v in coarse_rating.items()}
fine_all = {k: f'{np.mean([max(x, 0) for x in v]):.2f}' for k, v in fine_rating.items()}
fine_valid = {k: f'{np.mean([x for x in v if x >= 0]):.2f}' for k, v in fine_rating.items()}
return dict(coarse_all=coarse_all, coarse_valid=coarse_valid, fine_all=fine_all, fine_valid=fine_valid)
def build_prompt(item):
tmpl = 'Question: {}\nGroundtruth answer: {}\nCandidate answer: {}\nYour response: '
return tmpl.format(item['question'], item['answer'], item['prediction'])
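For reference, the message sent to the 0-3 judge is just the system prompt above followed by this filled template; a hypothetical record renders as:

```python
tmpl = 'Question: {}\nGroundtruth answer: {}\nCandidate answer: {}\nYour response: '
item = dict(question='What sport is being played?',          # hypothetical record
            answer='Basketball',
            prediction='A group of people playing basketball indoors.')
print(tmpl.format(item['question'], item['answer'], item['prediction']))
```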

View File

@@ -0,0 +1,126 @@
from ...smp import *
meta_prompt = """
You are an assistant skilled at evaluating the quality of creative text.
Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to \
the user question displayed below. You'll need to assess the response on the following dimensions: \
Creativity, Richness, Visual Perception, Logical Coherence, Answer Accuracy and Image Relationship Understanding. \
We will provide you with a creative question and the AI model's response and a reference answer for your evaluation. \
As you begin your assessment, follow this process:
1. Evaluate the AI model's answers on different dimensions, pointing out its strengths or weaknesses \
in each dimension and assigning a score of 1 to 10 for each.
2. Finally, based on the assessments across dimensions, \
provide an overall score of 1 to 10 for the AI model's response.
3. Your scoring should be as stringent as possible and follow the scoring rules below:
In general, the higher the quality of the model's response and its strict adherence to user needs, \
the higher the score. Responses that do not meet user needs will receive lower scores.
Scoring rules:
Creativity:
Scores 1-2 when there is no innovation or uniqueness in the content.
Scores 3-4 when providing partially original content but with low creative quality.
Scores 5-6 when mostly creative but lacks significant novelty, with moderate quality.
Scores 7-8 when having novelty and high-quality content.
Scores 9-10 when highly novel and of exceptional quality compared to the reference answer.
Richness:
Scores 1-2 when lacking depth and breadth, with very limited information.
Scores 3-4 when limited in depth and breadth, with fewer explanations and examples, showing low diversity.
Scores 5-6 when limited in depth and breadth but provides basic necessary information.
Scores 7-8 when providing depth and useful additional information.
Scores 9-10 when providing exceptional depth, breadth, and high diversity compared to the reference answer.
Visual Perception:
Scores 1-2 when the description of the visual information in the image contains errors or \
is significantly inconsistent with the content of the image.
Scores 3-4 when the description of the visual information in the image reflects only a small amount \
of the image's information and contains some errors.
Scores 5-6 when the description of the visual information in the image includes the basic information \
of the image but contains minimal information.
Scores 7-8 when the description of the visual information in the image matches the image well and is rich in content, \
providing a substantial amount of information about the image.
Scores 9-10 when the description of the visual information in the image not only matches the image \
but also is more detailed and informative compared to the reference answer, providing more information about the image.
Logical Coherence:
Scores 1-2 when entirely incoherent, lacking any logic, and not matching the question or known information.
Scores 3-4 when somewhat coherent but with many logical errors or inconsistencies.
Scores 5-6 when mostly coherent, with few errors, but may struggle to maintain complete coherence in complex situations.
Scores 7-8 when excellent logical handling, very few errors.
Scores 9-10 when flawless logic, impeccable in handling complexity, \
and significantly higher logical coherence compared to the reference answer.
Answer Accuracy:
Scores 1-2 when the answer is significantly inconsistent with the question or contains obvious errors.
Scores 3-4 when the answer is partially correct but contains some errors or is incomplete.
Scores 5-6 when the answer is basically correct but lacks details or is not sufficiently detailed.
Scores 7-8 when the answer is accurate and detailed, fully corresponding to the question.
Scores 9-10 when the answer is not only accurate and detailed but also provides additional useful information, \
exceeding expectations.
Image Relationship Understanding:
Scores 1-2 when there are significant errors or confusion in distinguishing and describing different images, \
unable to correctly identify and relate the content of the images.
Scores 3-4 when the description of different images reflects only minimal distinguishing information, \
contains some errors and confusion, and fails to clearly differentiate and relate the images.
Scores 5-6 when the description of different images includes basic distinguishing information, \
is able to correctly identify and relate the images in a basic manner, \
but the information provided is minimal and lacks detail.
Scores 7-8 when the description of different images is accurate and detailed, \
clearly distinguishing and relating the images, \
with rich content that points out the main commonalities and differences between the images.
Scores 9-10 when the description of different images is not only accurate and detailed but also \
provides richer information and analysis, clearly distinguishing and relating the images, \
more comprehensively pointing out the commonalities and differences \
between the images compared to the reference answer.
Overall Score:
Scores 1-2 when irrelevant to the question, factually incorrect, or generates harmful content.
Scores 3-4 when no serious errors, mostly harmless, but of low quality and does not meet requirements.
Scores 5-6 when basically meeting requirements but performing poorly in some dimensions, with moderate quality.
Scores 7-8 when performing well in all dimensions.
Scores 9-10 when fully addressing user questions and all requirements, significantly surpassing the reference answer.
Please remember, you must evaluate and explain before scoring. After your explanation for each dimension, \
add the score for that dimension. Finally, at the end of your response, \
in the format of the dictionary (including brackets), return all your scoring results, \
ensuring your scores are integers:
{'Dimension One': Score, 'Dimension Two': Score, ..., 'Overall Score': Score}, \
for example: {'Creativity': 9, 'Richness': 6, ..., 'Overall Score': 7}.\n
"""
question_begin_prompt = '[Question]'
reference_begin_prompt = '[The Start of Reference Answer]'
reference_end_prompt = '[The End of Reference Answer]'
answers_begin_prompt = '[The Start of Assistants Answer]'
answers_end_prompt = '[The End of Assistants Answer]'
def mmdu_score(model, line):
question = eval(line['question'])
gt = eval(line['answer'])
prediction = eval(line['prediction'])
DIMS = [
'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
]
all_result_dict = []
logs = []
for j in range(len(question)):
try:
prompt = meta_prompt + question_begin_prompt + '\n' + question[j] + '\n\n' + \
reference_begin_prompt + '\n' + gt[j] + '\n' + reference_end_prompt + '\n\n' + \
answers_begin_prompt + '\n' + prediction[j] + '\n' + answers_end_prompt
response = model.generate(prompt)
start_index = response.find('{')
end_index = response.rfind('}') + 1
dictionary_str = response[start_index: end_index]
result_dict = eval(dictionary_str)
all_result_dict.append(result_dict)
if all([x in result_dict for x in DIMS]):
logs.append('Succeed')
else:
logs.append(
f'Following Dims are not in results of turn {j}: '
f'{",".join([x for x in DIMS if x not in result_dict])}'
)
except Exception as e:
print(e)
all_result_dict.append({d: None for d in DIMS})
logs.append(str(e))
df = pd.DataFrame(all_result_dict)
return dict(res=df, log='\n'.join(logs))
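`mmdu_score` recovers the per-dimension scores by slicing the judge's reply from the first '{' to the last '}' and evaluating that slice; here is a standalone illustration of the same extraction (using `ast.literal_eval` instead of bare `eval` in this sketch):

```python
import ast

response = ("The answer is detailed and mostly accurate.\n"
            "{'Creativity': 7, 'Richness': 6, 'Overall Score': 7}")
start, end = response.find('{'), response.rfind('}') + 1
scores = ast.literal_eval(response[start:end])
print(scores['Overall Score'])  # 7
```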

View File

@@ -0,0 +1,106 @@
from ...smp import *
def build_mmvet_gpt4_prompt(line):
question = line['question']
gt = str(line['answer'])
prediction = str(line['prediction'])
prompt = """
Compare the ground truth and prediction from AI models, to give a correctness score for the prediction.
<AND> in the ground truth means it is totally right
only when all elements in the ground truth are present in the prediction,
and <OR> means it is totally right when any one element in the ground truth is present in the prediction.
The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
Just complete the last space of the correctness score.
Question | Ground truth | Prediction | Correctness
--- | --- | --- | ---
What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0
What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5
What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5
What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5
What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0
Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
because the names of these countries do not accurately represent their landscapes. |
The meme talks about Iceland and Greenland. It's pointing out that despite their names,
Iceland is not very icy and Greenland isn't very green. | 0.4
Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
because the names of these countries do not accurately represent their landscapes. |
The meme is using humor to point out the misleading nature of Iceland's and Greenland's names.
Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow.
The text 'This is why I have trust issues' is a playful way to suggest
that these contradictions can lead to distrust or confusion.
The humor in this meme is derived from the unexpected contrast between the names of the countries
and their actual physical characteristics. | 1.0
"""
gpt4_prompt = prompt + '\n' + ' | '.join(
[question, gt.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> '), prediction, ''])
return gpt4_prompt
def MMVet_auxeval(model, line):
def float_cvt(s):
try:
return float(s)
except ValueError:
return None
prompt = build_mmvet_gpt4_prompt(line)
log = ''
retry = 5
for i in range(retry):
output = model.generate(prompt, temperature=i * 0.5)
score = float_cvt(output)
if score is None:
log += f'Try {i}: output is {output}, failed to parse.\n'
elif score < 0 or score > 1:
log += f'Try {i}: output is {output}, invalid score: {score}.\n'
else:
log += 'Succeed'
return dict(log=log, score=score)
log += 'All 5 retries failed.\n'
return dict(log=log, score=0.0)
def MMVet_acc(result_file):
data = load(result_file)
tot = defaultdict(lambda: 0)
score = defaultdict(lambda: 0)
lt = len(data)
cate2_list = []
for i in range(lt):
item = data.iloc[i]
cate = item['category']
cate2 = cate.replace(',', '_')
if cate2 not in cate2_list:
cate2_list.append(cate2)
grade = float(item['score'])
cate_list = ['rec', 'ocr', 'know', 'gen', 'spat', 'math']
for capa in cate_list:
if capa in cate:
tot[capa] += 1
score[capa] += grade
tot['Overall'] += 1
tot[cate2] += 1
score['Overall'] += grade
score[cate2] += grade
res = defaultdict(list)
res2 = defaultdict(list)
cate_list.append('Overall')
cate2_list.append('Overall')
for k in cate_list:
res['Category'].append(k)
res['tot'].append(tot[k])
res['acc'].append(score[k] / tot[k] * 100)
for v in cate2_list:
res2['Category'].append(v)
res2['tot'].append(tot[v])
res2['acc'].append(score[v] / tot[v] * 100)
res = pd.DataFrame(res)
res2 = pd.DataFrame(res2)
return res, res2
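MM-Vet tags each sample with a comma-joined capability string; `MMVet_acc` credits the judge score to every listed capability (and, separately, to the exact combination, omitted in this sketch). A toy run of that bucketing with hypothetical rows:

```python
from collections import defaultdict

rows = [('rec,know', 1.0), ('ocr,math', 0.5)]   # (category, judge score)
cate_list = ['rec', 'ocr', 'know', 'gen', 'spat', 'math']
tot, score = defaultdict(int), defaultdict(float)
for cate, grade in rows:
    for capa in cate_list:
        if capa in cate:
            tot[capa] += 1
            score[capa] += grade
    tot['Overall'] += 1
    score['Overall'] += grade
print({k: round(score[k] / tot[k] * 100, 1) for k in tot})
# {'rec': 100.0, 'know': 100.0, 'Overall': 75.0, 'ocr': 50.0, 'math': 50.0}
```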

View File

@@ -0,0 +1,442 @@
import pandas as pd
from ...utils import can_infer, track_progress_rich
from ...smp import *
import numpy as np
MMB_abbrs = {
'coarse_perception': 'CP',
'finegrained_perception (instance-level)': 'FP-S',
'finegrained_perception (cross-instance)': 'FP-C',
'logic_reasoning': 'LR',
'relation_reasoning': 'RR',
'attribute_reasoning': 'AR'
}
MMT_abbrs = {
'visual_recognition': 'VR',
'localization': 'Loc',
'ocr': 'OCR',
'counting': 'Count',
'hallucination': 'HLN',
'image_retrieval': 'IR',
'threed': '3D',
'visual_captioning': 'VC',
'visual_grounding': 'VG',
'doc_understanding': 'DU',
'action_recognition': 'AR',
'pixel_level_perception': 'PLP',
'image-to-image_translation': 'I2IT',
'relation_reasoning': 'RR',
'intelligence_quotient_test': 'IQT',
'emotion': 'Emo',
'visual_illusion': 'VI',
'meme_understanding': 'MemU',
'visual_prompt_understanding': 'VPU',
'anomaly_detection': 'AND',
'keypoint_detection': 'KD',
'visual_commonsense_reasoning': 'VCR',
'image_evaluation_judgement': 'IEJ',
'multiple_image_analysis': 'MIA',
'cross_image_matching': 'CIM',
'temporal_understanding': 'TU',
'visual_code': 'VP',
'medical_understanding': 'MedU',
'autonomous_driving': 'AUD',
'discipline_knowledge_reasoning': 'DKR',
'embodied_ai': 'EA',
'gui_navigation': 'GN'
}
def MMMU_preproc(data):
logger = get_logger('Evaluation')
cnt = 0
As, Bs, Ans = list(data['A']), list(data['B']), list(data['answer'])
lt = len(data)
for i in range(lt):
if pd.isna(As[i]):
As[i] = Ans[i]
Bs[i] = 'Other Answers'
cnt += 1
logger.info(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones. ')
data['A'] = As
data['B'] = Bs
return data
def report_acc(df):
# assert group in [None, 'category', 'l2-category']
res = defaultdict(list)
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
for group in [None, 'l2-category', 'category']:
if group is None:
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
elif group not in df:
continue
else:
abilities = list(set(df[group]))
abilities.sort()
for ab in abilities:
ab_name = MMB_abbrs[ab] if ab in MMB_abbrs else ab
sub_df = df[df[group] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
return pd.DataFrame(res)
def report_acc_MMT(df):
# assert group in [None, 'category', 'l2-category']
res = defaultdict(list)
res['split'] = list()
res['Overall'] = list()
for _, name in MMT_abbrs.items():
res[name] = list()
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
for group in [None, 'category', 'l2-category']:
if group is None:
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
res['Overall'].extend([np.mean(df['hit'])])
elif group not in df:
continue
elif group == 'category':
abilities = list(set(df[group]))
abilities.sort()
for ab in abilities:
ab_name = ab
sub_df = df[df[group] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
res[ab_name].extend([np.mean(sub_df['hit'])])
else:
abilities = list(set(df[group]))
abilities.sort()
for ab in abilities:
sub_task_name_list = df[df['l2-category'] == ab]['category'].unique()
sub_task_acc = []
for sub_task_name in sub_task_name_list:
sub_df = df[df['category'] == sub_task_name]
sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']])
new_acc = []
for i in range(len(sub_task_acc[0])):
new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc]))
ab_name = MMT_abbrs[ab] if ab in MMT_abbrs else ab
res[ab_name] = new_acc
sub_task_acc = []
for sub_task_name in sub_task_name_list:
sub_df = df[df['category'] == sub_task_name]
sub_task_acc.append([np.mean(sub_df['hit'])])
new_acc = []
for i in range(len(sub_task_acc[0])):
new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc]))
res[ab_name].extend(new_acc)
res['split'].append('ALL')
return pd.DataFrame(res)
def build_prompt(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match '
'an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meaning of all options is significantly different from the answer, output Z. '
'You should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
'Example 1: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: a cute teddy bear\nYour output: A\n'
'Example 2: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: Spider\nYour output: Z\n'
'Example 3: \n'
'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
)
return tmpl.format(question, options, prediction)
def build_prompt_blink(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
"If the answer says things like refuse to answer, I'm sorry cannot help, etc., output Z."
'If the meaning of all options is significantly different from the answer, '
'or the answer does not select any option, output Z. '
'You should output one of the choices, A, B, C, D (if they are valid options), or Z.\n'
'Example 1: \n'
'Question: Which point is closer to the camera?\nSelect from the following choices.\n'
'Options: A. Point A\nB. Point B\n(Z) Failed\n'
'Answer: Point B, where the child is sitting, is closer to the camera.\nYour output: (B)\n'
'Example 2: \n'
'Question: Which point is closer to the camera?\nSelect from the following choices.\n'
'Options: (A) Point A\n(B) Point B\n(Z) Failed\n'
"Answer: I'm sorry, but I can't assist with that request.\nYour output: (Z)\n"
'Example 3: \n'
'Question: Which point is corresponding to the reference point?\nSelect from the following choices.\n'
'Options: (A) Point A\n(B) Point B\n(Z) Failed\n'
'Answer:The reference point (REF) on the first image is at the tip of the pot, '
'which is the part used to Poke if the pots were used for that action. Looking at the second image, '
'we need to find the part of the object that would correspond to poking.\n'
"(A) Point A is at the tip of the spoon's handle, which is not used for poking.\n"
'(B) Point B is at the bottom of the spoon, which is not used for poking.\n'
'(C) Point C is on the side of the spoon, which is not used for poking.\n'
'(D) Point D is at the tip of the spoon, which is not used for poking.\n'
'\nTherefore, there is no correct answer in the choices\nYour output: (Z)\n'
'Example 4: \n'
'Question: {}?\nOptions: {}\n(Z) Failed\nAnswer: {}\nYour output: '
)
return tmpl.format(question, options, prediction)
def build_prompt_cn(question, options, prediction):
tmpl = (
'你是一个帮助我匹配答案与单选题中多个选项的 AI 助手。'
'你会被提供:一个问题,多个选项,一个答案。你的任务是找到与答案意义最相近的选项。'
'如果所有选项的意义都与答案显著不同,则输出 Z。'
'你应该输出一个单个的大写字母,例如 A, B, C, D如果它们是有效选项或 Z。'
'例 1:'
'问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 一只可爱的泰迪熊\n输出: A\n'
'例 2: \n'
'问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 蜘蛛\n输出: Z\n'
'例 3: \n'
'问题: {}?\n选项: {}\n答案: {}\n输出: '
)
return tmpl.format(question, options, prediction)
def build_choices(item):
ret = {}
for ch in string.ascii_uppercase:
if ch in item and (not pd.isna(item[ch])):
ret[ch] = item[ch]
return ret
def prefetch_answer(item):
choices = build_choices(item)
return can_infer(item['prediction'], choices)
def extract_answer_from_item(model, item, dataset_name=None):
logger = get_logger('Evaluation')
# Returns a dict: {'opt': matched option letter, 'log': matching log}
choices = build_choices(item)
option_str = build_option_str(choices)
if dataset_name == 'BLINK':
prompt = build_prompt_blink(item['question'], option_str, item['prediction'])
elif cn_string(item['question']):
prompt = build_prompt_cn(item['question'], option_str, item['prediction'])
else:
prompt = build_prompt(item['question'], option_str, item['prediction'])
retry = 3
ret = can_infer(item['prediction'], choices)
if ret:
return dict(opt=ret, log=item['prediction'])
if model is None:
return dict(opt='Z', log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
while retry:
ans = model.generate(prompt)
if 'Failed to obtain answer via API' in ans:
logger.warning('GPT API failed to answer. ')
else:
ret = can_infer(ans, choices)
if ret:
return dict(opt=ret, log=ans)
else:
logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
retry -= 1
if retry == 0:
options = list(choices) + ['Z'] if 'Z' not in choices else []
return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ')
# For Circular Evaluation
def prefetch_circular_group(sub_data, verbose=False):
lt = len(sub_data)
GT, PRED = [], []
for i in range(lt):
item = sub_data.iloc[i]
GT.append(item['GT'])
PRED.append(prefetch_answer(item))
if PRED[-1] and (GT[-1] != PRED[-1]):
log = (
f'Failed in Prefetching Rolling {i}: Answer is {GT[-1]}, '
f"Prediction is {item['prediction']}, Pre-fetched is {PRED[-1]}. "
)
return dict(hit=0, log=log)
flag = True
for g, p in zip(GT, PRED):
if g != p:
flag = False
ret = (dict(hit=1, log='Succeed During Pre-fetching'), ) if flag else (None, )
ret = ret + (GT, PRED) if verbose else ret
return ret if len(ret) > 1 else ret[0]
def eval_vanilla(model, item, dataset_name=None):
res = extract_answer_from_item(model, item, dataset_name=dataset_name)
opt, match_log = res['opt'], res['log']
if opt == item['GT']:
return dict(hit=1, log=f'Match Log: {match_log}. ')
else:
return dict(hit=0, log=f'Match Log: {match_log}. ')
# For Circular Evaluation
def eval_circular_group(model, sub_data, dataset_name=None):
res, GT, PRED = prefetch_circular_group(sub_data, verbose=True)
if res is not None:
return res
lt = len(sub_data)
log = ''
for i in range(lt):
if PRED[i]:
log += f'Rolling {i} Matched.\n'
else:
res = extract_answer_from_item(model, sub_data.iloc[i], dataset_name=dataset_name)
opt, match_log = res['opt'], res['log']
PRED[i] = opt
if PRED[i] != GT[i]:
log += (
f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; "
f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n'
)
return dict(hit=0, log=log)
else:
log += (
f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, "
f'Pre-fetched is {PRED[i]}.\n'
)
return dict(hit=1, log=log)
# data, meta are pd.DataFrame, result_file is a path
def mcq_vanilla_eval(model, data, meta, nproc, result_file, dataset_name=None):
result = {}
if osp.exists(result_file):
result = load(result_file)
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
if 'MMMU' in dataset_name:
data = MMMU_preproc(data)
answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()}
data = data[data['index'].isin(answer_map)]
data['GT'] = [answer_map[idx] for idx in data['index']]
items = []
for i in range(len(data)):
# Dealing with the normal part
item = data.iloc[i]
if item['index'] not in result:
items.append(item)
tups = [dict(model=model, item=x, dataset_name=dataset_name) for x in items]
keys = [x['index'] for x in items]
if len(tups):
res = track_progress_rich(eval_vanilla, tups, nproc=nproc, chunksize=nproc, save=result_file, keys=keys)
result = load(result_file)
for k, v in zip(keys, res):
if k in result:
assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
else:
result[k] = v
data['hit'] = [result[i]['hit'] for i in data['index']]
data['log'] = [result[i]['log'] for i in data['index']]
if 'GT' in data:
data.pop('GT')
return data
# data, meta are pd.DataFrame, result_file is a path
def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
result = {}
if osp.exists(result_file):
result = load(result_file)
# Build Answer Map
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
for idx in list(meta['index']) + list(data['index']):
assert istype(idx, int)
# Only keep those lines in the meta data
data = data[data['index'].isin(answer_map)]
data['GT'] = [answer_map[idx] for idx in data['index']]
data_main = data[data['index'] < int(1e6)]
data_groups = []
for i in range(len(data_main)):
# Dealing with the normal part
idx = data_main.iloc[i]['index']
if idx not in result:
sub_data = data[data['index'] % int(1e6) == idx]
data_groups.append(sub_data)
if len(data_groups):
prefetched = [prefetch_circular_group(g, verbose=False) for g in data_groups]
remain = []
for dg, pf in zip(data_groups, prefetched):
if pf is not None:
result[dg.iloc[0]['index'] % 1e6] = pf
else:
remain.append(dg)
dump(result, result_file)
tups = [dict(model=model, sub_data=x, dataset_name=dataset_name) for x in remain]
keys = [x.iloc[0]['index'] % 1e6 for x in remain]
if len(tups) == 0:
pass
elif model is None:
logger = get_logger('Evaluation')
logger.warning('Exact Matching mode, will not do GPT-based answer matching. ')
for k in keys:
result[k] = dict(
hit=0, log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
else:
res = track_progress_rich(
eval_circular_group,
tups,
nproc=nproc,
chunksize=nproc,
save=result_file,
keys=keys)
result = load(result_file)
for k, v in zip(keys, res):
if k in result:
assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
else:
result[k] = v
tmp_pth = f'/tmp/{timestr()}.xlsx'
dump(data_main, tmp_pth)
data_main = load(tmp_pth)
indices = data_main['index']
data_main['hit'] = [result[i]['hit'] for i in indices]
data_main['log'] = [result[i]['log'] for i in indices]
if 'GT' in data_main:
data_main.pop('GT')
return data_main
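The candidate map that all the answer matching above operates on is produced by `build_choices`: every single-letter uppercase column that is present and non-NaN becomes an option. A small standalone check with a hypothetical record:

```python
import string
import pandas as pd

item = pd.Series({'index': 7, 'question': 'Pick the prime number.',
                  'A': '4', 'B': '7', 'C': None,
                  'prediction': 'The answer is B.'})
choices = {ch: item[ch] for ch in string.ascii_uppercase
           if ch in item and not pd.isna(item[ch])}
print(choices)  # {'A': '4', 'B': '7'} -- 'C' is dropped because it is NaN
```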

View File

@@ -0,0 +1,450 @@
from ...smp import *
from PIL import Image, ImageOps
import torchvision
import random
import numbers
import math
import torch
def get_dimension_rating(data_path):
data = load(data_path)
result_board = {}
for idx, item in data.iterrows():
if item['task_type'] not in result_board:
result_board[item['task_type']] = [0, 0]
result_board[item['task_type']][1] += 1
if item['score']:
result_board[item['task_type']][0] += 1
correct = 0
total = 0
for key, value in result_board.items():
correct += value[0]
total += value[1]
result_board[key].append(f'{value[0] / value[1] * 100 :.2f}%')
result_board['overall'] = [correct, total, f'{correct / total * 100 :.2f}%']
return result_board
def check_ans(pred, gt):
flag = False
pred_list = pred.lower().split(' ')
pred_option, _ = pred_list[0], ' '.join(pred_list[1:])
gt_list = gt.lower().split(' ')
gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
if gt_content[-1] == '.':
gt_content = gt_content[:-1]
if pred_option.replace('.', '') in gt_option:
flag = True
elif gt_option in pred_option:
flag = True
return flag
class GroupRandomCrop(object):
def __init__(self, size):
if isinstance(size, numbers.Number):
self.size = (int(size), int(size))
else:
self.size = size
def __call__(self, img_group):
w, h = img_group[0].size
th, tw = self.size
out_images = list()
x1 = random.randint(0, w - tw)
y1 = random.randint(0, h - th)
for img in img_group:
assert (img.size[0] == w and img.size[1] == h)
if w == tw and h == th:
out_images.append(img)
else:
out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
return out_images
class MultiGroupRandomCrop(object):
def __init__(self, size, groups=1):
if isinstance(size, numbers.Number):
self.size = (int(size), int(size))
else:
self.size = size
self.groups = groups
def __call__(self, img_group):
w, h = img_group[0].size
th, tw = self.size
out_images = list()
for i in range(self.groups):
x1 = random.randint(0, w - tw)
y1 = random.randint(0, h - th)
for img in img_group:
assert (img.size[0] == w and img.size[1] == h)
if w == tw and h == th:
out_images.append(img)
else:
out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
return out_images
class GroupCenterCrop(object):
def __init__(self, size):
self.worker = torchvision.transforms.CenterCrop(size)
def __call__(self, img_group):
return [self.worker(img) for img in img_group]
class GroupRandomHorizontalFlip(object):
"""Randomly horizontally flips the given PIL.Image with a probability of 0.5
"""
def __init__(self, is_flow=False):
self.is_flow = is_flow
def __call__(self, img_group, is_flow=False):
v = random.random()
if v < 0.5:
ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
if self.is_flow:
for i in range(0, len(ret), 2):
# invert flow pixel values when flipping
ret[i] = ImageOps.invert(ret[i])
return ret
else:
return img_group
class GroupNormalize(object):
def __init__(self, mean, std):
self.mean = mean
self.std = std
def __call__(self, tensor):
rep_mean = self.mean * (tensor.size()[0] // len(self.mean))
rep_std = self.std * (tensor.size()[0] // len(self.std))
# TODO: make efficient
for t, m, s in zip(tensor, rep_mean, rep_std):
t.sub_(m).div_(s)
return tensor
class GroupScale(object):
""" Rescales the input PIL.Image to the given 'size'.
'size' will be the size of the smaller edge.
For example, if height > width, then image will be
rescaled to (size * height / width, size)
size: size of the smaller edge
interpolation: Default: PIL.Image.BILINEAR
"""
def __init__(self, size, interpolation=Image.BILINEAR):
self.worker = torchvision.transforms.Resize(size, interpolation)
def __call__(self, img_group):
return [self.worker(img) for img in img_group]
class GroupOverSample(object):
def __init__(self, crop_size, scale_size=None, flip=True):
self.crop_size = crop_size if not isinstance(
crop_size, int) else (crop_size, crop_size)
if scale_size is not None:
self.scale_worker = GroupScale(scale_size)
else:
self.scale_worker = None
self.flip = flip
def __call__(self, img_group):
if self.scale_worker is not None:
img_group = self.scale_worker(img_group)
image_w, image_h = img_group[0].size
crop_w, crop_h = self.crop_size
offsets = GroupMultiScaleCrop.fill_fix_offset(
False, image_w, image_h, crop_w, crop_h)
oversample_group = list()
for o_w, o_h in offsets:
normal_group = list()
flip_group = list()
for i, img in enumerate(img_group):
crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
normal_group.append(crop)
flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
if img.mode == 'L' and i % 2 == 0:
flip_group.append(ImageOps.invert(flip_crop))
else:
flip_group.append(flip_crop)
oversample_group.extend(normal_group)
if self.flip:
oversample_group.extend(flip_group)
return oversample_group
class GroupFullResSample(object):
def __init__(self, crop_size, scale_size=None, flip=True):
self.crop_size = crop_size if not isinstance(
crop_size, int) else (crop_size, crop_size)
if scale_size is not None:
self.scale_worker = GroupScale(scale_size)
else:
self.scale_worker = None
self.flip = flip
def __call__(self, img_group):
if self.scale_worker is not None:
img_group = self.scale_worker(img_group)
image_w, image_h = img_group[0].size
crop_w, crop_h = self.crop_size
w_step = (image_w - crop_w) // 4
h_step = (image_h - crop_h) // 4
offsets = list()
offsets.append((0 * w_step, 2 * h_step)) # left
offsets.append((4 * w_step, 2 * h_step)) # right
offsets.append((2 * w_step, 2 * h_step)) # center
oversample_group = list()
for o_w, o_h in offsets:
normal_group = list()
flip_group = list()
for i, img in enumerate(img_group):
crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
normal_group.append(crop)
if self.flip:
flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
if img.mode == 'L' and i % 2 == 0:
flip_group.append(ImageOps.invert(flip_crop))
else:
flip_group.append(flip_crop)
oversample_group.extend(normal_group)
oversample_group.extend(flip_group)
return oversample_group
class GroupMultiScaleCrop(object):
def __init__(self, input_size, scales=None, max_distort=1,
fix_crop=True, more_fix_crop=True):
self.scales = scales if scales is not None else [1, .875, .75, .66]
self.max_distort = max_distort
self.fix_crop = fix_crop
self.more_fix_crop = more_fix_crop
self.input_size = input_size if not isinstance(input_size, int) else [
input_size, input_size]
self.interpolation = Image.BILINEAR
def __call__(self, img_group):
im_size = img_group[0].size
crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
crop_img_group = [
img.crop(
(offset_w,
offset_h,
offset_w + crop_w,
offset_h + crop_h)) for img in img_group]
ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation)
for img in crop_img_group]
return ret_img_group
def _sample_crop_size(self, im_size):
image_w, image_h = im_size[0], im_size[1]
# find a crop size
base_size = min(image_w, image_h)
crop_sizes = [int(base_size * x) for x in self.scales]
crop_h = [
self.input_size[1] if abs(
x - self.input_size[1]) < 3 else x for x in crop_sizes]
crop_w = [
self.input_size[0] if abs(
x - self.input_size[0]) < 3 else x for x in crop_sizes]
pairs = []
for i, h in enumerate(crop_h):
for j, w in enumerate(crop_w):
if abs(i - j) <= self.max_distort:
pairs.append((w, h))
crop_pair = random.choice(pairs)
if not self.fix_crop:
w_offset = random.randint(0, image_w - crop_pair[0])
h_offset = random.randint(0, image_h - crop_pair[1])
else:
w_offset, h_offset = self._sample_fix_offset(
image_w, image_h, crop_pair[0], crop_pair[1])
return crop_pair[0], crop_pair[1], w_offset, h_offset
def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
offsets = self.fill_fix_offset(
self.more_fix_crop, image_w, image_h, crop_w, crop_h)
return random.choice(offsets)
@staticmethod
def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
w_step = (image_w - crop_w) // 4
h_step = (image_h - crop_h) // 4
ret = list()
ret.append((0, 0)) # upper left
ret.append((4 * w_step, 0)) # upper right
ret.append((0, 4 * h_step)) # lower left
ret.append((4 * w_step, 4 * h_step)) # lower right
ret.append((2 * w_step, 2 * h_step)) # center
if more_fix_crop:
ret.append((0, 2 * h_step)) # center left
ret.append((4 * w_step, 2 * h_step)) # center right
ret.append((2 * w_step, 4 * h_step)) # lower center
ret.append((2 * w_step, 0 * h_step)) # upper center
ret.append((1 * w_step, 1 * h_step)) # upper left quarter
ret.append((3 * w_step, 1 * h_step)) # upper right quarter
ret.append((1 * w_step, 3 * h_step)) # lower left quarter
ret.append((3 * w_step, 3 * h_step)) # lower right quarter
return ret
class GroupRandomSizedCrop(object):
"""Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size
and and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio
This is popularly used to train the Inception networks
size: size of the smaller edge
interpolation: Default: PIL.Image.BILINEAR
"""
def __init__(self, size, interpolation=Image.BILINEAR):
self.size = size
self.interpolation = interpolation
def __call__(self, img_group):
for attempt in range(10):
area = img_group[0].size[0] * img_group[0].size[1]
target_area = random.uniform(0.08, 1.0) * area
aspect_ratio = random.uniform(3. / 4, 4. / 3)
w = int(round(math.sqrt(target_area * aspect_ratio)))
h = int(round(math.sqrt(target_area / aspect_ratio)))
if random.random() < 0.5:
w, h = h, w
if w <= img_group[0].size[0] and h <= img_group[0].size[1]:
x1 = random.randint(0, img_group[0].size[0] - w)
y1 = random.randint(0, img_group[0].size[1] - h)
found = True
break
else:
found = False
x1 = 0
y1 = 0
if found:
out_group = list()
for img in img_group:
img = img.crop((x1, y1, x1 + w, y1 + h))
assert (img.size == (w, h))
out_group.append(
img.resize(
(self.size, self.size), self.interpolation))
return out_group
else:
# Fallback
scale = GroupScale(self.size, interpolation=self.interpolation)
crop = GroupRandomCrop(self.size)
return crop(scale(img_group))
class ConvertDataFormat(object):
def __init__(self, model_type):
self.model_type = model_type
def __call__(self, images):
if self.model_type == '2D':
return images
tc, h, w = images.size()
t = tc // 3
images = images.view(t, 3, h, w)
images = images.permute(1, 0, 2, 3)
return images
class Stack(object):
def __init__(self, roll=False):
self.roll = roll
def __call__(self, img_group):
if img_group[0].mode == 'L':
return np.concatenate([np.expand_dims(x, 2)
for x in img_group], axis=2)
elif img_group[0].mode == 'RGB':
if self.roll:
return np.concatenate([np.array(x)[:, :, ::-1]
for x in img_group], axis=2)
else:
# print(np.concatenate(img_group, axis=2).shape)
# print(img_group[0].shape)
return np.concatenate(img_group, axis=2)
class ToTorchFormatTensor(object):
""" Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """
def __init__(self, div=True):
self.div = div
def __call__(self, pic):
if isinstance(pic, np.ndarray):
# handle numpy array
img = torch.from_numpy(pic).permute(2, 0, 1).contiguous()
else:
# handle PIL Image
img = torch.ByteTensor(
torch.ByteStorage.from_buffer(
pic.tobytes()))
img = img.view(pic.size[1], pic.size[0], len(pic.mode))
# put it from HWC to CHW format
# yikes, this transpose takes 80% of the loading time/CPU
img = img.transpose(0, 1).transpose(0, 2).contiguous()
return img.float().div(255) if self.div else img.float()
class IdentityTransform(object):
def __call__(self, data):
return data
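These group transforms are meant to be chained with `torchvision.transforms.Compose` over a list of PIL frames, ending in a stacked float tensor. Below is a minimal sketch on random frames; the `from ... import` line assumes this file lives at `vlmeval/dataset/utils/mvbench.py` (module path inferred, not confirmed by the diff).

```python
import numpy as np
import torchvision
from PIL import Image

# Classes defined in the file above; adjust the import path to where it actually lives.
from vlmeval.dataset.utils.mvbench import (
    GroupScale, GroupCenterCrop, Stack, ToTorchFormatTensor, GroupNormalize)

frames = [Image.fromarray(np.uint8(np.random.rand(240, 320, 3) * 255)) for _ in range(8)]
transform = torchvision.transforms.Compose([
    GroupScale(256),                     # shorter edge -> 256
    GroupCenterCrop(224),
    Stack(roll=False),                   # concatenate RGB frames along the channel axis
    ToTorchFormatTensor(div=True),       # HWC uint8 -> CHW float in [0, 1]
    GroupNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
clip = transform(frames)
print(clip.shape)  # torch.Size([24, 224, 224]): 8 frames x 3 channels stacked on dim 0
```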

View File

@@ -0,0 +1,65 @@
from ...smp import *
def OCRBench_eval(eval_file):
OCRBench_score = {
'Regular Text Recognition': 0,
'Irregular Text Recognition': 0,
'Artistic Text Recognition': 0,
'Handwriting Recognition': 0,
'Digit String Recognition': 0,
'Non-Semantic Text Recognition': 0,
'Scene Text-centric VQA': 0,
'Doc-oriented VQA': 0,
'Key Information Extraction': 0,
'Handwritten Mathematical Expression Recognition': 0
}
logger = get_logger('Evaluation')
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
for i in tqdm(range(len(lines))):
line = lines[i]
predict = str(line['prediction'])
answers = eval(line['answer'])
category = line['category']
if category == 'Handwritten Mathematical Expression Recognition':
for j in range(len(answers)):
answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
predict = predict.strip().replace('\n', ' ').replace(' ', '')
if answer in predict:
OCRBench_score[category] += 1
break
else:
for j in range(len(answers)):
answer = answers[j].lower().strip().replace('\n', ' ')
predict = predict.lower().strip().replace('\n', ' ')
if answer in predict:
OCRBench_score[category] += 1
break
final_score_dict = {}
final_score_dict['Text Recognition'] = (
OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
+ OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
+ OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition']
)
final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
final_score_dict['Handwritten Mathematical Expression Recognition'] = \
OCRBench_score['Handwritten Mathematical Expression Recognition']
final_score_dict['Final Score'] = (
final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
+ final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
+ final_score_dict['Handwritten Mathematical Expression Recognition']
)
final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
for key, value in final_score_dict.items():
logger.info('{}:{}'.format(key, value))
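# Usage sketch (hypothetical file name):
#   OCRBench_eval('MiniCPM-V_OCRBench.xlsx')
# writes 'MiniCPM-V_OCRBench_score.json' next to the prediction file. 'Final Score'
# counts correct samples over OCRBench's 1,000 test questions, and 'Final Score Norm'
# divides by 10 to map it onto a 0-100 scale.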

View File

@@ -0,0 +1,140 @@
from ...smp import *
import numpy as np
import re
FAIL_MSG = 'Failed to obtain answer via API.'
DURATIONS = [
'short',
'medium',
'long',
]
DOMAINS = [
'Knowledge',
'Film & Television',
'Sports Competition',
'Artistic Performance',
'Life Record',
'Multilingual'
]
SUB_CATEGORIES = [
'Humanity & History',
'Literature & Art',
'Biology & Medicine',
'Finance & Commerce',
'Astronomy',
'Geography',
'Law',
'Life Tip',
'Technology',
'Animation',
'Movie & TV Show',
'Documentary',
'News Report',
'Esports',
'Basketball',
'Football',
'Athletics',
'Other Sports',
'Stage Play',
'Magic Show',
'Variety Show',
'Acrobatics',
'Handicraft',
'Food',
'Fashion',
'Daily Life',
'Travel',
'Pet & Animal',
'Exercise',
'Multilingual'
]
TASK_CATEGORIES = [
'Temporal Perception',
'Spatial Perception',
'Attribute Perception',
'Action Recognition',
'Object Recognition',
'OCR Problems',
'Counting Problem',
'Temporal Reasoning',
'Spatial Reasoning',
'Action Reasoning',
'Object Reasoning',
'Information Synopsis',
]
def get_dimension_rating(data_path):
data = load(data_path)
duration_rating = {k: {} for k in DURATIONS}
for duration in DURATIONS + ['overall']:
duration_rating[duration] = {
'overall': '',
'domain': {k: [] for k in DOMAINS},
'sub_category': {k: [] for k in SUB_CATEGORIES},
'task_type': {k: [] for k in TASK_CATEGORIES}
}
for i in range(len(data)):
domain = data.iloc[i]['domain']
sub_ctg = data.iloc[i]['sub_category']
task_ctg = data.iloc[i]['task_type']
duration = data.iloc[i]['duration']
duration_rating[duration]['domain'][domain].append(data.iloc[i]['score'])
duration_rating[duration]['sub_category'][sub_ctg].append(data.iloc[i]['score'])
duration_rating[duration]['task_type'][task_ctg].append(data.iloc[i]['score'])
duration_rating['overall']['domain'][domain].append(data.iloc[i]['score'])
duration_rating['overall']['sub_category'][sub_ctg].append(data.iloc[i]['score'])
duration_rating['overall']['task_type'][task_ctg].append(data.iloc[i]['score'])
for duration in DURATIONS + ['overall']:
overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["domain"].values(), []) if x >= 0]):.2f}'
duration_rating[duration]['overall'] = overall_res_dur
for domain in DOMAINS:
domain_res_dur = f'{np.mean([x for x in duration_rating[duration]["domain"][domain] if x >= 0]):.2f}'
duration_rating[duration]['domain'][domain] = domain_res_dur
for sub_ctg in SUB_CATEGORIES:
sub_res_dur = f'{np.mean([x for x in duration_rating[duration]["sub_category"][sub_ctg] if x >= 0]):.2f}'
duration_rating[duration]['sub_category'][sub_ctg] = sub_res_dur
for task_ctg in TASK_CATEGORIES:
task_res_dur = f'{np.mean([x for x in duration_rating[duration]["task_type"][task_ctg] if x >= 0]):.2f}'
duration_rating[duration]['task_type'][task_ctg] = task_res_dur
return duration_rating
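# get_dimension_rating returns nested mean scores formatted as strings, e.g.
# {'short': {'overall': '0.62', 'domain': {...}, 'sub_category': {...}, 'task_type': {...}}, ...};
# predictions scored -1 (no parsable option letter) are excluded from every mean.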
def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
'The best option is',
'The correct option is',
'Best answer:',
'Best option:',
'Answer:',
'Option:',
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCD]', s):
return ''
matches = re.search(r'[ABCD]', s)
if matches is None:
return ''
return matches[0]
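# Example: extract_characters_regex('The best answer is (B) cat.') -> 'B'; a long
# free-form reply containing none of the capital letters A-D returns '' and is
# later scored as -1 in VideoMME.evaluate.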

View File

@@ -0,0 +1,285 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Partly adapted from https://github.com/GT-Vision-Lab/VQA
# Copyright (c) 2014, Aishwarya Agrawal
from ...smp import *
from typing import Optional
def _process_digit_article(inText):
outText = []
tempText = inText.lower().split()
articles = ['a', 'an', 'the']
manualMap = {
'none': '0',
'zero': '0',
'one': '1',
'two': '2',
'three': '3',
'four': '4',
'five': '5',
'six': '6',
'seven': '7',
'eight': '8',
'nine': '9',
'ten': '10',
}
contractions = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
for word in tempText:
word = manualMap.get(word, word)
if word not in articles:
outText.append(word)
for wordId, word in enumerate(outText):
if word in contractions:
outText[wordId] = contractions[word]
outText = ' '.join(outText)
return outText
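# Example: _process_digit_article('The Two dogs') -> '2 dogs' (articles dropped,
# number words mapped to digits, known contractions normalized).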
def hit_calculate(result, dataset_name, anls_threshold=0.5):
if listinstr(['TextVQA'], dataset_name):
return [np.mean(x['match']) for x in result]
elif listinstr(['DocVQA', 'InfoVQA'], dataset_name):
return [0.0 if 1 - np.min(x['match']) < anls_threshold else 1 - np.min(x['match']) for x in result]
elif listinstr(['ChartQA', 'OCRVQA'], dataset_name):
return [np.max(x['match']) for x in result]
else: # default using vqa_score to calculate score
return [np.mean(x['match']) for x in result]
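# hit_calculate aggregates the per-line 'match' lists: mean match (VQA score) for
# TextVQA and the default branch, thresholded 1 - ANLS distance for DocVQA/InfoVQA,
# and the maximum per-answer match for ChartQA/OCRVQA.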
# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
def relaxed_correctness(target: str,
prediction: str,
max_relative_change: float = 0.05) -> bool:
"""Calculates relaxed correctness.
The correctness tolerates certain error ratio defined by max_relative_change.
See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
“Following Methani et al. (2020), we use a relaxed accuracy measure for the
numeric answers to allow a minor inaccuracy that may result from the automatic
data extraction process. We consider an answer to be correct if it is within
5% of the gold answer. For non-numeric answers, we still need an exact match
to consider an answer to be correct.”
Args:
target: Target string.
prediction: Predicted string.
max_relative_change: Maximum relative change.
Returns:
Whether the prediction was correct given the specified tolerance.
"""
def _to_float(text: str) -> Optional[float]:
try:
if text.endswith('%'):
# Convert percentages to floats.
return float(text.rstrip('%')) / 100.0
else:
return float(text)
except ValueError:
return None
prediction = str(prediction)
target = str(target)
prediction_float = _to_float(prediction)
target_float = _to_float(target)
if prediction_float is not None and target_float is not None:
relative_change = abs(prediction_float - target_float) / abs(target_float)
return relative_change <= max_relative_change
else:
return prediction.lower() == target.lower()
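# Example: relaxed_correctness('0.30', '31%') is True (31% parses to 0.31, a ~3.3%
# relative change, within the 5% tolerance); non-numeric answers fall back to a
# case-insensitive exact match.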
def levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2 + 1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
def anls_compute(groundtruth, prediction):
gt_answer = ' '.join(groundtruth.strip().lower().split())
det_answer = ' '.join(prediction.strip().lower().split())
dist = levenshtein_distance(gt_answer, det_answer)
length = max(len(groundtruth.upper()), len(prediction.upper()))
values = 0.0 if length == 0 else float(dist) / float(length)
return values
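# Note: anls_compute returns a normalized edit distance in [0, 1] (0 means an exact
# match); hit_calculate above turns it into an ANLS score via 1 - distance with the
# 0.5 threshold.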
def process_answer(answer):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = process_punctuation(answer)
answer = _process_digit_article(answer)
return answer
def process_line(line, method='vqa_score'):
ret = {}
if istype(line['answer'], list):
answers = eval(line['answer'])
else:
answers = [line['answer']]
if method == 'vqa_score':
ret['gt'] = [process_answer(x) for x in answers]
ret['pred'] = process_answer(line['prediction'])
ret['match'] = []
for current_idx, gtAnsDatum in enumerate(ret['gt']):
otherGTAns = [
item for ret_gt_idx, item in enumerate(ret['gt'])
if ret_gt_idx != current_idx
]
matchingAns = [
item for item in otherGTAns if item == ret['pred']
]
acc = min(1, float(len(matchingAns)) / 3)
ret['match'].append(acc)
elif method == 'anls':
ret['gt'] = answers
ret['pred'] = line['prediction']
ret['match'] = [anls_compute(x, ret['pred']) for x in ret['gt']]
elif method == 'relaxed_accuracy':
ret['gt'] = answers
ret['pred'] = line['prediction'].strip()
ret['match'] = [relaxed_correctness(ret['pred'], x) for x in ret['gt']]
elif method == 'accuracy':
ret['gt'] = answers
ret['pred'] = line['prediction'].strip()
ret['match'] = [(1.0 if (x.strip().lower() == ret['pred'].strip().lower()) else 0.0) for x in ret['gt']]
else: # default using vqa_score to calculate score
ret['gt'] = [process_answer(x) for x in answers]
ret['pred'] = process_answer(line['prediction'])
ret['match'] = [x == ret['pred'] for x in ret['gt']]
return ret
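# Note: with method='vqa_score', the prediction is scored against every leave-one-out
# subset of the annotations as min(1, #matching / 3), mirroring the official VQA
# accuracy metric; the per-line scores are averaged later in hit_calculate.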

View File

@@ -0,0 +1,203 @@
from ...smp import *
def MME_rating(data_file):
data = load(data_file)
stats = defaultdict(dict)
lt = len(data)
for i in range(lt):
item = data.iloc[i]
category = item['category']
image_path = item['image_path']
score = item['score']
if image_path not in stats[category]:
stats[category][image_path] = []
stats[category][image_path].append(score)
def acc(key, mode='normal'):
res = stats[key]
values = []
for val in res.values():
if mode == 'normal':
values.extend(val)
elif mode == 'plus':
values.append(val[0] * val[1])
return np.mean(values) * 100
scores = {}
for k in stats:
scores[k] = acc(k) + acc(k, 'plus')
super_cates = dict(
perception=[
'OCR', 'artwork', 'celebrity', 'color', 'count', 'existence',
'landmark', 'position', 'posters', 'scene'
],
reasoning=['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation']
)
ret = {}
for sc, cate_list in super_cates.items():
base = 0
for c in cate_list:
base += scores[c]
ret[sc] = base
ret.update(scores)
ret = d2df(ret)
return ret
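# Note: per category, MME_rating sums the standard accuracy ('normal') with the
# per-image accuracy ('plus', counted only when both questions about an image are
# answered correctly), each scaled to 0-100, so a perfect category scores 200;
# 'perception' and 'reasoning' are sums over their sub-categories.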
def Hallusion_rating(data_file):
def calc_fAcc(data):
res = defaultdict(list)
lt = len(data)
for i in range(lt):
line = data.iloc[i]
res[f"{line['l2-category']}_{line['set_id']}_{line['figure_id']}"].append(line['score'])
return np.mean([np.all(x) for x in res.values()]) * 100
def calc_qAcc(data):
res = defaultdict(list)
lt = len(data)
for i in range(lt):
line = data.iloc[i]
res[f"{line['l2-category']}_{line['set_id']}_{line['question_id']}"].append(line['score'])
return np.mean([np.all(x) for x in res.values()]) * 100
def calc_aAcc(data):
return np.mean(data['score']) * 100
data = load(data_file)
data['set_id'] = [x.split('_')[3] for x in data['index']]
data['figure_id'] = [x.split('_')[4] for x in data['index']]
data['question_id'] = [x.split('_')[5] for x in data['index']]
res = dict(split=[], aAcc=[], fAcc=[], qAcc=[])
res['split'].append('Overall')
res['aAcc'].append(calc_aAcc(data))
res['fAcc'].append(calc_fAcc(data))
res['qAcc'].append(calc_qAcc(data))
if 'category' in data:
cates = list(set(data['category']))
for c in cates:
sub = data[data['category'] == c]
res['split'].append(c)
res['aAcc'].append(calc_aAcc(sub))
res['fAcc'].append(calc_fAcc(sub))
res['qAcc'].append(calc_qAcc(sub))
if 'l2-category' in data:
cates = list(set(data['l2-category']))
for c in cates:
sub = data[data['l2-category'] == c]
res['split'].append(c)
res['aAcc'].append(calc_aAcc(sub))
res['fAcc'].append(calc_fAcc(sub))
res['qAcc'].append(calc_qAcc(sub))
ret = pd.DataFrame(res)
return ret
def POPE_rating(data_file):
def cal_f1_score(y_true, y_pred):
tp = sum((y_true == 1) & (y_pred == 1))
fp = sum((y_true == 0) & (y_pred == 1))
fn = sum((y_true == 1) & (y_pred == 0))
precision = tp / (tp + fp) if (tp + fp) != 0 else 0
recall = tp / (tp + fn) if (tp + fn) != 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
return f1_score, precision, recall
data = load(data_file)
data = data.assign(category=data['category'].str.split(',')).explode('category')
data['index'] = range(len(data))
res = dict(split=[], Overall=[], acc=[], precision=[], recall=[])
y_true = np.array([1 if i == 'Yes' else 0 for i in data['answer']])
y_pred = np.array([1 if i == 'Yes' else 0 for i in data['extracted']])
f1_score, precision, recall = cal_f1_score(y_true, y_pred)
res['split'].append('Overall')
res['Overall'].append(f1_score * 100)
res['acc'].append(np.mean(data['score']) * 100)
res['precision'].append(precision * 100)
res['recall'].append(recall * 100)
if 'category' in data:
cates = list(set(data['category']))
cates = [c for c in cates if not pd.isna(c)]
for c in cates:
sub = data[data['category'] == c]
y_true = np.array([1 if i == 'Yes' else 0 for i in sub['answer']])
y_pred = np.array([1 if i == 'Yes' else 0 for i in sub['extracted']])
f1_score, precision, recall = cal_f1_score(y_true, y_pred)
res['split'].append(c)
res['Overall'].append(f1_score * 100)
res['acc'].append(np.mean(sub['score']) * 100)
res['precision'].append(precision * 100)
res['recall'].append(recall * 100)
ret = pd.DataFrame(res)
return ret
def default_rating(data_file):
data = load(data_file)
res = {}
res['Overall'] = np.mean(data['score']) * 100
if 'category' in data:
cates = list(set(data['category']))
cates = [c for c in cates if not pd.isna(c)]
cates.sort()
for c in cates:
sub = data[data['category'] == c]
res[c] = np.mean(sub['score']) * 100
if 'l2-category' in data:
cates = list(set(data['l2-category']))
cates = [c for c in cates if not pd.isna(c)]
cates.sort()
for c in cates:
sub = data[data['l2-category'] == c]
res[c] = np.mean(sub['score']) * 100
ret = d2df(res)
return ret
def YOrN_match_prompt(line):
tmpl = (
'You are an AI assistant who will help me to match an answer with two options of a question. '
'The options are only Yes / No. '
'You are provided with a question and an answer, '
'and you need to find which option (Yes / No) is most similar to the answer. '
'If the meaning of all options are significantly different from the answer, output Unknown. '
'You should output a single word among the following 3 choices: Yes, No, Unknown.\n'
'Example 1: \n'
"Question: Is the word in this image 'Hello'?\nAnswer: The word in this image is 'Hello'.\nYour output: Yes\n"
'Example 2: \n'
"Question: Is the word in this image 'Hello'?\n"
"Answer: The word in this image is not 'Hello'.\nYour output: No\n"
'Example 3: \n'
'Question: {}?\nAnswer: {}\nYour output: '
)
return tmpl.format(line['question'], line['prediction'])
def YOrN_Extraction(output):
s = output.lower()
words = process_punctuation(s).split()
if 'yes' in words and 'no' not in words:
return 'Yes'
if 'yes' not in words and 'no' in words:
return 'No'
return 'Unknown'
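# Example: YOrN_Extraction('Yes, it is.') -> 'Yes'; YOrN_Extraction('Hard to tell.')
# -> 'Unknown' (matching is on the bare words 'yes'/'no' after punctuation removal).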
def YOrN_auxeval(model, line):
prompt = YOrN_match_prompt(line)
retry = 5
for i in range(retry):
output = model.generate(prompt, temperature=0.5 * i)
ans = YOrN_Extraction(output)
if ans != 'Unknown':
return ans
return 'Unknown'
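# YOrN_auxeval retries the judge model up to 5 times with increasing temperature
# (0.0, 0.5, ..., 2.0) and falls back to 'Unknown' if no Yes/No is ever extracted.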

View File

@@ -0,0 +1,332 @@
import uuid
from functools import partial
from .image_base import ImageBaseDataset
from ..smp import *
rouge = None
nlp_en = None
nlp_zh = None
nlp = None
def initialize():
import evaluate
import spacy
global rouge, nlp_en, nlp_zh, nlp
try:
rouge = evaluate.load('rouge', experiment_id=str(uuid.uuid4()))
except:
warnings.warn('Please first `pip install rouge_score`.')
try:
nlp_en = spacy.load('en_core_web_sm')
except:
warnings.warn('Will automatically download en_core_web_sm via spacy.')
spacy.cli.download('en_core_web_sm')
nlp_en = spacy.load('en_core_web_sm')
try:
nlp_zh = spacy.load('zh_core_web_sm')
except:
warnings.warn('Will automatically download zh_core_web_sm via spacy.')
spacy.cli.download('zh_core_web_sm')
nlp_zh = spacy.load('zh_core_web_sm')
nlp = {'en': nlp_en, 'zh': nlp_zh}
def rough_filter(answer_text):
if "I can't" in answer_text:
return False
elif 'I cannot' in answer_text:
return False
elif 'sorry' in answer_text.lower():
return False
if '无法' in answer_text:
return False
elif '抱歉' in answer_text:
return False
else:
return True
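# rough_filter screens out refusal-style answers ("I can't", "sorry", "无法", "抱歉")
# so they receive zero scores instead of being fuzzily matched against the blanks.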
def zero_template(crossed_text):
return {
'crossed_text': crossed_text,
'max_sim_val': 0,
'max_sim_string': '',
'precision': 0,
'recall': 0,
'f1': 0,
'jaccard': 0,
'rouge1': 0,
'exact_match': 0,
}
def tokenize(text, language):
"""
Tokenize the text and return the tokens.
Parameters:
text (str): The text to tokenize.
language (str): The language of the text.
Returns:
list: The list of tokens.
"""
assert language in ['en', 'zh']
nlp_language = nlp[language]
processed_text = nlp_language(text)
return [token.text for token in processed_text]
def find_best_match(needle, hay, language, rouge):
"""
Finds the best matching n-gram in the haystack for the given needle.
Parameters:
needle (str): The string to find.
hay (str): The text to search within.
Returns:
tuple: The highest similarity value and the best matching string.
"""
assert language in ['en', 'zh']
from nltk.util import ngrams
from difflib import SequenceMatcher as SM
tokens_hay = tokenize(hay, language)
tokens_needle = tokenize(needle, language)
splitter = '' if language == 'zh' else ' '
ngrams_ = ngrams(tokens_hay, len(tokens_needle))
max_sim_val = 0
max_sim_string = ''
max_sim_ngram = []
tokens_needle_set = set(tokens_needle)
ngrams_hasjoint = [
ngram
for ngram in ngrams_
if not set(ngram).isdisjoint(tokens_needle_set)
]
for ngram in ngrams_hasjoint:
hay_ngram = splitter.join(ngram)
similarity = SM(None, hay_ngram, needle).ratio()
if similarity > max_sim_val:
max_sim_val = similarity
max_sim_string = hay_ngram
max_sim_ngram = ngram
# Evaluate
if len(max_sim_ngram) == 0:
return {
'crossed_text': needle,
'max_sim_val': 0,
'max_sim_string': '',
'precision': 0,
'recall': 0,
'f1': 0,
'jaccard': 0,
'rouge1': 0,
'exact_match': 0,
}
pred_set = set(max_sim_ngram)
ref_set = set(tokens_needle)
correct_tokens = pred_set.intersection(ref_set)
len_correct_tokens = len(correct_tokens)
precision = len_correct_tokens / len(pred_set)
recall = len_correct_tokens / len(ref_set)
if (precision + recall) == 0:
f1 = 0
else:
f1 = 2 * precision * recall / (precision + recall)
union = pred_set.union(ref_set)
jaccard = len_correct_tokens / len(union) if len(union) > 0 else 0
rouge_1 = rouge.compute(
predictions=[max_sim_string],
references=[needle],
tokenizer=partial(tokenize, language=language),
rouge_types=['rouge1'],
)['rouge1']
exact_match = float(list(max_sim_ngram) == list(tokens_needle))
out = {
'crossed_text': needle,
'max_sim_string': max_sim_string,
'max_sim_val': max_sim_val,
'precision': precision,
'recall': recall,
'f1': f1,
'jaccard': jaccard,
'rouge1': rouge_1,
'exact_match': exact_match,
}
return out
def process_match_single_new(
image_id, prediction, answer, language, progress
):
"""
process the inference results for a single image and calculate the metrics
Parameters:
image_id (int): The image id (question id).
prediction (str): The prediction text.
answer (Union[str, List[str]]): The answer text, or a list of answer texts. The masked n-grams in the image.
language (str): The language of the text. Can be "en" or "zh".
rouge (rouge): The rouge metric object.
progress (multiprocessing.Queue): The progress queue.
Returns:
tuple: The image id (question_id, int) and the result per id (dict of dict of dict).
"""
result_per_id = {image_id: {}}
if isinstance(answer, str):
answer = eval(answer)
assert isinstance(answer, list)
result = prediction.split('Assistant: ')[-1]
for i, crossed_text in enumerate(answer):
if rough_filter(result):
find_best_match_result = find_best_match(
crossed_text, result, language, rouge
)
if i == 0:
result_per_id[image_id] = {str(i): find_best_match_result}
else:
result_per_id[image_id][str(i)] = find_best_match_result
else:
if i == 0:
result_per_id[image_id] = {str(i): zero_template(crossed_text)}
else:
result_per_id[image_id][str(i)] = zero_template(crossed_text)
progress.put(1)
return image_id, result_per_id
class VCRDataset(ImageBaseDataset):
TYPE = 'VQA'
URL_PREFIX = 'https://huggingface.co/datasets/vcr-org'
DATASET_URL = {
'VCR_EN_EASY_500': f'{URL_PREFIX}/VCR-wiki-en-easy-test-500/resolve/main/VCR-wiki-en-easy-test-500.tsv',
'VCR_EN_EASY_100': f'{URL_PREFIX}/VCR-wiki-en-easy-test-100/resolve/main/VCR-wiki-en-easy-test-100.tsv',
'VCR_EN_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-en-easy-test/resolve/main/VCR-wiki-en-easy-test.tsv',
'VCR_EN_HARD_500': f'{URL_PREFIX}/VCR-wiki-en-hard-test-500/resolve/main/VCR-wiki-en-hard-test-500.tsv',
'VCR_EN_HARD_100': f'{URL_PREFIX}/VCR-wiki-en-hard-test-100/resolve/main/VCR-wiki-en-hard-test-100.tsv',
'VCR_EN_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-en-hard-test/resolve/main/VCR-wiki-en-hard-test.tsv',
'VCR_ZH_EASY_500': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-500/resolve/main/VCR-wiki-zh-easy-test-500.tsv',
'VCR_ZH_EASY_100': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-100/resolve/main/VCR-wiki-zh-easy-test-100.tsv',
'VCR_ZH_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-zh-easy-test/resolve/main/VCR-wiki-zh-easy-test.tsv',
'VCR_ZH_HARD_500': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-500/resolve/main/VCR-wiki-zh-hard-test-500.tsv',
'VCR_ZH_HARD_100': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-100/resolve/main/VCR-wiki-zh-hard-test-100.tsv',
'VCR_ZH_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-zh-hard-test/resolve/main/VCR-wiki-zh-hard-test.tsv',
}
DATASET_MD5 = {
'VCR_EN_EASY_500': 'fd9258db52f8685dc710619a0ea0a261',
'VCR_EN_EASY_100': '9df5d7266683458621ecbe122beb72f0',
'VCR_EN_EASY_ALL': '8a9b96885f251d1c85f42f84073327f1',
'VCR_EN_HARD_500': '0a22a85080b6a1f52b1f95e302d43df4',
'VCR_EN_HARD_100': '1b20f5cbcbeae0b0bec77f7a36143958',
'VCR_EN_HARD_ALL': '2d8b8b1ee0eba0e0b618fd3aa7d9710e',
'VCR_ZH_EASY_500': 'beca5fd54176adf44cf94bd9b50cf048',
'VCR_ZH_EASY_100': '4a86a5678a79844d6d22ab0629c51cd5',
'VCR_ZH_EASY_ALL': '5050fe7f0027ad2068fd4c7f220edaea',
'VCR_ZH_HARD_500': '617e3360f75c54455625cb0a8da5c1e7',
'VCR_ZH_HARD_100': 'b0e38c85f5d5e63894a3b881c372a62b',
'VCR_ZH_HARD_ALL': '54bbfef448206518b03127ef8b61404c',
}
def __init__(self, dataset='VCR_EN_EASY_500', skip_noimg=True):
super().__init__(dataset, skip_noimg)
initialize()
self.language = 'en' if 'EN' in dataset else 'zh'
self.difficulty = 'easy' if 'EASY' in dataset else 'hard'
# def build_prompt(self, line):
# msgs = super().build_prompt(line)
# assert msgs[-1]['type'] == 'text'
# if self.language == 'zh':
# msgs[-1]['value'] += '图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。'
# else:
# msgs[-1]['value'] += ('What is the covered texts in the image? '
# 'Please restore the covered texts without outputting the explanations.')
# return msgs
def evaluate(self, eval_file, **judge_kwargs):
import multiprocessing
vcr_score_list = {'Exact_Match': [], 'Jaccard': []}
vcr_score = {'Exact_Match': 0, 'Jaccard': 0}
logger = get_logger('Evaluation')
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
pool = multiprocessing.Pool()
manager = multiprocessing.Manager()
progress_queue = manager.Queue()
results = []
overall_results = {str(image_id): {} for image_id in range(len(lines))}
for instance_id, instance in enumerate(lines):
results.append(
pool.apply_async(
process_match_single_new,
args=(
str(instance_id),
instance['prediction'],
instance['answer'],
self.language,
progress_queue,
),
)
)
pool.close()
# Display progress bar
for _ in tqdm(range(len(results))):
progress_queue.get()
pool.join()
# Merging results into overall_result
for result in results:
image_id, result_per_id = result.get()
overall_results[str(image_id)].update(result_per_id[image_id])
for blank_id_str in result_per_id[image_id].keys():
vcr_score_list['Exact_Match'].append(
result_per_id[image_id][blank_id_str]['exact_match']
)
vcr_score_list['Jaccard'].append(
result_per_id[image_id][blank_id_str]['jaccard']
)
vcr_score['Exact_Match'] = np.mean(vcr_score_list['Exact_Match'])
vcr_score['Jaccard'] = np.mean(vcr_score_list['Jaccard'])
results_out = {
k: v for i in range(len(results)) for k, v in results[i].get()[1].items()
}
results_with_metrics = {
'Exact_Match': vcr_score['Exact_Match'],
'Jaccard': vcr_score['Jaccard'],
'Predictions': results_out,
}
score_pth = eval_file.replace(
'.xlsx', f'{self.language}_{self.difficulty}_score.json'
)
dump(results_with_metrics, score_pth)
logger.info(
f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}'
)
logger.info('Score: ')
for key, value in vcr_score.items():
logger.info('{}:{}'.format(key, value))

View File

@@ -0,0 +1,87 @@
from abc import abstractmethod
from ..smp import *
class VideoBaseDataset:
MODALITY = 'VIDEO'
def __init__(self,
dataset='MMBench-Video',
pack=False):
try:
import decord
except:
warnings.warn('Please install decord via `pip install decord`.')
self.dataset_name = dataset
ret = self.prepare_dataset(dataset)
assert ret is not None
lmu_root = LMUDataRoot()
self.frame_root = osp.join(lmu_root, 'images', dataset)
os.makedirs(self.frame_root, exist_ok=True)
self.frame_tmpl = 'frame-{}-of-{}.jpg'
self.data_root = ret['root']
self.data_file = ret['data_file']
self.data = load(self.data_file)
assert 'question' in self.data and 'video' in self.data
videos = list(set(self.data['video']))
videos.sort()
self.videos = videos
self.pack = pack
def __len__(self):
return len(self.videos) if self.pack else len(self.data)
def __getitem__(self, idx):
if self.pack:
assert idx < len(self.videos)
sub_data = self.data[self.data['video'] == self.videos[idx]]
return sub_data
else:
assert idx < len(self.data)
return dict(self.data.iloc[idx])
def frame_paths(self, video, num_frames=8):
frame_root = osp.join(self.frame_root, video)
os.makedirs(frame_root, exist_ok=True)
return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
def save_video_frames(self, video, num_frames=8):
frame_paths = self.frame_paths(video, num_frames)
flag = np.all([osp.exists(p) for p in frame_paths])
if flag:
return frame_paths
vid_path = osp.join(self.data_root, video + '.mp4')
vid = decord.VideoReader(vid_path)
step_size = len(vid) / (num_frames + 1)
indices = [int(i * step_size) for i in range(1, num_frames + 1)]
images = [vid[i].numpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
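# Note: frames are sampled at indices i * len(vid) / (num_frames + 1) for
# i = 1..num_frames, i.e. evenly spaced while skipping the very start and end of
# the video; extracted JPEGs are cached under self.frame_root and reused.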
# Return a list of dataset names supported by this class; subclasses may override it.
@classmethod
def supported_datasets(cls):
return ['MMBench-Video', 'Video-MME', 'MVBench']
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
pass
@abstractmethod
def build_prompt(self, idx, num_frames=8):
pass
@abstractmethod
def prepare_dataset(self, dataset):
# The prepare_dataset function should return a dictionary containing:
# `root` (the directory containing the video files)
# `data_file` (the TSV dataset file)
pass

View File

@@ -0,0 +1,250 @@
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
FAIL_MSG = 'Failed to obtain answer via API.'
def unwrap_hf_pkl(pth, suffix='.mp4'):
base_dir = os.path.join(pth, 'video_pkl/')
target_dir = os.path.join(pth, 'video/')
pickle_files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]
pickle_files.sort()
if not os.path.exists(target_dir):
os.makedirs(target_dir, exist_ok=True)
for pickle_file in pickle_files:
with open(pickle_file, 'rb') as file:
video_data = pickle.load(file)
# For each video file in the pickle file, write its contents to a new mp4 file
for video_name, video_content in video_data.items():
output_path = os.path.join(target_dir, f'{video_name}{suffix}')
with open(output_path, 'wb') as output_file:
output_file.write(video_content)
print('The video file has been restored and stored from the pickle file.')
else:
print('The video file already exists.')
class VideoMME(VideoBaseDataset):
MD5 = '2f16cd40b1c125b67e661e59da2f6cd0'
SYS = ''
FRAMES_TMPL_NOSUB = """
These are the frames of a video. \
Select the best answer to the following multiple-choice question based on the video. \
Respond with only the letter (A, B, C, or D) of the correct option.
"""
FRAMES_TMPL_SUB = """
These are the frames of a video. \
This video's subtitles are listed below:
{}
Select the best answer to the following multiple-choice question based on the video. \
Respond with only the letter (A, B, C, or D) of the correct option.
"""
TYPE = 'MCQ'
def __init__(self, dataset='Video-MME', use_subtitle=False):
super().__init__(dataset=dataset)
self.use_subtitle = use_subtitle
@classmethod
def supported_datasets(cls):
return ['Video-MME']
def prepare_dataset(self, dataset_name='Video-MME', repo_id='lmms-lab/Video-MME'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data['video_path']:
if not osp.exists(osp.join(pth, video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def unzip_hf_zip(pth):
import zipfile
base_dir = pth
target_dir = os.path.join(pth, 'video/')
zip_files = [
os.path.join(base_dir, file) for file in os.listdir(base_dir)
if file.endswith('.zip') and file.startswith('video')
]
zip_files.sort()
if not os.path.exists(target_dir):
os.makedirs(target_dir, exist_ok=True)
for zip_file in zip_files:
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
for member in zip_ref.namelist():
# Check if the member is a file (not a directory)
if not member.endswith('/'):
# Extract the file to the specified directory
source = zip_ref.open(member)
target = open(os.path.join(target_dir, os.path.basename(member)), 'wb')
with source, target:
target.write(source.read())
print('The video file has been restored and stored from the zip file.')
else:
print('The video file already exists.')
subtitle_zip_file = os.path.join(base_dir, 'subtitle.zip')
subtitle_target_dir = os.path.join(base_dir, 'subtitle')
if not os.path.exists(subtitle_target_dir):
os.makedirs(subtitle_target_dir, exist_ok=True)
with zipfile.ZipFile(subtitle_zip_file, 'r') as zip_ref:
for member in zip_ref.namelist():
# Check if the member is a file (not a directory)
if not member.endswith('/'):
# Extract the file to the specified directory
source = zip_ref.open(member)
target = open(os.path.join(subtitle_target_dir, os.path.basename(member)), 'wb')
with source, target:
target.write(source.read())
print('The subtitle file has been restored and stored from the zip file.')
else:
print('The subtitle file already exists.')
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MD5:
return
data_file = pd.read_parquet(os.path.join(pth, 'videomme/test-00000-of-00001.parquet'))
data_file = data_file.assign(index=range(len(data_file)))
data_file['video'] = data_file['videoID']
data_file['video_path'] = data_file['videoID'].apply(lambda x: f'./video/{x}.mp4')
data_file['subtitle_path'] = data_file['videoID'].apply(lambda x: f'./subtitle/{x}.srt')
data_file['question'] += '\n' + data_file['options'].apply(lambda x: '\n'.join(x))
data_file = data_file[['index', 'video', 'video_path', 'duration', 'domain',
'sub_category', 'task_type', 'subtitle_path', 'question', 'answer']]
data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
unzip_hf_zip(dataset_path)
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(data_file=data_file, root=dataset_path)
def save_video_frames(self, video, num_frames=8):
vid_path = osp.join(self.data_root, 'video', video + '.mp4')
vid = decord.VideoReader(vid_path)
step_size = len(vid) / (num_frames + 1)
indices = [int(i * step_size) for i in range(1, num_frames + 1)]
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
frame_paths = self.frame_paths(video, num_frames)
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].numpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths, indices, video_info
def build_prompt(self, line, num_frames, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
frames, indices, video_info = self.save_video_frames(line['video'], num_frames)
if self.use_subtitle and os.path.exists(osp.join(self.data_root, line['subtitle_path'])):
import pysubs2
subs = pysubs2.load(osp.join(self.data_root, line['subtitle_path']), encoding='utf-8')
subtitles = []
for selected_frame_id in indices:
sub_text = ''
cur_time = pysubs2.make_time(fps=video_info['fps'], frames=selected_frame_id)
for sub in subs:
if sub.start < cur_time and sub.end > cur_time:
sub_text = sub.text.replace('\\N', ' ')
break
if sub_text.strip():
subtitles.append(sub_text)
subtitles = '\n'.join(subtitles)
else:
subtitles = ''
message = [dict(type='text', value=self.SYS)]
if video_llm:
message.append(dict(type='video', value=osp.join(self.data_root, 'video', line['video'] + '.mp4')))
else:
for im in frames:
message.append(dict(type='image', value=im))
text_prompt = self.FRAMES_TMPL_NOSUB if not self.use_subtitle else self.FRAMES_TMPL_SUB.format(subtitles)
message.append(dict(type='text', value=text_prompt))
prompt = 'Question: {}\nAnswer: '.format(line['question'])
message.append(dict(type='text', value=prompt))
return message
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.videomme import get_dimension_rating, extract_characters_regex
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
if extract_characters_regex(pred) == '':
data.loc[idx, 'score'] = -1
else:
data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating