from ..smp import *
from ..utils import *
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE


class ImageYORNDataset(ImageBaseDataset):
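    """Yes-or-No (Y/N) benchmark dataset: MME, HallusionBench, POPE, AMBER.

    Predictions are parsed into Yes / No / Unknown labels and scored
    against the ground-truth answers.
    """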

    TYPE = 'Y/N'

    DATASET_URL = {
        'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
        'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
        'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
        'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
    }

    DATASET_MD5 = {
        'MME': 'b36b43c3f09801f5d368627fb92187c3',
        'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
        'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
        'AMBER': '970d94c0410916166e0a76ba75da7934',
    }

    # It returns a DataFrame
    def evaluate(self, eval_file, **judge_kwargs):
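        """Score the predictions in `eval_file` and return a rating DataFrame.

        Yes/No answers are first extracted by rule-based matching; remaining
        'Unknown' predictions may be re-judged by an LLM judge configured via
        `judge_kwargs` (pass `model='exact_matching'` to skip the judge).
        """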
        from .utils.yorn import YOrN_Extraction, YOrN_auxeval
        from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating

        dataset = self.dataset_name
        data = load(eval_file)
        data['prediction'] = [str(x) for x in data['prediction']]
        storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

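        # Run answer extraction once; later calls reuse the aux-match file.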
        if not osp.exists(storage):
            ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
            # Recover partial judge results from an interrupted run, if any
            if osp.exists(tmp_file):
                tmp = load(tmp_file)
                for k in tmp:
                    if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
                        ans_map[k] = tmp[k]

            data['extracted'] = [ans_map[x] for x in data['index']]
            unknown = data[data['extracted'] == 'Unknown']

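            # Fall back to exact matching when no judge is configured or reachable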
            model = judge_kwargs.get('model', 'exact_matching')
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                model = None
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')

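            # Re-judge the still-Unknown rows with the judge, checkpointing to tmp_file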
            if model is not None:
                lt = len(unknown)
                lines = [unknown.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = list(unknown['index'])
                if len(tups):
                    res = track_progress_rich(
                        YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
                    for k, v in zip(indices, res):
                        ans_map[k] = v

            data['extracted'] = [ans_map[x] for x in data['index']]
            dump(data, storage)

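        # AMBER answers are compared case-insensitively; other datasets exactly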
        data = load(storage)
        if listinstr(['AMBER'], dataset):
            data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
        else:
            data['score'] = (data['answer'] == data['extracted'])
        dump(data, storage)

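        # Dispatch to the dataset-specific rating function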
        if dataset is not None and listinstr(['MME'], dataset):
            score = MME_rating(storage)
        elif dataset is not None and listinstr(['Hallusion'], dataset):
            score = Hallusion_rating(storage)
        elif dataset is not None and listinstr(['POPE'], dataset):
            score = POPE_rating(storage)
        elif dataset is not None and listinstr(['AMBER'], dataset):
            score = AMBER_rating(storage)
        else:
            score = default_rating(storage)

        score_tgt = eval_file.replace('.xlsx', '_score.csv')
        dump(score, score_tgt)
        return score
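

# Minimal usage sketch (the prediction file name below is hypothetical;
# `model='exact_matching'` skips the LLM judge, per the fallback logic above):
#
#   dataset = ImageYORNDataset('POPE')
#   score = dataset.evaluate('POPE_predictions.xlsx', model='exact_matching')
#   print(score)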