mirror of
https://github.com/OpenBMB/MiniCPM-V.git
synced 2026-02-05 18:29:18 +08:00
Modify eval_mm for MiniCPM-o 2.6
This commit is contained in:
@@ -12,18 +12,20 @@ class ImageYORNDataset(ImageBaseDataset):
|
||||
'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
|
||||
'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
|
||||
'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
|
||||
'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
|
||||
}
|
||||
|
||||
DATASET_MD5 = {
|
||||
'MME': 'b36b43c3f09801f5d368627fb92187c3',
|
||||
'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
|
||||
'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
|
||||
'AMBER': '970d94c0410916166e0a76ba75da7934',
|
||||
}
|
||||
|
||||
# It returns a dataframe
|
||||
def evaluate(self, eval_file, **judge_kwargs):
|
||||
from .utils.yorn import YOrN_Extraction, YOrN_auxeval
|
||||
from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating
|
||||
from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating
|
||||
|
||||
dataset = self.dataset_name
|
||||
data = load(eval_file)
|
||||
@@ -71,7 +73,10 @@ class ImageYORNDataset(ImageBaseDataset):
|
||||
dump(data, storage)
|
||||
|
||||
data = load(storage)
|
||||
data['score'] = (data['answer'] == data['extracted'])
|
||||
if listinstr(['AMBER'], dataset):
|
||||
data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
|
||||
else:
|
||||
data['score'] = (data['answer'] == data['extracted'])
|
||||
dump(data, storage)
|
||||
|
||||
if dataset is not None and listinstr(['MME'], dataset):
|
||||
@@ -80,6 +85,8 @@ class ImageYORNDataset(ImageBaseDataset):
|
||||
score = Hallusion_rating(storage)
|
||||
elif dataset is not None and listinstr(['POPE'], dataset):
|
||||
score = POPE_rating(storage)
|
||||
elif dataset is not None and listinstr(['AMBER'], dataset):
|
||||
score = AMBER_rating(storage)
|
||||
else:
|
||||
score = default_rating(storage)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user