from ...smp import *


def AMBER_rating(data_file):
    """Compute per-category accuracy (in %) for the AMBER discriminative split.

    Fine-grained AMBER categories are merged into Attribute / Existence / Relation
    before averaging; an 'Avg ACC' entry averages the per-category scores.
    """
    data = load(data_file)
    stats = defaultdict(dict)
    lt = len(data)
    category_mapping = {
        'discriminative-attribute-state': 'Attribute',
        'discriminative-attribute-number': 'Attribute',
        'discriminative-attribute-action': 'Attribute',
        'discriminative-hallucination': 'Existence',
        'discriminative-relation': 'Relation',
        'relation': 'Relation'
    }

    # Group scores by merged category, then by image.
    for i in range(lt):
        item = data.iloc[i]
        category = item['category']
        image_path = item['image_path']
        score = item['score']

        new_category = category_mapping.get(category, category)

        if image_path not in stats[new_category]:
            stats[new_category][image_path] = []
        stats[new_category][image_path].append(score)

    def acc(key):
        res = stats[key]
        values = []
        for val in res.values():
            values.extend(val)
        return np.mean(values) * 100

    scores = {}
    for k in stats:
        scores[k] = acc(k)

    scores['Avg ACC'] = np.mean(list(scores.values()))
    ret = d2df(scores)
    return ret
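
# Illustrative call (hypothetical file name): the file read by `load` is expected
# to carry 'category', 'image_path' and 'score' columns, e.g.
#   df = AMBER_rating('AMBER_model_result.xlsx')
#   print(df)  # one column per merged category plus 'Avg ACC', values in percent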


def MME_rating(data_file):
    """Compute MME scores: per category, the score is acc + acc+ (max 200),
    where acc+ counts an image as correct only if both of its questions are
    answered correctly. Perception and reasoning super-scores sum their
    sub-category scores.
    """
    data = load(data_file)
    stats = defaultdict(dict)
    lt = len(data)
    for i in range(lt):
        item = data.iloc[i]
        category = item['category']
        image_path = item['image_path']
        score = item['score']
        if image_path not in stats[category]:
            stats[category][image_path] = []
        stats[category][image_path].append(score)

    def acc(key, mode='normal'):
        res = stats[key]
        values = []
        for val in res.values():
            if mode == 'normal':
                values.extend(val)
            elif mode == 'plus':
                # Both questions of an image must be correct to score the image.
                values.append(val[0] * val[1])
        return np.mean(values) * 100

    scores = {}
    for k in stats:
        scores[k] = acc(k) + acc(k, 'plus')

    # MME sub-categories grouped into perception / reasoning super-scores.
    super_cates = dict(
        perception=[
            'OCR', 'artwork', 'celebrity', 'color', 'count', 'existence',
            'landmark', 'position', 'posters', 'scene'
        ],
        reasoning=['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation']
    )

    ret = {}
    for sc, cate_list in super_cates.items():
        base = 0
        for c in cate_list:
            base += scores[c]
        ret[sc] = base
    ret.update(scores)
    ret = d2df(ret)
    return ret
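
# Toy illustration of the 'plus' term above (made-up scores): MME pairs two
# questions per image, so a per-image score list of [1, 1] contributes 1 to the
# plus accuracy while [1, 0] or [0, 1] contributes 0; a category score
# acc(k) + acc(k, 'plus') therefore ranges from 0 to 200.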


def Hallusion_rating(data_file):
    """Compute HallusionBench metrics: aAcc (per-answer accuracy), fAcc
    (figure-level: all questions on a figure correct) and qAcc (question-level:
    a question correct across its paired figures), overall and per (l2-)category.
    """
    def calc_fAcc(data):
        res = defaultdict(list)
        lt = len(data)
        for i in range(lt):
            line = data.iloc[i]
            res[f"{line['l2-category']}_{line['set_id']}_{line['figure_id']}"].append(line['score'])
        return np.mean([np.all(x) for x in res.values()]) * 100

    def calc_qAcc(data):
        res = defaultdict(list)
        lt = len(data)
        for i in range(lt):
            line = data.iloc[i]
            res[f"{line['l2-category']}_{line['set_id']}_{line['question_id']}"].append(line['score'])
        return np.mean([np.all(x) for x in res.values()]) * 100

    def calc_aAcc(data):
        return np.mean(data['score']) * 100

    data = load(data_file)
    # The sample index encodes set / figure / question ids at fixed '_'-separated positions.
    data['set_id'] = [x.split('_')[3] for x in data['index']]
    data['figure_id'] = [x.split('_')[4] for x in data['index']]
    data['question_id'] = [x.split('_')[5] for x in data['index']]

    res = dict(split=[], aAcc=[], fAcc=[], qAcc=[])
    res['split'].append('Overall')
    res['aAcc'].append(calc_aAcc(data))
    res['fAcc'].append(calc_fAcc(data))
    res['qAcc'].append(calc_qAcc(data))

    if 'category' in data:
        cates = list(set(data['category']))
        for c in cates:
            sub = data[data['category'] == c]
            res['split'].append(c)
            res['aAcc'].append(calc_aAcc(sub))
            res['fAcc'].append(calc_fAcc(sub))
            res['qAcc'].append(calc_qAcc(sub))

    if 'l2-category' in data:
        cates = list(set(data['l2-category']))
        for c in cates:
            sub = data[data['l2-category'] == c]
            res['split'].append(c)
            res['aAcc'].append(calc_aAcc(sub))
            res['fAcc'].append(calc_fAcc(sub))
            res['qAcc'].append(calc_qAcc(sub))
    ret = pd.DataFrame(res)
    return ret


def POPE_rating(data_file):
    """Compute POPE metrics (F1 as 'Overall', plus accuracy, precision and
    recall, all in %), overall and per category.
    """
    def cal_f1_score(y_true, y_pred):
        tp = sum((y_true == 1) & (y_pred == 1))
        fp = sum((y_true == 0) & (y_pred == 1))
        fn = sum((y_true == 1) & (y_pred == 0))

        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        return f1_score, precision, recall

    data = load(data_file)
    # A sample may carry several comma-separated categories; explode into one row per category.
    data = data.assign(category=data['category'].str.split(',')).explode('category')
    data['index'] = range(len(data))
    res = dict(split=[], Overall=[], acc=[], precision=[], recall=[])
    y_true = np.array([1 if i == 'Yes' else 0 for i in data['answer']])
    y_pred = np.array([1 if i == 'Yes' else 0 for i in data['extracted']])
    f1_score, precision, recall = cal_f1_score(y_true, y_pred)
    res['split'].append('Overall')
    res['Overall'].append(f1_score * 100)
    res['acc'].append(np.mean(data['score']) * 100)
    res['precision'].append(precision * 100)
    res['recall'].append(recall * 100)

    if 'category' in data:
        cates = list(set(data['category']))
        cates = [c for c in cates if not pd.isna(c)]
        for c in cates:
            sub = data[data['category'] == c]
            y_true = np.array([1 if i == 'Yes' else 0 for i in sub['answer']])
            y_pred = np.array([1 if i == 'Yes' else 0 for i in sub['extracted']])
            f1_score, precision, recall = cal_f1_score(y_true, y_pred)
            res['split'].append(c)
            res['Overall'].append(f1_score * 100)
            res['acc'].append(np.mean(sub['score']) * 100)
            res['precision'].append(precision * 100)
            res['recall'].append(recall * 100)

    ret = pd.DataFrame(res)
    return ret
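
# Worked example for cal_f1_score above (toy arrays, not benchmark data):
#   y_true = [1, 1, 0, 0], y_pred = [1, 0, 1, 0]  =>  tp = 1, fp = 1, fn = 1,
#   precision = 1 / 2, recall = 1 / 2, f1 = 2 * (0.5 * 0.5) / (0.5 + 0.5) = 0.5.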


def default_rating(data_file):
    """Mean score (in %) overall and per (l2-)category for generic Yes/No benchmarks."""
    data = load(data_file)
    res = {}
    res['Overall'] = np.mean(data['score']) * 100
    if 'category' in data:
        cates = list(set(data['category']))
        cates = [c for c in cates if not pd.isna(c)]
        cates.sort()
        for c in cates:
            sub = data[data['category'] == c]
            res[c] = np.mean(sub['score']) * 100
    if 'l2-category' in data:
        cates = list(set(data['l2-category']))
        cates = [c for c in cates if not pd.isna(c)]
        cates.sort()
        for c in cates:
            sub = data[data['l2-category'] == c]
            res[c] = np.mean(sub['score']) * 100
    ret = d2df(res)
    return ret


def YOrN_match_prompt(line):
    tmpl = (
        'You are an AI assistant who will help me to match an answer with two options of a question. '
        'The options are only Yes / No. '
        'You are provided with a question and an answer, '
        'and you need to find which option (Yes / No) is most similar to the answer. '
        'If the meaning of all options are significantly different from the answer, output Unknown. '
        'Your should output a single word among the following 3 choices: Yes, No, Unknown.\n'
        'Example 1: \n'
        "Question: Is the word in this image 'Hello'?\nAnswer: The word in this image is 'Hello'.\nYour output: Yes\n"
        'Example 2: \n'
        "Question: Is the word in this image 'Hello'?\n"
        "Answer: The word in this image is not 'Hello'.\nYour output: No\n"
        'Example 3: \n'
        'Question: {}?\nAnswer: {}\nYour output: '
    )
    return tmpl.format(line['question'], line['prediction'])


def YOrN_Extraction(output):
    """Map a free-form answer to 'Yes' / 'No', or 'Unknown' if it contains both or neither."""
    s = output.lower()
    words = process_punctuation(s).split()
    if 'yes' in words and 'no' not in words:
        return 'Yes'
    if 'yes' not in words and 'no' in words:
        return 'No'
    return 'Unknown'
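
# Expected behaviour of YOrN_Extraction, assuming process_punctuation strips
# the adjacent punctuation (illustrative strings only):
#   YOrN_Extraction("Yes, the word is 'Hello'.")  -> 'Yes'
#   YOrN_Extraction("No, it is not.")             -> 'No'
#   YOrN_Extraction("Maybe yes, maybe no.")       -> 'Unknown'  (both words present)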


def YOrN_auxeval(model, line):
    """Use the judge model to map a prediction to Yes / No, retrying with
    progressively higher temperature while the extraction stays 'Unknown'."""
    prompt = YOrN_match_prompt(line)
    retry = 5
    for i in range(retry):
        output = model.generate(prompt, temperature=0.5 * i)
        ans = YOrN_Extraction(output)
        if ans != 'Unknown':
            return ans
    return 'Unknown'