"""Judge prompt, dimension taxonomy and score aggregation helpers."""
from ...smp import *
import numpy as np

FAIL_MSG = 'Failed to obtain answer via API.'

# NOTE(review): the one-sentence-per-line layout below (with backslash
# continuations joining the second sentence) is assumed -- confirm it matches
# the prompt text used for previously reported scores.
system_prompt = """
As an AI assistant, your task is to evaluate a candidate answer in comparison to a given correct answer.
The question itself, the correct 'groundtruth' answer, and the candidate answer will be provided to you.
Your assessment should range from 0 to 3, \
based solely on the semantic similarity between the groundtruth and the candidate answer, \
disregarding any grammatical differences.
A rating of 0 suggests no similarity, implying the candidate answer is entirely incorrect.
A rating of 1 suggests low similarity, meaning the candidate answer is largely incorrect.
A rating of 2 suggests high similarity, meaning the candidate answer is largely correct.
Lastly, a rating of 3 indicates complete similarity, which means the candidate answer is entirely correct.
Your response should be a single integer from 0, 1, 2, or 3.
"""

# Coarse dimension -> list of the fine-grained dimensions it contains.
MMV_DIMENSIONS = {
    'CP': ['Video Topic', 'Video Emotion', 'Video Scene', 'Video Style'],
    'FP-S': ['OCR', 'Object Recognition', 'Attribute Recognition', 'Event Recognition', 'Human Motion', 'Counting'],
    'FP-C': ['Spatial Relationship', 'Human-object Interaction', 'Human Interaction'],
    'HL': ['Hallucination'],
    'LR': ['Structuralized Image-Text Understanding', 'Mathematical Calculation'],
    'AR': ['Physical Property', 'Function Reasoning', 'Identity Reasoning'],
    'RR': ['Natural Relation', 'Physical Relation', 'Social Relation'],
    'CSR': ['Common Sense Reasoning'],
    'TR': ['Counterfactual Reasoning', 'Causal Reasoning', 'Future Prediction'],
}

# Flat list of every fine-grained dimension. It must be built before the
# aggregate keys are added below, otherwise the fine dimensions would be
# listed more than once.
L3_DIMS = []
for dims in MMV_DIMENSIONS.values():
    L3_DIMS.extend(dims)

# Aggregate groups: 'Perception' and 'Reasoning' each collect several coarse
# dimensions; 'Overall' is the concatenation of both.
MMV_DIMENSIONS['Perception'] = []
MMV_DIMENSIONS['Reasoning'] = []
MMV_DIMENSIONS['Overall'] = []
for k in ['CP', 'FP-C', 'FP-S', 'HL']:
    MMV_DIMENSIONS['Perception'].extend(MMV_DIMENSIONS[k])
    MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
for k in ['LR', 'AR', 'RR', 'CSR', 'TR']:
    MMV_DIMENSIONS['Reasoning'].extend(MMV_DIMENSIONS[k])
    MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])


def _mean_2f(values):
    """Return the mean of *values* formatted with two decimals.

    An empty list yields ``'nan'`` -- the same text ``np.mean([])`` would
    format to -- without triggering NumPy's empty-slice RuntimeWarning.
    """
    if not values:
        return 'nan'
    return f'{np.mean(values):.2f}'


def get_dimension_rating(data_path):
    """Aggregate per-sample scores into per-dimension mean ratings.

    Args:
        data_path: Path handed to ``load``; the result must expose a
            ``dimensions`` column (string form of a list of fine-grained
            dimension names) and a numeric ``score`` column.

    Returns:
        A dict with four sub-dicts mapping dimension name to a mean formatted
        as a two-decimal string (``'nan'`` when there is nothing to average):

        * ``coarse_all`` / ``fine_all``: negative scores are counted as 0.
        * ``coarse_valid`` / ``fine_valid``: negative scores are dropped.

        ``coarse_*`` is keyed by ``MMV_DIMENSIONS``, ``fine_*`` by ``L3_DIMS``.

    Raises:
        KeyError: If a sample lists a dimension that is not in ``L3_DIMS``.
    """
    data = load(data_path)
    coarse_rating = {k: [] for k in MMV_DIMENSIONS}
    fine_rating = {k: [] for k in L3_DIMS}

    # Walk the two columns in lockstep rather than materialising a row object
    # through ``data.iloc[i]`` for every single append.
    for cate, score in zip(data['dimensions'], data['score']):
        # WARNING: ``eval`` executes arbitrary code. This is only acceptable
        # while the file at ``data_path`` is trusted; ``ast.literal_eval``
        # would be the safe choice if the column only ever holds list literals.
        cates = eval(cate)

        for c in cates:
            fine_rating[c].append(score)

        # A sample counts once towards every coarse group that contains at
        # least one of its fine-grained dimensions.
        for d, members in MMV_DIMENSIONS.items():
            if any(x in members for x in cates):
                coarse_rating[d].append(score)

    def summarize(ratings):
        # "all" clamps negative scores to 0, "valid" leaves them out.
        # NOTE(review): negative scores presumably mark failed judgments -- confirm.
        clamped = {k: _mean_2f([max(x, 0) for x in v]) for k, v in ratings.items()}
        valid = {k: _mean_2f([x for x in v if x >= 0]) for k, v in ratings.items()}
        return clamped, valid

    coarse_all, coarse_valid = summarize(coarse_rating)
    fine_all, fine_valid = summarize(fine_rating)
    return dict(coarse_all=coarse_all, coarse_valid=coarse_valid, fine_all=fine_all, fine_valid=fine_valid)


def build_prompt(item):
    """Render the judge query for one sample.

    Args:
        item: Mapping providing ``question``, ``answer`` (the groundtruth)
            and ``prediction`` (the candidate answer).

    Returns:
        The formatted prompt string, ending with ``'Your response: '``.
    """
    tmpl = 'Question: {}\nGroundtruth answer: {}\nCandidate answer: {}\nYour response: '
    return tmpl.format(item['question'], item['answer'], item['prediction'])