Mirror of https://github.com/OpenBMB/MiniCPM-V.git, synced 2026-02-05 18:29:18 +08:00
Modify eval_mm for MiniCPM-o 2.6
eval_mm/vlmevalkit/vlmeval/dataset/utils/qspatial.py (new file, 123 additions)
@@ -0,0 +1,123 @@
from ...smp import *
from ...utils import can_infer


FAIL_MSG = 'Failed to obtain answer via API.'

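# In-context examples for the GPT-4 answer-extraction prompt: each pairs a hint and
# a free-form model response with the final extracted (value, unit) tuple.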
def get_gpt4_ICE_for_qspatial():
    example_1 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, m), (2.2, cm), (3.12, meter), at the end.\n
Model response: **Object Identification**

* The object in question is a chair.
* The chair is not visible in the image.

**Conclusion**

The height of the chair cannot be determined from the provided image.\n
Extracted answer: (0, cm)
"""

    example_2 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, inch), (1.2, cm), (3.0, feet), at the end.\n
Model response: **Step 1: Identify the stapler and the recycle bin in the image.**

The stapler is located on the wooden table, and the recycle bin is located on the floor.

**Step 2: Determine the distance between the stapler and the recycle bin.**

The stapler is 0.5 meters from the edge of the table, and the recycle bin is 1.5 meters from the edge of the table.
Therefore, the minimum distance between the stapler and the recycle bin is 1.5 - 0.5 = 1 meter.

**Answer:** 1 m\n
Extracted answer: (1, m)
"""

    example_3 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (1, foot), (2, cm), (4.3, meter), at the end.\n
Model response: The mirror in the image is approximately 5 feet 4 inches tall.\n
Extracted answer: (64, inch)
"""

    example_4 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (0.1, cm), (2.9, cm), (0.3, meter), at the end.\n
Model response: The minimum distance between the wooden chair and the chair near the camera in the image is 1.7 feet.\n
Extracted answer: (1.7, feet)
"""

    example_5 = """
Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit,
e.g., (5.1, cm), (0.9, cm), (55, mm), at the end.\n
Model response: The height of the painting's bottom edge from the floor is approximately 4.5 feet.\n
Extracted answer: (4.5, feet)
"""

    return [example_1, example_2, example_3, example_4, example_5]

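# Map a list of answer choices to option letters, e.g. ['cat', 'dog'] -> {'A': 'cat', 'B': 'dog'}.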
def list_to_dict(lst):
    return {chr(65 + i): val for i, val in enumerate(lst)}

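# Compare the extracted answer with the ground truth. With prefetch=True the raw
# prediction is inspected directly and the inferred choice/value is returned
# instead of a pass/fail boolean.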
def post_check(line, prefetch=False):
    res = None
    ans = line['answer']
    response = line['prediction'] if prefetch else line['res']
    try:
        if line['question_type'] == 'multi_choice':
            ans = line['answer_option']
            choices = list_to_dict(eval(line['choices']))
            res = can_infer(response, choices)
            if prefetch:
                return res
        else:
            if line['answer_type'] == 'integer':
                res = int(response)
                ans = int(line['answer'])
            elif line['answer_type'] == 'float':
                res = float(response)
                ans = float(line['answer'])
            else:
                res = str(response)
                ans = str(ans)
    except ValueError:
        pass

    if res == ans:
        return res if prefetch else True
    else:
        return False

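# Assemble the extraction prompt: task description, the in-context examples above,
# then the model prediction followed by an 'Extracted answer:' cue.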
def build_qspatial_gpt4_prompt(line):
    task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
    prediction = str(line['prediction'])
    prompt = task_description
    examples = get_gpt4_ICE_for_qspatial()
    for example in examples:
        prompt += example + '\n'
    prompt += 'Model response: ' + prediction
    prompt += '\nExtracted answer:'
    return prompt

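# Query the extraction model with up to 5 retries, raising the sampling temperature
# on each attempt; returns the extraction result together with a log of the attempts.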
def QSpatial_auxeval(model, line):
    prompt = build_qspatial_gpt4_prompt(line)

    log = ''
    retry = 5
    for i in range(retry):
        prediction = line['prediction']
        res = model.generate(prompt, temperature=i * 0.5)

        if FAIL_MSG in res:
            log += f'Try {i}: output is {prediction}, failed to parse.\n'
        else:
            log += 'Succeed'
            return dict(log=log, res=res)
    log += 'All 5 retries failed.\n'
    return dict(log=log, res='')