diff --git a/eval_mm/README.md b/eval_mm/README.md index 92d875d..6f924dc 100644 --- a/eval_mm/README.md +++ b/eval_mm/README.md @@ -1,7 +1,185 @@ # Evaluation +## MiniCPM-o 2.6 + +### opencompass +First, enter the `vlmevalkit` directory and install all dependencies: +```bash +cd vlmevalkit +pip install --upgrade pip +pip install -e . +wget https://download.pytorch.org/whl/cu118/torch-2.2.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=4377e0a7fe8ff8ffc4f7c9c6130c1dcd3874050ae4fc28b7ff1d35234fbca423 +wget https://download.pytorch.org/whl/cu118/torchvision-0.17.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=2e63d62e09d9b48b407d3e1b30eb8ae4e3abad6968e8d33093b60d0657542428 +wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl +pip install torch-2.2.0%2Bcu118-cp310-cp310-linux_x86_64.whl +pip install torchvision-0.17.0%2Bcu118-cp310-cp310-linux_x86_64.whl +pip install flash_attn-2.6.3+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl +``` +
+
+Then, run `scripts/run_inference.sh`, which takes two positional arguments: `MODELNAME`, the name of the model to evaluate, and `DATALIST`, the datasets to run inference on:
+```bash
+chmod +x ./scripts/run_inference.sh
+./scripts/run_inference.sh $MODELNAME $DATALIST
+```
+
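+For example, an illustrative single run of MiniCPM-o 2.6 on two of the supported benchmarks could look like the following (any dataset names from `vlmeval/utils/dataset_config.py` can be substituted):
+```bash
+./scripts/run_inference.sh MiniCPM-o-2_6 "MMMU_DEV_VAL MMStar"
+```
+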
+
+The five available choices for `MODELNAME` are listed in `vlmeval/config.py`:
+```python
+minicpm_series = {
+    'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
+    'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
+    'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
+    'MiniCPM-V-2_6': partial(MiniCPM_V_2_6, model_path='openbmb/MiniCPM-V-2_6'),
+    'MiniCPM-o-2_6': partial(MiniCPM_o_2_6, model_path='openbmb/MiniCPM-o-2_6'),
+}
+```
+
+
+All available choices for `DATALIST` are listed in `vlmeval/utils/dataset_config.py`. When evaluating multiple datasets in a single run, separate the dataset names with spaces and wrap the whole list in quotation marks:
+```bash
+DATALIST="MMMU_DEV_VAL MathVista_MINI MMVet MMBench_DEV_EN_V11 MMBench_DEV_CN_V11 MMStar HallusionBench AI2D_TEST"
+```
+
+
+When a benchmark requires a GPT-series model for scoring, please specify `OPENAI_API_BASE` and `OPENAI_API_KEY` in the `.env` file.
+To reproduce the results on the OpenCompass benchmarks together with ChartQA and MME, as displayed in the table on the homepage (the columns from OCRBench to HallusionBench), run the script with the following settings:
+```bash
+# Please note that we use different prompts for the perception and reasoning sets of MME. When evaluating on the reasoning subset, CoT is required, so you need to manually modify the judgment condition of the use_cot function in vlmeval/vlm/minicpm_v.py
+./scripts/run_inference.sh MiniCPM-o-2_6 "MMMU_DEV_VAL MathVista_MINI MMVet MMBench_TEST_EN_V11 MMBench_TEST_CN_V11 MMStar HallusionBench AI2D_TEST OCRBench ChartQA_TEST MME"
+```
+
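+A minimal `.env` for GPT-based scoring might look like the following (the values are placeholders; a full template with all supported keys is provided at `vlmevalkit/.env`):
+```bash
+OPENAI_API_KEY=your_openai_key
+OPENAI_API_BASE=https://api.openai.com/v1
+```
+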
+
+### vqadataset
+First, enter the `vqaeval` directory and install all dependencies. Then, create a `downloads` subdirectory to store the downloaded datasets for all tasks:
+```bash
+cd vqaeval
+pip install -r requirements.txt
+mkdir downloads
+```
+
+
+Download the datasets from the following links and place them in the specified directories:
+###### TextVQA
+```bash
+cd downloads
+mkdir TextVQA && cd TextVQA
+wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip
+unzip train_val_images.zip && rm train_val_images.zip
+mv train_val_images/train_images . && rm -rf train_val_images
+wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json
+cd ../..
+```
+
+###### DocVQA / DocVQATest
+
+```bash
+cd downloads
+mkdir DocVQA && cd DocVQA && mkdir spdocvqa_images
+# Download Images and Annotations from Task 1 - Single Page Document Visual Question Answering at https://rrc.cvc.uab.es/?ch=17&com=downloads
+# Move the downloaded spdocvqa_images.tar.gz and spdocvqa_qas.zip to the DocVQA directory
+tar -zxvf spdocvqa_images.tar.gz -C spdocvqa_images && rm spdocvqa_images.tar.gz
+unzip spdocvqa_qas.zip && rm spdocvqa_qas.zip
+cp spdocvqa_qas/val_v1.0_withQT.json . && cp spdocvqa_qas/test_v1.0.json . && rm -rf spdocvqa_qas
+cd ../..
+```
+
+ +The `downloads` directory should be organized according to the following structure: +```bash +downloads +├── TextVQA +│ ├── train_images +│ │ ├── ... +│ ├── TextVQA_0.5.1_val.json +├── DocVQA +│ ├── spdocvqa_images +│ │ ├── ... +│ ├── val_v1.0_withQT.json +│ ├── test_v1.0.json +``` +
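+As a quick sanity check before running inference, you can verify that the annotation files ended up in the expected locations (an optional step, not required by the scripts):
+```bash
+ls downloads/TextVQA/TextVQA_0.5.1_val.json \
+   downloads/DocVQA/val_v1.0_withQT.json \
+   downloads/DocVQA/test_v1.0.json
+```
+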
+ +Modify the parameters in `shell/run_inference.sh` and run inference: + +```bash +chmod +x ./shell/run_inference.sh +./shell/run_inference.sh +``` +
+
+All optional parameters are listed in `eval_utils/getargs.py`. The major parameters are explained below.
+For `MiniCPM-o-2_6`, set `model_name` to `minicpmo26`:
+```bash
+# paths to the images and their corresponding questions
+# TextVQA
+--textVQA_image_dir
+--textVQA_ann_path
+# DocVQA
+--docVQA_image_dir
+--docVQA_ann_path
+# DocVQATest
+--docVQATest_image_dir
+--docVQATest_ann_path
+
+# whether to evaluate on a certain task; --eval_all evaluates all tasks
+--eval_textVQA
+--eval_docVQA
+--eval_docVQATest
+--eval_all
+
+# model name and model path (the model is loaded from model_path)
+--model_name
+--model_path
+# load the model from a checkpoint
+--ckpt
+# how the model processes the input data: "interleave" means interleaved image-text form, "old" means non-interleaved
+--generate_method
+
+# batch size for inference; 1 is recommended
+--batchsize
+
+# path to save the outputs
+--answer_path
+```
+
+
+When evaluating on the different tasks, set the parameters as follows (a combined example is shown after the three snippets):
+###### TextVQA
+```bash
+--eval_textVQA
+--textVQA_image_dir ./downloads/TextVQA/train_images
+--textVQA_ann_path ./downloads/TextVQA/TextVQA_0.5.1_val.json
+```
+
+###### DocVQA
+```bash
+--eval_docVQA
+--docVQA_image_dir ./downloads/DocVQA/spdocvqa_images
+--docVQA_ann_path ./downloads/DocVQA/val_v1.0_withQT.json
+```
+
+###### DocVQATest
+```bash
+--eval_docVQATest
+--docVQATest_image_dir ./downloads/DocVQA/spdocvqa_images
+--docVQATest_ann_path ./downloads/DocVQA/test_v1.0.json
+```
+
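+Putting the pieces together, a TextVQA-only configuration for MiniCPM-o 2.6 might combine the flags as follows (illustrative only: `./answers` is an example output directory, and the actual entry command is the one defined in `shell/run_inference.sh`):
+```bash
+--model_name minicpmo26 \
+--model_path openbmb/MiniCPM-o-2_6 \
+--generate_method interleave \
+--batchsize 1 \
+--eval_textVQA \
+--textVQA_image_dir ./downloads/TextVQA/train_images \
+--textVQA_ann_path ./downloads/TextVQA/TextVQA_0.5.1_val.json \
+--answer_path ./answers
+```
+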
+
+For the DocVQATest task, to upload the inference results to the [official website](https://rrc.cvc.uab.es/?ch=17) for evaluation, run `shell/run_transform.sh` after inference to convert the output into the required format. In the script, `input_file_path` is the path to the original output json and `output_file_path` is the path for the transformed json:
+```bash
+chmod +x ./shell/run_transform.sh
+./shell/run_transform.sh
+```
+
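+For instance, the two variables inside `shell/run_transform.sh` might be set like this (the file names are purely illustrative; use the json actually produced under your `--answer_path`):
+```bash
+input_file_path=./answers/docVQATest_result.json
+output_file_path=./answers/docVQATest_submission.json
+```
+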
+
 
 ## MiniCPM-V 2.6
 
+<details>
+<summary>Expand</summary>
+
 ### opencompass
 First, enter the `vlmevalkit` directory and install all dependencies:
 ```bash
@@ -175,6 +353,9 @@ For the DocVQATest task, in order to upload the inference results to the [offici
 chmod +x ./shell/run_transform.sh
 ./shell/run_transform.sh
 ```
+
+</details>
## MiniCPM-Llama3-V-2_5 diff --git a/eval_mm/README_zh.md b/eval_mm/README_zh.md index 3eef51c..eeab8a8 100644 --- a/eval_mm/README_zh.md +++ b/eval_mm/README_zh.md @@ -1,7 +1,183 @@ # Evaluation +## MiniCPM-o 2.6 + +### opencompass +首先,进入 `vlmevalkit` 目录下,安装必要的依赖: +```bash +cd vlmevalkit +pip install --upgrade pip +pip install -e . +wget https://download.pytorch.org/whl/cu118/torch-2.2.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=4377e0a7fe8ff8ffc4f7c9c6130c1dcd3874050ae4fc28b7ff1d35234fbca423 +wget https://download.pytorch.org/whl/cu118/torchvision-0.17.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=2e63d62e09d9b48b407d3e1b30eb8ae4e3abad6968e8d33093b60d0657542428 +wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl +pip install torch-2.2.0%2Bcu118-cp310-cp310-linux_x86_64.whl +pip install torchvision-0.17.0%2Bcu118-cp310-cp310-linux_x86_64.whl +pip install flash_attn-2.6.3+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl +rm *.whl +``` +
+ +然后,运行 `scripts/run_inference.sh`,该脚本依次接收两个输入参数:`MODELNAME`, `DATALIST`。其中,`MODELNAME` 为模型名称,`DATALIST` 为目标数据集。 +```bash +chmod +x ./scripts/run_inference.sh +./scripts/run_inference.sh $MODELNAME $DATALIST +``` +
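+例如，可以按如下方式（仅作示例）在部分基准上评测 MiniCPM-o 2.6，数据集名称可替换为 `vlmeval/utils/dataset_config.py` 中的任意名称：
+```bash
+./scripts/run_inference.sh MiniCPM-o-2_6 "MMMU_DEV_VAL MMStar"
+```
+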
+
+`MODELNAME` 有五种选择，位于 `vlmeval/config.py` 中：
+```python
+minicpm_series = {
+    'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
+    'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
+    'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
+    'MiniCPM-V-2_6': partial(MiniCPM_V_2_6, model_path='openbmb/MiniCPM-V-2_6'),
+    'MiniCPM-o-2_6': partial(MiniCPM_o_2_6, model_path='openbmb/MiniCPM-o-2_6'),
+}
+```
+
+
+可选的所有 `DATALIST` 位于 `vlmeval/utils/dataset_config.py` 中。一次评测多个数据集时，将不同数据集名称以空格隔开，两端加引号：
+```bash
+DATALIST="MMMU_DEV_VAL MathVista_MINI MMVet MMBench_TEST_EN_V11 MMBench_TEST_CN_V11 MMStar HallusionBench AI2D_TEST"
+```
+
+ +当评测的 benchmark 需要 GPT 系列模型进行评分时,请在 `.env` 文件中预先指定 `OPENAI_API_BASE` 和 `OPENAI_API_KEY`。 +为了复现出首页展示的表格中 OpenCompass 对应的各项数据集以及 ChartQA 和 MME 上的结果(OCRBench 到 HallusionBench 之间的列),需要按照如下设置运行: +```bash +# 请注意,对于 MME 的 perception 和 reasoning 集,我们采取了不同的 prompt 方式。评测 reasoning 子集时,需要使用 CoT,因此需要手动到 vlmeval/vlm/minicpm_v.py 中修改 use_cot 函数的判断条件 +./scripts/run_inference.sh MiniCPM-o-2_6 "MMMU_DEV_VAL MathVista_MINI MMVet MMBench_TEST_EN_V11 MMBench_TEST_CN_V11 MMStar HallusionBench AI2D_TEST OCRBench ChartQA_TEST MME" +``` +
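+用于 GPT 评分的最简 `.env` 示例如下（其中的取值仅为占位符，完整模板见 `vlmevalkit/.env`）：
+```bash
+OPENAI_API_KEY=your_openai_key
+OPENAI_API_BASE=https://api.openai.com/v1
+```
+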
+ +### vqadataset +首先,进入 `vqaeval` 目录下,安装必要的依赖,并创建 `downloads` 子目录,用于存储下载的数据集: +```bash +cd vqaeval +pip install -r requirements.txt +mkdir downloads +``` +
+ +然后,从下列各地址下载数据集并置于指定目录下: +###### TextVQA +```bash +cd downloads +mkdir TextVQA && cd TextVQA +wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip +unzip train_val_images.zip && rm train_val_images.zip +mv train_val_images/train_images . && rm -rf train_val_images +wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json +cd ../.. +``` + +###### DocVQA / DocVQATest +```bash +cd downloads +mkdir DocVQA && cd DocVQA && mkdir spdocvqa_images +# 在 https://rrc.cvc.uab.es/?ch=17&com=downloads 下载 Task 1 - Single Page Document Visual Question Answering 下的 Images 和 Annotations +# 将下载得到的 spdocvqa_images.tar.gz 以及 spdocvqa_qas.zip 置于 DocVQA 目录下 +tar -zxvf spdocvqa_images.tar.gz -C spdocvqa_images && rm spdocvqa_images.tar.gz +unzip spdocvqa_qas.zip && rm spdocvqa_qas.zip +cp spdocvqa_qas/val_v1.0_withQT.json . && cp spdocvqa_qas/test_v1.0.json . && rm -rf spdocvqa_qas +cd ../.. +``` +
+ +`downloads` 目录应当按照下列结构组织: +```bash +downloads +├── TextVQA +│ ├── train_images +│ │ ├── ... +│ ├── TextVQA_0.5.1_val.json +├── DocVQA +│ ├── spdocvqa_images +│ │ ├── ... +│ ├── val_v1.0_withQT.json +│ ├── test_v1.0.json +``` +
+ +准备好相应的数据集之后,修改 `shell/run_inference.sh` 的参数,运行推理: + +```bash +chmod +x ./shell/run_inference.sh +./shell/run_inference.sh +``` +
+ +可以传入的参数位于 `eval_utils/getargs.py` 中,各主要参数的含义如下。 +对于 `MiniCPM-o-2_6`,需要将 `model_name`设置为 `minicpmo26`: +```bash +# 指定 TextVQA 评测所有图片和问题的路径 +--textVQA_image_dir +--textVQA_ann_path +# 指定 DocVQA 评测所有图片和问题的路径 +--docVQA_image_dir +--docVQA_ann_path +# 指定 DocVQATest 评测所有图片和问题的路径 +--docVQATest_image_dir +--docVQATest_ann_path + +# 决定是否评测某个任务,eval_all 设置为 True 表示所有任务都评测 +--eval_textVQA +--eval_docVQA +--eval_docVQATest +--eval_all + +# 模型名称、模型路径(从指定路径加载模型) +--model_name +--model_path +# 从 checkpoint 加载模型 +--ckpt +# 模型处理输入数据的方式,interleave 表示图文交错式,old 表示非交错式 +--generate_method +# 推理时的批处理规模,建议推理时设置为 1 +--batchsize + +# 输出内容保存的路径 +--answer_path +``` +
+ +评测三个任务需要设置的参数如下: +###### TextVQA +```bash +--eval_textVQA +--textVQA_image_dir ./downloads/TextVQA/train_images +--textVQA_ann_path ./downloads/TextVQA/TextVQA_0.5.1_val.json +``` + +###### DocVQA +```bash +--eval_docVQA +--docVQA_image_dir ./downloads/DocVQA/spdocvqa_images +--docVQA_ann_path ./downloads/DocVQA/val_v1.0_withQT.json +``` + +###### DocVQATest +```bash +--eval_docVQATest +--docVQATest_image_dir ./downloads/DocVQA/spdocvqa_images +--docVQATest_ann_path ./downloads/DocVQA/test_v1.0.json +``` +
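+例如，仅评测 TextVQA 时，MiniCPM-o 2.6 的参数组合大致如下（仅作示例，`./answers` 为示例输出目录，实际入口命令以 `shell/run_inference.sh` 中的定义为准）：
+```bash
+--model_name minicpmo26 \
+--model_path openbmb/MiniCPM-o-2_6 \
+--generate_method interleave \
+--batchsize 1 \
+--eval_textVQA \
+--textVQA_image_dir ./downloads/TextVQA/train_images \
+--textVQA_ann_path ./downloads/TextVQA/TextVQA_0.5.1_val.json \
+--answer_path ./answers
+```
+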
+ +对于 DocVQATest 任务,为了将推理结果上传到[官方网站](https://rrc.cvc.uab.es/?ch=17)进行评测,还需要运行 `shell/run_transform.sh` 进行格式转换。其中,`input_file_path` 对应原始输出的 json 的路径,`output_file_path` 为自定义的转换后的 json 的路径: +```bash +chmod +x ./shell/run_transform.sh +./shell/run_transform.sh +``` + +
+
 
 ## MiniCPM-V 2.6
 
+<details>
+<summary>展开</summary>
+
 ### opencompass
 首先，进入 `vlmevalkit` 目录下，安装必要的依赖：
 ```bash
@@ -173,6 +349,9 @@ chmod +x ./shell/run_inference.sh
 chmod +x ./shell/run_transform.sh
 ./shell/run_transform.sh
 ```
+
+</details>
## MiniCPM-Llama3-V-2_5 diff --git a/eval_mm/vlmevalkit/.env b/eval_mm/vlmevalkit/.env new file mode 100644 index 0000000..290491a --- /dev/null +++ b/eval_mm/vlmevalkit/.env @@ -0,0 +1,28 @@ +# .env 文件,将其放置在 $VLMEvalKit 下 +# 专有 VLMs 的 API 密钥 +# QwenVL APIs +DASHSCOPE_API_KEY= +# Gemini w. Google Cloud Backends +GOOGLE_API_KEY= +# OpenAI API +OPENAI_API_KEY= +OPENAI_API_BASE= +# StepAI API +STEPAI_API_KEY= +# REKA API +REKA_API_KEY= +# GLMV API +GLMV_API_KEY= +# CongRong API +CW_API_BASE= +CW_API_KEY= +# SenseChat-V API +SENSECHAT_AK= +SENSECHAT_SK= +# Hunyuan-Vision API +HUNYUAN_SECRET_KEY= +HUNYUAN_SECRET_ID= +# LMDeploy API +LMDEPLOY_API_BASE= +# 你可以设置一个评估时代理,评估阶段产生的 API 调用将通过这个代理进行 +EVAL_PROXY= diff --git a/eval_mm/vlmevalkit/requirements.txt b/eval_mm/vlmevalkit/requirements.txt index b45133c..0146dc3 100644 --- a/eval_mm/vlmevalkit/requirements.txt +++ b/eval_mm/vlmevalkit/requirements.txt @@ -1,18 +1,18 @@ -decord +decord; platform_machine != 'arm64' +eva-decord; platform_machine == 'arm64' gradio huggingface_hub imageio matplotlib -moviepy -numpy>=1.23.4 +numpy omegaconf -openai==1.3.5 +openai opencv-python>=4.4.0.46 openpyxl pandas -peft pillow portalocker +protobuf python-dotenv requests rich @@ -22,9 +22,9 @@ sty tabulate tiktoken timeout-decorator -torch>=2.0.1 +torch tqdm transformers -typing_extensions==4.7.1 +typing_extensions validators xlsxwriter diff --git a/eval_mm/vlmevalkit/run.py b/eval_mm/vlmevalkit/run.py index 278caf8..4142145 100644 --- a/eval_mm/vlmevalkit/run.py +++ b/eval_mm/vlmevalkit/run.py @@ -2,6 +2,7 @@ import torch import torch.distributed as dist from vlmeval.config import supported_VLM +from vlmeval.dataset.video_dataset_config import supported_video_datasets from vlmeval.dataset import build_dataset from vlmeval.inference import infer_data_job from vlmeval.inference_video import infer_data_job_video @@ -10,21 +11,127 @@ from vlmeval.smp import * from vlmeval.utils.result_transfer import MMMU_result_transfer, MMTBench_result_transfer +def build_model_from_config(cfg, model_name): + import vlmeval.api + import vlmeval.vlm + config = cp.deepcopy(cfg[model_name]) + if config == {}: + return supported_VLM[model_name]() + assert 'class' in config + cls_name = config.pop('class') + if hasattr(vlmeval.api, cls_name): + return getattr(vlmeval.api, cls_name)(**config) + elif hasattr(vlmeval.vlm, cls_name): + return getattr(vlmeval.vlm, cls_name)(**config) + else: + raise ValueError(f'Class {cls_name} is not supported in `vlmeval.api` or `vlmeval.vlm`') + + +def build_dataset_from_config(cfg, dataset_name): + import vlmeval.dataset + import inspect + config = cp.deepcopy(cfg[dataset_name]) + if config == {}: + return supported_video_datasets[dataset_name]() + assert 'class' in config + cls_name = config.pop('class') + if hasattr(vlmeval.dataset, cls_name): + cls = getattr(vlmeval.dataset, cls_name) + sig = inspect.signature(cls.__init__) + valid_params = {k: v for k, v in config.items() if k in sig.parameters} + if valid_params.get('fps', 0) > 0 and valid_params.get('nframe', 0) > 0: + raise ValueError('fps and nframe should not be set at the same time') + if valid_params.get('fps', 0) <= 0 and valid_params.get('nframe', 0) <= 0: + raise ValueError('fps and nframe should be set at least one valid value') + return cls(**valid_params) + else: + raise ValueError(f'Class {cls_name} is not supported in `vlmeval.dataset`') + + def parse_args(): - parser = argparse.ArgumentParser() - # Essential Args - parser.add_argument('--data', type=str, nargs='+', required=True) - 
parser.add_argument('--model', type=str, nargs='+', required=True) - # Args that only apply to Video Dataset - parser.add_argument('--nframe', type=int, default=8) - parser.add_argument('--pack', action='store_true') - parser.add_argument('--use-subtitle', action='store_true') + help_msg = """\ +You can launch the evaluation by setting either --data and --model or --config. + +--data and --model: + Each Arg should be a list of strings, specifying the names of datasets and models. + To find all supported model names, please refer to the `vlmeval/config.py` of check the output of the command \ + `vlmutil mlist all` in the terminal (you should first have vlmeval installed). + To find all supported dataset names, please refer to the `vlmeval/dataset/__init__.py` file. The python script \ + to print all supported dataset names is as follows: + ```python + from vlmeval.dataset import SUPPORTED_DATASETS + print(SUPPORTED_DATASETS) + ``` + or you can check the output of the command `vlmutil dlist all` in the terminal. + To find all supported video dataset default settings, please refer to the \ + `vlmeval/dataset/video_dataset_config.py` file. + +--config: + Launch the evaluation by specifying the path to the config json file. Sample Json Content: + ```json + { + "model": { + "GPT4o_20240806_T00_HIGH": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 0, + "img_detail": "high" + }, + "GPT4o_20240806_T10_Low": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 1.0, + "img_detail": "low" + }, + "GPT4o_20241120": {} + }, + "data": { + "MME-RealWorld-Lite": { + "class": "MMERealWorld", + "dataset": "MME-RealWorld-Lite" + }, + "MMBench_DEV_EN_V11": { + "class": "ImageMCQDataset", + "dataset": "MMBench_DEV_EN_V11" + }, + "MMBench_Video_8frame_nopack": {}, + "Video-MME_16frame_subs": { + "class": "VideoMME", + "dataset": "Video-MME", + "nframe": 16, + "use_subtitle": true, + } + } + } + ``` + Currently, only `model` and `data` are supported fields. The content of each field is a dictionary. + For `model`, the key is the name of the model, and the value is a dictionary containing the following keys: + - `class`: The class name of the model, which should be a class in `vlmeval.vlm` or `vlmeval.api`. + - Other keys are specific to the model, please refer to the corresponding class. + - Tip: The defined model in the `supported_VLM` of `vlmeval/config.py` can be used as a shortcut. + For `data`, the key is the name of the dataset (should be the same as the `dataset` field in most cases, \ + except for video datasets), and the value is a dictionary containing the following keys: + - `class`: The class name of the dataset, which should be a class in `vlmeval.dataset`. + - `dataset`: The name of the dataset, which should be a string that is accepted by the `dataset` argument of the \ + corresponding class. + - Other keys are specific to the dataset, please refer to the corresponding class. + - Tip: The defined dataset in the `supported_video_datasets` of `vlmeval/dataset/video_dataset_config.py` \ + can be used as a shortcut. + + The keys in the `model` and `data` fields will be used for naming the prediction files and evaluation results. + When launching with `--config`, args for API VLMs, such as `--retry`, `--verbose`, will be ignored. 
+""" + parser = argparse.ArgumentParser(description=help_msg, formatter_class=argparse.RawTextHelpFormatter) + # Essential Args, Setting the Names of Datasets and Models + parser.add_argument('--data', type=str, nargs='+', help='Names of Datasets') + parser.add_argument('--model', type=str, nargs='+', help='Names of Models') + parser.add_argument('--config', type=str, help='Path to the Config Json File') # Work Dir parser.add_argument('--work-dir', type=str, default='./outputs', help='select the output directory') # Infer + Eval or Infer Only parser.add_argument('--mode', type=str, default='all', choices=['all', 'infer']) # API Kwargs, Apply to API VLMs and Judge API LLMs - parser.add_argument('--nproc', type=int, default=4, help='Parallel API calling') + parser.add_argument('--api_nproc', type=int, default=4, help='Parallel API calling') parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs') # Explicitly Set the Judge Model parser.add_argument('--judge', type=str, default=None) @@ -33,189 +140,283 @@ def parse_args(): # Configuration for Resume # Ignore: will not rerun failed VLM inference parser.add_argument('--ignore', action='store_true', help='Ignore failed indices. ') - # Rerun: will remove all evaluation temp files - parser.add_argument('--rerun', action='store_true') + # Reuse: will reuse the existing prediction files + parser.add_argument('--reuse', action='store_true') + args = parser.parse_args() return args def main(): logger = get_logger('RUN') - + rank, world_size = get_rank_and_world_size() args = parse_args() - assert len(args.data), '--data should be a list of data files' + use_config, cfg = False, None + if args.config is not None: + assert args.data is None and args.model is None, '--data and --model should not be set when using --config' + use_config, cfg = True, load(args.config) + args.model = list(cfg['model'].keys()) + args.data = list(cfg['data'].keys()) + else: + assert len(args.data), '--data should be a list of data files' - if args.retry is not None: + if rank == 0: + if not args.reuse: + logger.warning('--reuse is not set, will not reuse previous (before one day) temporary files') + else: + logger.warning('--reuse is set, will reuse the latest prediction & temporary pickle files') + + if 'MMEVAL_ROOT' in os.environ: + args.work_dir = os.environ['MMEVAL_ROOT'] + + if not use_config: for k, v in supported_VLM.items(): - if hasattr(v, 'keywords') and 'retry' in v.keywords: + if hasattr(v, 'keywords') and 'retry' in v.keywords and args.retry is not None: v.keywords['retry'] = args.retry supported_VLM[k] = v - if hasattr(v, 'keywords') and 'verbose' in v.keywords: + if hasattr(v, 'keywords') and 'verbose' in v.keywords and args.verbose is not None: v.keywords['verbose'] = args.verbose supported_VLM[k] = v - rank, world_size = get_rank_and_world_size() if world_size > 1: local_rank = os.environ.get('LOCAL_RANK', 0) torch.cuda.set_device(int(local_rank)) - dist.init_process_group(backend='nccl', timeout=datetime.timedelta(seconds=10800)) + dist.init_process_group( + backend='nccl', + timeout=datetime.timedelta(seconds=int(os.environ.get('DIST_TIMEOUT', 3600))) + ) for _, model_name in enumerate(args.model): model = None + date, commit_id = timestr('day'), githash(digits=8) + eval_id = f"T{date}_G{commit_id}" - pred_root = osp.join(args.work_dir, model_name) - os.makedirs(pred_root, exist_ok=True) + pred_root = osp.join(args.work_dir, model_name, eval_id) + pred_root_meta = osp.join(args.work_dir, model_name) + 
os.makedirs(pred_root_meta, exist_ok=True) + + prev_pred_roots = ls(osp.join(args.work_dir, model_name), mode='dir') + if len(prev_pred_roots) and args.reuse: + prev_pred_roots.sort() + + if not osp.exists(pred_root): + os.makedirs(pred_root, exist_ok=True) + + if use_config: + model = build_model_from_config(cfg['model'], model_name) for _, dataset_name in enumerate(args.data): - dataset_kwargs = {} - if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']: - dataset_kwargs['model'] = model_name - if dataset_name == 'MMBench-Video': - dataset_kwargs['pack'] = args.pack - if dataset_name == 'Video-MME': - dataset_kwargs['use_subtitle'] = args.use_subtitle + try: + result_file_base = f'{model_name}_{dataset_name}.xlsx' - # If distributed, first build the dataset on the main process for doing preparation works - if world_size > 1: - dataset = build_dataset(dataset_name, **dataset_kwargs) if rank == 0 else None - dist.barrier() - dataset_list = [dataset] - dist.broadcast_object_list(dataset_list, src=0) - dataset = dataset_list[0] - else: - dataset = build_dataset(dataset_name, **dataset_kwargs) - if dataset is None: - logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ') + if use_config: + if world_size > 1: + if rank == 0: + dataset = build_dataset_from_config(cfg['data'], dataset_name) + dist.barrier() + dataset = build_dataset_from_config(cfg['data'], dataset_name) + if dataset is None: + logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ') + continue + else: + dataset_kwargs = {} + if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']: + dataset_kwargs['model'] = model_name + + # If distributed, first build the dataset on the main process for doing preparation works + if world_size > 1: + if rank == 0: + dataset = build_dataset(dataset_name, **dataset_kwargs) + dist.barrier() + + dataset = build_dataset(dataset_name, **dataset_kwargs) + if dataset is None: + logger.error(f'Dataset {dataset_name} is not valid, will be skipped. 
') + continue + + # Handling Multi-Turn Dataset + if dataset.TYPE == 'MT': + result_file_base = result_file_base.replace('.xlsx', '.tsv') + + result_file = osp.join(pred_root, result_file_base) + + # Reuse the previous prediction file if exists + if rank == 0 and len(prev_pred_roots): + prev_result_file = None + prev_pkl_file_list = [] + for root in prev_pred_roots[::-1]: + if osp.exists(osp.join(root, result_file_base)): + prev_result_file = osp.join(root, result_file_base) + break + elif commit_id in root and len(ls(root)) and root != pred_root: + temp_files = ls(root, match=[dataset_name, '.pkl']) + if len(temp_files): + prev_pkl_file_list.extend(temp_files) + break + if not args.reuse: + prev_result_file = None + prev_pkl_file_list = [] + if prev_result_file is not None: + logger.warning( + f'--reuse is set, will reuse the prediction file {prev_result_file}.') + if prev_result_file != result_file: + shutil.copy(prev_result_file, result_file) + elif len(prev_pkl_file_list): + for fname in prev_pkl_file_list: + target_path = osp.join(pred_root, osp.basename(fname)) + if not osp.exists(target_path): + shutil.copy(fname, target_path) + logger.info(f'--reuse is set, will reuse the prediction pickle file {fname}.') + else: + logger.warning(f'File already exists: {target_path}') + + if world_size > 1: + dist.barrier() + + if model is None: + model = model_name # which is only a name + + # Perform the Inference + if dataset.MODALITY == 'VIDEO': + model = infer_data_job_video( + model, + work_dir=pred_root, + model_name=model_name, + dataset=dataset, + result_file_name=result_file_base, + verbose=args.verbose, + api_nproc=args.api_nproc) + elif dataset.TYPE == 'MT': + model = infer_data_job_mt( + model, + work_dir=pred_root, + model_name=model_name, + dataset=dataset, + verbose=args.verbose, + api_nproc=args.api_nproc, + ignore_failed=args.ignore) + else: + model = infer_data_job( + model, + work_dir=pred_root, + model_name=model_name, + dataset=dataset, + verbose=args.verbose, + api_nproc=args.api_nproc, + ignore_failed=args.ignore) + + # Set the judge kwargs first before evaluation or dumping + + judge_kwargs = { + 'nproc': args.api_nproc, + 'verbose': args.verbose, + 'retry': args.retry if args.retry is not None else 3 + } + + if args.retry is not None: + judge_kwargs['retry'] = args.retry + if args.judge is not None: + judge_kwargs['model'] = args.judge + else: + if dataset.TYPE in ['MCQ', 'Y/N']: + judge_kwargs['model'] = 'chatgpt-0125' + elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name): + judge_kwargs['model'] = 'gpt-4-turbo' + elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench', 'WeMath', 'LogicVista'], dataset_name): # noqa: E501 + judge_kwargs['model'] = 'gpt-4o-mini' + elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name): # noqa: E501 + judge_kwargs['model'] = 'gpt-4o' + + if rank == 0: + logger.info(judge_kwargs) + + if world_size > 1: + dist.barrier() + + # Only Rank 0 handles the evaluation part + if rank == 0: + # Prepare Submission Files for MMMU_TEST AND MMT-Bench_ALL + if dataset_name in ['MMMU_TEST']: + result_json = MMMU_result_transfer(result_file) + logger.info(f'Transfer MMMU_TEST result to json for official evaluation, ' + f'json file saved in {result_json}') + continue + elif 'MMT-Bench_ALL' in dataset_name: + submission_file = MMTBench_result_transfer(result_file, **judge_kwargs) + logger.info(f'Extract options from prediction of MMT-Bench FULL split for official 
evaluation ' + f'(https://eval.ai/web/challenges/challenge-page/2328/overview), ' + f'submission file saved in {submission_file}') + continue + + # Skip the evaluation part if only infer + if args.mode == 'infer': + continue + + # Skip the evaluation part if the dataset evaluation is not supported or annotations are missing + if 'MLLMGuard_DS' in dataset_name: + logger.info('The evaluation of MLLMGuard_DS is not supported yet. ') + continue + elif 'AesBench_TEST' == dataset_name: + logger.info(f'The results are saved in {result_file}. ' + f'Please send it to the AesBench Team via huangyipo@hotmail.com.') + continue + elif dataset_name in ['DocVQA_TEST', 'InfoVQA_TEST', 'Q-Bench1_TEST', 'A-Bench_TEST']: + logger.info(f'{dataset_name} is a test split without ground-truth. ' + 'Thus only the inference part is supported for those datasets. ') + continue + elif dataset_name in [ + 'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN', + 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11' + ] and not MMBenchOfficialServer(dataset_name): + logger.error( + f'Can not evaluate {dataset_name} on non-official servers, will skip the evaluation.') + continue + + # Setup the proxy for the evaluation + eval_proxy = os.environ.get('EVAL_PROXY', None) + old_proxy = os.environ.get('HTTP_PROXY', '') + if eval_proxy is not None: + proxy_set(eval_proxy) + + # Perform the Evaluation + eval_results = dataset.evaluate(result_file, **judge_kwargs) + # Display Evaluation Results in Terminal + if eval_results is not None: + assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame) + logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! ') + logger.info('Evaluation Results:') + if isinstance(eval_results, dict): + logger.info('\n' + json.dumps(eval_results, indent=4)) + elif isinstance(eval_results, pd.DataFrame): + if len(eval_results) < len(eval_results.columns): + eval_results = eval_results.T + logger.info('\n' + tabulate(eval_results)) + + # Restore the proxy + if eval_proxy is not None: + proxy_set(old_proxy) + + # Create the symbolic links for the prediction files + files = os.listdir(pred_root) + files = [x for x in files if (f'{model_name}_{dataset_name}' in x or "status.json" in x)] + for f in files: + cwd = os.getcwd() + file_addr = osp.join(cwd, pred_root, f) + link_addr = osp.join(cwd, pred_root_meta, f) + if osp.exists(link_addr) or osp.islink(link_addr): + os.remove(link_addr) + os.symlink(file_addr, link_addr) + + except Exception as e: + logger.exception(f'Model {model_name} x Dataset {dataset_name} combination failed: {e}, ' + 'skipping this combination.') continue - result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx' - if dataset_name in ['MMBench-Video']: - packstr = 'pack' if args.pack else 'nopack' - result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx' - elif dataset.MODALITY == 'VIDEO': - if args.pack: - logger.info(f'{dataset_name} not support Pack Mode, directly change to unpack') - args.pack = False - packstr = 'pack' if args.pack else 'nopack' - result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx' - if dataset_name in ['Video-MME']: - subtitlestr = 'subs' if args.use_subtitle else 'nosubs' - result_file = result_file.replace('.xlsx', f'_{subtitlestr}.xlsx') + if world_size > 1: + dist.barrier() - if dataset.TYPE == 'MT': - result_file = result_file.replace('.xlsx', '.tsv') - - if osp.exists(result_file) and 
args.rerun: - for keyword in ['openai', 'gpt', 'auxmatch']: - os.system(f'rm {pred_root}/{model_name}_{dataset_name}_{keyword}*') - - if model is None: - model = model_name # which is only a name - - # Perform the Inference - if dataset.MODALITY == 'VIDEO': - model = infer_data_job_video( - model, - work_dir=pred_root, - model_name=model_name, - dataset=dataset, - nframe=args.nframe, - pack=args.pack, - verbose=args.verbose, - subtitle=args.use_subtitle, - api_nproc=args.nproc) - elif dataset.TYPE == 'MT': - model = infer_data_job_mt( - model, - work_dir=pred_root, - model_name=model_name, - dataset=dataset, - verbose=args.verbose, - api_nproc=args.nproc, - ignore_failed=args.ignore) - else: - model = infer_data_job( - model, - work_dir=pred_root, - model_name=model_name, - dataset=dataset, - verbose=args.verbose, - api_nproc=args.nproc, - ignore_failed=args.ignore) - - # Set the judge kwargs first before evaluation or dumping - judge_kwargs = { - 'nproc': args.nproc, - 'verbose': args.verbose, - } - if args.retry is not None: - judge_kwargs['retry'] = args.retry - if args.judge is not None: - judge_kwargs['model'] = args.judge - else: - if dataset.TYPE in ['MCQ', 'Y/N']: - judge_kwargs['model'] = 'chatgpt-0125' - elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'], dataset_name): - judge_kwargs['model'] = 'gpt-4-turbo' - elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI'], dataset_name): - judge_kwargs['model'] = 'gpt-4o' - if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']): - judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] - if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): - judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] - - if rank == 0: - if dataset_name in ['MMMU_TEST']: - result_json = MMMU_result_transfer(result_file) - logger.info(f'Transfer MMMU_TEST result to json for official evaluation, ' - f'json file saved in {result_json}') # noqa: E501 - continue - elif 'MMT-Bench_ALL' in dataset_name: - submission_file = MMTBench_result_transfer(result_file, **judge_kwargs) - logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation ' - f'(https://eval.ai/web/challenges/challenge-page/2328/overview), ' - f'submission file saved in {submission_file}') # noqa: E501 - continue - elif 'MLLMGuard_DS' in dataset_name: - logger.info('The evaluation of MLLMGuard_DS is not supported yet. ') # noqa: E501 - continue - elif 'AesBench_TEST' == dataset_name: - logger.info(f'The results are saved in {result_file}. ' - f'Please send it to the AesBench Team via huangyipo@hotmail.com.') # noqa: E501 - continue - - if dataset_name in [ - 'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN', - 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11' - ]: - if not MMBenchOfficialServer(dataset_name): - logger.error( - f'Can not evaluate {dataset_name} on non-official servers, ' - 'will skip the evaluation. ' - ) - continue - - eval_proxy = os.environ.get('EVAL_PROXY', None) - old_proxy = os.environ.get('HTTP_PROXY', '') - - if rank == 0 and args.mode == 'all': - if eval_proxy is not None: - proxy_set(eval_proxy) - - eval_results = dataset.evaluate(result_file, **judge_kwargs) - if eval_results is not None: - assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame) - logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! 
') - logger.info('Evaluation Results:') - if isinstance(eval_results, dict): - logger.info('\n' + json.dumps(eval_results, indent=4)) - elif isinstance(eval_results, pd.DataFrame): - if len(eval_results) < len(eval_results.columns): - eval_results = eval_results.T - logger.info('\n' + tabulate(eval_results)) - - if eval_proxy is not None: - proxy_set(old_proxy) + if world_size > 1: + dist.destroy_process_group() if __name__ == '__main__': diff --git a/eval_mm/vlmevalkit/scripts/run_inference.sh b/eval_mm/vlmevalkit/scripts/run_inference.sh index 3952194..f3de1ac 100644 --- a/eval_mm/vlmevalkit/scripts/run_inference.sh +++ b/eval_mm/vlmevalkit/scripts/run_inference.sh @@ -5,27 +5,37 @@ export OMP_NUM_THREADS=1 export timestamp=`date +"%Y%m%d%H%M%S"` export OLD_VERSION='False' export PYTHONPATH=$(dirname $SELF_DIR):$PYTHONPATH +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" # gpu consumed # fp16 17-18G # int4 7-8G # model to be used -# Example: MODELNAME=MiniCPM_V_2_6 +# Example: MODELNAME=MiniCPM-o-2_6 MODELNAME=$1 # datasets to be tested -# Example: DATALIST="MMMU_DEV_VAL MathVista_MINI MMVet MMBench_DEV_EN_V11 MMBench_DEV_CN_V11 MMStar HallusionBench AI2D_TEST" +# Example: DATALIST=MMMU_DEV_VAL DATALIST=$2 -# test mode, all or infer -MODE=$3 -echo "Starting inference with model $MODELNAME on datasets $DATALIST" # run on multi gpus with torchrun command # remember to run twice, the first run may fail -torchrun --nproc_per_node=8 run.py --data $DATALIST --model $MODELNAME --mode $MODE -torchrun --nproc_per_node=8 run.py --data $DATALIST --model $MODELNAME --mode $MODE -# run on single gpu with python command -# python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE -# python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE +for DATASET in $DATALIST; do + echo "Starting inference with model $MODELNAME on dataset $DATASET" + torchrun --master_port 29500 --nproc_per_node=8 run.py --data $DATASET --model $MODELNAME --mode infer --reuse + torchrun --master_port 29501 --nproc_per_node=8 run.py --data $DATASET --model $MODELNAME --mode infer --reuse -ls + # for benchmarks which require gpt for scoring, you need to specify OPENAI_API_BASE and OPENAI_API_KEY in .env file + if [[ "$DATASET" == *"MMBench_TEST"*]]; then + echo "Skipping evaluation for dataset $DATASET" + else + echo "Starting evaluation with model $MODELNAME on datasets $DATASET" + python run.py --data $DATASET --model $MODELNAME --nproc 16 --verbose + fi +done + +# run on single gpu with python command +# python run.py --data $DATALIST --model $MODELNAME --verbose --mode infer +# python run.py --data $DATALIST --model $MODELNAME --verbose --mode infer +# echo "Starting evaluation with model $MODELNAME on datasets $DATASET" +# python run.py --data $DATASET --model $MODELNAME --nproc 16 --verbose diff --git a/eval_mm/vlmevalkit/vlmeval/api/__init__.py b/eval_mm/vlmevalkit/vlmeval/api/__init__.py index b2573c5..01f2b46 100644 --- a/eval_mm/vlmevalkit/vlmeval/api/__init__.py +++ b/eval_mm/vlmevalkit/vlmeval/api/__init__.py @@ -1,5 +1,5 @@ from .gpt import OpenAIWrapper, GPT4V __all__ = [ - 'OpenAIWrapper', 'GPT4V' + 'OpenAIWrapper', 'GPT4V', ] diff --git a/eval_mm/vlmevalkit/vlmeval/api/base.py b/eval_mm/vlmevalkit/vlmeval/api/base.py index 8df5a5c..98eef51 100644 --- a/eval_mm/vlmevalkit/vlmeval/api/base.py +++ b/eval_mm/vlmevalkit/vlmeval/api/base.py @@ -3,7 +3,7 @@ import random as rd from abc import abstractmethod import os.path as osp import copy as cp -from ..smp import get_logger, parse_file, 
concat_images_vlmeval +from ..smp import get_logger, parse_file, concat_images_vlmeval, LMUDataRoot, md5, decode_base64_to_image_file class BaseAPI: @@ -143,7 +143,9 @@ class BaseAPI: while len(inputs): try: return self.generate_inner(inputs, **kwargs) - except: + except Exception as e: + if self.verbose: + self.logger.info(f'{type(e)}: {e}') inputs = inputs[1:] while len(inputs) and inputs[0]['role'] != 'user': inputs = inputs[1:] @@ -179,19 +181,38 @@ class BaseAPI: if not isinstance(log, str): try: log = log.text - except: - self.logger.warning(f'Failed to parse {log} as an http response. ') + except Exception as e: + self.logger.warning(f'Failed to parse {log} as an http response: {str(e)}. ') self.logger.info(f'RetCode: {ret_code}\nAnswer: {answer}\nLog: {log}') except Exception as err: if self.verbose: - self.logger.error(f'An error occured during try {i}:') - self.logger.error(err) + self.logger.error(f'An error occured during try {i}: ') + self.logger.error(f'{type(err)}: {err}') # delay before each retry T = rd.random() * self.wait * 2 time.sleep(T) return self.fail_msg if answer in ['', None] else answer + def preprocess_message_with_role(self, message): + system_prompt = '' + new_message = [] + + for data in message: + assert isinstance(data, dict) + role = data.pop('role', 'user') + if role == 'system': + system_prompt += data['value'] + '\n' + else: + new_message.append(data) + + if system_prompt != '': + if self.system_prompt is None: + self.system_prompt = system_prompt + else: + self.system_prompt += '\n' + system_prompt + return new_message + def generate(self, message, **kwargs1): """The main function to generate the answer. Will call `generate_inner` with the preprocessed input messages. @@ -201,6 +222,9 @@ class BaseAPI: Returns: str: The generated answer of the Failed Message if failed to obtain answer. """ + if self.check_content(message) == 'listdict': + message = self.preprocess_message_with_role(message) + assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}' message = self.preproc_content(message) assert message is not None and self.check_content(message) == 'listdict' @@ -227,13 +251,13 @@ class BaseAPI: if not isinstance(log, str): try: log = log.text - except: - self.logger.warning(f'Failed to parse {log} as an http response. ') + except Exception as e: + self.logger.warning(f'Failed to parse {log} as an http response: {str(e)}. 
') self.logger.info(f'RetCode: {ret_code}\nAnswer: {answer}\nLog: {log}') except Exception as err: if self.verbose: - self.logger.error(f'An error occured during try {i}:') - self.logger.error(err) + self.logger.error(f'An error occured during try {i}: ') + self.logger.error(f'{type(err)}: {err}') # delay before each retry T = rd.random() * self.wait * 2 time.sleep(T) diff --git a/eval_mm/vlmevalkit/vlmeval/api/gpt.py b/eval_mm/vlmevalkit/vlmeval/api/gpt.py index f308e7c..09aff35 100644 --- a/eval_mm/vlmevalkit/vlmeval/api/gpt.py +++ b/eval_mm/vlmevalkit/vlmeval/api/gpt.py @@ -38,7 +38,7 @@ class OpenAIWrapper(BaseAPI): retry: int = 5, wait: int = 5, key: str = None, - verbose: bool = True, + verbose: bool = False, system_prompt: str = None, temperature: float = 0, timeout: int = 60, @@ -56,7 +56,7 @@ class OpenAIWrapper(BaseAPI): self.temperature = temperature self.use_azure = use_azure - if 'step-1v' in model: + if 'step' in model: env_key = os.environ.get('STEPAI_API_KEY', '') if key is None: key = env_key @@ -64,6 +64,14 @@ class OpenAIWrapper(BaseAPI): env_key = os.environ.get('YI_API_KEY', '') if key is None: key = env_key + elif 'internvl2-pro' in model: + env_key = os.environ.get('InternVL2_PRO_KEY', '') + if key is None: + key = env_key + elif 'abab' in model: + env_key = os.environ.get('MiniMax_API_KEY', '') + if key is None: + key = env_key else: if use_azure: env_key = os.environ.get('AZURE_OPENAI_API_KEY', None) @@ -124,7 +132,7 @@ class OpenAIWrapper(BaseAPI): self.api_base = api_base else: self.logger.error('Unknown API Base. ') - sys.exit(-1) + raise NotImplementedError self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}') @@ -169,19 +177,22 @@ class OpenAIWrapper(BaseAPI): temperature = kwargs.pop('temperature', self.temperature) max_tokens = kwargs.pop('max_tokens', self.max_tokens) - context_window = GPT_context_window(self.model) - max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) - if 0 < max_tokens <= 100: - self.logger.warning( - 'Less than 100 tokens left, ' - 'may exceed the context window with some additional meta symbols. ' - ) - if max_tokens <= 0: - return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' + # context_window = GPT_context_window(self.model) + # new_max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) + # if 0 < new_max_tokens <= 100 and new_max_tokens < max_tokens: + # self.logger.warning( + # 'Less than 100 tokens left, ' + # 'may exceed the context window with some additional meta symbols. ' + # ) + # if new_max_tokens <= 0: + # return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. 
' + # max_tokens = new_max_tokens # Will send request if use Azure, dk how to use openai client for it if self.use_azure: headers = {'Content-Type': 'application/json', 'api-key': self.key} + elif 'internvl2-pro' in self.model: + headers = {'Content-Type': 'application/json', 'Authorization': self.key} else: headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'} payload = dict( @@ -200,8 +211,11 @@ class OpenAIWrapper(BaseAPI): try: resp_struct = json.loads(response.text) answer = resp_struct['choices'][0]['message']['content'].strip() - except: - pass + except Exception as err: + if self.verbose: + self.logger.error(f'{type(err)}: {err}') + self.logger.error(response.text if hasattr(response, 'text') else response) + return ret_code, answer, response def get_image_token_len(self, img_path, detail='low'): @@ -228,8 +242,13 @@ class OpenAIWrapper(BaseAPI): import tiktoken try: enc = tiktoken.encoding_for_model(self.model) - except: - enc = tiktoken.encoding_for_model('gpt-4') + except Exception as err: + if 'gpt' in self.model.lower(): + if self.verbose: + self.logger.warning(f'{type(err)}: {err}') + enc = tiktoken.encoding_for_model('gpt-4') + else: + return 0 assert isinstance(inputs, list) tot = 0 for item in inputs: diff --git a/eval_mm/vlmevalkit/vlmeval/config.py b/eval_mm/vlmevalkit/vlmeval/config.py index be42e9a..39072a5 100644 --- a/eval_mm/vlmevalkit/vlmeval/config.py +++ b/eval_mm/vlmevalkit/vlmeval/config.py @@ -7,6 +7,7 @@ minicpm_series = { 'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'), 'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'), 'MiniCPM-V-2_6': partial(MiniCPM_V_2_6, model_path='openbmb/MiniCPM-V-2_6'), + 'MiniCPM-o-2_6': partial(MiniCPM_o_2_6, model_path='openbmb/MiniCPM-o-2_6'), } supported_VLM = {} diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/__init__.py b/eval_mm/vlmevalkit/vlmeval/dataset/__init__.py index 01e7f7f..2f00715 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/__init__.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/__init__.py @@ -3,22 +3,42 @@ import warnings from .image_base import img_root_map, ImageBaseDataset from .image_caption import ImageCaptionDataset from .image_yorn import ImageYORNDataset -from .image_mcq import ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset +from .image_mcq import ( + ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, + NaturalBenchDataset +) from .image_mt import MMDUDataset from .image_vqa import ( - ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, CustomVQADataset + ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench, + CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH, WeMath, LogicVista ) +from .image_ccocr import CCOCRDataset +from .text_mcq import CustomTextMCQDataset, TextMCQDataset + from .vcr import VCRDataset from .mmlongbench import MMLongBench from .dude import DUDE from .slidevqa import SlideVQA +from .vl_rewardbench import VLRewardBench from .mmbench_video import MMBenchVideo -from .text_mcq import CustomTextMCQDataset, TextMCQDataset from .videomme import VideoMME from .mvbench import MVBench, MVBench_MP4 +from .mlvu import MLVU, MLVU_MCQ, MLVU_OpenEnded +from .tempcompass import TempCompass, TempCompass_Captioning, TempCompass_MCQ, TempCompass_YorN +from .longvideobench import LongVideoBench +from .video_concat_dataset 
import ConcatVideoDataset +from .mmgenbench import MMGenBench +from .cgbench import CGBench_MCQ_Grounding_Mini, CGBench_OpenEnded_Mini, CGBench_MCQ_Grounding, CGBench_OpenEnded + +from .miabench import MIABench +from .cmmmu import CMMMU +from .wildvision import WildVision +from .mmmath import MMMath +from .dynamath import Dynamath from .utils import * +from .video_dataset_config import * from ..smp import * @@ -110,12 +130,18 @@ class ConcatDataset(ImageBaseDataset): # Add new supported dataset class here IMAGE_DATASET = [ ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision, - MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, - MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, GMAIMMBenchDataset + MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench, + MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, CCOCRDataset, + GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset, + MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH, + CMMMU, VLRewardBench, WeMath, LogicVista ] VIDEO_DATASET = [ - MMBenchVideo, VideoMME, MVBench, MVBench_MP4 + MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench, + MLVU, MLVU_MCQ, MLVU_OpenEnded, + TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN, + CGBench_MCQ_Grounding_Mini, CGBench_OpenEnded_Mini, CGBench_MCQ_Grounding, CGBench_OpenEnded ] TEXT_DATASET = [ @@ -126,7 +152,7 @@ CUSTOM_DATASET = [ CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset ] -DATASET_COLLECTION = [ConcatDataset] +DATASET_COLLECTION = [ConcatDataset, ConcatVideoDataset] DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION SUPPORTED_DATASETS = [] @@ -134,7 +160,7 @@ for DATASET_CLS in DATASET_CLASSES: SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets()) -def DATASET_TYPE(dataset): +def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str: for cls in DATASET_CLASSES: if dataset in cls.supported_datasets(): if hasattr(cls, 'TYPE'): @@ -148,13 +174,38 @@ def DATASET_TYPE(dataset): if 'openended' in dataset.lower(): return 'VQA' - warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as MCQ. ') - return 'MCQ' + warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ') + return default + + +def DATASET_MODALITY(dataset, *, default: str = 'IMAGE') -> str: + if dataset is None: + warnings.warn(f'Dataset is not specified, will treat modality as {default}. ') + return default + for cls in DATASET_CLASSES: + if dataset in cls.supported_datasets(): + if hasattr(cls, 'MODALITY'): + return cls.MODALITY + # Have to add specific routine to handle ConcatDataset + if dataset in ConcatDataset.DATASET_SETS: + dataset_list = ConcatDataset.DATASET_SETS[dataset] + MODALITIES = [DATASET_MODALITY(dname) for dname in dataset_list] + assert np.all([x == MODALITIES[0] for x in MODALITIES]), (dataset_list, MODALITIES) + return MODALITIES[0] + + if 'VIDEO' in dataset.lower(): + return 'VIDEO' + elif 'IMAGE' in dataset.lower(): + return 'IMAGE' + warnings.warn(f'Dataset {dataset} is a custom one, will treat modality as {default}. 
') + return default def build_dataset(dataset_name, **kwargs): for cls in DATASET_CLASSES: - if dataset_name in cls.supported_datasets(): + if dataset_name in supported_video_datasets: + return supported_video_datasets[dataset_name](**kwargs) + elif dataset_name in cls.supported_datasets(): return cls(dataset=dataset_name, **kwargs) warnings.warn(f'Dataset {dataset_name} is not officially supported. ') diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/cgbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/cgbench.py new file mode 100644 index 0000000..172cdbb --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/cgbench.py @@ -0,0 +1,1760 @@ +from huggingface_hub import snapshot_download +from ..smp import * +from .video_base import VideoBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from .utils.cgbench import * +from ..utils import track_progress_rich + + +class CGBench_MCQ_Grounding_Mini(VideoBaseDataset): + + dataset = "CG-Bench_MCQ_Grounding_Mini" + + TYPE = "Video-MCQ-Grounding" + + MD5 = "54ed3e90a51a6fb375c92b319a715f72" + + SYS = { + "long_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. " + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + 'The "option" is the uppercase letter corresponding to your answer.\n\n' + ), + "clue_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. " + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + "The 'option' is the uppercase letter corresponding to your answer.\n\n" + ), + "miou": ( + "You will be provided with uniformly sampled frames from a video and their " + "timestamps, along with a multiple-choice question that includes a question " + "and several answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + "In this output format, each 'start' and 'end' represents the beginning and " + "end of an interval in seconds where relevant clues can be found.\n" + "You must provide at least one interval and at most five intervals. 
" + "Intervals exceeding five will NOT be considered valid.\n" + ), + "miou_wo_frame_time": ( + "You will be provided with uniformly sampled frames from a video, along " + "with a multiple-choice question that includes a question and several " + "answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + 'In this output format, each "start" and "end" represents the start and ' + "end of the video where the relevant clue can be found in the form of a " + "floating point number between 0 and 1, where 0 represents the start time " + "of the video and 1 represents the end time of the video.\n" + "You must provide at least one interval and at most five intervals. " + "Intervals exceeding five will NOT be considered valid.\n" + ), + } + + def __init__( + self, + dataset="CG-Bench_MCQ_Grounding_Mini", + use_subtitle=False, + use_subtitle_time=False, + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.use_subtitle_time = use_subtitle_time + self.use_frame_time = use_frame_time + self.dataset_name = dataset + lmu_root = LMUDataRoot() + self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset) + + @classmethod + def supported_datasets(cls): + return ["CG-Bench_MCQ_Grounding_Mini"] + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)] + + def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False): + + subtitles = [] + + srt_path = osp.join(self.data_root, subtitle_path) + assert osp.exists(srt_path) + import pysubs2 + + subs = pysubs2.load(srt_path, encoding="utf-8") + if not frame_indices: + for sub in subs: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding_Mini", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + 
return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + task_modes = ["long_acc", "clue_acc", "miou"] + all_data = [] + for task_mode in task_modes: + with open(osp.join(pth, "cgbench_mini.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: ( + f"cg_subtitles/{x}.srt" + if osp.exists(osp.join(dataset_path, f"cg_subtitles/{x}.srt")) + else "" + ) + ) + + data_file["clue_video_path"] = "" + + if task_mode in ["clue_acc"]: + data_file["clue_video_path"] = data_file["clue_video_path"] = data_file.apply( + lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1 + ) + + data_file["task_mode"] = task_mode + + if task_mode in ["clue_acc", "long_acc"]: + data_file["answer"] = data_file["right_answer"] + + if task_mode == "miou": + data_file["answer"] = data_file["clue_intervals"] + + if task_mode in ["long_acc", "miou"]: + data_file["clue_intervals"] = "" + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "choices", + "sub_category", + "subtitle_path", + "question", + "answer", + "task_mode", + "clue_intervals", + "qid", + "clue_video_path", + ] + ] + + all_data.append(data_file) + + final_data = pd.concat(all_data, ignore_index=True) + final_data["index"] = range(len(final_data)) + final_data.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + task_mode = line["task_mode"] + + message = [] + + origin_use_subtitle_time = self.use_subtitle_time + + try: + if task_mode in ["long_acc", "clue_acc"]: + system_prompt = self.SYS[task_mode] + elif task_mode == "miou": + if self.use_frame_time and not video_llm: + system_prompt = self.SYS[task_mode] + else: + system_prompt = self.SYS["miou_wo_frame_time"] + if self.use_subtitle_time is True: + self.use_subtitle_time = False + + user_prompt = "" + + if task_mode in ["long_acc", "miou"]: + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( 
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + elif task_mode == "clue_acc": + clue_video_path = line["clue_video_path"] + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path))) + print(message) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + if self.nframe > 32: + self.nframe = 32 + print("The maximum number of frames is 32 when evaluating clue-based mcq in CG-Bench !") + + clue_intervals = eval(line["clue_intervals"]) + + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps + ) + + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + choices = eval(line["choices"]) + labels = [chr(ord("A") + i) for i in range(len(choices))] + user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n" + + message.append(dict(type="text", value=system_prompt + user_prompt)) + + return message + + finally: + # Ensure that `use_subtitle_time` is always restored to its original value + self.use_subtitle_time = origin_use_subtitle_time + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / 
(num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + # Save and validate frames + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + + tgt_file = eval_file.replace(".xlsx", "_rating.json") + score_file = eval_file.replace(".xlsx", "_score.xlsx") + + data = load(eval_file) + + data_un = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["score"] = -1 + + data_un["score"] = data_un.apply( + lambda row: post_process( + response=row["prediction"], + right_answer=row["answer"], + task_mode=row["task_mode"], + duration=row["duration"], + ), + axis=1, + ) + + data = pd.concat([data_pred_na, data_un]) + + rejected_count = (data["score"] == -1).sum() + + print( + f"Among {len(data)} questions, " + f"failed to obtain prediction for {len(data_pred_na)} questions, " + f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. " + f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating." 
+ ) + + dump(data, score_file) + + rating = get_dimention_rating_mcq_grouding(score_file) + + dump(rating, tgt_file) + + return rating + + +# 评估时,step_2 评估时,给出 [prompt] + image_paths 就行 +class CGBench_OpenEnded_Mini(VideoBaseDataset): + + TYPE = "Video-OpenEnded" + + dataset = "CG-Bench_OpenEnded_Mini" + + MD5 = "9175791b11afdfa305fdb3e525b7a4ee" + + SYS = ( + "You will be provided with sampled frames from a video, along with a " + "question.\n" + "Your task is to analyze the provided frames and infer the most plausible " + "answer based on the visual information.\n" + "If the visual information is ambiguous or insufficient, use the available " + "context to reason your answer.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "answer"}\n```\n\n' + 'The "answer" can be a word, phrase, or sentence that directly responds to ' + "the question.\n\n" + ) + + def __init__( + self, + dataset="CG-Bench_OpenEnded_Mini", + use_subtitle=False, + use_subtitle_time=False, + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.use_subtitle_time = use_subtitle_time + self.use_frame_time = use_frame_time + self.dataset_name = dataset + lmu_root = LMUDataRoot() + self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset) + + @classmethod + def supported_datasets(cls): + return ["CG-Bench_OpenEnded_Mini"] + + def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False): + + subtitles = [] + + srt_path = osp.join(self.data_root, subtitle_path) + assert osp.exists(srt_path) + import pysubs2 + + subs = pysubs2.load(srt_path, encoding="utf-8") + if not frame_indices: + for sub in subs: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded_Mini", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + with open(osp.join(pth, "cgbench_mini.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: 
f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else "" + ) + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "sub_category", + "subtitle_path", + "question", + "answer", + "clue_intervals", + "qid", + ] + ] + + data_file.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + message = [] + + sys_prompt = self.SYS + + user_prompt = "" + + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + message.append(dict(type="text", value=sys_prompt + user_prompt)) + + return message + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) 
+ else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + from .utils.cgbench import get_dimention_rating_open_ended, post_process_open + + assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + + tgt_file = eval_file.replace(".xlsx", "_rating.json") + score_file = eval_file.replace(".xlsx", "_score.xlsx") + step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") + step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + + data = load(eval_file) + + data_pred_no_na = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["model_result"] = -1 + data_pred_na["step_1_result"] = -1 + data_pred_na["step_2_result"] = -1 + data_pred_na["score"] = -1 + + data_pred_no_na["model_result"] = data_pred_no_na.apply( + lambda row: post_process_open( + response=row["prediction"], + ), + axis=1, + ) + + data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1] + data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1] + + if judge_kwargs.get("model", None) != "gpt-4o-0806": + judge_kwargs["model"] = "gpt-4o-0806" + print("The judge model in cg-bench is gpt-4o-0806!") + + model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs) + nproc = judge_kwargs.pop("nproc", 32) + + lines_step_1 = data_step_1.to_dict("records") + tups_step_1 = [(model_step_1, line) for line in lines_step_1] + + keys_step_1 = {line["qid"] for line in lines_step_1} + + ans = {} + if osp.exists(step_1_tmp_file): + ans = load(step_1_tmp_file) + tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans] + keys_step_1 = [i for i in keys_step_1 if i not in ans] + + _ = track_progress_rich( + eval_open_first, + tups_step_1, + nproc=nproc, + keys=keys_step_1, + save=step_1_tmp_file, + ) + + step_1_results = load(step_1_tmp_file) + data_step_1 = save_step_1_steps(data_step_1, step_1_results) # -1, 0, 1, 2 + + data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1] + data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])] + data_step_2 = data_step_1[data_step_1["step_1_result"] == 2] + + print(judge_kwargs) + + model_step_2 = 
build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs) + + lines_step_2 = data_step_2.to_dict("records") + + tups_step_2 = [] + + for line in tqdm(lines_step_2): + clue_intervals = eval(line["clue_intervals"]) + lmu_root = LMUDataRoot() + clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset) + data_root = self.data_root + frame_paths, _, _ = save_clue_video_frames( + data_root, + clue_frame_root, + video=line["video"], + uid=line["qid"], + clue_intervals=clue_intervals, + num_frames=32, + ) + tups_step_2.append((model_step_2, line, frame_paths)) + + keys_step_2 = {line["qid"] for line in lines_step_2} + + ans = {} + if osp.exists(step_2_tmp_file): + ans = load(step_2_tmp_file) + tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans] + keys_step_2 = [i for i in keys_step_2 if i not in ans] + + _ = track_progress_rich( + eval_open_second, + tups_step_2, + nproc=nproc, + keys=keys_step_2, + save=step_2_tmp_file, + ) + + step_2_results = load(step_2_tmp_file) + data_step_2 = save_step_2_steps(data_step_2, step_2_results) + + data_no_step_2_results = data_step_2[data_step_2["score"] == -1] + data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])] + + data = pd.concat( + [ + data_pred_na, + data_no_model_result, + data_no_step_1_results, + data_step_1_over, + data_no_step_2_results, + data_step_2_over, + ] + ) + + dump(data, score_file) + + rating = get_dimention_rating_open_ended(score_file) + + dump(rating, tgt_file) + + return rating + + +class CGBench_MCQ_Grounding(VideoBaseDataset): + + TYPE = "Video-MCQ-Grounding" + + MD5 = "eaead3d978a689269fefce4ae29c86df" + + SYS = { + "long_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. " + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + 'The "option" is the uppercase letter corresponding to your answer.\n\n' + ), + "clue_acc": ( + "You will be provided with sampled frames from a video, along with a " + "multiple-choice question that includes a question and several answer options.\n" + "Your task is to analyze the provided frames, infer the most plausible " + "answer based on the visual information.\n" + "If the video does not provide enough information, infer the answer based " + "on the options available and still provide a result. 
" + "Therefore, In all cases, an answer must be given.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "option"}\n```\n\n' + "The 'option' is the uppercase letter corresponding to your answer.\n\n" + ), + "miou": ( + "You will be provided with uniformly sampled frames from a video and their " + "timestamps, along with a multiple-choice question that includes a question " + "and several answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + "In this output format, each 'start' and 'end' represents the beginning and " + "end of an interval in seconds where relevant clues can be found.\n" + "You must provide at least one interval and at most five intervals. " + "Intervals exceeding five will NOT be considered valid.\n" + ), + "miou_wo_frame_time": ( + "You will be provided with uniformly sampled frames from a video, along " + "with a multiple-choice question that includes a question and several " + "answer options.\n" + "Your task is to determine in which intervals the 'clue intervals' exist " + "that contain visual information needed to answer the question.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n' + 'In this output format, each "start" and "end" represents the start and ' + "end of the video where the relevant clue can be found in the form of a " + "floating point number between 0 and 1, where 0 represents the start time " + "of the video and 1 represents the end time of the video.\n" + "You must provide at least one interval and at most five intervals. 
" + "Intervals exceeding five will NOT be considered valid.\n" + ), + } + + def __init__( + self, + dataset="CG-Bench_MCQ_Grounding", + use_subtitle=False, + use_subtitle_time=False, + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.use_subtitle_time = use_subtitle_time + self.use_frame_time = use_frame_time + self.dataset_name = dataset + lmu_root = LMUDataRoot() + self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset) + + @classmethod + def supported_datasets(cls): + return ["CG-Bench_MCQ_Grounding"] + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)] + + def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False): + + subtitles = [] + + srt_path = osp.join(self.data_root, subtitle_path) + assert osp.exists(srt_path) + import pysubs2 + + subs = pysubs2.load(srt_path, encoding="utf-8") + if not frame_indices: + for sub in subs: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + for clue_video_pth in data["clue_video_path"]: + if clue_video_pth and not (isinstance(clue_video_pth, float) and np.isnan(clue_video_pth)): + if not osp.exists(osp.join(pth, clue_video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + task_modes = ["long_acc", "clue_acc", "miou"] + all_data = [] + for task_mode in task_modes: + with open(osp.join(pth, "cgbench.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: 
f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: ( + f"cg_subtitles/{x}.srt" + if osp.exists(osp.join(dataset_path, f"cg_subtitles/{x}.srt")) + else "" + ) + ) + + data_file["clue_video_path"] = "" + + if task_mode in ["clue_acc"]: + data_file["clue_video_path"] = data_file["clue_video_path"] = data_file.apply( + lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1 + ) + + data_file["task_mode"] = task_mode + + if task_mode in ["clue_acc", "long_acc"]: + data_file["answer"] = data_file["right_answer"] + + if task_mode == "miou": + data_file["answer"] = data_file["clue_intervals"] + + if task_mode in ["long_acc", "miou"]: + data_file["clue_intervals"] = "" + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "choices", + "sub_category", + "subtitle_path", + "question", + "answer", + "task_mode", + "clue_intervals", + "qid", + "clue_video_path", + ] + ] + + all_data.append(data_file) + + final_data = pd.concat(all_data, ignore_index=True) + final_data["index"] = range(len(final_data)) + final_data.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + task_mode = line["task_mode"] + + message = [] + + origin_use_subtitle_time = self.use_subtitle_time + + try: + if task_mode in ["long_acc", "clue_acc"]: + system_prompt = self.SYS[task_mode] + elif task_mode == "miou": + if self.use_frame_time and not video_llm: + system_prompt = self.SYS[task_mode] + else: + system_prompt = self.SYS["miou_wo_frame_time"] + if self.use_subtitle_time is True: + self.use_subtitle_time = False + + user_prompt = "" + + if task_mode in ["long_acc", "miou"]: + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + elif task_mode == "clue_acc": + clue_video_path = line["clue_video_path"] + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path))) + 
print(message) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + if self.nframe > 32: + self.nframe = 32 + print("The maximum number of frames is 32 when evaluating clue-based mcq in CG-Bench !") + + clue_intervals = eval(line["clue_intervals"]) + + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps + ) + + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + choices = eval(line["choices"]) + labels = [chr(ord("A") + i) for i in range(len(choices))] + user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n" + + message.append(dict(type="text", value=system_prompt + user_prompt)) + + return message + + finally: + # Ensure that `use_subtitle_time` is always restored to its original value + self.use_subtitle_time = origin_use_subtitle_time + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + # Save and validate frames + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: 
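+ # verify() checks file integrity without decoding the whole image; unreadable cached frames are skipped via the except below.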
+ img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + + tgt_file = eval_file.replace(".xlsx", "_rating.json") + score_file = eval_file.replace(".xlsx", "_score.xlsx") + + data = load(eval_file) + + data_un = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["score"] = -1 + + data_un["score"] = data_un.apply( + lambda row: post_process( + response=row["prediction"], + right_answer=row["answer"], + task_mode=row["task_mode"], + duration=row["duration"], + ), + axis=1, + ) + + data = pd.concat([data_pred_na, data_un]) + + rejected_count = (data["score"] == -1).sum() + + print( + f"Among {len(data)} questions, " + f"failed to obtain prediction for {len(data_pred_na)} questions, " + f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. " + f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating." + ) + + dump(data, score_file) + + rating = get_dimention_rating_mcq_grouding(score_file) + + dump(rating, tgt_file) + + return rating + + +# 评估时,step_2 评估时,给出 [prompt] + image_paths 就行 +class CGBench_OpenEnded(VideoBaseDataset): + + TYPE = "Video-OpenEnded" + + dataset = "CG-Bench_OpenEnded" + + MD5 = "796035eda0b1e916c517cdc1bc145cfc" + + SYS = ( + "You will be provided with sampled frames from a video, along with a " + "question.\n" + "Your task is to analyze the provided frames and infer the most plausible " + "answer based on the visual information.\n" + "If the visual information is ambiguous or insufficient, use the available " + "context to reason your answer.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": "answer"}\n```\n\n' + 'The "answer" can be a word, phrase, or sentence that directly responds to ' + "the question.\n\n" + ) + + def __init__( + self, + dataset="CG-Bench_OpenEnded", + use_subtitle=False, + use_subtitle_time=False, + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.use_subtitle_time = use_subtitle_time + self.use_frame_time = use_frame_time + self.dataset_name = dataset + lmu_root = LMUDataRoot() + self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset) + + @classmethod + def supported_datasets(cls): + return ["CG-Bench_OpenEnded"] + + def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False): + + subtitles = [] + + srt_path = osp.join(self.data_root, subtitle_path) + assert osp.exists(srt_path) + import pysubs2 + + subs = pysubs2.load(srt_path, encoding="utf-8") + if not frame_indices: + for sub in subs: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + 
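+ # When frame_indices are given, keep only subtitles whose time span covers one of the sampled frame timestamps.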
else: + for selected_frame_id in frame_indices: + cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id) + for sub in subs: + if sub.start < cur_time and sub.end > cur_time: + sub_text = sub.text.replace("\\N", " ") + if sub_time: + start_time = milliseconds_to_seconds(sub.start) + end_time = milliseconds_to_seconds(sub.end) + sub_text = f"[{start_time}, {end_time}] {sub_text}" + if sub_text.strip() and sub_text not in subtitles: + subtitles.append(sub_text) + + if subtitles: + subtitles_str = '\n'.join(subtitles) + return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n" + else: + return "" + + def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded", repo_id="CG-Bench/CG-Bench"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + with open(osp.join(pth, "cgbench.json"), "r") as f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4") + data_file["subtitle_path"] = data_file["video_uid"].apply( + lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else "" + ) + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "duration", + "domain", + "sub_category", + "subtitle_path", + "question", + "answer", + "clue_intervals", + "qid", + ] + ] + + data_file.to_csv(tsv_file, sep="\t", index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + message = [] + + sys_prompt = self.SYS + + user_prompt = "" + + video_path = line["video"] + + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + if self.nframe: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices, + fps=vid_fps, sub_time=self.use_subtitle_time) + else: + user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]): + user_prompt += 
self.get_subtitles( + line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps, + sub_time=self.use_subtitle_time + ) + + question = line["question"] + user_prompt += f"Question: {question}\n\n" + + message.append(dict(type="text", value=sys_prompt + user_prompt)) + + return message + + def clue_frame_paths(self, qid, num_frames=8): + frame_root = osp.join(self.clue_frame_root, qid) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if clue_intervals is not None: + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = self.clue_frame_paths(uid, len(indices)) + + elif fps > 0: + frame_indices = [] + for start, end in merged_intervals: + start_frame = int(start * vid_fps) + end_frame = int(end * vid_fps) + step = vid_fps / fps + interval_indices = [ + int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step)) + ] + frame_indices.extend(interval_indices) + + if len(frame_indices) < 32: + indices = sample_frames_clue_average(merged_intervals, 32, vid_fps) + else: + indices = frame_indices + frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps) + + else: + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + valid_paths = [] + valid_indices = [] + + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + from .utils.cgbench import get_dimention_rating_open_ended, post_process_open + + assert eval_file.endswith(".xlsx"), "data file should be an xlsx file" + + tgt_file = eval_file.replace(".xlsx", "_rating.json") + score_file = eval_file.replace(".xlsx", "_score.xlsx") + step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl") + step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl") + + data = load(eval_file) + + data_pred_no_na = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["model_result"] = -1 + data_pred_na["step_1_result"] = -1 + data_pred_na["step_2_result"] = -1 + data_pred_na["score"] = -1 
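+ # Rows without a prediction keep the -1 sentinels set above; rows with predictions are parsed here and then routed through the two-step GPT judge below.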
+ + data_pred_no_na["model_result"] = data_pred_no_na.apply( + lambda row: post_process_open( + response=row["prediction"], + ), + axis=1, + ) + + if judge_kwargs.get("model", None) != "gpt-4o-0806": + judge_kwargs["model"] = "gpt-4o-0806" + print("The judge model in cg-bench is gpt-4o-0806!") + + data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1] + data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1] + + model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs) + nproc = judge_kwargs.pop('nproc', 32) + + lines_step_1 = data_step_1.to_dict("records") + tups_step_1 = [(model_step_1, line) for line in lines_step_1] + + keys_step_1 = {line["qid"] for line in lines_step_1} + + ans = {} + if osp.exists(step_1_tmp_file): + ans = load(step_1_tmp_file) + tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans] + keys_step_1 = [i for i in keys_step_1 if i not in ans] + + _ = track_progress_rich( + eval_open_first, + tups_step_1, + nproc=nproc, + keys=keys_step_1, + save=step_1_tmp_file, + ) + + step_1_results = load(step_1_tmp_file) + data_step_1 = save_step_1_steps(data_step_1, step_1_results) # -1, 0, 1, 2 + + data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1] + data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])] + data_step_2 = data_step_1[data_step_1["step_1_result"] == 2] + + model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs) + + lines_step_2 = data_step_2.to_dict("records") + + tups_step_2 = [] + + for line in tqdm(lines_step_2): + clue_intervals = eval(line["clue_intervals"]) + lmu_root = LMUDataRoot() + clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset) + data_root = self.data_root + frame_paths, _, _ = save_clue_video_frames( + data_root, + clue_frame_root, + video=line["video"], + uid=line["qid"], + clue_intervals=clue_intervals, + num_frames=32, + ) + tups_step_2.append((model_step_2, line, frame_paths)) + + keys_step_2 = {line["qid"] for line in lines_step_2} + + ans = {} + if osp.exists(step_2_tmp_file): + ans = load(step_2_tmp_file) + tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans] + keys_step_2 = [i for i in keys_step_2 if i not in ans] + + _ = track_progress_rich( + eval_open_second, + tups_step_2, + nproc=nproc, + keys=keys_step_2, + save=step_2_tmp_file, + ) + + step_2_results = load(step_2_tmp_file) + data_step_2 = save_step_2_steps(data_step_2, step_2_results) + + data_no_step_2_results = data_step_2[data_step_2["score"] == -1] + data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])] + + data = pd.concat( + [ + data_pred_na, + data_no_model_result, + data_no_step_1_results, + data_step_1_over, + data_no_step_2_results, + data_step_2_over, + ] + ) + + dump(data, score_file) + + rating = get_dimention_rating_open_ended(score_file) + + dump(rating, tgt_file) + + return rating diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/cmmmu.py b/eval_mm/vlmevalkit/vlmeval/dataset/cmmmu.py new file mode 100644 index 0000000..12c583f --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/cmmmu.py @@ -0,0 +1,354 @@ +from .image_base import ImageBaseDataset +import random +from collections import Counter +import os +import re +import tempfile +from ..smp import * + + +def get_multi_choice_prediction(response, all_choices, index2ans): + for char in [',', '.', '!', '?', ';', ':', "'"]: + response = response.strip(char) + response = " " + response + " " # add space to avoid partial 
match + + candidates = [] + + for choice in all_choices: # (A) (B) (C) (D) + # Add the choice to candidates each time it appears in the response + candidates.extend([choice for _ in range(response.count(f'({choice})'))]) + + if len(candidates) == 0: + for choice in all_choices: # A B C D + # Similarly, add the choice for each occurrence + candidates.extend([choice for _ in range(response.count(f'{choice}'))]) + + if len(candidates) == 0 and len(response.split()) >= 1: + for index, ans in index2ans.items(): + # Add index for each occurrence of ans in response + candidates.extend([index for _ in range(response.count(ans))]) + + # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example + if len(candidates) == 0 and len(response.split()) >= 1: + for index, ans in index2ans.items(): + if ans in response: + candidates.append(index) + # index_ans = False # it's content ans. + + if len(candidates) == 0: # still not get answer, randomly choose one. + return random.choice(all_choices) + # return '' + else: + # Count the occurrence of each candidate + candidate_counts = Counter(candidates) + + # Select the most frequent candidates + max_count = max(candidate_counts.values()) + most_frequent_candidates = [c for c in all_choices if candidate_counts.get(c, 0) == max_count] + + # Combine the most frequent candidates in ABCD order + return ''.join(most_frequent_candidates) + + +def extract_numbers(string): + # Pattern for numbers with Chinese commas + pattern_commas = r'-?\d{1,3}(?:,\d{3})+' + # Pattern for scientific notation + pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+' + # Pattern for simple numbers without Chinese commas + pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!,\d)' + + # Extract numbers with Chinese commas + numbers_with_commas = re.findall(pattern_commas, string) + # Extract numbers in scientific notation + numbers_scientific = re.findall(pattern_scientific, string) + # Extract simple numbers without Chinese commas + numbers_simple = re.findall(pattern_simple, string) + + # Combine all extracted numbers + all_numbers = numbers_with_commas + numbers_scientific + numbers_simple + return all_numbers + + +def check_is_number(string): + try: + float(string.replace(',', '')) + return True + except ValueError: + # check if there's comma inside + return False + + +def count_letters(string): + return sum(c.isalpha() and 'a' <= c <= 'z' or 'A' <= c <= 'Z' for c in string) + + +def normalize_str(string, answer): + # check if characters in the string + + # if number, numerize it. 
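+ # Returns a one-element list with the normalized value: numbers are rounded to two decimals, while strings are kept only if they are not much longer than the reference answer (otherwise an empty list is returned).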
+ if string is None: + return [string] + string = string.strip() + + is_number = check_is_number(string) + + if is_number: + string = string.replace(',', '') + string = float(string) + # leave 2 decimal + string = round(string, 2) + return [string] + else: # it's likely to be a string + if len(string) > len(answer) + 20 or count_letters(string) > count_letters(answer) + 2: + return [] + return [string] + + +def get_fill_blank_prediction(response, answer): + """get the prediction from the generated response, + return a list of predicted strings or numbers""" + + def get_key_subresponses(response): + response = response.strip("。").strip() + sub_responses = re.split(r'。|\n', response) + indicators_of_keys = ['是', '为', '所以', '等于', '方案', '选择', + '正确答案', '因此', '最后', '答案', '结果'] + key_responses = [] + for index, resp in enumerate(sub_responses): + # if last one, accept it's an equation (the entire response can be just one sentence with equation) + if index == len(sub_responses) - 1: + indicators_of_keys.extend(['=']) + shortest_key_response = None + # the shortest response that may contain the answer (tail part of the response) + for indicator in indicators_of_keys: + if indicator in resp: + if not shortest_key_response: + shortest_key_response = resp.split(indicator)[-1].strip() + else: + if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response): + shortest_key_response = resp.split(indicator)[-1].strip() + + if shortest_key_response: + # and it's not trivial + if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]: + key_responses.append(shortest_key_response) + if len(key_responses) == 0: # did not found any + return [response] + return key_responses + + key_responses = get_key_subresponses(response) + + pred_list = key_responses.copy() # keep the original string response + for resp in key_responses: + pred_list.extend(extract_numbers(resp)) + + tmp_pred_list = [] + for i in range(len(pred_list)): + tmp_pred_list.extend(normalize_str(pred_list[i], answer)) + pred_list = tmp_pred_list + + # remove duplicates + pred_list = list(set(pred_list)) + + return pred_list + + +def get_TF_prediction(response): + """get the prediction from the generated response, + return a list of predicted strings or numbers""" + + def get_key_subresponses(response): + response = response.strip("。").strip() + sub_responses = re.split(r'。|\n', response) + indicators_of_keys = ['是', '为', '所以', '判断', + '陈述', '说法', '表达', '答案', '结果'] + key_responses = [] + for index, resp in enumerate(sub_responses): + shortest_key_response = None + # the shortest response that may contain the answer (tail part of the response) + for indicator in indicators_of_keys: + if indicator in resp: + if not shortest_key_response: + shortest_key_response = resp.split(indicator)[-1].strip() + else: + if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response): + shortest_key_response = resp.split(indicator)[-1].strip() + + if shortest_key_response: + # and it's not trivial + if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]: + key_responses.append(shortest_key_response) + if len(key_responses) == 0: # did not found any + return [response] + return key_responses + + key_responses = get_key_subresponses(response) + + pred_list = key_responses.copy() # keep the original string response + # remove duplicates + pred_list = list(set(pred_list)) + + return pred_list + + +class CMMMU(ImageBaseDataset): + TYPE = 'VQA' + + DATASET_URL = { + 'CMMMU_VAL': 
'https://opencompass.openxlab.space/utils/VLMEval/CMMMU_VAL.tsv' + } + + DATASET_MD5 = { + 'CMMMU_VAL': 'b4727e2fce2415bf646379e60c11a726' + } + + def dump_image(self, line): + os.makedirs(self.img_root, exist_ok=True) + + tgt_path_z = [] + if isinstance(line['image'], list): + for i in range(len(line['image'])): + tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'][i], tgt_path) + tgt_path_z.append(tgt_path) + else: + tgt_path = osp.join(self.img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + tgt_path_z.append(tgt_path) + return tgt_path_z + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + + suffix = eval_file.split('.')[-1] + result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + + if not osp.exists(result_file): + data = load(eval_file) + assert 'answer' in data and 'prediction' in data + data['prediction'] = [str(x) for x in data['prediction']] + data['answer'] = [str(x) for x in data['answer']] + + correct_count = 0 + correct_category = { + '技术与工程': [0, 0], + '科学': [0, 0], + '健康与医学': [0, 0], + '商业': [0, 0], + '艺术与设计': [0, 0], + '人文社会科学': [0, 0], + } + + for i in tqdm(data.iterrows()): + line = i[1] + correct_category[line['category']][0] += 1 + + # Options + if line['type'] == '选择': + index2ans = { + 'A': line['option1'], + 'B': line['option2'], + 'C': line['option3'], + 'D': line['option4'] + } + fact_option = get_multi_choice_prediction(line['prediction'], ['A', 'B', 'C', 'D'], index2ans) + if fact_option == line['answer']: + correct_count += 1 + correct_category[line['category']][1] += 1 + + # Binary + elif line['type'] == '判断': + positive_keywords = ['正确', '对', '准确', '肯定', '对的'] + negative_keywords = ['不对', '错误', '不正确', '不准确', '不合适', '否定', '错的', '错'] + ambiguous_keywords = ['对错', '是否正确', '否正确', '或者', '是否', '正确性', '对不'] + + def judge_similarity(pred_list, positive_keywords, negative_keywords): + positive_count = 0 + negative_count = 0 + + for pred in pred_list: + if any(pos_word in pred for pos_word in positive_keywords): + positive_count += 1 + elif any(neg_word in pred for neg_word in negative_keywords): + negative_count += 1 + + if positive_count > negative_count: + return "对" + elif negative_count > positive_count: + return "错" + else: + return random.choice(['对', '错']) + + answer = get_TF_prediction(line['prediction']) + answer = [word for word in answer if not any(ambiguous in word for ambiguous in ambiguous_keywords)] + fact_answer = judge_similarity(answer, positive_keywords, negative_keywords) + if fact_answer == line['answer']: + correct_count += 1 + correct_category[line['category']][1] += 1 + + # Fill the Blank + else: + norm_answers = normalize_str(line['answer'], line['answer']) + predicted_answer = get_fill_blank_prediction(line['prediction'], line['answer']) + + for pred in predicted_answer: + # already normalized + if isinstance(pred, str): # if it's a string, then find if ans in the pred_i + for norm_ans in norm_answers: + # only see if the string answer in the string pred + # print(norm_ans, pred) + if isinstance(norm_ans, str) and norm_ans in pred: + correct_count += 1 + correct_category[line['category']][1] += 1 + else: # it's a number + if pred in norm_answers: + correct_count += 1 + correct_category[line['category']][1] += 1 + + accuracyz = {} + accuracyz['总准确率'] = correct_count / len(data) + for i in correct_category.keys(): + accuracyz[i] = correct_category[i][1] / 
correct_category[i][0] + + accuracyz = d2df(accuracyz) + accuracyz.round(10) + dump(accuracyz, result_file) + + result = pd.read_csv(result_file) + return result + + def build_prompt(self, line): + if line['type'] == '选择': + tgt_path = self.dump_image(line) + question = line['question'] + options_prompt = 'Options:\n' + + for i in [['A', '1'], ['B', '2'], ['C', '3'], ['D', '4']]: + options_prompt += i[0] + '. ' + line['option' + i[1]] + '\n' + + prompt = (f'问题: {question}\n' + options_prompt + + '请回答上述多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。') + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + + return msgs + + elif line['type'] == '判断': + msgs = super().build_prompt(line) + assert msgs[-1]['type'] == 'text' + msgs[-1]['value'] += '\n请回答上述判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。' + return msgs + + else: + msgs = super().build_prompt(line) + assert msgs[-1]['type'] == 'text' + msgs[-1]['value'] += '\n请回答上述填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。' + return msgs diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/dude.py b/eval_mm/vlmevalkit/vlmeval/dataset/dude.py index 5e3c1da..c520c7d 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/dude.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/dude.py @@ -89,8 +89,9 @@ class DUDE(ImageBaseDataset): os.makedirs(self.img_root, exist_ok=True) try: import fitz - except: - warnings.warn('Please use `pip install pymupdf` to parse PDF files.') + except Exception as e: + logging.critical(f'{type(e)}: {e}') + logging.critical('Please use `pip install pymupdf` to parse PDF files.') line = origin_line.copy() if not isinstance(line['image_path'], List): diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py b/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py new file mode 100644 index 0000000..648c1e0 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/dynamath.py @@ -0,0 +1,240 @@ +import re +import json +import sympy as sp +import numpy as np +import pandas as pd +from sympy import simplify, Eq, sympify, Pow, pi +from sympy.parsing.latex import parse_latex +import sys +import math +import os +import os.path as osp +import argparse + +from .image_base import ImageBaseDataset +from .utils import build_judge +from ..utils import track_progress_rich +from ..smp import load, dump, d2df, toliststr + + +def preprocess(str1): + if 0 <= str1.find("{") < str1.rfind("}"): + str1 = str1[str1.find("{"): str1.rfind("}") + 1] + str2 = str1.replace("\\", "") + str2 = str2.replace("\\n", "\n") + return str2 + + +def transfer(str1): + if "\u03c0" in str1: + strs = str1.split('\u03c0') + str1 = strs[0] + return float(str1) * np.pi + else: + return float(str1) + + +def parse_answer(answer, answer_type="multiple choice"): + if answer_type == "float": + if answer.isdigit(): + return True, float(answer) + else: + parts = answer.split(' ') + answer = parts[0] + try: + answer = transfer(answer) + return True, answer + except: + return False, None + elif answer_type == "multiple choice": + if len(answer) == 1: + return True, answer.upper() + else: + in_flag = [ch in answer.upper() for ch in 'ABCDE'] + if sum(in_flag) == 1: + for ch in 'ABCDE': + if ch in answer.upper(): + return True, ch + return False, None + else: + return True, answer + + +def DynaMath_auxeval(model, line): + pred = line['prediction'] + pred = preprocess(pred) + + 
succeed, short_answer = None, None + try: + dj = json.loads(pred, strict=False) + short_answer = dj.get("short answer") + assert short_answer is not None + succeed, short_answer = parse_answer(short_answer, answer_type=line['anwser_type']) + assert succeed + except: + # Failed to parse the JSON, use an auxiliary LLM to get the short answer + if line['answer_type'] == 'multiple choice': + inst = "Output the corresponing choice option, such as 'A', 'B', 'C', 'D', in a single line." + elif line['answer_type'] == 'float': + inst = "Output a three-digit floating-point number in a single line." + else: + inst = ( + "Output a short answer in a single line. Any float numbers in the answer " + "should be formatted as three-digit floating-point numbers." + ) + + prompt = f"Free-form answer: {pred}\nInstruction: {inst}" + response = pred + succeed, short_answer = parse_answer(response, line['answer_type']) + if not succeed: + response = model.generate(prompt) + succeed, short_answer = parse_answer(response, line['answer_type']) + + if line['answer_type'] == 'float': + if succeed: + diff = float(short_answer) - float(line['answer']) + if abs(diff) <= 0.001: + return dict(parse=True, extracted=short_answer, correct=True) + else: + return dict(parse=True, extracted=short_answer, correct=False) + else: + return dict(parse=False, extracted=None, correct=False) + elif line['answer_type'] == 'multiple choice': + if succeed: + return dict(parse=True, extracted=short_answer, correct=(short_answer == line['answer'])) + else: + if line['answer'] in pred[:3].upper(): + return dict(parse=False, extracted=None, correct=True) + else: + return dict(parse=False, extracted=None, correct=False) + else: + if succeed: + return dict(parse=True, extracted=short_answer, correct=(short_answer.lower() in line['answer'].lower())) + else: + return dict(parse=False, extracted=None, correct=(short_answer.lower() in line['answer'].lower())) + + +class Dynamath(ImageBaseDataset): + + TYPE = 'VQA' + DATASET_URL = {'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv'} + DATASET_MD5 = {'DynaMath': 'b8425ad9a7114571fc9366e013699494'} + GUIDE = """ +## Answer Instruction Please provide an answer to the question outlined above. Your response should adhere \ +to the following JSON format, which includes two keys: 'solution' and 'short answer'. The 'solution' key can contain \ +detailed steps needed to solve the question, and the 'short answer' key should provide a concise response. {INST} + +Example of expected JSON response format: + +""" + EXAMPLE = { + "solution": "[Detailed step-by-step explanation]", + "short answer": "[Concise Answer]" + } + TEXT_EXAMPLE = json.dumps(EXAMPLE, indent=4) + + # Given one data record, return the built prompt (a multi-modal message), can override + def build_prompt(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + + if self.meta_only: + tgt_path = toliststr(line['image_path']) + else: + tgt_path = self.dump_image(line) + + prompt = f"## Question\n {line['question']}" + if line['answer_type'] == 'multiple choice': + inst = "Provide the corresponing choice option in the 'short answer' key, such as 'A', 'B', 'C', or 'D'." + elif line['answer_type'] == 'float': + inst = "Format the answer as a three-digit floating-point number and provide it in the 'short answer' key." + else: + inst = "Float numbers in the answer should be formatted as three-digit floating-point numbers." 
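+        # The answer-type-specific `inst` above is spliced into GUIDE below, so the model is
+        # asked for a JSON object with 'solution' and 'short answer' keys; DynaMath_auxeval
+        # reads that 'short answer' field and only falls back to re-parsing the raw
+        # prediction (and, if needed, the auxiliary judge model) when that contract is
+        # not followed.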
+ + prompt = prompt + self.GUIDE.format(INST=inst) + self.TEXT_EXAMPLE + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + return msgs + + def evaluate(self, eval_file, **judge_kwargs): + judge_name = judge_kwargs.pop('model', 'gpt-4o-mini') + + model = build_judge(model=judge_name, **judge_kwargs) + suffix = eval_file.split('.')[-1] + + storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841 + score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv') # noqa: F841 + tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841 + nproc = judge_kwargs.pop('nproc', 6) # noqa: F841 + + res = load(tmp_file) if os.path.exists(tmp_file) else {} + res = {k: v for k, v in res.items() if v is not None} + + model.system_prompt = """\ +You are a helpful assistant that helps me to format free-form answers into a short answer according to the instruction. +""" + if not osp.exists(storage): + data = load(eval_file) + lt = len(data) + payloads = [dict(model=model, line=data.iloc[i]) for i in range(lt) if data.iloc[i]['index'] not in res] + keys = [idx for idx in data['index'] if idx not in res] + + if len(keys): + results = track_progress_rich(DynaMath_auxeval, payloads, nproc=nproc, save=tmp_file, keys=keys) + for k, r in zip(keys, results): + res[k] = r + + data['parse'] = [res[idx]['parse'] for idx in data['index']] + data['extracted'] = [res[idx]['extracted'] for idx in data['index']] + data['correct'] = [res[idx]['correct'] for idx in data['index']] + dump(data, storage) + + data = load(storage) + # Calculate Average Accuracy + score_avg = {} + score_avg['Overall'] = np.mean(data['correct']) + + subs = set(data['subject']) + for sub in subs: + data_sub = data[data['subject'] == sub] + score_avg[f'Subject-{sub}'] = np.mean(data_sub['correct']) + + lvls = set(data['knowledge_level']) + for lvl in lvls: + data_lvl = data[data['knowledge_level'] == lvl] + score_avg[f'Level-{lvl}'] = np.mean(data_lvl['correct']) + + # Calculate the Worst Case Accuracy + score_worst = {} + data_worst = data[data['varid'] == 1] + qid2corr = {idx: True for idx in data_worst['index']} + lt = len(data) + for i in range(lt): + item = data.iloc[i] + qid2corr[item['qid']] *= item['correct'] + data_worst['correct'] = [qid2corr[idx] for idx in data_worst['qid']] + score_worst['Overall'] = np.mean(data_worst['correct']) + + subs = set(data_worst['subject']) + for sub in subs: + data_sub = data_worst[data_worst['subject'] == sub] + score_worst[f'Subject-{sub}'] = np.mean(data_sub['correct']) + + lvls = set(data_worst['knowledge_level']) + for lvl in lvls: + data_lvl = data_worst[data_worst['knowledge_level'] == lvl] + score_worst[f'Level-{lvl}'] = np.mean(data_lvl['correct']) + + d1 = {'Setting': 'Average'} + d1.update(score_avg) + d2 = {'Setting': 'Worst Case'} + d2.update(score_worst) + score = pd.concat([d2df(d1), d2df(d2)], ignore_index=True) + + dump(score, score_file) + return score diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py b/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py index 80f7b93..8bce438 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/image_base.py @@ -4,12 +4,19 @@ from ..smp import * def img_root_map(dataset): + if 'MM_NIAH' in dataset: + return 'MMNIAH' + if 'CRPE' in dataset: + return 'CRPE' if 'OCRVQA' in dataset: return 'OCRVQA' 
if 'COCO_VAL' == dataset: return 'COCO' if 'MMMU' in dataset: return 'MMMU' + if "QSpatial" in dataset: + return "QSpatial" + mmbench_root_map = { 'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench', 'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench', diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py b/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py new file mode 100644 index 0000000..aa1d7e2 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/image_ccocr.py @@ -0,0 +1,197 @@ +# flake8: noqa + +import os +import re +import tempfile +from functools import partial +import pandas as pd + +from .image_base import ImageBaseDataset +from ..smp import * + +# should be the same as FAIL_MSG definded in vlmeval/inference.py +FAIL_MSG = 'Failed to obtain answer via API.' + + +class CCOCRDataset(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL_MODELSCOPE = { + "CCOCR_DocParsing_DocPhotoChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_photo_chn_75.tsv", + "CCOCR_DocParsing_DocPhotoEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_photo_eng_75.tsv", + "CCOCR_DocParsing_DocScanChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_scan_chn_75.tsv", + "CCOCR_DocParsing_DocScanEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/doc/doc_scan_eng_75.tsv", + "CCOCR_DocParsing_TablePhotoChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_photo_chn_75.tsv", + "CCOCR_DocParsing_TablePhotoEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_photo_eng_75.tsv", + "CCOCR_DocParsing_TableScanChn": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_scan_chn_75.tsv", + "CCOCR_DocParsing_TableScanEng": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/table/table_scan_eng_75.tsv", + "CCOCR_DocParsing_MolecularHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/molecular/molecular_handwriting_100.tsv", + "CCOCR_DocParsing_FormulaHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/doc_parsing/formula/formula_handwriting_100.tsv", + "CCOCR_Kie_Sroie2019Word": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/sroie2019_word_347.tsv", + "CCOCR_Kie_Cord": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/CORD_100.tsv", + "CCOCR_Kie_EphoieScut": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/EPHOIE_SCUT_311.tsv", + "CCOCR_Kie_Poie": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/constrained_category/POIE_250.tsv", + "CCOCR_Kie_ColdSibr": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/open_category/COLD_SIBR_400.tsv", + "CCOCR_Kie_ColdCell": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/kie/open_category/COLD_CELL_600.tsv", + "CCOCR_MultiLanOcr_Arabic": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Arabic/Arabic_150.tsv", + "CCOCR_MultiLanOcr_French": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/French/French_150.tsv", + "CCOCR_MultiLanOcr_German": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/German/German_150.tsv", + "CCOCR_MultiLanOcr_Italian": 
"https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Italian/Italian_150.tsv", + "CCOCR_MultiLanOcr_Japanese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Japanese/Japanese_150.tsv", + "CCOCR_MultiLanOcr_Korean": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Korean/Korean_150.tsv", + "CCOCR_MultiLanOcr_Portuguese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Portuguese/Portuguese_150.tsv", + "CCOCR_MultiLanOcr_Russian": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Russian/Russian_150.tsv", + "CCOCR_MultiLanOcr_Spanish": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Spanish/Spanish_150.tsv", + "CCOCR_MultiLanOcr_Vietnamese": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_lan_ocr/Vietnamese/Vietnamese_150.tsv", + "CCOCR_MultiSceneOcr_Cord": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/CORD_100.tsv", + "CCOCR_MultiSceneOcr_Funsd": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/FUNSD_50.tsv", + "CCOCR_MultiSceneOcr_Iam": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/IAM_50.tsv", + "CCOCR_MultiSceneOcr_ZhDoc": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/zh_doc_100.tsv", + "CCOCR_MultiSceneOcr_ZhHandwriting": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/document_text/zh_handwriting_50.tsv", + "CCOCR_MultiSceneOcr_Hieragent": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/Hieragent_100.tsv", + "CCOCR_MultiSceneOcr_Ic15": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/IC15_500.tsv", + "CCOCR_MultiSceneOcr_Inversetext": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/InverseText_500.tsv", + "CCOCR_MultiSceneOcr_Totaltext": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/TotalText_300.tsv", + "CCOCR_MultiSceneOcr_ZhScene": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/scene_text/zh_scene_450.tsv", + "CCOCR_MultiSceneOcr_UgcLaion": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/ugc_laion_400.tsv", + "CCOCR_MultiSceneOcr_ZhDense": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/zh_dense_50.tsv", + "CCOCR_MultiSceneOcr_ZhVertical": "https://www.modelscope.cn/datasets/Qwen/CC-OCR/resolve/master/multi_scene_ocr/ugc_text/zh_vertical_100.tsv" + } + + DATASET_URL_HUGGINGFACE = { + "CCOCR_DocParsing_DocPhotoChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_photo_chn_75.tsv", + "CCOCR_DocParsing_DocPhotoEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_photo_eng_75.tsv", + "CCOCR_DocParsing_DocScanChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_scan_chn_75.tsv", + "CCOCR_DocParsing_DocScanEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/doc/doc_scan_eng_75.tsv", + "CCOCR_DocParsing_TablePhotoChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_photo_chn_75.tsv", + "CCOCR_DocParsing_TablePhotoEng": 
"https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_photo_eng_75.tsv", + "CCOCR_DocParsing_TableScanChn": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_scan_chn_75.tsv", + "CCOCR_DocParsing_TableScanEng": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/table/table_scan_eng_75.tsv", + "CCOCR_DocParsing_MolecularHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/molecular/molecular_handwriting_100.tsv", + "CCOCR_DocParsing_FormulaHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/doc_parsing/formula/formula_handwriting_100.tsv", + "CCOCR_Kie_Sroie2019Word": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/sroie2019_word_347.tsv", + "CCOCR_Kie_Cord": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/CORD_100.tsv", + "CCOCR_Kie_EphoieScut": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/EPHOIE_SCUT_311.tsv", + "CCOCR_Kie_Poie": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/constrained_category/POIE_250.tsv", + "CCOCR_Kie_ColdSibr": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/open_category/COLD_SIBR_400.tsv", + "CCOCR_Kie_ColdCell": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/kie/open_category/COLD_CELL_600.tsv", + "CCOCR_MultiLanOcr_Arabic": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Arabic/Arabic_150.tsv", + "CCOCR_MultiLanOcr_French": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/French/French_150.tsv", + "CCOCR_MultiLanOcr_German": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/German/German_150.tsv", + "CCOCR_MultiLanOcr_Italian": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Italian/Italian_150.tsv", + "CCOCR_MultiLanOcr_Japanese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Japanese/Japanese_150.tsv", + "CCOCR_MultiLanOcr_Korean": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Korean/Korean_150.tsv", + "CCOCR_MultiLanOcr_Portuguese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Portuguese/Portuguese_150.tsv", + "CCOCR_MultiLanOcr_Russian": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Russian/Russian_150.tsv", + "CCOCR_MultiLanOcr_Spanish": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Spanish/Spanish_150.tsv", + "CCOCR_MultiLanOcr_Vietnamese": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_lan_ocr/Vietnamese/Vietnamese_150.tsv", + "CCOCR_MultiSceneOcr_Cord": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/CORD_100.tsv", + "CCOCR_MultiSceneOcr_Funsd": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/FUNSD_50.tsv", + "CCOCR_MultiSceneOcr_Iam": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/IAM_50.tsv", + "CCOCR_MultiSceneOcr_ZhDoc": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/zh_doc_100.tsv", + "CCOCR_MultiSceneOcr_ZhHandwriting": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/document_text/zh_handwriting_50.tsv", + "CCOCR_MultiSceneOcr_Hieragent": 
"https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/Hieragent_100.tsv", + "CCOCR_MultiSceneOcr_Ic15": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/IC15_500.tsv", + "CCOCR_MultiSceneOcr_Inversetext": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/InverseText_500.tsv", + "CCOCR_MultiSceneOcr_Totaltext": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/TotalText_300.tsv", + "CCOCR_MultiSceneOcr_ZhScene": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/scene_text/zh_scene_450.tsv", + "CCOCR_MultiSceneOcr_UgcLaion": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/ugc_laion_400.tsv", + "CCOCR_MultiSceneOcr_ZhDense": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/zh_dense_50.tsv", + "CCOCR_MultiSceneOcr_ZhVertical": "https://huggingface.co/datasets/wulipc/CC-OCR/resolve/main/multi_scene_ocr/ugc_text/zh_vertical_100.tsv" + } + + # define data path + DATASET_URL = DATASET_URL_MODELSCOPE + DATASET_MD5 = { + "CCOCR_DocParsing_DocPhotoChn": "9039dcbb31830d413261a95cfa29d97f", + "CCOCR_DocParsing_DocPhotoEng": "2ca0824881e1d7317626f2a19d902989", + "CCOCR_DocParsing_DocScanChn": "9e265c8aa760ebdf5c3bf9e892d55492", + "CCOCR_DocParsing_DocScanEng": "77d04637be3def86dbc2ce37ba64a704", + "CCOCR_DocParsing_TablePhotoChn": "c4dc85252ddad2b43a03a67b1d1ae983", + "CCOCR_DocParsing_TablePhotoEng": "02ab75d6169da0cd2ece9ce0ae14a479", + "CCOCR_DocParsing_TableScanChn": "f1f79959fdd01127df7377c9d46722f2", + "CCOCR_DocParsing_TableScanEng": "794903c7acf52bfe956eefba2166d14b", + "CCOCR_DocParsing_MolecularHandwriting": "30b7f7679b713ce000a939eca7b4078f", + "CCOCR_DocParsing_FormulaHandwriting": "e03047776ce5e79a61ae1c057e2a348e", + "CCOCR_Kie_Sroie2019Word": "3287d99a8e86a99b74171fa5a70f9acb", + "CCOCR_Kie_Cord": "ab297cadcbc7158884a301c366f3330a", + "CCOCR_Kie_EphoieScut": "bb8fa3ba7ea91cbf17be0904956ad3f3", + "CCOCR_Kie_Poie": "882b64317989ecbfed6518051cdffb14", + "CCOCR_Kie_ColdSibr": "109d5dad8b7081fb6a2f088e963196d4", + "CCOCR_Kie_ColdCell": "7b44c45b4d7d768d1dbdc08872fe7d3a", + "CCOCR_MultiLanOcr_Arabic": "e9a3f2bb9298d0b882ebc7a98980c3f3", + "CCOCR_MultiLanOcr_French": "729407ed2036c22e602eff645eddd40c", + "CCOCR_MultiLanOcr_German": "96fc2edae747f0ec95b0a6f9bf723022", + "CCOCR_MultiLanOcr_Italian": "29a508fa5d5a5e767497dd69e2430ebb", + "CCOCR_MultiLanOcr_Japanese": "bbcca96ccf25fff63597c2ab4f3ebb1f", + "CCOCR_MultiLanOcr_Korean": "0f55dbd24eba5edc189c91e124411641", + "CCOCR_MultiLanOcr_Portuguese": "a6fcf8831775a61aa631c0cf1c422ae7", + "CCOCR_MultiLanOcr_Russian": "19d2f84062a1699d3e9333912bd6b303", + "CCOCR_MultiLanOcr_Spanish": "f5a0cfa9f2ae4115c91c7b362034e591", + "CCOCR_MultiLanOcr_Vietnamese": "bf1cd4e83d91767f4906f81550cec8b9", + "CCOCR_MultiSceneOcr_Cord": "92943f0ccb4c5a196c574222e76759a0", + "CCOCR_MultiSceneOcr_Funsd": "229cc38d193edd00f4383610e98ee873", + "CCOCR_MultiSceneOcr_Iam": "d897a6d6c3880c65e752ec11b211204c", + "CCOCR_MultiSceneOcr_ZhDoc": "303682cc16c8bb51b2b896f8ceb8bd38", + "CCOCR_MultiSceneOcr_ZhHandwriting": "faa298d366bc05e5cfb39e334afb8eff", + "CCOCR_MultiSceneOcr_Hieragent": "6f132cdd0473d7cc145c3e3a08957dd6", + "CCOCR_MultiSceneOcr_Ic15": "3d94869f312a41d53d0578a06a2fb1f2", + "CCOCR_MultiSceneOcr_Inversetext": "e141d424a0c4cf9579064428a270f13d", + "CCOCR_MultiSceneOcr_Totaltext": "ca1daf81d49eeb57ef844b72a23c2e62", + 
"CCOCR_MultiSceneOcr_ZhScene": "9295152a66e6f117db8bfbb20a9013e6", + "CCOCR_MultiSceneOcr_UgcLaion": "8e9ea1fbf9d56532157e807eabf39b21", + "CCOCR_MultiSceneOcr_ZhDense": "de8f48ee0c8a2cf8ed7f2b3a81e6322d", + "CCOCR_MultiSceneOcr_ZhVertical": "4892b4aec6e7fd11e39aaea23712709b" + } + + # It returns a DataFrame + def evaluate(self, eval_file, **judge_kwargs): + """ + """ + df = load(eval_file) + dict_list = df.to_dict(orient='records') + + required_colume_list = ['answer', 'prediction', "category", "image_name", "l2-category", "split"] + for required_colume in required_colume_list: + assert required_colume in df, "required_colume: {} NOT found".format(required_colume) + + gt_info, ptd_info = {}, {} + for data_info in dict_list: + image_name = data_info['image_name'] + gt_info[image_name] = data_info['answer'] + + # warning the FAIL samples + if data_info['prediction'] != FAIL_MSG: + ptd_info[image_name] = data_info['prediction'] + + # assert eval_file is a single dataset + group_name = set([str(x) for x in df['category']]).pop() + op_name = set([str(x) for x in df['l2-category']]).pop() + data_name = set([str(x) for x in df['split']]).pop() + + data_info = {"op": op_name, "group": group_name, "dataset": data_name, "num": len(gt_info)} + try: + from .utils.ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map + except ImportError as err: + import warnings + warnings.warn('The dependency of CCOCR evaluator is not properly installed') + warnings.warn(f'{type(err)}: {err}') + eval_func = ccocr_evaluator_map.get(group_name, None) + if eval_func is None: + raise ValueError("error: evaluator not defined for: {}".format(group_name)) + meta_info, eval_info = eval_func(ptd_info, gt_info, **data_info) + + output_info = {"meta": meta_info, "evaluation": eval_info, "config": data_info} + result_file = os.path.splitext(os.path.abspath(eval_file))[0] + "_eval.json" + dump(output_info, result_file) + + # update global status for summary + # warning: the evaluate function should NOT run in parallel + all_status_info = {} + global_status_path = os.path.join(os.path.dirname(eval_file), "status.json") + if os.path.exists(global_status_path): + with open(global_status_path, "r") as f: + all_status_info = json.load(f) + all_status_info[data_name] = output_info + with open(global_status_path, "w") as f: + json.dump(all_status_info, f, ensure_ascii=False, indent=4) + return eval_info.get("summary") diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py b/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py index 5fb5ac4..235a655 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/image_mcq.py @@ -3,7 +3,7 @@ import warnings from .image_base import ImageBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..smp import * - +import pandas as pd MMMB_URLS = { 'MMMB_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv', @@ -42,31 +42,31 @@ class ImageMCQDataset(ImageBaseDataset): DATASET_URL = { # MMBench v1.0 - 'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN.tsv', - 'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN.tsv', - 'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN.tsv', - 'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN.tsv', - 'MMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench.tsv', # Internal Only - 'MMBench_CN': 
'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv', # Internal Only + 'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN.tsv', + 'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN.tsv', + 'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN.tsv', + 'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN.tsv', + 'MMBench': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench.tsv', # Internal + 'MMBench_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN.tsv', # Internal # MMBench v1.1 - 'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN_V11.tsv', - 'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN_V11.tsv', - 'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN_V11.tsv', - 'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN_V11.tsv', - 'MMBench_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_V11.tsv', # Internal Only - 'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv', # Internal Only + 'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN_V11.tsv', + 'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN_V11.tsv', + 'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN_V11.tsv', + 'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN_V11.tsv', + 'MMBench_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_V11.tsv', # Internal + 'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN_V11.tsv', # Internal # SEEDBench Series - 'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench_IMG.tsv', + 'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench_IMG.tsv', 'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv', - 'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench2_Plus.tsv', + 'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench2_Plus.tsv', # ScienceQA Series - 'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_VAL.tsv', - 'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_TEST.tsv', + 'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_VAL.tsv', + 'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_TEST.tsv', # MMT-Bench - 'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL_MI.tsv', - 'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL.tsv', - 'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL_MI.tsv', - 'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL.tsv', + 'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL_MI.tsv', + 'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL.tsv', + 'MMT-Bench_VAL_MI': 
'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL_MI.tsv', + 'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL.tsv', # AesBench 'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv', 'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv', @@ -76,6 +76,9 @@ class ImageMCQDataset(ImageBaseDataset): # A-Bench 'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv', 'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv', + # R-Bench + 'R-Bench-Dis': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-dis.tsv', + 'R-Bench-Ref': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-ref.tsv', # Other Benchmarks 'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv', 'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv', @@ -88,7 +91,16 @@ class ImageMCQDataset(ImageBaseDataset): 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/' 'resolve/main/TaskMeAnything-v1-imageqa-random.tsv' ), - 'A-OKVQA': 'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv' + 'A-OKVQA': 'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv', + 'WorldMedQA-V': 'https://opencompass.openxlab.space/utils/VLMEval/WorldMedQA-V.tsv', + 'VisOnlyQA-VLMEvalKit': ( + 'https://huggingface.co/datasets/ryokamoi/VisOnlyQA_Eval_Real/' + 'resolve/main/visonlyqa_vlmevalkit.tsv' + ), + '3DSRBench': ( + 'https://huggingface.co/datasets/ccvl/3DSRBench/' + 'resolve/main/3dsrbench_v1_vlmevalkit_circular.tsv' + ), } DATASET_MD5 = { @@ -127,15 +139,21 @@ class ImageMCQDataset(ImageBaseDataset): # A-Bench 'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1', 'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c', + # R-Bench + 'R-Bench-Dis': 'd6e961dbfc43350688af2560226830b4', + 'R-Bench-Ref': '270c1cb555acb523f3fdb178ed57021d', # Other Benchmarks 'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac', 'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975', 'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be', 'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e', - 'RealWorldQA': '92321028d2bc29040284b6674721e48f', + 'RealWorldQA': '4de008f55dc4fd008ca9e15321dc44b7', 'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f', 'BLINK': '3b6649b6a662184ea046908e5506260e', - 'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889' + 'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889', + 'WorldMedQA-V': '441e63875e30c87f5750528b57b41285', + "VisOnlyQA-VLMEvalKit": 'cf460a31d2acb8d3a7cecd0e69298bfa', + '3DSRBench': '13a99f33164dc1b9faf0e8b8b01fd6f2', } DATASET_URL.update(MMMB_URLS) @@ -256,6 +274,12 @@ class ImageMCQDataset(ImageBaseDataset): warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \ please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \ larger than the VAL dataset and the leaderboard results are based on AesBench TEST.') + if dataset == 'VisOnlyQA-VLMEvalKit': + warnings.warn('Note that the results on VisOnlyQA-VLMEvalKit are different from the results on \ + the original VisOnlyQA. VisOnlyQA-VLMEvalKit does not include the \ + chemistry__shape_multi split and uses a different evaluation prompt. 
Please \ + explicitly specify the version of the dataset when you report results.') + return acc @@ -267,7 +291,7 @@ class MMMUDataset(ImageMCQDataset): } DATASET_MD5 = { - 'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d', + 'MMMU_DEV_VAL': '585e8ad75e73f75dcad265dfd0417d64', 'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d', } @@ -381,13 +405,72 @@ class MUIRDataset(ImageMCQDataset): class GMAIMMBenchDataset(ImageMCQDataset): DATASET_URL = { - 'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv' + 'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv', + 'GMAI_mm_bench_TEST_part_1': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_1.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_2': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_2.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_3': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_3.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_4': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_4.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_5': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_5.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_6': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_6.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_7': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_7.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_8': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_8.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_9': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_9.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_10': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_10.tsv', # noqa: E501 + 'GMAI_mm_bench_TEST_part_11': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_11.tsv', # noqa: E501 } DATASET_MD5 = { - 'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324' + 'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324', + 'GMAI_mm_bench_TEST_part_1': '900d735231230a63f4ed45665c078ef4', + 'GMAI_mm_bench_TEST_part_2': '1b27ab621386945d7e4a765ad2d22b0e', + 'GMAI_mm_bench_TEST_part_3': '44bdc2b6267dd505d529b8cad06f0fb2', + 'GMAI_mm_bench_TEST_part_4': '5a04a04fcac9f1466709f242fdb80acb', + 'GMAI_mm_bench_TEST_part_5': 'c70baf8909eda9af0ddeab275c721336', + 'GMAI_mm_bench_TEST_part_6': '825abc39596b644dead9350d0cfa3b96', + 'GMAI_mm_bench_TEST_part_7': 'defb8aed2fb77365a76b6b9abd6a2701', + 'GMAI_mm_bench_TEST_part_8': 'ff490d60b85f2bb0abb67a435b298c65', + 'GMAI_mm_bench_TEST_part_9': 'ff67c86f40da93b09139ac1d1ba5dc6b', + 'GMAI_mm_bench_TEST_part_10': '3dae94627b9ac0fe00180d4780fbf6dc', + 'GMAI_mm_bench_TEST_part_11': 'd08dc813f0eb6bbab63cae2a9d113c4b', } + @classmethod + def supported_datasets(cls): + return ['GMAI-MMBench_VAL', 'GMAI-MMBench_TEST'] + + def load_data(self, dataset): + if dataset == 'GMAI-MMBench_VAL': + data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv') + if file_size(data_path, 'GB') > 1: + local_path = data_path.replace('.tsv', '_local.tsv') + if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'): + from ..tools import LOCALIZE 
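+                    # LOCALIZE writes a *_local.tsv copy of the oversized (> 1 GB) TSV;
+                    # it is re-run when the FORCE_LOCAL environment variable is set, and
+                    # the local copy is what gets loaded below.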
+ LOCALIZE(data_path, local_path) + data_path = local_path + return load(data_path) + elif dataset == 'GMAI-MMBench_TEST': + dfs = [] + for part_num in range(1, 12): + part_name = f'GMAI_mm_bench_TEST_part_{part_num}' + url = self.DATASET_URL[part_name] + file_md5 = self.DATASET_MD5.get(part_name) + tsv_path = osp.join(LMUDataRoot(), f'{part_name}.tsv') + if not osp.exists(tsv_path) or (file_md5 and md5(tsv_path) != file_md5): + download_file(url, filename=tsv_path) + local_path = tsv_path.replace('.tsv', '_local.tsv') + if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'): + from ..tools import LOCALIZE + LOCALIZE(tsv_path, local_path) + tsv_path = local_path + # 加载数据 + df = load(tsv_path) + dfs.append(df) + # 合并所有数据 + data = pd.concat(dfs, ignore_index=True) + return data + else: + raise ValueError(f"未知的数据集:{dataset}") + def report_acc_by_groups(self, df, group_column): res = defaultdict(list) @@ -402,7 +485,7 @@ class GMAIMMBenchDataset(ImageMCQDataset): res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']] if group_column not in df: - raise ValueError(f"Column '{group_column}' not found in dataframe.") + raise ValueError(f"Column '{group_column}' not found in dataframe.") # noqa: E713 abilities = list(set(df[group_column])) abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities] @@ -470,6 +553,277 @@ class GMAIMMBenchDataset(ImageMCQDataset): return acc +class MMERealWorld(ImageMCQDataset): + + TYPE = 'MMERealWorld' + + DATASET_MD5 = { + 'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36', + 'MME-RealWorld-Lite': '4c17057d7d3b6c4a0d4397c3dae0881c', + 'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444', + } + SYS = { + 'MME-RealWorld': ( + 'Select the best answer to the above multiple-choice question based on the image. ' + 'Respond with only the letter (A, B, C, D, or E) of the correct option. \n' + 'The best answer is:' + ), + 'MME-RealWorld-Lite': ( + 'Select the best answer to the above multiple-choice question based on the image. ' + 'Respond with only the letter (A, B, C, D, or E) of the correct option. 
\n' + 'The best answer is:' + ), + 'MME-RealWorld-CN': ( + '根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n' + '最佳答案为:' + ), + } + + @classmethod + def supported_datasets(cls): + return ['MME-RealWorld', 'MME-RealWorld-CN', 'MME-RealWorld-Lite',] + + def load_data( + self, dataset="MME-RealWorld", repo_id="yifanzhang114/MME-RealWorld-Base64" + ): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.DATASET_MD5[dataset]: + return False + return True + + def generate_tsv(pth): + tsv_file = os.path.join(pth, f"{dataset}.tsv") + + if os.path.exists(tsv_file): + print(f"{tsv_file} already exists.") + return + + json_dir = os.path.join(pth, dataset) + json_files = [f for f in os.listdir(json_dir) if f.endswith(".json")] + + data_list = [] + for json_file in json_files: + with open(os.path.join(json_dir, json_file), "r") as f: + data = json.load(f) + for item in tqdm(data): + choice_prompt = ( + "The choices are listed below:\n" + if dataset in ["MME-RealWorld", "MME-RealWorld-Lite"] + else "选项如下所示:\n" + ) + data_list.append( + { + "index": item["index"], + "image": item["image"], + "question": item["question"], + "multi-choice options": choice_prompt + + "\n".join(item["multi-choice options"]), + "A": item["multi-choice options"][0][4:], + "B": item["multi-choice options"][1][4:], + "C": item["multi-choice options"][2][4:], + "D": item["multi-choice options"][3][4:], + "E": item["multi-choice options"][4][4:], + "answer": item["answer"], + "category": item["category"], + "l2-category": item["l2-category"], + } + ) + df = pd.DataFrame(data_list) + df.to_csv(tsv_file, sep="\t", index=False) + print(f"TSV file saved to {tsv_file}") + + # Check if dataset is cached and has integrity + if dataset == "MME-RealWorld-Lite": + url = 'https://huggingface.co/datasets/yifanzhang114/MME-RealWorld-Base64/resolve/main/mme_realworld_lite.tsv' # noqa: E501 + file_md5 = ( + self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None + ) + datas = self.prepare_tsv(url, file_md5) + choice_prompt = "The choices are listed below:\n" + for index, item in datas.iterrows(): + options = eval(item["multi-choice options"]) + datas.loc[index, "multi-choice options"] = choice_prompt + "\n".join( + options + ) + datas.loc[index, "A"] = options[0][4:] + datas.loc[index, "B"] = options[1][4:] + datas.loc[index, "C"] = options[2][4:] + datas.loc[index, "D"] = options[3][4:] + datas.loc[index, "E"] = options[4][4:] + return datas + + update_flag = False + cache_path = get_cache_path(repo_id) + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + print(f"Using cached dataset from {cache_path}") + else: + from huggingface_hub import snapshot_download + + # Download or find the dataset path + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + generate_tsv(dataset_path) + update_flag = True + + data_path = os.path.join(dataset_path, f"{dataset}.tsv") + if file_size(data_path, "GB") > 1: + local_path = data_path.replace(".tsv", "_local.tsv") + if ( + not osp.exists(local_path) + or os.environ.get("FORCE_LOCAL", None) + or update_flag + ): + from vlmeval.tools import LOCALIZE + + LOCALIZE(data_path, local_path) + data_path = local_path + return load(data_path) + + def post_build(self, dataset): + self.TYPE = 'MMERealWorld' + + # Given one data record, return the built prompt (a multi-modal message), can override + def build_prompt(self, line): + if isinstance(line, 
int): + line = self.data.iloc[line] + + if self.meta_only: + tgt_path = toliststr(line['image_path']) + else: + tgt_path = self.dump_image(line) + + question = line['question'] + + choice_prompt = line['multi-choice options'] + '\n' + question += ' ' + choice_prompt + self.SYS[self.dataset_name] + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=question)) + return msgs + + # It returns a dictionary + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.multiple_choice import extract_characters_regex, get_dimension_rating + assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + FAIL_MSG = 'Failed to obtain answer via API.' + tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + tgt_file = eval_file.replace('.xlsx', '_rating.json') + score_file = eval_file.replace('.xlsx', '_score.xlsx') + + if not osp.exists(score_file): + + res = {} if not osp.exists(tmp_file) else load(tmp_file) + res = {k: v for k, v in res.items() if FAIL_MSG not in v} + + data = load(eval_file) + cnt_rejected = 0 + data_un = data[~pd.isna(data['prediction'])] + + for idx in data['index']: + ans = data.loc[data['index'] == idx, 'answer'].values[0] + pred = data.loc[data['index'] == idx, 'prediction'].values[0] + + extract_pred = extract_characters_regex(pred) + if extract_pred == '': + cnt_rejected += 1 + data.loc[data['index'] == idx, 'score'] = 0 + else: + data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans) + + print( + f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, ' + f'failed to obtain the score for another {cnt_rejected} questions. ' + f'Those questions will be counted as 0 score in ALL rating.' 
+ ) + + dump(data, score_file) + + rating = get_dimension_rating(score_file) + dump(rating, tgt_file) + return rating + + +class HRBenchDataset(ImageMCQDataset): + + DATASET_URL = { + 'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv', + 'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv', + } + + DATASET_MD5 = { + 'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65', + 'HRBench8K': '274c9c7f89329b804a4723178a00219c', + } + + def evaluate(self, eval_file, **judge_kwargs): + assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file) + from .utils.multiple_choice import mcq_vanilla_eval + from .utils.hrbench import report_acc_hrbench + nproc = judge_kwargs.pop('nproc', 4) + + suffix = eval_file.split('.')[-1] + model = judge_kwargs.get('model', 'extract_matching') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'} + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + + data = load(eval_file) + data = data.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + # If not choice label, then use lower case + for k in data.keys(): + data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + + meta = self.data + meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])} + data_map = {x: y for x, y in zip(data['index'], data['question'])} + for k in data_map: + assert k in meta_q_map, ( + f'eval_file should be the same as or a subset of dataset {self.dataset_name}' + ) + + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + + if osp.exists(score_file): + acc = load(score_file) + return acc + data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) + dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + + acc = report_acc_hrbench(data) + + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + dump(acc, score_file) + + return acc + + class CustomMCQDataset(ImageMCQDataset): def load_data(self, dataset): @@ -482,3 +836,69 @@ class CustomMCQDataset(ImageMCQDataset): LOCALIZE(data_path, local_path) data_path = local_path return load(data_path) + + +class NaturalBenchDataset(ImageMCQDataset): + + DATASET_URL = { + 'NaturalBenchDataset': ( + 'https://huggingface.co/datasets/BaiqiL/' + 'NaturalBench/resolve/main/NaturalBenchDataset.tsv' + ), + } + DATASET_MD5 = { + 'NaturalBenchDataset':'dbe25b044bc35696426381e9ba4fe930', + } + + def build_prompt(self, line): + SUFFIX_FOR_VQA = { + "yes_no": "Please answer Yes or No.", + "multiple_choice": "Please output the letter corresponding to the correct option." 
+ } + if isinstance(line, int): + line = self.data.iloc[line] + + if self.meta_only: + tgt_path = toliststr(line['image_path']) + else: + tgt_path = self.dump_image(line) + + question = line['question'] + prompt = f'{question} {SUFFIX_FOR_VQA[line["type"]]}' + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + + return msgs + + def evaluate(self, eval_file, **judge_kwargs): + from .utils.naturalbench import extract_answer, get_scores + + data = load(eval_file) + data = data.sort_values(by='index') + predictions = [str(x) for x in data['prediction']] + answers = [str(x) for x in data['answer']] + indexs = [str(x) for x in data['index']] + meta = self.data + types = [str(x) for x in meta['type']] + results = {} + assert len(predictions) == len(answers) == len(indexs) == len(types) == (1900 * 4) + number_answered_samples = len(predictions) // 4 + for i in range(number_answered_samples): + results[i] = { + "q0_i0": extract_answer(predictions[i * 4], types[i * 4]), + "q0_i1": extract_answer(predictions[i * 4 + 1], types[i * 4 + 1]), + "q1_i0": extract_answer(predictions[i * 4 + 2], types[i * 4 + 2]), + "q1_i1": extract_answer(predictions[i * 4 + 3], types[i * 4 + 3]) + } + + scores = get_scores(results) + print(scores) + score_file = 'NaturalBench_acc.csv' + df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score']) + dump(df, score_file) + + return scores diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py b/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py index 826cf3f..fbcc7de 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/image_vqa.py @@ -1,5 +1,10 @@ +import os +import re +import tempfile from functools import partial +import pandas as pd + from .image_base import ImageBaseDataset from .utils import build_judge, DEBUG_MESSAGE from ..smp import * @@ -18,6 +23,7 @@ class ImageVQADataset(ImageBaseDataset): 'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv', 'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv', 'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv', + 'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv', } DATASET_MD5 = { @@ -29,6 +35,7 @@ class ImageVQADataset(ImageBaseDataset): 'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe', 'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227', 'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42', + 'GQA_TestDev_Balanced': '99b62f22e224d9b2f32dcbe41359d1c9', } def build_prompt(self, line): @@ -53,7 +60,7 @@ class ImageVQADataset(ImageBaseDataset): res = pool.map(partial(process_line, method='vqa_score'), lines) elif listinstr(['ChartQA'], dataset): res = pool.map(partial(process_line, method='relaxed_accuracy'), lines) - elif listinstr(['OCRVQA'], dataset): + elif listinstr(['OCRVQA', 'GQA'], dataset): res = pool.map(partial(process_line, method='accuracy'), lines) elif listinstr(['DocVQA', 'InfoVQA'], dataset): res = pool.map(partial(process_line, method='anls'), lines) @@ -90,6 +97,46 @@ class ImageVQADataset(ImageBaseDataset): return ret +class VizWiz(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv' + } + DATASET_MD5 = { + 'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0' + } + + @classmethod + def 
evaluate(self, eval_file, **judge_kwargs): + from .utils.vqa_eval import hit_calculate, process_line + + suffix = eval_file.split('.')[-1] + result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + + if not osp.exists(result_file): + data = load(eval_file) + assert 'answers' in data and 'prediction' in data + data['prediction'] = [str(x) for x in data['prediction']] + data['answer'] = [str(x) for x in data['answers']] + + lt = len(data) + pool = mp.Pool(16) + lines = [data.iloc[i] for i in range(lt)] + res = pool.map(process_line, lines) + + hit = hit_calculate(res, 'VizWiz') + ret = dict() + + ret['Overall'] = np.mean(hit) * 100 + ret = d2df(ret) + ret.round(2) + + dump(ret, result_file) + + retz = pd.read_csv(result_file) + return retz + + class OCRBench(ImageBaseDataset): TYPE = 'VQA' DATASET_URL = { @@ -213,6 +260,111 @@ class MathVista(ImageBaseDataset): return score +class MathVerse(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa + 'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa + 'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa + 'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa + 'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa + 'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa + } + DATASET_MD5 = { + 'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65', + 'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4', + 'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3', + 'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19', + 'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04', + 'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a', + } + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc + + model = judge_kwargs['model'] + suffix = eval_file.split('.')[-1] + storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx') + tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl') + storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') + tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl') + nproc = judge_kwargs.pop('nproc', 4) + # stage1: extract the answer + if not osp.exists(storage_extract): + data = load(eval_file) + model = build_judge(max_tokens=128, **judge_kwargs) + assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file_extract): + ans = load(tmp_file_extract) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MathVerse_auxeval_extract, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file_extract, + ) + ans = load(tmp_file_extract) + for k, v in 
zip(indices, new_results): + assert k in ans + assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract'] + + data['extract'] = [ans[idx]['extract'] for idx in data['index']] + data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']] + dump(data, storage_extract) + + # stage2: score the answer + if not osp.exists(storage_score): + data = load(storage_extract) + model = build_judge(max_tokens=128, **judge_kwargs) + assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file_score): + ans = load(tmp_file_score) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MathVerse_auxeval_score, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file_score, + ) + ans = load(tmp_file_score) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score'] + + data['score'] = [ans[idx]['score'] for idx in data['index']] + data['log_score'] = [ans[idx]['log_score'] for idx in data['index']] + dump(data, storage_score) + + score = MathVerse_acc(storage_score) + score_pth = storage_score.replace('.xlsx', '.csv') + dump(score, score_pth) + return score + + class MathVision(ImageBaseDataset): TYPE = 'VQA' DATASET_URL = { @@ -277,6 +429,364 @@ class MathVision(ImageBaseDataset): return score +class OlympiadBench(ImageBaseDataset): + TYPE = 'VQA_ex_prompt' + DATASET_URL = { + 'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv', + 'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv', + 'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv' + } + DATASET_MD5 = { + 'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914', + 'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed', + 'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623' + } + + def dump_image(self, line): + os.makedirs(self.img_root, exist_ok=True) + + tgt_path_z = [] + if isinstance(line['image'], list): + for i in range(len(line['image'])): + tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'][i], tgt_path) + tgt_path_z.append(tgt_path) + else: + tgt_path = osp.join(self.img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + tgt_path_z.append(tgt_path) + return tgt_path_z + + def build_prompt(self, line): + + from .utils.olympiadbench import get_answer_type_text, make_input + + self.is_chinese = 'zh' in line['source'] + self.is_math = 'maths' in line['source'] + self.is_theorem_proving = 'TP' in line['source'] + + if self.is_chinese: + subject_content = '数学' if self.is_math else '物理' + if self.is_theorem_proving: + prompt = ( + f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。" + "证明过程中使用的变量和公式请使用LaTeX格式表示。" + ) + else: + answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True, + multiple_answer=line['is_multiple_answer']) + if line['is_multiple_answer']: + multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}' + else: + multiple_answer_text = '\\boxed{答案}' + unit_text = '' + if line['unit']: + 
multiple_answer_text += '(单位)' + unit_text = ',注意答案的单位不要放在\\boxed{}中' + prompt = ( + f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。' + f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”' + f'显式给出结果{unit_text}。' + ) + else: + subject_content = 'Math' if self.is_math else 'Physics' + if self.is_theorem_proving: + prompt = ( + f'The following is a theorem proving problem from an International {subject_content} competition. ' + 'Please use logical reasoning and common theorems to prove the proposition in the problem ' + 'according to the given requirements. ' + 'Please use LaTeX format to represent the variables and formulas used in the proof.' + ) + else: + if line['is_multiple_answer']: + multiple_answer_text = '\\boxed{multiple answers connected with commas}' + else: + multiple_answer_text = '\\boxed{answer}' + unit_text = '' + if line['unit']: + multiple_answer_text += '(unit)' + unit_text = ', note that the unit of the answer should not be included in \\boxed{}' + answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False, + multiple_answer=line['is_multiple_answer']) + prompt = ( + f'The following is an open-ended problem from an International {subject_content} competition. ' + f'{answer_type_text}Please calculate the answer according to the given requirements and ' + 'the information provided. Please use LaTeX format to represent the variables and formulas ' + 'used in the solution process and results. Please end your solution with "So the final answer ' + f'is {multiple_answer_text}." and give the result explicitly{unit_text}.' + ) + + if self.is_math: + input = make_input(prompt, line['question']) + else: + if 'context' in line.keys() and str(line['context']) != 'nan': # cannot be null + input = make_input(prompt, line['context'] + '\n' + line['question']) + else: + input = make_input(prompt, line['question']) + + ret = [dict(type='text', value=input)] + tgt_path = self.dump_image(line) + + ret.extend([dict(type='image', value=s) for s in tgt_path]) + + return ret + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.olympiadbench import MathJudger, extract_answer + judger = MathJudger() + + suffix = eval_file.split('.')[-1] + name_str1 = 'judge' + name_str2 = 'score' + result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx') + score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv') + + if not osp.exists(result_file): + data = load(eval_file) + scorez = [] + + for i in tqdm(data.iterrows()): + line = i[1] + model_answer = line['prediction'] + is_chinese = 'zh' in line['source'] + model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False) + answer_type = line['answer_type'] + + final_answer = line['final_answer'][2:-2] + + if str(answer_type) != 'nan' and 'Tuple' in answer_type: + judge_result = judger.judge(model_answer, final_answer) + else: + if str(line['error']) != 'nan': + if ',' in line['error']: + precisions = line['error'].split(',') + precisions = [float(p) if p else 1e-8 for p in precisions] + judge_result = judger.judge(model_answer, final_answer, precisions) + else: + precision = float(line['error']) + judge_result = judger.judge(model_answer, final_answer, precision) + else: + judge_result = judger.judge(model_answer, final_answer) + scorez.append(judge_result) + + data['score'] = scorez + dump(data, result_file) + + judge_file = load(result_file) + + if not osp.exists(score_file): + name_list = ['OE_MM_maths_en_COMP', 
'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP', + 'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP', + 'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE'] + + sample_list = [[] for _ in range(len(name_list))] + for i in judge_file.iterrows(): + line = i[1] + for j in range(len(name_list)): + if line['source'] == name_list[j]: + sample_list[j].append(line['score']) + + acc_dict = {} + correct_list = [] + + # fine-grained + for i in range(len(name_list)): + correct_num = 0 + for j in sample_list[i]: + if j: + correct_num += 1 + correct_list.append(correct_num) + acc = 100 * correct_num / len(sample_list[i]) + acc_dict[name_list[i]] = [acc] + + # 4 grained + labela = ['zh', 'en'] + labelb = ['maths', 'physics'] + + grain_list = [[x,y] for x in labela for y in labelb] + for j in grain_list: + dict_name = j[0] + "_" + j[1] + correct_num = 0 + full_num = 0 + for i in range(len(name_list)): + if all(k in name_list[i] for k in j): + correct_num += correct_list[i] + full_num += len(sample_list[i]) + acc = 100 * correct_num / full_num + acc_dict[dict_name] = [acc] + + # 2 grained + grain_list = ['maths', 'physics'] + for j in grain_list: + dict_name = j + correct_num = 0 + full_num = 0 + for i in range(len(name_list)): + if j in name_list[i]: + correct_num += correct_list[i] + full_num += len(sample_list[i]) + acc = 100 * correct_num / full_num + acc_dict[dict_name] = [acc] + + # AVG + correct_num = sum(correct_list) + acc = 100 * correct_num / len(judge_file) + acc_dict['AVG'] = [acc] + + acc_pd = pd.DataFrame(acc_dict) + acc_pd.to_csv(score_file, index=False, encoding='gbk') + + accdz = pd.read_csv(score_file) + return accdz + + +class WeMath(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'WeMath': 'https://opencompass.openxlab.space/utils/VLMEval/WeMath.tsv' + } + DATASET_MD5 = {'WeMath': '056142c89b09d864702450b5b5ea0913'} + + def evaluate(self, eval_file, **judge_kwargs): + from .utils.wemath import wemath_evaluate_models, wemath_accuracy + from .utils.multiple_choice import mcq_vanilla_eval + + # model = judge_kwargs['model'] + model = judge_kwargs.get('model', 'exact_matching') + assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model + name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'} + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage) and model is not None: + data = load(eval_file) + result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + + data = load(eval_file) + data = data.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + # If not choice label, then use lower case + for k in data.keys(): + data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + + meta = self.data + meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])} + data_map = {x: y for x, y in zip(data['index'], 
data['question'])} + for k in data_map: + assert k in meta_q_map, ( + f'eval_file should be the same as or a subset of dataset {self.dataset_name}' + ) + data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) + + if 'id' in data.columns: + # 更改列名 + data.rename(columns={'id': 'ID'}, inplace=True) + dump(data, storage) + if osp.exists(storage): + accuracy_scores = wemath_evaluate_models(storage) + four_dim_scores = wemath_accuracy(storage) + else: + accuracy_scores = wemath_evaluate_models(eval_file) + four_dim_scores = wemath_accuracy(eval_file) + combine_score = {**accuracy_scores, **four_dim_scores} + combine_score = pd.DataFrame(combine_score) + score_pth = storage.replace('.xlsx', '_score.csv') + dump(combine_score, score_pth) + return combine_score + + +class LogicVista(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'LogicVista': 'https://opencompass.openxlab.space/utils/VLMEval/LogicVista.tsv' + } + DATASET_MD5 = {'LogicVista': '41c5d33adf33765c399e0e6ae588c061'} + + def evaluate(self, eval_file, **judge_kwargs): + from .utils.logicvista import LogicVista_auxeval, evaluate_logicvista + + # model = judge_kwargs['model'] + model = judge_kwargs.get('model', 'exact_matching') + assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model + name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'} + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{name_str}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage) and model is not None: + data = load(eval_file) + model = build_judge(max_tokens=128, **judge_kwargs) + assert model.working(), ('LogicVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + LogicVista_auxeval, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] and ans[k]['hit'] == v['hit'] + + data['res'] = [ans[idx]['res'] for idx in data['index']] + data['log'] = [ans[idx]['log'] for idx in data['index']] + data['hit'] = [ans[idx]['hit'] for idx in data['index']] + + dump(data, storage) + if osp.exists(storage): + accuracy_scores = evaluate_logicvista(storage) + score_pth = storage.replace('.xlsx', '_score.csv') + dump(accuracy_scores, score_pth) + + return accuracy_scores + class LLaVABench(ImageBaseDataset): TYPE = 'VQA' DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'} @@ -319,9 +829,10 @@ class 
LLaVABench(ImageBaseDataset): class MMVet(ImageBaseDataset): TYPE = 'VQA' DATASET_URL = { - 'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv' + 'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv', + 'MMVet_Hard': 'http://opencompass.openxlab.space/utils/VLMEval/MMVet_Hard.tsv' } - DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'} + DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3', 'MMVet_Hard': '63a598819a936a2e77c410a78a21ff16'} # It returns a DataFrame @classmethod @@ -414,6 +925,60 @@ class MTVQADataset(ImageBaseDataset): return msgs +class TableVQABench(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv' + } + DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'} + + from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + import pandas as pd + from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq + + data = load(eval_file) + assert 'answer' in data and 'prediction' in data + + data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True) + data_group = dict(tuple(data.groupby('split'))) + eval_result = {'split': [], 'average_scores': []} + for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']: + data_split = data_group[split].to_dict(orient='records') + if split == 'fintabnetqa': + split_eval_meta = evaluate_fintabnet(data_split, ['accuracy']) + elif split == 'vtabfact': + split_eval_meta = evaluate_tabfact(data_split, ['accuracy']) + elif split == 'vwtq' or split == 'vwtq_syn': + split_eval_meta = evaluate_wtq(data_split, ['accuracy']) + eval_result['split'].append(split) + eval_result['average_scores'].append(split_eval_meta['average_scores']) + + suffix = eval_file.split('.')[-1] + result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + eval_result = pd.DataFrame(eval_result) + dump(eval_result, result_file) + + return eval_result + + # TableVQABench adopts a custom prompt + def build_prompt(self, line): + msgs = super().build_prompt(line) + assert sum([x['type'] == 'text' for x in msgs]) == 1 + for item in msgs: + if item['type'] == 'text': + if line['split'] == 'fintabnetqa': + item['value'] = self.FINTABNETQA_PROMPT.format_map({'question': item['value']}) + elif line['split'] == 'vtabfact': + item['value'] = self.VTABFACT_PROMPT.format_map({'question': item['value']}) + elif line['split'] == 'vwtq_syn' or line['split'] == 'vwtq': + item['value'] = self.VWTQ_PROMPT.format_map({'question': item['value']}) + return msgs + + class CustomVQADataset(ImageBaseDataset): TYPE = 'VQA' @@ -431,3 +996,480 @@ class CustomVQADataset(ImageBaseDataset): def evaluate(self, eval_file, **judge_kwargs): raise NotImplementedError + + +class CRPE(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv', + 'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv' + } + DATASET_MD5 = { + 'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08', + 'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'} + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.crpe import is_correct + # find-image, count-text, find-text, + # infer-choose, count-image, visual-reasoning + score = { + 
'exist': 0, + 'subject': 0, + 'predicate': 0, + 'object': 0, + 'total': 0, + } + num = { + 'exist': 0, + 'subject': 0, + 'predicate': 0, + 'object': 0, + 'total': 0, + } + final_score_dict = { + 'exist': 0, + 'subject': 0, + 'predicate': 0, + 'object': 0, + 'total': 0, + } + data = load(eval_file) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + for i in tqdm(range(len(lines))): + line = lines[i] + predict = str(line['prediction']) + answers = str(line['answer']) + # print("predict =", predict) + # print("answers =", answers) + category = line['category'] + if is_correct(answers, predict): + score[category] += 1 + score['total'] += 1 + num[category] += 1 + num['total'] += 1 + + for category in ['exist', 'subject', 'predicate', 'object', 'total']: + if num[category] != 0: + final_score_dict[category] = score[category] / num[category] + else: + final_score_dict[category] = None + + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(final_score_dict, score_pth) + return final_score_dict + + def build_prompt(self, line): + ROOT = LMUDataRoot() + msgs = super().build_prompt(line) + for msg in msgs: + if msg['type'] == 'image': + msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value']) + return msgs + + +class QSpatial(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'QSpatial_plus': '', + 'QSpatial_scannet': '' + } + + # NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website + # Once you get the permission, you can use the helper code here to download and extract necessary images: + # https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet + qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET" + url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/" + + def post_build(self, dataset): + # Download the prompt templates from github + + links = [ + self.url + "system_prompt.txt", + self.url + "spatial_prompt_single.txt", + self.url + "spatial_prompt_steps.txt", + self.url + "standard_prompt.txt", + self.url + "zero_shot_prompt.txt" + ] + with tempfile.TemporaryDirectory() as temp_dir: + for link in links: + tgt_path = os.path.join(temp_dir, link.split("/")[-1]) + os.system(f"wget {link} -O {tgt_path}") + + self.system_prompt = open(os.path.join(temp_dir, "system_prompt.txt")).read() + self._prompt_templates = dict( + spatial_prompt_single=open(os.path.join(temp_dir, "spatial_prompt_single.txt")).read(), + spatial_prompt_steps=open(os.path.join(temp_dir, "spatial_prompt_steps.txt")).read(), + standard_prompt=open(os.path.join(temp_dir, "standard_prompt.txt")).read(), + zero_shot_prompt=open(os.path.join(temp_dir, "zero_shot_prompt.txt")).read(), + ) + + # Given one data record, return the built prompt (a multi-modal message), can override + def build_prompt(self, line): + from jinja2.sandbox import SandboxedEnvironment + text_prompt_template = self._prompt_templates["spatial_prompt_single"] + env = SandboxedEnvironment() + text_prompt = env.from_string(text_prompt_template).render(question=line["question"]) + tgt_path = self.dump_image(line) + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + + msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}")) + return msgs + + # Given the dataset name, return the dataset as a pandas dataframe, can override + def load_data(self, 
dataset): + import io + import pandas as pd + from datasets import load_dataset + + hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset) + df = hf_dataset.to_pandas() + + df.reset_index(drop=True, inplace=True) + df['index'] = df.index + df['answer'] = list(zip(df['answer_value'], df['answer_unit'])) + df = df[['index'] + [col for col in df.columns if col != 'index']] + + if dataset == "QSpatial_scannet": + df = df.drop(columns=["image"]) + df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]] + else: + df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]] + + df["image"] = [encode_image_to_base64(image) for image in df["image"]] + return df + + @classmethod + def get_multiplier(self, unit): + + unit = unit.lower() + if unit in ["meters", "meter", "m", "metre", "metres"]: + multiplier = 100 + elif unit in ["centimeters", "centimeter", "cm"]: + multiplier = 1 + elif unit in ["feet", "foot", "ft"]: + multiplier = 30.48 + elif unit in ["inch", "inches", "in"]: + multiplier = 2.54 + elif unit in ["mm"]: + multiplier = 0.1 + else: + print(f"Unknown unit: {unit}") + multiplier = 0. + + return multiplier + + @classmethod + def parse_string(self, input_str): + # Regular expression to match the pattern (number or range, text) + match = re.match(r'\(([\d.-]+), (.+)\)', input_str) + if match: + number_part = match.group(1) + text = match.group(2) + + if '-' in number_part: + start, end = map(float, number_part.split('-')) + number = (start + end) / 2 + else: + number = float(number_part) + + return number * self.get_multiplier(text) + else: + print(f"Unable to parse the input string {input_str}") + return 0 + + @classmethod + def parse_prediction(self, vlm_response): + # Value + pattern = r'scalar{([^}]*)}' + str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1] + scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes) + parsed_scalar = np.array(scalar_list).astype(float).mean() + + # Unit + pattern = r'distance_unit{([^}]*)}' + str_inside_unit_boxes = re.findall(pattern, vlm_response) + parsed_unit = str_inside_unit_boxes[-1] + + pred_value_in_cms = parsed_scalar * self.get_multiplier(parsed_unit) + return pred_value_in_cms + + # It returns a dictionary + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + + data = load(eval_file) + if "model" in judge_kwargs: + from .utils.qspatial import QSpatial_auxeval + + # extract using model + model = judge_kwargs['model'] + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + model = build_judge(max_tokens=128, **judge_kwargs) + + assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + QSpatial_auxeval, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] + + data['res'] 
= [ans[idx]['res'] for idx in data['index']] + data['log'] = [ans[idx]['log'] for idx in data['index']] + dump(data, storage) + + data = load(storage) + + pred_value_in_cms = [] + for res in data["res"]: + try: + pred_value_in_cms.append(self.parse_string(res)) + except ValueError: + pred_value_in_cms.append(0.) + + pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8 + else: + # regex parsing + pred_value_in_cms = [] + n_errors_in_parsing = 0 + for pred in data["prediction"]: + try: + parsed_value = self.parse_prediction(pred) + except IndexError: + n_errors_in_parsing += 1 + parsed_value = 1e-8 + + pred_value_in_cms.append(parsed_value) + + print(f"Encounter {n_errors_in_parsing} errors in parsing") + pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8 + + # Ground truth + ground_truth_value_in_cms = [] + for answer in data["answer"]: + value, unit = eval(answer) + ground_truth_value_in_cms.append(value * self.get_multiplier(unit)) + ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8 + + # Calculate the score + pred_gt = pred_value_in_cms / ground_truth_value_in_cms + gt_pred = ground_truth_value_in_cms / pred_value_in_cms + delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2. + delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5 + + data["eval_score_delta_2"] = delta_2 + data["eval_score_delta_1_point_5"] = delta_1_point_5 + + final_score_dict = { + "delta_2": delta_2.mean(), + "delta_1_point_5": delta_1_point_5.mean() + } + for question_type in set(data["question_type"]): + filtered_data = data[data["question_type"] == question_type] + delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean() + delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean() + final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type}) + final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type}) + + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(final_score_dict, score_pth) + return final_score_dict + + +class MMNIAH(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'MM_NIAH_VAL': + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv', + 'MM_NIAH_TEST': + ['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa', + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab', + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac', + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad', + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']} + DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5', + 'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'} + + def prepare_tsv(self, url, file_md5=None): + import os + data_root = LMUDataRoot() + os.makedirs(data_root, exist_ok=True) + update_flag = False + file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv' + data_path = osp.join(data_root, file_name) + if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5): + pass + elif file_name == 'MM_NIAH_TEST.tsv': + warnings.warn('The dataset tsv is not downloaded') + for i in range(len(url)): + if osp.exists(osp.join(data_root, 'part-a' + chr(ord('a') + i))): + print('part_a' + chr(ord('a') + i) + ' is existed') + continue + download_file(url[i], data_path) + file_prefix = 'part-' + output_file = data_path + split_files = 
sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)]) + with open(output_file, 'wb') as outfile: + # Read each split part one by one and append it to the output file + for filename in split_files: + with open(osp.join(data_root, filename), 'rb') as infile: + outfile.write(infile.read()) + update_flag = True + else: + warnings.warn('The dataset tsv is not downloaded') + download_file(url, data_path) + update_flag = True + + if file_size(data_path, 'GB') > 1: + local_path = data_path.replace('.tsv', '_local.tsv') + if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag: + from ..tools import LOCALIZE + LOCALIZE(data_path, local_path) + data_path = local_path + return load(data_path) + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.mmniah import is_correct + # find-image, count-text, find-text, + # infer-choose, count-image, visual-reasoning + MMNIAH_score = { + 'count-text': 0, + 'find-image': 0, + 'find-text': 0, + 'infer-choose': 0, + 'count-image': 0, + 'visual-reasoning': 0, + 'total': 0, + } + MMNIAH_num = { + 'count-text': 0, + 'find-image': 0, + 'find-text': 0, + 'infer-choose': 0, + 'count-image': 0, + 'visual-reasoning': 0, + 'total': 0, + } + final_score_dict = { + 'count-text': 0, + 'find-image': 0, + 'find-text': 0, + 'infer-choose': 0, + 'count-image': 0, + 'visual-reasoning': 0, + 'total': 0, + } + data = load(eval_file) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + for i in tqdm(range(len(lines))): + line = lines[i] + predict = line['prediction'] + answers = line['answer'] + category = line['category'] + if category in ['visual-reasoning', 'find-image']: + answers = int(answers) + if is_correct(answers, predict): + MMNIAH_score[category] += 1 + MMNIAH_score['total'] += 1 + MMNIAH_num[category] += 1 + MMNIAH_num['total'] += 1 + + for category in ['find-image', 'count-text', 'find-text', + 'infer-choose', 'count-image', 'visual-reasoning', 'total']: + if MMNIAH_num[category] != 0: + final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category] + else: + final_score_dict[category] = None + + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(final_score_dict, score_pth) + return final_score_dict + + def build_prompt(self, line): + msgs = super().build_prompt(line) + if isinstance(line, int): + line = self.data.iloc[line] + totalchoice = line['multi-choice options'] + totalchoice = eval(totalchoice) + # find-image, count-text, find-text, + # infer-choose, count-image, visual-reasoning + context = msgs[-1]['value'] + context = eval(context) + question = context[0] + '\n' + context[1] + # tgt_path is the list of all image paths + tgt_path = [] + for i in range(len(msgs) - 1): + tgt_path.append(msgs[i]['value']) + choices = totalchoice[0] + choices_image = totalchoice[1] + if choices: + for c_idx, c in enumerate(choices): + question = f"{question}\n{chr(c_idx + ord('A'))}. {c}" + question += "\nAnswer with the option's letter from the given choices directly." + elif choices_image: + for c_idx in range(len(choices_image)): + question = f"{question}\n{chr(c_idx + ord('A'))}. " + question += "\nAnswer with the option's letter from the given choices directly." + else: + question += '\nAnswer the question using a single word or phrase.'
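+ # Note: the question text embeds image placeholder tokens. + # The block below splits the text on those tokens and interleaves the resulting text segments with the image paths gathered in tgt_path, so the returned msgs alternate between text and image entries.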
+ question = '' + question + '' + question = question.split('') + if choices_image: + for i in range(len(question) - 5): + question[i] = question[i] + '\n' + for i in range(len(question) - 5, len(question) - 1): + question[i] = question[i] + '' + else: + for i in range(len(question) - 1): + question[i] = question[i] + '\n' + assert len(tgt_path) + 1 == len(question) + context = [] + for i in range(len(tgt_path)): + context.append(question[i]) + context.append(tgt_path[i]) + context.append(question[-1]) + context[0] = context[0][7:] + context[-1] = context[-1][:-5] + msgs = [] + for i in range(len(context)): + if i % 2 == 0: + msgs.append(dict(type='text', value=context[i])) + else: + ROOT = LMUDataRoot() + msgs.append(dict(type='image', value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i]))) + for element in msgs: + if element['value'] == '': + msgs.remove(element) + return msgs diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py b/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py index bfdaf94..46083e6 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/image_yorn.py @@ -12,18 +12,20 @@ class ImageYORNDataset(ImageBaseDataset): 'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv', 'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv', 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', + 'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv', } DATASET_MD5 = { 'MME': 'b36b43c3f09801f5d368627fb92187c3', 'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c', 'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5', + 'AMBER': '970d94c0410916166e0a76ba75da7934', } # It returns a dataframe def evaluate(self, eval_file, **judge_kwargs): from .utils.yorn import YOrN_Extraction, YOrN_auxeval - from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating + from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating dataset = self.dataset_name data = load(eval_file) @@ -71,7 +73,10 @@ class ImageYORNDataset(ImageBaseDataset): dump(data, storage) data = load(storage) - data['score'] = (data['answer'] == data['extracted']) + if listinstr(['AMBER'], dataset): + data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower()) + else: + data['score'] = (data['answer'] == data['extracted']) dump(data, storage) if dataset is not None and listinstr(['MME'], dataset): @@ -80,6 +85,8 @@ class ImageYORNDataset(ImageBaseDataset): score = Hallusion_rating(storage) elif dataset is not None and listinstr(['POPE'], dataset): score = POPE_rating(storage) + elif dataset is not None and listinstr(['AMBER'], dataset): + score = AMBER_rating(storage) else: score = default_rating(storage) diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py b/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py new file mode 100644 index 0000000..5204713 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py @@ -0,0 +1,328 @@ +from huggingface_hub import snapshot_download +from ..smp import * +from .video_base import VideoBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from glob import glob + +FAIL_MSG = 'Failed to obtain answer via API.' 
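+ # Rough usage sketch of the helpers below (illustrative, hypothetical values only): + # timestamp_to_seconds("00:01:30.5") -> 90.5 seconds + # uniformly_subsample(list(range(10)), 4) -> [0, 2, 5, 7] (evenly spaced picks) + # insert_subtitles_into_frames(frames, frame_ts, subtitles, start_ts, duration) returns an interleaved list of + # {'type': 'image', ...} / {'type': 'text', ...} dicts, placing each subtitle line after the frames whose + # timestamps precede its midpoint and dropping subtitles not covered by any sampled frame.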
+ + +def timestamp_to_seconds(timestamp): + # Split the timestamp into hours, minutes, and seconds + h, m, s = timestamp.split(":") + # Convert hours, minutes, and total seconds (including fractions) to float and compute total seconds + total_seconds = int(h) * 3600 + int(m) * 60 + float(s) + return total_seconds + + +def uniformly_subsample(lst, K): + n = len(lst) + if K >= n: + return lst + step = n / K + return [lst[int(i * step)] for i in range(K)] + + +def insert_subtitles_into_frames( + frames, + frame_timestamps, + subtitles, + starting_timestamp_for_subtitles, + duration, +): + interleaved_list = [] + cur_i = 0 + + for subtitle in subtitles: + if "timestamp" in subtitle: + start, end = subtitle["timestamp"] + + if not isinstance(end, float): + end = duration + + start -= starting_timestamp_for_subtitles + end -= starting_timestamp_for_subtitles + + subtitle_timestamp = (start + end) / 2 + subtitle_text = subtitle["text"] + else: + start, end = subtitle["start"], subtitle["end"] + start = timestamp_to_seconds(start) + end = timestamp_to_seconds(end) + start -= starting_timestamp_for_subtitles + end -= starting_timestamp_for_subtitles + + subtitle_timestamp = (start + end) / 2 + subtitle_text = subtitle["line"] + + for i, (frame, frame_timestamp) in enumerate( + zip(frames[cur_i:], frame_timestamps[cur_i:]) + ): + if frame_timestamp <= subtitle_timestamp: + # print("frame:", frame_timestamp) + interleaved_list.append({"type": "image", "value": frame}) + cur_i += 1 + else: + break + + if end - start < 1: + end = subtitle_timestamp + 0.5 + start = subtitle_timestamp - 0.5 + + covering_frames = False + for frame, frame_timestamp in zip(frames, frame_timestamps): + if frame_timestamp < end and frame_timestamp > start: + covering_frames = True + break + + if covering_frames: + interleaved_list.append({"type": "text", "value": subtitle_text + "\n"}) + else: + pass + + for i, (frame, frame_timestamp) in enumerate( + zip(frames[cur_i:], frame_timestamps[cur_i:]) + ): + interleaved_list.append({"type": "image", "value": frame}) + return interleaved_list + + +class LongVideoBench(VideoBaseDataset): + + MD5 = '82905eae3a5ae7383c5a8ee9655e1ab9' + SYS = '' + + TYPE = 'Video-MCQ' + + def __init__(self, dataset='LongVideoBench', use_subtitle=False, nframe=0, fps=-1): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_subtitle = use_subtitle + self.dataset_name = dataset + + @classmethod + def supported_datasets(cls): + return ['LongVideoBench'] + + def prepare_dataset(self, dataset_name='LongVideoBench', repo_id='longvideobench/LongVideoBench'): + + def check_integrity(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + + if not osp.exists(data_file): + return False + + if md5(data_file) != self.MD5: + print("md5 mismatch", md5(data_file), self.MD5) + return False + data = load(data_file) + for video_pth in data['video_path']: + if not osp.exists(osp.join(pth, video_pth)): + print(video_pth, "is not found") + return False + return True + + if modelscope_flag_set(): + repo_id = "AI-ModelScope/LongVideoBench" + + cache_path = get_cache_path(repo_id) + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + def generate_tsv(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + if osp.exists(data_file) and md5(data_file) == self.MD5: + return + + data_file = pd.read_json(osp.join(pth, 'lvb_val.json')) + data_file = data_file.assign(index=range(len(data_file))) + data_file['video'] = data_file['video_id'] + data_file['video_path'] 
= data_file['video_path'].apply(lambda x: f'./videos/{x}') + + data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_snapshot_download(dataset_id=repo_id) + else: + snapshot_download(repo_id=repo_id, repo_type='dataset') + print("All videos are downloaded for LongVideoBench") + + if not glob(osp.join(cache_path, "videos")): + tar_files = glob(osp.join(cache_path, "**/*.tar*"), recursive=True) + + def untar_video_data(tar_file, cache_dir): + import tarfile + with tarfile.open(tar_file, "r") as tar_ref: + tar_ref.extractall(cache_dir) + print(f"Extracted all files from {tar_file} to {cache_dir}") + + def concat_tar_parts(tar_parts, output_tar): + with open(output_tar, "wb") as out_tar: + from tqdm import tqdm + for part in tqdm(sorted(tar_parts)): + with open(part, "rb") as part_file: + out_tar.write(part_file.read()) + print(f"Concatenated parts {tar_parts} into {output_tar}") + + tar_parts_dict = {} + + # Group tar parts together + for tar_file in tar_files: + base_name = tar_file.split(".tar")[0] + if base_name not in tar_parts_dict: + tar_parts_dict[base_name] = [] + tar_parts_dict[base_name].append(tar_file) + + # Concatenate and untar split parts + for base_name, parts in tar_parts_dict.items(): + print(f"Extracting following tar files: {parts}") + output_tar = base_name + ".tar" + if not osp.exists(output_tar): + print('Start concatenating tar files') + + concat_tar_parts(parts, output_tar) + print('Finish concatenating tar files') + + if not osp.exists(osp.join(cache_path, osp.basename(base_name))): + untar_video_data(output_tar, cache_path) + + print('All videos are extracted for LongVideoBench') + + dataset_path = cache_path + generate_tsv(dataset_path) + + data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + + return dict(data_file=data_file, root=dataset_path) + + def save_video_frames(self, video_path, video_llm=False): + + vid_path = osp.join(self.data_root, video_path) + vid = decord.VideoReader(vid_path) + video_info = { + 'fps': vid.get_avg_fps(), + 'n_frames': len(vid), + } + if self.nframe > 0 and self.fps < 0: + step_size = len(vid) / (self.nframe + 1) + indices = [int(i * step_size) for i in range(1, self.nframe + 1)] + frame_paths = self.frame_paths(video_path[:-4]) + elif self.fps > 0: + # not constrained by num_frames, get frames by fps + total_duration = video_info['n_frames'] / video_info['fps'] + required_frames = int(total_duration * self.fps) + step_size = video_info['fps'] / self.fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(video_path[:-4], len(indices)) + + flag = np.all([osp.exists(p) for p in frame_paths]) + + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth) and not video_llm: + im.save(pth) + + return frame_paths, indices, video_info + + # def save_video_into_images(self, line, num_frames=8): + # frame_paths, indices, video_info = self.save_video_frames(line['video_path'], num_frames) + # return frame_paths + + def build_prompt(self, line, video_llm): + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + frames, indices, video_info = self.save_video_frames(line['video_path'], video_llm) + fps = video_info["fps"] + + message = [dict(type='text', value=self.SYS)] + if video_llm: + message.append(dict(type='video', 
value=osp.join(self.data_root, line['video_path']))) + else: + if not self.use_subtitle: + with open(osp.join(self.data_root, "subtitles", line["subtitle_path"])) as f: + subtitles = json.load(f) + + frame_message = insert_subtitles_into_frames( + frames, + [ind_ / fps for ind_ in indices], + subtitles, + line["starting_timestamp_for_subtitles"], + line["duration"] + ) + + message += frame_message + else: + for im in frames: + message.append(dict(type='image', value=im)) + + line['question'] += '\n' + '\n'.join( + ["{}. {}".format(chr(ord("A") + i), cand) for i, cand in enumerate(eval(line['candidates']))] + ) + prompt = line["question"] + "\nAnswer with the option's letter from the given choices directly." + message.append(dict(type='text', value=prompt)) + return message + + # It returns a dictionary + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option + + assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + + tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + tgt_file = eval_file.replace('.xlsx', '_rating.json') + score_file = eval_file.replace('.xlsx', '_score.xlsx') + + if not osp.exists(score_file): + model = judge_kwargs.get('model', 'exact_matching') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + res = {} if not osp.exists(tmp_file) else load(tmp_file) + res = {k: v for k, v in res.items() if FAIL_MSG not in v} + + data = load(eval_file) + data_un = data[~pd.isna(data['prediction'])] + + for idx in data['index']: + ans = data.loc[data['index'] == idx, 'correct_choice'].values[0] + ans = chr(ord("A") + ans) + pred = str(data.loc[data['index'] == idx, 'prediction'].values[0]) + + if extract_characters_regex(pred) == '': + extract_pred = extract_option( + model, + data.loc[data['index'] == idx].to_dict(orient='records')[0], + 'LongVideoBench' + ) + data.loc[idx, 'score'] = int(extract_pred == ans) + else: + data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans) + + rejected = [x for x in data['score'] if x == -1] + + print( + f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, ' + f'failed to obtain the score for another {len(rejected)} questions. ' + f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.' 
+ ) + + dump(data, score_file) + + rating = get_dimension_rating(score_file) + dump(rating, tgt_file) + return rating diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py b/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py new file mode 100644 index 0000000..2e99d39 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/miabench.py @@ -0,0 +1,167 @@ +import json +import os + +import pandas as pd + +from .image_base import ImageBaseDataset +from ..smp import * +from .utils import build_judge, DEBUG_MESSAGE +from ..utils import track_progress_rich + + +def generate_prompt(d): + question = d['question'] + weights = eval(d['component_weight']) + components = eval(d['components']) + num_of_component = int(d['num_of_component']) + response = d['prediction'] + + if num_of_component == 1: + components = f"The first component is: '{components[0]}'. " + score = f"The first component is worth: {weights[0]} scores. " + elif num_of_component == 2: + components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. " + score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. " + elif num_of_component == 3: + components = ( + f"The first component is: '{components[0]}', and the second component is '{components[1]}', " + f"and the third component is '{components[2]}'. " + ) + score = ( + "The first, second, and third component is each worth " + f"{weights[0]}, {weights[1]}, and {weights[2]} scores." + ) + elif num_of_component == 4: + components = ( + f"The first component is: '{components[0]}', and the second component is '{components[1]}', " + f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. " + ) + score = ( + "The first, second, third, and fourth component is each worth " + f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores." + ) + elif num_of_component == 5: + components = ( + f"The first component is: '{components[0]}', and the second component is '{components[1]}', " + f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', " + f"and the fifth component is '{components[4]}'. " + ) + score = ( + "The first, second, third, fourth, and fifth component is each worth " + f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores." + ) + + return ( + "Here is an instruction for a multimodal LLM: '" + f"{question}" + "'. You need to grade if the response from the model follows each component of the instruction. " + f"{components}" + "The response is: '" + f"{response}" + "'. You need to score the response and be strict. The total score ranges from 0 to 10, " + "depending on if the response follows the instruction. " + f"{score}" + "List scores of each component, and the total score in one sentence in this format: " + "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons." 
+ ) + + +def process_rawscore(component_type, raw_score): + first_sentence = raw_score.split('.')[0].split(',') + score_dict = {} + for i in range(len(first_sentence) - 1): + score_ = first_sentence[i].split(':')[1][1:].split('/') + score = int(score_[0]) / int(score_[1]) + score_dict[component_type[i]] = score + total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/') + total_score = int(total_score_[0]) / int(total_score_[1]) + score_dict['total_score'] = total_score + return score_dict + + +def get_score_dict(data, score_raw): + cat_score_dict = {} + for i in range(len(data)): + try: + cmp = data['component_type'][i][2:-2] + cmp_list = cmp.split('\', \'') + score_dict = process_rawscore(cmp_list, score_raw[i]) + for key, val in score_dict.items(): + if key not in cat_score_dict.keys(): + cat_score_dict[key] = [val] + else: + cat_score_dict[key].append(val) + except: + pass + cat_score_dict_average = {} + for key, val in cat_score_dict.items(): + cat_score_dict_average[key] = sum(val) / len(val) + return cat_score_dict_average + + +class MIABench(ImageBaseDataset): + TYPE = 'VQA' + + DATASET_URL = { + 'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv', + } + DATASET_MD5 = { + 'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82', + } + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + judge_name = judge_kwargs.pop('model', 'gpt-4o') + + model = build_judge(model=judge_name, **judge_kwargs) + suffix = eval_file.split('.')[-1] + + storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841 + tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841 + nproc = judge_kwargs.pop('nproc', 4) # noqa: F841 + + if not osp.exists(storage): + data = load(eval_file) + num_samples = len(data) + lines = [data.loc[i] for i in range(num_samples)] + prompts = [generate_prompt(line) for line in lines] + org_data = MIABench('MIA-Bench').data + img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])} + image_b64 = [img_map[idx] for idx in data['index']] + indices = list(data['index']) + mm_messages = [ + dict(message=[ + dict(type='text', value=prompt), + dict(type='image', value=f'data:image/jpeg;base64,{b64}') + ]) + for prompt, b64 in zip(prompts, image_b64) + ] + + res = {} + if osp.exists(tmp_file): + res = load(tmp_file) + + jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res} + job_keys = list(jobs.keys()) + job_vals = [jobs[k] for k in job_keys] + + resps = track_progress_rich( + model.generate, + job_vals, + nproc=nproc, + chunksize=nproc, + keys=job_keys, + save=tmp_file, + ) + for k, resp in zip(job_keys, resps): + res[k] = resp + data['score_raw'] = [res[idx] for idx in indices] + dump(data, storage) + + goresult = load(storage) + results = get_score_dict(goresult, goresult['score_raw']) + result_pth = storage.replace('.xlsx', '_score.csv') + results_pd = pd.DataFrame.from_dict(list(results.items())) + dump(results_pd, result_pth) + + return results diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py b/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py new file mode 100644 index 0000000..f7094a0 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/mlvu.py @@ -0,0 +1,455 @@ +import huggingface_hub +from huggingface_hub import snapshot_download +from ..smp import * +from .video_concat_dataset import ConcatVideoDataset +from .video_base import VideoBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from ..utils import track_progress_rich +import torchvision.transforms as T +from 
torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from decord import VideoReader, cpu +import pandas as pd +import imageio +import cv2 +import zipfile +import os +import glob +from .utils.mlvu import * + +FAIL_MSG = 'Failed to obtain answer via API.' + + +class MLVU(ConcatVideoDataset): + def __init__(self, dataset='MLVU', nframe=0, fps=-1): + self.DATASET_SETS[dataset] = ['MLVU_MCQ', 'MLVU_OpenEnded'] + self.type_data_dict = { + 'M-Avg':['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning'], + 'G-Avg':['sub_scene', 'summary'] + } + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + + @classmethod + def supported_datasets(cls): + return ['MLVU'] + + def evaluate(self, eval_file, **judge_kwargs): + result = super().evaluate(eval_file=eval_file, **judge_kwargs) + suffix = eval_file.split('.')[-1] + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + for key in self.type_data_dict: + result.loc[key] = 0.0 + for name, item in result.iterrows(): + if name in self.type_data_dict[key]: + result.loc[key, 'success'] += item['success'] + result.loc[key, 'overall'] += item['overall'] + if key == 'G-Avg': + result.loc[key, 'acc'] = round( + result.loc[key, 'success'] / result.loc[key, 'overall'], 2 + ) + else: + result.loc[key, 'acc'] = round( + result.loc[key, 'success'] / result.loc[key, 'overall'] * 100, 1 + ) + result = result.reset_index().rename(columns={'index': 'task'}) + dump(result, score_file) + return result + + +class MLVU_MCQ(VideoBaseDataset): + + MD5 = 'bb5c37e7cf8d43fc9a25c23d2b4633f5' + BASE_SYS = 'Carefully watch this video and pay attention to every detail. ' + SYS = BASE_SYS + 'Based on your observations, select the best option that accurately addresses the question.' 
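+ # SYS is sent as the system message for every item (see build_prompt below); TYPE tags this dataset as frame-based multiple-choice video QA.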
+ TYPE = 'Video-MCQ' + + def __init__(self, dataset='MLVU_MCQ', nframe=0, fps=-1): + self.type_data_list = { + 'plotQA': ('1_plotQA.json', './MLVU/video/1_plotQA', 'MCQ'), + 'needle': ('2_needle.json', './MLVU/video/2_needle', 'MCQ'), + 'ego': ('3_ego.json', './MLVU/video/3_ego', 'MCQ'), + 'count': ('4_count.json', './MLVU/video/4_count', 'MCQ'), + 'order': ('5_order.json', './MLVU/video/5_order', 'MCQ'), + 'anomaly_reco': ('6_anomaly_reco.json', './MLVU/video/6_anomaly_reco', 'MCQ'), + 'topic_reasoning': ('7_topic_reasoning.json', './MLVU/video/7_topic_reasoning', 'MCQ'), + } + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + + @classmethod + def supported_datasets(cls): + return ['MLVU_MCQ'] + + def prepare_dataset(self, dataset_name='MLVU_MCQ', repo_id='MLVU/MVLU'): + def check_integrity(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + + data = load(data_file) + for idx, item in data.iterrows(): + if not osp.exists(osp.join(pth, item['prefix'], item['video'])): + return False + return True + + if modelscope_flag_set(): + repo_id = "AI-ModelScope/MLVU" + + cache_path = get_cache_path(repo_id) + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + def generate_tsv(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + if os.path.exists(data_file) and md5(data_file) == self.MD5: + return + json_data_dir = os.path.join(dataset_path, 'MLVU', 'json') + self.data_list = [] + for k, v in self.type_data_list.items(): + with open(os.path.join(json_data_dir, v[0]), 'r') as f: + json_data = json.load(f) + for data in json_data: + self.data_list.append({ + 'task_type': k, + 'prefix': v[1], + 'duration': data['duration'], + 'video': data['video'], + 'question': data['question'], + 'answer': data['answer'], + 'candidates': data['candidates'], + }) + + data_df = pd.DataFrame(self.data_list) + data_df = data_df.assign(index=range(len(data_df))) + data_df.to_csv(data_file, sep='\t', index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + hf_token = os.environ.get('HUGGINGFACE_TOKEN') + huggingface_hub.login(hf_token) + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') + + generate_tsv(dataset_path) + + data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + return dict(root=dataset_path, data_file=data_file) + + def qa_template(self, data): + question = f"Question: {data['question']}\n" + question += 'Options:\n' + answer = data['answer'] + answer_idx = -1 + for idx, c in enumerate(eval(data['candidates'])): + question += f"({chr(ord('A') + idx)}) {c}\n" + if c == answer: + answer_idx = idx + question = question.rstrip() + answer = f"({chr(ord('A') + answer_idx)}) {answer}" + return question, answer + + def save_video_frames(self, line): + suffix = line['video'].split('.')[-1] + video = line['video'].replace(f'.{suffix}','') + vid_path = osp.join(self.data_root, line['prefix'], line['video']) + vid = decord.VideoReader(vid_path) + video_info = { + 'fps': vid.get_avg_fps(), + 'n_frames': len(vid), + } + if self.nframe > 0 and self.fps < 0: + step_size = len(vid) / (self.nframe + 1) + indices = [int(i * step_size) for i in range(1, self.nframe + 1)] + frame_paths = self.frame_paths(video) + elif self.fps > 0: + # not constrained by num_frames, get frames by fps + total_duration = video_info['n_frames'] / 
video_info['fps'] + required_frames = int(total_duration * self.fps) + step_size = video_info['fps'] / self.fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(video, len(indices)) + + flag = np.all([osp.exists(p) for p in frame_paths]) + + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths + + def save_video_into_images(self, line): + frame_paths = self.save_video_frames(line) + return frame_paths + + def build_prompt(self, line, video_llm): + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + question, answer = self.qa_template(line) + message = [dict(type='text', value=self.SYS, role='system')] + message.append(dict(type='text', value=question)) + video_path = os.path.join(self.data_root, line['prefix'], line['video']) + if video_llm: + message.append(dict(type='video', value=video_path)) + else: + img_frame_paths = self.save_video_into_images(line) + for im in img_frame_paths: + message.append(dict(type='image', value=im)) + message.append(dict(type='text', value='\nOnly give the best option.')) + return message + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + + tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + score_file = eval_file.replace('.xlsx', '_score.xlsx') + + if not osp.exists(score_file): + model = judge_kwargs.setdefault('model', 'chatgpt-0125') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + res = {} if not osp.exists(tmp_file) else load(tmp_file) + res = {k: v for k, v in res.items() if FAIL_MSG not in v} + + data = load(eval_file) + data_un = data[~pd.isna(data['prediction'])] + + for idx in data['index']: + ans = data.loc[data['index'] == idx, 'answer'].values[0] + pred = data.loc[data['index'] == idx, 'prediction'].values[0] + options = eval(data.loc[data['index'] == idx, 'candidates'].values[0]) + answer_idx = -1 + for id, c in enumerate(options): + if c == ans: + answer_idx = id + ans = f"({chr(ord('A') + answer_idx)}) {ans}" + input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0] + for id, option_content in enumerate(eval(input_item['candidates'])): + input_item[chr(ord('A') + id)] = option_content + if option_content == input_item['answer']: + input_item['answer'] = chr(ord('A') + id) + + if FAIL_MSG in pred: + data.loc[idx, 'score'] = -1 + else: + data.loc[idx, 'score'] = int(check_ans_with_model( + pred, ans, model, + input_item, + 'MLVU_MCQ' + )) + + rejected = [x for x in data['score'] if x == -1] + + print( + f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, ' + f'failed to obtain the score for another {len(rejected)} questions. ' + f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.' 
+ ) + + dump(data, score_file) + + rating = get_dimension_rating(score_file) + return rating + + +class MLVU_OpenEnded(VideoBaseDataset): + + MD5 = 'cee573a3627c6ac434ded704c60511ba' + BASE_SYS = 'Carefully watch this video and pay attention to every detail. ' + SYS = BASE_SYS + 'Based on your observations, answer the given questions.' + TYPE = 'Video-VQA' + + def __init__(self, dataset='MLVU_OpenEnded', nframe=0, fps=-1): + self.type_data_list = { + 'sub_scene': ('8_sub_scene.json', './MLVU/video/8_sub_scene', 'VQA'), + 'summary': ('9_summary.json', './MLVU/video/9_summary', 'VQA') + } + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + + @classmethod + def supported_datasets(cls): + return ['MLVU_OpenEnded'] + + def prepare_dataset(self, dataset_name='MLVU_OpenEnded', repo_id='MLVU/MVLU'): + def check_integrity(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + + data = load(data_file) + for idx, item in data.iterrows(): + if not osp.exists(osp.join(pth, item['prefix'], item['video'])): + return False + return True + + if modelscope_flag_set(): + repo_id = "AI-ModelScope/MLVU" + + cache_path = get_cache_path(repo_id) + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + def generate_tsv(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + if os.path.exists(data_file) and md5(data_file) == self.MD5: + return + json_data_dir = os.path.join(dataset_path, 'MLVU', 'json') + self.data_list = [] + for k, v in self.type_data_list.items(): + with open(os.path.join(json_data_dir, v[0]), 'r') as f: + json_data = json.load(f) + for data in json_data: + self.data_list.append({ + 'task_type': k, + 'prefix': v[1], + 'duration': data['duration'], + 'video': data['video'], + 'question': data['question'], + 'answer': data['answer'], + 'scoring_points': data['scoring_points'] if 'scoring_points' in data else '' + }) + + data_df = pd.DataFrame(self.data_list) + data_df = data_df.assign(index=range(len(data_df))) + data_df.to_csv(data_file, sep='\t', index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + hf_token = os.environ.get('HUGGINGFACE_TOKEN') + huggingface_hub.login(hf_token) + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') + + generate_tsv(dataset_path) + + data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + return dict(root=dataset_path, data_file=data_file) + + def qa_template(self, data): + question = f"{data['question']}" + answer = data['answer'] + return question, answer + + def save_video_frames(self, line): + suffix = line['video'].split('.')[-1] + video = line['video'].replace(f'.{suffix}','') + vid_path = osp.join(self.data_root, line['prefix'], line['video']) + vid = decord.VideoReader(vid_path) + video_info = { + 'fps': vid.get_avg_fps(), + 'n_frames': len(vid), + } + if self.nframe > 0 and self.fps < 0: + step_size = len(vid) / (self.nframe + 1) + indices = [int(i * step_size) for i in range(1, self.nframe + 1)] + frame_paths = self.frame_paths(video) + elif self.fps > 0: + # not constrained by num_frames, get frames by fps + total_duration = video_info['n_frames'] / video_info['fps'] + required_frames = int(total_duration * self.fps) + step_size = video_info['fps'] / self.fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = 
self.frame_paths_fps(video, len(indices)) + + flag = np.all([osp.exists(p) for p in frame_paths]) + + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths + + def save_video_into_images(self, line): + frame_paths = self.save_video_frames(line) + return frame_paths + + def build_prompt(self, line, video_llm): + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + question, answer = self.qa_template(line) + message = [dict(type='text', value=self.SYS, role='system')] + message.append(dict(type='text', value=question)) + video_path = os.path.join(self.data_root, line['prefix'], line['video']) + if video_llm: + message.append(dict(type='video', value=video_path)) + else: + img_frame_paths = self.save_video_into_images(line) + for im in img_frame_paths: + message.append(dict(type='image', value=im)) + return message + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + + model = judge_kwargs['model'] if 'model' in judge_kwargs else judge_kwargs.setdefault('model', 'gpt-4-0125') + if model != 'gpt-4-0125': + print('MLVU Open Ended default using gpt-4-0125! So judge model is changed to gpt-4-0125') + judge_kwargs['model'] = 'gpt-4-0125' + + suffix = eval_file.split('.')[-1] + score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(score_file): + data = load(eval_file) + model_dict = { + 'sub_scene': build_judge(system_prompt=system_prompt_sub_scene, **judge_kwargs), + 'summary': build_judge(system_prompt=system_prompt_summary, **judge_kwargs) + } + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model_dict[line['task_type']], line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + _ = track_progress_rich( + MLVU_OpenEnded_generate, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + data = MLVU_OpenEnded_extract(ans, data) + dump(data, score_file) + + rating = get_dimension_rating(score_file) + return rating diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py b/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py index cded905..816ec1d 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/mmbench_video.py @@ -57,16 +57,16 @@ Please analyze these images and provide the answer to the question about the vid Please directly reply with your response to the only question. 
""" - TYPE = 'VQA' + TYPE = 'Video-VQA' - def __init__(self, dataset='MMBench-Video', pack=False): - super().__init__(dataset=dataset, pack=pack) + def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1): + super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps) @classmethod def supported_datasets(cls): return ['MMBench-Video'] - def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='nebulae09/MMBench-Video'): + def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='opencompass/MMBench-Video'): def check_integrity(pth): data_file = osp.join(pth, f'{dataset_name}.tsv') if md5(data_file) != self.MD5: @@ -81,14 +81,18 @@ Please directly reply with your response to the only question. if cache_path is not None and check_integrity(cache_path): dataset_path = cache_path else: - dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') unwrap_hf_pkl(dataset_path) self.video_path = osp.join(dataset_path, 'video/') data_file = osp.join(dataset_path, f'{dataset_name}.tsv') return dict(data_file=data_file, root=osp.join(dataset_path, 'video')) - def build_prompt_pack(self, line, num_frames): + def build_prompt_pack(self, line): if isinstance(line, int): assert line < len(self) video = self.videos[line] @@ -97,9 +101,9 @@ Please directly reply with your response to the only question. elif isinstance(line, str): video = line - frames = self.save_video_frames(video, num_frames) + frames = self.save_video_frames(video) sub = self.data[self.data['video'] == video] - sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(num_frames) + sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(len(frames)) message = [dict(type='text', value=sys_prompt)] for im in frames: message.append(dict(type='image', value=im)) @@ -110,7 +114,7 @@ Please directly reply with your response to the only question. message.append(dict(type='text', value=prompt)) return message - def build_prompt_nopack(self, line, num_frames, video_llm): + def build_prompt_nopack(self, line, video_llm): if isinstance(line, int): assert line < len(self) line = self.data.iloc[line] @@ -121,8 +125,8 @@ Please directly reply with your response to the only question. message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path))) return message else: - frames = self.save_video_frames(line['video'], num_frames) - sys_prompt = self.FRAMES_TMPL_NOPACK.format(num_frames) + frames = self.save_video_frames(line['video']) + sys_prompt = self.FRAMES_TMPL_NOPACK.format(len(frames)) message = [dict(type='text', value=sys_prompt)] for im in frames: message.append(dict(type='image', value=im)) @@ -130,11 +134,11 @@ Please directly reply with your response to the only question. 
message.append(dict(type='text', value=prompt)) return message - def build_prompt(self, line, num_frames, video_llm): + def build_prompt(self, line, video_llm): if self.pack and not video_llm: - return self.build_prompt_pack(line, num_frames) + return self.build_prompt_pack(line) else: - return self.build_prompt_nopack(line, num_frames, video_llm) + return self.build_prompt_nopack(line, video_llm) @staticmethod def remove_side_quote(s, syms=[',', '"', "'"]): diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py new file mode 100644 index 0000000..7dec794 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/mmgenbench.py @@ -0,0 +1,69 @@ +import warnings +import pandas as pd +from abc import abstractmethod +from ..smp import * +from .image_base import ImageBaseDataset + + +class MMGenBench(ImageBaseDataset): + + prompt_list = [ + """ +# Role +You are an expert in the field of image understanding, focusing on the \ +understanding of images and generating the image caption-prompt. + +# Definition Explanation +image caption-prompt: Refers to the caption or description of an image, \ +used to provide to a Text-to-Image model to generate a new image. +Text-to-Image model: Can generate a new image based on the provided image \ +caption-prompt, such as stable diffusion 3, flux, and other image generation models. + +# Task Description +Generate an image caption-prompt based on the input image. + +# Key Points and Requirements +1. Accurately understand the input image and precisely generate an image caption-prompt. +2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \ +Text-to-Image model to generate a new image that is as consistent as possible with the input image. +3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model. +4. The generated image caption-prompt should describe the input image in as much \ +detail as possible, and it should be between 20 to 60 words. + +# Output Format +A string, that is the image caption-prompt. No extra output needed. 
+""" + ] + TYPE = 'GenerateImgPrompt' + DATASET_URL = { + 'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv', + 'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv', + } + PROMPT_MAP = { + 'MMGenBench-Test': prompt_list[0], + 'MMGenBench-Domain': prompt_list[0], + } + DATASET_MD5 = { + 'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da", + 'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb", + } + + def __init__(self, dataset='MMGenBench', **kwargs): + super().__init__(dataset, **kwargs) + warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n') + warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n') + + def load_data(self, dataset): + data = super().load_data(dataset) + if 'question' not in data: + data['question'] = [( + self.PROMPT_MAP[dataset] + )] * len(data) + return data + + # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe + @abstractmethod + def evaluate(self, eval_file, **judge_kwargs): + warnings.warn('This evaluation method is not supported.\n') + warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n') + return None diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py index be9c1d6..8a7af74 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/mmlongbench.py @@ -194,11 +194,12 @@ def isfloat(num): def get_font(): try: - truetype_url = 'http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf' + truetype_url = "http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf" ff = urlopen(truetype_url) font = ImageFont.truetype(ff, size=40) - except: - print('Fail to download the font. Use the default one.') + except Exception as e: + logging.warning(f'{type(e)}: {e}') + logging.warning("Fail to download the font. 
Use the default one.") font = ImageFont.load_default(size=40) return font @@ -227,13 +228,13 @@ def frame2img(img_path_list, font, save_path=None, idx_start=0): w, h = im.size new_w = max(new_w, w) new_h += h + 10 + pad - new_img = Image.new('RGB', (new_w, new_h), 'white') + new_img = Image.new("RGB", (new_w, new_h), "white") draw = ImageDraw.Draw(new_img) curr_h = 0 for idx, im in enumerate(imgs): w, h = im.size new_img.paste(im, (0, pad + curr_h)) - draw.text((0, curr_h), f'', font=font, fill='black') + draw.text((0, curr_h), f"", font=font, fill="black") if idx + 1 < len(imgs): draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2) curr_h += h + 10 + pad @@ -249,7 +250,7 @@ def frame2img(img_path_list, font, save_path=None, idx_start=0): for idx, im in enumerate(imgs): w, h = im.size new_img.paste(im, (curr_w, pad)) - draw.text((curr_w, 0), f'', font=font, fill='black') + draw.text((curr_w, 0), f"", font=font, fill='black') if idx + 1 < len(imgs): draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2) curr_w += w + 10 @@ -460,8 +461,9 @@ class MMLongBench(ImageBaseDataset): os.makedirs(self.img_root, exist_ok=True) try: import fitz - except: - warnings.warn('Please use `pip install pymupdf` to parse PDF files.') + except Exception as e: + logging.critical(f'{type(e)}: {e}') + logging.critical('Please use `pip install pymupdf` to parse PDF files.') line = origin_line.copy() line['image_path'] = line['image_path'][:self.max_pages] diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py b/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py new file mode 100644 index 0000000..a6d78d5 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/mmmath.py @@ -0,0 +1,446 @@ +import re +import json +import sympy as sp +import numpy as np +from sympy import simplify, Eq, sympify, Pow, pi +from sympy.parsing.latex import parse_latex +import sys +import math +import os +import argparse + +from .image_base import ImageBaseDataset +from ..utils import track_progress_rich +from ..smp import load, dump + + +class AutoScoringJudge: + def __init__(self): + # Map of special symbols to their replacements + self.special_signal_map = { + "\\left": "", + "\\right": "", + "厘米":"", + # "∶": ":", + ",": ",", + "$": "", + "(":"(", + ")":")", + "\\infty":"oo", + "\\colon ":":", + # "\\approx": "=", + # "\\simeq": "=", + # "\\sim": "=", + # "^\\prime": "'", + # "^{\\prime}": "'", + "+":"+", + "\\, ": "", + "\\,":"", + "^\\circ": "", + "^{\\circ}": "", + # "%": "", + } + self.pi = parse_latex("\\pi") + # MM-Math default precision + self.precision = 1e-2 + + def trans_greater_sign_to_interval(self, expr:str): + expr_tmp = expr.split("<") + return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")" + + def split_by_comma(self, expr: str): + # Splits expressions by commas outside of brackets + in_bracket_num = 0 + splitted_expr = [] + start_idx = 0 + for i, char in enumerate(expr): + if char in ["(", "["]: + in_bracket_num += 1 + elif char in [")", "]"]: + in_bracket_num -= 1 + elif char == "," and in_bracket_num == 0: + splitted_expr.append(expr[start_idx:i].strip()) + start_idx = i + 1 + + if start_idx < len(expr): + splitted_expr.append(expr[start_idx:].strip()) + + return splitted_expr + + def trans_plus_minus_sign(self, expr_list: list): + # Translates plus-minus signs into separate expressions + new_expr_list = [] + for expr in expr_list: + if "\\pm" in expr: + new_expr_list.append(expr.replace("\\pm", "+")) + new_expr_list.append(expr.replace("\\pm", "-")) + else: 
+ new_expr_list.append(expr) + + return new_expr_list + + def judge(self, expression1, expression2, precision=1e-2): + # Judge if two expressions are equal (expression1 is considered as the Ground Truth) + # Default precision is a list for supporting multiple expressions + precision = precision if isinstance(precision, list) else [precision] + + try: + expression1, expression2 = self.preprocess(expression1, expression2) + except: + return False + if expression1 == expression2: + # print("Exactly equal") + return True + + # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered + expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1) # noqa: E501 + expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2) # noqa: E501 + # Check if two < or > in expression + if self.is_two_greater_sign(expression1): + expression1 = self.trans_greater_sign_to_interval(expression1) + + if self.is_two_greater_sign(expression2): + expression2 = self.trans_greater_sign_to_interval(expression2) + + expression1 = self.split_by_comma(expression1) + expression2 = self.split_by_comma(expression2) + + temp_list1 = self.trans_plus_minus_sign(expression1) + temp_list2 = self.trans_plus_minus_sign(expression2) + + # Set up a list for allowed errors + if len(precision) <= 1: + precision = precision * len(temp_list1) + + if len(temp_list1) != len(temp_list2): + return False + + # Check if elements in both lists can be paired and are equal + idx = -1 + while len(temp_list1) != 0: + idx = (idx + 1) % len(temp_list1) + + item1 = temp_list1[idx] + self.precision = precision[idx] + + for item2 in temp_list2: + if self.is_equal(item1, item2): + temp_list1.remove(item1) + temp_list2.remove(item2) + precision.remove(self.precision) + break + else: + # If no match was found, return False + return False + + # If all elements are matched, return True + return True + + def is_interval(self, expr): + # Checks if an expression is an interval + return expr.startswith(("(", "[")) and expr.endswith((")", "]")) + + def is_two_greater_sign(self, expr): + match = re.findall(r'<', expr) + return len(match) == 2 + + def sympy_sub_pi(self, expression_sympy): + # Replaces the symbol for pi in sympy expressions with its numerical value + return expression_sympy.subs(self.pi, math.pi) + + def is_equal(self, expression1, expression2): + # Default first expression is ground truth. 
Check if expressions are equal in different aspects + if expression1 == expression2 and expression1 != "" and expression2 != "": + # print("Equivalent natively") + return True + + # First check if both are intervals + if self.is_interval(expression1) and self.is_interval(expression2): + try: + if self.interval_equal(expression1, expression2): + # print("Interval equivalent") + return True + except: + return False + + # Then check for numerical equality + try: + if self.numerical_equal(expression1, expression2): + # print("Numerically equivalent") + return True + except: + pass + # Then check if expressions are mathematically equal + try: + if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2): + # print("Expression equivalent") + return True + except: + pass + + # Lastly, check for equation equality + try: + if self.equation_equal(expression1, expression2): + # print("Equation equivalent") + return True + except: + pass + + return False + + def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True): + # Check if two numerical values are equal within an allowed error range + # Includes possible percentage cases + reference = float(expression1) + prediction = float(expression2) + + if include_percentage: + gt_result = [reference / 100, reference, reference * 100] + else: + gt_result = [reference] + + for item in gt_result: + if abs(item - prediction) <= self.precision * 1.01: + return True + return False + + def expression_equal(self, exp1, exp2): + # Check if two expressions are mathematically equivalent + # Extract expression and use sympy for equivalence checking + def extract_expression(expression): + if "=" in expression: + expression = expression.split("=")[1] + return expression.strip() + + exp1 = extract_expression(exp1) + exp2 = extract_expression(exp2) + + exp_too_long = len(exp1) > 300 or len(exp2) > 300 + + expr1_sym = sympify(parse_latex(exp1)) + expr2_sym = sympify(parse_latex(exp2)) + if expr1_sym == expr2_sym: + return True + else: + expr1_sym = self.sympy_sub_pi(expr1_sym) + expr2_sym = self.sympy_sub_pi(expr2_sym) + + if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \ + (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)): + return False + elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol): + try: + if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)): + print("These two numbers cannot be calculated by the current computer for: " + f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"") + return False + if exp_too_long: + print(f'Expression {exp1} or {exp2} is too long to compute. ') + return False + if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01: + return True + else: + return False + except: + return False + elif exp_too_long: + print(f'Expression {exp1} or {exp2} is too long to compute. 
') + return False + else: + try: + simplified_expr = simplify(expr1_sym - expr2_sym) + num_value = simplified_expr.evalf() + return abs(num_value) < 1e-3 + except: + return False + + def equation_equal(self, expression1, expression2): + # Check if two equations are mathematically equivalent + # Simplify equations and use sympy for equivalence checking + def simplify_equation(latex_eq): + lhs, rhs = latex_eq.split('=') + + lhs_expr = parse_latex(lhs) + rhs_expr = parse_latex(rhs) + + equation = Eq(lhs_expr, rhs_expr) + + simplified_eq = simplify(equation.lhs - equation.rhs) + + return simplified_eq + + expr1_sym = simplify_equation(expression1) + expr2_sym = simplify_equation(expression2) + + division_result_1 = simplify(expr1_sym / expr2_sym) + division_result_2 = simplify(expr2_sym / expr1_sym) + + if ((division_result_1.is_Integer and division_result_1 != 0) or # noqa: W504 + (division_result_2.is_Integer and division_result_2 != 0)): + return True + else: + return False + + def interval_equal(self, expression1, expression2): + # Check if two intervals are mathematically equivalent + def compare_two_interval(inter1, inter2): + if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]: + return False + + inter1 = inter1.strip('[]()') + inter2 = inter2.strip('[]()') + + items_1 = inter1.split(',') + items_2 = inter2.split(',') + + for item_1, item_2 in zip(items_1, items_2): + if not self.expression_equal(item_1, item_2): + return False + return True + + interval1 = expression1 + interval2 = expression2 + + if interval1 == interval2: + return True + else: + inter_list1 = interval1.split("\\cup") + inter_list2 = interval2.split("\\cup") + + if len(inter_list1) != len(inter_list2): + return False + else: + for inter1, inter2 in zip(inter_list1, inter_list2): + if not compare_two_interval(inter1, inter2): + return False + return True + + def preprocess(self, expression1, expression2): + # Preprocess expressions to extract and replace special symbols + def extract_boxed_content(latex_str): + boxed_matches = re.finditer(r'\\boxed{', latex_str) + results = "" + + for match in boxed_matches: + start_index = match.end() + end_index = start_index + stack = 1 + + while stack > 0 and end_index < len(latex_str): + if latex_str[end_index] == '{': + stack += 1 + elif latex_str[end_index] == '}': + stack -= 1 + end_index += 1 + + if stack == 0: + content = latex_str[start_index:end_index - 1] + results += content + "," + else: + raise ValueError("Mismatched braces in LaTeX string.") + + if results == "": + last_line_ans = latex_str.strip().split("\n")[-1] + dollar_pattern = r"\$(.*?)\$" + answers = re.findall(dollar_pattern, last_line_ans) + + if answers: + for ans in answers: + results += ans + "," + else: + results = latex_str + + return results + + def sepcial_symbol_replace(expression): + + expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip() # noqa: E501 + + expression = re.sub(r"(.+)m$", r"\1", expression) + + if "\\in " in expression: + expression = expression.split("\\in ")[1] + + for signal in self.special_signal_map: + expression = expression.replace(signal, self.special_signal_map[signal]) + + expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression) + + expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。") + + pattern = 
r'\\(?:mathrm|mathbf)\{~?([^}]*)\}' + expression = re.sub(pattern, r'\1', expression) + + return expression + + exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2) + + exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2) + + return exp1, exp2 + + def can_compute_power(self, expr): + # Checks if a power expression can be computed + if isinstance(expr, Pow): + base, exp = expr.as_base_exp() + if base.is_number and exp.is_number: + MAX_EXP = 1000 # Adjust based on computing environment + if abs(exp.evalf()) > MAX_EXP: + return False + else: + return True + else: + return False + else: + return True # Not a power expression, can compute + + +class MMMath(ImageBaseDataset): + + TYPE = 'VQA' + + DATASET_URL = { + 'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv', + } + DATASET_MD5 = { + 'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5', + } + + @classmethod + def evaluate(self, eval_file, **kwargs): + + data = load(eval_file) + judger = AutoScoringJudge() + func = judger.judge + + tups = [dict(expression1=x, expression2=y) for x, y in zip(data['answer'], data['prediction'])] + + res = track_progress_rich(func, tups, nproc=16) + data['hit'] = res + dump(data, eval_file) + + score_file = eval_file.replace('.xlsx', '_score.json') + score = {} + score['overall'] = np.mean(data['hit']) + # Results by Difficulty + difficulties = set(data['difficulty']) + for d in difficulties: + score[f'Difficulty-{d}'] = np.mean(data[data['difficulty'] == d]['hit']) + + # Results by Year + years = set(data['year']) + for y in years: + score[f'Year-{y}'] = np.mean(data[data['year'] == y]['hit']) + + # Results by Knowledge-L1 + points = set(data['knowledge_l1']) + for p in points: + score[f'Knowledge-L1-{p}'] = np.mean(data[data['knowledge_l1'] == p]['hit']) + + # Results by Knowledge-L2 + points = set(data['knowledge_l2']) + for p in points: + score[f'Knowledge-L2-{p}'] = np.mean(data[data['knowledge_l2'] == p]['hit']) + + dump(score, score_file) + return score diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py index c45c412..f10a709 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py @@ -13,25 +13,22 @@ import cv2 import zipfile import os import glob -from moviepy.editor import VideoFileClip, ImageSequenceClip -import moviepy.config_defaults from .utils.mvbench import * FAIL_MSG = 'Failed to obtain answer via API.' -moviepy.config_defaults.LOGGER_LEVEL = logging.CRITICAL + 1 class MVBench(VideoBaseDataset): - MD5 = 'ae2a2607e2f8618155709220c6e927a6' + MD5 = 'fd21d36522cdedd46d84dc46715ad832' SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \ the detail and movement of objects, and the action and pose of persons. \ Based on your observations, select the best option that accurately addresses the question. 
""" - TYPE = 'MCQ' + TYPE = 'Video-MCQ' - def __init__(self, dataset='MVBench', pack=False): + def __init__(self, dataset='MVBench', nframe=0, fps=-1): self.type_data_list = { 'Action Sequence': ('action_sequence.json', 'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end @@ -74,7 +71,7 @@ Based on your observations, select the best option that accurately addresses the 'Counterfactual Inference': ('counterfactual_inference.json', 'your_data_path/clevrer/video_validation/', 'video', False), } - super().__init__(dataset=dataset, pack=pack) + super().__init__(dataset=dataset, nframe=nframe, fps=fps) @classmethod def supported_datasets(cls): @@ -96,6 +93,9 @@ Based on your observations, select the best option that accurately addresses the return False return True + if modelscope_flag_set(): + repo_id = 'modelscope/MVBench' + cache_path = get_cache_path(repo_id, branch='main') if cache_path is not None and check_integrity(cache_path): dataset_path = cache_path @@ -115,32 +115,42 @@ Based on your observations, select the best option that accurately addresses the data_file = osp.join(pth, f'{dataset_name}.tsv') if os.path.exists(data_file) and md5(data_file) == self.MD5: return - json_data_dir = os.path.join(dataset_path, 'json') + json_data_dir = os.path.join(pth, 'json') self.data_list = [] for k, v in self.type_data_list.items(): with open(os.path.join(json_data_dir, v[0]), 'r') as f: json_data = json.load(f) for data in json_data: - self.data_list.append({ - 'task_type': k, - 'prefix': v[1].replace('your_data_path', os.path.join(dataset_path, 'video')), - 'data_type': v[2], - 'bound': v[3], - 'start': data['start'] if 'start' in data.keys() else None, - 'end': data['end'] if 'end' in data.keys() else None, - 'video': data['video'], - 'question': data['question'], - 'answer': data['answer'], - 'candidates': data['candidates'] - }) + if os.path.exists(os.path.join(pth, v[1].replace('your_data_path', 'video'), data['video'])): + self.data_list.append({ + 'task_type': k, + 'prefix': v[1].replace('your_data_path', 'video'), + 'data_type': v[2], + 'bound': v[3], + 'start': data['start'] if 'start' in data.keys() else None, + 'end': data['end'] if 'end' in data.keys() else None, + 'video': data['video'], + 'question': data['question'], + 'answer': data['answer'], + 'candidates': data['candidates'] + }) + else: + print( + 'NTURGB-D zip file is removed according to MVBench, you can view it at ' + 'https://huggingface.co/datasets/OpenGVLab/MVBench for detailed reason.' 
+ ) + raise Exception( + f"{os.path.join(v[1].replace('your_data_path', 'video'), data['video'])} does not exist" + ) data_df = pd.DataFrame(self.data_list) data_df = data_df.assign(index=range(len(data_df))) data_df.to_csv(data_file, sep='\t', index=False) def move_files(pth): - # special for mvbench src_folder = os.path.join(pth, 'video/data0613') + if not os.path.exists(src_folder): + return for subdir in os.listdir(src_folder): subdir_path = os.path.join(src_folder, subdir) if os.path.isdir(subdir_path): @@ -149,15 +159,24 @@ Based on your observations, select the best option that accurately addresses the if os.path.isdir(subsubdir_path): for item in os.listdir(subsubdir_path): item_path = os.path.join(subsubdir_path, item) - target_folder = os.path.join(pth, 'video', subdir, subsubdir, item) + target_folder = os.path.join(pth, 'video', subdir, subsubdir) if not os.path.exists(target_folder): - shutil.move(item_path, os.path.join(target_folder, item)) + os.makedirs(target_folder) + target_path = os.path.join(target_folder, item) + try: + shutil.move(item_path, target_path) + except Exception as e: + print(f"Error moving {item_path} to {target_path}: {e}") - hf_token = os.environ.get('HUGGINGFACE_TOKEN') - huggingface_hub.login(hf_token) - dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') - move_files(dataset_path) + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='master') + else: + hf_token = os.environ.get('HUGGINGFACE_TOKEN') + huggingface_hub.login(hf_token) + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') unzip_hf_zip(dataset_path) + move_files(dataset_path) generate_tsv(dataset_path) data_file = osp.join(dataset_path, f'{dataset_name}.tsv') @@ -169,20 +188,12 @@ Based on your observations, select the best option that accurately addresses the } self.nframe = 8 - self.resolution = 224 self.frame_fps = 3 # transform - crop_size = self.resolution - scale_size = self.resolution - input_mean = [0.48145466, 0.4578275, 0.40821073] - input_std = [0.26862954, 0.26130258, 0.27577711] self.transform = T.Compose([ - GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC), - GroupCenterCrop(crop_size), Stack(), - ToTorchFormatTensor(), - GroupNormalize(input_mean, input_std) + ToTorchFormatTensor() ]) return dict(root=dataset_path, data_file=data_file) @@ -240,7 +251,7 @@ Based on your observations, select the best option that accurately addresses the def save_video_frames(self, imgs, video_name, frames): - frame_paths = self.frame_paths(video_name, frames) + frame_paths = self.frame_paths(video_name) flag = np.all([osp.exists(p) for p in frame_paths]) if not flag: @@ -268,7 +279,13 @@ Based on your observations, select the best option that accurately addresses the return question, answer def load_into_video_and_process(self, line): - video_path = os.path.join(line['prefix'], line['video']) + try: + from moviepy.editor import VideoFileClip, ImageSequenceClip + except: + raise ImportError( + 'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"' + ) + video_path = os.path.join(self.data_root, line['prefix'], line['video']) if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']: processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4') @@ -302,33 +319,39 @@ Based on your observations, select the best option that accurately addresses the return 
output_video_path - def build_prompt(self, line, num_frames, video_llm): + def save_video_into_images(self, line): + bound = None + if line['bound']: + bound = ( + line['start'], + line['end'], + ) + video_path = os.path.join(self.data_root, line['prefix'], line['video']) + decord_method = self.decord_method[line['data_type']] + self.num_segments = self.nframe + torch_imgs = decord_method(video_path, bound) + img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments) + return img_frame_paths + + def build_prompt(self, line, video_llm): + if self.fps > 0: + raise ValueError('MVBench does not support fps setting, please transfer to MVBench_MP4!') if isinstance(line, int): assert line < len(self) line = self.data.iloc[line] question, answer = self.qa_template(line) - message = [dict(type='text', value=self.SYS)] + message = [dict(type='text', value=self.SYS, role='system')] message.append(dict(type='text', value=question)) if video_llm: new_video_path = self.load_into_video_and_process(line) message.append(dict(type='video', value=new_video_path)) else: - bound = None - if line['bound']: - bound = ( - line['start'], - line['end'], - ) - video_path = os.path.join(line['prefix'], line['video']) - decord_method = self.decord_method[line['data_type']] - self.num_segments = num_frames if num_frames > 0 else self.nframe - torch_imgs = decord_method(video_path, bound) - img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments) + img_frame_paths = self.save_video_into_images(line) for im in img_frame_paths: message.append(dict(type='image', value=im)) message.append(dict(type='text', value='\nOnly give the best option.')) - message.append(dict(type='text', value='Best option:(')) + message.append(dict(type='text', value='Best option:(', role='assistant')) return message @classmethod @@ -341,13 +364,27 @@ Based on your observations, select the best option that accurately addresses the score_file = eval_file.replace('.xlsx', '_score.xlsx') if not osp.exists(score_file): + model = judge_kwargs.setdefault('model', 'chatgpt-0125') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None res = {} if not osp.exists(tmp_file) else load(tmp_file) res = {k: v for k, v in res.items() if FAIL_MSG not in v} data = load(eval_file) data_un = data[~pd.isna(data['prediction'])] - for idx in data['index']: + for idx in data_un['index']: ans = data.loc[data['index'] == idx, 'answer'].values[0] pred = data.loc[data['index'] == idx, 'prediction'].values[0] options = eval(data.loc[data['index'] == idx, 'candidates'].values[0]) @@ -356,11 +393,20 @@ Based on your observations, select the best option that accurately addresses the if c == ans: answer_idx = id ans = f"({chr(ord('A') + answer_idx)}) {ans}" + input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0] + for id, option_content in enumerate(eval(input_item['candidates'])): + input_item[chr(ord('A') + id)] = option_content + if option_content == input_item['answer']: + input_item['answer'] = chr(ord('A') + id) if FAIL_MSG in pred: data.loc[idx, 'score'] = -1 else: - data.loc[idx, 'score'] = 
int(check_ans(pred, ans)) + data.loc[idx, 'score'] = int(check_ans_with_model( + pred, ans, model, + input_item, + 'MVBench' + )) rejected = [x for x in data['score'] if x == -1] @@ -379,15 +425,15 @@ Based on your observations, select the best option that accurately addresses the class MVBench_MP4(VideoBaseDataset): - MP4_MD5 = '7b4608045347904c28c153015a7a2b6b' + MP4_MD5 = '5c8c6f8b7972c2de65a629590f7c42f5' SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \ the detail and movement of objects, and the action and pose of persons. \ Based on your observations, select the best option that accurately addresses the question. """ - TYPE = 'MCQ' + TYPE = 'Video-MCQ' - def __init__(self, dataset='MVBench_MP4', pack=False): - super().__init__(dataset=dataset, pack=pack) + def __init__(self, dataset='MVBench_MP4', nframe=0, fps=-1): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) @classmethod def supported_datasets(cls): @@ -409,13 +455,16 @@ Based on your observations, select the best option that accurately addresses the return False return True + if modelscope_flag_set(): + repo_id = 'modelscope/MVBench' + cache_path = get_cache_path(repo_id, branch='video') if cache_path is not None and check_integrity(cache_path): dataset_path = cache_path else: def generate_tsv(pth): data_file = osp.join(pth, f'{dataset_name}.tsv') - if os.path.exists(data_file) and md5(data_file) == self.MD5: + if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5: return json_data_path = os.path.join(dataset_path, 'test.json') json_data = load(json_data_path) @@ -435,27 +484,21 @@ Based on your observations, select the best option that accurately addresses the data_df = data_df.assign(index=range(len(data_df))) data_df.to_csv(data_file, sep='\t', index=False) - hf_token = os.environ.get('HUGGINGFACE_TOKEN') - huggingface_hub.login(hf_token) - dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video') + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='video') + else: + hf_token = os.environ.get('HUGGINGFACE_TOKEN') + huggingface_hub.login(hf_token) + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video') generate_tsv(dataset_path) data_file = osp.join(dataset_path, f'{dataset_name}.tsv') - self.nframe = 8 - self.resolution = 224 - # transform - crop_size = self.resolution - scale_size = self.resolution - input_mean = [0.48145466, 0.4578275, 0.40821073] - input_std = [0.26862954, 0.26130258, 0.27577711] self.transform = T.Compose([ - GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC), - GroupCenterCrop(crop_size), Stack(), - ToTorchFormatTensor(), - GroupNormalize(input_mean, input_std) + ToTorchFormatTensor() ]) return dict(root=dataset_path, data_file=data_file) @@ -473,7 +516,7 @@ Based on your observations, select the best option that accurately addresses the answer = f"({chr(ord('A') + answer_idx)}) {answer}" return question, answer - def get_index(self, max_frame): + def get_index_by_frame(self, max_frame): seg_size = float(max_frame) / self.num_segments frame_indices = np.array([ int((seg_size / 2) + np.round(seg_size * idx)) @@ -481,12 +524,26 @@ Based on your observations, select the best option that accurately addresses the ]) return frame_indices - def read_video(self, video_path, bound=None): + def get_index_by_fps(self, vid, fps): + total_frames = len(vid) + video_fps = 
vid.get_avg_fps() + total_duration = total_frames / video_fps + required_frames = int(total_duration * fps) + step_size = video_fps / fps + frame_indices = np.array([int(i * step_size) for i in range(required_frames)]) + self.num_segments = len(frame_indices) + return frame_indices + + def read_video(self, video_path): vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) max_frame = len(vr) - 1 images_group = list() - frame_indices = self.get_index(max_frame) + if self.fps < 0: + frame_indices = self.get_index_by_frame(max_frame) + else: + frame_indices = self.get_index_by_fps(vr, self.fps) + for frame_index in frame_indices: img = Image.fromarray(vr[frame_index].asnumpy()) images_group.append(img) @@ -494,8 +551,10 @@ Based on your observations, select the best option that accurately addresses the return torch_imgs def save_video_frames(self, imgs, video_name, frames): - - frame_paths = self.frame_paths(video_name, frames) + if self.fps > 0: + frame_paths = self.frame_paths_fps(video_name, frames) + else: + frame_paths = self.frame_paths(video_name) flag = np.all([osp.exists(p) for p in frame_paths]) if not flag: @@ -509,26 +568,33 @@ Based on your observations, select the best option that accurately addresses the return frame_paths - def build_prompt(self, line, num_frames, video_llm): + def save_video_into_images(self, line): + video_path = os.path.join(self.data_root, line['prefix'], line['video']) + if self.fps <= 0: + self.num_segments = self.nframe + else: + self.num_segments = 0 + torch_imgs = self.read_video(video_path) + img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments) + return img_frame_paths + + def build_prompt(self, line, video_llm): if isinstance(line, int): assert line < len(self) line = self.data.iloc[line] question, answer = self.qa_template(line) - message = [dict(type='text', value=self.SYS)] + message = [dict(type='text', value=self.SYS, role='system')] message.append(dict(type='text', value=question)) video_path = os.path.join(self.data_root, line['prefix'], line['video']) if video_llm: message.append(dict(type='video', value=video_path)) else: - video_path = os.path.join(self.data_root, line['prefix'], line['video']) - self.num_segments = num_frames if num_frames > 0 else self.nframe - torch_imgs = self.read_video(video_path) - img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments) + img_frame_paths = self.save_video_into_images(line) for im in img_frame_paths: message.append(dict(type='image', value=im)) message.append(dict(type='text', value='\nOnly give the best option.')) - message.append(dict(type='text', value='Best option:(')) + message.append(dict(type='text', value='Best option:(', role='assistant')) return message @classmethod @@ -541,13 +607,27 @@ Based on your observations, select the best option that accurately addresses the score_file = eval_file.replace('.xlsx', '_score.xlsx') if not osp.exists(score_file): + model = judge_kwargs.setdefault('model', 'chatgpt-0125') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None res = {} if not osp.exists(tmp_file) else load(tmp_file) res = {k: 
v for k, v in res.items() if FAIL_MSG not in v} data = load(eval_file) data_un = data[~pd.isna(data['prediction'])] - for idx in data['index']: + for idx in data_un['index']: ans = data.loc[data['index'] == idx, 'answer'].values[0] pred = data.loc[data['index'] == idx, 'prediction'].values[0] options = eval(data.loc[data['index'] == idx, 'candidates'].values[0]) @@ -556,11 +636,20 @@ Based on your observations, select the best option that accurately addresses the if c == ans: answer_idx = id ans = f"({chr(ord('A') + answer_idx)}) {ans}" + input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0] + for id, option_content in enumerate(eval(input_item['candidates'])): + input_item[chr(ord('A') + id)] = option_content + if option_content == input_item['answer']: + input_item['answer'] = chr(ord('A') + id) if FAIL_MSG in pred: data.loc[idx, 'score'] = -1 else: - data.loc[idx, 'score'] = int(check_ans(pred, ans)) + data.loc[idx, 'score'] = int(check_ans_with_model( + pred, ans, model, + input_item, + 'MVBench_MP4' + )) rejected = [x for x in data['score'] if x == -1] diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py b/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py new file mode 100644 index 0000000..032fccc --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/tempcompass.py @@ -0,0 +1,639 @@ +import huggingface_hub +from huggingface_hub import snapshot_download +from ..smp import * +from .video_concat_dataset import ConcatVideoDataset +from .video_base import VideoBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from ..utils import track_progress_rich +import torchvision.transforms as T +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from decord import VideoReader, cpu +from .utils.tempcompass import * + + +FAIL_MSG = 'Failed to obtain answer via API.' + + +class TempCompass(ConcatVideoDataset): + def __init__(self, dataset='TempCompass', nframe=0, fps=-1): + self.DATASET_SETS[dataset] = ['TempCompass_MCQ', 'TempCompass_Captioning', 'TempCompass_YorN'] + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + + @classmethod + def supported_datasets(cls): + return ['TempCompass'] + + def evaluate(self, eval_file, **judge_kwargs): + result = super().evaluate(eval_file=eval_file, **judge_kwargs) + suffix = eval_file.split('.')[-1] + result = result.reset_index().rename(columns={'index': 'dim.task_type'}) + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + avg_dict = {} + for idx, item in result.iterrows(): + dim, task_type = item['dim.task_type'].split('. 
')
+            if dim not in avg_dict:
+                avg_dict[dim] = {'success': 0.0, 'overall': 0.0}
+            if task_type not in avg_dict:
+                avg_dict[task_type] = {'success': 0.0, 'overall': 0.0}
+            if 'overall' not in avg_dict:
+                avg_dict['overall'] = {'success': 0.0, 'overall': 0.0}
+            avg_dict[dim]['success'] += item['success']
+            avg_dict[dim]['overall'] += item['overall']
+            avg_dict[task_type]['success'] += item['success']
+            avg_dict[task_type]['overall'] += item['overall']
+            avg_dict['overall']['success'] += item['success']
+            avg_dict['overall']['overall'] += item['overall']
+            result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 2)
+        for key, value in avg_dict.items():
+            # Add a new row using the loc method
+            result.loc[len(result)] = {
+                'dim.task_type': key,
+                'success': value['success'],
+                'overall': value['overall'],
+                'acc': round(value['success'] / value['overall'] * 100, 2)
+            }
+        dump(result, score_file)
+        return result
+
+
+class TempCompass_MCQ(VideoBaseDataset):
+
+    MD5 = '7efbb9e6d9dabacd22daf274852691dd'
+    TYPE = 'Video-MCQ'
+
+    def __init__(self, dataset='TempCompass_MCQ', nframe=0, fps=-1):
+        self.type_data_list = {
+            'multi-choice': ('multi-choice.json', './videos', '.mp4'),
+            'caption_matching': ('caption_matching.json', './videos', '.mp4'),
+        }
+        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
+
+    @classmethod
+    def supported_datasets(cls):
+        return ['TempCompass_MCQ']
+
+    def prepare_dataset(self, dataset_name='TempCompass_MCQ', repo_id='lmms-lab/TempCompass'):
+        def check_integrity(pth):
+            data_file = osp.join(pth, f'{dataset_name}.tsv')
+
+            if not osp.exists(data_file):
+                return False
+
+            if md5(data_file) != self.MD5:
+                return False
+
+            data = load(data_file)
+            for idx, item in data.iterrows():
+                if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
+                    return False
+            return True
+
+        cache_path = get_cache_path(repo_id)
+        if cache_path is not None and check_integrity(cache_path):
+            dataset_path = cache_path
+        else:
+            def read_parquet(pth):
+                import pandas as pd
+                for task_name in self.type_data_list.keys():
+                    if not osp.exists(osp.join(pth, f'{task_name}.json')):
+                        data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
+                        data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)
+
+            def unzip_videos(pth):
+                import zipfile
+                if not osp.exists(osp.join(pth, 'videos')):
+                    zip_file = osp.join(pth, 'tempcompass_videos.zip')
+                    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
+                        zip_ref.extractall(pth)
+
+            def generate_tsv(pth):
+                data_file = osp.join(pth, f'{dataset_name}.tsv')
+                if osp.exists(data_file) and md5(data_file) == self.MD5:
+                    return
+                self.data_list = []
+                for k, v in self.type_data_list.items():
+                    with open(osp.join(pth, v[0]), 'r') as f:
+                        json_data = json.load(f)
+                        for data in json_data:
+                            self.data_list.append({
+                                'task_type': k,
+                                'prefix': v[1],
+                                'suffix': v[2],
+                                'video': data['video_id'],
+                                'question': data['question'].split('\n')[0],
+                                'answer': data['answer'],
+                                'dim': data['dim'],
+                                'candidates': data['question'].split('\n')[1:],
+                            })
+
+                data_df = pd.DataFrame(self.data_list)
+                data_df = data_df.assign(index=range(len(data_df)))
+                data_df.to_csv(data_file, sep='\t', index=False)
+
+            if modelscope_flag_set():
+                from modelscope import dataset_snapshot_download
+                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
+            else:
+                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
+            read_parquet(dataset_path)
+            unzip_videos(dataset_path)
+            generate_tsv(dataset_path)
+
+
data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + return dict(root=dataset_path, data_file=data_file) + + def qa_template(self, data): + question = data['question'] + '\n' + '\n'.join(eval(data['candidates'])) + answer = data['answer'] + return question, answer + + def save_video_frames(self, line): + vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix']) + vid = decord.VideoReader(vid_path) + video_info = { + 'fps': vid.get_avg_fps(), + 'n_frames': len(vid), + } + if self.nframe > 0 and self.fps < 0: + step_size = len(vid) / (self.nframe + 1) + indices = [int(i * step_size) for i in range(1, self.nframe + 1)] + frame_paths = self.frame_paths(line['video']) + elif self.fps > 0: + # not constrained by num_frames, get frames by fps + total_duration = video_info['n_frames'] / video_info['fps'] + required_frames = int(total_duration * self.fps) + step_size = video_info['fps'] / self.fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(line['video'], len(indices)) + + flag = np.all([osp.exists(p) for p in frame_paths]) + + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths + + def save_video_into_images(self, line): + frame_paths = self.save_video_frames(line) + return frame_paths + + def build_prompt(self, line, video_llm): + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + question, answer = self.qa_template(line) + message = [] + message.append(dict(type='text', value=question)) + video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix']) + if video_llm: + message.append(dict(type='video', value=video_path)) + else: + img_frame_paths = self.save_video_into_images(line) + for im in img_frame_paths: + message.append(dict(type='image', value=im)) + message.append(dict(type='text', value='\nPlease directly give the best option:')) + return message + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + model = judge_kwargs.get('model', 'exact_matching') + assert model in ['chatgpt-1106', 'exact_matching'] + judge_kwargs.update({ + "max_tokens": 128, + "temperature": 1.0, + "top_p": 1, + "presence_penalty": 1, + }) + + suffix = eval_file.split('.')[-1] + score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(score_file): + data = load(eval_file) + if model != 'exact_matching': + model = build_judge(system_prompt=sys_prompt, **judge_kwargs) + else: + model = None + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + _ = track_progress_rich( + evaluate_tempcompass_mcq, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for idx, item in data.iterrows(): + data.loc[idx, 'score'] = ans[idx]['rating'] + dump(data, score_file) + + rating = get_dimension_rating(score_file) + return rating + + +class TempCompass_Captioning(VideoBaseDataset): + + MD5 = '35be9bf2581ea7767f02e9a8f37ae1ab' + TYPE = 
'Video-VQA' + + def __init__(self, dataset='TempCompass_Captioning', nframe=0, fps=-1): + self.type_data_list = { + 'captioning': ('captioning.json', './videos', '.mp4'), + } + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + + @classmethod + def supported_datasets(cls): + return ['TempCompass_Captioning'] + + def prepare_dataset(self, dataset_name='TempCompass_Captioning', repo_id='lmms-lab/TempCompass'): + def check_integrity(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + + if not osp.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + + data = load(data_file) + for idx, item in data.iterrows(): + if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])): + return False + return True + + cache_path = get_cache_path(repo_id) + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + def read_parquet(pth): + import pandas as pd + for task_name in self.type_data_list.keys(): + if not osp.exists(osp.join(pth, f'{task_name}.json')): + data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet')) + data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False) + + def unzip_videos(pth): + import zipfile + if not osp.exists(osp.join(pth, 'videos')): + zip_file = osp.join(pth, 'tempcompass_videos.zip') + with zipfile.ZipFile(zip_file, 'r') as zip_ref: + zip_ref.extractall(pth) + + def generate_tsv(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + if osp.exists(data_file) and md5(data_file) == self.MD5: + return + self.data_list = [] + for k, v in self.type_data_list.items(): + with open(osp.join(pth, v[0]), 'r') as f: + json_data = json.load(f) + for data in json_data: + self.data_list.append({ + 'task_type': k, + 'prefix': v[1], + 'suffix': v[2], + 'video': data['video_id'], + 'question': data['question'], + 'answer': data['answer'], + 'dim': data['dim'], + 'mc_question': data['mc_question'], + 'mc_answer': data['mc_answer'], + }) + + data_df = pd.DataFrame(self.data_list) + data_df = data_df.assign(index=range(len(data_df))) + data_df.to_csv(data_file, sep='\t', index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') + read_parquet(dataset_path) + unzip_videos(dataset_path) + generate_tsv(dataset_path) + + data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + return dict(root=dataset_path, data_file=data_file) + + def qa_template(self, data): + question = data['question'] + answer = data['answer'] + return question, answer + + def save_video_frames(self, line): + vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix']) + vid = decord.VideoReader(vid_path) + video_info = { + 'fps': vid.get_avg_fps(), + 'n_frames': len(vid), + } + if self.nframe > 0 and self.fps < 0: + step_size = len(vid) / (self.nframe + 1) + indices = [int(i * step_size) for i in range(1, self.nframe + 1)] + frame_paths = self.frame_paths(line['video']) + elif self.fps > 0: + # not constrained by num_frames, get frames by fps + total_duration = video_info['n_frames'] / video_info['fps'] + required_frames = int(total_duration * self.fps) + step_size = video_info['fps'] / self.fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(line['video'], len(indices)) + + flag = np.all([osp.exists(p) for p in 
frame_paths]) + + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths + + def save_video_into_images(self, line): + frame_paths = self.save_video_frames(line) + return frame_paths + + def build_prompt(self, line, video_llm): + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + question, answer = self.qa_template(line) + message = [] + message.append(dict(type='text', value=question)) + video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix']) + if video_llm: + message.append(dict(type='video', value=video_path)) + else: + img_frame_paths = self.save_video_into_images(line) + for im in img_frame_paths: + message.append(dict(type='image', value=im)) + return message + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + model = judge_kwargs.get('model', 'exact_matching') + assert model in ['chatgpt-1106', 'exact_matching'] + judge_kwargs.update({ + "max_tokens": 128, + "temperature": 1.0, + "top_p": 1, + "presence_penalty": 1, + }) + + suffix = eval_file.split('.')[-1] + score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(score_file): + data = load(eval_file) + if model != 'exact_matching': + model = build_judge(system_prompt=sys_prompt, **judge_kwargs) + else: + model = None + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + _ = track_progress_rich( + evaluate_tempcompass_captioning, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for idx, item in data.iterrows(): + data.loc[idx, 'score'] = ans[idx]['rating'] + dump(data, score_file) + + rating = get_dimension_rating(score_file) + return rating + + +class TempCompass_YorN(VideoBaseDataset): + + MD5 = 'c72c046d7fa0e82c8cd7462f2e844ea8' + TYPE = 'Video-Y/N' + + def __init__(self, dataset='TempCompass_YorN', nframe=0, fps=-1): + self.type_data_list = { + 'yes_no': ('yes_no.json', './videos', '.mp4'), + } + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + + @classmethod + def supported_datasets(cls): + return ['TempCompass_YorN'] + + def prepare_dataset(self, dataset_name='TempCompass_YorN', repo_id='lmms-lab/TempCompass'): + def check_integrity(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + + if not osp.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + + data = load(data_file) + for idx, item in data.iterrows(): + if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])): + return False + return True + + cache_path = get_cache_path(repo_id) + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + def read_parquet(pth): + import pandas as pd + for task_name in self.type_data_list.keys(): + if not osp.exists(osp.join(pth, f'{task_name}.json')): + data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet')) + data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', 
lines=False) + + def unzip_videos(pth): + import zipfile + if not osp.exists(osp.join(pth, 'videos')): + zip_file = osp.join(pth, 'tempcompass_videos.zip') + with zipfile.ZipFile(zip_file, 'r') as zip_ref: + zip_ref.extractall(pth) + + def generate_tsv(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + if osp.exists(data_file) and md5(data_file) == self.MD5: + return + self.data_list = [] + for k, v in self.type_data_list.items(): + with open(osp.join(pth, v[0]), 'r') as f: + json_data = json.load(f) + for data in json_data: + self.data_list.append({ + 'task_type': k, + 'prefix': v[1], + 'suffix': v[2], + 'video': data['video_id'], + 'question': data['question'].split('\n')[0], + 'answer': data['answer'], + 'dim': data['dim'] + }) + + data_df = pd.DataFrame(self.data_list) + data_df = data_df.assign(index=range(len(data_df))) + data_df.to_csv(data_file, sep='\t', index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') + read_parquet(dataset_path) + unzip_videos(dataset_path) + generate_tsv(dataset_path) + + data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + return dict(root=dataset_path, data_file=data_file) + + def qa_template(self, data): + question = data['question'] + answer = data['answer'] + return question, answer + + def save_video_frames(self, line): + vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix']) + vid = decord.VideoReader(vid_path) + video_info = { + 'fps': vid.get_avg_fps(), + 'n_frames': len(vid), + } + if self.nframe > 0 and self.fps < 0: + step_size = len(vid) / (self.nframe + 1) + indices = [int(i * step_size) for i in range(1, self.nframe + 1)] + frame_paths = self.frame_paths(line['video']) + elif self.fps > 0: + # not constrained by num_frames, get frames by fps + total_duration = video_info['n_frames'] / video_info['fps'] + required_frames = int(total_duration * self.fps) + step_size = video_info['fps'] / self.fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(line['video'], len(indices)) + + flag = np.all([osp.exists(p) for p in frame_paths]) + + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths + + def save_video_into_images(self, line): + frame_paths = self.save_video_frames(line) + return frame_paths + + def build_prompt(self, line, video_llm): + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + question, answer = self.qa_template(line) + message = [] + message.append(dict(type='text', value=question)) + video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix']) + if video_llm: + message.append(dict(type='video', value=video_path)) + else: + img_frame_paths = self.save_video_into_images(line) + for im in img_frame_paths: + message.append(dict(type='image', value=im)) + message.append(dict(type='text', value='\nPlease answer yes or no:')) + return message + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + model = judge_kwargs.get('model', 'exact_matching') + assert model in ['chatgpt-1106', 'exact_matching'] + judge_kwargs.update({ + "max_tokens": 128, + "temperature": 1.0, + "top_p": 1, + "presence_penalty": 1, + }) + + suffix = 
eval_file.split('.')[-1] + score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(score_file): + data = load(eval_file) + if model != 'exact_matching': + model = build_judge(system_prompt=sys_prompt, **judge_kwargs) + else: + model = None + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + _ = track_progress_rich( + evaluate_tempcompass_YorN, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for idx, item in data.iterrows(): + data.loc[idx, 'score'] = ans[idx]['rating'] + dump(data, score_file) + + rating = get_dimension_rating(score_file) + return rating diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py index 1ade2c4..ecad312 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/__init__.py @@ -5,5 +5,5 @@ from .vqa_eval import levenshtein_distance __all__ = [ 'build_judge', 'extract_answer_from_item', 'prefetch_answer', - 'levenshtein_distance', 'DEBUG_MESSAGE' + 'levenshtein_distance', 'DEBUG_MESSAGE', ] diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/README.md b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/README.md new file mode 100644 index 0000000..99572ef --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/README.md @@ -0,0 +1,59 @@ +# CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy + +## Introduction + +Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information. + +## Running Scripts + +Once the environment is ready, execute the following script from the root directory of VLMEvalKit +to perform inference and evaluation tasks in batch. 
+ +```shell +MODEL_NAME="QwenVLMax" +OUTPUT_DIR="/your/path/to/output_dir" + +SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_scene_ocr +python run.py --data CCOCR_MultiSceneOcr_Cord CCOCR_MultiSceneOcr_Funsd CCOCR_MultiSceneOcr_Iam CCOCR_MultiSceneOcr_ZhDoc CCOCR_MultiSceneOcr_ZhHandwriting CCOCR_MultiSceneOcr_Hieragent CCOCR_MultiSceneOcr_Ic15 CCOCR_MultiSceneOcr_Inversetext CCOCR_MultiSceneOcr_Totaltext CCOCR_MultiSceneOcr_ZhScene CCOCR_MultiSceneOcr_UgcLaion CCOCR_MultiSceneOcr_ZhDense CCOCR_MultiSceneOcr_ZhVertical --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose +python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR} + +SUB_OUTPUT_DIR=${OUTPUT_DIR}/multi_lan_ocr +python run.py --data CCOCR_MultiLanOcr_Arabic CCOCR_MultiLanOcr_French CCOCR_MultiLanOcr_German CCOCR_MultiLanOcr_Italian CCOCR_MultiLanOcr_Japanese CCOCR_MultiLanOcr_Korean CCOCR_MultiLanOcr_Portuguese CCOCR_MultiLanOcr_Russian CCOCR_MultiLanOcr_Spanish CCOCR_MultiLanOcr_Vietnamese --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose +python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR} + +SUB_OUTPUT_DIR=${OUTPUT_DIR}/doc_parsing +python run.py --data CCOCR_DocParsing_DocPhotoChn CCOCR_DocParsing_DocPhotoEng CCOCR_DocParsing_DocScanChn CCOCR_DocParsing_DocScanEng CCOCR_DocParsing_TablePhotoChn CCOCR_DocParsing_TablePhotoEng CCOCR_DocParsing_TableScanChn CCOCR_DocParsing_TableScanEng CCOCR_DocParsing_MolecularHandwriting CCOCR_DocParsing_FormulaHandwriting --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose +python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR} + +SUB_OUTPUT_DIR=${OUTPUT_DIR}/kie +python run.py --data CCOCR_Kie_Sroie2019Word CCOCR_Kie_Cord CCOCR_Kie_EphoieScut CCOCR_Kie_Poie CCOCR_Kie_ColdSibr CCOCR_Kie_ColdCell --model ${MODEL_NAME} --work-dir ${SUB_OUTPUT_DIR} --verbose +python vlmeval/dataset/utils/ccocr_evaluator/common.py ${SUB_OUTPUT_DIR} +``` + +## Example Output +The evaluation results will be saved in `${SUB_OUTPUT_DIR}/summary.md`. For example, for the KIE subset, +the output is as follows: + +| exp_name(f1_score) | COLD_CELL | COLD_SIBR | CORD | EPHOIE_SCUT | POIE | sroie2019_word | summary | +|:-------------------|------------:|------------:|-------:|--------------:|-------:|-----------------:|----------:| +| QwenVLMax | 81.01 | 72.46 | 69.33 | 71.2 | 60.85 | 76.37 | 71.87 | + + +## Citation +If you find our work helpful, feel free to give us a cite. 
+ +``` +@misc{yang2024ccocr, + title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy}, + author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin}, + year={2024}, + eprint={2412.02210}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2412.02210}, +} +``` + +## Contact Us + +If you have any questions, feel free to send an email to: wpf272043@alibaba-inc.com or xixing.tj@alibaba-inc.com diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/__init__.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/__init__.py new file mode 100644 index 0000000..d89f6f6 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/__init__.py @@ -0,0 +1,12 @@ +from .kie_evaluator import KieEvaluator +from .doc_parsing_evaluator import ParsingEvaluator +from .ocr_evaluator import OcrEvaluator +from .common import summary + + +evaluator_map_info = { + "kie": KieEvaluator("kie"), + "doc_parsing": ParsingEvaluator("doc_parsing"), + "multi_lan_ocr": OcrEvaluator("multi_lan_ocr"), + "multi_scene_ocr": OcrEvaluator("multi_scene_ocr") +} diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/common.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/common.py new file mode 100644 index 0000000..6ce9bcb --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/common.py @@ -0,0 +1,222 @@ +import os +import json +import time +import sys +from abc import abstractmethod +from tabulate import tabulate + + +def pick_response_text(json_path): + """ + """ + try: + with open(json_path, "r") as f: + json_data = json.load(f) + except Exception as e: + print("--> file error: msg: {}, path: {}".format(e, json_path)) + return None + + for required_key in ["model_name", "response"]: + if required_key not in json_data: + print("--> required key not exists, name: {}, path: {}".format(required_key, json_path)) + return None + + model_name = json_data["model_name"] + model_response = json_data["response"] + + response_text = None + if model_name.startswith("gpt") or model_name.startswith("o1"): + response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501 + elif model_name.startswith("local_"): + response_text = model_response + else: + if model_name.startswith("claude"): + content_list = model_response.get("content", None) + elif model_name.startswith("gemini"): + content_list = model_response.get("candidates", [{}])[0].get("content", {}).get("parts", None) + elif model_name.startswith("qwen"): + content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501 + else: + raise NotImplementedError("The pick_response_text NOT implemented for model: {}".format(model_name)) + + if isinstance(content_list, list) and len(content_list) > 0: + response_text = content_list[0].get("text", None) + + if response_text is None: + print("--> [error][{}] text pick error, path: {}".format(model_name, json_path)) + return response_text + + +def load_response_from_dir(res_dir): + """ + """ + response_info = {} + for file_name in os.listdir(res_dir): + file_path = os.path.abspath(os.path.join(res_dir, file_name)) + if not file_name.endswith(".json"): + print("--> skip: result file should be a json: but got: 
{}".format(file_path)) + continue + + response_text = pick_response_text(file_path) + if response_text is None: + continue + + file_name_wo_ext, ext = os.path.splitext(file_name) + response_info[file_name_wo_ext] = response_text + return response_info + + +class BaseMetric(object): + """ BaseMetric """ + """ OCRMetric """ + def __init__(self, group_name, **kwargs): + self.group_name = group_name + self.kwargs = kwargs + + def response_post_func(self, response_text, **kwargs): + return response_text + + @abstractmethod + # Given the prediction and gt, return the evaluation results in the format of a dictionary + # results should contain a 'summary' key, for example: + # { + # "summary": { + # "f1-score": 99.99, + # "metric_name": "metric_value" # used for summary,only metric info could be placed in this dict. + # }, + # "your other info": "xxx" + # } + def evaluate(self, response_info, gt_info, normalize_func=None, **kwargs): + pass + + def __call__(self, pdt_res_dir, gt_info, with_response_ratio=True, **kwargs): + if isinstance(pdt_res_dir, dict): + raw_response_info = pdt_res_dir + elif os.path.exists(pdt_res_dir) and os.path.isdir(pdt_res_dir): + raw_response_info = load_response_from_dir(pdt_res_dir) + else: + return ValueError("invalid input: response dict or folder are required, but got {}".format(pdt_res_dir)) + + post_error_list, response_info = [], {} + response_error_list = list(gt_info.keys() - raw_response_info.keys()) + for file_name, single_pdt_str in raw_response_info.items(): + single_pdt_str = self.response_post_func(single_pdt_str, **kwargs) + if single_pdt_str is None: + post_error_list.append(file_name) + continue + response_info[file_name] = single_pdt_str + + meta_info = { + "gt_total_num": len(gt_info), "pdt_total_num": len(response_info), + "post_error_list": post_error_list, "response_error_list": response_error_list, + } + eval_info = self.evaluate(response_info, gt_info, **kwargs) + + # add response_success_ratio + if "summary" in eval_info and with_response_ratio: + success_ratio = (len(response_info) + len(post_error_list)) / (len(gt_info) + 1e-9) + eval_info["summary"].update({"response_success_ratio": success_ratio}) + return meta_info, eval_info + + +def summary(index_path, exp_dir_base, is_weighted_sum=False): + """ + """ + with open(index_path, "r") as f: + data_list = json.load(f) + + all_data_info = {} + for data_info_item in data_list: + data_name = data_info_item["dataset"] + if not data_info_item.get("release", True): + continue + all_data_info[data_name] = data_info_item + dataset_list = list(all_data_info.keys()) + summary_path = summary_multi_exp(exp_dir_base, dataset_list, is_weighted_sum=is_weighted_sum) + return summary_path + + +def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False): + """ + """ + if dataset_list is None: + all_dataset_name = [] + for exp_name in os.listdir(exp_dir_base): + dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json") + if not os.path.exists(dir_status_path): + continue + with open(dir_status_path, "r") as f: + data_status_info = json.load(f) + all_dataset_name.extend(data_status_info.keys()) + dataset_list = sorted(set(all_dataset_name)) + + # summary main code + all_evaluate_info, _ = {}, 0 + for exp_name in os.listdir(exp_dir_base): + dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json") + if not os.path.exists(dir_status_path): + print("--> skip: status.json not exist: {}".format(dir_status_path)) + continue + + with open(dir_status_path, "r") as f: + 
all_status_info = json.load(f) + + for data_name in dataset_list: + total_num = all_status_info.get(data_name, {}).get("config", {}).get("num", "-1") + summary_info = all_status_info.get(data_name, {}).get("evaluation", {}).get("summary", {}) + for metric_name, metric_value in summary_info.items(): + if metric_name not in all_evaluate_info: + all_evaluate_info[metric_name] = {} + if exp_name not in all_evaluate_info[metric_name]: + all_evaluate_info[metric_name][exp_name] = {} + all_evaluate_info[metric_name][exp_name][data_name] = (metric_value, total_num) + + all_table_md = [] + for metric_name, metric_info in all_evaluate_info.items(): + formatted_time = time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time())) + summary_line_list = [] + summary_key_name = "summary(weighted)" if is_weighted_sum else "summary" + summary_head = [f"exp_name({metric_name}_{formatted_time})"] + dataset_list + [summary_key_name] + for exp_name, data_eval_info in metric_info.items(): + summary_line = [exp_name, ] + + all_metric_value = 0 + is_summary_valid, all_total_num, all_weighted_metric = True, 0, 0 + for data_name in dataset_list: + metric_value, total_num = data_eval_info.get(data_name, ("-1", "-1")) + summary_line.append("{:.2f}".format(float(metric_value) * 100)) + if str(metric_value) == "-1" or str(metric_value) == "-1": + is_summary_valid = False + continue + + all_total_num += float(total_num) + all_weighted_metric += float(total_num) * float(metric_value) + all_metric_value += float(metric_value) + + summary_value_valid = ((all_weighted_metric / (all_total_num + 1e-9)) * 100) if is_weighted_sum \ + else (all_metric_value / (len(dataset_list) + 1e-9) * 100) + summary_value = "-" if not is_summary_valid else "{:.2f}".format(summary_value_valid) + summary_line.append(summary_value) + summary_line_list.append(summary_line) + + md_table_info = tabulate(summary_line_list, headers=summary_head, tablefmt='pipe') + all_table_md.append(md_table_info) + + print("\n\n".join(all_table_md)) + summary_path = os.path.abspath(os.path.join(exp_dir_base, "summary.md")) + with open(summary_path, "w") as f: + f.write("\n\n".join(all_table_md)) + return summary_path + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print("Usage: python {} exp_base_dir".format(__file__)) + exit(-1) + else: + print('--> info: {}'.format(sys.argv)) + exp_base_dir = sys.argv[1] + + summary_path = summary_multi_exp(exp_base_dir, dataset_list=None, is_weighted_sum=False) + print("--> info: summary saved at : {}".format(summary_path)) + print("happy coding.") diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py new file mode 100644 index 0000000..d059adc --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/doc_parsing_evaluator.py @@ -0,0 +1,256 @@ +import nltk +import re +from tqdm import tqdm +from collections import deque +from apted.helpers import Tree +from apted import APTED, Config + +# local import +from .common import BaseMetric + + +# 移除指定的LaTeX命令 +patterns = [ + r'\\documentclass\{.*?\}', + r'\\usepackage\[.*?\]\{.*?\}', + r'\\usepackage\{.*?\}', + r'\\geometry\{.*?\}', + r'\\begin\{document\}', + r'\\end\{document\}', + r'\\noindent' +] + + +class TableTree(Tree): + """ + # Copyright 2020 IBM + # Author: peter.zhong@au1.ibm.com + # License: Apache 2.0 License. 
+ """ + def __init__(self, tag, colspan=None, rowspan=None, content=None, *children): + self.tag = tag + self.colspan = colspan + self.rowspan = rowspan + self.content = content + self.children = list(children) + + def bracket(self): + """Show tree using brackets notation""" + if self.tag == "td": + result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % ( + self.tag, + self.colspan, + self.rowspan, + self.content, + ) + else: + result = '"tag": %s' % self.tag + for child in self.children: + result += child.bracket() + return "{{{}}}".format(result) + + +class CustomConfig(Config): + """ + # Copyright 2020 IBM + # Author: peter.zhong@au1.ibm.com + # License: Apache 2.0 License. + """ + def rename(self, node1, node2): + """Compares attributes of trees""" + # print(node1.tag) + if ( + (node1.tag != node2.tag) + or (node1.colspan != node2.colspan) + or (node1.rowspan != node2.rowspan) + ): + return 1.0 + if node1.tag == "td": + if node1.content or node2.content: + return nltk.edit_distance(node1.content, node2.content) / max(len(node1.content), len(node2.content)) + return 0.0 + + +class TEDS(object): + """Tree Edit Distance basead Similarity + # Copyright 2020 IBM + # Author: peter.zhong@au1.ibm.com + # License: Apache 2.0 License. + """ + def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None): + assert isinstance(n_jobs, int) and ( + n_jobs >= 1 + ), "n_jobs must be an integer greather than 1" + self.structure_only = structure_only + self.n_jobs = n_jobs + self.ignore_nodes = ignore_nodes + self.__tokens__ = [] + + def tokenize(self, node): + """Tokenizes table cells""" + self.__tokens__.append("<%s>" % node.tag) + if node.text is not None: + self.__tokens__ += list(node.text) + for n in node.getchildren(): + self.tokenize(n) + if node.tag != "unk": + self.__tokens__.append("" % node.tag) + if node.tag != "td" and node.tail is not None: + self.__tokens__ += list(node.tail) + + def load_html_tree(self, node, parent=None): + """Converts HTML tree to the format required by apted""" + global __tokens__ + if node.tag == "td": + if self.structure_only: + cell = [] + else: + self.__tokens__ = [] + self.tokenize(node) + cell = self.__tokens__[1:-1].copy() + new_node = TableTree( + node.tag, + int(node.attrib.get("colspan", "1")), + int(node.attrib.get("rowspan", "1")), + cell, + *deque(), + ) + else: + new_node = TableTree(node.tag, None, None, None, *deque()) + if parent is not None: + parent.children.append(new_node) + if node.tag != "td": + for n in node.getchildren(): + self.load_html_tree(n, new_node) + if parent is None: + return new_node + + def evaluate(self, pred, true): + """Computes TEDS score between the prediction and the ground truth of a + given sample + """ + # try_import("lxml") + from lxml import etree, html + if (not pred) or (not true): + return 0.0 + + parser = html.HTMLParser(remove_comments=True, encoding="utf-8") + pred = html.fromstring(pred, parser=parser) + true = html.fromstring(true, parser=parser) + if pred.xpath("body/table") and true.xpath("body/table"): + pred = pred.xpath("body/table")[0] + true = true.xpath("body/table")[0] + if self.ignore_nodes: + etree.strip_tags(pred, *self.ignore_nodes) + etree.strip_tags(true, *self.ignore_nodes) + n_nodes_pred = len(pred.xpath(".//*")) + n_nodes_true = len(true.xpath(".//*")) + n_nodes = max(n_nodes_pred, n_nodes_true) + tree_pred = self.load_html_tree(pred) + tree_true = self.load_html_tree(true) + distance = APTED( + tree_pred, tree_true, CustomConfig() + ).compute_edit_distance() + return 1.0 - 
(float(distance) / n_nodes) + else: + return 0.0 + + +class ParsingEvaluator(BaseMetric): + def response_post_func(self, response_text, **kwargs): + return response_text + + def evaluate(self, response_info, gt_info, **kwargs): + op = kwargs['op'] + if op == 'doc': + score = self.eval_doc(response_info, gt_info) + elif op == 'table': + score = self.eval_table(response_info, gt_info) + elif op in ['molecular', "formula"]: + score = self.eval_formula(response_info, gt_info, op_name=op) + else: + raise ValueError(f'doc parsing unsupported op: {op}') + + # summary info + eval_info = {"summary": {"score": score}} + return eval_info + + def eval_doc(self, response_info, gt_info): + results = [] + for img_name, gt in tqdm(gt_info.items()): + if img_name not in response_info: + results.append(0) + continue + + pred = response_info[img_name] + for pattern in patterns: + pred = re.sub(pattern, '', pred) + + try: + pred = pred.split('```')[1] + except: + pass + + pred = pred.replace('```latex', '') + pred = pred.replace('```', '') + + pred = pred.replace(' ', '').replace('\n', '') + gt = gt.replace(' ', '').replace('\n', '') + + edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt)) + results.append(1 - edit_dist) + + score = sum(results) / len(results) + return score + + def eval_table(self, response_info, gt_info): + teds = TEDS(structure_only=False, n_jobs=1) + results = [] + for img_name, gt in tqdm(gt_info.items()): + if img_name not in response_info: + results.append(0) + continue + + pred = response_info[img_name] + for pattern in patterns: + pred = re.sub(pattern, '', pred) + + try: + pred = pred.split('```html')[1] + except: + pass + + pred = pred.replace('```', '') + pred = pred.replace(' ', '').replace('\n', '').replace(',', ',') + gt = gt.replace(' ', '').replace('\n', '') + + pred_html = '{}'.format(pred) + gt_html = '{}'.format(gt) + results.append(teds.evaluate(pred_html, gt_html)) + + score = sum(results) / len(results) + return score + + def eval_formula(self, response_info, gt_info, op_name='formula'): + results = [] + for img_name, gt in tqdm(gt_info.items()): + if img_name not in response_info: + results.append(0) + continue + + pred = response_info[img_name] + + if op_name == 'formula': + pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "") # noqa: E501 + gt = gt.replace(" ", "") + elif op_name == 'molecular': + pred = pred.replace("\n", "").replace(" ", "").replace("", "").replace("", "") + gt = gt.replace(" ", "") + edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt)) + results.append(1 - edit_dist) + score = sum(results) / len(results) + return score + + +if __name__ == '__main__': + pass diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py new file mode 100644 index 0000000..797d424 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py @@ -0,0 +1,385 @@ + +""" +Donut +Copyright (c) 2022-present NAVER Corp. 
+MIT License +""" +import json +import os +import sys +import re +import time +from typing import Any, Dict, List, Tuple, Union + +import zss +from zss import Node +from collections import Counter +from nltk import edit_distance + +# local import +from .common import BaseMetric + + +def flatten(data: dict): + """ + Convert Dictionary into Non-nested Dictionary + Example: + input(dict) + { + "menu": [ + {"name" : ["cake"], "count" : ["2"]}, + {"name" : ["juice"], "count" : ["1"]}, + ] + } + output(list) + [ + ("menu.name", "cake"), + ("menu.count", "2"), + ("menu.name", "juice"), + ("menu.count", "1"), + ] + """ + flatten_data = list() + + def _flatten(value, key=""): + if type(value) is dict: + for child_key, child_value in value.items(): + _flatten(child_value, f"{key}.{child_key}" if key else child_key) + elif type(value) is list: + for value_item in value: + _flatten(value_item, key) + else: + flatten_data.append((key, value)) + + _flatten(data) + return flatten_data + + +def update_cost(node1: Node, node2: Node): + """ + Update cost for tree edit distance. + If both are leaf node, calculate string edit distance between two labels (special token '' will be ignored). + If one of them is leaf node, cost is length of string in leaf node + 1. + If neither are leaf node, cost is 0 if label1 is same with label2 othewise 1 + """ + label1 = node1.label + label2 = node2.label + label1_leaf = "" in label1 + label2_leaf = "" in label2 + if label1_leaf and label2_leaf: + return edit_distance(label1.replace("", ""), label2.replace("", "")) + elif not label1_leaf and label2_leaf: + return 1 + len(label2.replace("", "")) + elif label1_leaf and not label2_leaf: + return 1 + len(label1.replace("", "")) + else: + return int(label1 != label2) + + +def insert_and_remove_cost(node: Node): + """ + Insert and remove cost for tree edit distance. + If leaf node, cost is length of label name. 
+ Otherwise, 1 + """ + label = node.label + if "" in label: + return len(label.replace("", "")) + else: + return 1 + + +def normalize_dict(data: Union[Dict, List, Any]): + """ + Sort by value, while iterate over element if data is list + """ + # if not data: + # return {} + + if isinstance(data, dict): + new_data = dict() + for key in sorted(data.keys(), key=lambda k: (len(k), k)): + value = normalize_dict(data[key]) + if value: + if not isinstance(value, list): + value = [value] + new_data[key] = value + + elif isinstance(data, list): + if all(isinstance(item, dict) for item in data): + new_data = [] + for item in data: + item = normalize_dict(item) + if item: + new_data.append(item) + else: + new_data = [str(item).strip() for item in data if type(item) in {str, int, float} and str(item).strip()] + else: + new_data = [str(data).strip()] + return new_data + + +def cal_f1_all(preds, answers): + """ + Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, + false negatives and false positives + """ + metric_info, error_info = {}, {} + total_tp, total_fn_or_fp = 0, 0 + for file_name, answer in answers.items(): + sample_error_info = {"fp": [], "fn": [], "tp": []} + pred = preds.get(file_name, {}) + pred, answer = flatten(normalize_dict(pred)), flatten(normalize_dict(answer)) + for field in pred: + field_name = field[0] + if field_name not in metric_info: + metric_info[field_name] = {"total_tp": 0, "total_fn_or_fp": 0} + if field in answer: + total_tp += 1 + metric_info[field_name]["total_tp"] += 1 + sample_error_info["tp"].append(field) + answer.remove(field) + else: + total_fn_or_fp += 1 + metric_info[field_name]["total_fn_or_fp"] += 1 + sample_error_info["fp"].append(field) + + total_fn_or_fp += len(answer) + for field in answer: + field_name = field[0] + if field_name not in metric_info: + metric_info[field_name] = {"total_tp": 0, "total_fn_or_fp": 0} + metric_info[field_name]["total_fn_or_fp"] += 1 + sample_error_info["fn"].append(field) + + sample_error_num = sum([len(v) for k, v in sample_error_info.items() if k != "tp"]) + if sample_error_num > 0: + sample_error_info["error_num"] = sample_error_num + error_class_list = ["counter_" + x[0] for x in (sample_error_info["fn"] + sample_error_info["fp"])] + counter = Counter(error_class_list) + sample_error_info["error_info"] = dict(counter) + error_info[file_name] = sample_error_info + + # summary + for field_name, field_info in metric_info.items(): + field_tp, field_fn_or_fp = field_info["total_tp"], field_info["total_fn_or_fp"] + metric_info[field_name]["acc"] = field_tp / (field_tp + field_fn_or_fp / 2 + 1e-6) + + print("donut_evaluator: total_tp: {}, total_fn_or_fp: {}, ptd_num: {}, gt_num: {}".format(total_tp, total_fn_or_fp, + len(preds), len(answers))) + error_info = {k: v for k, v in + sorted(error_info.items(), key=lambda item: item[1].get("error_num", 0), reverse=True)} + metric_info = {k: v for k, v in + sorted(metric_info.items(), key=lambda item: item[1].get("total_fn_or_fp", 0), reverse=True)} + return total_tp / (total_tp + total_fn_or_fp / 2 + 1e-6), metric_info, error_info + + +def construct_tree_from_dict(data: Union[Dict, List], node_name: str = None): + """ + Convert Dictionary into Tree + + Example: + input(dict) + + { + "menu": [ + {"name" : ["cake"], "count" : ["2"]}, + {"name" : ["juice"], "count" : ["1"]}, + ] + } + + output(tree) + + | + menu + / \ + + / | | \ + name count name count + / | | \ + cake 2 juice 1 + """ + if node_name is None: + node_name = "" + + node = 
Node(node_name) + + if isinstance(data, dict): + for key, value in data.items(): + kid_node = construct_tree_from_dict(value, key) + node.addkid(kid_node) + elif isinstance(data, list): + if all(isinstance(item, dict) for item in data): + for item in data: + kid_node = construct_tree_from_dict( + item, + "", + ) + node.addkid(kid_node) + else: + for item in data: + node.addkid(Node(f"{item}")) + else: + raise Exception(data, node_name) + return node + + +def cal_acc(pred: dict, answer: dict): + """ + Calculate normalized tree edit distance(nTED) based accuracy. + 1) Construct tree from dict, + 2) Get tree distance with insert/remove/update cost, + 3) Divide distance with GT tree size (i.e., nTED), + 4) Calculate nTED based accuracy. (= max(1 - nTED, 0 ). + """ + pred = construct_tree_from_dict(normalize_dict(pred)) + answer = construct_tree_from_dict(normalize_dict(answer)) + val1 = zss.distance( + pred, + answer, + get_children=zss.Node.get_children, + insert_cost=insert_and_remove_cost, + remove_cost=insert_and_remove_cost, + update_cost=update_cost, + return_operations=False, + ) + val2 = zss.distance( + construct_tree_from_dict(normalize_dict({})), + answer, + get_children=zss.Node.get_children, + insert_cost=insert_and_remove_cost, + remove_cost=insert_and_remove_cost, + update_cost=update_cost, + return_operations=False, + ) + return max(0, 1 - val1 / val2) + + +def cal_acc_all(pred_info, answer_info): + acc_info, error_info = {}, {} + for file_name, answer in answer_info.items(): + # if file_name not in pred_info: + # print("---> error: pdt not found: {}".format(file_name)) + # continue + pred = pred_info.get(file_name, {}) + acc = cal_acc(pred, answer) + acc_info[file_name] = acc + if acc < 1.0: + error_info[file_name] = {"acc": acc, "pred": pred, "answer": answer} + + error_info = {k: v for k, v in sorted(error_info.items(), key=lambda item: item[1].get("acc", 0))} + acc_averge = sum(list(acc_info.values())) / (len(acc_info) + 1e-6) + return acc_averge, error_info + + +def normalize_values_of_nested_dict(d, normalize_func): + """ + """ + if isinstance(d, dict): + return {k: normalize_values_of_nested_dict(v, normalize_func) for k, v in d.items()} + elif isinstance(d, list): + return [normalize_values_of_nested_dict(x, normalize_func) if isinstance(x, dict) else x for x in d] + elif isinstance(d, str): + return normalize_func(d) + else: + return d + + +def eval_donut(pdt_info, gt_info, normalize_func=None, data_name=None): + """ + """ + if normalize_func is not None: + print("--> info: normalize_func executed.") + pdt_info = normalize_values_of_nested_dict(pdt_info, normalize_func) + gt_info = normalize_values_of_nested_dict(gt_info, normalize_func) + + f1_score, class_eval_info, error_info = cal_f1_all(pdt_info, gt_info) + acc_average, acc_error_info = cal_acc_all(pdt_info, gt_info) + eval_info = {"f1_score": f1_score, "acc": acc_average, "class_f1_score": class_eval_info, + "f1_error_info": error_info, "acc_error_info": acc_error_info} + print(data_name, "f1_score", f1_score, "acc", acc_average) + return eval_info + + +def post_process_to_json(qwen_info_str, file_name=None): + try: + if "```json" in qwen_info_str: + if "```" not in qwen_info_str: + qwen_info_str += "```" + qwen_info_group = re.search(r'```json(.*?)```', qwen_info_str, re.DOTALL) + json_str = qwen_info_group.group(1).strip().replace("\n", "") + else: + json_str = qwen_info_str.strip().replace("\n", "") + json_data = json.loads(json_str) + return json_data + except Exception as err: # noqa: F841 + return None + + 
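+# A minimal usage sketch for post_process_to_json above (illustrative strings, not benchmark data):
+#
+#   >>> post_process_to_json('```json\n{"company": "ACME", "total": "12.50"}\n```')
+#   {'company': 'ACME', 'total': '12.50'}
+#   >>> post_process_to_json('no JSON fence here') is None
+#   True
+#
+# Replies without a parsable JSON payload return None; BaseMetric.__call__ in common.py places a
+# None result from response_post_func into post_error_list and excludes it from the predictions
+# passed to evaluate().
+
+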
+def fullwidth_to_halfwidth(text): + # 全角转半角 + result = '' + for char in text: + code_point = ord(char) + # 全角空格直接转化 + if code_point == 0x3000: + code_point = 0x0020 + # 其他全角字符(除空格)转换为半角 + elif 0xFF01 <= code_point <= 0xFF5E: + code_point -= 0xFEE0 + result += chr(code_point) + result = result.replace("、", ",") + return result + + +def remove_unnecessary_spaces(text): + # 去掉中文字符之间的空格 + text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text) + # 去掉中文和英文、数字之间的空格 + text = re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[a-zA-Z0-9])', '', text) + text = re.sub(r'(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fff])', '', text) + # 去掉符号前的不必要空格,保留符号后的一个空格 + text = re.sub(r'(? 0] + return text_token_normalized + + +def evaluate_single_sample(gts, preds): + right_num = 0 + gt_counter_info = dict(Counter(gts)) + pdt_counter_info = dict(Counter(preds)) + for gt_token, gt_count in gt_counter_info.items(): + pred_count = pdt_counter_info.get(gt_token, 0) + right_num += min(gt_count, pred_count) + return right_num + + +def calculate_metrics(response_info, gt_info, is_verbose=False): + """ + """ + macro_recall_list, macro_precision_list, macro_f1_list = [], [], [] + total_gt_num, total_pred_num, total_right_num = 0, 0, 0 + for file_name, fullbox_gts in gt_info.items(): + fullbox_preds = response_info.get(file_name, []) + right_num = evaluate_single_sample(fullbox_gts, fullbox_preds) + total_right_num += right_num + total_gt_num += len(fullbox_gts) + total_pred_num += len(fullbox_preds) + + macro_recall = right_num / (len(fullbox_gts) + 1e-9) + macro_precision = right_num / (len(fullbox_preds) + 1e-9) + macro_f1 = 2 * macro_recall * macro_precision / (macro_recall + macro_precision + 1e-9) + macro_recall_list.append(macro_recall) + macro_precision_list.append(macro_precision) + macro_f1_list.append(macro_f1) + + # marco + final_macro_recall = sum(macro_recall_list) / (len(macro_recall_list) + 1e-9) + final_macro_precision = sum(macro_precision_list) / (len(macro_precision_list) + 1e-9) + final_macro_f1 = sum(macro_f1_list) / (len(macro_f1_list) + 1e-9) + + # micro + recall_acc = total_right_num / (total_gt_num + 1e-9) + preci_acc = total_right_num / (total_pred_num + 1e-9) + hmean = 2 * recall_acc * preci_acc / (recall_acc + preci_acc + 1e-9) + vbs_eval_result = { + 'macro_recall': final_macro_recall, 'macro_precision': final_macro_precision, 'macro_f1_score': final_macro_f1, + 'micro_recall': recall_acc, 'micro_precision': preci_acc, 'mirco_f1_score': hmean + } + eval_result = vbs_eval_result if is_verbose else {'macro_f1_score': final_macro_f1, 'mirco_f1_score': hmean} + return eval_result + + +class OcrEvaluator(BaseMetric): + def response_post_func(self, response_text, **kwargs): + return response_text + + def evaluate(self, response_info, gt_info, **kwargs): + # hard code here + dataset_name = kwargs['dataset'] + is_word_level, is_lower, is_alphanum_only = True, True, False + if dataset_name in ["Arabic", "Japanese", "Korean"] or "zh" in dataset_name: + is_word_level = False + if "multi_scene_ocr" in self.group_name and is_word_level: + is_alphanum_only = True + eval_config = {"word_level": is_word_level, "alphanum_only": is_alphanum_only, "lowercase": is_lower} + + image_pdt_info, image_gt_info = {}, {} + for file_name, gt_src in gt_info.items(): + pred_src = response_info.get(file_name, "") + pdt_token_list = text_normalize_and_tokenize( + str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only) + gt_token_list = text_normalize_and_tokenize( + str(gt_src).strip(), is_word_level, is_lower, 
is_alphanum_only) + image_pdt_info[file_name] = pdt_token_list + image_gt_info[file_name] = gt_token_list + eval_result = calculate_metrics(image_pdt_info, image_gt_info, is_verbose=False) + return {"summary": eval_result, "metric_config": eval_config} + + +if __name__ == '__main__': + pass diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/cgbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/cgbench.py new file mode 100644 index 0000000..eaf643b --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/cgbench.py @@ -0,0 +1,682 @@ +from ...smp import * +from .multiple_choice import extract_answer_from_item +import pandas as pd +import numpy as np +import re + +FAIL_MSG = "Failed to obtain answer via API." + +frame_tmpl = "frame-{}-of-{}.jpg" + +sys_prompt_open_eval_step_1 = ( + "You will be provided with a question, a model's prediction, and the ground " + "truth answer for this question.\n" + "Your task is to judge whether the model's prediction is correct based on the " + "meaning of the two texts.\n" + "In most cases, this can be done by determining if the meaning of the model's " + "prediction is consistent with, or contains, the ground truth answer. However, " + "in some cases where the two texts differ, it may represent different " + "descriptions of the same visual scene, in which case visual information is " + "needed for further judgment.\n" + "Therefore, I hope you:\n" + "- Output 0, if the model's prediction and the ground truth answer are neither " + "consistent nor related by inclusion, with fundamentally different meanings.\n" + "- Output 1, if the meaning of the model's prediction and the ground truth " + "answer is consistent, or if the model's prediction meaningfully contains the " + "ground truth answer.\n" + "- Output 2, if the model's prediction and ground truth are not consistent or " + "inclusive, but may be different descriptions of the same visual scene, " + "requiring visual information for further judgment.\n" + "Only output the answer in the following format:\n\n" + '```json\n{"result": choice}\n```\n\n' + "The choice is either 0, 1, or 2 as specified above." +) + +sys_prompt_open_eval_step_2 = ( + "You will be provided with a question, a model's prediction, and the sampling " + "frames of the clue intervals related to this question.\n" + "Your task is to determine whether the model has answered the question " + "correctly based on the visual information provided.\n" + "Therefore, I hope you:\n" + "- Output 0, if the model's prediction does not correctly answer the question.\n" + "- Output 1, if the model's prediction correctly answers the question.\n" + "Only output the answer in the following format without output extra " + "explanation:\n\n" + '```json\n{"result": choice}\n```\n\n' + "The choice is either 0 or 1 as specified above." +) + +FAIL_MSG = "Failed to obtain answer via API." 
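+
+# Note (illustrative, based on the two judge prompts above): in both judging steps the judge model
+# is expected to reply with a fenced JSON object, e.g.
+#
+#   ```json
+#   {"result": 1}
+#   ```
+#
+# post_process_eval_open() below extracts that integer; a reply without such a fence falls back to
+# the first bare 0/1/2 (step 1) or 0/1 (step 2) found in the text, and anything else (including
+# malformed JSON inside the fence) is returned as -1.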
+ +# '10-20', '20-30', '30-40', '40-50', '50-60' +DURATIONS = ["0 ~ 10", "10 ~ 20", "20 ~ 30", "30 ~ 40", "40 ~ 50", "50 ~ 60", "60+"] + +DOMAINS = [ + "Life Record", + "Music & TV show", + "Instruction & Knowledge", + "Driving", + "Embodied Expert", + "Humor/funny", + "Electonic/Social Gaming", + "Security & Health", + "Sports & Exercise", + "Special Scenes", + "Art & Culture", + "GUI", + "News", + "Animal & Pet", +] + +SUB_CATEGORIES = [ + "Time Cognition", + "Hallucination", + "Entity Perception", + "2D Spatial Perception", + "Time Perception", + "Scene Perception", + "Text Perception", + "Event Cognition", + "Entity Cognition", + "Text Cognition", + "Event Perception", + "Scene Cognition", +] + + +def get_dimention_rating_open_ended(data_path): + # 读取数据 + df = load(data_path) + + df = df[df["score"] != -1] + + # 将秒转换为分钟并分配到对应区间 + df["duration_minutes"] = df["duration"] / 60 + df["duration_range"] = pd.cut( + df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS + ) + + # 初始化结果字典 + result = { + "overall": 0, + "duration": {k: 0 for k in DURATIONS}, + "domain": {k: 0 for k in DOMAINS}, + "sub_category": {k: 0 for k in SUB_CATEGORIES}, + } + + # Overall + result["overall"] = round(df["score"].mean(), 4) + + # Duration + for dur in DURATIONS: + dur_scores = df[df["duration_range"] == dur]["score"] + result["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0 + + # Domain + for domain in DOMAINS: + domain_scores = df[df["domain"] == domain]["score"] + result["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0 + + # Sub-category + for sub_cat in SUB_CATEGORIES: + sub_cat_scores = df[df["sub_category"] == sub_cat]["score"] + result["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0 + + return result + + +def get_dimention_rating_mcq_grouding(data_path): + + # 读取数据 + df = load(data_path) + + # df.loc[(df['task_mode'] == 'miou') & (df['score'] == -1), 'score'] = 0 + + df = df[df["score"] != -1] + + # 将秒转换为分钟并分配到对应区间 + df["duration_minutes"] = df["duration"] / 60 + df["duration_range"] = pd.cut( + df["duration_minutes"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, np.inf], labels=DURATIONS + ) + + # 初始化结果字典 + result = { + metric: { + "overall": 0, + "duration": {k: 0 for k in DURATIONS}, + "domain": {k: 0 for k in DOMAINS}, + "sub_category": {k: 0 for k in SUB_CATEGORIES}, + } + for metric in ["long_acc", "clue_acc", "miou", "CRR", "acc@iou", "rec@iou"] + } + + # 计算基础指标 + for metric in ["long_acc", "clue_acc", "miou"]: + metric_df = df[df["task_mode"] == metric] + + # Overall + result[metric]["overall"] = round(metric_df["score"].mean(), 4) + + # Duration + for dur in DURATIONS: + dur_scores = metric_df[metric_df["duration_range"] == dur]["score"] + result[metric]["duration"][dur] = round(dur_scores.mean(), 4) if not dur_scores.empty else 0 + + # Domain + for domain in DOMAINS: + domain_scores = metric_df[metric_df["domain"] == domain]["score"] + result[metric]["domain"][domain] = round(domain_scores.mean(), 4) if not domain_scores.empty else 0 + + # Sub-category + for sub_cat in SUB_CATEGORIES: + sub_cat_scores = metric_df[metric_df["sub_category"] == sub_cat]["score"] + result[metric]["sub_category"][sub_cat] = round(sub_cat_scores.mean(), 4) if not sub_cat_scores.empty else 0 + + # 计算复合指标 CRR + def calculate_crr(scores): + long_acc = scores[scores["task_mode"] == "long_acc"]["score"].mean() + clue_acc = scores[scores["task_mode"] == "clue_acc"]["score"].mean() + 
return round(min(long_acc, clue_acc) / clue_acc, 4) if clue_acc != 0 else 0 + + # Overall CRR + result["CRR"]["overall"] = calculate_crr(df) + + # Duration CRR + for dur in DURATIONS: + dur_df = df[df["duration_range"] == dur] + result["CRR"]["duration"][dur] = calculate_crr(dur_df) + + # Domain CRR + for domain in DOMAINS: + domain_df = df[df["domain"] == domain] + result["CRR"]["domain"][domain] = calculate_crr(domain_df) + + # Sub-category CRR + for sub_cat in SUB_CATEGORIES: + sub_cat_df = df[df["sub_category"] == sub_cat] + result["CRR"]["sub_category"][sub_cat] = calculate_crr(sub_cat_df) + + # 计算 acc@iou + def calculate_acc_at_iou_threshold(scores, threshold): + + miou_qids = set(scores[scores["task_mode"] == "miou"]["qid"]) + + long_acc_qids = set(scores[scores["task_mode"] == "long_acc"]["qid"]) + + valid_qids = miou_qids & long_acc_qids + + miou_positive = set(scores[(scores["task_mode"] == "miou") & (scores["score"] > threshold)]["qid"]) + + long_acc_positive = scores[ + (scores["task_mode"] == "long_acc") & (scores["qid"].isin(miou_positive)) & (scores["score"] == 1) + ] + + acc_at_iou_threshold = len(long_acc_positive) / len(valid_qids) if len(valid_qids) > 0 else 0 + return round(acc_at_iou_threshold, 4) + + def calculate_acc_at_iou(scores): + thresholds = [0.1, 0.2, 0.3, 0.4, 0.5] + acc_at_iou_values = [calculate_acc_at_iou_threshold(scores, threshold) for threshold in thresholds] + + return round(sum(acc_at_iou_values) / len(acc_at_iou_values), 4) + + # Overall acc@iou + result["acc@iou"]["overall"] = calculate_acc_at_iou(df) + + # Duration acc@iou + for dur in DURATIONS: + dur_df = df[df["duration_range"] == dur] + result["acc@iou"]["duration"][dur] = calculate_acc_at_iou(dur_df) + + # Domain acc@iou + for domain in DOMAINS: + domain_df = df[df["domain"] == domain] + result["acc@iou"]["domain"][domain] = calculate_acc_at_iou(domain_df) + + # Sub-category acc@iou + for sub_cat in SUB_CATEGORIES: + sub_cat_df = df[df["sub_category"] == sub_cat] + result["acc@iou"]["sub_category"][sub_cat] = calculate_acc_at_iou(sub_cat_df) + + # 计算 rec@iou + def calculate_rec_at_iou_threshold(scores, threshold): + # 获取所有 miou 类型的数据 + miou_scores = scores[scores["task_mode"] == "miou"] + + # 计算 miou score 大于 threshold 的数量 + miou_positive = miou_scores[miou_scores["score"] > threshold] + + # 计算比例 + rec_at_iou = len(miou_positive) / len(miou_scores) if len(miou_scores) > 0 else 0 + + return round(rec_at_iou, 4) + + def calculate_rec_at_iou(scores): + thresholds = [0.1, 0.2, 0.3, 0.4, 0.5] + rec_at_iou_values = [calculate_rec_at_iou_threshold(scores, threshold) for threshold in thresholds] + + return round(sum(rec_at_iou_values) / len(rec_at_iou_values), 4) + + # Overall rec@iou + result["rec@iou"]["overall"] = calculate_rec_at_iou(df) + + # Duration rec@iou + for dur in DURATIONS: + dur_df = df[df["duration_range"] == dur] + result["rec@iou"]["duration"][dur] = calculate_rec_at_iou(dur_df) + + # Domain rec@iou + for domain in DOMAINS: + domain_df = df[df["domain"] == domain] + result["rec@iou"]["domain"][domain] = calculate_rec_at_iou(domain_df) + + # Sub-category rec@iou + for sub_cat in SUB_CATEGORIES: + sub_cat_df = df[df["sub_category"] == sub_cat] + result["rec@iou"]["sub_category"][sub_cat] = calculate_rec_at_iou(sub_cat_df) + + return result + + +def milliseconds_to_seconds(milliseconds): + return milliseconds / 1000 + + +def sample_frames_clue_average(clues_time_intervals, frame_num, fps): + # 计算每个线索区间的时长 + clues_frame_intervals = [(round(interval[0] * fps), round(interval[1] * fps)) 
for interval in clues_time_intervals] + clue_durations = [interval[1] - interval[0] for interval in clues_frame_intervals] + total_duration = sum(clue_durations) + # 如果 frame_num 的数量大于等于总帧数, 则直接返回全部帧 + if frame_num >= total_duration: + return [frame for interval in clues_frame_intervals for frame in range(interval[0], interval[1])] + frames_per_clue = [int(frame_num * (duration / total_duration)) for duration in clue_durations] + frame_indices = [] + for i, (interval, num_frames) in enumerate(zip(clues_frame_intervals, frames_per_clue)): + num_frames = max(1, num_frames) + seg_size = (interval[1] - interval[0]) / num_frames + clue_frame_indices = [int(interval[0] + seg_size / 2 + seg_size * idx) for idx in range(num_frames)] + frame_indices.extend(clue_frame_indices) + return frame_indices + + +def merge_intervals(intervals): + """ + Merge overlapping intervals in a list. + Assumes each interval is a list [start, end]. + """ + if not intervals: + return [] + + # Sort intervals by start time + intervals.sort(key=lambda x: x[0]) + + merged = [intervals[0]] + + for current in intervals[1:]: + last_merged = merged[-1] + + # Check if there is an overlap + if current[0] <= last_merged[1]: + # Merge the current interval with the last one + last_merged[1] = max(last_merged[1], current[1]) + else: + # No overlap, add current interval + merged.append(current) + + return merged + + +def calculate_intervals_iou(intervals1, intervals2): + """ + Calculate the IoU of two lists of intervals. + Each list contains intervals represented as [start, end]. + """ + # Merge overlapping intervals in both lists + merged1 = merge_intervals(intervals1) + merged2 = merge_intervals(intervals2) + + # Calculate total length of intervals for both lists + def total_length(merged_intervals): + return sum(end - start for start, end in merged_intervals) + + length1 = total_length(merged1) + length2 = total_length(merged2) + + # Calculate intersection length + intersection_length = 0 + for interval1 in merged1: + for interval2 in merged2: + intersection_start = max(interval1[0], interval2[0]) + intersection_end = min(interval1[1], interval2[1]) + intersection_length += max(0, intersection_end - intersection_start) + # Calculate union length + union_length = length1 + length2 - intersection_length + # IoU is intersection divided by union + iou = intersection_length / union_length if union_length > 0 else 0 + return iou + + +def post_process(response, right_answer, task_mode, duration): + result = -1 + + if response: + # 找到 ```json 和 ``` 的位置 + json_start = response.find("```json") + json_end = response.find("```", json_start + len("```json")) + + # 如果找到了 json 内容 + if json_start != -1 and json_end != -1: + json_content = response[json_start + len("```json"):json_end].strip() + else: + json_content = "" + + if json_content: + if task_mode in ["long_acc", "clue_acc"]: + json_content = re.sub(r"(?<=:\s)([A-Za-z_]\w*)", r'"\1"', json_content) + + try: + model_result = json.loads(json_content)["result"] + + if task_mode in ["long_acc", "clue_acc"]: + result = 1 if right_answer == model_result else 0 + elif task_mode == "miou": + if not isinstance(model_result, list): + return -1 + if not isinstance(model_result[0], list): + model_result = [model_result] + + need_duration = all(interval[0] <= 1 and interval[1] <= 1 for interval in model_result) + + if need_duration: + model_result = [[interval[0] * duration, interval[1] * duration] for interval in model_result] + + right_answer = eval(right_answer) + + result = 
calculate_intervals_iou(right_answer, model_result) + + except Exception as e: + print(f"Error in parsing JSON: {e}, {json_content}") + + if result == -1: + if task_mode in ["long_acc", "clue_acc"]: + # 检查是否存在大写字母 A-H,认为其为模型答案 + matches = re.findall(r"\b[A-H]\b", response) + if matches: + result = 1 if right_answer in matches else 0 + elif task_mode == "miou": + # 提取所有实数,进行配对 + numbers = re.findall(r"-?\d+\.?\d*", response) + if len(numbers) < 2: + result = -1 + else: + if len(numbers) % 2 != 0: + numbers = numbers[:-1] + model_result = [[float(numbers[i]), float(numbers[i + 1])] for i in range(0, len(numbers), 2)] + + if type(right_answer) is str: + right_answer = eval(right_answer) + + result = calculate_intervals_iou(right_answer, model_result) + + return result + + +def get_timestampes(frame_indices, fps): + seconds = list(map(lambda x: str(round(x / fps, 4)), frame_indices)) + timestamps = ", ".join(seconds) + return "A total of {frame_num} frames are sampled. Their corresponding timestamps are:\n\n{timestamps}\n\n".format( + frame_num=len(frame_indices), timestamps=timestamps + ) + + +def post_process_open(response): + model_result = -1 + + if response and response != FAIL_MSG: + json_start = response.find("```json") + json_end = response.find("```", json_start + len("```json")) + + # 如果找到了 json 内容 + if json_start != -1 and json_end != -1: + json_content = response[json_start + len("```json"):json_end].strip() + else: + json_content = "" + + if json_content: + try: + model_result = json.loads(json_content)["result"] + except Exception as e: + print(f"Error in parsing JSON: {e}, {json_content}") + + if model_result == -1: + model_result = response + + return model_result + + +def post_process_eval_open(response, step): + + model_result = -1 + + if response and response != FAIL_MSG: + + json_start = response.find("```json") + json_end = response.find("```", json_start + len("```json")) + + if json_start != -1 and json_end != -1: + json_content = response[json_start + len("```json"):json_end].strip() + else: + json_content = "" + + if json_content: + try: + model_result = json.loads(json_content)["result"] + except Exception as e: + print(f"Error in parsing JSON: {e}, {json_content}") + return -1 + if model_result == -1: + if step == 1: + match = re.search(r"[012]", response) + if match: + model_result = int(match.group()) + else: + match = re.search(r"[01]", response) + if match: + model_result = int(match.group()) + + return model_result + + +def eval_open_first(model, line): + + user_prompt = "" + + user_prompt += f"Question: {line['question']}\n\n" + + user_prompt += f"The ground truth answer is '{line['answer']}'\n\n" + + user_prompt += f"The model's prediction is '{line['model_result']}'\n\n" + + result = model.generate(user_prompt) + + return result + + +def save_step_1_steps(data, step_1_results): + + # 处理所有结果 + data["step_1_result"] = data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 1)) + + # 条件更新 + mask = data["step_1_result"].isin([-1, 0, 1]) + data.loc[mask, "step_2_result"] = data.loc[mask, "step_1_result"] + data.loc[mask, "score"] = data.loc[mask, "step_1_result"] + + return data + + +def eval_open_second(model, line, frame_paths): + + user_prompt = "" + + user_prompt += f"Question: {line['question']}\n\n" + + user_prompt += f"The model's prediction is '{line['model_result']}'\n\n" + + result = model.generate([user_prompt] + frame_paths) + + return result + + +def save_step_2_steps(data, step_1_results): + + # 处理所有结果 + data["score"] = 
data["qid"].map(lambda x: post_process_eval_open(step_1_results[x], 2)) + + return data + + +def clue_frame_paths(clue_frame_root, qid, num_frames=8): + frame_root = osp.join(clue_frame_root, str(qid)) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + + +def save_clue_video_frames(data_root, clue_frame_root, video, uid, clue_intervals=None, num_frames=8, fps=-1): + + if type(uid) is str: + uid = str(uid) + + vid_path = osp.join(data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + + if clue_intervals is not None: + # 1. 合并重叠区间 + merged_intervals = merge_intervals(clue_intervals) + + if num_frames > 0 and fps < 0: + # 2. 基于clue_intervals均匀抽帧 + indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps) + frame_paths = clue_frame_paths(clue_frame_root, uid, len(indices)) + + # 保存帧 + flag = np.all([osp.exists(p) for p in frame_paths]) + if not flag: + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths, indices, vid_fps + + +def get_chunk_number(filename): + try: + num = filename.split("chunk_")[1].split(".zip")[0] + return int(num) + except: + return float('inf') + + +def unzip_hf_zip(pth): + + import zipfile + + target_dir = pth + + if os.path.exists(f"{target_dir}/cg_videos_720p") and os.path.exists(f"{target_dir}/cg_subtitles")\ + and os.path.exists(f"{target_dir}/cg_clue_videos"): + print("all exists") + return + + video_zip_files = [ + os.path.join(target_dir, file) + for file in os.listdir(target_dir) + if file.endswith(".zip") and file.startswith("video") + ] + + video_zip_files = sorted(video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x))) + + videos_temp_zip = os.path.join(target_dir, "videos_merged.zip") + + print("Merging video files ...") + + with open(videos_temp_zip, "wb") as outfile: + for video_zip_file in tqdm(video_zip_files, desc="Merging videos"): + with open(video_zip_file, "rb") as infile: + outfile.write(infile.read()) + + print("Extracting video files...") + + try: + with zipfile.ZipFile(videos_temp_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") + finally: + + if os.path.exists(videos_temp_zip): + os.remove(videos_temp_zip) + print("Cleaned up temporary video file") + + clue_video_zip_files = [ + os.path.join(target_dir, file) + for file in os.listdir(target_dir) + if file.endswith(".zip") and file.startswith("clue_video") + ] + + clue_video_zip_files = sorted(clue_video_zip_files, key=lambda x: get_chunk_number(os.path.basename(x))) + + clue_videos_temp_zip = os.path.join(target_dir, "clue_videos_merged.zip") + + print("Merging clue video files ...") + + with open(clue_videos_temp_zip, "wb") as outfile: + for clue_video_zip_file in tqdm(clue_video_zip_files, desc="Merging clue_videos"): + with open(clue_video_zip_file, "rb") as infile: + outfile.write(infile.read()) + + print("Extracting clue video files...") + + try: + with zipfile.ZipFile(clue_videos_temp_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + 
zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") + finally: + + if os.path.exists(clue_videos_temp_zip): + os.remove(clue_videos_temp_zip) + print("Cleaned up temporary clue video file") + + print("Extracting subtitle files ...") + + subtitles_zip = os.path.join(target_dir, "subtitles.zip") + + try: + with zipfile.ZipFile(subtitles_zip, "r") as zip_ref: + + total_files = len(zip_ref.namelist()) + + for file in tqdm(zip_ref.namelist(), desc="Extracting", total=total_files): + zip_ref.extract(file, target_dir) + + print(f"Successfully extracted to {target_dir}") + except Exception as e: + print(f"Error during extraction: {e}") diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/crpe.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/crpe.py new file mode 100644 index 0000000..ec89792 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/crpe.py @@ -0,0 +1,13 @@ +import json +import argparse +from collections import defaultdict + + +def is_correct(predict, answer): + # predict是标准答案 answer是预测 + if len(answer) == 1: + return answer[0] == predict[0] + elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']: + return answer[0] == predict[0] + elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']: + return predict[4:].lower() in answer.lower() diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/hrbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/hrbench.py new file mode 100644 index 0000000..8941280 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/hrbench.py @@ -0,0 +1,54 @@ +from ...smp import * +import os + + +def report_acc_hrbench(df): + cycle_group = df.groupby('cycle_category') + result_dic = defaultdict(list) + avg_dic = defaultdict(int) + + count = 0 + for key, data_value in cycle_group: + count += 1 + _, resp_dic = hrbench_score(data_value) + + for task_type, accuracy in resp_dic.items(): + result_dic['cycle'].append(key) + result_dic['type'].append(task_type) + result_dic['accuracy'].append(accuracy) + + avg_dic[task_type] += accuracy + for task_type, accuracy in avg_dic.items(): + result_dic['cycle'].append('Average') + result_dic['type'].append(task_type) + result_dic['accuracy'].append(accuracy / count) + result_pd = pd.DataFrame(result_dic) + + return result_pd + + +def hrbench_score(data): + ret = defaultdict(list) + resp_dic = {} + category_list = set(data['category']) + score_dict = defaultdict(list) + + for i in range(len(data)): + d = data.iloc[i] + category = d['category'] + gpt_score = d['hit'] + score_dict[category].append(gpt_score) + score_dict['all'].append(gpt_score) + + all_acc = np.mean(score_dict['all']) + ret['type'].append('all') + ret['acc'].append(all_acc) + resp_dic['all'] = all_acc + for cate in category_list: + acc = np.mean(score_dict[cate]) + ret['type'].append(cate) + ret['acc'].append(acc) + + resp_dic[cate] = acc + + return pd.DataFrame(ret), resp_dic diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py index bc3e9ac..ab24bda 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/judge_util.py @@ -1,11 +1,11 @@ import os -from ...api import OpenAIWrapper from ...smp import load_env INTERNAL = os.environ.get('INTERNAL', 0) def build_judge(**kwargs): + from ...api import OpenAIWrapper, SiliconFlowAPI model = kwargs.pop('model', None) kwargs.pop('nproc', None) load_env() @@ -19,12 +19,20 @@ def 
build_judge(**kwargs): 'chatgpt-1106': 'gpt-3.5-turbo-1106', 'chatgpt-0125': 'gpt-3.5-turbo-0125', 'gpt-4o': 'gpt-4o-2024-05-13', + 'gpt-4o-0806': 'gpt-4o-2024-08-06', 'gpt-4o-mini': 'gpt-4o-mini-2024-07-18', + 'qwen-7b': 'Qwen/Qwen2.5-7B-Instruct', + 'qwen-72b': 'Qwen/Qwen2.5-72B-Instruct', + 'deepseek': 'deepseek-ai/DeepSeek-V2.5', } model_version = model_map[model] else: model_version = LOCAL_LLM - model = OpenAIWrapper(model_version, **kwargs) + + if model in ['qwen-7b', 'qwen-72b', 'deepseek']: + model = SiliconFlowAPI(model_version, **kwargs) + else: + model = OpenAIWrapper(model_version, **kwargs) return model @@ -32,7 +40,7 @@ DEBUG_MESSAGE = """ To debug the OpenAI API, you can try the following scripts in python: ```python from vlmeval.api import OpenAIWrapper -model = OpenAIWrapper('gpt-4-1106-preview', verbose=True) +model = OpenAIWrapper('gpt-4o', verbose=True) msgs = [dict(type='text', value='Hello!')] code, answer, resp = model.generate_inner(msgs) print(code, answer, resp) diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/logicvista.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/logicvista.py new file mode 100644 index 0000000..4cb8c18 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/logicvista.py @@ -0,0 +1,150 @@ +import pandas as pd + +# from colorama import Fore, Back, Style +from ...smp import * + + +FAIL_MSG = 'Failed to obtain answer via API.' + + +def build_prompt_logicvista(line): + question = line['question'] + prediction = str(line['prediction']) + tmpl = ( + "You are a information extractor that extracts multiple choice letter answer choices " + "from a paragraph that contains the answer choice and sometimes explaination of why that " + "choice is correct to the given question.\n" + "What letter did the following answer choose? If the answer did not select a letter answer choice, " + "first try to infer the answer based off the given choices.\n" + "If it does not seem like the given answer corresponds to an answer choice OR if there is no selected answer, please just respond with Z.\n" + "Make sure you answer with ONLY the letters chosen.\n" + 'Example 1: \n' + 'Question: \nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n\n' + 'Answer: \na cute teddy bear\n\nYour output: A\n' + 'Example 2: \n' + 'Question: \nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n\n' + 'Answer: \nSpider\n\nYour output: Z\n' + 'Example 3: \n' + 'Question: \nWhich figure is a rotation of the object?\n\n' + 'Answer: \nThe figure on the right, labeled "D," is a rotation of the object shown in the top left corner.\n\nYour output: D\n' + 'Example 4: \n' + 'Question: \nWhich of the boxes comes next in the sequence? 
Select from A-E\n\n' + 'Answer: \nThe sequence of the boxes is A, B, C, D, E.\n\nYour output: ABCDE\n' + 'Example 5: \n' + 'Question: \n{}\n\nAnswer: \n{}\n\nYour output: ' + ) + + return tmpl.format(question, prediction) + + +def LogicVista_auxeval(model, line): + prompt = build_prompt_logicvista(line) + print(prompt) + log = '' + retry = 5 + + for i in range(retry): + prediction = line['prediction'] + res = model.generate(prompt, temperature=i * 0.5) + answer = line['answer'].split(", ") + for j in range(0, len(answer)): + answer[j] = answer[j].lower() + answer.sort() + answer = ''.join(answer) + + if FAIL_MSG in res: + log += f'Try {i}: output is {prediction}, failed to parse.\n' + elif not res.isupper() or not res.isalpha(): + log += f'Try {i}: output is {prediction}, failed to parse.\n' + else: + log += 'Succeed' + hit = 0 + extracted = [alpha.lower() for alpha in res] + extracted.sort() + extracted = ''.join(extracted) + if extracted == answer: + hit = 1 + return dict(log=log, res=res, hit=hit) + log += 'All 5 retries failed.\n' + return dict(log=log, res='', hit=0) + + +cat = ["diagram", "ocr", "patterns", "graphs", "tables", "3d shapes", "puzzles", "sequences", "physics"] + + +def evaluate_logicvista(file_path): + df = pd.read_excel(file_path) + + tot = defaultdict(lambda: 0) + hit = defaultdict(lambda: 0) + acc = defaultdict(lambda: 0) + + lt = len(df) + skill_list = [] + + df_tot = df + + df_inductive = df[df["skill"].str.contains("inductive")] + df_deductive = df[df["skill"].str.contains("deductive")] + df_numerical = df[df["skill"].str.contains("numerical")] + df_spatial = df[df["skill"].str.contains("spatial")] + df_mechanical = df[df["skill"].str.contains("mechanical")] + + tot_correct = df_tot["hit"].sum() + tot_acc = (tot_correct / df_tot.shape[0]) * 100 + tot['Overall'] = df_tot.shape[0] + hit['Overall'] = tot_correct + acc['Overall'] = tot_acc + + inductive_correct = df_inductive["hit"].sum() + inductive_acc = (inductive_correct / df_inductive.shape[0]) * 100 + + tot["inductive"] = df_inductive.shape[0] + hit["inductive"] = inductive_correct + acc["inductive"] = inductive_acc + + deductive_correct = df_deductive["hit"].sum() + deductive_acc = (deductive_correct / df_deductive.shape[0]) * 100 + + tot["deductive"] = df_deductive.shape[0] + hit["deductive"] = deductive_correct + acc["deductive"] = deductive_acc + + numerical_correct = df_numerical["hit"].sum() + numerical_acc = (numerical_correct / df_numerical.shape[0]) * 100 + + tot["numerical"] = df_numerical.shape[0] + hit["numerical"] = numerical_correct + acc["numerical"] = numerical_acc + + spatial_correct = df_spatial["hit"].sum() + spatial_acc = (spatial_correct / df_spatial.shape[0]) * 100 + + tot["spatial"] = df_spatial.shape[0] + hit["spatial"] = spatial_correct + acc["spatial"] = spatial_acc + + mechanical_correct = df_mechanical["hit"].sum() + mechanical_acc = (mechanical_correct / df_mechanical.shape[0]) * 100 + + tot["mechanical"] = df_mechanical.shape[0] + hit["mechanical"] = mechanical_correct + acc["mechanical"] = mechanical_acc + + # capability dimension, the official data json does not contain 'capability' column, so it is now ignored + # for i in cat: + # curr = df[df["capability"].str.contains(i.replace(" ", ""))] + # correct = curr["hit"].sum() + # accuracy = (correct / curr.shape[0]) * 100 + # tot[i] = curr.shape[0] + # hit[i] = correct + # acc[i] = accuracy + + res = defaultdict(list) + for k in tot.keys(): + res['Task&Skill'].append(k) + res['tot'].append(tot[k]) + res['hit'].append(hit[k]) + 
res['acc'].append(acc[k]) + res = pd.DataFrame(res) + return res diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/longvideobench.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/longvideobench.py new file mode 100644 index 0000000..ca814bd --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/longvideobench.py @@ -0,0 +1,80 @@ +from ...smp import * +from .multiple_choice import extract_answer_from_item +import numpy as np +import re + +FAIL_MSG = 'Failed to obtain answer via API.' + +DURATIONS = [15, 60, 600, 3600] +TASK_CATEGORIES = [ + "S2E", "S2O", "S2A", + "E2O", "O2E", "T2E", + "T2O", "T2A", "E3E", + "O3O", "SSS", "SOS", + "SAA", "T3E", "T3O", + "TOS", "TAA" +] + + +def get_dimension_rating(data_path): + data = load(data_path) + print(data.iloc[0]) + + duration_rating = {k: {} for k in DURATIONS} + for duration in DURATIONS + ['overall']: + duration_rating[duration] = { + 'overall': '', + 'question_category': {k: [] for k in TASK_CATEGORIES} + } + + for i in range(len(data)): + + task_ctg = data.iloc[i]['question_category'] + + duration = data.iloc[i]['duration_group'] + duration_rating[duration]['question_category'][task_ctg].append(data.iloc[i]['score']) + + duration_rating['overall']['question_category'][task_ctg].append(data.iloc[i]['score']) + + for duration in DURATIONS + ['overall']: + overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["question_category"].values(), []) if x >= 0]):.3f}' # noqa: E501 + duration_rating[duration]['overall'] = overall_res_dur + for task_ctg in TASK_CATEGORIES: + task_res_dur = f'{np.mean([x for x in duration_rating[duration]["question_category"][task_ctg] if x >= 0]):.3f}' # noqa: E501 + duration_rating[duration]['question_category'][task_ctg] = task_res_dur + + return duration_rating + + +def extract_option(model, input_item, dataset_name): + options = input_item['question'].split('\n')[1:] + for id, option in enumerate(options): + option_id = chr(ord('A') + id) + '.' + if option.find(option_id) >= 0: + input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n') + return extract_answer_from_item(model, input_item, dataset_name)['opt'] + + +def extract_characters_regex(s): + s = s.strip() + answer_prefixes = [ + 'The best answer is', + 'The correct answer is', + 'The answer is', + 'The answer', + 'The best option is' + 'The correct option is', + 'Best answer:' + 'Best option:', + 'Answer:', + 'Option:', + ] + for answer_prefix in answer_prefixes: + s = s.replace(answer_prefix, '') + + if len(s.split()) > 10 and not re.search('[ABCDE]', s): + return '' + matches = re.search(r'[ABCDE]', s) + if matches is None: + return '' + return matches[0] diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py index dd1a69b..868871c 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathv.py @@ -2,8 +2,9 @@ from ...smp import * from ...utils import can_infer try: from latex2sympy2 import latex2sympy -except ImportError: - print('Please install latex2sympy2 by running "pip install latex2sympy2"') +except Exception as e: + logging.critical(f'{type(e)}: {e}') + logging.critical('Please install latex2sympy2 by running "pip install latex2sympy2"') FAIL_MSG = 'Failed to obtain answer via API.' 
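To make the intent of the new LogicVista judge helpers above concrete, here is a minimal usage sketch. It is illustrative only: the real evaluation is normally driven end-to-end by VLMEvalKit's dataset classes, and the Excel file name and its `question` / `prediction` / `answer` / `skill` columns are assumptions for demonstration. The import paths follow the file layout shown in this diff, and the judge is built with `build_judge` from `judge_util.py` (which needs `OPENAI_API_KEY` / `OPENAI_API_BASE` configured).

```python
# Minimal sketch, not the official pipeline. Assumes a prediction file named
# 'LogicVista_predictions.xlsx' with 'question', 'prediction', 'answer', 'skill' columns.
import pandas as pd

from vlmeval.dataset.utils.judge_util import build_judge
from vlmeval.dataset.utils.logicvista import LogicVista_auxeval, evaluate_logicvista

# Build a judge LLM; model name is an example, any key in the model_map works.
judge = build_judge(model='gpt-4o-mini')

# Load model predictions (hypothetical file / schema).
df = pd.read_excel('LogicVista_predictions.xlsx')

# Ask the judge to extract the chosen option letters and score each row.
records = [LogicVista_auxeval(judge, row) for _, row in df.iterrows()]
df['hit'] = [r['hit'] for r in records]
df['log'] = [r['log'] for r in records]

# Persist the judged rows, then aggregate overall and per-skill accuracy.
df.to_excel('LogicVista_judged.xlsx', index=False)
print(evaluate_logicvista('LogicVista_judged.xlsx'))
```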
diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathverse.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathverse.py new file mode 100644 index 0000000..f650a1d --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mathverse.py @@ -0,0 +1,193 @@ +from ...smp import * +from ...utils import can_infer + + +FAIL_MSG = 'Failed to obtain answer via API.' + + +def get_gpt4_extract_ICE(): + example_1 = """ +1. +Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)' +Extracted Answer: (-2, 1) +""" # noqa + + example_2 = """ +2. +Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",' +Extracted Answer: D +""" # noqa + + example_3 = """ +3. +Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)' +Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\) +""" # noqa + + example_4 = """ +4. +Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.' +Extracted Answer: null +""" # noqa + + example_5 = """ +5. +Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.' +Extracted answer: 22.3 +""" # noqa + + example_6 = """ +6. +Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"' +Extracted answer: f(x) = -x^2 - 2x + 1 +""" # noqa + + return [example_1, example_2, example_3, example_4, example_5, example_6] + + +def get_gpt4_score_ICE(): + example_1 = """ +[Question]: Write the set of numbers represented on the number line in interval notation. +[Standard Answer]: (-2,1] +[Model_answer] : Extracted Answer: \\((-2, 1)\\) +Judgement: 0 +""" # noqa + + example_2 = """ +[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}} +[Standard Answer]: C +[Model_answer] : B:2\u221a{{3}} +Judgement: 0 +""" # noqa + + example_3 = """ +[Question]: Find the domain and range of the function f using interval notation. +[Standard Answer]: domain: [-4, 0) and range: (-3, 1] +[Model_answer] : Range: \\((-4, 1]\\) +Judgement: 0 +""" # noqa + + example_4 = """ +[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}} +[Standard Answer]: C +[Model_answer] : null +Judgement: 0 +""" # noqa + + return [example_1, example_2, example_3, example_4] + + +def build_mathverse_gpt4_extract_prompt(line): + task_description = """ +I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. 
Directly output the extracted answer with no explanation.\n\n +""" # noqa + prediction = str(line['prediction']) + demo_prompt = task_description + examples = get_gpt4_extract_ICE() + for example in examples: + demo_prompt += example + '\n\n' + test_prompt = f"Model response: '{prediction}'\nExtracted Answer: " + full_prompt = f'{demo_prompt}7.\n{test_prompt}' + + return full_prompt + + +def build_mathverse_gpt4_score_prompt(line): + task_description = """ +Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent. +Please note that only when the [Model_answer] completely matches the [Standard Answer] means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm. +If they are consistent, Judement is 1; if they are different, Judement is 0.\n\n +""" # noqa + question_for_eval = line['question_for_eval'] + extract = line['extract'] + answer = line['answer'] + demo_prompt = task_description + examples = get_gpt4_score_ICE() + for example in examples: + demo_prompt += example + '\n\n' + test_prompt = f""" + [Question]: {question_for_eval} + [Standard Answer]: {answer} + [Model_answer] : {extract} + Judgement:""" + full_prompt = f'{demo_prompt}{test_prompt}' + + return full_prompt + + +def post_check_score(line, prefetch=False): + ans = str(line['answer']).strip() + response = str(line['extract']).strip() + + if response == ans: + return response if prefetch else True + else: + return False + + +def MathVerse_auxeval_extract(model, line): + prompt = build_mathverse_gpt4_extract_prompt(line) + log = '' + retry = 5 + for i in range(retry): + prediction = line['prediction'] + res = model.generate(prompt, temperature=i * 0.5) + + if FAIL_MSG in res: + log += f'Try {i}: output is {prediction}, failed to parse.\n' + else: + log += 'Succeed' + return dict(log_extract=log, extract=res) + log += 'All 5 retries failed.\n' + return dict(log_extract=log, extract='') + + +def MathVerse_auxeval_score(model, line): + prompt = build_mathverse_gpt4_score_prompt(line) + log = '' + retry = 5 + if post_check_score(line, prefetch=True): + res = post_check_score(line, prefetch=True) + return dict(log_score='Prefetch succeed', score=True) + for i in range(retry): + prediction = line['prediction'] + res = model.generate(prompt, temperature=i * 0.5) + + if FAIL_MSG in res or res.strip() not in ['0', '1']: + log += f'Try {i}: output is {prediction}, res is {res}, failed to parse.\n' + else: + log += 'Succeed' + return dict(log_score=log, score=int(res) == 1) + log += 'All 5 retries failed.\n' + return dict(log_score=log, score=False) + + +def MathVerse_acc(result_file): + df = load(result_file) + + df['metadata'] = df['metadata'].apply(lambda x: x.replace("'", '"')) + df['metadata'] = df['metadata'].apply(json.loads) + df_metadata = pd.json_normalize(df['metadata']) + df = pd.concat([df.drop('metadata', axis=1), df_metadata], axis=1) + + subset = list(set(df['problem_version'])) + + res = defaultdict(list) + for p in subset: + if p != 'Overall': + sub = df[df['problem_version'] == p] + else: + sub = cp.deepcopy(df) + res['split'].append(p) + # Overall Acc + res['Overall'].append(np.mean(sub['score']) * 100) + # Subject + subjects = set(df['subject']) + for k in subjects: + res[k].append(np.mean(sub[sub['subject'] == 
k]['score']) * 100) + # Subfield + subfields = set(df['subfield']) + for k in subfields: + res[k].append(np.mean(sub[sub['subfield'] == k]['score']) * 100) + + return pd.DataFrame(res) diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mlvu.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mlvu.py new file mode 100644 index 0000000..c82fe3e --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mlvu.py @@ -0,0 +1,189 @@ +from ...smp import * +from .multiple_choice import extract_answer_from_item +from PIL import Image, ImageOps +import numpy as np + +FAIL_MSG = 'Failed to obtain answer via API.' + +system_prompt_sub_scene = """ +##TASK DESCRIPTION: +You are required to evaluate a respondent's answer based on a provided question, some scoring points, and the respondent's answer. You should provide two scores. The first is the accuracy score, which should range from 1 to 5. The second is the relevance score, which should also range from 1 to 5. Below are the criteria for each scoring category. +##ACCURACY Scoring Criteria: +Evaluate the respondent's answer against specific scoring points as follows: +Score 1: The response completely misses the scoring point. +Score 3: The response mentions content related to the scoring point but is not entirely correct. +Score 5: The response accurately addresses the scoring point. +Calculate the average score across all scoring points to determine the final accuracy score. +##RELEVANCE Scoring Criteria: +Assess how the respondent's answer relates to the original question: +Score 1: The response is completely off-topic from the question. +Score 2: The response is partially related to the question but contains a significant amount of irrelevant content. +Score 3: The response primarily addresses the question, but the respondent seems uncertain about their own answer. +Score 4: The response mostly addresses the question and the respondent appears confident in their answer. +Score 5: The response is fully focused on addressing the question with no irrelevant content and demonstrates complete certainty. +---- +##INSTRUCTION: +1. Evaluate Accuracy: First, assess and score each scoring point based on the respondent's answer. Calculate the average of these scores to establish the final accuracy score. Provide a detailed rationale before assigning your score. +2. Evaluate RELEVANCE: Assess the relevance of the respondent’s answer to the question. Note that when evaluating relevance, the correctness of the answer is not considered; focus solely on how relevant the answer is to the question. Provide a comprehensive rationale before assigning your score. +3. Output Scores in JSON Format: Present the scores in JSON format as follows: +{'score_accuracy': score_acc, 'score_relevance': score_rele, 'total_score': score_acc + score_rele} +""" # noqa + +system_prompt_summary = """ +##TASK DESCRIPTION: +You are required to evaluate the performance of the respondent in the video summarization task based on the standard answer and the respondent's answer. You should provide two scores. The first is the COMPLETENESS score, which should range from 1 to 5. The second is the RELIABILITY score, which should also range from 1 to 5. Below are the criteria for each scoring category: +##COMPLETENESS Scoring Criteria: +The completeness score focuses on whether the summary covers all key points and main information from the video. +Score 1: The summary hardly covers any of the main content or key points of the video. 
+Score 2: The summary covers some of the main content and key points but misses many. +Score 3: The summary covers most of the main content and key points. +Score 4: The summary is very comprehensive, covering most to nearly all of the main content and key points. +Score 5: The summary completely covers all the main content and key points of the video. +##RELIABILITY Scoring Criteria: +The reliability score evaluates the correctness and clarity of the video summary. It checks for factual errors, misleading statements, and contradictions with the video content. If the respondent's answer includes details that are not present in the standard answer, as long as these details do not conflict with the correct answer and are reasonable, points should not be deducted. +Score 1: Contains multiple factual errors and contradictions; presentation is confusing. +Score 2: Includes several errors and some contradictions; needs clearer presentation. +Score 3: Generally accurate with minor errors; minimal contradictions; reasonably clear presentation. +Score 4: Very accurate with negligible inaccuracies; no contradictions; clear and fluent presentation. +Score 5: Completely accurate with no errors or contradictions; presentation is clear and easy to understand. +---- +##INSTRUCTION: +1. Evaluate COMPLETENESS: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence. +2. Evaluate RELIABILITY: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence. +3. Output Scores in JSON Format: Present the scores in JSON format as follows: +{'score_completeness': score_comp, 'score_reliability': score_reli, 'total_score': score_comp + score_reli} +""" # noqa + + +def check_ans_with_model(pred, gt, model, item, dataset_name='MLVU_MCQ'): + flag = False + + index = gt.index("(") # noqa + index2 = gt.index(")") # noqa + gt_option = gt[index + 1: index2] + + if ")" in pred: + index3 = pred.index(")") + pred = pred[index3 - 1: index3] + if pred == gt_option: + flag = True + elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']: + flag = True + + return flag + + +def extract_scores_summary(text): + # Define the keys to locate in the text + keys = ["score_completeness", "score_reliability"] + scores = [] + + for key in keys: + # Find the index where each key starts + start_index = text.find(key) + if start_index == -1: + continue # Skip if key is not found + + # Find the start of the number which is after the colon and space + start_number_index = text.find(":", start_index) + 2 + end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma + + # Extract and convert the number to float + score = float(text[start_number_index:end_number_index]) + scores.append(score) + + return scores + + +def check_ans_with_model_summary(pred, gt, model, item, dataset_name='MLVU_OpenEnded'): + user_prompt = f""" + Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores. 
+ Standard Answer: {gt} + Respondent's Answer: {pred} + """ # noqa + result = model.generate(user_prompt) + result = extract_scores_summary(result) + result = np.sum(result) + return result + + +def extract_scores_sub_scene(text): + # Define the keys to locate in the text + keys = ["score_accuracy", "score_relevance"] + scores = [] + + for key in keys: + # Find the index where each key starts + start_index = text.find(key) + if start_index == -1: + continue # Skip if key is not found + + # Find the start of the number which is after the colon and space + start_number_index = text.find(":", start_index) + 2 + end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma + + # Extract and convert the number to float + score = float(text[start_number_index:end_number_index]) + scores.append(score) + + return scores + + +def check_ans_with_model_sub_scene(pred, gt, model, item, dataset_name='MLVU_OpenEnded'): + user_prompt = f""" + Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores. + Question: {item['question']} + Scoring Points: {item['scoring_points']} + Respondent's Answer: {pred} + """ # noqa + result = model.generate(user_prompt) + result = extract_scores_sub_scene(result) + result = np.sum(result) + return result + + +def MLVU_OpenEnded_generate(model, line): + task_type = line['task_type'] + if task_type == 'summary': + user_prompt = ( + f"Please score the respondent's answer according to the steps in the Instructions. " + f"You must end with a JSON dict to store the scores.\n" + f"Standard Answer: {line['answer']}\n" + f"Respondent's Answer: {line['prediction']}\n" + ) + elif task_type == 'sub_scene': + user_prompt = ( + f"Please score the respondent's answer according to the steps in the Instructions. 
" + f"You must end with a JSON dict to store the scores.\n" + f"Question: {line['question']}\n" + f"Scoring Points: {line['scoring_points']}\n" + f"Respondent's Answer: {line['prediction']}\n" + ) + else: + AssertionError(f'MLVU don\'t have {task_type} open ended task!') + result = model.generate(user_prompt) + return result + + +def MLVU_OpenEnded_extract(gpt_generate_data, org_data): + extract_func = { + 'sub_scene': extract_scores_sub_scene, + 'summary': extract_scores_summary + } + for idx, item in org_data.iterrows(): + func = extract_func[item['task_type']] + text = gpt_generate_data[idx] + org_data.loc[idx, 'score'] = np.sum(func(text)) + + return org_data + + +def get_dimension_rating(data_path): + data = load(data_path) + result_dict = {} + for idx, item in data.iterrows(): + if item['task_type'] not in result_dict: + result_dict[item['task_type']] = [0,0] + result_dict[item['task_type']][0] += int(item['score']) + result_dict[item['task_type']][1] += 1 + return result_dict diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmdu.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmdu.py index 582b2e0..d432bdb 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmdu.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmdu.py @@ -118,7 +118,7 @@ def mmdu_score(model, line): f'{",".join([x for x in DIMS if x not in result_dict])}' ) except Exception as e: - print({e}) + logging.warning(str(e)) all_result_dict.append({d: None for d in DIMS}) logs.append(str(e)) diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmniah.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmniah.py new file mode 100644 index 0000000..34604ad --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mmniah.py @@ -0,0 +1,298 @@ +import re +import json + + +def has_word(sentence, word): + pattern = r'\b' + re.escape(word) + r'\b' + match = re.search(pattern, sentence) + if match: + return True + else: + return False + + +class VQAEval: + def __init__(self): + self.contractions = { + 'aint': "ain't", + 'arent': "aren't", + 'cant': "can't", + 'couldve': "could've", + 'couldnt': "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + 'didnt': "didn't", + 'doesnt': "doesn't", + 'dont': "don't", + 'hadnt': "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + 'hasnt': "hasn't", + 'havent': "haven't", + 'hed': "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + 'hes': "he's", + 'howd': "how'd", + 'howll': "how'll", + 'hows': "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + 'Im': "I'm", + 'Ive': "I've", + 'isnt': "isn't", + 'itd': "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + 'itll': "it'll", + "let's": "let's", + 'maam': "ma'am", + 'mightnt': "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + 'mightve': "might've", + 'mustnt': "mustn't", + 'mustve': "must've", + 'neednt': "needn't", + 'notve': "not've", + 'oclock': "o'clock", + 'oughtnt': "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + 'shant': "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + 'shouldve': "should've", + 'shouldnt': "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": 'somebodyd', + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + 'somebodyll': "somebody'll", + 'somebodys': "somebody's", + 'someoned': "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + 'someonell': "someone'll", + 'someones': "someone's", + 
'somethingd': "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + 'somethingll': "something'll", + 'thats': "that's", + 'thered': "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + 'therere': "there're", + 'theres': "there's", + 'theyd': "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + 'theyll': "they'll", + 'theyre': "they're", + 'theyve': "they've", + 'twas': "'twas", + 'wasnt': "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + 'weve': "we've", + 'werent': "weren't", + 'whatll': "what'll", + 'whatre': "what're", + 'whats': "what's", + 'whatve': "what've", + 'whens': "when's", + 'whered': "where'd", + 'wheres': "where's", + 'whereve': "where've", + 'whod': "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + 'wholl': "who'll", + 'whos': "who's", + 'whove': "who've", + 'whyll': "why'll", + 'whyre': "why're", + 'whys': "why's", + 'wont': "won't", + 'wouldve': "would've", + 'wouldnt': "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + 'yall': "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + 'youd': "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + 'youll': "you'll", + 'youre': "you're", + 'youve': "you've", + } + self.manualMap = { + 'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, + 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, + 'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, + 'fourteen': 14, 'fifteen': 15, 'sixteen': 16, + 'seventeen': 17, 'eighteen': 18, 'nineteen': 19, + 'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50, + 'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90} + self.articles = ['a', 'an', 'the'] + + self.periodStrip = re.compile('(?!<=\\d)(\\.)(?!\\d)') + self.commaStrip = re.compile('(\\d)(\\,)(\\d)') + self.punct = [ + ';', + r'/', + '[', + ']', + '"', + '{', + '}', + '(', + ')', + '=', + '+', + '\\', + '_', + '-', + '>', + '<', + '@', + '`', + ',', + '?', + '!', + ] + + def evaluate(self, answer, gt_answers): + answer = answer.replace('\n', ' ') + answer = answer.replace('\t', ' ') + answer = answer.strip() + answer = self.processPunctuation(answer) + answer = self.processDigitArticle(answer) + if isinstance(gt_answers, list): + for i in range(len(gt_answers)): + gt_answers[i] = str(gt_answers[i]) + gt_answers[i] = gt_answers[i].replace('\n', ' ') + gt_answers[i] = gt_answers[i].replace('\t', ' ') + gt_answers[i] = gt_answers[i].strip() + gt_answers[i] = self.processPunctuation(gt_answers[i]) + gt_answers[i] = self.processDigitArticle(gt_answers[i]) + if has_word(answer, gt_answers[i]): + return 1 + return 0 + else: + gt_answers = gt_answers.replace('\n', ' ') + gt_answers = gt_answers.replace('\t', ' ') + gt_answers = gt_answers.strip() + gt_answers = self.processPunctuation(gt_answers) + gt_answers = self.processDigitArticle(gt_answers) + if has_word(answer, gt_answers): + return 1 + else: + return 0 + + def evaluate_MRR(self, answer, gt_answers): + answer = answer.replace('\n', ' ') + answer = answer.replace('\t', ' ') + answer = answer.strip() + answer = self.processPunctuation(answer) + answer = self.processDigitArticle(answer) + assert isinstance(gt_answers, list) + for i in range(len(gt_answers)): + gt_answers[i] = gt_answers[i].replace('\n', ' ') + gt_answers[i] = gt_answers[i].replace('\t', ' ') + gt_answers[i] = gt_answers[i].strip() + gt_answers[i] = self.processPunctuation(gt_answers[i]) + gt_answers[i] = 
self.processDigitArticle(gt_answers[i]) + if has_word(answer, gt_answers[i]): + return 1 / (i + 1) + return 0.0 + + def processPunctuation(self, inText): + outText = inText + for p in self.punct: + if (p + ' ' in inText or ' ' + p in inText) or ( + re.search(self.commaStrip, inText) is not None + ): + outText = outText.replace(p, '') + else: + outText = outText.replace(p, ' ') + outText = self.periodStrip.sub('', outText, re.UNICODE) + return outText + + def processDigitArticle(self, inText): + outText = [] + tempText = inText.lower().split() + for word in tempText: + word = self.manualMap.setdefault(word, word) + if word not in self.articles: + outText.append(word) + else: + pass + for wordId, word in enumerate(outText): + if word in self.contractions: + outText[wordId] = self.contractions[word] + + outText = [str(text) for text in outText] + outText = ' '.join(outText) + return outText + + +def is_correct(answer, response): + # response_orig = response + response = response.strip('.') + if isinstance(answer, int): + if response.isdigit(): + return int(int(response) == answer) + + response = response.lower() + response = response.replace('the answer is', '') + response = response.replace('*', '') # parse **A** + if response.find('.') != -1: + response = response.split('.')[0] + response = response.replace(',', '') + response = response.strip() + response = response.strip() + + if response == 'none': + return 0 + + if 'the camera is moving left' in response: + response = 'a' + elif 'the camera is moving right' in response: + response = 'b' + + if len(response) != 1: + # print(f"Fail to parse {response_orig}") + return 0 + + return (ord(response) - ord('a')) == answer + + if isinstance(answer, list): + try: + response = response.replace('json', '').replace('```', '').strip() + response = json.loads(response) + if isinstance(response, dict): + response = sum(list(response.values()), start=[]) + except: + # print(f"Fail to parse {response_orig} Exception: {e}") + return 0 + + if not isinstance(response, (list, tuple)): + # print(f"Fail to parse {response_orig} Exception: not a list!") + return 0 + + match = 0 + for res, ans in zip(response, answer): + match += res == ans + return match / len(answer) + + return VQAEval().evaluate(response, answer) diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/multiple_choice.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/multiple_choice.py index bd06601..c5aa9c8 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/multiple_choice.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/multiple_choice.py @@ -2,6 +2,7 @@ import pandas as pd from ...utils import can_infer, track_progress_rich from ...smp import * import numpy as np +import re MMB_abbrs = { 'coarse_perception': 'CP', @@ -170,6 +171,31 @@ def build_prompt(question, options, prediction): return tmpl.format(question, options, prediction) +def build_prompt_wemath(question, prediction): + tmpl = ( + 'You are an AI assistant who will help me to match ' + 'an answer with several options of a single-choice question. ' + 'You are provided with a question, several options, and an answer, ' + 'and you need to find which option is most similar to the answer. ' + 'If the meaning of all options are significantly different from the answer, output Z. ' + 'Your should output a single uppercase character in A, B, C, D, E, F, G (if they are valid options), and Z. \n' + 'Example 1: \n' + 'Question: \nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. 
dog\n\n' + 'Answer: \na cute teddy bear\n\nYour output: A\n' + 'Example 2: \n' + 'Question: \nWhat is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n\n' + 'Answer: \nSpider\n\nYour output: Z\n' + 'Example 3: \n' + 'Question: \n{}\n\nAnswer: \n{}\n\nYour output: ' + ) + question = question.replace( + ("Regarding the format, please answer following the template below, and be sure to include two <> symbols:\n" + ": <> : <>"), + '', + ) + return tmpl.format(question, prediction) + + def build_prompt_blink(question, options, prediction): tmpl = ( 'You are an AI assistant who will help me to match an answer with several options of a single-choice question. ' @@ -241,6 +267,8 @@ def extract_answer_from_item(model, item, dataset_name=None): if dataset_name == 'BLINK': prompt = build_prompt_blink(item['question'], option_str, item['prediction']) + elif dataset_name == 'WeMath': + prompt = build_prompt_wemath(item['question'], item['prediction']) elif cn_string(item['question']): prompt = build_prompt_cn(item['question'], option_str, item['prediction']) else: @@ -359,9 +387,7 @@ def mcq_vanilla_eval(model, data, meta, nproc, result_file, dataset_name=None): res = track_progress_rich(eval_vanilla, tups, nproc=nproc, chunksize=nproc, save=result_file, keys=keys) result = load(result_file) for k, v in zip(keys, res): - if k in result: - assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log'] - else: + if k not in result: result[k] = v data['hit'] = [result[i]['hit'] for i in data['index']] data['log'] = [result[i]['log'] for i in data['index']] @@ -425,9 +451,7 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None): keys=keys) result = load(result_file) for k, v in zip(keys, res): - if k in result: - assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log'] - else: + if k not in result: result[k] = v tmp_pth = f'/tmp/{timestr()}.xlsx' @@ -440,3 +464,95 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None): data_main.pop('GT') return data_main + + +def extract_characters_regex(s, choices=['(A)', '(B)', '(C)', '(D)', '(E)']): + if type(s) is dict: + s = '' + s = s.strip() + answer_prefixes = [ + 'The best answer is', + 'The correct answer is', + 'The answer is', + 'The answer', + 'The best option is' + 'The correct option is', + 'Best answer:' + 'Best option:', + ] + for answer_prefix in answer_prefixes: + s = s.replace(answer_prefix, '') + + if len(s.split()) > 10 and not re.search('[ABCDE]', s): + return '' + matches = re.search(r'[ABCDE]', s) + if matches is None: + for choice in choices: + if s.lower() in choice.lower(): + return choice[1] + return '' + return matches[0] + + +def get_dimension_rating(data_path): + TASKS = [ + 'Reasoning', + 'Perception', + ] + + SUBTASKS = [ + 'Monitoring', + 'Autonomous_Driving', + 'OCR with Complex Context', + 'Diagram and Table', + 'Remote Sensing', + ] + data = load(data_path) + results = {} + results['Overall'] = {} + for task in TASKS: + results[f'{task}'] = {} + for subtask in SUBTASKS: + results[f'{task}'][f'{subtask}'] = {} + + for i in range(len(data)): + question = data.iloc[i] + Task = question['category'].split('/')[0] + Subtask = question['category'].split('/')[1] + Category = question['l2-category'].lower() + if 'attribute' in Category.lower(): + Category = Category.split('/')[0] + '/attribute' + if question['score'] >= 0: + cnt = question['score'] + if Category not in results[Task][Subtask].keys(): + results[Task][Subtask][f'{Category}'] = 
{'true': cnt, 'false': 1 - cnt} + else: + results[Task][Subtask][f'{Category}']['true'] += cnt + results[Task][Subtask][f'{Category}']['false'] += 1 - cnt + + sum_all, succ_all = 0, 0 + for task, tasks_values in results.items(): + cnt_task, sum_task = 0, 0 + for substask, subtask_value in tasks_values.items(): + cnt_subtask, sum_subtask = 0, 0 + for category, category_dict in subtask_value.items(): + cnt_subtask += category_dict['true'] + sum_subtask += category_dict['false'] + category_dict['true'] + acc = category_dict['true'] / (category_dict['false'] + category_dict['true']) + results[task][substask][category] = acc + if sum_subtask == 0: + acc_subtasks = 0 + else: + acc_subtasks = cnt_subtask / sum_subtask + cnt_task += cnt_subtask + sum_task += sum_subtask + results[task][substask]['Avg'] = acc_subtasks + if sum_task == 0: + acc_task = 0 + else: + acc_task = cnt_task / sum_task + succ_all += cnt_task + sum_all += sum_task + results[task]['Avg'] = acc_task + results['Overall'] = succ_all / sum_all + return results diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mvbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mvbench.py index 2c9a7d4..b2750ea 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/mvbench.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/mvbench.py @@ -1,4 +1,5 @@ from ...smp import * +from .multiple_choice import extract_answer_from_item from PIL import Image, ImageOps import torchvision import random @@ -32,9 +33,9 @@ def get_dimension_rating(data_path): def check_ans(pred, gt): flag = False - pred_list = pred.lower().split(' ') + pred_list = pred.lower().strip().split(' ') pred_option, _ = pred_list[0], ' '.join(pred_list[1:]) - gt_list = gt.lower().split(' ') + gt_list = gt.lower().strip().split(' ') gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:]) if gt_content[-1] == '.': gt_content = gt_content[:-1] @@ -47,6 +48,64 @@ def check_ans(pred, gt): return flag +def check_ans_with_model(pred, gt, model, item, dataset_name='MVBench'): + flag = False + + pred_list = pred.lower().strip().split(' ') + pred_option, _ = pred_list[0], ' '.join(pred_list[1:]) + gt_list = gt.lower().strip().split(' ') + gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:]) + if gt_content[-1] == '.': + gt_content = gt_content[:-1] + + if pred_option.replace('.', '') in gt_option: + flag = True + elif gt_option in pred_option: + flag = True + elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']: + flag = True + + return flag + + +def check_ans_advanced(pred, gt): + number_table = { + 0: 'zero', + 1: 'one', + 2: 'two', + 3: 'three', + 4: 'four', + 5: 'five', + 6: 'six', + 7: 'seven', + 8: 'eight', + 9: 'nine', + } + flag = False + + pred_list = pred.lower().strip().split(' ') + pred_option, _ = pred_list[0], ' '.join(pred_list[1:]) + gt_list = gt.lower().strip().split(' ') + gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:]) + if gt_content[-1] == '.': + gt_content = gt_content[:-1] + + try: + gt_content = number_table[int(gt_content.strip('. \n'))] + print(gt_content) + except: + pass + + if pred_option.replace('.', '') in gt_option: + flag = True + elif gt_option in pred_option: + flag = True + elif gt_content.lower().strip('. \n') in pred.lower().strip('. 
\n'): + flag = True + + return flag + + class GroupRandomCrop(object): def __init__(self, size): if isinstance(size, numbers.Number): diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/naturalbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/naturalbench.py new file mode 100644 index 0000000..ed9a957 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/naturalbench.py @@ -0,0 +1,145 @@ +import re + + +def extract_answer(output_string, task_type="yes_no"): + """ + Extracts the answer from the output string based on the task type. + + Parameters: + output_string (str): The output string. + task_type (str): The type of task. Must be either "yes_no" or "multiple_choice". + + Returns: + int: + 1 if "yes" or "A" + 0 if "no" or "B" + -1 if no relevant answer is found. + Raises a ValueError if an unsupported task_type is provided. + """ + + def find_word_position(string, word): + pattern = r'\b' + re.escape(word) + r'\b' + match = re.search(pattern, string, re.IGNORECASE) + if match: + return match.start() + return -1 + + if task_type not in ["yes_no", "multiple_choice"]: + raise ValueError(f"Task type {task_type} not supported. Must be 'yes_no' or 'multiple_choice'.") + + if task_type == "yes_no": + position_yes_and_a = find_word_position(output_string, "yes") + position_no_and_b = find_word_position(output_string, "no") + elif task_type == "multiple_choice": + position_yes_and_a = find_word_position(output_string, "A") + position_no_and_b = find_word_position(output_string, "B") + + if position_yes_and_a == -1 and position_no_and_b == -1: + print(f"No answer found in the output string: {output_string}.") + return -1 + elif position_yes_and_a != -1 and position_no_and_b != -1: + return 1 if position_yes_and_a < position_no_and_b else 0 + else: + return 0 if position_yes_and_a == -1 else 1 + + +def get_scores(scores): + """ + Calculate various scores based on the given results. + + Args: + scores (dict or list): A dictionary or list containing results where each result can be: + - dict: {id: {"q0_i0": 1 or 0, "q0_i1": 1 or 0, "q1_i0": 1 or 0, "q1_i1": 1 or 0}, ...} + - list: [[q0_i0 (1 or 0), q0_i1 (1 or 0), q1_i0 (1 or 0), q1_i1 (1 or 0)], ...] 
+ + The keys "q0_i0", "q0_i1", "q1_i0", "q1_i1" represent combinations of questions and images: + - "q0_i0" means question_0 on image_0 + - "q0_i1" means question_0 on image_1 + - "q1_i0" means question_1 on image_0 + - "q1_i1" means question_1 on image_1 + + Returns: + dict: A dictionary containing the calculated scores: + - 'Q_Acc': Average question score + - 'I_Acc': Average image score + - 'Acc': Average binary VQA score + - 'G_Acc': Average group score + """ + Q_Acc = 0.0 + I_Acc = 0.0 + Acc = 0.0 + G_Acc = 0.0 + + num_samples = len(scores) + + def calculate_image_score(result): + image_correct = 0 + if isinstance(result, dict): + if result["q0_i0"] == 1.0 and result["q1_i0"] == 0.0: + image_correct += 1 + if result["q1_i1"] == 1.0 and result["q0_i1"] == 0.0: + image_correct += 1 + elif isinstance(result, list): + if result[0] == 1.0 and result[2] == 0.0: + image_correct += 1 + if result[3] == 1.0 and result[1] == 0.0: + image_correct += 1 + return image_correct + + def calculate_question_score(result): + text_correct = 0 + if isinstance(result, dict): + if result["q0_i0"] == 1.0 and result["q0_i1"] == 0.0: + text_correct += 1 + if result["q1_i1"] == 1.0 and result["q1_i0"] == 0.0: + text_correct += 1 + else: + if result[0] == 1.0 and result[1] == 0.0: + text_correct += 1 + if result[3] == 1.0 and result[2] == 0.0: + text_correct += 1 + return text_correct + + def calculate_binary_score(result): + binary_score_correct = 0 + if isinstance(result, dict): + binary_score_correct += 1 if result["q0_i0"] == 1.0 else 0 + binary_score_correct += 1 if result["q0_i1"] == 0.0 else 0 + binary_score_correct += 1 if result["q1_i0"] == 0.0 else 0 + binary_score_correct += 1 if result["q1_i1"] == 1.0 else 0 + else: + binary_score_correct += 1 if result[0] == 1.0 else 0 + binary_score_correct += 1 if result[1] == 0.0 else 0 + binary_score_correct += 1 if result[2] == 0.0 else 0 + binary_score_correct += 1 if result[3] == 1.0 else 0 + + return binary_score_correct + + def calculate_group(result): + group_correct = 0 + if calculate_question_score(result) == 2 and calculate_image_score(result) == 2: + group_correct += 1 + + return group_correct + + if isinstance(scores, dict): + for _, result in scores.items(): + Q_Acc += calculate_question_score(result) + I_Acc += calculate_image_score(result) + Acc += calculate_binary_score(result) + G_Acc += calculate_group(result) + else: + for result in scores: + Q_Acc += calculate_question_score(result) + I_Acc += calculate_image_score(result) + Acc += calculate_binary_score(result) + G_Acc += calculate_group(result) + + results = { + 'Q_Acc': Q_Acc / float(num_samples * 2), + 'I_Acc': I_Acc / float(num_samples * 2), + 'Acc': Acc / float(num_samples * 4), + 'G_Acc': G_Acc / num_samples + } + + return results diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/olympiadbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/olympiadbench.py new file mode 100644 index 0000000..525c178 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/olympiadbench.py @@ -0,0 +1,532 @@ +import re +import json +from math import isclose +import sympy as sp +from sympy import simplify, Eq, sympify, evalf, Pow +from sympy.parsing.latex import parse_latex +import antlr4 +from decimal import Decimal, getcontext +from fractions import Fraction +import sys +import math + + +chinese_answer_type_dict = { + 'Numerical': '数值', + 'Expression': '表达式', + 'Equation': '方程', + 'Interval': '区间' +} +english_answer_type_dict = { + 'Numerical': 'a numerical value', + 'Expression': 'an expression', 
+ 'Equation': 'an equation', + 'Interval': 'an interval' +} + + +def get_single_answer_type_text(answer_type, is_chinese): + if '-' in answer_type: # No need now + answer_type = answer_type[:answer_type.find('-')] + for t in ['Numerical', 'Expression', 'Equation', 'Interval']: + if t in answer_type: + if is_chinese: + return chinese_answer_type_dict[t] + else: + return english_answer_type_dict[t] + exit(f'Error parsing answer type {answer_type}!') + + +def get_answer_type_text(answer_type, is_chinese, multiple_answer): + # 'Tuple' has various meanings in different context, such as position or values of a series of variable, + # so it may lead to confusion to directly use 'tuple' in the prompt. + if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type): + full_answer_text = '' + else: + if not multiple_answer: + answer_text = get_single_answer_type_text(answer_type, is_chinese) + if is_chinese: + full_answer_text = f',答案类型为{answer_text}' + else: + full_answer_text = f"The answer of The problem should be {answer_text}. " + else: + if ',' not in answer_type: # Same answer type for all answers + answer_text = get_single_answer_type_text(answer_type, is_chinese) + if is_chinese: + full_answer_text = f',题目有多个答案,答案类型均为{answer_text}' + else: + full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. ' + else: + answer_types = answer_type.split(',') + answer_types = [get_single_answer_type_text(t, is_chinese) for t in answer_types] + if len(set(answer_types)) == 1: + answer_text = answer_types[0] + if is_chinese: + full_answer_text = f',题目有多个答案,答案类型均为{answer_text}' + else: + full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. ' + else: + if is_chinese: + answer_text = '、'.join(answer_types) + full_answer_text = f',题目有多个答案,答案类型分别为{answer_text}' + else: + answer_text = ', '.join(answer_types) + full_answer_text = ( + f'The problem has multiple answers, with the answers in order being {answer_text}. 
' + ) + return full_answer_text + + +def make_input(prompt, question_content): + # diversified based on the vllm, which is not implemented temporarily + input = prompt + '\n' + question_content + return input + + +sys.set_int_max_str_digits(1000000) +# 设置decimal的精度 +getcontext().prec = 50 + + +class MathJudger: + def __init__(self): + self.special_signal_map = { + "\\left": "", + "\\right": "", + "∶": ":", + ",": ",", + "$": "", + "\\approx": "=", + "\\simeq": "=", + "\\sim": "=", + "^\\prime": "'", + "^{\\prime}": "'", + "^\\circ": "", + "%": "", + } + self.pi = parse_latex("\\pi") + self.precision = 1e-8 + + def split_by_comma(self, expr: str): + in_bracket_num = 0 + splitted_expr = [] + start_idx = 0 + for i, char in enumerate(expr): + if char == "(" or char == "[": + in_bracket_num += 1 + elif char == ")" or char == "]": + in_bracket_num -= 1 + elif char == "," and in_bracket_num == 0: + splitted_expr.append(expr[start_idx:i].strip()) + start_idx = i + 1 + + if start_idx < len(expr): + splitted_expr.append(expr[start_idx:].strip()) + + return splitted_expr + + def trans_plus_minus_sign(self, expr_list: list): + new_expr_list = [] + for expr in expr_list: + if "\\pm" in expr: + new_expr_list.append(expr.replace("\\pm", "+")) + new_expr_list.append(expr.replace("\\pm", "-")) + else: + new_expr_list.append(expr) + + return new_expr_list + + def judge(self, expression1, expression2, precision=1e-8): + # (默认 expression1 为 Ground_Truth) + precision = precision if isinstance(precision, list) else [precision] + + try: + expression1, expression2 = self.preprocess(expression1, expression2) + except: + return False + if expression1 == expression2: + # print("原生相等") + return True + + # 去除字符串中的中文字符,因为上面已经判断过了类似回答为"能"或"不能"的含有中文字符的回答情况 + expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1) + expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2) + + expression1 = self.split_by_comma(expression1) + expression2 = self.split_by_comma(expression2) + + temp_list1 = self.trans_plus_minus_sign(expression1) + temp_list2 = self.trans_plus_minus_sign(expression2) + + # 设计误差值列表 + if len(precision) <= 1: + precision = precision * len(temp_list1) + + if len(temp_list1) != len(temp_list2): + return False + + # 判断两个列表中的元素是否可以两两配对,并且两两相等,由此支持多个回答的比较 + idx = -1 + while len(temp_list1) != 0: + idx = (idx + 1) % len(temp_list1) + + item1 = temp_list1[idx] + self.precision = precision[idx] + # print(self.precision) + + for item2 in temp_list2: + if self.is_equal(item1, item2): + temp_list1.remove(item1) + temp_list2.remove(item2) + precision.remove(self.precision) + break + else: + # If we didn't break from the inner loop, it means no match was found + return False + + # If all elements are matched and removed, the lists can be paired + return True + + def is_interval(self, epr): + return epr.startswith(("(", "[")) and epr.endswith((")", "]")) + + # 在进行数值计算前,需要将sympy中的pi符号替换为pi的近似数值 + # def sympy_sub_pi(self, expression_sympy): + # return expression_sympy.subs(self.pi, math.pi) + + # 默认第一个表达式是 ground_truth + def is_equal(self, expression1, expression2): + if expression1 == expression2 and expression1 != "" and expression2 != "": + # print("原生等价") + return True + + # 先判断是否是两个区间,是的话进行判断相等,不相等则返回 False + if self.is_interval(expression1) and self.is_interval(expression2): + try: + if self.interval_equal(expression1, expression2): + # print("区间等价") + return True + except: + return False + + # 再判断是否在数值上相等 + try: + if self.numerical_equal(expression1, expression2): + # print("数值等价") + return True + except: + pass + + 
# 再判断是否是表达式相等 + try: + if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2): + # print("表达式等价") + return True + except: + pass + + # 再判断是否是等式相等 + try: + if self.equation_equal(expression1, expression2): + # print("等式等价") + return True + except: + pass + + return False + + # 判断两个数值在误差允许范围内是否相等 + def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True): + """ + (默认 expression1 为 Ground_Truth) + 函数: 判读两个数值是否在误差允许范围内相等 + 步骤1: 将可能出现的百分号的情况包含进来 + 步骤2: 使用 math.isclose 函数判断是否相等 + """ + reference = float(expression1) + prediction = float(expression2) + + if include_percentage: + gt_result = [reference / 100, reference, reference * 100] + else: + gt_result = [reference] + + for item in gt_result: + # if isclose(item, prediction, abs_tol=self.precision, rel_tol=0): + if abs(item - prediction) <= self.precision * 1.01: + return True + return False + + def expression_equal(self, exp1, exp2): + """ + (默认 expression1 为 Ground_Truth) + 函数: 判断两个表达式是否在数学意义上等价 + 步骤1: 提取表达式, 防止有的模型会给出"x=1"而不是"1" + 步骤2: 使用 sympy 库进行等价判断 + """ + + # 只提取等号右边的表达式,一般左边是所求的量 + def extract_expression(expression): + if "=" in expression: + expression = expression.split("=")[1] + return expression.strip() + + exp1 = extract_expression(exp1) + exp2 = extract_expression(exp2) + + exp_too_long = len(exp1) > 300 or len(exp2) > 300 + + # 将表达式转换为 sympy 中能够进行处理的格式 + expr1_sym = sympify(parse_latex(exp1)) + expr2_sym = sympify(parse_latex(exp2)) + + if expr1_sym == expr2_sym: + return True + else: + expr1_sym = self.sympy_sub_pi(expr1_sym) + expr2_sym = self.sympy_sub_pi(expr2_sym) + # 如果输入的表达式可以计算出具体数值的话,则将其进行数值计算的比较 + + if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or ( + not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)): + return False + elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol): + try: + if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)): + print( + "These two number can not be calculated by current computer for: " + f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"" + ) + return False + if exp_too_long: + print(f'Expression {exp1} or {exp2} is too long to compute. ') + return False + + if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01: + return True + else: + return False + except: + return False + elif exp_too_long: + print(f'Expression {exp1} or {exp2} is too long to compute. 
') + return False + else: + try: + simplified_expr = simplify(expr1_sym - expr2_sym) + + num_value = simplified_expr.evalf() + + return abs(num_value) < 1e-3 + except: + return False + + def equation_equal(self, expression1, expression2): + """ + (默认 expression1 为 Ground_Truth) + 函数: 判断两个方程是否在数学意义上等价 + 步骤1: 将一个方程/等式化简为标准方程, 即等式的右边严格等于0, 接下来只需要判断两个等式的左边是否"等价" + 步骤2: 使用 sympy 库计算两个等式左边的商, 如果这个商或者这个商的倒数为整数, 那么数学意义上我们可以推导出这两个方程等价👌 + """ + + # 将等式的右边都移到左边,并返回一个 sympy 格式的表达式 + def simplify_equation(latex_eq): + # 分割等式的左边和右边 + lhs, rhs = latex_eq.split('=') + + # 使用 parse_latex 解析 LaTeX 表达式 + lhs_expr = parse_latex(lhs) + rhs_expr = parse_latex(rhs) + + # 创建等式对象 + equation = Eq(lhs_expr, rhs_expr) + + # 化简等式:将等式右边移到左边 + simplified_eq = simplify(equation.lhs - equation.rhs) + + return simplified_eq + + expr1_sym = simplify_equation(expression1) + expr2_sym = simplify_equation(expression2) + + division_result_1 = simplify(expr1_sym / expr2_sym) + division_result_2 = simplify(expr2_sym / expr1_sym) + + # 如果两个方程转换后的式子相除为整数 且非零,则根据推导可知这两个方程等价 + if (division_result_1.is_Integer and division_result_1 != 0) or ( + division_result_2.is_Integer and division_result_2 != 0): + return True + else: + return False + + def interval_equal(self, expression1, expression2): + # 函数: 判断两个区间是否在数学意义上等价 + # 步骤1: 简化区间的表达式, 去除无关的符号比如"\left", "\right", 同时将可能出现的"x \in"删去 + # 步骤2: 对比两个区间的左右符号、中间出现的数学表达式等是否一致 + + def compare_two_interval(inter1, inter2): + + # 首先比较两边的括号是否一致,一致的话再进行下一步比较 + if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]: + return False + + inter1 = inter1.strip('[]()') + inter2 = inter2.strip('[]()') + + # 分割区间的左右部分 + items_1 = inter1.split(',') + items_2 = inter2.split(',') + + for item_1, item_2 in zip(items_1, items_2): + if not self.expression_equal(item_1, item_2): + return False + return True + + interval1 = expression1 + interval2 = expression2 + + if interval1 == interval2: + return True + else: + inter_list1 = interval1.split("\\cup") + inter_list2 = interval2.split("\\cup") + + if len(inter_list1) != len(inter_list2): + return False + else: + for inter1, inter2 in zip(inter_list1, inter_list2): + if not compare_two_interval(inter1, inter2): + return False + return True + + def preprocess(self, expression1, expression2): + + # 尝试捕获box中的内容,如果有多个则以逗号相连返回,如果一个都没有,则报错 + def extract_boxed_content(latex_str): + # 查找所有的 \boxed{...} 结构 + boxed_matches = re.finditer(r'\\boxed{', latex_str) + results = "" + + for match in boxed_matches: + start_index = match.end() + end_index = start_index + stack = 1 + + # 从 \boxed{ 之后开始搜索,直到找到对应的闭合括号 + while stack > 0 and end_index < len(latex_str): + if latex_str[end_index] == '{': + stack += 1 + elif latex_str[end_index] == '}': + stack -= 1 + end_index += 1 + + if stack == 0: + # 提取 \boxed{} 内部的内容 + content = latex_str[start_index:end_index - 1] + results += content + "," + else: + # 如果括号没有正确闭合,则返回错误信息 + raise ValueError("Mismatched braces in LaTeX string.") + + # 如果没有匹配到'\boxed{}'字符,则默认提取有内容的文字最后一行中的所有公式部分 + if results == "": + last_line_ans = latex_str.strip().split("\n")[-1] + dollar_pattern = r"\$(.*?)\$" + answers = re.findall(dollar_pattern, last_line_ans) + + if answers: + for ans in answers: + results += ans + "," + else: + results = latex_str + + return results + + def sepcial_symbol_replace(expression): + if "\\in " in expression: + expression = expression.split("\\in ")[1] + + # 进行特殊字符的替换,这些字符都不影响latex的解析,属于美观/修饰性字符 + for signal in self.special_signal_map: + expression = expression.replace(signal, self.special_signal_map[signal]) + + expression = 
expression.strip("\n$,.:;^_=+`!@#$%^&*~,。") + + pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}' + expression = re.sub(pattern, r'\1', expression) + + return expression + + exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2) + exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2) + + return exp1, exp2 + + def can_compute_power(self, expr): + """ + Check if the power expression can be computed. + + Parameters: + expr (sympy expression): The expression to check. + + Returns: + bool: True if the expression can be computed, False otherwise. + """ + # Check if the expression is a power expression + if isinstance(expr, Pow): + # Extract the base and the exponent + base, exp = expr.as_base_exp() + + # Check if the base and the exponent are numbers + if base.is_number and exp.is_number: + # Set a threshold for the maximum size of the exponent + MAX_EXP = 1000 # This threshold can be adjusted based on the computing environment + + # Check if the exponent is greater than the threshold + if abs(exp.evalf()) > MAX_EXP: + return False + else: + return True + else: + # If the base or the exponent is not a number, we cannot compute the power + return False + else: + # If the expression is not a power expression, return True as it is not the case we are checking for + return True + + +def extract_answer(is_chinese, model_output, is_deepseek=False): + # deepseekmath has special answering format + if str(model_output) == 'nan': + model_output = 'nan' + + if is_deepseek: + if is_chinese: + matches = re.findall('## 解题答案(.*)', model_output) + else: + matches = re.findall('The answer is: (.*)', model_output) + + # 检测是否至少找到一个匹配,如果没有就直接整个送进去找\boxed{} + if matches: + # 如果找到多个匹配,取最后一个 + model_answer = matches[-1].strip() + return model_answer + else: + return model_output + + if is_chinese: + matches = re.findall('所以最终答案是(.*)', model_output) + else: + matches = re.findall('So the final answer is (.*)', model_output) + + # 检测是否至少找到一个匹配,如果没有就直接整个送进去找\boxed{} + if matches: + # 如果找到多个匹配,取最后一个 + model_answer = matches[-1].strip() + return model_answer + else: + return model_output + + +def calculate_merged_accuracy(reference_dir, text_only): + pass diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/qspatial.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/qspatial.py new file mode 100644 index 0000000..97d4125 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/qspatial.py @@ -0,0 +1,123 @@ +from ...smp import * +from ...utils import can_infer + + +FAIL_MSG = 'Failed to obtain answer via API.' + + +def get_gpt4_ICE_for_qspatial(): + example_1 = """ +Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit, +e.g., (1, m), (2.2, cm), (3.12, meter), at the end.\n +Model response: **Object Identification** + +* The object in question is a chair. +* The chair is not visible in the image. + +**Conclusion** + +The height of the chair cannot be determined from the provided image.\n +Extracted answer: (0, cm) +""" + + example_2 = """ +Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit, +e.g., (1, inch), (1.2, cm), (3.0, feet), at the end.\n +Model response: **Step 1: Identify the stapler and the recycle bin in the image.** + +The stapler is located on the wooden table, and the recycle bin is located on the floor. 
+ +**Step 2: Determine the distance between the stapler and the recycle bin.** + +The stapler is 0.5 meters from the edge of the table, and the recycle bin is 1.5 meters from the edge of the table. +Therefore, the minimum distance between the stapler and the recycle bin is 1.5 - 0.5 = 1 meter. + +**Answer:** 1 m\n +Extracted answer: (1, m) +""" + example_3 = """ +Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit, +e.g., (1, foot), (2, cm), (4.3, meter), at the end.\n +Model response: The mirror in the image is approximately 5 feet 4 inches tall.\n +Extracted answer: (64, inch) +""" + example_4 = """ +Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit, +e.g., (0.1, cm), (2.9, cm), (0.3, meter), at the end.\n +Model response: The minimum distance between the wooden chair and the chair near the camera in the image is 1.7 feet.\n +Extracted answer: (1.7, feet) +""" + example_5 = """ +Hint: Please answer the question requiring in a tuple format. The tuple should contain a numeric value and a unit, +e.g., (5.1, cm), (0.9, cm), (55, mm), at the end.\n +Model response: The height of the painting's bottom edge from the floor is approximately 4.5 feet.\n +Extracted answer: (4.5, feet) +""" + return [example_1, example_2, example_3, example_4, example_5] + + +def list_to_dict(lst): + return {chr(65 + i): val for i, val in enumerate(lst)} + + +def post_check(line, prefetch=False): + res = None + ans = line['answer'] + response = line['prediction'] if prefetch else line['res'] + try: + if line['question_type'] == 'multi_choice': + ans = line['answer_option'] + choices = list_to_dict(eval(line['choices'])) + res = can_infer(response, choices) + if prefetch: + return res + else: + if line['answer_type'] == 'integer': + res = int(response) + ans = int(line['answer']) + elif line['answer_type'] == 'float': + res = float(response) + ans = float(line['answer']) + else: + res = str(res) + ans = str(ans) + except ValueError: + pass + + if res == ans: + return res if prefetch else True + else: + return False + + +def build_qspatial_gpt4_prompt(line): + task_description = """ +Please read the following example. 
+Then extract the answer from the model response and type it at the end of the prompt.\n +""" + prediction = str(line['prediction']) + prompt = task_description + examples = get_gpt4_ICE_for_qspatial() + for example in examples: + prompt += example + '\n' + prompt += 'Model respone: ' + prediction + prompt += '\nExtracted answer:' + return prompt + + +def QSpatial_auxeval(model, line): + prompt = build_qspatial_gpt4_prompt(line) + + log = '' + retry = 5 + for i in range(retry): + prediction = line['prediction'] + res = model.generate(prompt, temperature=i * 0.5) + + if FAIL_MSG in res: + log += f'Try {i}: output is {prediction}, failed to parse.\n' + else: + log += 'Succeed' + return dict(log=log, res=res) + log += 'All 5 retries failed.\n' + return dict(log=log, res='') diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/tablevqabench.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/tablevqabench.py new file mode 100644 index 0000000..0782f55 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/tablevqabench.py @@ -0,0 +1,500 @@ +""" +Copied from https://github.com/allenai/allennlp-semparse +Modified from https://github.com/naver-ai/tablevqabench +""" + +import re +import unicodedata +import time + +from abc import ABCMeta, abstractmethod +from math import isinf, isnan + + +# Vision Prompts +VWTQ_PROMPT = ( + 'You are asked to answer questions asked on an image.\n' + 'You should answer the question with a single word.\n' + 'Example: \n' + 'Question: what was the only year mr. wu competed in the olympic games?\n' + 'Answer: 2004\n' + 'Question: which township in pope county, arkansas has the least amount of water area?\n' + 'Answer: Freeman\n' + 'If you have multiple answers, please separate them with || marks. Example: Apple||Banana||Tomato\n\n' + 'Question: {question}\n' + 'Answer:' +) + +VTABFACT_PROMPT = ( + 'You are asked to answer whether the statement is True or False based on given image\n' + 'You should only answer True or False.\n' + 'Example: \n' + 'Statement: the milwaukee buck win 6 game in the 2010 - 11 season\n' + 'Answer: True\n' + 'Statement: only the top team score above the average of 8.8\n' + 'Answer: False\n\n' + 'Statement: {question}\n' + 'Answer:' +) + +FINTABNETQA_PROMPT = ( + 'You are asked to answer questions asked on a image.\n' + 'You should answer the question within a single word or few words.\n' + 'If units can be known, the answer should include units such as $, %, million and etc.\n' + 'Example: \n' + 'Question: What were the total financing originations for the fiscal year ended October 31, 2004?\n' + 'Answer: $3,852 million\n' + 'Question: What is the time period represented in the table?\n' + 'Answer: October 31\n' + 'Question: What was the percentage of net sales for selling, general and administrative expenses in 2006?\n' + 'Answer: 34.2%\n' + 'Question: {question}\n' + 'Answer:' +) + + +def evaluate_tabfact(data, score_keys): + num_examples = 0 + num_correct = 0 + manual_check = 0 + start_time = time.time() + for instance in data: + if instance['prediction'] is None: + instance['prediction'] = 'none' + pred = instance['prediction'].lower() + gt = instance['answer'] + num_examples += 1 + if 'true' in pred and 'false' in pred: + manual_check += 1 + score = None + elif 'true' in pred and gt == '1': + num_correct += 1 + score = 1 + elif 'false' in pred and gt == '0': + num_correct += 1 + score = 1 + else: + score = 0 + instance['scores'] = {score_keys[0]: score} + if manual_check > 0: + print(f'the number of not properly parsed samples: 
{manual_check}') + end_time = time.time() + elapsed_time = end_time - start_time + Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100 + meta = { + 'evaluators': 'correctness', + 'score_info': [score_keys[0]], + 'evaluated_time': elapsed_time, + 'total_num_sample': len(data), + 'average_scores': [Accuracy], + } + return meta + + +def evaluate_wtq(data, score_keys): + num_examples = 0 + num_correct = 0 + start_time = time.time() + + for instance in data: + pred = instance['prediction'].replace('||', '|') + gt = instance['answer'] + original_strings = tsv_unescape_list(gt) + target_values = to_value_list(original_strings) + + predicted_strings = tsv_unescape_list(pred) + predicted_values = to_value_list(predicted_strings) + correct = check_denotation(target_values, predicted_values) + num_examples += 1 + score = 0 + if correct: + num_correct += 1 + score = 1 + instance['scores'] = {score_keys[0]: score} + + end_time = time.time() + elapsed_time = end_time - start_time + + Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100 + meta = { + 'evaluators': 'correctness', + 'score_info': [score_keys[0]], + 'evaluated_time': elapsed_time, + 'total_num_sample': len(data), + 'average_scores': [Accuracy], + } + return meta + + +def evaluate_fintabnet(data, score_keys): + num_examples = 0 + num_correct, _num_correct = 0, 0 + start_time = time.time() + for instance in data: + pred, preds = fintabnet_normalize(instance['prediction']) + gt, gts = fintabnet_normalize(instance['answer']) + correct = 1 if gt == pred else 0 + _correct = any(_pred == _gt for _pred in preds for _gt in gts) + num_examples += 1 + score, _score = 0, 0 + if correct: + num_correct += 1 + score = 1 + if _correct: + _num_correct += 1 + _score = 1 + instance['scores'] = {score_keys[0]: _score, 'exact_score': score} + + end_time = time.time() + elapsed_time = end_time - start_time + Accuracy = round((num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100 + _Accuracy = round((_num_correct + 1e-9) / (num_examples + 1e-9), 8) * 100 + meta = { + 'evaluators': 'correctness', + 'score_info': ['relieved_accuracy', score_keys[0]], + 'evaluated_time': elapsed_time, + 'total_num_sample': len(data), + 'average_scores': [_Accuracy, Accuracy], + } + return meta + + +def fintabnet_normalize(s): + s = normalize(s) + remove_words = [ + 'dollar', 'gallons', 'square feet', 'shares', 'mbtu', + 'mbpd', 'mbbls', 'mmbtu', 'unit', 'gwh', 'year', 'mmcf', 'mile', 'mboe' + ] + + # Data specific filtering using regular expressions + # Remove special characters like $, (, and ) + s = re.sub(r'[\$\(\),]', '', s) + + # Replace "dollar" with empty string if it's not part of another word + pattern = r'\b(' + '|'.join(remove_words) + r')s?\b' + s = re.sub(pattern, '', s, flags=re.IGNORECASE) + + # Unit conversion dictionary with regex patterns for flexibility + unit_conversion = { + r' \bthousand\b': 'e3', + r' \bmillion\b': 'e6', + r' \bbillion\b': 'e9', + r'\bthousand\b': 'e3', + r'\bmillion\b': 'e6', + r'\bbillion\b': 'e9', + r' ?%': 'e-2', + } + + # Convert percentages to their decimal representation. + # Applying this after unit_conversion prevents "percent" from being processed + # in cases like "million %", which would be incorrect. 
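+    # Illustrative trace of the substitutions applied below (assuming normalize()
+    # leaves the digits themselves intact; values here are examples, not test data):
+    #   "$3,852 million" -> "3852 million" -> "3852e6" -> 3.852e9
+    #   "34.2%"          -> "34.2e-2"      -> 0.342
+    # while the unit-free variants ("3852", "34.2") are kept as fallback values.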
+ # s = re.sub(r' ?%', 'e-2', s) + # s_percent = re.sub(r' ?%', '', s_percent) + + s_unit_free = s + + # Iterate over unit_conversion and apply transformations + for pattern, value in unit_conversion.items(): + s = re.sub(pattern, value, s) + s_unit_free = re.sub(pattern, '', s_unit_free) + + # Attempt to convert to float + try: + return float(s), [float(s), float(s_unit_free)] + except ValueError: + # Return the original string and the error for debugging purposes + return s, [s, s_unit_free] + + +def normalize(x): + if not isinstance(x, str): + x = x.decode('utf8', errors='ignore') + # Remove diacritics + x = ''.join( + c for c in unicodedata.normalize('NFKD', x) if unicodedata.category(c) != 'Mn' + ) + # Normalize quotes and dashes + x = re.sub(r'[‘’´`]', "'", x) + x = re.sub(r'[“”]', '"', x) + x = re.sub(r'[‐‑‒–—−]', '-', x) + while True: + old_x = x + # Remove citations + x = re.sub(r'((? backslash + n + vertical bar (0x7C) -> backslash + p + backslash (0x5C) -> backslash + backslash + + Args: + x (str or unicode) + Returns: + a unicode + """ + return x.replace(r'\n', '\n').replace(r'\p', '|').replace('\\\\', '\\') + + +def tsv_unescape_list(x): + """Unescape a list in the TSV file. + List items are joined with vertical bars (0x5C) + + Args: + x (str or unicode) + Returns: + a list of unicodes + """ + return [tsv_unescape(y) for y in x.split('|')] diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/tempcompass.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/tempcompass.py new file mode 100644 index 0000000..c284e4b --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/tempcompass.py @@ -0,0 +1,254 @@ +from ...smp import * +from .multiple_choice import extract_answer_from_item +from PIL import Image, ImageOps +import numpy as np + +sys_prompt = "You are an AI assistant for question answering." + +system_prompt_multi_choice = ( + "You will receive a multi-choice question, the ground-truth answer and the prediction from a question answering (QA) model. " # noqa + "Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. " + "If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"." +) + +system_prompt_caption_matching = ( + "You will receive a caption matching question, the ground-truth answer and the prediction from a question answering (QA) model. " # noqa + "Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. " + "If the prediction is correct, respond \"Correct\". If the prediction is incorrect, respond \"Incorrect\"." +) + +system_prompt_captioning = """ +You will receive a video description and a multi-choice question. Your task is to choose the correct answer and briefly explain the reason why you choose the answer. \ +If none of the choice candidates are correct or the video description lacks enough information to answer the question, just answer "None of the choices are correct". \ +Please organize your response in this format: +``` +Reasoning: [Your reason to obtain the answer] +Answer: [Your answer] +``` + +Here are some examples of video description, multi-choice question and the expected answer: +``` +Video Description: A person is palying football. +Multi-Choice Question: +What is the person doing in the video? +A. cooking +B. palying football +C. playing basketball +D. reading book +Reasoning: The video description mentions that the person is playing football. +Answer: B. 
palying football + +Video Description: A bird is flying clockwise. +Multi-Choice Question: +In which direction is the bird flying? +A. backwark +B. counter-clockwise +C. clockwise +D. downward +Reasoning: The video description mentions that the bird is flying clockwise +Answer: C. clockwise + +Video Description: An air balloon is inflating. +Multi-Choice Question: +What is happening to the air balloon? +A. exploding +B. getting smaller +C. flying +Reasoning: The video description mentions that the air balloon is inflating, while none of the coices can be explained as inflating. +Answer: None of the choices are correct +``` +""" # noqa + +system_prompt_YorN = """ +You will receive a Yes/No question, the ground-truth answer and the prediction from a question answering (QA) model. \ +Your task is to determine whether QA model prediction is correct, based on the question and ground-truth answer. \ +If the prediction is correct, respond "Correct". If the prediction is incorrect, respond "Incorrect". +""" # noqa + + +def eval_rule_caption_matching(line): + # Determine whether the video llm output is correct, based on word matching rules + video_llm_output = line['prediction'] + answer = line['answer'] + option_strs = eval(line['candidates']) # complete option strings + option_sents = [opt.split(': ')[1] for opt in option_strs] # option sentence + # option index, e.g., Sentence A, Caption A, Option 1 + option_inds = [opt.split(': ')[0] for opt in option_strs] + [opt.split(': ')[0].replace('Sentence ', '').replace('Option ', '').replace('Caption ', '') for opt in option_strs] # noqa + video_llm_pred = None + for option_str in option_strs: + if option_str == video_llm_output: + video_llm_pred = option_str + for option_sent in option_sents: + if option_sent == video_llm_output or (') ' in video_llm_output and option_sent == video_llm_output.split(') ')[1]): # noqa + video_llm_pred = option_sent + for option_ind in option_inds: + if option_ind == video_llm_output or option_ind == video_llm_output.replace('.', ''): # noqa + video_llm_pred = option_ind + + if video_llm_pred is None: + return "fail" + else: + return 1 if video_llm_pred == answer or video_llm_pred == answer.split(":")[0] or video_llm_pred == answer.split(": ")[1] or video_llm_pred == answer.split(": ")[0].split()[1] else 0 # noqa + + +def eval_rule_multi_choice(line): + if line['prediction'] == line['answer']: + return 1 + elif line['prediction'] in ['A', 'B', 'C', 'D']: + return 1 if line['prediction'] == line['answer'][0] else 0 + elif any(line['prediction'].startswith(prefix) for prefix in ['A.', 'B.', 'C.', 'D.']): + return 1 if line['prediction'].split('.')[0] == line['answer'][0] else 0 + elif any(line['prediction'].startswith(prefix) for prefix in ['A)', 'B)', 'C)', 'D)']): + return 1 if line['prediction'].split(')')[0] == line['answer'][0] else 0 + else: + return "fail" + + +def eval_rule_YorN(video_llm_output): + # Extract the yes/no predction from the original video llm output + video_llm_output = video_llm_output.lower() + if video_llm_output.startswith("yes"): + return "yes" + elif video_llm_output.startswith("no"): + return "no" + else: + return False + + +def llm_output_to_rating(llm_output): + if not ('Correct' in llm_output or 'Incorrect' in llm_output): + print(f"Warning: LLM output is not in the correct format: {llm_output}") + rating = 0 + return rating + if llm_output.startswith('Correct'): + rating = 1 + elif llm_output.startswith('Incorrect'): + rating = 0 + elif ('Correct' in llm_output) and ('Incorrect' not in 
llm_output): + rating = 1 + elif 'Incorrect' in llm_output: + rating = 0 + return rating + + +def parse_llm_output(llm_output, gt_answer): + if llm_output == "invalid_request_error" or not llm_output: + eval_result = {"rating": -1, "chatgpt-answer": None, "chatgpt-reasoning": None} + return eval_result + + eval_result = {} + lines = llm_output.split("\n") + + for line in lines: + line = line.strip() + if "Reasoning" in line: + eval_result['chatgpt-reasoning'] = line.replace("Reasoning:", "").strip() + if "Answer" in line: + eval_result['chatgpt-answer'] = line.replace("Answer:", "").strip() + + if "chatgpt-answer" not in eval_result: + eval_result['chatgpt-answer'] = llm_output + if "chatgpt-reasoning" not in eval_result: + eval_result['chatgpt-reasoning'] = None + + # Check if the chatgpt answer is the ground-truth answer + # calculate the number of 'A.', 'B.', 'C.', 'D.' in chatgpt-answer + answer_counts = sum(eval_result['chatgpt-answer'].count(prefix) for prefix in ['A.', 'B.', 'C.', 'D.']) # noqa + if eval_result['chatgpt-answer'].split(". ")[0] == gt_answer.split(". ")[0] and answer_counts == 1: + eval_result['rating'] = 1 + else: + eval_result['rating'] = 0 + return eval_result + + +def evaluate_tempcompass_mcq(model, line): + eval_rules_dict = { + 'caption_matching': eval_rule_caption_matching, + 'multi-choice': eval_rule_multi_choice + } + gpt_eval_prompt = { + 'multi-choice': '{}\nMulti-Choice Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}', + 'caption_matching': '{}\nCaption Matching Question:\n{}\nGround-Truth Answer: {}\nModel Prediction: {}' + } + base_prompt = { + 'multi-choice': system_prompt_multi_choice, + 'caption_matching': system_prompt_caption_matching + } + eval_result = { + "question": line['question'], + "answer": line['answer'], + "prediction": line['prediction'], + "task_type": line['task_type'], + "candidates": line['candidates'], + "match_success": True + } + result = eval_rules_dict[line['task_type']](line) + if result == "fail": + eval_result['match_success'] = False + if model is None: + eval_result['rating'] = 0 + else: + prompt_template = gpt_eval_prompt[line['task_type']] + prompt = prompt_template.format(base_prompt[line['task_type']], line['question'], line['answer'], line['prediction']) # noqa + llm_output = model.generate(prompt) + result = llm_output_to_rating(llm_output) + eval_result['chatgpt-response'] = llm_output + eval_result['rating'] = result + else: + eval_result['rating'] = result + + return eval_result + + +def evaluate_tempcompass_captioning(model, line): + prompt = ( + f"{system_prompt_captioning}\n" + f"Video Description:{line['prediction']}\n" + f"Multi-Choice Question:\n{line['mc_question']}\n" + ) + if model is not None: + llm_output = model.generate(prompt) + eval_result = parse_llm_output(llm_output, gt_answer=line['mc_answer']) + return eval_result + else: + raise ValueError("Model is None, TempCompass Captioning task not supported exact matching") # noqa + + +def evaluate_tempcompass_YorN(model, line): + prompt = ( + f"{system_prompt_YorN}\n" + f"Yes/No Question:\n{line['question']}\n" + f"Ground-Truth Answer: {line['answer']}\n" + f"Model Prediction: {line['prediction']}" + ) + result = eval_rule_YorN(line['prediction']) + eval_result = { + "question": line['question'], + "answer": line['answer'], + "prediction": line['prediction'], + "match_success": True + } + if result: + eval_result['rating'] = 1 if result == line['answer'] else 0 + elif model is None: + eval_result['match_success'] = False + 
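+        # Rule-based yes/no matching failed and no judge model was supplied,
+        # so this sample simply falls back to a rating of 0.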
eval_result['rating'] = 0 + else: + eval_result['match_success'] = False + llm_output = model.generate(prompt) + result = llm_output_to_rating(llm_output) + eval_result['chatgpt-response'] = llm_output + eval_result['rating'] = result + return eval_result + + +def get_dimension_rating(score_file): + data = load(score_file) + result_dict = {} + for idx, item in data.iterrows(): + dict_key = item['dim'] + '. ' + item['task_type'] + if dict_key not in result_dict: + result_dict[dict_key] = [0,0] + result_dict[dict_key][0] += int(item['score']) + result_dict[dict_key][1] += 1 + return result_dict diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/videomme.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/videomme.py index 9d35728..dea57ea 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/videomme.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/videomme.py @@ -1,4 +1,5 @@ from ...smp import * +from .multiple_choice import extract_answer_from_item import numpy as np import re @@ -97,24 +98,33 @@ def get_dimension_rating(data_path): for duration in DURATIONS + ['overall']: - overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["domain"].values(), []) if x >= 0]):.2f}' + overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["domain"].values(), []) if x >= 0]):.3f}' duration_rating[duration]['overall'] = overall_res_dur for domain in DOMAINS: - domain_res_dur = f'{np.mean([x for x in duration_rating[duration]["domain"][domain] if x >= 0]):.2f}' + domain_res_dur = f'{np.mean([x for x in duration_rating[duration]["domain"][domain] if x >= 0]):.3f}' duration_rating[duration]['domain'][domain] = domain_res_dur for sub_ctg in SUB_CATEGORIES: - sub_res_dur = f'{np.mean([x for x in duration_rating[duration]["sub_category"][sub_ctg] if x >= 0]):.2f}' + sub_res_dur = f'{np.mean([x for x in duration_rating[duration]["sub_category"][sub_ctg] if x >= 0]):.3f}' duration_rating[duration]['sub_category'][sub_ctg] = sub_res_dur for task_ctg in TASK_CATEGORIES: - task_res_dur = f'{np.mean([x for x in duration_rating[duration]["task_type"][task_ctg] if x >= 0]):.2f}' + task_res_dur = f'{np.mean([x for x in duration_rating[duration]["task_type"][task_ctg] if x >= 0]):.3f}' duration_rating[duration]['task_type'][task_ctg] = task_res_dur return duration_rating +def extract_option(model, input_item, dataset_name): + options = input_item['question'].split('\n')[1:] + for id, option in enumerate(options): + option_id = chr(ord('A') + id) + '.' + if option.find(option_id) >= 0: + input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. 
\n') + return extract_answer_from_item(model, input_item, dataset_name)['opt'] + + def extract_characters_regex(s): s = s.strip() answer_prefixes = [ diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/wemath.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/wemath.py new file mode 100644 index 0000000..d6a4b2b --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/wemath.py @@ -0,0 +1,896 @@ +# pylint: skip-file + +import pandas as pd +import json +import numpy as np +import os +import argparse + +# four_dimensional_metrics.py + + +# Function to evaluate steps +def evaluate_evaluate_steps(json, steps): # noqa + jokers = [json[[f'joker_{i}', f'knowledge concept_{i}']] for i in range(1, steps + 1)] + for i in range(steps): + jokers[i].rename( + columns={f'joker_{i + 1}': 'joker', f'knowledge concept_{i + 1}': 'knowledge_concept'}, + inplace=True, + ) + concatenated_steps = pd.concat(jokers, axis=0) + return concatenated_steps + + +# Function to load and process JSON data +def load_and_process_data(filepath): + df = pd.read_excel(filepath) + if 'hit' not in df.columns: + df['processed_answer'] = ( + df['prediction'] + .str.split('Answer') + .str[-1] + .str.strip() + .str.replace(r'[>><<:.]', '', regex=True) + .str.strip() + ) + df['processed_answer'] = df['processed_answer'].apply(lambda x: x[0] if x and x[0] in 'ABCDEFGH' else None) + df['joker'] = df['processed_answer'] == df['answer'] + else: + df['joker'] = df['hit'].astype(bool) + return df + + +# Function to process steps data and merge results +def evaluate_process_steps_data(df, steps): + steps_data = {f'{steps}steps_{i}': df[df['key'] == f'{steps}steps_{i}'] for i in range(1, steps + 1)} + steps_data[f'{steps}steps_multi'] = df[df['key'] == f'{steps}steps_multi'] + for key, data in steps_data.items(): + data.columns = [col + f'_{key.split("_")[-1]}' for col in data.columns] + merged_data = steps_data[f'{steps}steps_1'] + for i in range(2, steps + 1): + merged_data = pd.merge( + merged_data, steps_data[f'{steps}steps_{i}'], left_on=f'ID_1', right_on=f'ID_{i}', how='left' # noqa + ) + merged_data = pd.merge( + merged_data, steps_data[f'{steps}steps_multi'], left_on=f'ID_1', right_on='ID_multi', how='left' # noqa + ) + return merged_data + + +# Function to calculate evaluation metrics +def evaluate_calculate_metrics(merged_2steps, merged_3steps): + metrics = {} + metrics['steps2_filtered_rows_1_loose'] = merged_2steps[ + ((merged_2steps['joker_1'] == False) & (merged_2steps['joker_2'] == False)) # noqa + & (merged_2steps['joker_multi'] == True) # noqa + ] + metrics['steps2_filtered_rows_1_strict'] = merged_2steps[ + ((merged_2steps['joker_1'] == False) | (merged_2steps['joker_2'] == False)) # noqa + & (merged_2steps['joker_multi'] == True) # noqa + ] + metrics['steps2_filtered_rows_2'] = merged_2steps[ + ((merged_2steps['joker_1'] == True) & (merged_2steps['joker_2'] == True)) # noqa + & (merged_2steps['joker_multi'] == False) # noqa + ] + metrics['steps2_filtered_rows_3'] = merged_2steps[ + ((merged_2steps['joker_1'] == False) | (merged_2steps['joker_2'] == False)) # noqa + & (merged_2steps['joker_multi'] == False) # noqa + ] + metrics['steps2_filtered_rows_4_loose'] = merged_2steps[ + ((merged_2steps['joker_1'] == True) | (merged_2steps['joker_2'] == True)) + & (merged_2steps['joker_multi'] == True) + ] + metrics['steps2_filtered_rows_4_strict'] = merged_2steps[ + ((merged_2steps['joker_1'] == True) & (merged_2steps['joker_2'] == True)) + & (merged_2steps['joker_multi'] == True) + ] + metrics['steps3_filtered_rows_1_loose'] = 
merged_3steps[ + ( + (merged_3steps['joker_1'] == False) + & (merged_3steps['joker_2'] == False) + & (merged_3steps['joker_3'] == False) + ) + & (merged_3steps['joker_multi'] == True) + ] + metrics['steps3_filtered_rows_1_strict'] = merged_3steps[ + ( + (merged_3steps['joker_1'] == False) + | (merged_3steps['joker_2'] == False) + | (merged_3steps['joker_3'] == False) + ) + & (merged_3steps['joker_multi'] == True) + ] + metrics['steps3_filtered_rows_2'] = merged_3steps[ + ((merged_3steps['joker_1'] == True) & (merged_3steps['joker_2'] == True) & (merged_3steps['joker_3'] == True)) + & (merged_3steps['joker_multi'] == False) + ] + metrics['steps3_filtered_rows_3'] = merged_3steps[ + ( + (merged_3steps['joker_1'] == False) + | (merged_3steps['joker_2'] == False) + | (merged_3steps['joker_3'] == False) + ) + & (merged_3steps['joker_multi'] == False) + ] + metrics['steps3_filtered_rows_4_loose'] = merged_3steps[ + ((merged_3steps['joker_1'] == True) | (merged_3steps['joker_2'] == True) | (merged_3steps['joker_3'] == True)) + & (merged_3steps['joker_multi'] == True) + ] + metrics['steps3_filtered_rows_4_strict'] = merged_3steps[ + ((merged_3steps['joker_1'] == True) & (merged_3steps['joker_2'] == True) & (merged_3steps['joker_3'] == True)) + & (merged_3steps['joker_multi'] == True) + ] + # metrics.to_csv("/Users/mac/Desktop/测试结果/error_anal/csv/gpt4o-0626.csv", index = False) + return metrics + + +# Function to compute evaluation rates and final scores +def evaluate_compute_final_scores(metrics, total_count): + total_counts = { + 'InadequateGeneralization': len(metrics['steps2_filtered_rows_2']) + len(metrics['steps3_filtered_rows_2']), + 'InsufficientKnowledge': len(metrics['steps2_filtered_rows_3']) + len(metrics['steps3_filtered_rows_3']), + 'CompleteMastery_loose': len(metrics['steps2_filtered_rows_4_loose']) + + len(metrics['steps3_filtered_rows_4_loose']), + 'CompleteMastery_strict': len(metrics['steps2_filtered_rows_4_strict']) + + len(metrics['steps3_filtered_rows_4_strict']), + 'RoteMemorization_loose': len(metrics['steps2_filtered_rows_1_loose']) + + len(metrics['steps3_filtered_rows_1_loose']), + 'RoteMemorization_strict': len(metrics['steps2_filtered_rows_1_strict']) + + len(metrics['steps3_filtered_rows_1_strict']), + } + rates = { + 'InadequateGeneralization_rate': "{:.2%}".format(total_counts['InadequateGeneralization'] / total_count), + 'InsufficientKnowledge_rate': "{:.2%}".format(total_counts['InsufficientKnowledge'] / total_count), + 'CompleteMastery_loose_rate': "{:.2%}".format(total_counts['CompleteMastery_loose'] / total_count), + 'CompleteMastery_strict_rate': "{:.2%}".format(total_counts['CompleteMastery_strict'] / total_count), + 'RoteMemorization_loose_rate': "{:.2%}".format( + total_counts['RoteMemorization_loose'] + / (total_counts['CompleteMastery_loose'] + total_counts['RoteMemorization_loose']) + ), + 'RoteMemorization_strict_rate': "{:.2%}".format( + total_counts['RoteMemorization_strict'] + / (total_counts['CompleteMastery_strict'] + total_counts['RoteMemorization_strict']) + ), + } + return total_counts, rates + + +# Function to update main results DataFrame +def evaluate_update_main_results_df(main_results_df, total_counts, rates): + + final_score_loose = "{:.2%}".format( + ( + 525 + - 0.5 * total_counts['InadequateGeneralization'] + - total_counts['RoteMemorization_loose'] + - total_counts['InsufficientKnowledge'] + ) + / 525 + ) + final_score_strict = "{:.2%}".format( + ( + 525 + - 0.5 * total_counts['InadequateGeneralization'] + - 
total_counts['RoteMemorization_strict'] + - total_counts['InsufficientKnowledge'] + ) + / 525 + ) + + new_row = { + # 'Model': model, + 'Score (Strict)': final_score_strict, + 'InsufficientKnowledge (Strict)': f"{rates['InsufficientKnowledge_rate']} ({total_counts['InsufficientKnowledge']})", + 'InadequateGeneralization (Strict)': f"{rates['InadequateGeneralization_rate']} ({total_counts['InadequateGeneralization']})", + 'CompleteMastery (Strict)': f"{rates['CompleteMastery_strict_rate']} ({total_counts['CompleteMastery_strict']})", + 'RoteMemorization (Strict)': f"{rates['RoteMemorization_strict_rate']} ({total_counts['RoteMemorization_strict']})", + 'Score (Loose)': final_score_loose, + 'InsufficientKnowledge (Loose)': f"{rates['InsufficientKnowledge_rate']} ({total_counts['InsufficientKnowledge']})", + 'InadequateGeneralization (Loose)': f"{rates['InadequateGeneralization_rate']} ({total_counts['InadequateGeneralization']})", + 'CompleteMastery (Loose)': f"{rates['CompleteMastery_loose_rate']} ({total_counts['CompleteMastery_loose']})", + 'RoteMemorization (Loose)': f"{rates['RoteMemorization_loose_rate']} ({total_counts['RoteMemorization_loose']})", + } + main_results_df = main_results_df._append(new_row, ignore_index=True) + return main_results_df + + +# Main function to evaluate models +def wemath_evaluate_models(output_json, main_results_csv_path=None): + + main_results_df = pd.DataFrame( + columns=[ + 'Model', + 'Score (Strict)', + 'InsufficientKnowledge (Strict)', + 'InadequateGeneralization (Strict)', + 'CompleteMastery (Strict)', + 'RoteMemorization (Strict)', + 'Score (Loose)', + 'InsufficientKnowledge (Loose)', + 'InadequateGeneralization (Loose)', + 'CompleteMastery (Loose)', + 'RoteMemorization (Loose)', + ] + ) + + # print(f"Evaluating model: {model_name}, JSON path: {output_json}") + data = load_and_process_data(output_json) + data_2steps = data[data['key'].str.contains('2steps')] + data_3steps = data[data['key'].str.contains('3steps')] + merged_2steps = evaluate_process_steps_data(data_2steps, 2) + merged_3steps = evaluate_process_steps_data(data_3steps, 3) + + metrics = evaluate_calculate_metrics(merged_2steps, merged_3steps) + total_counts, rates = evaluate_compute_final_scores(metrics, total_count=525) + + main_results_df = evaluate_update_main_results_df(main_results_df, total_counts, rates) + + print(main_results_df.to_string(index=False)) + if main_results_csv_path is not None: + main_results_df.to_csv(main_results_csv_path, index=False) + print("Evaluation completed and results saved to CSV.") + return main_results_df.to_dict() + + +### Accuracy.py +# Function to load knowledge structure nodes +def load_knowledge_structure_nodes(filepath): + # with open(filepath, "r") as file: + # nodes = json.load(file) + nodes = knowledge_structure_nodes + nodes = pd.DataFrame(nodes) + nodes['final_key'] = nodes['full node'].str.split('_').str[-1] + nodes['root_2'] = nodes['full node'].str.split('_').str[1] + return nodes + + +# Function to evaluate steps +def accuracy_evaluate_steps(json, steps, nodes): + jokers = [json[[f'joker_{i}', f'knowledge concept_{i}']] for i in range(1, steps + 1)] + for i in range(steps): + jokers[i] = pd.merge( + jokers[i], + nodes[['final_key', 'full node', 'root_2']], + left_on=f'knowledge concept_{i + 1}', + right_on='final_key', + how='left', + ) + jokers[i].rename( + columns={f'joker_{i + 1}': 'joker', f'knowledge concept_{i + 1}': 'knowledge_concept'}, + inplace=True, + ) + concatenated_steps = pd.concat(jokers, axis=0) + return 
concatenated_steps + + +# Function to process steps data and merge results +def accuracy_process_steps_data(df, steps): + steps_data = {f'{steps}steps_{i}': df[df['key'] == f'{steps}steps_{i}'] for i in range(1, steps + 1)} + steps_data[f'{steps}steps_multi'] = df[df['key'] == f'{steps}steps_multi'] + for key, data in steps_data.items(): + data.columns = [col + f'_{key.split("_")[-1]}' for col in data.columns] + merged_data = steps_data[f'{steps}steps_1'] + for i in range(2, steps + 1): + merged_data = pd.merge( + merged_data, steps_data[f'{steps}steps_{i}'], left_on=f'ID_1', right_on=f'ID_{i}', how='left' + ) + merged_data = pd.merge( + merged_data, steps_data[f'{steps}steps_multi'], left_on=f'ID_1', right_on='ID_multi', how='left' + ) + return merged_data + + +# Function to update main results DataFrame +def accuracy_update_main_results_df(nodes, main_results_df, concatenated_data, merged_2steps, merged_3steps): + One_step_acc = "{:.2%}".format(concatenated_data['joker'].mean()) + Two_step_acc = "{:.2%}".format(merged_2steps['joker_multi'].mean()) + Three_step_acc = "{:.2%}".format(merged_3steps['joker_multi'].mean()) + + new_row = { + # 'Model': model_name, + 'One-step(S1)': One_step_acc, + 'Two-step(S2)': Two_step_acc, + 'Three-step(S3)': Three_step_acc, + } + # Calculate rates according to Nodes + nodes['final_rode'] = nodes['full node'].str.split('_').str[-1] + csv_final_score = concatenated_data.groupby('final_key')['joker'].mean() + csv_final_score = pd.merge(nodes, csv_final_score, left_on='final_rode', right_on='final_key', how='left') + + new_row.update(csv_final_score.groupby('root2')['joker'].mean().apply(lambda x: "{:.2%}".format(x)).to_dict()) + main_results_df = main_results_df._append(new_row, ignore_index=True) + + return main_results_df + + +# Main function to evaluate models +def wemath_accuracy(output_json, main_results_csv_path=None): + + # nodes = load_knowledge_structure_nodes(knowledge_structure_nodes_path) + nodes = knowledge_structure_nodes + nodes = pd.DataFrame(nodes) + nodes['final_key'] = nodes['full node'].str.split('_').str[-1] + nodes['root_2'] = nodes['full node'].str.split('_').str[1] + + main_results_df = pd.DataFrame( + columns=[ + 'Model', + 'One-step(S1)', + 'Two-step(S2)', + 'Three-step(S3)', + 'Understanding and Conversion of Units', + 'Angles and Length', + 'Calculation of Plane Figures', + 'Understanding of Plane Figures', + 'Calculation of Solid Figures', + 'Understanding of Solid Figures', + 'Basic Transformations of Figures', + 'Cutting and Combining of Figures', + 'Direction', + 'Position', + 'Route Map', + 'Correspondence of Coordinates and Positions', + ] + ) + + # print(f"Evaluating model: {model_name}, JSON path: {output_json}") + data = load_and_process_data(output_json) + data_2steps = data[data['key'].str.contains('2steps')] + data_3steps = data[data['key'].str.contains('3steps')] + merged_2steps = accuracy_process_steps_data(data_2steps, 2) + merged_3steps = accuracy_process_steps_data(data_3steps, 3) + + concatenated_data = pd.concat( + [accuracy_evaluate_steps(merged_2steps, 2, nodes), accuracy_evaluate_steps(merged_3steps, 3, nodes)], + axis=0, + ) + main_results_df = accuracy_update_main_results_df( + nodes, main_results_df, concatenated_data, merged_2steps, merged_3steps + ) + + print(main_results_df.to_string(index=False)) + if main_results_csv_path is not None: + main_results_df.to_csv(main_results_csv_path, index=False) + print("Evaluation completed and results saved to CSV.") + + return main_results_df.to_dict() + + 
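+# Usage sketch (the file path below is hypothetical): both entry points take the
+# path to a result sheet readable by load_and_process_data -- an .xlsx carrying the
+# WeMath 'key'/'ID' metadata plus an 'answer' column and either a 'prediction' or a
+# precomputed 'hit' column -- print a summary table, optionally write it to CSV via
+# the second argument, and return the table as a dict:
+#
+#     four_dim = wemath_evaluate_models('outputs/MiniCPM-o-2_6_WeMath.xlsx')
+#     step_acc = wemath_accuracy('outputs/MiniCPM-o-2_6_WeMath.xlsx')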
+knowledge_structure_nodes = [ + { + "root0": "Geometry and Figures", + "root1": "Measurement", + "root2": "Understanding and Conversion of Units", + "root3": "Conversion Rates and Calculations Between Area Units", + "root4": None, + "full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Area Units", + }, + { + "root0": "Geometry and Figures", + "root1": "Measurement", + "root2": "Understanding and Conversion of Units", + "root3": "Conversion Rates and Calculations Between Volume Units (Including Liters and Milliliters)", + "root4": None, + "full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Volume Units (Including Liters and Milliliters)", + }, + { + "root0": "Geometry and Figures", + "root1": "Measurement", + "root2": "Understanding and Conversion of Units", + "root3": "Conversion Rates and Calculations Between Length Units", + "root4": None, + "full node": "Measurement_Understanding and Conversion of Units_Conversion Rates and Calculations Between Length Units", + }, + { + "root0": "Geometry and Figures", + "root1": "Measurement", + "root2": "Angles and Length", + "root3": "Understanding Angles (Using a Protractor)", + "root4": None, + "full node": "Measurement_Angles and Length_Understanding Angles (Using a Protractor)", + }, + { + "root0": "Geometry and Figures", + "root1": "Measurement", + "root2": "Angles and Length", + "root3": "Understanding Length (Using a Ruler)", + "root4": None, + "full node": "Measurement_Angles and Length_Understanding Length (Using a Ruler)", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Calculation of Solid Figures", + "root3": "Calculation of Surface Area of Solid Figures", + "root4": "Surface Area of Cylinders", + "full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Cylinders", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Calculation of Solid Figures", + "root3": "Calculation of Surface Area of Solid Figures", + "root4": "Surface Area of Rectangular Cuboids", + "full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Rectangular Cuboids", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Calculation of Solid Figures", + "root3": "Calculation of Surface Area of Solid Figures", + "root4": "Surface Area of Cubes", + "full node": "Solid Figures_Calculation of Solid Figures_Calculation of Surface Area of Solid Figures_Surface Area of Cubes", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Calculation of Solid Figures", + "root3": "Calculation of Volume of Solid Figures", + "root4": "Volume and Capacity of Cylinders", + "full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cylinders", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Calculation of Solid Figures", + "root3": "Calculation of Volume of Solid Figures", + "root4": "Volume and Capacity of Cones", + "full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cones", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Calculation of Solid Figures", + "root3": "Calculation of Volume of Solid Figures", + "root4": "Volume and Capacity of 
Rectangular Cuboids", + "full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Rectangular Cuboids", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Calculation of Solid Figures", + "root3": "Calculation of Volume of Solid Figures", + "root4": "Volume and Capacity of Cubes", + "full node": "Solid Figures_Calculation of Solid Figures_Calculation of Volume of Solid Figures_Volume and Capacity of Cubes", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Understanding of Solid Figures", + "root3": "Expanded View of Solids", + "root4": "Expanded View of Cylinders", + "full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Cylinders", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Understanding of Solid Figures", + "root3": "Expanded View of Solids", + "root4": "Expanded View of Rectangular Cuboids", + "full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Rectangular Cuboids", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Understanding of Solid Figures", + "root3": "Expanded View of Solids", + "root4": "Expanded View of Cubes", + "full node": "Solid Figures_Understanding of Solid Figures_Expanded View of Solids_Expanded View of Cubes", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Understanding of Solid Figures", + "root3": "Cylinders and Cones", + "root4": "Properties of Cylinders", + "full node": "Solid Figures_Understanding of Solid Figures_Cylinders and Cones_Properties of Cylinders", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Understanding of Solid Figures", + "root3": "Cylinders and Cones", + "root4": "Properties of Cones", + "full node": "Solid Figures_Understanding of Solid Figures_Cylinders and Cones_Properties of Cones", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Understanding of Solid Figures", + "root3": "Rectangular Cuboids and Cubes", + "root4": "Properties and Understanding of Rectangular Cuboids", + "full node": "Solid Figures_Understanding of Solid Figures_Rectangular Cuboids and Cubes_Properties and Understanding of Rectangular Cuboids", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Understanding of Solid Figures", + "root3": "Rectangular Cuboids and Cubes", + "root4": "Properties and Understanding of Cubes", + "full node": "Solid Figures_Understanding of Solid Figures_Rectangular Cuboids and Cubes_Properties and Understanding of Cubes", + }, + { + "root0": "Geometry and Figures", + "root1": "Solid Figures", + "root2": "Understanding of Solid Figures", + "root3": "Observing Objects", + "root4": None, + "full node": "Solid Figures_Understanding of Solid Figures_Observing Objects", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Sum of Interior Angles of Polygons", + "root4": "Sum of Interior Angles of Other Polygons", + "full node": "Plane Figures_Calculation of Plane Figures_Sum of Interior Angles of Polygons_Sum of Interior Angles of Other Polygons", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Sum of Interior Angles of Polygons", + "root4": "Sum of Interior Angles of 
Triangles", + "full node": "Plane Figures_Calculation of Plane Figures_Sum of Interior Angles of Polygons_Sum of Interior Angles of Triangles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation and Comparison of Angles", + "root4": None, + "full node": "Plane Figures_Calculation of Plane Figures_Calculation and Comparison of Angles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Areas", + "root4": "Area of Parallelograms", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Parallelograms", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Areas", + "root4": "Area of Triangles", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Triangles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Areas", + "root4": "Area of Sectors", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Sectors", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Areas", + "root4": "Area of Trapezoids", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Trapezoids", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Areas", + "root4": "Area of Circles", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Circles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Areas", + "root4": "Area of Rectangles", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Rectangles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Areas", + "root4": "Area of Squares", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Areas_Area of Squares", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Perimeters", + "root4": "Perimeter of Parallelograms", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Parallelograms", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Perimeters", + "root4": "Perimeter of Triangles", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Triangles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Perimeters", + "root4": "Perimeter of Trapezoids", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Trapezoids", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Perimeters", + "root4": "Circumference of Circles", + "full node": "Plane Figures_Calculation 
of Plane Figures_Calculation of Perimeters_Circumference of Circles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Perimeters", + "root4": "Perimeter of Rectangles", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Rectangles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Calculation of Plane Figures", + "root3": "Calculation of Perimeters", + "root4": "Perimeter of Squares", + "full node": "Plane Figures_Calculation of Plane Figures_Calculation of Perimeters_Perimeter of Squares", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Polygons", + "root4": "Properties and Understanding of Parallelograms", + "full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Parallelograms", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Polygons", + "root4": "Properties and Understanding of Triangles", + "full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Triangles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Polygons", + "root4": "Properties and Understanding of Trapezoids", + "full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Trapezoids", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Polygons", + "root4": "Properties and Understanding of Rectangles", + "full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Rectangles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Polygons", + "root4": "Properties and Understanding of Squares", + "full node": "Plane Figures_Understanding of Plane Figures_Polygons_Properties and Understanding of Squares", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Classification and Understanding of Angles", + "root4": "Understanding Triangular Rulers", + "full node": "Plane Figures_Understanding of Plane Figures_Classification and Understanding of Angles_Understanding Triangular Rulers", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Classification and Understanding of Angles", + "root4": "Understanding and Representing Angles", + "full node": "Plane Figures_Understanding of Plane Figures_Classification and Understanding of Angles_Understanding and Representing Angles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Properties and Understanding of Line Segments", + "root4": "Distance Between Two Points", + "full node": "Plane Figures_Understanding of Plane Figures_Properties and Understanding of Line Segments_Distance Between Two Points", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Properties and Understanding of Line Segments", + "root4": "Understanding Line Segments, Lines, and Rays", + 
"full node": "Plane Figures_Understanding of Plane Figures_Properties and Understanding of Line Segments_Understanding Line Segments, Lines, and Rays", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Positional Relationships Between Line Segments", + "root4": "perpendicularity", + "full node": "Plane Figures_Understanding of Plane Figures_Positional Relationships Between Line Segments_perpendicularity", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Positional Relationships Between Line Segments", + "root4": "Parallel", + "full node": "Plane Figures_Understanding of Plane Figures_Positional Relationships Between Line Segments_Parallel", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Circles and Sectors", + "root4": "Understanding Sectors", + "full node": "Plane Figures_Understanding of Plane Figures_Circles and Sectors_Understanding Sectors", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Circles and Sectors", + "root4": "Understanding Circles", + "full node": "Plane Figures_Understanding of Plane Figures_Circles and Sectors_Understanding Circles", + }, + { + "root0": "Geometry and Figures", + "root1": "Plane Figures", + "root2": "Understanding of Plane Figures", + "root3": "Observing Figures", + "root4": None, + "full node": "Plane Figures_Understanding of Plane Figures_Observing Figures", + }, + { + "root0": "Geometry and Figures", + "root1": "Transformation and Motion of Figures", + "root2": "Basic Transformations of Figures", + "root3": "Axial Symmetry", + "root4": None, + "full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Axial Symmetry", + }, + { + "root0": "Geometry and Figures", + "root1": "Transformation and Motion of Figures", + "root2": "Basic Transformations of Figures", + "root3": "Translation", + "root4": None, + "full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Translation", + }, + { + "root0": "Geometry and Figures", + "root1": "Transformation and Motion of Figures", + "root2": "Basic Transformations of Figures", + "root3": "Rotation", + "root4": None, + "full node": "Transformation and Motion of Figures_Basic Transformations of Figures_Rotation", + }, + { + "root0": "Geometry and Figures", + "root1": "Transformation and Motion of Figures", + "root2": "Cutting and Combining of Figures", + "root3": "Combining and Dividing Solids", + "root4": None, + "full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining and Dividing Solids", + }, + { + "root0": "Geometry and Figures", + "root1": "Transformation and Motion of Figures", + "root2": "Cutting and Combining of Figures", + "root3": "Combining Plane Figures", + "root4": "Division of Plane Figures", + "full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Division of Plane Figures", + }, + { + "root0": "Geometry and Figures", + "root1": "Transformation and Motion of Figures", + "root2": "Cutting and Combining of Figures", + "root3": "Combining Plane Figures", + "root4": "Combining Plane Figures", + "full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Combining Plane Figures", + }, + { + "root0": 
"Geometry and Figures", + "root1": "Transformation and Motion of Figures", + "root2": "Cutting and Combining of Figures", + "root3": "Combining Plane Figures", + "root4": "Tessellation of Figures", + "full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Tessellation of Figures", + }, + { + "root0": "Geometry and Figures", + "root1": "Transformation and Motion of Figures", + "root2": "Cutting and Combining of Figures", + "root3": "Combining Plane Figures", + "root4": "Folding Problems of Figures", + "full node": "Transformation and Motion of Figures_Cutting and Combining of Figures_Combining Plane Figures_Folding Problems of Figures", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Direction", + "root3": "Southeast, Southwest, Northeast, Northwest Directions", + "root4": None, + "full node": "Position and Direction_Direction_Southeast, Southwest, Northeast, Northwest Directions", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Direction", + "root3": "Cardinal Directions (East, South, West, North)", + "root4": None, + "full node": "Position and Direction_Direction_Cardinal Directions (East, South, West, North)", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Route Map", + "root3": "Determining the Positions of Objects Based on Direction, Angle, and Distance", + "root4": None, + "full node": "Position and Direction_Route Map_Determining the Positions of Objects Based on Direction, Angle, and Distance", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Route Map", + "root3": "Describing Simple Routes Based on Direction and Distance", + "root4": None, + "full node": "Position and Direction_Route Map_Describing Simple Routes Based on Direction and Distance", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Correspondence of Coordinates and Positions", + "root3": "Representing Positions Using Ordered Pairs", + "root4": None, + "full node": "Position and Direction_Correspondence of Coordinates and Positions_Representing Positions Using Ordered Pairs", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Correspondence of Coordinates and Positions", + "root3": "Finding Positions Based on Ordered Pairs", + "root4": None, + "full node": "Position and Direction_Correspondence of Coordinates and Positions_Finding Positions Based on Ordered Pairs", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Position", + "root3": "Front-Back Position", + "root4": None, + "full node": "Position and Direction_Position_Front-Back Position", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Position", + "root3": "Up-Down Position", + "root4": None, + "full node": "Position and Direction_Position_Up-Down Position", + }, + { + "root0": "Geometry and Figures", + "root1": "Position and Direction", + "root2": "Position", + "root3": "Left-Right Position", + "root4": None, + "full node": "Position and Direction_Position_Left-Right Position", + }, +] diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/utils/yorn.py b/eval_mm/vlmevalkit/vlmeval/dataset/utils/yorn.py index 0fb0205..5dd266b 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/utils/yorn.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/utils/yorn.py @@ -1,6 +1,47 @@ from ...smp 
import * +def AMBER_rating(data_file): + data = load(data_file) + stats = defaultdict(dict) + lt = len(data) + category_mapping = { + 'discriminative-attribute-state': 'Attribute', + 'discriminative-attribute-number': 'Attribute', + 'discriminative-attribute-action': 'Attribute', + 'discriminative-hallucination': 'Existence', + 'discriminative-relation': 'Relation', + 'relation': 'Relation' + } + + for i in range(lt): + item = data.iloc[i] + category = item['category'] + image_path = item['image_path'] + score = item['score'] + + new_category = category_mapping.get(category, category) + + if image_path not in stats[new_category]: + stats[new_category][image_path] = [] + stats[new_category][image_path].append(score) + + def acc(key): + res = stats[key] + values = [] + for val in res.values(): + values.extend(val) + return np.mean(values) * 100 + + scores = {} + for k in stats: + scores[k] = acc(k) + + scores['Avg ACC'] = np.mean(list(scores.values())) + ret = d2df(scores) + return ret + + def MME_rating(data_file): data = load(data_file) stats = defaultdict(dict) diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/vcr.py b/eval_mm/vlmevalkit/vlmeval/dataset/vcr.py index ba5a432..c659c60 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/vcr.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/vcr.py @@ -17,20 +17,23 @@ def initialize(): try: rouge = evaluate.load('rouge', experiment_id=str(uuid.uuid4())) - except: - warnings.warn('Please first `pip install rouge_score`.') + except Exception as e: + logging.critical(f'{type(e)}: {e}') + logging.critical('Please first `pip install rouge_score`.') try: nlp_en = spacy.load('en_core_web_sm') - except: - warnings.warn('Will automatically download en_core_web_sm via spacy.') + except Exception as e: + logging.warning(f'{type(e)}: {e}') + logging.warning('Will automatically download en_core_web_sm via spacy.') spacy.cli.download('en_core_web_sm') nlp_en = spacy.load('en_core_web_sm') try: nlp_zh = spacy.load('zh_core_web_sm') - except: - warnings.warn('Will automatically download zh_core_web_sm via spacy.') + except Exception as e: + logging.warning(f'{type(e)}: {e}') + logging.warning('Will automatically download zh_core_web_sm via spacy.') spacy.cli.download('zh_core_web_sm') nlp_zh = spacy.load('zh_core_web_sm') diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/video_base.py b/eval_mm/vlmevalkit/vlmeval/dataset/video_base.py index ccda1d8..5b9ac03 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/video_base.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/video_base.py @@ -8,11 +8,14 @@ class VideoBaseDataset: def __init__(self, dataset='MMBench-Video', - pack=False): + pack=False, + nframe=0, + fps=-1): try: import decord - except: - warnings.warn('Please install decord via `pip install decord`.') + except Exception as e: + logging.critical(f'{type(e)}: {e}') + logging.critical('Please install decord via `pip install decord`.') self.dataset_name = dataset ret = self.prepare_dataset(dataset) @@ -21,6 +24,7 @@ class VideoBaseDataset: self.frame_root = osp.join(lmu_root, 'images', dataset) os.makedirs(self.frame_root, exist_ok=True) self.frame_tmpl = 'frame-{}-of-{}.jpg' + self.frame_tmpl_fps = 'frame-{}-of-{}-{}fps.jpg' self.data_root = ret['root'] self.data_file = ret['data_file'] @@ -31,6 +35,12 @@ class VideoBaseDataset: videos.sort() self.videos = videos self.pack = pack + self.nframe = nframe + self.fps = fps + if self.fps > 0 and self.nframe > 0: + raise ValueError('fps and nframe should not be set at the same time') + if self.fps <= 0 and self.nframe <= 0: + raise 
ValueError('at least one of fps and nframe should be set to a valid value') def __len__(self): return len(self.videos) if self.pack else len(self.data) @@ -44,31 +54,69 @@ class VideoBaseDataset: assert idx < len(self.data) return dict(self.data.iloc[idx]) - def frame_paths(self, video, num_frames=8): + def frame_paths(self, video): frame_root = osp.join(self.frame_root, video) os.makedirs(frame_root, exist_ok=True) - return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)] + return [osp.join(frame_root, self.frame_tmpl.format(i, self.nframe)) for i in range(1, self.nframe + 1)] - def save_video_frames(self, video, num_frames=8): - frame_paths = self.frame_paths(video, num_frames) - flag = np.all([osp.exists(p) for p in frame_paths]) - if flag: + def frame_paths_fps(self, video, num_frames): + frame_root = osp.join(self.frame_root, video) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, + self.frame_tmpl_fps.format(i, num_frames, self.fps)) for i in range(1, num_frames + 1)] + + def save_video_frames(self, video): + if self.fps > 0: + vid_path = osp.join(self.data_root, video + '.mp4') + vid = decord.VideoReader(vid_path) + + # Compute the total number of frames and the total duration of the video + total_frames = len(vid) + video_fps = vid.get_avg_fps() + total_duration = total_frames / video_fps + + # Compute the total number of frames to extract + required_frames = int(total_duration * self.fps) + + # Compute the sampling interval between extracted frames + step_size = video_fps / self.fps + + # Compute the indices of the frames to extract + indices = [int(i * step_size) for i in range(required_frames)] + + # Extract the frames and save them + frame_paths = self.frame_paths_fps(video, len(indices)) + flag = np.all([osp.exists(p) for p in frame_paths]) + if flag: + return frame_paths + + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + return frame_paths + + else: + frame_paths = self.frame_paths(video) + flag = np.all([osp.exists(p) for p in frame_paths]) + if flag: + return frame_paths + vid_path = osp.join(self.data_root, video + '.mp4') + vid = decord.VideoReader(vid_path) + step_size = len(vid) / (self.nframe + 1) + indices = [int(i * step_size) for i in range(1, self.nframe + 1)] + images = [vid[i].asnumpy() for i in indices] + images = [Image.fromarray(arr) for arr in images] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) return frame_paths - vid_path = osp.join(self.data_root, video + '.mp4') - vid = decord.VideoReader(vid_path) - step_size = len(vid) / (num_frames + 1) - indices = [int(i * step_size) for i in range(1, num_frames + 1)] - images = [vid[i].numpy() for i in indices] - images = [Image.fromarray(arr) for arr in images] - for im, pth in zip(images, frame_paths): - if not osp.exists(pth): - im.save(pth) - return frame_paths # Return a list of dataset names that are supported by this class, can override @classmethod def supported_datasets(cls): - return ['MMBench-Video', 'Video-MME', 'MVBench'] + return ['MMBench-Video', 'Video-MME', 'MVBench', 'MVBench_MP4', 'LongVideoBench'] # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe @abstractmethod @@ -76,7 +124,7 @@ class VideoBaseDataset: pass @abstractmethod - def build_prompt(self, idx, num_frames=8): + def build_prompt(self, idx): pass @abstractmethod diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/video_concat_dataset.py b/eval_mm/vlmevalkit/vlmeval/dataset/video_concat_dataset.py new file mode 100644 index 0000000..dab1ae1 --- 
/dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/video_concat_dataset.py @@ -0,0 +1,85 @@ +from ..smp import * +from .video_base import VideoBaseDataset + + +class ConcatVideoDataset(VideoBaseDataset): + # This dataset takes multiple dataset names as input and aggregates them into a single dataset. + # Each single dataset should not have a field named `SUB_DATASET` + + DATASET_SETS = {} + + def __init__(self, dataset, **kwargs): + from . import build_dataset + datasets = self.DATASET_SETS[dataset] + self.dataset_map = {} + # The name of the compilation + self.dataset_name = dataset + self.datasets = datasets + self.nframe = kwargs.get('nframe', 0) + self.fps = kwargs.get('fps', -1) + for dname in datasets: + dataset = build_dataset(dname, **kwargs) + assert dataset is not None, dataset + self.dataset_map[dname] = dataset + TYPES = [x.TYPE for x in self.dataset_map.values()] + MODALITIES = [x.MODALITY for x in self.dataset_map.values()] + # assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES) + assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES) + self.TYPE = TYPES + self.MODALITY = MODALITIES[0] + data_all = [] + for dname in datasets: + data = self.dataset_map[dname].data + data['SUB_DATASET'] = [dname] * len(data) + data_all.append(data) + + data = pd.concat(data_all) + data['original_index'] = data.pop('index') + data['index'] = np.arange(len(data)) + self.data = data + + def build_prompt(self, line, video_llm): + if isinstance(line, int): + line = self.data.iloc[line] + idx = line['original_index'] + dname = line['SUB_DATASET'] + org_data = self.dataset_map[dname].data + org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0] + return self.dataset_map[dname].build_prompt(org_line, video_llm) + + def dump_image(self, line): + # Assert all images are pre-dumped + assert 'image' not in line + assert 'image_path' in line + tgt_path = toliststr(line['image_path']) + return tgt_path + + @classmethod + def supported_datasets(cls): + return [] # list(cls.DATASET_SETS) + + def evaluate(self, eval_file, **judge_kwargs): + suffix = eval_file.split('.')[-1] + # First, split the eval_file by dataset + data_all = load(eval_file) + for dname in self.datasets: + tgt = eval_file.replace(self.dataset_name, dname) + data_sub = data_all[data_all['SUB_DATASET'] == dname] + data_sub.pop('index') + data_sub['index'] = data_sub.pop('original_index') + data_sub.pop('SUB_DATASET') + dump(data_sub, tgt) + # Then, evaluate each dataset separately + results_all = {} + for dname in self.datasets: + tgt = eval_file.replace(self.dataset_name, dname) + res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs) + results_all.update(res) + + result = pd.DataFrame(results_all, index=['success', 'overall']) + result = result.T + for idx, item in result.iterrows(): + result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1) + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + dump(result, score_file) + return result diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/video_dataset_config.py b/eval_mm/vlmevalkit/vlmeval/dataset/video_dataset_config.py new file mode 100644 index 0000000..ee7ddd6 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/video_dataset_config.py @@ -0,0 +1,103 @@ +from vlmeval.dataset import * +from functools import partial + +mmbench_video_dataset = { + 'MMBench_Video_8frame_nopack': partial(MMBenchVideo, dataset='MMBench-Video', nframe=8, pack=False), + 'MMBench_Video_8frame_pack': partial(MMBenchVideo, 
dataset='MMBench-Video', nframe=8, pack=True), + 'MMBench_Video_16frame_nopack': partial(MMBenchVideo, dataset='MMBench-Video', nframe=16, pack=False), + 'MMBench_Video_1fps_nopack': partial(MMBenchVideo, dataset='MMBench-Video', fps=1.0, pack=False), + 'MMBench_Video_1fps_pack': partial(MMBenchVideo, dataset='MMBench-Video', fps=1.0, pack=True) +} + +mvbench_dataset = { + 'MVBench_8frame': partial(MVBench, dataset='MVBench', nframe=8), + # MVBench does not support fps, but MVBench_MP4 does + 'MVBench_MP4_8frame': partial(MVBench_MP4, dataset='MVBench_MP4', nframe=8), + 'MVBench_MP4_1fps': partial(MVBench_MP4, dataset='MVBench_MP4', fps=1.0), +} + +videomme_dataset = { + 'Video-MME_8frame': partial(VideoMME, dataset='Video-MME', nframe=8), + 'Video-MME_8frame_subs': partial(VideoMME, dataset='Video-MME', nframe=8, use_subtitle=True), + 'Video-MME_1fps': partial(VideoMME, dataset='Video-MME', fps=1.0), + 'Video-MME_0.5fps': partial(VideoMME, dataset='Video-MME', fps=0.5), + 'Video-MME_0.5fps_subs': partial(VideoMME, dataset='Video-MME', fps=0.5, use_subtitle=True), +} + +longvideobench_dataset = { + 'LongVideoBench_8frame': partial(LongVideoBench, dataset='LongVideoBench', nframe=8), + 'LongVideoBench_8frame_subs': partial(LongVideoBench, dataset='LongVideoBench', nframe=8, use_subtitle=True), + 'LongVideoBench_1fps': partial(LongVideoBench, dataset='LongVideoBench', fps=1.0), + 'LongVideoBench_0.5fps': partial(LongVideoBench, dataset='LongVideoBench', fps=0.5), + 'LongVideoBench_0.5fps_subs': partial(LongVideoBench, dataset='LongVideoBench', fps=0.5, use_subtitle=True) +} + +mlvu_dataset = { + 'MLVU_8frame': partial(MLVU, dataset='MLVU', nframe=8), + 'MLVU_1fps': partial(MLVU, dataset='MLVU', fps=1.0) +} + +tempcompass_dataset = { + 'TempCompass_8frame': partial(TempCompass, dataset='TempCompass', nframe=8), + 'TempCompass_1fps': partial(TempCompass, dataset='TempCompass', fps=1.0), + 'TempCompass_0.5fps': partial(TempCompass, dataset='TempCompass', fps=0.5) +} + +# To reproduce the experimental results in the CG-Bench paper, +# use_subtitle, use_subtitle_time and use_frame_time need to be set to True. +# When measuring clue-related results, if the number of frames used is greater +# than 32, the frame capture limit will be set to 32. 
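+# Illustrative usage (an editorial sketch, not part of the original patch, assuming the
+# underlying benchmark files are available locally): every entry in these dicts is a
+# zero-argument factory, e.g. supported_video_datasets['CGBench_MCQ_Grounding_32frame_subs']()
+# should build the CG-Bench MCQ/Grounding dataset with nframe=32 and use_subtitle=True,
+# matching the 32-frame cap described above.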
+cgbench_dataset = { + 'CGBench_MCQ_Grounding_Mini_8frame_subs_subt': partial( + CGBench_MCQ_Grounding_Mini, + dataset='CG-Bench_MCQ_Grounding_Mini', + nframe=8, + use_subtitle=True, + use_subtitle_time=True + ), + 'CGBench_OpenEnded_Mini_8frame_subs_subt_ft': partial( + CGBench_OpenEnded_Mini, + dataset='CG-Bench_OpenEnded_Mini', + nframe=8, + use_subtitle=True, + use_subtitle_time=True, + use_frame_time=True + ), + 'CGBench_MCQ_Grounding_32frame_subs': partial( + CGBench_MCQ_Grounding, + dataset='CG-Bench_MCQ_Grounding', + nframe=32, + use_subtitle=True + ), + 'CGBench_OpenEnded_8frame': partial( + CGBench_OpenEnded, + dataset='CG-Bench_OpenEnded', + nframe=8 + ), + 'CGBench_MCQ_Grounding_16frame_subs_subt_ft': partial( + CGBench_MCQ_Grounding, + dataset='CG-Bench_MCQ_Grounding', + nframe=16, + use_subtitle=True, + use_subtitle_time=True, + use_frame_time=True + ), + 'CGBench_OpenEnded_16frame_subs_subt_ft': partial( + CGBench_OpenEnded, + dataset='CG-Bench_OpenEnded', + nframe=16, + use_subtitle=True, + use_subtitle_time=True, + use_frame_time=True + ) +} + +supported_video_datasets = {} + +dataset_groups = [ + mmbench_video_dataset, mvbench_dataset, videomme_dataset, longvideobench_dataset, + mlvu_dataset, tempcompass_dataset, cgbench_dataset +] + +for grp in dataset_groups: + supported_video_datasets.update(grp) diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/videomme.py b/eval_mm/vlmevalkit/vlmeval/dataset/videomme.py index 0b7f63b..afb2c1b 100644 --- a/eval_mm/vlmevalkit/vlmeval/dataset/videomme.py +++ b/eval_mm/vlmevalkit/vlmeval/dataset/videomme.py @@ -1,6 +1,7 @@ from huggingface_hub import snapshot_download from ..smp import * from .video_base import VideoBaseDataset +from .utils import build_judge, DEBUG_MESSAGE FAIL_MSG = 'Failed to obtain answer via API.' @@ -28,7 +29,7 @@ def unwrap_hf_pkl(pth, suffix='.mp4'): class VideoMME(VideoBaseDataset): - MD5 = '2f16cd40b1c125b67e661e59da2f6cd0' + MD5 = '85bdd91f9b29a99354c23b97ab7c113c' SYS = '' FRAMES_TMPL_NOSUB = """ @@ -45,11 +46,12 @@ Select the best answer to the following multiple-choice question based on the vi Respond with only the letter (A, B, C, or D) of the correct option. """ - TYPE = 'MCQ' + TYPE = 'Video-MCQ' - def __init__(self, dataset='Video-MME', use_subtitle=False): - super().__init__(dataset=dataset) + def __init__(self, dataset='Video-MME', use_subtitle=False, nframe=0, fps=-1): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) self.use_subtitle = use_subtitle + self.dataset_name = dataset @classmethod def supported_datasets(cls): @@ -131,14 +133,18 @@ Respond with only the letter (A, B, C, or D) of the correct option. 
data_file['video'] = data_file['videoID'] data_file['video_path'] = data_file['videoID'].apply(lambda x: f'./video/{x}.mp4') data_file['subtitle_path'] = data_file['videoID'].apply(lambda x: f'./subtitle/{x}.srt') - data_file['question'] += '\n' + data_file['options'].apply(lambda x: '\n'.join(x)) + data_file['candidates'] = data_file['options'].apply(lambda x: x.tolist()) - data_file = data_file[['index', 'video', 'video_path', 'duration', 'domain', + data_file = data_file[['index', 'video', 'video_path', 'duration', 'domain', 'candidates', 'sub_category', 'task_type', 'subtitle_path', 'question', 'answer']] data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False) - dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') unzip_hf_zip(dataset_path) generate_tsv(dataset_path) @@ -146,36 +152,43 @@ Respond with only the letter (A, B, C, or D) of the correct option. return dict(data_file=data_file, root=dataset_path) - def save_video_frames(self, video, num_frames=8): + def save_video_frames(self, video, video_llm=False): vid_path = osp.join(self.data_root, 'video', video + '.mp4') vid = decord.VideoReader(vid_path) - step_size = len(vid) / (num_frames + 1) - indices = [int(i * step_size) for i in range(1, num_frames + 1)] - video_info = { 'fps': vid.get_avg_fps(), 'n_frames': len(vid), } + if self.nframe > 0 and self.fps < 0: + step_size = len(vid) / (self.nframe + 1) + indices = [int(i * step_size) for i in range(1, self.nframe + 1)] + frame_paths = self.frame_paths(video) + elif self.fps > 0: + # not constrained by num_frames, get frames by fps + total_duration = video_info['n_frames'] / video_info['fps'] + required_frames = int(total_duration * self.fps) + step_size = video_info['fps'] / self.fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(video, len(indices)) - frame_paths = self.frame_paths(video, num_frames) flag = np.all([osp.exists(p) for p in frame_paths]) if not flag: - images = [vid[i].numpy() for i in indices] + images = [vid[i].asnumpy() for i in indices] images = [Image.fromarray(arr) for arr in images] for im, pth in zip(images, frame_paths): - if not osp.exists(pth): + if not osp.exists(pth) and not video_llm: im.save(pth) return frame_paths, indices, video_info - def build_prompt(self, line, num_frames, video_llm): + def build_prompt(self, line, video_llm): if isinstance(line, int): assert line < len(self) line = self.data.iloc[line] - frames, indices, video_info = self.save_video_frames(line['video'], num_frames) + frames, indices, video_info = self.save_video_frames(line['video'], video_llm) if self.use_subtitle and os.path.exists(osp.join(self.data_root, line['subtitle_path'])): import pysubs2 @@ -204,6 +217,7 @@ Respond with only the letter (A, B, C, or D) of the correct option. text_prompt = self.FRAMES_TMPL_NOSUB if not self.use_subtitle else self.FRAMES_TMPL_SUB.format(subtitles) message.append(dict(type='text', value=text_prompt)) + line['question'] += '\n' + '\n'.join(eval(line['candidates'])) prompt = 'Question: {}\nAnswer: '.format(line['question']) message.append(dict(type='text', value=prompt)) return message @@ -211,7 +225,7 @@ Respond with only the letter (A, B, C, or D) of the correct option. 
# It returns a dictionary @classmethod def evaluate(self, eval_file, **judge_kwargs): - from .utils.videomme import get_dimension_rating, extract_characters_regex + from .utils.videomme import get_dimension_rating, extract_characters_regex, extract_option assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' @@ -220,6 +234,20 @@ Respond with only the letter (A, B, C, or D) of the correct option. score_file = eval_file.replace('.xlsx', '_score.xlsx') if not osp.exists(score_file): + model = judge_kwargs.get('model', 'exact_matching') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None res = {} if not osp.exists(tmp_file) else load(tmp_file) res = {k: v for k, v in res.items() if FAIL_MSG not in v} @@ -228,10 +256,15 @@ Respond with only the letter (A, B, C, or D) of the correct option. for idx in data['index']: ans = data.loc[data['index'] == idx, 'answer'].values[0] - pred = data.loc[data['index'] == idx, 'prediction'].values[0] + pred = str(data.loc[data['index'] == idx, 'prediction'].values[0]) if extract_characters_regex(pred) == '': - data.loc[idx, 'score'] = -1 + extract_pred = extract_option( + model, + data.loc[data['index'] == idx].to_dict(orient='records')[0], + 'Video-MME' + ) + data.loc[idx, 'score'] = int(extract_pred == ans) else: data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans) diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/vl_rewardbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/vl_rewardbench.py new file mode 100644 index 0000000..d8dad73 --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/vl_rewardbench.py @@ -0,0 +1,174 @@ +from ast import literal_eval + +from .image_base import ImageBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from ..smp import * +from ..utils import track_progress_rich + + +LLM_PARSE_ANSWER_PROMPT = ''' +You are given a pairwise judgement for two responses. Please return the better response according to the judgement. +Return the Answer X ONLY. e.g., Answer 1 or Answer 2. + +Judgement: {judgement} +''' + + +PROMPT_TEMPLATE = '''\ +You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions. +Please analyze the following image and question, then determine which of the two provided answers is better. + +Question: {query} + +Answer 1: {answer_0} + +Answer 2: {answer_1} + +Please evaluate both answers based on the following criteria: +1. Accuracy: How well does the answer align with the visual information in the image? +2. Completeness: Does the answer fully address all aspects of the question? +3. Clarity: Is the answer easy to understand and well-articulated? +4. Relevance: Does the answer directly relate to the question and the image? + +After your evaluation, please: +1. Explain your reasoning for each criterion. +2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2).\ +For example: Overall Judgment: Answer X is better. 
+ +Your response should be structured and detailed, \ +demonstrating your understanding of both the visual and textual elements of the task.''' + + +def get_score(line, parsed_response, random_number): + gt_ans = line['human_ranking'].index(0 if random_number == 0 else 1) + 1 + if 'Answer 1'.lower() in parsed_response.lower(): + pred = 1 + elif 'Answer 2'.lower() in parsed_response.lower(): + pred = 2 + else: # failed + pred = 'None' # random.choice([1, 2]) + + if pred == gt_ans: + return 1.0 + else: + return 0.0 + + +def VLRewardBench_eval_answer(model, line): + response = toliststr(line['response']) + random_number = sum(len(res) for res in response) % 2 + + prompt = LLM_PARSE_ANSWER_PROMPT.format(judgement=line['prediction']) + messages = [dict(type='text', value=prompt)] + + resp = model.generate(messages) + score = get_score(line, resp, random_number) + + if score is None: + return 'Unknown' + return score + + +class VLRewardBench(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'VL-RewardBench': 'https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv' + } + DATASET_MD5 = {'VL-RewardBench': '1d2676f4ab4a5f755019ec0af2b28189'} + + # Given one data record, return the built prompt (a multi-modal message), can override + def build_prompt(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + tgt_path = self.dump_image(line) # save image to local + question = line['question'] + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + + response = toliststr(line['response']) + random_number = sum(len(res) for res in response) % 2 + if random_number == 1: + # randomly shuffle the order of the responses + response = response[::-1] + query_prompt = PROMPT_TEMPLATE.format( + query=question, answer_0=response[0], answer_1=response[1] + ) + msgs = msgs + [dict(type='text', value=query_prompt)] + return msgs + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + raw_data = VLRewardBench('VL-RewardBench').data + data = load(eval_file) + data['prediction'] = [str(x) for x in data['prediction']] + data['human_ranking'] = [literal_eval(x) for x in raw_data['answer']] + + judge_kwargs['temperature'] = 0 + judge_kwargs['timeout'] = 60 + model = build_judge(max_tokens=128, **judge_kwargs) + + assert model.working(), ( + 'VLRewardBench evaluation requires a working OPENAI API\n' + + DEBUG_MESSAGE + ) + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = load(tmp_file) if osp.exists(tmp_file) else {} + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + VLRewardBench_eval_answer, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + ans[k] = v + + data['score'] = [ans[idx] for idx in data['index']] + # data.pop('image') + dump(data, storage) + + data = load(storage) + 
lt = len(data) + + category_scores = defaultdict(lambda: 0) + category_cnt = defaultdict(lambda: 0) + scores = defaultdict(lambda: 0) + for i in range(lt): + item = data.iloc[i] + category_scores[item['category']] += item['score'] + category_cnt[item['category']] += 1 + # calculate the average score for each category + for k, v in category_scores.items(): + scores[k] = v / category_cnt[k] + # calculate category macro accuracy (average across categories) + scores['Macro Accuracy'] = sum(scores.values()) / len(scores) + # calculate the total average score + scores['Overall Consistency'] = sum(category_scores.values()) / lt + + scores = {k: [v] for k, v in scores.items()} + scores = pd.DataFrame(scores) + dump(scores, score_file) + return scores diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/wildvision.py b/eval_mm/vlmevalkit/vlmeval/dataset/wildvision.py new file mode 100644 index 0000000..b1ad1fd --- /dev/null +++ b/eval_mm/vlmevalkit/vlmeval/dataset/wildvision.py @@ -0,0 +1,222 @@ +import re +from functools import partial + +from .image_base import ImageBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from ..smp import * +from ..utils import track_progress_rich + + +SYSTEM_PROMPT = """\ +Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user \ +prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate \ +which assistant's answer is better. + +Begin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any \ +answers. + +When evaluating the assistants' answers, compare both assistants' answers with your answer. \ +You must identify and correct any mistakes or inaccurate information. + +Then consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly \ +responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one \ +interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than \ +providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate \ +to what is being asked. Concise means the response is clear and not verbose or excessive. + +Then consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing \ +important information in the assistants' answers that would be beneficial to include when responding to the user \ +prompt. + +After providing your explanation, you must output only one of the following choices as your final verdict with a label: + +1. Assistant A is significantly better: [[A>>B]] +2. Assistant A is slightly better: [[A>B]] +3. Tie, relatively the same: [[A=B]] +4. Assistant B is slightly better: [[B>A]] +5. 
Assistant B is significantly better: [[B>>A]] + +Example output: "My final verdict is tie: [[A=B]]".\ +""" + + +PROMPT_TEMPLATE = """\ +"<|User Prompt|>\n{question} + +<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|> + +<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|> +""" + + +REGEX_PATTERN = re.compile("\[\[([AB<>=]+)\]\]") # noqa: W605 + + +def get_score(judgement, pattern=REGEX_PATTERN): + matches = pattern.findall(judgement) + matches = [m for m in matches if m != ""] + if len(set(matches)) == 0: + return None, True + elif len(set(matches)) == 1: + return matches[0].strip("\n"), False + else: + return None, True + + +def WildVision_auxeval(model, line): + config = dict(question=line['question'], answer_1=line['A'], answer_2=line['B']) + prompt = PROMPT_TEMPLATE.format(**config) + + prefix = 'data:image/jpeg;base64,' + img = prefix + line['image'] + + messages = [ + dict(type='text', value=prompt), + dict(type='image', value=img) + ] + + retry = 2 + while retry: + resp = model.generate(messages) + score, try_again = get_score(resp) + if not try_again: + break + retry -= 1 + + if score is None: + return 'Unknown' + return score + + +class WildVision(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'WildVision': 'https://opencompass.openxlab.space/utils/VLMEval/WildVision.tsv' + } + DATASET_MD5 = {'WildVision': 'b38f80156d49411c594772866b0d0b52'} + + score_map = { + 'A>>B': -2, + 'A>B': -1, + 'A=B': 0, + 'B>A': 1, + 'B>>A': 2 + } + + # Given one data record, return the built prompt (a multi-modal message), can override + def build_prompt(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + + if self.meta_only: + tgt_path = toliststr(line['image_path']) + else: + tgt_path = self.dump_image(line) + + question = line['question'] + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + # WildVision adopts text first + msgs = [dict(type='text', value=question)] + msgs + return msgs + + @classmethod + def gen_eval_base(self, eval_file, b64_map): + data = load(eval_file) + data['B'] = data.pop('prediction') + data['A'] = data.pop('claude3_sonnet') + data['image'] = [b64_map[x] for x in data['index']] + return data + # rev = cp.deepcopy(data) + # rev['A'] = data['B'] + # rev['B'] = data['A'] + # rev['index'] = [x + '_rev' for x in data['index']] + # return pd.concat([data, rev], ignore_index=True) + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + # We adopt pairwise evaluation (twice for a pair) for this dataset + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + raw_data = WildVision('WildVision').data + b64_map = {x: y for x, y in zip(raw_data['index'], raw_data['image'])} + data = self.gen_eval_base(eval_file, b64_map) + + judge_kwargs['system_prompt'] = SYSTEM_PROMPT + judge_kwargs['temperature'] = 0 + judge_kwargs['img_detail'] = 'high' + judge_kwargs['timeout'] = 300 + model = build_judge(max_tokens=4096, **judge_kwargs) + + assert model.working(), ('WildVision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + + lt = len(data) + lines = 
[data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = load(tmp_file) if osp.exists(tmp_file) else {} + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + WildVision_auxeval, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + ans[k] = v + + data['score'] = [ans[idx] for idx in data['index']] + data.pop('image') + dump(data, storage) + + data = load(storage) + lt = len(data) + + scores = defaultdict(lambda: 0) + for i in range(lt): + item = data.iloc[i] + if item['score'] not in self.score_map: + score = 0 + else: + score = self.score_map[item['score']] + if '_rev' in item['index']: + score = -score + scores[score] += 1 + name_map = { + 2: 'Much Better', + 1: 'Better', + 0: 'Tie', + -1: 'Worse', + -2: 'Much Worse' + } + scores = {name_map[k]: v for k, v in scores.items()} + much_better = scores.get('Much Better', 0) + better = scores.get('Better', 0) + worse = scores.get('Worse', 0) + much_worse = scores.get('Much Worse', 0) + scores['Reward'] = ( + 100 * much_better + 50 * better - 50 * worse - 100 * much_worse + ) / lt + scores['Win Rate'] = (better + much_better) / lt + scores = {k: [v] for k, v in scores.items()} + scores = pd.DataFrame(scores) + dump(scores, score_file) + return scores diff --git a/eval_mm/vlmevalkit/vlmeval/inference.py b/eval_mm/vlmevalkit/vlmeval/inference.py index b45fba9..33839b4 100644 --- a/eval_mm/vlmevalkit/vlmeval/inference.py +++ b/eval_mm/vlmevalkit/vlmeval/inference.py @@ -18,7 +18,7 @@ def parse_args(): # Only API model is accepted -def infer_data_api(work_dir, model_name, dataset, index_set=None, api_nproc=4, ignore_failed=False): +def infer_data_api(model, work_dir, model_name, dataset, index_set=None, api_nproc=4, ignore_failed=False): rank, world_size = get_rank_and_world_size() assert rank == 0 and world_size == 1 dataset_name = dataset.dataset_name @@ -26,11 +26,24 @@ def infer_data_api(work_dir, model_name, dataset, index_set=None, api_nproc=4, i if index_set is not None: data = data[data['index'].isin(index_set)] - model = supported_VLM[model_name]() if isinstance(model_name, str) else model_name + model = supported_VLM[model_name]() if isinstance(model, str) else model assert getattr(model, 'is_api', False) + if hasattr(model, 'set_dump_image'): + model.set_dump_image(dataset.dump_image) lt, indices = len(data), list(data['index']) - structs = [dataset.build_prompt(data.iloc[i]) for i in range(lt)] + + structs = [] + for i in range(lt): + item = data.iloc[i] + if hasattr(model, 'use_custom_prompt') and model.use_custom_prompt(dataset_name): + assert hasattr(model, 'build_prompt') + struct = model.build_prompt(item, dataset=dataset_name) + else: + struct = dataset.build_prompt(item) + structs.append(struct) + + # structs = [dataset.build_prompt(data.iloc[i]) for i in range(lt)] out_file = f'{work_dir}/{model_name}_{dataset_name}_supp.pkl' res = {} @@ -55,7 +68,7 @@ def infer_data_api(work_dir, model_name, dataset, index_set=None, api_nproc=4, i return res -def infer_data(model_name, work_dir, dataset, out_file, verbose=False, api_nproc=4): +def infer_data(model, model_name, work_dir, dataset, out_file, verbose=False, api_nproc=4): dataset_name = dataset.dataset_name prev_file = f'{work_dir}/{model_name}_{dataset_name}_PREV.pkl' res = load(prev_file) if 
osp.exists(prev_file) else {} @@ -83,12 +96,13 @@ def infer_data(model_name, work_dir, dataset, out_file, verbose=False, api_nproc data = data[~data['index'].isin(res)] lt = len(data) - model = supported_VLM[model_name]() if isinstance(model_name, str) else model_name + model = supported_VLM[model_name]() if isinstance(model, str) else model is_api = getattr(model, 'is_api', False) if is_api: lt, indices = len(data), list(data['index']) supp = infer_data_api( + model=model, work_dir=work_dir, model_name=model_name, dataset=dataset, @@ -99,7 +113,7 @@ def infer_data(model_name, work_dir, dataset, out_file, verbose=False, api_nproc res.update(supp) res = {k: res[k] for k in data_indices} dump(res, out_file) - return model_name + return model else: model.set_dump_image(dataset.dump_image) @@ -120,7 +134,7 @@ def infer_data(model_name, work_dir, dataset, out_file, verbose=False, api_nproc print(response, flush=True) res[idx] = response - if (i + 1) % 20 == 0: + if (i + 1) % 10 == 0: dump(res, out_file) res = {k: res[k] for k in data_indices} @@ -149,7 +163,8 @@ def infer_data_job(model, work_dir, model_name, dataset, verbose=False, api_npro out_file = tmpl.format(rank) model = infer_data( - model, work_dir=work_dir, dataset=dataset, out_file=out_file, verbose=verbose, api_nproc=api_nproc) + model=model, work_dir=work_dir, model_name=model_name, dataset=dataset, + out_file=out_file, verbose=verbose, api_nproc=api_nproc) if world_size > 1: dist.barrier() @@ -168,4 +183,6 @@ def infer_data_job(model, work_dir, model_name, dataset, verbose=False, api_npro dump(data, result_file) for i in range(world_size): os.remove(tmpl.format(i)) + if world_size > 1: + dist.barrier() return model diff --git a/eval_mm/vlmevalkit/vlmeval/inference_mt.py b/eval_mm/vlmevalkit/vlmeval/inference_mt.py index 0de9ed6..976e79a 100644 --- a/eval_mm/vlmevalkit/vlmeval/inference_mt.py +++ b/eval_mm/vlmevalkit/vlmeval/inference_mt.py @@ -29,15 +29,15 @@ def chat_mt(model, messages, dataset_name): try: resp = model.chat(utter_stack, dataset=dataset_name) utter_stack.append(dict(role='assistant', content=resp)) - except: - resp = FAIL_MSG + except Exception as e: + resp = FAIL_MSG + str(e) utter_stack.append(dict(role='assistant', content=resp)) predictions.append(resp) return predictions # Only API model is accepted -def infer_data_api(work_dir, model_name, dataset, index_set=None, api_nproc=4, ignore_failed=False): +def infer_data_api(model, work_dir, model_name, dataset, index_set=None, api_nproc=4, ignore_failed=False): rank, world_size = get_rank_and_world_size() assert rank == 0 and world_size == 1 dataset_name = dataset.dataset_name @@ -45,7 +45,7 @@ def infer_data_api(work_dir, model_name, dataset, index_set=None, api_nproc=4, i if index_set is not None: data = data[data['index'].isin(index_set)] - model = supported_VLM[model_name]() if isinstance(model_name, str) else model_name + model = supported_VLM[model_name]() if isinstance(model, str) else model assert getattr(model, 'is_api', False) assert hasattr(model, 'chat_inner') @@ -74,7 +74,7 @@ def infer_data_api(work_dir, model_name, dataset, index_set=None, api_nproc=4, i return res -def infer_data(model_name, work_dir, dataset, out_file, verbose=False, api_nproc=4): +def infer_data(model, model_name, work_dir, dataset, out_file, verbose=False, api_nproc=4): dataset_name = dataset.dataset_name res = {} if osp.exists(out_file): @@ -101,13 +101,14 @@ def infer_data(model_name, work_dir, dataset, out_file, verbose=False, api_nproc data = data[~data['index'].isin(res)] 
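    # (Editorial, hedged note, not part of the original patch: in this revision
    # `infer_data` and `infer_data_api` receive a `model` object alongside
    # `model_name`, and `supported_VLM[model_name]()` is only called when a plain
    # string is passed, which appears intended to let an already-instantiated
    # model be reused rather than rebuilt for every dataset.)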
lt = len(data) - model = supported_VLM[model_name]() if isinstance(model_name, str) else model_name + model = supported_VLM[model_name]() if isinstance(model, str) else model assert hasattr(model, 'chat_inner') is_api = getattr(model, 'is_api', False) if is_api: lt, indices = len(data), list(data['index']) supp = infer_data_api( + model=model, work_dir=work_dir, model_name=model_name, dataset=dataset, @@ -118,7 +119,7 @@ def infer_data(model_name, work_dir, dataset, out_file, verbose=False, api_nproc res.update(supp) res = {k: res[k] for k in data_indices} dump(res, out_file) - return model_name + return model else: model.set_dump_image(dataset.dump_image) @@ -157,7 +158,8 @@ def infer_data_job_mt(model, work_dir, model_name, dataset, verbose=False, api_n out_file = tmpl.format(rank) model = infer_data( - model, work_dir=work_dir, dataset=dataset, out_file=out_file, verbose=verbose, api_nproc=api_nproc) + model=model, model_name=model_name,work_dir=work_dir, dataset=dataset, + out_file=out_file, verbose=verbose, api_nproc=api_nproc) if world_size > 1: dist.barrier() diff --git a/eval_mm/vlmevalkit/vlmeval/inference_video.py b/eval_mm/vlmevalkit/vlmeval/inference_video.py index 05acb2a..a9f514c 100644 --- a/eval_mm/vlmevalkit/vlmeval/inference_video.py +++ b/eval_mm/vlmevalkit/vlmeval/inference_video.py @@ -18,23 +18,25 @@ def parse_args(): # Only API model is accepted -def infer_data_api(work_dir, model_name, dataset, nframe=8, pack=False, samples_dict={}, api_nproc=4): +def infer_data_api(model, work_dir, model_name, dataset, samples_dict={}, api_nproc=4): rank, world_size = get_rank_and_world_size() assert rank == 0 and world_size == 1 dataset_name = dataset.dataset_name - model = supported_VLM[model_name]() if isinstance(model_name, str) else model_name + model = supported_VLM[model_name]() if isinstance(model, str) else model assert getattr(model, 'is_api', False) indices = list(samples_dict.keys()) - structs = [dataset.build_prompt(samples_dict[idx], num_frames=nframe, - video_llm=getattr(model, 'VIDEO_LLM', False)) for idx in indices] + structs = [dataset.build_prompt(samples_dict[idx], video_llm=getattr(model, 'VIDEO_LLM', False)) for idx in indices] - packstr = 'pack' if pack else 'nopack' - out_file = f'{work_dir}/{model_name}_{dataset_name}_{nframe}frame_{packstr}_supp.pkl' + packstr = 'pack' if getattr(dataset, 'pack', False) else 'nopack' + if dataset.nframe > 0: + out_file = f'{work_dir}/{model_name}_{dataset_name}_{dataset.nframe}frame_{packstr}_supp.pkl' + else: + out_file = f'{work_dir}/{model_name}_{dataset_name}_{dataset.fps}fps_{packstr}_supp.pkl' res = load(out_file) if osp.exists(out_file) else {} - structs = [s for i, s in zip(indices, structs) if i not in res] - indices = [i for i in indices if i not in res] + structs = [s for i, s in zip(indices, structs) if i not in res or res[i] == FAIL_MSG] + indices = [i for i in indices if i not in res or res[i] == FAIL_MSG] gen_func = model.generate structs = [dict(message=struct, dataset=dataset_name) for struct in structs] @@ -46,46 +48,72 @@ def infer_data_api(work_dir, model_name, dataset, nframe=8, pack=False, samples_ return res -def infer_data(model_name, work_dir, dataset, out_file, nframe=8, pack=False, verbose=False, api_nproc=4): +def infer_data(model, model_name, work_dir, dataset, out_file, verbose=False, api_nproc=4): res = load(out_file) if osp.exists(out_file) else {} rank, world_size = get_rank_and_world_size() dataset_name = dataset.dataset_name - sample_indices = list(dataset.videos) if pack else 
list(dataset.data['index']) - samples = list(dataset.videos) if pack else list(range(len(dataset.data))) + sample_indices = list(dataset.videos) if getattr(dataset, 'pack', False) else list(dataset.data['index']) + samples = list(dataset.videos) if getattr(dataset, 'pack', False) else list(range(len(dataset.data))) sample_map = {i: s for i, s in zip(sample_indices, samples)} sample_indices_sub = sample_indices[rank::world_size] if np.all([idx in res for idx in sample_indices_sub]): - return model_name + return model sample_indices_subrem = [x for x in sample_indices_sub if x not in res] - model = supported_VLM[model_name]() if isinstance(model_name, str) else model_name + model = supported_VLM[model_name]() if isinstance(model, str) else model is_api = getattr(model, 'is_api', False) if is_api: assert world_size == 1 supp = infer_data_api( + model=model, work_dir=work_dir, model_name=model_name, dataset=dataset, - nframe=nframe, - pack=pack, samples_dict={k: sample_map[k] for k in sample_indices_subrem}, api_nproc=api_nproc) for k in sample_indices_subrem: assert k in supp res.update(supp) dump(res, out_file) - return model_name + return model + assert not getattr(dataset, 'pack', False), 'Current model not supported pack mode!' for i, idx in tqdm(enumerate(sample_indices_subrem)): if idx in res: continue - # adapt to model frame sample number first - nframe = getattr(model, 'nframe', 0) if getattr(model, 'nframe', 0) > 0 else nframe - # when using video-llm, build prompt returns video+question; otherwise, several frames+question - struct = dataset.build_prompt(sample_map[idx], num_frames=nframe, video_llm=getattr(model, 'VIDEO_LLM', False)) + if getattr(model, 'nframe', None) is not None and getattr(model, 'nframe', 0) > 0: + if dataset.nframe > 0: + if getattr(model, 'nframe', 0) != dataset.nframe: + print(f'{model_name} is a video-llm model, nframe is set to {dataset.nframe}, not using default') + setattr(model, 'nframe', dataset.nframe) + elif getattr(model, 'fps', 0) == 0: + raise ValueError(f'fps is not suitable for {model_name}') + else: + setattr(model, 'nframe', None) + if getattr(model, 'fps', None) is not None and getattr(model, 'fps', 0) > 0: + if dataset.fps > 0: + if getattr(model, 'fps', 0) != dataset.fps: + print(f'{model_name} is a video-llm model, fps is set to {dataset.fps}, not using default') + setattr(model, 'fps', dataset.fps) + elif getattr(model, 'nframe', 0) == 0: + raise ValueError(f'nframe is not suitable for {model_name}') + else: + setattr(model, 'fps', None) + if 'SUB_DATASET' in dataset.data.iloc[sample_map[idx]]: + dataset_name = dataset.data.iloc[sample_map[idx]]['SUB_DATASET'] + if hasattr(model, 'use_custom_prompt') and model.use_custom_prompt(dataset_name): + if dataset.nframe == 0: + raise ValueError(f'nframe must be set for custom prompt, fps is not suitable for {model_name}') + struct = model.build_prompt( + dataset.data.iloc[sample_map[idx]], dataset=dataset, video_llm=getattr(model, 'VIDEO_LLM', False) + ) + else: + struct = dataset.build_prompt( + sample_map[idx], video_llm=getattr(model, 'VIDEO_LLM', False) + ) response = model.generate(message=struct, dataset=dataset_name) torch.cuda.empty_cache() @@ -107,36 +135,25 @@ def infer_data_job_video( work_dir, model_name, dataset, - nframe=8, - pack=False, + result_file_name, verbose=False, - subtitle=False, api_nproc=4): dataset_name = dataset.dataset_name - packstr = 'pack' if pack else 'nopack' rank, world_size = get_rank_and_world_size() - result_file = osp.join(work_dir, 
f'{model_name}_{dataset_name}_{nframe}frame_{packstr}.xlsx') - if dataset_name == 'Video-MME': - subtitle_str = 'subs' if subtitle else 'nosubs' - result_file = result_file.replace('.xlsx', f'_{subtitle_str}.xlsx') - + result_file = osp.join(work_dir, result_file_name) # Dump Predictions to Prev File if result file exists if osp.exists(result_file): - return model_name + return model - tmpl = osp.join(work_dir, '{}' + f'{world_size}_{dataset_name}_{nframe}frame_{packstr}.pkl') - if dataset_name == 'Video-MME': - subtitle_str = 'subs' if subtitle else 'nosubs' - tmpl = tmpl.replace('.pkl', f'_{subtitle_str}.pkl') + tmpl = osp.join(work_dir, '{}' + f'{world_size}_{osp.splitext(result_file_name)[0]}.pkl') out_file = tmpl.format(rank) model = infer_data( - model, + model=model, + model_name=model_name, work_dir=work_dir, dataset=dataset, - nframe=nframe, - pack=pack, out_file=out_file, verbose=verbose, api_nproc=api_nproc) @@ -150,7 +167,7 @@ def infer_data_job_video( data_all.update(load(tmpl.format(i))) meta = dataset.data - if dataset_name == 'MMBench-Video' and pack: + if dataset_name == 'MMBench-Video' and getattr(dataset, 'pack', False): meta, vstats = dataset.load_pack_answers(data_all) print(f'Statitics of Pack Video Inference: {vstats}') else: diff --git a/eval_mm/vlmevalkit/vlmeval/smp/file.py b/eval_mm/vlmevalkit/vlmeval/smp/file.py index cb39d56..aa0ce80 100644 --- a/eval_mm/vlmevalkit/vlmeval/smp/file.py +++ b/eval_mm/vlmevalkit/vlmeval/smp/file.py @@ -74,6 +74,20 @@ def LMUDataRoot(): return root +def HFCacheRoot(): + cache_list = ['HUGGINGFACE_HUB_CACHE', 'HF_HOME'] + for cache_name in cache_list: + if cache_name in os.environ and osp.exists(os.environ[cache_name]): + if os.environ[cache_name].split('/')[-1] == 'hub': + return os.environ[cache_name] + else: + return osp.join(os.environ[cache_name], 'hub') + home = osp.expanduser('~') + root = osp.join(home, '.cache', 'huggingface', 'hub') + os.makedirs(root, exist_ok=True) + return root + + def MMBenchOfficialServer(dataset_name): root = LMUDataRoot() @@ -190,20 +204,20 @@ def download_file(url, filename=None): if filename is None: filename = url.split('/')[-1] - # If HF_ENDPOINT is set, replace huggingface.co with it - if 'huggingface.co' in url and os.environ.get('HF_ENDPOINT', '') != '': - url = url.replace('huggingface.co', os.environ['HF_ENDPOINT'].split('://')[1]) - try: with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t: urllib.request.urlretrieve(url, filename=filename, reporthook=t.update_to) - except: + except Exception as e: + import logging + logging.warning(f'{type(e)}: {e}') # Handle Failed Downloads from huggingface.co if 'huggingface.co' in url: url_new = url.replace('huggingface.co', 'hf-mirror.com') try: - os.system(f'wget {url_new} -O {filename}') - except: + download_file(url_new, filename) + return filename + except Exception as e: + logging.warning(f'{type(e)}: {e}') raise Exception(f'Failed to download {url}') else: raise Exception(f'Failed to download {url}') @@ -286,6 +300,18 @@ def parse_file(s): suffix = osp.splitext(s)[1].lower() mime = mimetypes.types_map.get(suffix, 'unknown') return (mime, s) + elif s.startswith('data:image/'): + # To be compatible with OPENAI base64 format + content = s[11:] + mime = content.split(';')[0] + content = ';'.join(content.split(';')[1:]) + dname = osp.join(LMUDataRoot(), 'files') + assert content.startswith('base64,') + b64 = content[7:] + os.makedirs(dname, exist_ok=True) + tgt = osp.join(dname, md5(b64) + '.png') + 
decode_base64_to_image_file(b64, tgt) + return parse_file(tgt) elif validators.url(s): suffix = osp.splitext(s)[1].lower() if suffix in mimetypes.types_map: diff --git a/eval_mm/vlmevalkit/vlmeval/smp/log.py b/eval_mm/vlmevalkit/vlmeval/smp/log.py index 95804d5..26e00b1 100644 --- a/eval_mm/vlmevalkit/vlmeval/smp/log.py +++ b/eval_mm/vlmevalkit/vlmeval/smp/log.py @@ -1,4 +1,7 @@ import logging +logging.basicConfig( + format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') logger_initialized = {} @@ -29,7 +32,7 @@ def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): handlers.append(file_handler) formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + '[%(asctime)s] %(levelname)s - %(name)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s') for handler in handlers: handler.setFormatter(formatter) handler.setLevel(log_level) diff --git a/eval_mm/vlmevalkit/vlmeval/smp/misc.py b/eval_mm/vlmevalkit/vlmeval/smp/misc.py index aad223d..d0d9e90 100644 --- a/eval_mm/vlmevalkit/vlmeval/smp/misc.py +++ b/eval_mm/vlmevalkit/vlmeval/smp/misc.py @@ -5,13 +5,13 @@ import csv import multiprocessing as mp import os import os.path as osp +from pathlib import Path import copy as cp import random as rd import requests import shutil import subprocess import warnings -import logging import pandas as pd from collections import OrderedDict, defaultdict from multiprocessing import Pool, current_process @@ -21,8 +21,14 @@ import matplotlib.pyplot as plt from tabulate import tabulate from json import JSONDecoder from huggingface_hub import scan_cache_dir +from huggingface_hub.utils._cache_manager import _scan_cached_repo from sty import fg, bg, ef, rs + +def modelscope_flag_set(): + return os.environ.get('VLMEVALKIT_USE_MODELSCOPE', None) in ['1', 'True'] + + def process_punctuation(inText): import re outText = inText @@ -71,26 +77,30 @@ def bincount(lst): bins[item] += 1 return bins -def get_cache_path(repo_id, branch=None): - hf_cache_info = scan_cache_dir() - repos = list(hf_cache_info.repos) - repo = None - for r in repos: - if r.repo_id == repo_id: - repo = r - break - if repo is None: +def get_cache_path(repo_id, branch='main', repo_type='datasets'): + try: + if modelscope_flag_set(): + from modelscope.hub.file_download import create_temporary_directory_and_cache + if repo_type == 'datasets': + repo_type = 'dataset' + _, cache = create_temporary_directory_and_cache(model_id=repo_id, repo_type=repo_type) + cache_path = cache.get_root_location() + return cache_path + else: + from .file import HFCacheRoot + cache_path = HFCacheRoot() + org, repo_name = repo_id.split('/') + repo_path = Path(osp.join(cache_path, f'{repo_type}--{org}--{repo_name}/')) + hf_cache_info = _scan_cached_repo(repo_path=repo_path) + revs = {r.refs: r for r in hf_cache_info.revisions} + if branch is not None: + revs = {refs: r for refs, r in revs.items() if branch in refs} + rev2keep = max(revs.values(), key=lambda r: r.last_modified) + return str(rev2keep.snapshot_path) + except Exception as e: + import logging + logging.warning(f'{type(e)}: {e}') return None - revs = list(repo.revisions) - if branch is not None: - revs = [r for r in revs if r.refs == frozenset({branch})] - rev2keep, last_modified = None, 0 - for rev in revs: - if rev.last_modified > last_modified: - rev2keep, last_modified = rev, rev.last_modified - if rev2keep is None: - return None - return str(rev2keep.snapshot_path) def proxy_set(s): import 
os @@ -126,14 +136,47 @@ try: except ImportError: pass -def timestr(second=True, minute=False): - s = datetime.datetime.now().strftime('%Y%m%d%H%M%S')[2:] - if second: +def timestr(granularity='second'): + s = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + assert granularity in ['second', 'minute', 'hour', 'day'] + if granularity == 'second': return s - elif minute: + elif granularity == 'minute': return s[:-2] - else: + elif granularity == 'hour': return s[:-4] + elif granularity == 'day': + return s[:-6] + +def _minimal_ext_cmd(cmd, cwd=None): + env = {} + for k in ['SYSTEMROOT', 'PATH', 'HOME']: + v = os.environ.get(k) + if v is not None: + env[k] = v + env['LANGUAGE'] = 'C' + env['LANG'] = 'C' + env['LC_ALL'] = 'C' + out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env, cwd=cwd).communicate()[0] + return out + +def githash(fallback='unknown', digits=8): + if digits is not None and not isinstance(digits, int): + raise TypeError('digits must be None or an integer') + try: + import vlmeval + except ImportError as e: + import logging + logging.error(f'ImportError: {str(e)}') + return fallback + try: + out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'], cwd=vlmeval.__path__[0]) + sha = out.strip().decode('ascii') + if digits is not None: + sha = sha[:digits] + except OSError: + sha = fallback + return sha def dict_merge(dct, merge_dct): for k, _ in merge_dct.items(): @@ -152,17 +195,21 @@ def run_command(cmd): return subprocess.check_output(cmd).decode() def load_env(): - logger = logging.getLogger('LOAD_ENV') + import logging + logging.basicConfig( + format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + try: import vlmeval except ImportError: - logger.error('VLMEval is not installed. Failed to import environment variables from .env file. ') + logging.error('VLMEval is not installed. Failed to import environment variables from .env file. ') return pth = osp.realpath(vlmeval.__path__[0]) pth = osp.join(pth, '../.env') pth = osp.realpath(pth) if not osp.exists(pth): - logger.error(f'Did not detect the .env file at {pth}, failed to load. ') + logging.error(f'Did not detect the .env file at {pth}, failed to load. 
') return from dotenv import dotenv_values @@ -170,7 +217,7 @@ def load_env(): for k, v in values.items(): if v is not None and len(v): os.environ[k] = v - logger.info(f'API Keys successfully loaded from {pth}') + logging.info(f'API Keys successfully loaded from {pth}') def pip_install_robust(package): import sys @@ -214,3 +261,31 @@ def extract_json_objects(text, decoder=JSONDecoder()): pos = match + index except ValueError: pos = match + 1 + + +def get_gpu_memory(): + import subprocess + try: + command = "nvidia-smi --query-gpu=memory.free --format=csv" + memory_free_info = subprocess.check_output(command.split()).decode('ascii').split('\n')[:-1][1:] + memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)] + return memory_free_values + except Exception as e: + print(f'{type(e)}: {str(e)}') + return [] + + +def auto_split_flag(): + flag = os.environ.get('AUTO_SPLIT', '0') + if flag == '1': + return True + _, world_size = get_rank_and_world_size() + try: + import torch + device_count = torch.cuda.device_count() + if device_count > world_size and device_count % world_size == 0: + return True + else: + return False + except: + return False diff --git a/eval_mm/vlmevalkit/vlmeval/smp/vlm.py b/eval_mm/vlmevalkit/vlmeval/smp/vlm.py index efa3821..f3dcdd7 100644 --- a/eval_mm/vlmevalkit/vlmeval/smp/vlm.py +++ b/eval_mm/vlmevalkit/vlmeval/smp/vlm.py @@ -79,7 +79,7 @@ def mmqa_display(question, target_size=512): print(f'{k.upper()}. {question[k]}') -def encode_image_to_base64(img, target_size=-1): +def encode_image_to_base64(img, target_size=-1, fmt='JPEG'): # if target_size == -1, will not do resizing # else, will set the max_size ot (target_size, target_size) if img.mode in ('RGBA', 'P'): @@ -87,7 +87,7 @@ def encode_image_to_base64(img, target_size=-1): if target_size > 0: img.thumbnail((target_size, target_size)) img_buffer = io.BytesIO() - img.save(img_buffer, format='JPEG') + img.save(img_buffer, format=fmt) image_data = img_buffer.getvalue() ret = base64.b64encode(image_data).decode('utf-8') return ret diff --git a/eval_mm/vlmevalkit/vlmeval/tools.py b/eval_mm/vlmevalkit/vlmeval/tools.py index d665f4a..51f2c3c 100644 --- a/eval_mm/vlmevalkit/vlmeval/tools.py +++ b/eval_mm/vlmevalkit/vlmeval/tools.py @@ -1,9 +1,10 @@ import sys +from vlmeval.dataset import SUPPORTED_DATASETS from vlmeval.config import * from vlmeval.smp import * # Define valid modes -MODES = ('dlist', 'mlist', 'missing', 'circular', 'localize', 'check', 'run', 'eval') +MODES = ('dlist', 'mlist', 'missing', 'circular', 'localize', 'check', 'run', 'eval', 'merge_pkl') CLI_HELP_MSG = \ f""" @@ -32,6 +33,8 @@ CLI_HELP_MSG = \ vlmutil run l2 hf 8. Evaluate data file: vlmutil eval [dataset_name] [prediction_file] + 9. 
Merge pkl files: + vlmutil merge_pkl [pkl_dir] [world_size] GitHub: https://github.com/open-compass/VLMEvalKit """ # noqa: E501 @@ -50,7 +53,8 @@ dataset_levels = { ('SEEDBench_IMG', 'acc.csv'), ('COCO_VAL', 'score.json'), ('POPE', 'score.csv'), ('ScienceQA_VAL', 'acc.csv'), ('ScienceQA_TEST', 'acc.csv'), ('MMT-Bench_VAL', 'acc.csv'), ('SEEDBench2_Plus', 'acc.csv'), ('BLINK', 'acc.csv'), ('MTVQA_TEST', 'acc.json'), - ('Q-Bench1_VAL', 'acc.csv'), ('A-Bench_VAL', 'acc.csv') + ('Q-Bench1_VAL', 'acc.csv'), ('A-Bench_VAL', 'acc.csv'), ('R-Bench-Dis', 'acc.csv'), + ('MathVision', 'score.csv'), ('MathVerse_MINI_Vision_Only', 'score.csv'), ('DynaMath', 'score.csv'), ], 'l3': [ ('OCRVQA_TESTCORE', 'acc.csv'), ('TextVQA_VAL', 'acc.csv'), @@ -64,38 +68,13 @@ dataset_levels['l23'] = dataset_levels['l2'] + dataset_levels['l3'] dataset_levels['l123'] = dataset_levels['l12'] + dataset_levels['l3'] models = { - '4.33.0': list(qwen_series) + list(xcomposer_series) + [ - 'mPLUG-Owl2', 'flamingov2', 'VisualGLM_6b', 'MMAlaya', 'PandaGPT_13B', 'VXVERSE' - ] + list(idefics_series) + list(minigpt4_series) + list(instructblip_series), - '4.37.0': [x for x in llava_series if 'next' not in x] + list(internvl_series) + [ - 'TransCore_M', 'emu2_chat', 'MiniCPM-V', 'MiniCPM-V-2', 'OmniLMM_12B', - 'cogvlm-grounding-generalist', 'cogvlm-chat', 'cogvlm2-llama3-chat-19B', - ] + list(xtuner_series) + list(yivl_series) + list(deepseekvl_series) + list(cambrian_series), - '4.40.0': [ - 'idefics2_8b', 'Bunny-llama3-8B', 'MiniCPM-Llama3-V-2_5', '360VL-70B', 'Phi-3-Vision', - ] + list(wemm_series), - 'latest': ['paligemma-3b-mix-448', 'MiniCPM-V-2_6', 'glm-4v-9b'] + [x for x in llava_series if 'next' in x] - + list(chameleon_series) + list(ovis_series) + list(mantis_series), - 'api': list(api_models) + '4.37.0': ['MiniCPM-V', 'MiniCPM-V-2'], + '4.40.0': ['MiniCPM-Llama3-V-2_5'], + 'latest': ['MiniCPM-V-2_6'] } # SKIP_MODELS will be skipped in report_missing and run APIs -SKIP_MODELS = [ - 'MGM_7B', 'GPT4V_HIGH', 'GPT4V', 'flamingov2', 'PandaGPT_13B', - 'GeminiProVision', 'Step1V-0701', 'SenseChat-5-Vision', - 'llava_v1_7b', 'sharegpt4v_7b', 'sharegpt4v_13b', - 'llava-v1.5-7b-xtuner', 'llava-v1.5-13b-xtuner', - 'cogvlm-grounding-generalist', 'InternVL-Chat-V1-1', - 'InternVL-Chat-V1-2', 'InternVL-Chat-V1-2-Plus', 'RekaCore', - 'llava_next_72b', 'llava_next_110b', 'MiniCPM-V', 'sharecaptioner', 'XComposer', - 'VisualGLM_6b', 'idefics_9b_instruct', 'idefics_80b_instruct', - 'mPLUG-Owl2', 'MMAlaya', 'OmniLMM_12B', 'emu2_chat', 'VXVERSE' -] + list(minigpt4_series) + list(instructblip_series) + list(xtuner_series) + list(chameleon_series) + list(vila_series) - -LARGE_MODELS = [ - 'idefics_80b_instruct', '360VL-70B', 'emu2_chat', 'InternVL2-76B', -] - +SKIP_MODELS = ['MiniCPM-V'] def completed(m, d, suf): score_file = f'outputs/{m}/{m}_{d}_{suf}' @@ -111,11 +90,18 @@ def completed(m, d, suf): def DLIST(lvl): - lst = [x[0] for x in dataset_levels[lvl]] - return lst + if lvl in dataset_levels.keys(): + return [x[0] for x in dataset_levels[lvl]] + else: + from vlmeval.dataset import SUPPORTED_DATASETS + return SUPPORTED_DATASETS def MLIST(lvl, size='all'): + if lvl == 'all': + from vlmeval.config import supported_VLM + return [x for x in supported_VLM] + model_list = models[lvl] if size == 'small': model_list = [m for m in model_list if m not in LARGE_MODELS] @@ -338,18 +324,24 @@ def RUN(lvl, model): os.system(cmd) -def EVAL(dataset_name, data_file): +def EVAL(dataset_name, data_file, **kwargs): from vlmeval.dataset import build_dataset 
logger = get_logger('VLMEvalKit Tool-Eval') dataset = build_dataset(dataset_name) # Set the judge kwargs first before evaluation or dumping judge_kwargs = {'nproc': 4, 'verbose': True} - if dataset.TYPE in ['MCQ', 'Y/N']: - judge_kwargs['model'] = 'chatgpt-0125' - elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'], dataset_name): - judge_kwargs['model'] = 'gpt-4-turbo' - elif listinstr(['MMLongBench', 'MMDU'], dataset_name): - judge_kwargs['model'] = 'gpt-4o' + if 'model' not in kwargs: + if dataset.TYPE in ['MCQ', 'Y/N']: + judge_kwargs['model'] = 'chatgpt-0125' + elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name): + judge_kwargs['model'] = 'gpt-4-turbo' + elif listinstr(['MMLongBench', 'MMDU'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['DynaMath', 'MathVerse', 'MathVista', 'MathVision'], dataset_name): + judge_kwargs['model'] = 'gpt-4o-mini' + else: + judge_kwargs['model'] = kwargs['model'] + judge_kwargs['nproc'] = kwargs.get('nproc', 4) eval_results = dataset.evaluate(data_file, **judge_kwargs) if eval_results is not None: assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame) @@ -357,9 +349,43 @@ def EVAL(dataset_name, data_file): if isinstance(eval_results, dict): logger.info('\n' + json.dumps(eval_results, indent=4)) elif isinstance(eval_results, pd.DataFrame): - if len(eval_results) < len(eval_results.columns): - eval_results = eval_results.T - logger.info('\n' + tabulate(eval_results)) + logger.info('\n') + logger.info(tabulate(eval_results.T) if len(eval_results) < len(eval_results.columns) else eval_results) + return eval_results + + +def parse_args_eval(): + parser = argparse.ArgumentParser() + # Essential Args, Setting the Names of Datasets and Models + parser.add_argument('cmd', type=str) + parser.add_argument('data_file', type=str) + parser.add_argument('--judge', type=str, default=None) + parser.add_argument('--nproc', type=int, default=4) + parser.add_argument('--retry', type=int, default=None) + args = parser.parse_args() + return args + + +def MERGE_PKL(pkl_dir, world_size=1): + prefs = [] + for ws in list(range(1, 9)): + prefs.extend([f'{i}{ws}_' for i in range(ws)]) + prefs = set(prefs) + files = os.listdir(pkl_dir) + files = [x for x in files if x[:3] in prefs] + # Merge the files + res_all = defaultdict(dict) + for f in files: + full_path = osp.join(pkl_dir, f) + key = f[3:] + res_all[key].update(load(full_path)) + os.remove(full_path) + + dump_prefs = [f'{i}{world_size}_' for i in range(world_size)] + for k in res_all: + for pf in dump_prefs: + dump(res_all[k], f'{pkl_dir}/{pf}{k}') + print(f'Merged {len(res_all[k])} records into {pkl_dir}/{dump_prefs[0]}{k}') def cli(): @@ -368,53 +394,74 @@ def cli(): if not args: # no arguments passed logger.info(CLI_HELP_MSG) return - if args[0].lower() in MODES: - if args[0].lower() == 'dlist': - assert len(args) >= 2 - lst = DLIST(args[1]) - print(' '.join(lst)) - elif args[0].lower() == 'mlist': - assert len(args) >= 2 - size = 'all' - if len(args) > 2: - size = args[2].lower() - lst = MLIST(args[1], size) - print(' '.join(lst)) - elif args[0].lower() == 'missing': - assert len(args) >= 2 - missing_list = MISSING(args[1]) - logger = get_logger('Find Missing') - logger.info(colored(f'Level {args[1]} Missing Results: ', 'red')) - lines = [] - for m, D in missing_list: - line = f'Model {m}, Dataset {D}' - logger.info(colored(line, 'red')) - lines.append(line) - mwlines(lines, f'{args[1]}_missing.txt') - elif args[0].lower() == 
'circular': - assert len(args) >= 2 - CIRCULAR(args[1]) - elif args[0].lower() == 'localize': - assert len(args) >= 2 - LOCALIZE(args[1]) - elif args[0].lower() == 'check': - assert len(args) >= 2 - model_list = args[1:] - for m in model_list: - CHECK(m) - elif args[0].lower() == 'run': - assert len(args) >= 2 - lvl = args[1] - if len(args) == 2: - model = 'all' + + if args[0].lower() == 'dlist': + assert len(args) >= 2 + lst = DLIST(args[1]) + print(' '.join(lst)) + elif args[0].lower() == 'mlist': + assert len(args) >= 2 + size = 'all' + if len(args) > 2: + size = args[2].lower() + lst = MLIST(args[1], size) + print('\n'.join(lst)) + elif args[0].lower() == 'missing': + assert len(args) >= 2 + missing_list = MISSING(args[1]) + logger = get_logger('Find Missing') + logger.info(colored(f'Level {args[1]} Missing Results: ', 'red')) + lines = [] + for m, D in missing_list: + line = f'Model {m}, Dataset {D}' + logger.info(colored(line, 'red')) + lines.append(line) + mwlines(lines, f'{args[1]}_missing.txt') + elif args[0].lower() == 'circular': + assert len(args) >= 2 + CIRCULAR(args[1]) + elif args[0].lower() == 'localize': + assert len(args) >= 2 + LOCALIZE(args[1]) + elif args[0].lower() == 'check': + assert len(args) >= 2 + model_list = args[1:] + for m in model_list: + CHECK(m) + elif args[0].lower() == 'run': + assert len(args) >= 2 + lvl = args[1] + if len(args) == 2: + model = 'all' + RUN(lvl, model) + else: + for model in args[2:]: RUN(lvl, model) - else: - for model in args[2:]: - RUN(lvl, model) - elif args[0].lower() == 'eval': - assert len(args) == 3 - dataset, data_file = args[1], args[2] - EVAL(dataset, data_file) + elif args[0].lower() == 'eval': + args = parse_args_eval() + data_file = args.data_file + + def extract_dataset(file_name): + fname = osp.splitext(file_name)[0].split('/')[-1] + parts = fname.split('_') + for i in range(len(parts)): + if '_'.join(parts[i:]) in SUPPORTED_DATASETS: + return '_'.join(parts[i:]) + return None + + dataset = extract_dataset(data_file) + assert dataset is not None, f'Cannot infer dataset name from {data_file}' + kwargs = {'nproc': args.api_nproc} + if args.judge is not None: + kwargs['model'] = args.judge + if args.retry is not None: + kwargs['retry'] = args.retry + EVAL(dataset_name=dataset, data_file=data_file, **kwargs) + elif args[0].lower() == 'merge_pkl': + assert len(args) == 3 + args[2] = int(args[2]) + assert args[2] in [1, 2, 4, 8] + MERGE_PKL(args[1], args[2]) else: logger.error('WARNING: command error!') logger.info(CLI_HELP_MSG) diff --git a/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py b/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py index f8662a8..27e31eb 100644 --- a/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py +++ b/eval_mm/vlmevalkit/vlmeval/utils/mp_util.py @@ -6,186 +6,67 @@ from rich.progress import (BarColumn, MofNCompleteColumn, Progress, Task, TaskProgressColumn, TextColumn, TimeRemainingColumn) from rich.text import Text import os.path as osp +import time import portalocker from ..smp import load, dump -class _Worker: - """Function wrapper for ``track_progress_rich``""" +def track_progress_rich( + func: Callable, + tasks: Iterable = tuple(), + nproc: int = 1, + save=None, + keys=None, + **kwargs) -> list: - def __init__(self, func) -> None: - self.func = func - - def __call__(self, inputs): - inputs, idx = inputs - if not isinstance(inputs, (tuple, list, dict)): - inputs = (inputs, ) - - if isinstance(inputs, dict): - return self.func(**inputs), idx - else: - return self.func(*inputs), idx - - -class 
_SkipFirstTimeRemainingColumn(TimeRemainingColumn): - """Skip calculating remaining time for the first few times. - - Args: - skip_times (int): The number of times to skip. Defaults to 0. - """ - - def __init__(self, *args, skip_times=0, **kwargs): - super().__init__(*args, **kwargs) - self.skip_times = skip_times - - def render(self, task: Task) -> Text: - """Show time remaining.""" - if task.completed <= self.skip_times: - return Text('-:--:--', style='progress.remaining') - return super().render(task) - - -def _tasks_with_index(tasks): - """Add index to tasks.""" - for idx, task in enumerate(tasks): - yield task, idx - - -def track_progress_rich(func: Callable, - tasks: Iterable = tuple(), - task_num: int = None, - nproc: int = 1, - chunksize: int = 1, - description: str = 'Processing', - save=None, keys=None, - color: str = 'blue') -> list: - """Track the progress of parallel task execution with a progress bar. The - built-in :mod:`multiprocessing` module is used for process pools and tasks - are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. - - Args: - func (callable): The function to be applied to each task. - tasks (Iterable or Sized): A tuple of tasks. There are several cases - for different format tasks: - - When ``func`` accepts no arguments: tasks should be an empty - tuple, and ``task_num`` must be specified. - - When ``func`` accepts only one argument: tasks should be a tuple - containing the argument. - - When ``func`` accepts multiple arguments: tasks should be a - tuple, with each element representing a set of arguments. - If an element is a ``dict``, it will be parsed as a set of - keyword-only arguments. - Defaults to an empty tuple. - task_num (int, optional): If ``tasks`` is an iterator which does not - have length, the number of tasks can be provided by ``task_num``. - Defaults to None. - nproc (int): Process (worker) number, if nuproc is 1, - use single process. Defaults to 1. - chunksize (int): Refer to :class:`multiprocessing.Pool` for details. - Defaults to 1. - description (str): The description of progress bar. - Defaults to "Process". - color (str): The color of progress bar. Defaults to "blue". - - Examples: - >>> import time - - >>> def func(x): - ... time.sleep(1) - ... return x**2 - >>> track_progress_rich(func, range(10), nproc=2) - - Returns: - list: The task results. 
- """ + from concurrent.futures import ThreadPoolExecutor + from tqdm import tqdm if save is not None: assert osp.exists(osp.dirname(save)) or osp.dirname(save) == '' if not osp.exists(save): dump({}, save) if keys is not None: assert len(keys) == len(tasks) - if not callable(func): raise TypeError('func must be a callable object') if not isinstance(tasks, Iterable): raise TypeError( f'tasks must be an iterable object, but got {type(tasks)}') - if isinstance(tasks, Sized): - if len(tasks) == 0: - if task_num is None: - raise ValueError('If tasks is an empty iterable, ' - 'task_num must be set') + assert nproc > 0, 'nproc must be a positive number' + res = load(save) if save is not None else {} + results = [None for _ in range(len(tasks))] + + with ThreadPoolExecutor(max_workers=nproc) as executor: + futures = [] + + for inputs in tasks: + if not isinstance(inputs, (tuple, list, dict)): + inputs = (inputs, ) + if isinstance(inputs, dict): + future = executor.submit(func, **inputs) else: - tasks = tuple(tuple() for _ in range(task_num)) - else: - if task_num is not None and task_num != len(tasks): - raise ValueError('task_num does not match the length of tasks') - task_num = len(tasks) + future = executor.submit(func, *inputs) + futures.append(future) - if nproc <= 0: - raise ValueError('nproc must be a positive number') - - skip_times = nproc * chunksize if nproc > 1 else 0 - prog_bar = Progress( - TextColumn('{task.description}'), - BarColumn(), - _SkipFirstTimeRemainingColumn(skip_times=skip_times), - MofNCompleteColumn(), - TaskProgressColumn(show_speed=True), - ) - - worker = _Worker(func) - task_id = prog_bar.add_task( - total=task_num, color=color, description=description) - tasks = _tasks_with_index(tasks) - - # Use single process when nproc is 1, else use multiprocess. 
- with prog_bar: - if nproc == 1: - results = [] - for task in tasks: - result, idx = worker(task) - results.append(result) + unfinished = set(range(len(tasks))) + pbar = tqdm(total=len(unfinished)) + while len(unfinished): + new_finished = set() + for idx in unfinished: + if futures[idx].done(): + results[idx] = futures[idx].result() + new_finished.add(idx) + if keys is not None: + res[keys[idx]] = results[idx] + if len(new_finished): if save is not None: - with portalocker.Lock(save, timeout=5) as fh: - ans = load(save) - ans[keys[idx]] = result + dump(res, save) + pbar.update(len(new_finished)) + for k in new_finished: + unfinished.remove(k) + time.sleep(0.1) + pbar.close() - if os.environ.get('VERBOSE', True): - print(keys[idx], result, flush=True) - - dump(ans, save) - fh.flush() - os.fsync(fh.fileno()) - - prog_bar.update(task_id, advance=1, refresh=True) - else: - with Pool(nproc) as pool: - results = [] - unordered_results = [] - gen = pool.imap_unordered(worker, tasks, chunksize) - try: - for result in gen: - result, idx = result - unordered_results.append((result, idx)) - - if save is not None: - with portalocker.Lock(save, timeout=5) as fh: - ans = load(save) - ans[keys[idx]] = result - - if os.environ.get('VERBOSE', False): - print(keys[idx], result, flush=True) - - dump(ans, save) - fh.flush() - os.fsync(fh.fileno()) - - results.append(None) - prog_bar.update(task_id, advance=1, refresh=True) - except Exception as e: - prog_bar.stop() - raise e - for result, idx in unordered_results: - results[idx] = result + if save is not None: + dump(res, save) return results diff --git a/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py b/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py index f1e83f9..bb1f609 100644 --- a/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py +++ b/eval_mm/vlmevalkit/vlmeval/vlm/__init__.py @@ -3,4 +3,4 @@ import torch torch.set_grad_enabled(False) torch.manual_seed(1234) from .base import BaseModel -from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6 +from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6, MiniCPM_o_2_6 diff --git a/eval_mm/vlmevalkit/vlmeval/vlm/base.py b/eval_mm/vlmevalkit/vlmeval/vlm/base.py index 6212a40..655296a 100644 --- a/eval_mm/vlmevalkit/vlmeval/vlm/base.py +++ b/eval_mm/vlmevalkit/vlmeval/vlm/base.py @@ -1,5 +1,5 @@ from ..smp import * -from ..dataset import img_root_map +from ..dataset import img_root_map, DATASET_TYPE from abc import abstractmethod @@ -125,7 +125,8 @@ class BaseModel: while len(messages): try: return self.chat_inner(messages, dataset=dataset) - except: + except Exception as e: + logging.info(f'{type(e)}: {e}') messages = messages[1:] while len(messages) and messages[0]['role'] != 'user': messages = messages[1:] @@ -162,6 +163,36 @@ class BaseModel: video = [x['value'] for x in message if x['type'] == 'video'][0] return prompt, video else: - import sys - warnings.warn('Model does not support video input.') - sys.exit(-1) + logging.critical('Model does not support video input.') + raise NotImplementedError + + def message_to_promptvideo_withrole(self, message, dataset=None): + if self.VIDEO_LLM: + system, user, assistant, video_list = '', '', '', [] + for msg in message: + if msg['type'] == 'text': + if 'role' in msg and msg['role'] == 'system': + system += msg['value'] + elif 'role' in msg and msg['role'] == 'assistant': + assistant += msg['value'] + else: + user += msg['value'] + elif msg['type'] == 'video': + video_list.append(msg['value']) + question = { + 'system': system, + 'user': user, + 'assistant': 
assistant + } + if assistant == '': + if listinstr(['MCQ'], DATASET_TYPE(dataset)): + question['assistant'] = 'Best Option: (' + else: + del question['assistant'] + if len(video_list) > 1: + print('VLMEvalKit only support single video as input, take first video as input') + video = video_list[0] + return question, video + else: + logging.critical('Model does not support video input.') + raise NotImplementedError diff --git a/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py b/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py index 0729260..648485a 100644 --- a/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py +++ b/eval_mm/vlmevalkit/vlmeval/vlm/minicpm_v.py @@ -7,7 +7,9 @@ from transformers import AutoModel, AutoTokenizer from .base import BaseModel from ..smp import * -from ..dataset import DATASET_TYPE +from ..dataset import DATASET_TYPE, DATASET_MODALITY + +import re class MiniCPM_V(BaseModel): @@ -25,12 +27,13 @@ class MiniCPM_V(BaseModel): self.kwargs = kwargs self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) torch.cuda.empty_cache() - self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3 + self.num_beams = 3 def use_custom_prompt(self, dataset): assert dataset is not None - if listinstr(['MMMU'], dataset): - return True + if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset): + # For Multi-Turn we don't have custom prompt + return False return False def build_prompt(self, line, dataset=None): @@ -103,7 +106,7 @@ class MiniCPM_Llama3_V(BaseModel): self.kwargs = kwargs self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) torch.cuda.empty_cache() - self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3 + self.num_beams = 3 self.options_system_prompt = ('Carefully read the following question and select the letter corresponding ' 'to the correct answer. Highlight the applicable choices without giving ' 'explanations.') @@ -258,7 +261,7 @@ class MiniCPM_V_2_6(BaseModel): INSTALL_REQ = False INTERLEAVE = True - def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs): + def __init__(self, model_path='openbmb/MiniCPM-V-2_6', **kwargs): random.seed(0) np.random.seed(0) torch.manual_seed(0) @@ -274,7 +277,7 @@ class MiniCPM_V_2_6(BaseModel): self.kwargs = kwargs self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) torch.cuda.empty_cache() - self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3 + self.num_beams = 3 self.options_suffix_prompt = '''\nAnswer with the option's letter from the given choices directly.''' self.wo_options_system_prompt = 'Carefully read the following question Answer the question directly.' 
@@ -291,7 +294,7 @@ class MiniCPM_V_2_6(BaseModel): def use_custom_prompt(self, dataset=None): if dataset is None: return False - if listinstr(['MCQ', 'VQA', 'Y/N'], DATASET_TYPE(dataset)): + if DATASET_TYPE(dataset) in ['MCQ', 'VQA', 'Y/N']: return True return False @@ -414,6 +417,15 @@ class MiniCPM_V_2_6(BaseModel): return msgs def generate_inner(self, message, dataset=None): + if DATASET_MODALITY(dataset) == 'VIDEO': + max_slice_nums = 1 + use_image_id = False + max_inp_length = 2048 * 10 + else: + max_slice_nums = None + use_image_id = True + max_inp_length = 8192 + max_new_tokens = 2048 default_kwargs = dict( max_new_tokens=max_new_tokens, @@ -449,7 +461,9 @@ class MiniCPM_V_2_6(BaseModel): msgs=msgs, context=None, tokenizer=self.tokenizer, - max_inp_length=8192, + max_inp_length=max_inp_length, + use_image_id=use_image_id, + max_slice_nums=max_slice_nums, **default_kwargs ) @@ -457,3 +471,257 @@ class MiniCPM_V_2_6(BaseModel): res = res[0] return res + + +class MiniCPM_o_2_6(BaseModel): + INSTALL_REQ = False + INTERLEAVE = True + + def __init__(self, model_path='openbmb/MiniCPM-o-2_6', **kwargs): + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + assert model_path is not None + self.model_path = model_path + print(f'load from path {self.model_path}') + self.model = AutoModel.from_pretrained( + self.model_path, + trust_remote_code=True, + attn_implementation='sdpa', + torch_dtype=torch.bfloat16, + init_vision=True, + init_audio=False, + init_tts=False + ) + + self.model.eval().cuda() + + self.kwargs = kwargs + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + + num_beams = int(os.getenv("NUM_BEAMS", "3")) + self.num_beams = 3 if self.model_path == 'openbmb/MiniCPM-o-2_6' else num_beams + + repetition_penalty = float(os.getenv("PENALTY", "1.2")) + self.repetition_penalty = repetition_penalty + + self.options_suffix_prompt = '''\nAnswer with the option's letter from the given choices directly.''' + self.wo_options_system_prompt = 'Carefully read the following question Answer the question directly.' + self.detail_system_prompt = 'Answer this question in detail.' + self.vqa_prompt = 'Answer the question using a single word or phrase.' 
+ + self.multi_choice_cot_prompt = ('''Carefully read the following multichoice question, solve it step ''' + '''by step and finally pick the option associated with the correct ''' + '''answer in the format of "Answer: selected option\n\n''') + self.short_ans_cot_prompt = ('''Read the following question carefully, solve it step by step, and ''' + '''then output the final answer in the format of "Answer: single number ''' + '''or single word or phrase".\n\n''') + + def use_custom_prompt(self, dataset=None): + if dataset is None: + return False + if listinstr(['MCQ', 'VQA', 'Y/N'], DATASET_TYPE(dataset)): + return True + return False + + def use_cot(self, dataset=None): + if dataset is None: + return False + if listinstr(['MMMU', 'MathVista', 'OCRBench', 'ChartQA', 'MathVision', 'MathVerse_MINI_Vision_Only'], dataset): + return True + elif listinstr(['MMVet', 'MMBench', 'MMStar', 'HallusionBench', 'AI2D', 'RealWorldQA', + 'POPE', 'ScienceQA', 'TextVQA', 'DocVQA'], dataset): + return False + else: + return False + + def use_upsize(self, dataset=None): + if dataset is None: + return False + if listinstr(['MathVista', 'MMBench_TEST_CN', 'MMStar', 'AI2D', 'OCRBench', 'DynaMath'], dataset): + return True + else: + return False + + def build_prompt(self, line, dataset=None): + if isinstance(line, int): + line = self.data.iloc[line] + + tgt_path = self.dump_image(line, dataset) + system_prompt, prompt = '', '' + + question = line['question'] + + if not self.use_cot(dataset): + if DATASET_TYPE(dataset) == 'MCQ': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'Question: {question}\n' + if len(options): + prompt += options_prompt + prompt += self.options_suffix_prompt + else: + system_prompt = self.wo_options_system_prompt + + if 'MMMU' in dataset: + if len(system_prompt) > 0: + prompt = system_prompt + '\n' + prompt + system_prompt = '' + elif dataset is not None and listinstr(['HallusionBench'], dataset): + question += ' Yes or No?' + prompt = question + elif dataset is not None and listinstr(['OCRBench'], dataset): + system_prompt = self.vqa_prompt + prompt = question + elif DATASET_TYPE(dataset) == 'VQA': + if listinstr(['LLaVABench'], dataset): + system_prompt = '' + elif listinstr(['MMVet'], dataset): + system_prompt = self.detail_system_prompt + else: + system_prompt = self.vqa_prompt + prompt = question + else: + prompt = question + else: + has_options = True + if DATASET_TYPE(dataset) == 'MCQ': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. 
{item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'{question}\n' + + if len(options): + prompt += options_prompt + else: + has_options = False + + if 'MMMU' in dataset: + if len(system_prompt) > 0: + prompt = system_prompt + '\n' + prompt + system_prompt = '' + else: + prompt = question + + if DATASET_TYPE(dataset) in ['MCQ', 'Y/N', 'VQA']: + if DATASET_TYPE(dataset) == 'MCQ': + if has_options: + prompt = self.multi_choice_cot_prompt + prompt + else: + prompt = self.short_ans_cot_prompt + prompt + elif DATASET_TYPE(dataset) == 'Y/N': + prompt = self.short_ans_cot_prompt + prompt + else: + prompt = self.short_ans_cot_prompt + prompt + + msgs = [] + if system_prompt: + msgs.append(dict(type='text', value=system_prompt)) + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + + return msgs + + def extract_answer(self, res, dataset=None): + if dataset is None: + return res + if self.use_cot(dataset): + if DATASET_TYPE(dataset) == 'MCQ': + pattern = r'Answer:\s*([A-Ia-i])(?![A-Za-z])' + matches = re.findall(pattern, res, re.DOTALL) + if matches: + extracted_res = matches[-1].strip() + else: + extracted_res = res + return extracted_res + elif DATASET_TYPE(dataset) == 'VQA' and not listinstr(['OCRBench'], dataset): + pattern = r'Answer:\s*(.*)\s*$' + match = re.search(pattern, res, re.DOTALL) + if match: + extracted_res = match.group(1) + else: + extracted_res = res + return extracted_res + return res + + def generate_inner(self, message, dataset=None): + if DATASET_MODALITY(dataset) == 'VIDEO': + max_slice_nums = 1 + use_image_id = False + max_inp_length = 2048 * 10 + else: + max_slice_nums = None + use_image_id = True + max_inp_length = 8192 + + max_new_tokens = 2048 + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + repetition_penalty=self.repetition_penalty, + num_beams=self.num_beams, + ) + default_kwargs.update(self.kwargs) + + content = [] + + for x in message: + if x['type'] == 'text': + content.append(x['value']) + elif x['type'] == 'image': + image = Image.open(x['value']).convert('RGB') + if not self.use_upsize(dataset): + content.append(image) + else: + img_width, img_height = image.width, image.height + if (img_width * img_height) >= (1344 * 1344): + content.append(image) + else: + ratio = math.sqrt((1344 * 1344) / (img_width * img_height)) + max_img_width = int(img_width * ratio) + new_img_width = random.randint(img_width, max_img_width) + new_img_height = int(new_img_width / img_width * img_height) + resized_image = image.resize((new_img_width, new_img_height)) + content.append(resized_image) + msgs = [{'role': 'user', 'content': content}] + + res = self.model.chat( + image=None, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + max_inp_length=max_inp_length, + use_image_id=use_image_id, + max_slice_nums=max_slice_nums, + **default_kwargs + ) + + if isinstance(res, tuple) and len(res) > 0: + res = res[0] + + res = self.extract_answer(res, dataset) + + return res diff --git a/eval_mm/vqaeval/eval.py b/eval_mm/vqaeval/eval.py index f5faf9f..12e8250 100644 --- a/eval_mm/vqaeval/eval.py +++ b/eval_mm/vqaeval/eval.py @@ -22,7 +22,7 @@ from eval_utils.vqa_evaluate import * def get_model(args): if args.model_name == '': raise Exception('Model name cannot be empty str!') - from models.MiniCPM.minicpmv import 
MiniCPM_V, MiniCPM_V_2_6 + from models.MiniCPM.minicpmv import MiniCPM_V, MiniCPM_V_2_6, MiniCPM_o_2_6 model_path = args.model_path ckpt = args.ckpt @@ -30,6 +30,8 @@ def get_model(args): model = MiniCPM_V(model_path=model_path, ckpt=ckpt, device=args.device) elif args.model_name == 'minicpmv26': model = MiniCPM_V_2_6(model_path=model_path, ckpt=ckpt, device=args.device) + elif args.model_name == 'minicpmo26': + model = MiniCPM_o_2_6(model_path=model_path, ckpt=ckpt, device=args.device) else: raise Exception(f"Unexpected Moedel Name {args.model_name}!") @@ -67,15 +69,16 @@ def main(args): dataset = docVQADataset(args.docVQA_image_dir, args.docVQA_ann_path) if max_sample_num is not None: dataset = torch.utils.data.Subset(dataset, range(max_sample_num)) - acc = evaluate_VQA(model, dataset, args.model_name, 'docVQA', time, batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path) + acc = evaluate_VQA(model, dataset, args.model_name, 'docVQA', time, \ + batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path) result['docVQA'] = acc if args.eval_docVQATest or args.eval_all: - target_dataset = "docVQATest" dataset = docVQATESTDataset(args.docVQATest_image_dir, args.docVQATest_ann_path) if max_sample_num is not None: dataset = torch.utils.data.Subset(dataset, range(max_sample_num)) - acc = evaluate_VQA(model, dataset, args.model_name, target_dataset, time, batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path) + acc = evaluate_VQA(model, dataset, args.model_name, 'docVQATest', time, \ + batch_size=args.batchsize, generate_method=args.generate_method, answer_path=args.answer_path) result['docVQATest'] = acc if torch.distributed.is_initialized(): diff --git a/eval_mm/vqaeval/eval_utils/vqa_evaluate.py b/eval_mm/vqaeval/eval_utils/vqa_evaluate.py index 7888894..a3994fe 100644 --- a/eval_mm/vqaeval/eval_utils/vqa_evaluate.py +++ b/eval_mm/vqaeval/eval_utils/vqa_evaluate.py @@ -370,8 +370,6 @@ def evaluate_VQA( generate_method="interleave", answer_path='./answers', ): - print(f"answer path:{answer_path}") - sampler = None if torch.distributed.is_initialized(): sampler=InferenceSampler(len(dataset)) @@ -383,8 +381,6 @@ def evaluate_VQA( collate_fn=collate_fn_vqa ) - now_rank = torch.distributed.get_rank() - answer_dir = os.path.join(answer_path, model_name, time) os.makedirs(answer_dir, exist_ok=True) @@ -395,21 +391,15 @@ def evaluate_VQA( predictions = [] for batch in tqdm(dataloader, desc="Running inference"): - image_paths, questions, gt_answers, ocr_tokens_list, question_ids, question_type = batch + image_paths, questions, gt_answers, ocr_tokens_list, question_ids, question_type = batch with torch.no_grad(): - if model_name != "minicpm": - if model_name != "codellama": - outputs = model.generate(images=image_paths, questions=questions, datasetname=dataset_name) - else: - outputs = model.generate() - elif model_name == "minicpm": - if generate_method == "old": - outputs = model.generate(images=image_paths, questions=questions, datasetname=dataset_name) - elif generate_method == "interleave": - outputs = model.generate_with_interleaved(images=image_paths, questions=questions, datasetname=dataset_name) - else: - raise Exception(f"Wrong generate paradigm {generate_method}!") + if generate_method == "old": + outputs = model.generate(images=image_paths, questions=questions, datasetname=dataset_name) + elif generate_method == "interleave": + outputs = 
model.generate_with_interleaved(images=image_paths, questions=questions, datasetname=dataset_name) + else: + raise Exception(f"Wrong generate paradigm {generate_method}!") for i in range(len(outputs)): answer_dict = { diff --git a/eval_mm/vqaeval/models/MiniCPM/minicpmv.py b/eval_mm/vqaeval/models/MiniCPM/minicpmv.py index ea366ed..3e2dcfd 100644 --- a/eval_mm/vqaeval/models/MiniCPM/minicpmv.py +++ b/eval_mm/vqaeval/models/MiniCPM/minicpmv.py @@ -33,14 +33,9 @@ class MiniCPM_V: def generate(self, images, questions, datasetname): image = Image.open(images[0]).convert('RGB') - try: - max_new_tokens = max_token[datasetname] - except: - max_new_tokens = 1024 - if (datasetname == 'docVQA') or (datasetname == "docVQATest") : - prompt = "Answer the question directly with single word." + "\n" + questions[0] - elif (datasetname == 'textVQA') : - prompt = "Answer the question directly with single word." + '\n'+ questions[0] + max_new_tokens = max_token[datasetname] + + prompt = "Answer the question directly with single word." + '\n' + questions[0] msgs = [{'role': 'user', 'content': prompt}] default_kwargs = dict( @@ -59,10 +54,7 @@ class MiniCPM_V: return [res] def generate_with_interleaved(self, images, questions, datasetname): - try: - max_new_tokens = max_token[datasetname] - except: - max_new_tokens = 1024 + max_new_tokens = max_token[datasetname] prompt = "Answer the question directly with single word." @@ -103,11 +95,10 @@ class MiniCPM_V: class MiniCPM_V_2_6: def __init__(self, model_path, ckpt, device=None)->None: - seed = 0 - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) self.model_path = model_path self.ckpt = ckpt @@ -125,14 +116,17 @@ class MiniCPM_V_2_6: def generate(self, images, questions, datasetname): image = Image.open(images[0]).convert('RGB') - try: - max_new_tokens = max_token[datasetname] - except: - max_new_tokens = 1024 - if (datasetname == 'docVQA') or (datasetname == "docVQATest") : - prompt = "Answer the question directly with single word." + "\n" + questions[0] - elif (datasetname == 'textVQA') : - prompt = "Answer the question directly with single word." + '\n'+ questions[0] + img_width, img_height = image.width, image.height + if (img_width * img_height) < (1344 * 1344): + ratio = math.sqrt((1344 * 1344) / (img_width * img_height)) + max_img_width = int(img_width * ratio) + new_img_width = random.randint(img_width, max_img_width) + new_img_height = int(new_img_width / img_width * img_height) + image = image.resize((new_img_width, new_img_height)) + + max_new_tokens = max_token[datasetname] + + prompt = "Answer the question directly with single word." + '\n' + questions[0] msgs = [{'role': 'user', 'content': prompt}] default_kwargs = dict( @@ -151,10 +145,7 @@ class MiniCPM_V_2_6: return [res] def generate_with_interleaved(self, images, questions, datasetname): - try: - max_new_tokens = max_token[datasetname] - except: - max_new_tokens = 1024 + max_new_tokens = max_token[datasetname] prompt = "Answer the question directly with single word." 
@@ -197,5 +188,117 @@ class MiniCPM_V_2_6: if isinstance(res, tuple) and len(res) > 0: res = res[0] - print(f"Q: {content}, \nA: {res}") + return [res] + + +class MiniCPM_o_2_6: + + def __init__(self, model_path, ckpt, device=None)->None: + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + self.model_path = model_path + self.ckpt = ckpt + self.model = AutoModel.from_pretrained( + self.model_path, + trust_remote_code=True, + attn_implementation='sdpa', + torch_dtype=torch.bfloat16, + init_vision=True, + init_audio=False, + init_tts=False + ) + if self.ckpt is not None: + self.ckpt = ckpt + self.state_dict = torch.load(self.ckpt, map_location=torch.device('cpu')) + self.model.load_state_dict(self.state_dict) + + self.model = self.model.eval().to(device) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + + def generate(self, images, questions, datasetname): + image = Image.open(images[0]).convert('RGB') + img_width, img_height = image.width, image.height + if (img_width * img_height) < (1344 * 1344): + ratio = math.sqrt((1344 * 1344) / (img_width * img_height)) + max_img_width = int(img_width * ratio) + new_img_width = random.randint(img_width, max_img_width) + new_img_height = int(new_img_width / img_width * img_height) + image = image.resize((new_img_width, new_img_height)) + + max_new_tokens = max_token[datasetname] + + prompt = "Answer the question directly with single word." + '\n' + questions[0] + + msgs = [{'role': 'user', 'content': prompt}] + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=3, + max_inp_length=8192, + use_image_id=True, + max_slice_nums=None + ) + res = self.model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + **default_kwargs + ) + + return [res] + + def generate_with_interleaved(self, images, questions, datasetname): + max_new_tokens = max_token[datasetname] + + prompt = "Answer the question directly with single word." 
+ + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=3, + max_inp_length=8192, + use_image_id=True, + max_slice_nums=None + ) + + content = [] + message = [ + {'type': 'text', 'value': prompt}, + {'type': 'image', 'value': images[0]}, + {'type': 'text', 'value': questions[0]} + ] + for x in message: + if x['type'] == 'text': + content.append(x['value']) + elif x['type'] == 'image': + image = Image.open(x['value']).convert('RGB') + img_width, img_height = image.width, image.height + if (img_width * img_height) >= (1344 * 1344): + content.append(image) + else: + ratio = math.sqrt((1344 * 1344) / (img_width * img_height)) + max_img_width = int(img_width * ratio) + new_img_width = random.randint(img_width, max_img_width) + new_img_height = int(new_img_width / img_width * img_height) + resized_image = image.resize((new_img_width, new_img_height)) + content.append(resized_image) + msgs = [{'role': 'user', 'content': content}] + + res = self.model.chat( + image=None, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + **default_kwargs + ) + + if isinstance(res, tuple) and len(res) > 0: + res = res[0] + print(f"Q: {content}, \nA: {res}") + return [res] \ No newline at end of file diff --git a/eval_mm/vqaeval/requirements.txt b/eval_mm/vqaeval/requirements.txt index 5eb9425..3c2227a 100644 --- a/eval_mm/vqaeval/requirements.txt +++ b/eval_mm/vqaeval/requirements.txt @@ -26,7 +26,7 @@ pyyaml==6.0 regex==2022.10.31 tokenizers==0.13.2 tqdm==4.64.1 -transformers +transformers==4.44.2 timm==0.6.13 spacy==3.5.1 webdataset==0.2.48 diff --git a/eval_mm/vqaeval/shell/run_inference.sh b/eval_mm/vqaeval/shell/run_inference.sh index da462bf..6582a5c 100644 --- a/eval_mm/vqaeval/shell/run_inference.sh +++ b/eval_mm/vqaeval/shell/run_inference.sh @@ -12,4 +12,4 @@ python -m torch.distributed.launch \ --eval_textVQA \ --eval_docVQA \ --answer_path ./answers \ - --batchsize 1 \ No newline at end of file + --batchsize 1 diff --git a/eval_mm/vqaeval/shell/run_transform.sh b/eval_mm/vqaeval/shell/run_transform.sh index a19a83f..3f51565 100644 --- a/eval_mm/vqaeval/shell/run_transform.sh +++ b/eval_mm/vqaeval/shell/run_transform.sh @@ -1,3 +1,3 @@ python ./transform_docvqatest_for_submission.py \ --input_file_path \ - --output_file_path \ No newline at end of file + --output_file_path
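
Note on the image handling in the patch above: the same low-resolution upsizing heuristic appears in `MiniCPM_o_2_6.generate`, `MiniCPM_o_2_6.generate_with_interleaved`, and the vlmevalkit wrapper. For reference, here is a minimal standalone sketch of that heuristic; the helper name `upsize_image` and the `target_edge` parameter are illustrative and not part of the patch:

```python
import math
import random

from PIL import Image


def upsize_image(image: Image.Image, target_edge: int = 1344) -> Image.Image:
    """Randomly upscale images smaller than target_edge x target_edge, keeping the aspect ratio."""
    img_width, img_height = image.width, image.height
    if img_width * img_height >= target_edge * target_edge:
        return image  # already large enough: pass through unchanged
    # Scale factor that would bring the pixel area up to target_edge ** 2
    ratio = math.sqrt((target_edge * target_edge) / (img_width * img_height))
    max_img_width = int(img_width * ratio)
    # Pick a random new width between the original and the maximum, then preserve the aspect ratio
    new_img_width = random.randint(img_width, max_img_width)
    new_img_height = int(new_img_width / img_width * img_height)
    return image.resize((new_img_width, new_img_height))
```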