# MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/cgbench.py
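"""CG-Bench dataset implementations for VLMEvalKit.

Defines MCQ + grounding and open-ended variants of CG-Bench, in both mini and
full splits, covering the long_acc, clue_acc, and miou task modes.
"""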
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from .utils.cgbench import *
from ..utils import track_progress_rich
class CGBench_MCQ_Grounding_Mini(VideoBaseDataset):
dataset = "CG-Bench_MCQ_Grounding_Mini"
TYPE = "Video-MCQ-Grounding"
MD5 = "54ed3e90a51a6fb375c92b319a715f72"
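    # System prompts keyed by task mode; the "miou_wo_frame_time" variant asks for
    # normalized [0, 1] spans because frame timestamps are not supplied.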
    SYS = {
        "long_acc": (
            "You will be provided with sampled frames from a video, along with a "
            "multiple-choice question that includes a question and several answer options.\n"
            "Your task is to analyze the provided frames and infer the most plausible "
            "answer based on the visual information.\n"
            "If the video does not provide enough information, infer the answer based "
            "on the options available and still provide a result. "
            "Therefore, in all cases, an answer must be given.\n"
            "Only output the answer in the following format:\n\n"
            '```json\n{"result": "option"}\n```\n\n'
            'The "option" is the uppercase letter corresponding to your answer.\n\n'
        ),
        "clue_acc": (
            "You will be provided with sampled frames from a video, along with a "
            "multiple-choice question that includes a question and several answer options.\n"
            "Your task is to analyze the provided frames and infer the most plausible "
            "answer based on the visual information.\n"
            "If the video does not provide enough information, infer the answer based "
            "on the options available and still provide a result. "
            "Therefore, in all cases, an answer must be given.\n"
            "Only output the answer in the following format:\n\n"
            '```json\n{"result": "option"}\n```\n\n'
            'The "option" is the uppercase letter corresponding to your answer.\n\n'
        ),
"miou": (
"You will be provided with uniformly sampled frames from a video and their "
"timestamps, along with a multiple-choice question that includes a question "
"and several answer options.\n"
"Your task is to determine in which intervals the 'clue intervals' exist "
"that contain visual information needed to answer the question.\n"
"Only output the answer in the following format:\n\n"
'```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n'
"In this output format, each 'start' and 'end' represents the beginning and "
"end of an interval in seconds where relevant clues can be found.\n"
"You must provide at least one interval and at most five intervals. "
"Intervals exceeding five will NOT be considered valid.\n"
),
"miou_wo_frame_time": (
"You will be provided with uniformly sampled frames from a video, along "
"with a multiple-choice question that includes a question and several "
"answer options.\n"
"Your task is to determine in which intervals the 'clue intervals' exist "
"that contain visual information needed to answer the question.\n"
"Only output the answer in the following format:\n\n"
'```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n'
'In this output format, each "start" and "end" represents the start and '
"end of the video where the relevant clue can be found in the form of a "
"floating point number between 0 and 1, where 0 represents the start time "
"of the video and 1 represents the end time of the video.\n"
"You must provide at least one interval and at most five intervals. "
"Intervals exceeding five will NOT be considered valid.\n"
),
}
def __init__(
self,
dataset="CG-Bench_MCQ_Grounding_Mini",
use_subtitle=False,
use_subtitle_time=False,
use_frame_time=False,
nframe=0,
fps=-1,
):
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
self.use_subtitle = use_subtitle
self.use_subtitle_time = use_subtitle_time
self.use_frame_time = use_frame_time
self.dataset_name = dataset
lmu_root = LMUDataRoot()
self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
@classmethod
def supported_datasets(cls):
return ["CG-Bench_MCQ_Grounding_Mini"]
def clue_frame_paths(self, qid, num_frames=8):
frame_root = osp.join(self.clue_frame_root, qid)
os.makedirs(frame_root, exist_ok=True)
return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1):
frame_root = osp.join(self.clue_frame_root, qid)
os.makedirs(frame_root, exist_ok=True)
return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)]
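    # Build a subtitle context string from the SRT file; when frame_indices is given,
    # keep only subtitles that overlap the timestamps of the sampled frames.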
def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
subtitles = []
srt_path = osp.join(self.data_root, subtitle_path)
assert osp.exists(srt_path)
import pysubs2
subs = pysubs2.load(srt_path, encoding="utf-8")
if not frame_indices:
for sub in subs:
sub_text = sub.text.replace("\\N", " ")
if sub_time:
start_time = milliseconds_to_seconds(sub.start)
end_time = milliseconds_to_seconds(sub.end)
sub_text = f"[{start_time}, {end_time}] {sub_text}"
if sub_text.strip() and sub_text not in subtitles:
subtitles.append(sub_text)
else:
for selected_frame_id in frame_indices:
cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
for sub in subs:
if sub.start < cur_time and sub.end > cur_time:
sub_text = sub.text.replace("\\N", " ")
if sub_time:
start_time = milliseconds_to_seconds(sub.start)
end_time = milliseconds_to_seconds(sub.end)
sub_text = f"[{start_time}, {end_time}] {sub_text}"
if sub_text.strip() and sub_text not in subtitles:
subtitles.append(sub_text)
if subtitles:
subtitles_str = '\n'.join(subtitles)
return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n"
else:
return ""
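    # Fetch CG-Bench (HuggingFace or ModelScope), unzip the archives, and build one
    # TSV with a row per question for each of the long_acc/clue_acc/miou task modes.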
def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding_Mini", repo_id="CG-Bench/CG-Bench"):
def check_integrity(pth):
data_file = osp.join(pth, f"{dataset_name}.tsv")
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data["video"]:
if not osp.exists(osp.join(pth, video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
tsv_file = osp.join(pth, f"{dataset_name}.tsv")
task_modes = ["long_acc", "clue_acc", "miou"]
all_data = []
for task_mode in task_modes:
with open(osp.join(pth, "cgbench_mini.json"), "r") as f:
data_file = pd.DataFrame(json.load(f))
data_file = data_file.assign(index=range(len(data_file)))
data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4")
data_file["subtitle_path"] = data_file["video_uid"].apply(
lambda x: (
f"cg_subtitles/{x}.srt"
                            if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt"))
else ""
)
)
data_file["clue_video_path"] = ""
                    if task_mode == "clue_acc":
                        data_file["clue_video_path"] = data_file.apply(
                            lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1
                        )
data_file["task_mode"] = task_mode
if task_mode in ["clue_acc", "long_acc"]:
data_file["answer"] = data_file["right_answer"]
if task_mode == "miou":
data_file["answer"] = data_file["clue_intervals"]
if task_mode in ["long_acc", "miou"]:
data_file["clue_intervals"] = ""
data_file = data_file[
[
"index",
"video_uid",
"video",
"duration",
"domain",
"choices",
"sub_category",
"subtitle_path",
"question",
"answer",
"task_mode",
"clue_intervals",
"qid",
"clue_video_path",
]
]
all_data.append(data_file)
final_data = pd.concat(all_data, ignore_index=True)
final_data["index"] = range(len(final_data))
final_data.to_csv(tsv_file, sep="\t", index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
unzip_hf_zip(dataset_path)
generate_tsv(dataset_path)
tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv")
return dict(data_file=tsv_file, root=dataset_path)
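    # Assemble the model input: the video itself (video LLMs) or sampled frames,
    # optional timestamps and subtitles, then the system prompt, question, and options.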
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
task_mode = line["task_mode"]
message = []
origin_use_subtitle_time = self.use_subtitle_time
try:
if task_mode in ["long_acc", "clue_acc"]:
system_prompt = self.SYS[task_mode]
elif task_mode == "miou":
if self.use_frame_time and not video_llm:
system_prompt = self.SYS[task_mode]
else:
system_prompt = self.SYS["miou_wo_frame_time"]
                    if self.use_subtitle_time:
                        self.use_subtitle_time = False
user_prompt = ""
if task_mode in ["long_acc", "miou"]:
video_path = line["video"]
if video_llm:
message.append(dict(type="video", value=osp.join(self.data_root, video_path)))
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
if self.nframe:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
fps=vid_fps, sub_time=self.use_subtitle_time)
else:
user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
else:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
message.extend(dict(type="image", value=im) for im in image_paths)
if self.use_frame_time:
user_prompt += get_timestampes(frame_indices, vid_fps)
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
user_prompt += self.get_subtitles(
line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
sub_time=self.use_subtitle_time
)
elif task_mode == "clue_acc":
clue_video_path = line["clue_video_path"]
video_path = line["video"]
if video_llm:
message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path)))
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
if self.nframe:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
fps=vid_fps, sub_time=self.use_subtitle_time)
else:
user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
else:
if self.nframe > 32:
self.nframe = 32
                        print("The maximum number of frames is 32 when evaluating clue-based MCQ in CG-Bench!")
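                    # clue_intervals is stored in the TSV as a stringified list.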
clue_intervals = eval(line["clue_intervals"])
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps
)
message.extend(dict(type="image", value=im) for im in image_paths)
if self.use_frame_time:
user_prompt += get_timestampes(frame_indices, vid_fps)
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
user_prompt += self.get_subtitles(
line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
sub_time=self.use_subtitle_time
)
question = line["question"]
user_prompt += f"Question: {question}\n\n"
choices = eval(line["choices"])
labels = [chr(ord("A") + i) for i in range(len(choices))]
user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n"
message.append(dict(type="text", value=system_prompt + user_prompt))
return message
finally:
# Ensure that `use_subtitle_time` is always restored to its original value
self.use_subtitle_time = origin_use_subtitle_time
def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1):
        if not isinstance(uid, str):
uid = str(uid)
vid_path = osp.join(self.data_root, video)
vid = decord.VideoReader(vid_path)
vid_fps = vid.get_avg_fps()
n_frames = len(vid)
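        # With clue_intervals, sample inside the merged clue spans; otherwise sample
        # uniformly (fixed count) or at a fixed fps over the whole video.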
if clue_intervals is not None:
merged_intervals = merge_intervals(clue_intervals)
if num_frames > 0 and fps < 0:
indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
frame_paths = self.clue_frame_paths(uid, len(indices))
elif fps > 0:
frame_indices = []
for start, end in merged_intervals:
start_frame = int(start * vid_fps)
end_frame = int(end * vid_fps)
step = vid_fps / fps
interval_indices = [
int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step))
]
frame_indices.extend(interval_indices)
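                # Fall back to even sampling if fps-based sampling yields fewer than 32 frames.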
if len(frame_indices) < 32:
indices = sample_frames_clue_average(merged_intervals, 32, vid_fps)
else:
indices = frame_indices
frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps)
else:
if num_frames > 0 and fps < 0:
step_size = len(vid) / (num_frames + 1)
indices = [int(i * step_size) for i in range(1, num_frames + 1)]
frame_paths = self.frame_paths(uid)
elif fps > 0:
total_duration = n_frames / vid_fps
required_frames = int(total_duration * fps)
step_size = vid_fps / fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(uid, len(indices))
# Save and validate frames
valid_paths = []
valid_indices = []
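        # Frames are decoded only if some paths are missing on disk; in both paths each
        # image is verified with PIL and corrupt ones are dropped from the returned lists.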
if not np.all([osp.exists(p) for p in frame_paths]):
images = [vid[i].asnumpy() for i in indices]
for i, (img_array, path) in enumerate(zip(images, frame_paths)):
if osp.exists(path):
try:
with Image.open(path) as img:
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
else:
try:
img = Image.fromarray(img_array)
img.save(path)
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
else:
for i, path in enumerate(frame_paths):
try:
with Image.open(path) as img:
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
return valid_paths, valid_indices, vid_fps
def evaluate(self, eval_file, **judge_kwargs):
assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
data = load(eval_file)
        data_un = data[~pd.isna(data["prediction"])].copy()
        data_pred_na = data[pd.isna(data["prediction"])].copy()
data_pred_na["score"] = -1
data_un["score"] = data_un.apply(
lambda row: post_process(
response=row["prediction"],
right_answer=row["answer"],
task_mode=row["task_mode"],
duration=row["duration"],
),
axis=1,
)
data = pd.concat([data_pred_na, data_un])
rejected_count = (data["score"] == -1).sum()
print(
f"Among {len(data)} questions, "
f"failed to obtain prediction for {len(data_pred_na)} questions, "
f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. "
f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating."
)
dump(data, score_file)
rating = get_dimention_rating_mcq_grouding(score_file)
dump(rating, tgt_file)
return rating
# For evaluation step 2, passing [prompt] + image_paths to the judge is sufficient.
class CGBench_OpenEnded_Mini(VideoBaseDataset):
TYPE = "Video-OpenEnded"
dataset = "CG-Bench_OpenEnded_Mini"
MD5 = "9175791b11afdfa305fdb3e525b7a4ee"
SYS = (
"You will be provided with sampled frames from a video, along with a "
"question.\n"
"Your task is to analyze the provided frames and infer the most plausible "
"answer based on the visual information.\n"
"If the visual information is ambiguous or insufficient, use the available "
"context to reason your answer.\n"
"Only output the answer in the following format:\n\n"
'```json\n{"result": "answer"}\n```\n\n'
'The "answer" can be a word, phrase, or sentence that directly responds to '
"the question.\n\n"
)
def __init__(
self,
dataset="CG-Bench_OpenEnded_Mini",
use_subtitle=False,
use_subtitle_time=False,
use_frame_time=False,
nframe=0,
fps=-1,
):
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
self.use_subtitle = use_subtitle
self.use_subtitle_time = use_subtitle_time
self.use_frame_time = use_frame_time
self.dataset_name = dataset
lmu_root = LMUDataRoot()
self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
@classmethod
def supported_datasets(cls):
return ["CG-Bench_OpenEnded_Mini"]
def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
subtitles = []
srt_path = osp.join(self.data_root, subtitle_path)
assert osp.exists(srt_path)
import pysubs2
subs = pysubs2.load(srt_path, encoding="utf-8")
if not frame_indices:
for sub in subs:
sub_text = sub.text.replace("\\N", " ")
if sub_time:
start_time = milliseconds_to_seconds(sub.start)
end_time = milliseconds_to_seconds(sub.end)
sub_text = f"[{start_time}, {end_time}] {sub_text}"
if sub_text.strip() and sub_text not in subtitles:
subtitles.append(sub_text)
else:
for selected_frame_id in frame_indices:
cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
for sub in subs:
if sub.start < cur_time and sub.end > cur_time:
sub_text = sub.text.replace("\\N", " ")
if sub_time:
start_time = milliseconds_to_seconds(sub.start)
end_time = milliseconds_to_seconds(sub.end)
sub_text = f"[{start_time}, {end_time}] {sub_text}"
if sub_text.strip() and sub_text not in subtitles:
subtitles.append(sub_text)
if subtitles:
subtitles_str = '\n'.join(subtitles)
return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n"
else:
return ""
def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded_Mini", repo_id="CG-Bench/CG-Bench"):
def check_integrity(pth):
data_file = osp.join(pth, f"{dataset_name}.tsv")
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data["video"]:
if not osp.exists(osp.join(pth, video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
tsv_file = osp.join(pth, f"{dataset_name}.tsv")
with open(osp.join(pth, "cgbench_mini.json"), "r") as f:
data_file = pd.DataFrame(json.load(f))
data_file = data_file.assign(index=range(len(data_file)))
data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4")
data_file["subtitle_path"] = data_file["video_uid"].apply(
lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else ""
)
data_file = data_file[
[
"index",
"video_uid",
"video",
"duration",
"domain",
"sub_category",
"subtitle_path",
"question",
"answer",
"clue_intervals",
"qid",
]
]
data_file.to_csv(tsv_file, sep="\t", index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
unzip_hf_zip(dataset_path)
generate_tsv(dataset_path)
tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv")
return dict(data_file=tsv_file, root=dataset_path)
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
message = []
sys_prompt = self.SYS
user_prompt = ""
video_path = line["video"]
if video_llm:
message.append(dict(type="video", value=osp.join(self.data_root, video_path)))
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
if self.nframe:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
fps=vid_fps, sub_time=self.use_subtitle_time)
else:
user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
else:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
message.extend(dict(type="image", value=im) for im in image_paths)
if self.use_frame_time:
user_prompt += get_timestampes(frame_indices, vid_fps)
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
user_prompt += self.get_subtitles(
line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
sub_time=self.use_subtitle_time
)
question = line["question"]
user_prompt += f"Question: {question}\n\n"
message.append(dict(type="text", value=sys_prompt + user_prompt))
return message
    def clue_frame_paths(self, qid, num_frames=8):
        frame_root = osp.join(self.clue_frame_root, qid)
        os.makedirs(frame_root, exist_ok=True)
        return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
    def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1):
        frame_root = osp.join(self.clue_frame_root, qid)
        os.makedirs(frame_root, exist_ok=True)
        return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)]
def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1):
        if not isinstance(uid, str):
uid = str(uid)
vid_path = osp.join(self.data_root, video)
vid = decord.VideoReader(vid_path)
vid_fps = vid.get_avg_fps()
n_frames = len(vid)
if clue_intervals is not None:
merged_intervals = merge_intervals(clue_intervals)
if num_frames > 0 and fps < 0:
indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
frame_paths = self.clue_frame_paths(uid, len(indices))
elif fps > 0:
frame_indices = []
for start, end in merged_intervals:
start_frame = int(start * vid_fps)
end_frame = int(end * vid_fps)
step = vid_fps / fps
interval_indices = [
int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step))
]
frame_indices.extend(interval_indices)
if len(frame_indices) < 32:
indices = sample_frames_clue_average(merged_intervals, 32, vid_fps)
else:
indices = frame_indices
frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps)
else:
if num_frames > 0 and fps < 0:
step_size = len(vid) / (num_frames + 1)
indices = [int(i * step_size) for i in range(1, num_frames + 1)]
frame_paths = self.frame_paths(uid)
elif fps > 0:
total_duration = n_frames / vid_fps
required_frames = int(total_duration * fps)
step_size = vid_fps / fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(uid, len(indices))
valid_paths = []
valid_indices = []
if not np.all([osp.exists(p) for p in frame_paths]):
images = [vid[i].asnumpy() for i in indices]
for i, (img_array, path) in enumerate(zip(images, frame_paths)):
if osp.exists(path):
try:
with Image.open(path) as img:
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
else:
try:
img = Image.fromarray(img_array)
img.save(path)
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
else:
for i, path in enumerate(frame_paths):
try:
with Image.open(path) as img:
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
return valid_paths, valid_indices, vid_fps
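    # Open-ended scoring is two-stage: parse the prediction, judge it text-only, then
    # re-check ambiguous answers against frames drawn from the annotated clue intervals.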
def evaluate(self, eval_file, **judge_kwargs):
from .utils.cgbench import get_dimention_rating_open_ended, post_process_open
assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
data = load(eval_file)
        data_pred_no_na = data[~pd.isna(data["prediction"])].copy()
        data_pred_na = data[pd.isna(data["prediction"])].copy()
data_pred_na["model_result"] = -1
data_pred_na["step_1_result"] = -1
data_pred_na["step_2_result"] = -1
data_pred_na["score"] = -1
data_pred_no_na["model_result"] = data_pred_no_na.apply(
lambda row: post_process_open(
response=row["prediction"],
),
axis=1,
)
data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
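        # Step 1: text-only judging; step_1_result is in {-1, 0, 1, 2}, and only
        # answers marked 2 go on to the vision-based step 2.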
if judge_kwargs.get("model", None) != "gpt-4o-0806":
judge_kwargs["model"] = "gpt-4o-0806"
print("The judge model in cg-bench is gpt-4o-0806!")
model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
nproc = judge_kwargs.pop("nproc", 32)
lines_step_1 = data_step_1.to_dict("records")
tups_step_1 = [(model_step_1, line) for line in lines_step_1]
        keys_step_1 = [line["qid"] for line in lines_step_1]
ans = {}
if osp.exists(step_1_tmp_file):
ans = load(step_1_tmp_file)
tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans]
keys_step_1 = [i for i in keys_step_1 if i not in ans]
_ = track_progress_rich(
eval_open_first,
tups_step_1,
nproc=nproc,
keys=keys_step_1,
save=step_1_tmp_file,
)
step_1_results = load(step_1_tmp_file)
data_step_1 = save_step_1_steps(data_step_1, step_1_results) # -1, 0, 1, 2
data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1]
data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])]
data_step_2 = data_step_1[data_step_1["step_1_result"] == 2]
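        # Step 2: re-judge with up to 32 frames sampled from the annotated clue intervals.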
model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs)
lines_step_2 = data_step_2.to_dict("records")
tups_step_2 = []
for line in tqdm(lines_step_2):
clue_intervals = eval(line["clue_intervals"])
lmu_root = LMUDataRoot()
clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset)
data_root = self.data_root
frame_paths, _, _ = save_clue_video_frames(
data_root,
clue_frame_root,
video=line["video"],
uid=line["qid"],
clue_intervals=clue_intervals,
num_frames=32,
)
tups_step_2.append((model_step_2, line, frame_paths))
        keys_step_2 = [line["qid"] for line in lines_step_2]
ans = {}
if osp.exists(step_2_tmp_file):
ans = load(step_2_tmp_file)
tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans]
keys_step_2 = [i for i in keys_step_2 if i not in ans]
_ = track_progress_rich(
eval_open_second,
tups_step_2,
nproc=nproc,
keys=keys_step_2,
save=step_2_tmp_file,
)
step_2_results = load(step_2_tmp_file)
data_step_2 = save_step_2_steps(data_step_2, step_2_results)
data_no_step_2_results = data_step_2[data_step_2["score"] == -1]
data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])]
data = pd.concat(
[
data_pred_na,
data_no_model_result,
data_no_step_1_results,
data_step_1_over,
data_no_step_2_results,
data_step_2_over,
]
)
dump(data, score_file)
rating = get_dimention_rating_open_ended(score_file)
dump(rating, tgt_file)
return rating
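# Full-split variants below mirror the Mini classes but are built from cgbench.json
# rather than cgbench_mini.json.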
class CGBench_MCQ_Grounding(VideoBaseDataset):
    dataset = "CG-Bench_MCQ_Grounding"
    TYPE = "Video-MCQ-Grounding"
    MD5 = "eaead3d978a689269fefce4ae29c86df"
    SYS = {
        "long_acc": (
            "You will be provided with sampled frames from a video, along with a "
            "multiple-choice question that includes a question and several answer options.\n"
            "Your task is to analyze the provided frames and infer the most plausible "
            "answer based on the visual information.\n"
            "If the video does not provide enough information, infer the answer based "
            "on the options available and still provide a result. "
            "Therefore, in all cases, an answer must be given.\n"
            "Only output the answer in the following format:\n\n"
            '```json\n{"result": "option"}\n```\n\n'
            'The "option" is the uppercase letter corresponding to your answer.\n\n'
        ),
        "clue_acc": (
            "You will be provided with sampled frames from a video, along with a "
            "multiple-choice question that includes a question and several answer options.\n"
            "Your task is to analyze the provided frames and infer the most plausible "
            "answer based on the visual information.\n"
            "If the video does not provide enough information, infer the answer based "
            "on the options available and still provide a result. "
            "Therefore, in all cases, an answer must be given.\n"
            "Only output the answer in the following format:\n\n"
            '```json\n{"result": "option"}\n```\n\n'
            'The "option" is the uppercase letter corresponding to your answer.\n\n'
        ),
"miou": (
"You will be provided with uniformly sampled frames from a video and their "
"timestamps, along with a multiple-choice question that includes a question "
"and several answer options.\n"
"Your task is to determine in which intervals the 'clue intervals' exist "
"that contain visual information needed to answer the question.\n"
"Only output the answer in the following format:\n\n"
'```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n'
"In this output format, each 'start' and 'end' represents the beginning and "
"end of an interval in seconds where relevant clues can be found.\n"
"You must provide at least one interval and at most five intervals. "
"Intervals exceeding five will NOT be considered valid.\n"
),
"miou_wo_frame_time": (
"You will be provided with uniformly sampled frames from a video, along "
"with a multiple-choice question that includes a question and several "
"answer options.\n"
"Your task is to determine in which intervals the 'clue intervals' exist "
"that contain visual information needed to answer the question.\n"
"Only output the answer in the following format:\n\n"
'```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n'
'In this output format, each "start" and "end" represents the start and '
"end of the video where the relevant clue can be found in the form of a "
"floating point number between 0 and 1, where 0 represents the start time "
"of the video and 1 represents the end time of the video.\n"
"You must provide at least one interval and at most five intervals. "
"Intervals exceeding five will NOT be considered valid.\n"
),
}
def __init__(
self,
dataset="CG-Bench_MCQ_Grounding",
use_subtitle=False,
use_subtitle_time=False,
use_frame_time=False,
nframe=0,
fps=-1,
):
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
self.use_subtitle = use_subtitle
self.use_subtitle_time = use_subtitle_time
self.use_frame_time = use_frame_time
self.dataset_name = dataset
lmu_root = LMUDataRoot()
self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
@classmethod
def supported_datasets(cls):
return ["CG-Bench_MCQ_Grounding"]
def clue_frame_paths(self, qid, num_frames=8):
frame_root = osp.join(self.clue_frame_root, qid)
os.makedirs(frame_root, exist_ok=True)
return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1):
frame_root = osp.join(self.clue_frame_root, qid)
os.makedirs(frame_root, exist_ok=True)
return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)]
def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
subtitles = []
srt_path = osp.join(self.data_root, subtitle_path)
assert osp.exists(srt_path)
import pysubs2
subs = pysubs2.load(srt_path, encoding="utf-8")
if not frame_indices:
for sub in subs:
sub_text = sub.text.replace("\\N", " ")
if sub_time:
start_time = milliseconds_to_seconds(sub.start)
end_time = milliseconds_to_seconds(sub.end)
sub_text = f"[{start_time}, {end_time}] {sub_text}"
if sub_text.strip() and sub_text not in subtitles:
subtitles.append(sub_text)
else:
for selected_frame_id in frame_indices:
cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
for sub in subs:
if sub.start < cur_time and sub.end > cur_time:
sub_text = sub.text.replace("\\N", " ")
if sub_time:
start_time = milliseconds_to_seconds(sub.start)
end_time = milliseconds_to_seconds(sub.end)
sub_text = f"[{start_time}, {end_time}] {sub_text}"
if sub_text.strip() and sub_text not in subtitles:
subtitles.append(sub_text)
if subtitles:
subtitles_str = '\n'.join(subtitles)
return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n"
else:
return ""
def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding", repo_id="CG-Bench/CG-Bench"):
def check_integrity(pth):
data_file = osp.join(pth, f"{dataset_name}.tsv")
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data["video"]:
if not osp.exists(osp.join(pth, video_pth)):
return False
for clue_video_pth in data["clue_video_path"]:
if clue_video_pth and not (isinstance(clue_video_pth, float) and np.isnan(clue_video_pth)):
if not osp.exists(osp.join(pth, clue_video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
tsv_file = osp.join(pth, f"{dataset_name}.tsv")
task_modes = ["long_acc", "clue_acc", "miou"]
all_data = []
for task_mode in task_modes:
with open(osp.join(pth, "cgbench.json"), "r") as f:
data_file = pd.DataFrame(json.load(f))
data_file = data_file.assign(index=range(len(data_file)))
data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4")
data_file["subtitle_path"] = data_file["video_uid"].apply(
lambda x: (
f"cg_subtitles/{x}.srt"
                            if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt"))
else ""
)
)
data_file["clue_video_path"] = ""
                    if task_mode == "clue_acc":
                        data_file["clue_video_path"] = data_file.apply(
                            lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1
                        )
data_file["task_mode"] = task_mode
if task_mode in ["clue_acc", "long_acc"]:
data_file["answer"] = data_file["right_answer"]
if task_mode == "miou":
data_file["answer"] = data_file["clue_intervals"]
if task_mode in ["long_acc", "miou"]:
data_file["clue_intervals"] = ""
data_file = data_file[
[
"index",
"video_uid",
"video",
"duration",
"domain",
"choices",
"sub_category",
"subtitle_path",
"question",
"answer",
"task_mode",
"clue_intervals",
"qid",
"clue_video_path",
]
]
all_data.append(data_file)
final_data = pd.concat(all_data, ignore_index=True)
final_data["index"] = range(len(final_data))
final_data.to_csv(tsv_file, sep="\t", index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
unzip_hf_zip(dataset_path)
generate_tsv(dataset_path)
tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv")
return dict(data_file=tsv_file, root=dataset_path)
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
task_mode = line["task_mode"]
message = []
origin_use_subtitle_time = self.use_subtitle_time
try:
if task_mode in ["long_acc", "clue_acc"]:
system_prompt = self.SYS[task_mode]
elif task_mode == "miou":
if self.use_frame_time and not video_llm:
system_prompt = self.SYS[task_mode]
else:
system_prompt = self.SYS["miou_wo_frame_time"]
                    if self.use_subtitle_time:
                        self.use_subtitle_time = False
user_prompt = ""
if task_mode in ["long_acc", "miou"]:
video_path = line["video"]
if video_llm:
message.append(dict(type="video", value=osp.join(self.data_root, video_path)))
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
if self.nframe:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
fps=vid_fps, sub_time=self.use_subtitle_time)
else:
user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
else:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
message.extend(dict(type="image", value=im) for im in image_paths)
if self.use_frame_time:
user_prompt += get_timestampes(frame_indices, vid_fps)
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
user_prompt += self.get_subtitles(
line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
sub_time=self.use_subtitle_time
)
elif task_mode == "clue_acc":
clue_video_path = line["clue_video_path"]
video_path = line["video"]
if video_llm:
message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path)))
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
if self.nframe:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
fps=vid_fps, sub_time=self.use_subtitle_time)
else:
user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
else:
if self.nframe > 32:
self.nframe = 32
                        print("The maximum number of frames is 32 when evaluating clue-based MCQ in CG-Bench!")
clue_intervals = eval(line["clue_intervals"])
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps
)
message.extend(dict(type="image", value=im) for im in image_paths)
if self.use_frame_time:
user_prompt += get_timestampes(frame_indices, vid_fps)
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
user_prompt += self.get_subtitles(
line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
sub_time=self.use_subtitle_time
)
question = line["question"]
user_prompt += f"Question: {question}\n\n"
choices = eval(line["choices"])
labels = [chr(ord("A") + i) for i in range(len(choices))]
user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n"
message.append(dict(type="text", value=system_prompt + user_prompt))
return message
finally:
# Ensure that `use_subtitle_time` is always restored to its original value
self.use_subtitle_time = origin_use_subtitle_time
def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1):
        if not isinstance(uid, str):
uid = str(uid)
vid_path = osp.join(self.data_root, video)
vid = decord.VideoReader(vid_path)
vid_fps = vid.get_avg_fps()
n_frames = len(vid)
if clue_intervals is not None:
merged_intervals = merge_intervals(clue_intervals)
if num_frames > 0 and fps < 0:
indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
frame_paths = self.clue_frame_paths(uid, len(indices))
elif fps > 0:
frame_indices = []
for start, end in merged_intervals:
start_frame = int(start * vid_fps)
end_frame = int(end * vid_fps)
step = vid_fps / fps
interval_indices = [
int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step))
]
frame_indices.extend(interval_indices)
if len(frame_indices) < 32:
indices = sample_frames_clue_average(merged_intervals, 32, vid_fps)
else:
indices = frame_indices
frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps)
else:
if num_frames > 0 and fps < 0:
step_size = len(vid) / (num_frames + 1)
indices = [int(i * step_size) for i in range(1, num_frames + 1)]
frame_paths = self.frame_paths(uid)
elif fps > 0:
total_duration = n_frames / vid_fps
required_frames = int(total_duration * fps)
step_size = vid_fps / fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(uid, len(indices))
# Save and validate frames
valid_paths = []
valid_indices = []
if not np.all([osp.exists(p) for p in frame_paths]):
images = [vid[i].asnumpy() for i in indices]
for i, (img_array, path) in enumerate(zip(images, frame_paths)):
if osp.exists(path):
try:
with Image.open(path) as img:
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
else:
try:
img = Image.fromarray(img_array)
img.save(path)
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
else:
for i, path in enumerate(frame_paths):
try:
with Image.open(path) as img:
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
return valid_paths, valid_indices, vid_fps
def evaluate(self, eval_file, **judge_kwargs):
assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
data = load(eval_file)
        data_un = data[~pd.isna(data["prediction"])].copy()
        data_pred_na = data[pd.isna(data["prediction"])].copy()
data_pred_na["score"] = -1
data_un["score"] = data_un.apply(
lambda row: post_process(
response=row["prediction"],
right_answer=row["answer"],
task_mode=row["task_mode"],
duration=row["duration"],
),
axis=1,
)
data = pd.concat([data_pred_na, data_un])
rejected_count = (data["score"] == -1).sum()
print(
f"Among {len(data)} questions, "
f"failed to obtain prediction for {len(data_pred_na)} questions, "
f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. "
f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating."
)
dump(data, score_file)
rating = get_dimention_rating_mcq_grouding(score_file)
dump(rating, tgt_file)
return rating
# For evaluation step 2, passing [prompt] + image_paths to the judge is sufficient.
class CGBench_OpenEnded(VideoBaseDataset):
TYPE = "Video-OpenEnded"
dataset = "CG-Bench_OpenEnded"
MD5 = "796035eda0b1e916c517cdc1bc145cfc"
SYS = (
"You will be provided with sampled frames from a video, along with a "
"question.\n"
"Your task is to analyze the provided frames and infer the most plausible "
"answer based on the visual information.\n"
"If the visual information is ambiguous or insufficient, use the available "
"context to reason your answer.\n"
"Only output the answer in the following format:\n\n"
'```json\n{"result": "answer"}\n```\n\n'
'The "answer" can be a word, phrase, or sentence that directly responds to '
"the question.\n\n"
)
def __init__(
self,
dataset="CG-Bench_OpenEnded",
use_subtitle=False,
use_subtitle_time=False,
use_frame_time=False,
nframe=0,
fps=-1,
):
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
self.use_subtitle = use_subtitle
self.use_subtitle_time = use_subtitle_time
self.use_frame_time = use_frame_time
self.dataset_name = dataset
lmu_root = LMUDataRoot()
self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
@classmethod
def supported_datasets(cls):
return ["CG-Bench_OpenEnded"]
def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
subtitles = []
srt_path = osp.join(self.data_root, subtitle_path)
assert osp.exists(srt_path)
import pysubs2
subs = pysubs2.load(srt_path, encoding="utf-8")
if not frame_indices:
for sub in subs:
sub_text = sub.text.replace("\\N", " ")
if sub_time:
start_time = milliseconds_to_seconds(sub.start)
end_time = milliseconds_to_seconds(sub.end)
sub_text = f"[{start_time}, {end_time}] {sub_text}"
if sub_text.strip() and sub_text not in subtitles:
subtitles.append(sub_text)
else:
for selected_frame_id in frame_indices:
cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
for sub in subs:
if sub.start < cur_time and sub.end > cur_time:
sub_text = sub.text.replace("\\N", " ")
if sub_time:
start_time = milliseconds_to_seconds(sub.start)
end_time = milliseconds_to_seconds(sub.end)
sub_text = f"[{start_time}, {end_time}] {sub_text}"
if sub_text.strip() and sub_text not in subtitles:
subtitles.append(sub_text)
if subtitles:
subtitles_str = '\n'.join(subtitles)
return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n"
else:
return ""
def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded", repo_id="CG-Bench/CG-Bench"):
def check_integrity(pth):
data_file = osp.join(pth, f"{dataset_name}.tsv")
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data["video"]:
if not osp.exists(osp.join(pth, video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
tsv_file = osp.join(pth, f"{dataset_name}.tsv")
with open(osp.join(pth, "cgbench.json"), "r") as f:
data_file = pd.DataFrame(json.load(f))
data_file = data_file.assign(index=range(len(data_file)))
data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4")
data_file["subtitle_path"] = data_file["video_uid"].apply(
lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else ""
)
data_file = data_file[
[
"index",
"video_uid",
"video",
"duration",
"domain",
"sub_category",
"subtitle_path",
"question",
"answer",
"clue_intervals",
"qid",
]
]
data_file.to_csv(tsv_file, sep="\t", index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
unzip_hf_zip(dataset_path)
generate_tsv(dataset_path)
tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv")
return dict(data_file=tsv_file, root=dataset_path)
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
message = []
sys_prompt = self.SYS
user_prompt = ""
video_path = line["video"]
if video_llm:
message.append(dict(type="video", value=osp.join(self.data_root, video_path)))
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
if self.nframe:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
fps=vid_fps, sub_time=self.use_subtitle_time)
else:
user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
else:
image_paths, frame_indices, vid_fps = self.save_video_frames(
video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
)
message.extend(dict(type="image", value=im) for im in image_paths)
if self.use_frame_time:
user_prompt += get_timestampes(frame_indices, vid_fps)
if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
user_prompt += self.get_subtitles(
line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
sub_time=self.use_subtitle_time
)
question = line["question"]
user_prompt += f"Question: {question}\n\n"
message.append(dict(type="text", value=sys_prompt + user_prompt))
return message
    def clue_frame_paths(self, qid, num_frames=8):
        frame_root = osp.join(self.clue_frame_root, qid)
        os.makedirs(frame_root, exist_ok=True)
        return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
    def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1):
        frame_root = osp.join(self.clue_frame_root, qid)
        os.makedirs(frame_root, exist_ok=True)
        return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)]
def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1):
        if not isinstance(uid, str):
uid = str(uid)
vid_path = osp.join(self.data_root, video)
vid = decord.VideoReader(vid_path)
vid_fps = vid.get_avg_fps()
n_frames = len(vid)
if clue_intervals is not None:
merged_intervals = merge_intervals(clue_intervals)
if num_frames > 0 and fps < 0:
indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
frame_paths = self.clue_frame_paths(uid, len(indices))
elif fps > 0:
frame_indices = []
for start, end in merged_intervals:
start_frame = int(start * vid_fps)
end_frame = int(end * vid_fps)
step = vid_fps / fps
interval_indices = [
int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step))
]
frame_indices.extend(interval_indices)
if len(frame_indices) < 32:
indices = sample_frames_clue_average(merged_intervals, 32, vid_fps)
else:
indices = frame_indices
frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps)
else:
if num_frames > 0 and fps < 0:
step_size = len(vid) / (num_frames + 1)
indices = [int(i * step_size) for i in range(1, num_frames + 1)]
frame_paths = self.frame_paths(uid)
elif fps > 0:
total_duration = n_frames / vid_fps
required_frames = int(total_duration * fps)
step_size = vid_fps / fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(uid, len(indices))
valid_paths = []
valid_indices = []
if not np.all([osp.exists(p) for p in frame_paths]):
images = [vid[i].asnumpy() for i in indices]
for i, (img_array, path) in enumerate(zip(images, frame_paths)):
if osp.exists(path):
try:
with Image.open(path) as img:
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
else:
try:
img = Image.fromarray(img_array)
img.save(path)
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
else:
for i, path in enumerate(frame_paths):
try:
with Image.open(path) as img:
img.verify()
valid_paths.append(path)
valid_indices.append(indices[i])
except Exception:
continue
return valid_paths, valid_indices, vid_fps
def evaluate(self, eval_file, **judge_kwargs):
from .utils.cgbench import get_dimention_rating_open_ended, post_process_open
assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
tgt_file = eval_file.replace(".xlsx", "_rating.json")
score_file = eval_file.replace(".xlsx", "_score.xlsx")
step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
data = load(eval_file)
        data_pred_no_na = data[~pd.isna(data["prediction"])].copy()
        data_pred_na = data[pd.isna(data["prediction"])].copy()
data_pred_na["model_result"] = -1
data_pred_na["step_1_result"] = -1
data_pred_na["step_2_result"] = -1
data_pred_na["score"] = -1
data_pred_no_na["model_result"] = data_pred_no_na.apply(
lambda row: post_process_open(
response=row["prediction"],
),
axis=1,
)
if judge_kwargs.get("model", None) != "gpt-4o-0806":
judge_kwargs["model"] = "gpt-4o-0806"
print("The judge model in cg-bench is gpt-4o-0806!")
data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
nproc = judge_kwargs.pop('nproc', 32)
lines_step_1 = data_step_1.to_dict("records")
tups_step_1 = [(model_step_1, line) for line in lines_step_1]
        keys_step_1 = [line["qid"] for line in lines_step_1]
ans = {}
if osp.exists(step_1_tmp_file):
ans = load(step_1_tmp_file)
tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans]
keys_step_1 = [i for i in keys_step_1 if i not in ans]
_ = track_progress_rich(
eval_open_first,
tups_step_1,
nproc=nproc,
keys=keys_step_1,
save=step_1_tmp_file,
)
step_1_results = load(step_1_tmp_file)
data_step_1 = save_step_1_steps(data_step_1, step_1_results) # -1, 0, 1, 2
data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1]
data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])]
data_step_2 = data_step_1[data_step_1["step_1_result"] == 2]
model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs)
lines_step_2 = data_step_2.to_dict("records")
tups_step_2 = []
for line in tqdm(lines_step_2):
clue_intervals = eval(line["clue_intervals"])
lmu_root = LMUDataRoot()
clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset)
data_root = self.data_root
frame_paths, _, _ = save_clue_video_frames(
data_root,
clue_frame_root,
video=line["video"],
uid=line["qid"],
clue_intervals=clue_intervals,
num_frames=32,
)
tups_step_2.append((model_step_2, line, frame_paths))
        keys_step_2 = [line["qid"] for line in lines_step_2]
ans = {}
if osp.exists(step_2_tmp_file):
ans = load(step_2_tmp_file)
tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans]
keys_step_2 = [i for i in keys_step_2 if i not in ans]
_ = track_progress_rich(
eval_open_second,
tups_step_2,
nproc=nproc,
keys=keys_step_2,
save=step_2_tmp_file,
)
step_2_results = load(step_2_tmp_file)
data_step_2 = save_step_2_steps(data_step_2, step_2_results)
data_no_step_2_results = data_step_2[data_step_2["score"] == -1]
data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])]
data = pd.concat(
[
data_pred_na,
data_no_model_result,
data_no_step_1_results,
data_step_1_over,
data_no_step_2_results,
data_step_2_over,
]
)
dump(data, score_file)
rating = get_dimention_rating_open_ended(score_file)
dump(rating, tgt_file)
return rating