mirror of
https://github.com/OpenBMB/MiniCPM-V.git
synced 2026-02-05 18:29:18 +08:00
Modify eval_mm for MiniCPM-o 2.6
This commit is contained in:
328
eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py
Normal file
328
eval_mm/vlmevalkit/vlmeval/dataset/longvideobench.py
Normal file
@@ -0,0 +1,328 @@
|
||||
from huggingface_hub import snapshot_download
|
||||
from ..smp import *
|
||||
from .video_base import VideoBaseDataset
|
||||
from .utils import build_judge, DEBUG_MESSAGE
|
||||
from glob import glob
|
||||
|
||||
FAIL_MSG = 'Failed to obtain answer via API.'
|
||||
|
||||
|
||||
def timestamp_to_seconds(timestamp):
    """Convert an ``"H:M:S"`` timestamp string into a number of seconds.

    The string must contain exactly three ``:``-separated fields. Hours and
    minutes are parsed as integers; the seconds field is parsed as a float,
    so it may carry a fractional part (e.g. ``"00:01:30.5"`` -> ``90.5``).
    """
    hours, minutes, seconds = timestamp.split(":")
    whole_part = 3600 * int(hours) + 60 * int(minutes)
    return whole_part + float(seconds)
|
||||
|
||||
|
||||
def uniformly_subsample(lst, K):
    """Pick at most ``K`` evenly spaced items from ``lst``, keeping their order.

    Args:
        lst: Sequence to sample from (must support ``len`` and indexing).
        K: Maximum number of items to keep.

    Returns:
        ``lst`` itself (not a copy) when ``K >= len(lst)``; an empty list when
        ``K <= 0``; otherwise a new list of ``K`` items taken at indices
        ``floor(i * len(lst) / K)`` for ``i`` in ``range(K)``.
    """
    n = len(lst)
    if K >= n:
        return lst
    if K <= 0:
        # Nothing requested. Guarding here avoids dividing by zero when K == 0.
        return []
    # Integer arithmetic keeps the index exact; a float step can round an
    # exact boundary down by one.
    return [lst[i * n // K] for i in range(K)]
|
||||
|
||||
|
||||
def insert_subtitles_into_frames(
    frames,
    frame_timestamps,
    subtitles,
    starting_timestamp_for_subtitles,
    duration,
):
    """Interleave subtitle text with video frames in timestamp order.

    Args:
        frames: Frame values, one per sampled frame. Each is stored unchanged
            as the ``"value"`` of an ``{"type": "image"}`` entry.
        frame_timestamps: Timestamp of each frame, parallel to ``frames``.
            Frames are consumed in list order, so timestamps are expected to
            be non-decreasing.
        subtitles: Iterable of subtitle dicts in one of two layouts:
            ``{"timestamp": [start, end], "text": ...}`` with numeric times,
            or ``{"start": "H:M:S", "end": "H:M:S", "line": ...}``.
        starting_timestamp_for_subtitles: Offset subtracted from every
            subtitle start/end before comparing with ``frame_timestamps``.
        duration: Fallback end time used when a ``"timestamp"`` entry has a
            non-numeric end (e.g. ``None``).

    Returns:
        A list of ``{"type": "image", "value": frame}`` and
        ``{"type": "text", "value": text + "\\n"}`` dicts. Every paired frame
        appears exactly once. A subtitle is emitted only if at least one frame
        timestamp lies strictly inside its (possibly widened) time window.
    """
    interleaved_list = []
    # Only frames that have a matching timestamp are used (zip semantics).
    n_frames = min(len(frames), len(frame_timestamps))
    paired_timestamps = frame_timestamps[:n_frames]
    cur_i = 0

    for subtitle in subtitles:
        if "timestamp" in subtitle:
            start, end = subtitle["timestamp"]
            # A missing end is replaced by the clip duration. Integer ends are
            # valid numbers and must not be treated as missing.
            if not isinstance(end, (int, float)):
                end = duration
            subtitle_text = subtitle["text"]
        else:
            start = timestamp_to_seconds(subtitle["start"])
            end = timestamp_to_seconds(subtitle["end"])
            subtitle_text = subtitle["line"]

        start -= starting_timestamp_for_subtitles
        end -= starting_timestamp_for_subtitles
        subtitle_timestamp = (start + end) / 2

        # Emit every not-yet-emitted frame up to the subtitle's midpoint.
        while cur_i < n_frames and paired_timestamps[cur_i] <= subtitle_timestamp:
            interleaved_list.append({"type": "image", "value": frames[cur_i]})
            cur_i += 1

        # Widen very short subtitles to a one-second window around the midpoint.
        if end - start < 1:
            end = subtitle_timestamp + 0.5
            start = subtitle_timestamp - 0.5

        # Keep the subtitle only if some sampled frame falls inside its window.
        if any(start < ts < end for ts in paired_timestamps):
            interleaved_list.append({"type": "text", "value": subtitle_text + "\n"})

    # Frames after the last subtitle midpoint.
    for frame in frames[cur_i:n_frames]:
        interleaved_list.append({"type": "image", "value": frame})
    return interleaved_list
|
||||
|
||||
|
||||
class LongVideoBench(VideoBaseDataset):
    """LongVideoBench multiple-choice video question-answering dataset.

    Handles downloading and unpacking the dataset, sampling frames from each
    video, building the (optionally subtitle-interleaved) prompt and scoring
    predictions.
    """

    # Expected md5 of the generated ``<dataset_name>.tsv`` file.
    MD5 = '82905eae3a5ae7383c5a8ee9655e1ab9'
    # System text placed as the first message item of every prompt.
    SYS = ''

    TYPE = 'Video-MCQ'

    def __init__(self, dataset='LongVideoBench', use_subtitle=False, nframe=0, fps=-1):
        """Create the dataset.

        Args:
            dataset: Dataset name forwarded to the base class.
            use_subtitle: When true, subtitles are interleaved with the frames
                in image-based prompts.
            nframe: Number of frames to sample per video (used when ``> 0``
                and ``fps < 0``).
            fps: Frame sampling rate (used when ``> 0``).
        """
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
        self.use_subtitle = use_subtitle
        self.dataset_name = dataset

    @classmethod
    def supported_datasets(cls):
        """Return the list of dataset names this class can serve."""
        return ['LongVideoBench']

    def prepare_dataset(self, dataset_name='LongVideoBench', repo_id='longvideobench/LongVideoBench'):
        """Make sure the dataset is available locally and return its location.

        If a cached copy passes the integrity check it is used as is.
        Otherwise the repository is downloaded, split tar archives are
        concatenated and extracted, and the TSV index is generated from
        ``lvb_val.json``.

        Returns:
            ``dict(data_file=<path to tsv>, root=<dataset directory>)``.

        Raises:
            RuntimeError: if no local dataset directory can be determined
                after the download.
        """

        def check_integrity(pth):
            """Return True if the TSV exists, matches MD5 and all videos exist."""
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            actual_md5 = md5(data_file)
            if actual_md5 != self.MD5:
                print("md5 mismatch", actual_md5, self.MD5)
                return False
            data = load(data_file)
            for video_pth in data['video_path']:
                if not osp.exists(osp.join(pth, video_pth)):
                    print(video_pth, "is not found")
                    return False
            return True

        def generate_tsv(pth):
            """Build ``<dataset_name>.tsv`` from ``lvb_val.json`` unless already valid."""
            tsv_path = osp.join(pth, f'{dataset_name}.tsv')
            if osp.exists(tsv_path) and md5(tsv_path) == self.MD5:
                return

            table = pd.read_json(osp.join(pth, 'lvb_val.json'))
            table = table.assign(index=range(len(table)))
            table['video'] = table['video_id']
            table['video_path'] = table['video_path'].apply(lambda x: f'./videos/{x}')

            table.to_csv(tsv_path, sep='\t', index=False)

        def untar_video_data(tar_file, cache_dir):
            """Extract ``tar_file`` into ``cache_dir``."""
            import tarfile
            # NOTE(review): extractall trusts member paths inside the archive;
            # acceptable only because the archive comes from the dataset repo.
            with tarfile.open(tar_file, "r") as tar_ref:
                tar_ref.extractall(cache_dir)
                print(f"Extracted all files from {tar_file} to {cache_dir}")

        def concat_tar_parts(tar_parts, output_tar):
            """Concatenate the sorted ``tar_parts`` into ``output_tar``."""
            import shutil
            from tqdm import tqdm
            with open(output_tar, "wb") as out_tar:
                for part in tqdm(sorted(tar_parts)):
                    with open(part, "rb") as part_file:
                        # Stream in chunks; parts can be too large to hold in memory.
                        shutil.copyfileobj(part_file, out_tar)
            print(f"Concatenated parts {tar_parts} into {output_tar}")

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/LongVideoBench"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                downloaded_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                downloaded_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            print("All videos are downloaded for LongVideoBench")

            # On a cold cache ``get_cache_path`` returned None; the download
            # call reports where the files actually landed, so prefer that.
            if downloaded_path:
                cache_path = downloaded_path
            if cache_path is None:
                raise RuntimeError(
                    f'Could not determine the local directory of dataset {repo_id} after download.'
                )

            if not glob(osp.join(cache_path, "videos")):
                tar_files = glob(osp.join(cache_path, "**/*.tar*"), recursive=True)

                # Group tar parts together
                tar_parts_dict = {}
                for tar_file in tar_files:
                    base_name = tar_file.split(".tar")[0]
                    tar_parts_dict.setdefault(base_name, []).append(tar_file)

                # Concatenate and untar split parts
                for base_name, parts in tar_parts_dict.items():
                    print(f"Extracting following tar files: {parts}")
                    output_tar = base_name + ".tar"
                    if not osp.exists(output_tar):
                        print('Start concatenating tar files')

                        concat_tar_parts(parts, output_tar)
                        print('Finish concatenating tar files')

                    if not osp.exists(osp.join(cache_path, osp.basename(base_name))):
                        untar_video_data(output_tar, cache_path)

                print('All videos are extracted for LongVideoBench')

            dataset_path = cache_path
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        return dict(data_file=data_file, root=dataset_path)

    def save_video_frames(self, video_path, video_llm=False):
        """Select frame indices for a video and cache the frames as image files.

        Args:
            video_path: Video path relative to ``self.data_root``.
            video_llm: When true, no image files are written.

        Returns:
            ``(frame_paths, indices, video_info)`` where ``frame_paths`` are
            the image file paths, ``indices`` the sampled frame indices and
            ``video_info`` a dict with the video's ``'fps'`` and ``'n_frames'``.

        Raises:
            ValueError: if neither ``nframe > 0`` (with ``fps < 0``) nor
                ``fps > 0`` is configured, so no sampling strategy applies.
        """
        vid_path = osp.join(self.data_root, video_path)
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        # ``video_path[:-4]`` strips a 4-character extension such as ".mp4".
        if self.nframe > 0 and self.fps < 0:
            # nframe evenly spaced frames, excluding the very first and last.
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video_path[:-4])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video_path[:-4], len(indices))
        else:
            raise ValueError(
                f'Cannot sample frames with nframe={self.nframe} and fps={self.fps}: '
                'set nframe > 0 (with fps < 0) or fps > 0.'
            )

        if not video_llm:
            # Decode and write only the frames that are missing, one at a
            # time, so long videos are never held in memory as a whole.
            for frame_idx, pth in zip(indices, frame_paths):
                if not osp.exists(pth):
                    Image.fromarray(vid[frame_idx].asnumpy()).save(pth)

        return frame_paths, indices, video_info

    def build_prompt(self, line, video_llm):
        """Build the multimodal message list for one question.

        Args:
            line: Either a row of ``self.data`` or its integer position.
            video_llm: When true, the message carries the video file itself;
                otherwise it carries sampled frames (interleaved with
                subtitles when ``self.use_subtitle`` is set).

        Returns:
            A list of ``dict(type=..., value=...)`` items: the system text,
            the visual content, then the question with lettered options.
        """
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        frames, indices, video_info = self.save_video_frames(line['video_path'], video_llm)
        fps = video_info["fps"]

        message = [dict(type='text', value=self.SYS)]
        if video_llm:
            message.append(dict(type='video', value=osp.join(self.data_root, line['video_path'])))
        elif self.use_subtitle:
            # Subtitles are added only when requested (the test used to be inverted).
            with open(osp.join(self.data_root, "subtitles", line["subtitle_path"])) as f:
                subtitles = json.load(f)

            message += insert_subtitles_into_frames(
                frames,
                [ind_ / fps for ind_ in indices],
                subtitles,
                line["starting_timestamp_for_subtitles"],
                line["duration"]
            )
        else:
            for im in frames:
                message.append(dict(type='image', value=im))

        # NOTE(review): eval() runs on text from the dataset file; this is only
        # safe while that file is trusted. Consider ast.literal_eval.
        candidates = eval(line['candidates'])
        options = '\n'.join(
            "{}. {}".format(chr(ord("A") + i), cand) for i, cand in enumerate(candidates)
        )
        # Build the question locally; do not write back into the caller's row.
        question = line['question'] + '\n' + options
        prompt = question + "\nAnswer with the option's letter from the given choices directly."
        message.append(dict(type='text', value=prompt))
        return message

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Score the predictions in ``eval_file`` and return the rating.

        A ``*_score.xlsx`` file is created next to ``eval_file`` when it does
        not exist yet. The rating computed from it is written to
        ``*_rating.json`` and returned (per the original source comment, a
        dictionary).

        Args:
            eval_file: Path of an ``.xlsx`` file with ``index``,
                ``correct_choice`` and ``prediction`` columns.
            **judge_kwargs: Judge options; ``model`` selects exact matching
                or a GPT judge used when no option letter can be extracted.
        """
        from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.get('model', 'exact_matching')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                # Select rows by the 'index' column for reads and writes; the
                # DataFrame's own row labels need not match that column.
                row_mask = data['index'] == idx
                ans = data.loc[row_mask, 'correct_choice'].values[0]
                ans = chr(ord("A") + ans)
                pred = str(data.loc[row_mask, 'prediction'].values[0])

                extracted = extract_characters_regex(pred)
                if extracted == '':
                    # No option letter found directly; fall back to the judge.
                    extracted = extract_option(
                        model,
                        data.loc[row_mask].to_dict(orient='records')[0],
                        'LongVideoBench'
                    )
                data.loc[row_mask, 'score'] = int(extracted == ans)

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
|
||||
Reference in New Issue
Block a user