mirror of
https://github.com/OpenBMB/MiniCPM-V.git
synced 2026-02-05 18:29:18 +08:00
451 lines
14 KiB
Python
451 lines
14 KiB
Python
from ...smp import *
|
|
from PIL import Image, ImageOps
|
|
import torchvision
|
|
import random
|
|
import numbers
|
|
import math
|
|
import torch
|
|
|
|
|
|
def get_dimension_rating(data_path):
    """Aggregate per-task and overall accuracy from a scored result table.

    Args:
        data_path: Path handed to ``load``. The loaded object must support
            ``iterrows()`` and every row must expose ``task_type`` and
            ``score`` entries.

    Returns:
        dict: Maps every task type, plus the key ``'overall'``, to a list
        ``[correct, total, 'xx.xx%']``. For an empty table only
        ``'overall'`` is present, reported as ``[0, 0, '0.00%']``.
    """
    data = load(data_path)
    result_board = {}
    for _, item in data.iterrows():
        counts = result_board.setdefault(item['task_type'], [0, 0])
        counts[1] += 1
        if item['score']:
            counts[0] += 1

    correct = 0
    total = 0
    for counts in result_board.values():
        correct += counts[0]
        total += counts[1]
        # counts[1] is at least 1 here: a task only exists once a row was seen.
        counts.append(f'{counts[0] / counts[1] * 100:.2f}%')

    # An empty table has nothing to average; report 0% instead of dividing by zero.
    overall = correct / total * 100 if total else 0.0
    result_board['overall'] = [correct, total, f'{overall:.2f}%']

    return result_board
|
|
|
|
|
|
def check_ans(pred, gt):
    """Decide whether a prediction picks the same option as the ground truth.

    Only the first space-separated token of each string (the option label,
    e.g. ``(A)``) is compared, case-insensitively. The remaining answer text
    is ignored.

    Args:
        pred: Predicted answer string, e.g. ``"(A) a cat"`` or ``"A."``.
        gt: Ground-truth answer string, e.g. ``"(A) a cat."``.

    Returns:
        bool: True when the predicted option (with periods removed) is a
        substring of the ground-truth option, or the ground-truth option is a
        substring of the predicted option. False when either option token is
        empty.
    """
    # Strip first so leading whitespace cannot yield an empty first token.
    pred_option = pred.lower().strip().split(' ')[0]
    gt_option = gt.lower().strip().split(' ')[0]
    pred_label = pred_option.replace('.', '')

    # The empty string is a substring of everything, so an empty option on
    # either side would otherwise always be scored as a match.
    if not pred_label or not gt_option:
        return False

    return pred_label in gt_option or gt_option in pred_option
|
|
|
|
|
|
class GroupRandomCrop(object):
    """Crop every image of a group at one shared, randomly drawn location.

    ``size`` is either a single number (square crop) or a pair that is
    unpacked as ``(height, width)``.
    """

    def __init__(self, size):
        self.size = (int(size), int(size)) if isinstance(size, numbers.Number) else size

    def __call__(self, img_group):
        """Return the cropped images; images already at crop size pass through."""
        width, height = img_group[0].size
        crop_h, crop_w = self.size

        # A single offset is drawn for the whole group so all images stay aligned.
        left = random.randint(0, width - crop_w)
        top = random.randint(0, height - crop_h)
        box = (left, top, left + crop_w, top + crop_h)
        already_sized = width == crop_w and height == crop_h

        cropped = []
        for frame in img_group:
            # Every image in the group must share the size of the first one.
            assert (frame.size[0] == width and frame.size[1] == height)
            cropped.append(frame if already_sized else frame.crop(box))

        return cropped
|
|
|
|
|
|
class MultiGroupRandomCrop(object):
    """Produce ``groups`` independently located random crops of an image group.

    ``size`` is either a single number (square crop) or a pair that is
    unpacked as ``(height, width)``. The output concatenates, for each of the
    ``groups`` draws, the crops of all input images at that draw's offset.
    """

    def __init__(self, size, groups=1):
        self.size = (int(size), int(size)) if isinstance(size, numbers.Number) else size
        self.groups = groups

    def __call__(self, img_group):
        """Return ``groups * len(img_group)`` images, ordered draw by draw."""
        width, height = img_group[0].size
        crop_h, crop_w = self.size
        already_sized = width == crop_w and height == crop_h

        collected = []
        for _ in range(self.groups):
            # Fresh offset per draw, shared by every image within that draw.
            left = random.randint(0, width - crop_w)
            top = random.randint(0, height - crop_h)
            box = (left, top, left + crop_w, top + crop_h)

            for frame in img_group:
                assert (frame.size[0] == width and frame.size[1] == height)
                collected.append(frame if already_sized else frame.crop(box))

        return collected
|
|
|
|
|
|
class GroupCenterCrop(object):
    """Apply one ``torchvision.transforms.CenterCrop`` to each image of a group."""

    def __init__(self, size):
        # The torchvision transform is built once and reused for every image.
        self.worker = torchvision.transforms.CenterCrop(size)

    def __call__(self, img_group):
        """Return a new list holding the center crop of each input image."""
        return list(map(self.worker, img_group))
|
|
|
|
|
|
class GroupRandomHorizontalFlip(object):
    """Mirror a whole image group left-to-right with probability 0.5.

    If ``is_flow`` was set at construction, every even-indexed image of a
    flipped group is additionally passed through ``ImageOps.invert``.
    """

    def __init__(self, is_flow=False):
        self.is_flow = is_flow

    def __call__(self, img_group, is_flow=False):
        """Return the flipped group, or the input list itself when no flip is drawn."""
        # NOTE: the ``is_flow`` argument is accepted but never consulted;
        # only the constructor's ``self.is_flow`` takes effect.
        if random.random() >= 0.5:
            return img_group

        mirrored = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
        if self.is_flow:
            # invert flow pixel values when flipping (even positions only)
            for pos in range(0, len(mirrored), 2):
                mirrored[pos] = ImageOps.invert(mirrored[pos])
        return mirrored
|
|
|
|
|
|
class GroupNormalize(object):
    """Normalize a tensor in place, one leading-dimension slice at a time.

    ``mean`` and ``std`` are tiled with ``*`` so that the pattern repeats
    across the tensor's first dimension.
    Assumes ``mean``/``std`` are Python sequences, so ``*`` repeats rather
    than scales them. TODO confirm against callers.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """Subtract the mean and divide by the std in place; return the same tensor."""
        leading = tensor.size()[0]
        tiled_mean = self.mean * (leading // len(self.mean))
        tiled_std = self.std * (leading // len(self.std))

        # TODO: make efficient
        for plane, mu, sigma in zip(tensor, tiled_mean, tiled_std):
            plane.sub_(mu).div_(sigma)

        return tensor
|
|
|
|
|
|
class GroupScale(object):
    """Resize every image of a group with ``torchvision.transforms.Resize``.

    size: forwarded to ``Resize``; per the original notes it is the length of
        the smaller edge, e.g. if height > width the image is rescaled to
        (size * height / width, size)
    interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        # One Resize instance is built up front and shared by all images.
        self.worker = torchvision.transforms.Resize(size, interpolation)

    def __call__(self, img_group):
        """Return a new list holding the resized version of each image."""
        return list(map(self.worker, img_group))
|
|
|
|
|
|
class GroupOverSample(object):
    """Oversample a group with five fixed crops and, optionally, their mirrors.

    The crop offsets come from ``GroupMultiScaleCrop.fill_fix_offset`` with
    ``more_fix_crop=False`` (four corners plus center). For each offset the
    output holds the crops of all images, followed by their horizontally
    flipped versions when ``flip`` is true.

    Args:
        crop_size: An int (square crop) or a pair unpacked as
            ``(crop_w, crop_h)``.
        scale_size: If not None, the group is first resized with
            ``GroupScale(scale_size)``.
        flip: Whether to append the flipped crops.
    """

    def __init__(self, crop_size, scale_size=None, flip=True):
        self.crop_size = crop_size if not isinstance(
            crop_size, int) else (crop_size, crop_size)

        if scale_size is not None:
            self.scale_worker = GroupScale(scale_size)
        else:
            self.scale_worker = None
        self.flip = flip

    def __call__(self, img_group):
        """Return the flat list of crops (and flipped crops) for all offsets."""
        if self.scale_worker is not None:
            img_group = self.scale_worker(img_group)

        image_w, image_h = img_group[0].size
        crop_w, crop_h = self.crop_size

        offsets = GroupMultiScaleCrop.fill_fix_offset(
            False, image_w, image_h, crop_w, crop_h)
        oversample_group = []
        for o_w, o_h in offsets:
            normal_group = []
            flip_group = []
            for i, img in enumerate(img_group):
                crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
                normal_group.append(crop)

                # Skip the copy/transpose/invert work entirely when flipped
                # crops are not requested; they would only be thrown away.
                if not self.flip:
                    continue

                flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
                # Even-indexed single-channel ('L') images are also inverted
                # after flipping.
                if img.mode == 'L' and i % 2 == 0:
                    flip_group.append(ImageOps.invert(flip_crop))
                else:
                    flip_group.append(flip_crop)

            oversample_group.extend(normal_group)
            # flip_group stays empty when self.flip is false.
            oversample_group.extend(flip_group)
        return oversample_group
|
|
|
|
|
|
class GroupFullResSample(object):
    """Sample three vertically centered crops (left, right, center) per image.

    ``crop_size`` is an int (square crop) or a pair unpacked as
    ``(crop_w, crop_h)``. When ``scale_size`` is given the group is first
    resized with ``GroupScale``. With ``flip`` enabled, the mirrored crops of
    each offset follow that offset's plain crops in the output.
    """

    def __init__(self, crop_size, scale_size=None, flip=True):
        if isinstance(crop_size, int):
            crop_size = (crop_size, crop_size)
        self.crop_size = crop_size
        self.scale_worker = GroupScale(scale_size) if scale_size is not None else None
        self.flip = flip

    def __call__(self, img_group):
        """Return the flat list of crops (and flipped crops) for the three offsets."""
        if self.scale_worker is not None:
            img_group = self.scale_worker(img_group)

        full_w, full_h = img_group[0].size
        crop_w, crop_h = self.crop_size

        step_w = (full_w - crop_w) // 4
        step_h = (full_h - crop_h) // 4
        mid_top = 2 * step_h
        anchors = [
            (0 * step_w, mid_top),  # left
            (4 * step_w, mid_top),  # right
            (2 * step_w, mid_top),  # center
        ]

        samples = []
        for left, top in anchors:
            box = (left, top, left + crop_w, top + crop_h)
            plain = []
            mirrored = []
            for idx, img in enumerate(img_group):
                patch = img.crop(box)
                plain.append(patch)

                if self.flip:
                    flipped = patch.copy().transpose(Image.FLIP_LEFT_RIGHT)
                    # Even-indexed single-channel ('L') images are inverted too.
                    needs_invert = img.mode == 'L' and idx % 2 == 0
                    mirrored.append(ImageOps.invert(flipped) if needs_invert else flipped)

            samples += plain
            samples += mirrored
        return samples
|
|
|
|
|
|
class GroupMultiScaleCrop(object):
    """Crop a group at a randomly chosen scale and offset, then resize it.

    A crop width/height pair is picked from the scaled shorter image edge,
    one offset is chosen (from a fixed grid when ``fix_crop`` is true,
    uniformly otherwise), and every image is cropped with that same box and
    resized to ``input_size``.

    ``input_size`` is an int (square) or a sequence read as
    ``(width, height)``; ``scales`` defaults to ``[1, .875, .75, .66]``.
    """

    def __init__(self, input_size, scales=None, max_distort=1,
                 fix_crop=True, more_fix_crop=True):
        if scales is None:
            scales = [1, .875, .75, .66]
        self.scales = scales
        self.max_distort = max_distort
        self.fix_crop = fix_crop
        self.more_fix_crop = more_fix_crop
        if isinstance(input_size, int):
            input_size = [input_size, input_size]
        self.input_size = input_size
        self.interpolation = Image.BILINEAR

    def __call__(self, img_group):
        """Return the cropped-and-resized images, one per input image."""
        crop_w, crop_h, left, top = self._sample_crop_size(img_group[0].size)
        box = (left, top, left + crop_w, top + crop_h)
        target = (self.input_size[0], self.input_size[1])

        cropped = [img.crop(box) for img in img_group]
        return [patch.resize(target, self.interpolation) for patch in cropped]

    def _sample_crop_size(self, im_size):
        """Draw ``(crop_w, crop_h, w_offset, h_offset)`` for an image size."""
        image_w, image_h = im_size[0], im_size[1]

        # Candidate edge lengths are fractions of the shorter image edge.
        shorter = min(image_w, image_h)
        candidates = [int(shorter * scale) for scale in self.scales]

        def snap(edge, target):
            # Edges within 3 pixels of the target size use the target exactly.
            return target if abs(edge - target) < 3 else edge

        heights = [snap(edge, self.input_size[1]) for edge in candidates]
        widths = [snap(edge, self.input_size[0]) for edge in candidates]

        # Only combine scales whose indices differ by at most max_distort.
        pairs = [
            (w, h)
            for i, h in enumerate(heights)
            for j, w in enumerate(widths)
            if abs(i - j) <= self.max_distort
        ]

        crop_w, crop_h = random.choice(pairs)
        if self.fix_crop:
            left, top = self._sample_fix_offset(image_w, image_h, crop_w, crop_h)
        else:
            left = random.randint(0, image_w - crop_w)
            top = random.randint(0, image_h - crop_h)

        return crop_w, crop_h, left, top

    def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
        """Pick one offset at random from the fixed offset grid."""
        grid = self.fill_fix_offset(
            self.more_fix_crop, image_w, image_h, crop_w, crop_h)
        return random.choice(grid)

    @staticmethod
    def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
        """List the fixed ``(x, y)`` crop offsets: 5 basic ones, 13 with ``more_fix_crop``."""
        w_step = (image_w - crop_w) // 4
        h_step = (image_h - crop_h) // 4

        grid = [
            (0, 0),                    # upper left
            (4 * w_step, 0),           # upper right
            (0, 4 * h_step),           # lower left
            (4 * w_step, 4 * h_step),  # lower right
            (2 * w_step, 2 * h_step),  # center
        ]

        if more_fix_crop:
            grid.extend([
                (0, 2 * h_step),           # center left
                (4 * w_step, 2 * h_step),  # center right
                (2 * w_step, 4 * h_step),  # lower center
                (2 * w_step, 0 * h_step),  # upper center
                (1 * w_step, 1 * h_step),  # upper left quarter
                (3 * w_step, 1 * h_step),  # upper right quarter
                (1 * w_step, 3 * h_step),  # lower left quarter
                (3 * w_step, 3 * h_step),  # lower right quarter
            ])

        return grid
|
|
|
|
|
|
class GroupRandomSizedCrop(object):
    """Randomly crop a group to 8%-100% of its area, then resize to a square.

    The crop's aspect ratio is drawn from [3/4, 4/3], and width and height
    are swapped half of the time. Up to 10 draws are attempted; if none fits
    inside the image, the group is instead resized with ``GroupScale`` and
    cropped with ``GroupRandomCrop``.

    size: edge length of the square output
    interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img_group):
        """Return the list of cropped-and-resized images."""
        full_w = img_group[0].size[0]
        full_h = img_group[0].size[1]

        for _ in range(10):
            target_area = random.uniform(0.08, 1.0) * (full_w * full_h)
            aspect_ratio = random.uniform(3. / 4, 4. / 3)

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if random.random() < 0.5:
                w, h = h, w

            # This draw does not fit inside the image; try again.
            if w > full_w or h > full_h:
                continue

            x1 = random.randint(0, full_w - w)
            y1 = random.randint(0, full_h - h)
            box = (x1, y1, x1 + w, y1 + h)

            out_group = []
            for img in img_group:
                patch = img.crop(box)
                assert (patch.size == (w, h))
                out_group.append(
                    patch.resize((self.size, self.size), self.interpolation))
            return out_group

        # Fallback: no attempt produced a crop that fits.
        scale = GroupScale(self.size, interpolation=self.interpolation)
        crop = GroupRandomCrop(self.size)
        return crop(scale(img_group))
|
|
|
|
|
|
class ConvertDataFormat(object):
    """Regroup a stacked ``(T*3, H, W)`` tensor into ``(3, T, H, W)``.

    For ``model_type == '2D'`` the input is returned untouched.
    """

    def __init__(self, model_type):
        self.model_type = model_type

    def __call__(self, images):
        """Return ``images`` as is for '2D'; otherwise the regrouped view."""
        if self.model_type == '2D':
            return images

        stacked, height, width = images.size()
        frames = stacked // 3
        # Split the leading dimension into (frames, 3), then swap those two axes.
        return images.view(frames, 3, height, width).permute(1, 0, 2, 3)
|
|
|
|
|
|
class Stack(object):
    """Concatenate a group of images along the last (third) axis into one array.

    The mode of the first image selects the branch: 'L' images get a trailing
    axis added before concatenation; 'RGB' images are concatenated directly,
    or with their last axis reversed when ``roll`` is true. Any other mode
    yields ``None``.
    """

    def __init__(self, roll=False):
        self.roll = roll

    def __call__(self, img_group):
        """Return the concatenated array, or ``None`` for unhandled modes."""
        mode = img_group[0].mode

        if mode == 'L':
            planes = [np.expand_dims(img, 2) for img in img_group]
            return np.concatenate(planes, axis=2)

        if mode == 'RGB':
            if not self.roll:
                return np.concatenate(img_group, axis=2)
            # roll: reverse the order along the last axis of every image.
            reversed_last = [np.array(img)[:, :, ::-1] for img in img_group]
            return np.concatenate(reversed_last, axis=2)

        return None
|
|
|
|
|
|
class ToTorchFormatTensor(object):
    """Turn a PIL.Image or a numpy.ndarray (H x W x C) into a float tensor (C x H x W).

    With ``div`` true (the default) the values are divided by 255.
    """

    def __init__(self, div=True):
        self.div = div

    def __call__(self, pic):
        """Return the channel-first float tensor for ``pic``."""
        if isinstance(pic, np.ndarray):
            # numpy input is already laid out as H x W x C
            hwc = torch.from_numpy(pic)
        else:
            # PIL input: read the raw bytes, then shape them as H x W x C
            # (``pic.size`` is indexed as [1], [0] for the first two dims and
            # the channel count is taken from the length of the mode string).
            raw = torch.ByteTensor(
                torch.ByteStorage.from_buffer(pic.tobytes()))
            hwc = raw.view(pic.size[1], pic.size[0], len(pic.mode))

        # HWC -> CHW, materialized contiguously
        chw = hwc.permute(2, 0, 1).contiguous()
        as_float = chw.float()
        return as_float.div(255) if self.div else as_float
|
|
|
|
|
|
class IdentityTransform(object):
    """No-op transform: calling it hands the input straight back."""

    def __call__(self, data):
        """Return ``data`` itself, without copying or modifying it."""
        return data
|