data_augment.py

"""Data augmentation functionality. Passed as callable transformations to
Dataset classes.

The data augmentation procedures were interpreted from @weiliu89's SSD paper
http://arxiv.org/abs/1512.02325
"""

import torch
from torchvision import transforms
import cv2
import numpy as np
import random
import math
from utils import matrix_iou, visual

# DEBUG = True
DEBUG = False


def _crop(image, boxes, labels, ratios=None):
    height, width, _ = image.shape

    if len(boxes) == 0:
        return image, boxes, labels, ratios

    while True:
        mode = random.choice(
            (
                None,
                (0.1, None),
                (0.3, None),
                (0.5, None),
                (0.7, None),
                (0.9, None),
                (None, None),
            )
        )

        if mode is None:
            return image, boxes, labels, ratios

        min_iou, max_iou = mode
        if min_iou is None:
            min_iou = float("-inf")
        if max_iou is None:
            max_iou = float("inf")

        for _ in range(50):
            scale = random.uniform(0.3, 1.0)
            min_ratio = max(0.5, scale * scale)
            max_ratio = min(2, 1.0 / scale / scale)
            ratio = math.sqrt(random.uniform(min_ratio, max_ratio))
            w = int(scale * ratio * width)
            h = int((scale / ratio) * height)

            l = random.randrange(width - w)
            t = random.randrange(height - h)
            roi = np.array((l, t, l + w, t + h))

            iou = matrix_iou(boxes, roi[np.newaxis])

            if not (min_iou <= iou.min() and iou.max() <= max_iou):
                continue

            image_t = image[roi[1] : roi[3], roi[0] : roi[2]]

            centers = (boxes[:, :2] + boxes[:, 2:]) / 2
            mask = np.logical_and(roi[:2] < centers, centers < roi[2:]).all(axis=1)
            boxes_t = boxes[mask].copy()
            labels_t = labels[mask].copy()
            if ratios is not None:
                ratios_t = ratios[mask].copy()
            else:
                ratios_t = None

            if len(boxes_t) == 0:
                continue

            boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2])
            boxes_t[:, :2] -= roi[:2]
            boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:])
            boxes_t[:, 2:] -= roi[:2]

            return image_t, boxes_t, labels_t, ratios_t


def _distort(image):
    def _convert(image, alpha=1, beta=0):
        tmp = image.astype(float) * alpha + beta
        tmp[tmp < 0] = 0
        tmp[tmp > 255] = 255
        image[:] = tmp

    image = image.copy()

    if random.randrange(2):
        _convert(image, beta=random.uniform(-32, 32))

    if random.randrange(2):
        _convert(image, alpha=random.uniform(0.5, 1.5))

    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    if random.randrange(2):
        tmp = image[:, :, 0].astype(int) + random.randint(-18, 18)
        tmp %= 180
        image[:, :, 0] = tmp

    if random.randrange(2):
        _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5))

    image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)

    return image


def _expand(image, boxes, fill, p):
    if random.random() > p:
        return image, boxes

    height, width, depth = image.shape
    for _ in range(50):
        scale = random.uniform(1, 4)

        min_ratio = max(0.5, 1.0 / scale / scale)
        max_ratio = min(2, scale * scale)
        ratio = math.sqrt(random.uniform(min_ratio, max_ratio))
        ws = scale * ratio
        hs = scale / ratio
        if ws < 1 or hs < 1:
            continue
        w = int(ws * width)
        h = int(hs * height)

        left = random.randint(0, w - width)
        top = random.randint(0, h - height)

        boxes_t = boxes.copy()
        boxes_t[:, :2] += (left, top)
        boxes_t[:, 2:] += (left, top)
        expand_image = np.empty((h, w, depth), dtype=image.dtype)
        expand_image[:, :] = fill
        expand_image[top : top + height, left : left + width] = image
        image = expand_image

        return image, boxes_t


def _mirror(image, boxes):
    _, width, _ = image.shape
    if random.randrange(2):
        image = image[:, ::-1]
        boxes = boxes.copy()
        boxes[:, 0::2] = width - boxes[:, 2::-2]
    return image, boxes


def _random_affine(
    img,
    targets=None,
    degrees=(-10, 10),
    translate=(0.1, 0.1),
    scale=(0.9, 1.1),
    shear=(-2, 2),
    borderValue=(127.5, 127.5, 127.5),
):
    # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10))
    # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4

    border = 0  # width of added border (optional)
    # height = max(img.shape[0], img.shape[1]) + border * 2
    height, width, _ = img.shape

    # Rotation and Scale
    R = np.eye(3)
    a = random.random() * (degrees[1] - degrees[0]) + degrees[0]
    # a += random.choice([-180, -90, 0, 90])  # 90deg rotations added to small rotations
    s = random.random() * (scale[1] - scale[0]) + scale[0]
    R[:2] = cv2.getRotationMatrix2D(
        angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s
    )

    # Translation
    T = np.eye(3)
    T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[
        0
    ] + border  # x translation (pixels)
    T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[
        1
    ] + border  # y translation (pixels)

    # Shear
    S = np.eye(3)
    S[0, 1] = math.tan(
        (random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180
    )  # x shear (deg)
    S[1, 0] = math.tan(
        (random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180
    )  # y shear (deg)

    M = S @ T @ R  # Combined rotation matrix. ORDER IS IMPORTANT HERE!!
    imw = cv2.warpPerspective(
        img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, borderValue=borderValue
    )  # BGR order borderValue

    # Return warped points also
    if targets is not None:
        if len(targets) > 0:
            n = targets.shape[0]
            points = targets[:, 0:4].copy()
            area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1])

            # warp points
            xy = np.ones((n * 4, 3))
            xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
                n * 4, 2
            )  # x1y1, x2y2, x1y2, x2y1
            xy = (xy @ M.T)[:, :2].reshape(n, 8)

            # create new boxes
            x = xy[:, [0, 2, 4, 6]]
            y = xy[:, [1, 3, 5, 7]]
            xy = (
                np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
            )

            # apply angle-based reduction
            radians = a * math.pi / 180
            reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
            x = (xy[:, 2] + xy[:, 0]) / 2
            y = (xy[:, 3] + xy[:, 1]) / 2
            w = (xy[:, 2] - xy[:, 0]) * reduction
            h = (xy[:, 3] - xy[:, 1]) * reduction
            xy = (
                np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2))
                .reshape(4, n)
                .T
            )

            # reject warped points outside of image
            x1 = np.clip(xy[:, 0], 0, width)
            y1 = np.clip(xy[:, 1], 0, height)
            x2 = np.clip(xy[:, 2], 0, width)
            y2 = np.clip(xy[:, 3], 0, height)
            boxes = np.concatenate((x1, y1, x2, y2)).reshape(4, n).T

        return imw, boxes, M
    else:
        return imw


def preproc_for_test(image, input_size, mean, std):
    interp_methods = [
        cv2.INTER_LINEAR,
        cv2.INTER_CUBIC,
        cv2.INTER_AREA,
        cv2.INTER_NEAREST,
        cv2.INTER_LANCZOS4,
    ]
    interp_method = interp_methods[random.randrange(5)]
    image = cv2.resize(image, input_size, interpolation=interp_method)
    image = image.astype(np.float32)
    image = image[:, :, ::-1]
    image /= 255.0
    if mean is not None:
        image -= mean
    if std is not None:
        image /= std
    return image.transpose(2, 0, 1)


class TrainTransform(object):
    def __init__(self, p=0.5, rgb_means=None, std=None, max_labels=50):
        self.means = rgb_means
        self.std = std
        self.p = p
        self.max_labels = max_labels

    def __call__(self, image, targets, input_dim):
        boxes = targets[:, :4].copy()
        labels = targets[:, 4].copy()
        if targets.shape[1] > 5:
            mixup = True
            ratios = targets[:, -1].copy()
            ratios_o = targets[:, -1].copy()
        else:
            mixup = False
            ratios = None
            ratios_o = None
        lshape = 6 if mixup else 5
        if len(boxes) == 0:
            targets = np.zeros((self.max_labels, lshape), dtype=np.float32)
            image = preproc_for_test(image, input_dim, self.means, self.std)
            image = np.ascontiguousarray(image, dtype=np.float32)
            return torch.from_numpy(image), torch.from_numpy(targets)

        image_o = image.copy()
        targets_o = targets.copy()
        height_o, width_o, _ = image_o.shape
        boxes_o = targets_o[:, :4]
        labels_o = targets_o[:, 4]
        b_x_o = (boxes_o[:, 2] + boxes_o[:, 0]) * 0.5
        b_y_o = (boxes_o[:, 3] + boxes_o[:, 1]) * 0.5
        b_w_o = (boxes_o[:, 2] - boxes_o[:, 0]) * 1.0
        b_h_o = (boxes_o[:, 3] - boxes_o[:, 1]) * 1.0
        boxes_o[:, 0] = b_x_o
        boxes_o[:, 1] = b_y_o
        boxes_o[:, 2] = b_w_o
        boxes_o[:, 3] = b_h_o
        boxes_o[:, 0::2] /= width_o
        boxes_o[:, 1::2] /= height_o
        boxes_o[:, 0::2] *= input_dim[0]
        boxes_o[:, 1::2] *= input_dim[1]
        # labels_o = np.expand_dims(labels_o,1)
        # targets_o = np.hstack((boxes_o,labels_o))
        # targets_o = np.hstack((labels_o,boxes_o))

        image_t = _distort(image)
        if self.means is not None:
            fill = [m * 255 for m in self.means]
            fill = fill[::-1]
        else:
            fill = (127.5, 127.5, 127.5)
        image_t, boxes = _expand(image_t, boxes, fill, self.p)
        image_t, boxes, labels, ratios = _crop(image_t, boxes, labels, ratios)
        image_t, boxes = _mirror(image_t, boxes)

        if random.randrange(2):
            image_t, boxes, _ = _random_affine(image_t, boxes, borderValue=fill)

        height, width, _ = image_t.shape

        if DEBUG:
            image_t = np.ascontiguousarray(image_t, dtype=np.uint8)
            img = visual(image_t, boxes, labels)
            cv2.imshow("DEBUG", img)
            cv2.waitKey(0)

        image_t = preproc_for_test(image_t, input_dim, self.means, self.std)
        boxes = boxes.copy()
        b_x = (boxes[:, 2] + boxes[:, 0]) * 0.5
        b_y = (boxes[:, 3] + boxes[:, 1]) * 0.5
        b_w = (boxes[:, 2] - boxes[:, 0]) * 1.0
        b_h = (boxes[:, 3] - boxes[:, 1]) * 1.0
        boxes[:, 0] = b_x
        boxes[:, 1] = b_y
        boxes[:, 2] = b_w
        boxes[:, 3] = b_h
        boxes[:, 0::2] /= width
        boxes[:, 1::2] /= height
        boxes[:, 0::2] *= input_dim[0]
        boxes[:, 1::2] *= input_dim[1]
        mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 6
        # mask_b= (boxes[:,2]*boxes[:,3]) > 32**2
        # mask_b= (boxes[:,2]*boxes[:,3]) > 48**2
        boxes_t = boxes[mask_b]
        labels_t = labels[mask_b].copy()
        if mixup:
            ratios_t = ratios[mask_b].copy()

        """
        if len(boxes_t)==0:
            targets = np.zeros((self.max_labels,lshape),dtype=np.float32)
            image = preproc_for_test(image_o, input_dim, self.means, self.std)
            image = np.ascontiguousarray(image, dtype=np.float32)
            return torch.from_numpy(image), torch.from_numpy(targets)
        """
        # if len(boxes_t)==0 or random.random() > 0.97:
        if len(boxes_t) == 0:
            image_t = preproc_for_test(image_o, input_dim, self.means, self.std)
            boxes_t = boxes_o
            labels_t = labels_o
            ratios_t = ratios_o

        labels_t = np.expand_dims(labels_t, 1)
        if mixup:
            ratios_t = np.expand_dims(ratios_t, 1)
            targets_t = np.hstack((labels_t, boxes_t, ratios_t))
        else:
            targets_t = np.hstack((labels_t, boxes_t))
        padded_labels = np.zeros((self.max_labels, lshape))
        padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[
            : self.max_labels
        ]
        padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32)
        image_t = np.ascontiguousarray(image_t, dtype=np.float32)

        return torch.from_numpy(image_t), torch.from_numpy(padded_labels)


class ValTransform(object):
    """Defines the transformations that should be applied to test PIL image
        for input into the network

    dimension -> tensorize -> color adj

    Arguments:
        resize (int): input dimension to SSD
        rgb_means ((int,int,int)): average RGB of the dataset
            (104,117,123)
        swap ((int,int,int)): final order of channels
    Returns:
        transform (transform) : callable transform to be applied to test/val
        data
    """

    def __init__(self, rgb_means=None, std=None, swap=(2, 0, 1)):
        self.means = rgb_means
        self.swap = swap
        self.std = std

    # assume input is cv2 img for now
    def __call__(self, img, res, input_size):

        interp_methods = [
            cv2.INTER_LINEAR,
            cv2.INTER_CUBIC,
            cv2.INTER_AREA,
            cv2.INTER_NEAREST,
            cv2.INTER_LANCZOS4,
        ]
        interp_method = interp_methods[0]
        img = cv2.resize(np.array(img), input_size, interpolation=interp_method).astype(
            np.float32
        )
        img = img[:, :, ::-1]
        img /= 255.0
        if self.means is not None:
            img -= self.means
        if self.std is not None:
            img /= self.std
        img = img.transpose(self.swap)
        img = np.ascontiguousarray(img, dtype=np.float32)
        return torch.from_numpy(img), torch.zeros(1, 5)