804 lines
33 KiB
Python
804 lines
33 KiB
Python
|
# coding=utf-8
|
||
|
# Copyright 2022 The HuggingFace Inc. team.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
|
||
|
import warnings
|
||
|
from typing import Iterable, List, Optional, Tuple, Union
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from .image_utils import (
|
||
|
ChannelDimension,
|
||
|
ImageInput,
|
||
|
get_channel_dimension_axis,
|
||
|
get_image_size,
|
||
|
infer_channel_dimension_format,
|
||
|
)
|
||
|
from .utils import ExplicitEnum, TensorType, is_jax_tensor, is_tf_tensor, is_torch_tensor
|
||
|
from .utils.import_utils import (
|
||
|
is_flax_available,
|
||
|
is_tf_available,
|
||
|
is_torch_available,
|
||
|
is_vision_available,
|
||
|
requires_backends,
|
||
|
)
|
||
|
|
||
|
|
||
|
if is_vision_available():
|
||
|
import PIL
|
||
|
|
||
|
from .image_utils import PILImageResampling
|
||
|
|
||
|
if is_torch_available():
|
||
|
import torch
|
||
|
|
||
|
if is_tf_available():
|
||
|
import tensorflow as tf
|
||
|
|
||
|
if is_flax_available():
|
||
|
import jax.numpy as jnp
|
||
|
|
||
|
|
||
|
def to_channel_dimension_format(
    image: np.ndarray,
    channel_dim: Union[ChannelDimension, str],
    input_channel_dim: Optional[Union[ChannelDimension, str]] = None,
) -> np.ndarray:
    """
    Returns `image` laid out in the channel dimension format requested by `channel_dim`.

    Args:
        image (`numpy.ndarray`):
            The image whose channel axis should be moved.
        channel_dim (`ChannelDimension` or `str`):
            The channel dimension format the result should use.
        input_channel_dim (`ChannelDimension` or `str`, *optional*):
            The channel dimension format `image` currently uses. Inferred from `image` when not provided.

    Returns:
        `np.ndarray`: `image` itself when it already uses the requested format, otherwise a transposed array.

    Raises:
        ValueError: If `image` is not a numpy array, or if `channel_dim` is neither channels-first nor channels-last.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")

    source_format = infer_channel_dimension_format(image) if input_channel_dim is None else input_channel_dim
    target_format = ChannelDimension(channel_dim)

    # Nothing to move when the layouts already agree.
    if source_format == target_format:
        return image

    # The fixed permutations below only apply to arrays with exactly three axes.
    if target_format == ChannelDimension.FIRST:
        return image.transpose((2, 0, 1))
    if target_format == ChannelDimension.LAST:
        return image.transpose((1, 2, 0))

    raise ValueError("Unsupported channel dimension format: {}".format(channel_dim))
|
||
|
|
||
|
|
||
|
def rescale(
    image: np.ndarray,
    scale: float,
    data_format: Optional[ChannelDimension] = None,
    dtype: np.dtype = np.float32,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Multiplies every value of `image` by `scale`.

    Args:
        image (`np.ndarray`):
            The image to rescale.
        scale (`float`):
            The factor each value is multiplied by.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the returned image. When not provided the layout of `image` is kept.
        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
            The dtype the result is cast to. Kept for backwards compatibility with feature extractors.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of `image`. Only consulted when `data_format` is given; inferred from
            the image when not provided.

    Returns:
        `np.ndarray`: The rescaled image, cast to `dtype`.

    Raises:
        ValueError: If `image` is not a numpy array.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")

    scaled = image * scale

    # The layout is only touched when the caller explicitly asks for one.
    if data_format is not None:
        scaled = to_channel_dimension_format(scaled, data_format, input_data_format)

    return scaled.astype(dtype)
|
||
|
|
||
|
|
||
|
def _rescale_for_pil_conversion(image):
    """
    Decides whether `image` has to be multiplied by 255 before it can be stored as a uint8 PIL image.

    The decision follows three cases:
        - `uint8` data is used as is.
        - Data whose values are all (close to) whole numbers must lie in [0, 255] and is used as is.
        - Any other data must lie in [0, 1] and needs rescaling.

    Args:
        image (`np.ndarray`):
            The array about to be converted to a PIL image.

    Returns:
        `bool`: `True` when the values need rescaling, `False` otherwise.

    Raises:
        ValueError: If the values fall outside the range allowed for the detected case.
    """
    if image.dtype == np.uint8:
        return False

    def _all_within(upper_bound):
        # True when every value lies in the closed interval [0, upper_bound].
        return np.all(0 <= image) and np.all(image <= upper_bound)

    # Whole-number data is interpreted as already being on the 0-255 scale.
    if np.allclose(image, image.astype(int)):
        if not _all_within(255):
            raise ValueError(
                "The image to be converted to a PIL image contains values outside the range [0, 255], "
                f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
            )
        return False

    # Fractional data is only accepted on the 0-1 scale.
    if not _all_within(1):
        raise ValueError(
            "The image to be converted to a PIL image contains values outside the range [0, 1], "
            f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
        )
    return True
|
||
|
|
||
|
|
||
|
def to_pil_image(
    image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"],
    do_rescale: Optional[bool] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> "PIL.Image.Image":
    """
    Builds a `PIL.Image.Image` from `image`, moving the channel axis last and rescaling values when required.

    Args:
        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor` or `jnp.ndarray`):
            The image to convert. A PIL image is returned unchanged.
        do_rescale (`bool`, *optional*):
            Whether to multiply the values by 255 before the cast to `uint8`. When not provided, the decision is
            made from the dtype and value range of the image.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If unset, will use the inferred format from the input.

    Returns:
        `PIL.Image.Image`: The converted image.

    Raises:
        ValueError: If `image` is none of the supported types.
    """
    requires_backends(to_pil_image, ["vision"])

    if isinstance(image, PIL.Image.Image):
        return image

    # Bring every supported tensor type down to a numpy array first.
    if is_torch_tensor(image) or is_tf_tensor(image):
        array = image.numpy()
    elif is_jax_tensor(image):
        array = np.array(image)
    elif isinstance(image, np.ndarray):
        array = image
    else:
        raise ValueError("Input image type not supported: {}".format(type(image)))

    # PIL expects the channel axis last.
    array = to_channel_dimension_format(array, ChannelDimension.LAST, input_data_format)

    # A trailing channel axis of size one is dropped because PIL cannot handle it.
    if array.shape[-1] == 1:
        array = np.squeeze(array, axis=-1)

    # PIL.Image can only store uint8 values, so values on the 0-1 scale are brought to 0-255.
    if do_rescale is None:
        do_rescale = _rescale_for_pil_conversion(array)
    if do_rescale:
        array = rescale(array, 255)

    return PIL.Image.fromarray(array.astype(np.uint8))
|
||
|
|
||
|
|
||
|
# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366
|
||
|
def get_resize_output_image_size(
    input_image: np.ndarray,
    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
    default_to_square: bool = True,
    max_size: Optional[int] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> tuple:
    """
    Computes the (height, width) an image should have after resizing, given the image and the requested size.

    Args:
        input_image (`np.ndarray`):
            The image to resize.
        size (`int` or `Tuple[int, int]` or `List[int]` or `Tuple[int]`):
            The requested size. A two-element sequence `(h, w)` is returned as is. A one-element sequence is
            handled exactly like the bare int it contains.

            If `size` is an int and `default_to_square` is `True`, the result is `(size, size)`. If `size` is an
            int and `default_to_square` is `False`, the shorter edge of the image is matched to `size` and the
            longer edge is scaled by the same ratio, i.e. if height > width the result is
            `(size * height / width, size)`.
        default_to_square (`bool`, *optional*, defaults to `True`):
            How to interpret a single int `size`. `True` yields the square `(size, size)`; `False` replicates
            [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
            by resizing only the shorter edge, with an optional `max_size`.
        max_size (`int`, *optional*):
            Upper bound for the longer edge of the result. If the longer edge would exceed `max_size`, both edges
            are scaled down so that the longer edge equals `max_size`; the shorter edge may then end up shorter
            than `size`. Only used if `default_to_square` is `False`.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If unset, will use the inferred format from the input.

    Returns:
        `tuple`: The target (height, width) of the resized image.

    Raises:
        ValueError: If `size` is a sequence with a length other than 1 or 2, or if `max_size` is not strictly
            greater than `size`.
    """
    if isinstance(size, (tuple, list)):
        num_values = len(size)
        if num_values == 2:
            return tuple(size)
        if num_values != 1:
            raise ValueError("size must have 1 or 2 elements if it is a list or tuple")
        # A one-element sequence follows the same logic as a bare int.
        size = size[0]

    if default_to_square:
        return (size, size)

    height, width = get_image_size(input_image, input_data_format)
    width_is_short_edge = width <= height
    short_edge, long_edge = (width, height) if width_is_short_edge else (height, width)

    # Match the shorter edge to `size` and scale the longer edge by the same ratio.
    target_short = size
    target_long = int(size * long_edge / short_edge)

    if max_size is not None:
        if max_size <= size:
            raise ValueError(
                f"max_size = {max_size} must be strictly greater than the requested "
                f"size for the smaller edge size = {size}"
            )
        if target_long > max_size:
            # Shrink both edges so that the longer one lands exactly on `max_size`.
            target_short, target_long = int(max_size * target_short / target_long), max_size

    return (target_long, target_short) if width_is_short_edge else (target_short, target_long)
|
||
|
|
||
|
|
||
|
def resize(
    image: np.ndarray,
    size: Tuple[int, int],
    resample: "PILImageResampling" = None,
    reducing_gap: Optional[int] = None,
    data_format: Optional[ChannelDimension] = None,
    return_numpy: bool = True,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Resizes `image` to the `(height, width)` given by `size`, delegating the resampling to the PIL library.

    Args:
        image (`np.ndarray`):
            The image to resize.
        size (`Tuple[int, int]`):
            The target `(height, width)`.
        resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            The filter to use for resampling.
        reducing_gap (`int`, *optional*):
            Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the
            result to the fair resampling. See corresponding Pillow documentation for more details.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output image. If unset, the format of the input is used.
        return_numpy (`bool`, *optional*, defaults to `True`):
            Whether to return the resized image as a numpy array. If `False`, a `PIL.Image.Image` object is
            returned.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If unset, will use the inferred format from the input.

    Returns:
        `np.ndarray`: The resized image (a `PIL.Image.Image` when `return_numpy` is `False`).

    Raises:
        ValueError: If `size` does not have exactly two elements.
    """
    requires_backends(resize, ["vision"])

    if resample is None:
        resample = PILImageResampling.BILINEAR

    if len(size) != 2:
        raise ValueError("size must have 2 elements")

    # The output keeps the input layout unless told otherwise. PIL always hands back channels-last data, so the
    # input layout has to be determined before the conversion.
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    if data_format is None:
        data_format = input_data_format

    # Resizing goes through pillow to stay backwards compatible with the previous image feature extractors.
    scaled_up_for_pil = False
    if not isinstance(image, PIL.Image.Image):
        scaled_up_for_pil = _rescale_for_pil_conversion(image)
        image = to_pil_image(image, do_rescale=scaled_up_for_pil, input_data_format=input_data_format)

    target_height, target_width = size
    # PIL sizes are expressed as (width, height).
    resized = image.resize((target_width, target_height), resample=resample, reducing_gap=reducing_gap)

    if not return_numpy:
        return resized

    output = np.array(resized)
    # A channel axis of size one is dropped by the PIL conversion, so it is added back here.
    if output.ndim == 2:
        output = np.expand_dims(output, axis=-1)
    # Data coming back from PIL is always channels last.
    output = to_channel_dimension_format(output, data_format, input_channel_dim=ChannelDimension.LAST)
    # Undo the 0-255 scaling that was applied for the PIL conversion.
    if scaled_up_for_pil:
        output = rescale(output, 1 / 255)
    return output
|
||
|
|
||
|
|
||
|
def _per_channel_stat(values, num_channels, dtype, name):
    """
    Returns `values` as a 1-D array with one entry per channel, cast to `dtype`.

    A non-iterable value is repeated for every channel. An iterable must contain exactly `num_channels`
    elements; `name` is only used in the error message.
    """
    if isinstance(values, Iterable):
        # Generators and other unsized iterables do not support `len`, so they are materialized once.
        if not hasattr(values, "__len__"):
            values = list(values)
        if len(values) != num_channels:
            raise ValueError(f"{name} must have {num_channels} elements if it is an iterable, got {len(values)}")
    else:
        values = [values] * num_channels
    return np.array(values, dtype=dtype)


def normalize(
    image: np.ndarray,
    mean: Union[float, Iterable[float]],
    std: Union[float, Iterable[float]],
    data_format: Optional[ChannelDimension] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Normalizes `image` using the mean and standard deviation specified by `mean` and `std`.

    image = (image - mean) / std

    Args:
        image (`np.ndarray`):
            The image to normalize.
        mean (`float` or `Iterable[float]`):
            The mean to use for normalization. An iterable must provide one value per channel; unsized
            iterables such as generators are accepted and consumed once.
        std (`float` or `Iterable[float]`):
            The standard deviation to use for normalization. Same rules as `mean`.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output image. If unset, the layout of the input is kept.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If unset, will use the inferred format from the input.

    Returns:
        `np.ndarray`: The normalized image. Non-floating inputs are returned as `float32`; floating inputs keep
        their dtype.

    Raises:
        ValueError: If `image` is not a numpy array, or if an iterable `mean`/`std` does not have one element per
            channel.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError("image must be a numpy array")

    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
    num_channels = image.shape[channel_axis]

    # We cast to float32 to avoid errors that can occur when subtracting uint8 values.
    # We preserve the original dtype if it is a float type to prevent upcasting float16.
    if not np.issubdtype(image.dtype, np.floating):
        image = image.astype(np.float32)

    mean = _per_channel_stat(mean, num_channels, image.dtype, "mean")
    std = _per_channel_stat(std, num_channels, image.dtype, "std")

    if input_data_format == ChannelDimension.LAST:
        # The per-channel statistics broadcast directly against the trailing channel axis.
        image = (image - mean) / std
    else:
        # Transposing puts the channel axis last for broadcasting; the second transpose restores the layout.
        image = ((image.T - mean) / std).T

    image = to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
    return image
|
||
|
|
||
|
|
||
|
def center_crop(
    image: np.ndarray,
    size: Tuple[int, int],
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    return_numpy: Optional[bool] = None,
) -> np.ndarray:
    """
    Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to
    the size given, it will be padded with zeros (so the returned result will always be of size `size`).

    Args:
        image (`np.ndarray`):
            The image to crop.
        size (`Tuple[int, int]`):
            The target `(height, width)` for the cropped image.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the format of the input image.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.
        return_numpy (`bool`, *optional*):
            Deprecated; passing any value emits a `FutureWarning`. Used for backwards compatibility with the
            previous ImageFeatureExtractionMixin method.
                - Unset or `True`: will return a numpy array.
                - `False`: will return a `PIL.Image.Image` object, whether or not padding was needed.

    Returns:
        `np.ndarray`: The cropped image (a `PIL.Image.Image` when `return_numpy` is `False`).

    Raises:
        ValueError: If `image` is not a numpy array or `size` does not have exactly two elements.
    """
    requires_backends(center_crop, ["vision"])

    if return_numpy is not None:
        warnings.warn("return_numpy is deprecated and will be removed in v.4.33", FutureWarning)

    return_numpy = True if return_numpy is None else return_numpy

    if not isinstance(image, np.ndarray):
        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")

    if not isinstance(size, Iterable) or len(size) != 2:
        raise ValueError("size must have 2 elements representing the height and width of the output image")

    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    output_data_format = data_format if data_format is not None else input_data_format

    # We perform the crop in (C, H, W) format and then convert to the output format
    image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)

    orig_height, orig_width = get_image_size(image, ChannelDimension.FIRST)
    crop_height, crop_width = size
    crop_height, crop_width = int(crop_height), int(crop_width)

    # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
    top = (orig_height - crop_height) // 2
    bottom = top + crop_height
    # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
    left = (orig_width - crop_width) // 2
    right = left + crop_width

    if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width:
        # The crop window lies fully inside the image: a plain slice is enough.
        cropped = image[..., top:bottom, left:right]
    else:
        # The image is smaller than the crop along at least one axis: center it on a zero canvas that is large
        # enough, then crop the canvas.
        new_height = max(crop_height, orig_height)
        new_width = max(crop_width, orig_width)
        new_shape = image.shape[:-2] + (new_height, new_width)
        new_image = np.zeros_like(image, shape=new_shape)

        top_pad = (new_height - orig_height) // 2
        bottom_pad = top_pad + orig_height
        left_pad = (new_width - orig_width) // 2
        right_pad = left_pad + orig_width
        new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image

        # Shift the crop window by the offset at which the image was placed on the canvas.
        top += top_pad
        bottom += top_pad
        left += left_pad
        right += left_pad

        cropped = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)]

    cropped = to_channel_dimension_format(cropped, output_data_format, ChannelDimension.FIRST)

    # Honour `return_numpy=False` on both paths. The layout is known at this point, so it is passed along
    # rather than being re-inferred.
    if not return_numpy:
        cropped = to_pil_image(cropped, input_data_format=output_data_format)

    return cropped
|
||
|
|
||
|
|
||
|
def _center_to_corners_format_torch(bboxes_center: "torch.Tensor") -> "torch.Tensor":
    """
    Converts torch boxes from (center_x, center_y, width, height) to
    (top_left_x, top_left_y, bottom_right_x, bottom_right_y). The box coordinates are read from the last axis.
    """
    center_x, center_y, width, height = bboxes_center.unbind(-1)
    half_width = 0.5 * width
    half_height = 0.5 * height
    # top left x, top left y, bottom right x, bottom right y
    corners = [center_x - half_width, center_y - half_height, center_x + half_width, center_y + half_height]
    return torch.stack(corners, dim=-1)
|
||
|
|
||
|
|
||
|
def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray:
    """
    Converts numpy boxes from (center_x, center_y, width, height) to
    (top_left_x, top_left_y, bottom_right_x, bottom_right_y).

    The box coordinates are read from the last axis and every leading axis is preserved, so inputs of shape
    `(4,)`, `(num_boxes, 4)` and `(batch, num_boxes, 4)` all return an array of the same shape.
    """
    # Only the coordinate axis is moved to the front. A plain `.T` would reverse every axis and swap the leading
    # axes of batched input.
    center_x, center_y, width, height = np.moveaxis(bboxes_center, -1, 0)
    bboxes_corners = np.stack(
        # top left x, top left y, bottom right x, bottom right y
        [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
        axis=-1,
    )
    return bboxes_corners
|
||
|
|
||
|
|
||
|
def _center_to_corners_format_tf(bboxes_center: "tf.Tensor") -> "tf.Tensor":
    """
    Converts TensorFlow boxes from (center_x, center_y, width, height) to
    (top_left_x, top_left_y, bottom_right_x, bottom_right_y). The box coordinates are read from the last axis.
    """
    center_x, center_y, width, height = tf.unstack(bboxes_center, axis=-1)
    half_width = 0.5 * width
    half_height = 0.5 * height
    # top left x, top left y, bottom right x, bottom right y
    corners = [center_x - half_width, center_y - half_height, center_x + half_width, center_y + half_height]
    return tf.stack(corners, axis=-1)
|
||
|
|
||
|
|
||
|
# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
|
||
|
def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
    """
    Converts bounding boxes from center format to corners format.

    center format: contains the coordinate for the center of the box and its width, height dimensions
        (center_x, center_y, width, height)
    corners format: contains the coordinates for the top-left and bottom-right corners of the box
        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)

    Raises:
        ValueError: If `bboxes_center` is neither a torch tensor, a numpy array nor a TensorFlow tensor.
    """
    # This runs during the model forward pass, so the computation stays in the framework of the input instead of
    # going through numpy.
    if is_torch_tensor(bboxes_center):
        convert = _center_to_corners_format_torch
    elif isinstance(bboxes_center, np.ndarray):
        convert = _center_to_corners_format_numpy
    elif is_tf_tensor(bboxes_center):
        convert = _center_to_corners_format_tf
    else:
        raise ValueError(f"Unsupported input type {type(bboxes_center)}")

    return convert(bboxes_center)
|
||
|
|
||
|
|
||
|
def _corners_to_center_format_torch(bboxes_corners: "torch.Tensor") -> "torch.Tensor":
    """
    Converts torch boxes from (top_left_x, top_left_y, bottom_right_x, bottom_right_y) to
    (center_x, center_y, width, height). The box coordinates are read from the last axis.
    """
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1)
    center_x = (top_left_x + bottom_right_x) / 2
    center_y = (top_left_y + bottom_right_y) / 2
    width = bottom_right_x - top_left_x
    height = bottom_right_y - top_left_y
    return torch.stack([center_x, center_y, width, height], dim=-1)
|
||
|
|
||
|
|
||
|
def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray:
    """
    Converts numpy boxes from (top_left_x, top_left_y, bottom_right_x, bottom_right_y) to
    (center_x, center_y, width, height).

    The box coordinates are read from the last axis and every leading axis is preserved, so inputs of shape
    `(4,)`, `(num_boxes, 4)` and `(batch, num_boxes, 4)` all return an array of the same shape.
    """
    # Only the coordinate axis is moved to the front. A plain `.T` would reverse every axis and swap the leading
    # axes of batched input.
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = np.moveaxis(bboxes_corners, -1, 0)
    bboxes_center = np.stack(
        [
            (top_left_x + bottom_right_x) / 2,  # center x
            (top_left_y + bottom_right_y) / 2,  # center y
            (bottom_right_x - top_left_x),  # width
            (bottom_right_y - top_left_y),  # height
        ],
        axis=-1,
    )
    return bboxes_center
|
||
|
|
||
|
|
||
|
def _corners_to_center_format_tf(bboxes_corners: "tf.Tensor") -> "tf.Tensor":
    """
    Converts TensorFlow boxes from (top_left_x, top_left_y, bottom_right_x, bottom_right_y) to
    (center_x, center_y, width, height). The box coordinates are read from the last axis.
    """
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = tf.unstack(bboxes_corners, axis=-1)
    center_x = (top_left_x + bottom_right_x) / 2
    center_y = (top_left_y + bottom_right_y) / 2
    width = bottom_right_x - top_left_x
    height = bottom_right_y - top_left_y
    return tf.stack([center_x, center_y, width, height], axis=-1)
|
||
|
|
||
|
|
||
|
def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
    """
    Converts bounding boxes from corners format to center format.

    corners format: contains the coordinates for the top-left and bottom-right corners of the box
        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    center format: contains the coordinate for the center of the box and its width, height dimensions
        (center_x, center_y, width, height)

    Raises:
        ValueError: If `bboxes_corners` is neither a torch tensor, a numpy array nor a TensorFlow tensor.
    """
    # The inverse conversion accepts several input types, so this one dispatches on the input type as well.
    if is_torch_tensor(bboxes_corners):
        convert = _corners_to_center_format_torch
    elif isinstance(bboxes_corners, np.ndarray):
        convert = _corners_to_center_format_numpy
    elif is_tf_tensor(bboxes_corners):
        convert = _corners_to_center_format_tf
    else:
        raise ValueError(f"Unsupported input type {type(bboxes_corners)}")

    return convert(bboxes_corners)
|
||
|
|
||
|
|
||
|
# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
|
||
|
# Copyright (c) 2018, Alexander Kirillov
|
||
|
# All rights reserved.
|
||
|
def rgb_to_id(color):
    """
    Converts RGB color to unique ID.

    The ID is `R + 256 * G + 256 * 256 * B`.

    Args:
        color (`np.ndarray` or sequence):
            Either a 3-dimensional array whose last axis holds the (R, G, B) values, or a single color that can
            be indexed with 0, 1 and 2.

    Returns:
        An array of IDs with the last axis removed for a 3-dimensional array, otherwise a single `int`.
    """
    if isinstance(color, np.ndarray):
        # Widen uint8 data for every array shape, not just 3-D maps: `256 * uint8` either overflows or is
        # rejected outright, depending on the numpy version.
        if color.dtype == np.uint8:
            color = color.astype(np.int32)
        if len(color.shape) == 3:
            return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
    return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
|
||
|
|
||
|
|
||
|
def id_to_rgb(id_map):
    """
    Converts unique ID to RGB color.

    The ID is split into three base-256 digits, least significant first, which become the R, G and B values.

    Args:
        id_map (`np.ndarray` or number):
            An array of IDs or a single ID.

    Returns:
        A `uint8` array with a trailing axis of size 3 for an array input (the input array is left untouched),
        otherwise a list of three values.
    """
    if not isinstance(id_map, np.ndarray):
        channels = []
        while len(channels) < 3:
            channels.append(id_map % 256)
            id_map //= 256
        return channels

    # The digits are peeled off a copy so that the caller's array is not modified.
    remaining = id_map.copy()
    rgb = np.zeros(id_map.shape + (3,), dtype=np.uint8)
    for channel in range(3):
        rgb[..., channel] = remaining % 256
        remaining //= 256
    return rgb
|
||
|
|
||
|
|
||
|
class PaddingMode(ExplicitEnum):
    """
    Enumerates the padding strategies that can be selected when padding images.
    """

    # Fill the added border with a constant value.
    CONSTANT = "constant"
    # Mirror the image around its first and last values along each axis.
    REFLECT = "reflect"
    # Repeat the edge value of the image along each axis.
    REPLICATE = "replicate"
    # Mirror the image along its edge.
    SYMMETRIC = "symmetric"
|
||
|
|
||
|
|
||
|
def pad(
    image: np.ndarray,
    padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]],
    mode: PaddingMode = PaddingMode.CONSTANT,
    constant_values: Union[float, Iterable[float]] = 0.0,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Pads the `image` with the specified (height, width) `padding` and `mode`.

    The channel axis is never padded. A 4-dimensional `image` is treated as a batch and its leading axis is not
    padded either.

    Args:
        image (`np.ndarray`):
            The image to pad.
        padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
            Padding to apply to the edges of the height, width axes. Can be one of these formats:
            - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
            - `((before, after),)` or `(before, after)` yields same before and after pad for height and width.
            - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
        mode (`PaddingMode`):
            The padding mode to use. Can be one of:
                - `"constant"`: pads with a constant value.
                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                  vector along each axis.
                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
        constant_values (`float` or `Iterable[float]`, *optional*):
            The value to use for the padding if `mode` is `"constant"`. Accepts the same formats as `padding`.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use same as the input image.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.

    Returns:
        `np.ndarray`: The padded image.

    Raises:
        ValueError: If `padding` or `constant_values` uses an unsupported format, or if `mode` is not a valid
            padding mode.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    def _expand_for_data_format(values):
        """
        Convert values to be in the format expected by np.pad based on the data format.
        """
        if isinstance(values, (int, float)):
            values = ((values, values), (values, values))
        elif isinstance(values, tuple) and len(values) == 1:
            single = values[0]
            # `((before, after),)` reuses the pair for both axes; `(pad,)` applies the same amount on every side.
            values = (single, single) if isinstance(single, tuple) else ((single, single), (single, single))
        elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int):
            values = (values, values)
        elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple):
            pass
        else:
            raise ValueError(f"Unsupported format: {values}")

        # add 0 for channel dimension
        values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0))

        # Add an entry for the batch dimension if there is one. It has to be a (before, after) pair like the
        # other entries: a bare `0` would make the sequence ragged, which np.pad does not accept.
        values = ((0, 0), *values) if image.ndim == 4 else values
        return values

    padding = _expand_for_data_format(padding)

    # NOTE: the comparisons are kept as an if/elif chain on purpose; `==` is the form the original code relies on
    # to match `mode` against the enum members.
    if mode == PaddingMode.CONSTANT:
        constant_values = _expand_for_data_format(constant_values)
        image = np.pad(image, padding, mode="constant", constant_values=constant_values)
    elif mode == PaddingMode.REFLECT:
        image = np.pad(image, padding, mode="reflect")
    elif mode == PaddingMode.REPLICATE:
        image = np.pad(image, padding, mode="edge")
    elif mode == PaddingMode.SYMMETRIC:
        image = np.pad(image, padding, mode="symmetric")
    else:
        raise ValueError(f"Invalid padding mode: {mode}")

    image = to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
    return image
|
||
|
|
||
|
|
||
|
# TODO (Amy): Accept 1/3/4 channel numpy array as input and return np.array as default
|
||
|
def convert_to_rgb(image: ImageInput) -> ImageInput:
    """
    Returns `image` in RGB mode when it is a `PIL.Image.Image`. Any other input, as well as a PIL image that is
    already in RGB mode, is returned as is.

    Args:
        image (Image):
            The image to convert.
    """
    requires_backends(convert_to_rgb, ["vision"])

    needs_conversion = isinstance(image, PIL.Image.Image) and image.mode != "RGB"
    return image.convert("RGB") if needs_conversion else image
|
||
|
|
||
|
|
||
|
def flip_channel_order(
    image: np.ndarray,
    data_format: Optional[ChannelDimension] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Reverses the order of the channels of the image.

    An RGB image becomes BGR and vice versa.

    Args:
        image (`np.ndarray`):
            The image to flip.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use same as the input image.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.

    Returns:
        `np.ndarray`: The image with its channels in reverse order.

    Raises:
        ValueError: If the input format is neither channels-first nor channels-last.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    # Reverse along whichever axis holds the channels.
    if input_data_format == ChannelDimension.LAST:
        flipped = image[..., ::-1]
    elif input_data_format == ChannelDimension.FIRST:
        flipped = image[::-1, ...]
    else:
        raise ValueError(f"Unsupported channel dimension: {input_data_format}")

    if data_format is None:
        return flipped
    return to_channel_dimension_format(flipped, data_format, input_channel_dim=input_data_format)
|