diff --git a/dali/python/nvidia/dali/experimental/torchvision/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/__init__.py index de4d4c47977..550dfd57bc5 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/__init__.py +++ b/dali/python/nvidia/dali/experimental/torchvision/__init__.py @@ -20,6 +20,7 @@ from .v2.normalize import Normalize from .v2.pad import Pad from .v2.rand_apply import RandomApply +from .v2.randomcrop import RandomCrop from .v2.resize import Resize from .v2.totensor import ToPureTensor, PILToTensor, ToPILImage @@ -33,6 +34,7 @@ "Pad", "PILToTensor", "RandomApply", + "RandomCrop", "RandomGrayscale", "RandomHorizontalFlip", "RandomVerticalFlip", diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py index ec19014a2d7..6064709cd61 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py @@ -14,8 +14,10 @@ from .centercrop import center_crop from .color import to_grayscale, rgb_to_grayscale +from .crop import crop from .flips import horizontal_flip, vertical_flip from .gaussian_blur import gaussian_blur +from .image_metadata import get_dimensions, get_image_size from .normalize import normalize from .pad import pad from .resize import resize @@ -23,7 +25,10 @@ __all__ = [ "center_crop", + "crop", "gaussian_blur", + "get_dimensions", + "get_image_size", "horizontal_flip", "normalize", "pad", diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py new file mode 100644 index 00000000000..d084b6aee0c --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py @@ -0,0 +1,69 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nvidia.dali.experimental.dynamic as ndd +from nvidia.dali._typing import TensorLike +from nvidia.dali.experimental.dynamic._device import DeviceLike + +from ..operator import adjust_input +from ..randomcrop import RandomCrop + + +def _get_crop_axes(inpt: TensorLike | ndd.Batch) -> list[int]: + """ + Return the indices of the height and width axes of ``inpt``, in that order. + + The axes are derived from the trailing characters of ``inpt.layout``: ``HWC`` maps to + ``[-3, -2]``, while ``CHW`` and any other layout ending in ``HW`` map to ``[-2, -1]``. + + Raises + ------ + ValueError + If the layout does not end in ``HWC``, ``CHW`` or ``HW``. + """ + layout = inpt.layout[-3:] + if layout == "HWC": + return [-3, -2] + if layout == "CHW": + return [-2, -1] + if inpt.layout[-2:] == "HW": + return [-2, -1] + raise ValueError(f"Unsupported layout: {inpt.layout!r}. Expected one of HWC, CHW, HW.") + + +def _verify_crop_coordinate(value, name: str) -> None: + """ + Raise ``TypeError`` unless ``value`` is an ``int``; ``name`` identifies it in the message. + """ + if not isinstance(value, int): + raise TypeError(f"{name} must be int, got {type(value)}") + + +@adjust_input +def crop( + inpt: TensorLike | ndd.Batch, + top: int, + left: int, + height: int, + width: int, + device: DeviceLike = "cpu", +) -> ndd.Tensor | ndd.Batch: + """ + Crop ``inpt`` to a ``height`` x ``width`` window anchored at ``(top, left)``. + + The window is cut with ``ndd.slice`` along the height and width axes reported by + ``_get_crop_axes``. The slice is requested with ``out_of_bounds_policy="pad"`` and + ``fill_values=0``, so a window that reaches outside the input is expected to be padded + with zeros rather than rejected. ``(height, width)`` is validated with + ``RandomCrop.verify_args``. + + Parameters + ---------- + inpt : TensorLike or ndd.Batch + Input to crop. The layout seen by this function must end in ``HWC``, ``CHW`` or ``HW``. + top : int + Vertical coordinate of the first row of the window. + left : int + Horizontal coordinate of the first column of the window. + height : int + Height of the output window. + width : int + Width of the output window. + device : DeviceLike, optional, default = "cpu" + Device passed to ``ndd.slice``. + + Raises + ------ + TypeError + If ``top`` or ``left`` is not an ``int``. + ValueError + If the layout of ``inpt`` is not supported. + + Please refer to the ``RandomCrop`` operator for more details.
+ """ + _verify_crop_coordinate(top, "top") + _verify_crop_coordinate(left, "left") + # Reuse the RandomCrop argument rules to validate (height, width); the padding-related + # arguments are fixed here because this function takes no padding parameters. + RandomCrop.verify_args( + size=(height, width), + padding=None, + pad_if_needed=False, + padding_mode="constant", + fill=0, + ) + + return ndd.slice( + inpt, + (top, left), + (height, width), + axes=_get_crop_axes(inpt), + out_of_bounds_policy="pad", + fill_values=0, + device=device, + ) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py new file mode 100644 index 00000000000..4b62db09f20 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py @@ -0,0 +1,82 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from PIL import Image +import torch + + +def get_image_size(inpt: Image.Image | torch.Tensor) -> List[int]: + """ + Return the spatial size of an image as ``[width, height]``. + + Mirrors ``torchvision.transforms.v2.functional.get_image_size``. + + .. note:: + This function is provided for compatibility. The torchvision successor + ``get_size`` returns ``[height, width]`` instead. + + Parameters + ---------- + inpt : PIL Image or torch.Tensor + Input image. Tensors are expected in ``[…, H, W]`` layout (leading + channel / batch dimensions are ignored).
+ + Returns + ------- + List[int] + ``[width, height]`` + + Raises + ------ + TypeError + If ``inpt`` is a tensor with fewer than 2 dimensions, or is neither a PIL image + nor a ``torch.Tensor``. + """ + if isinstance(inpt, Image.Image): + return list(inpt.size) # PIL .size is (W, H) + elif isinstance(inpt, torch.Tensor): + if inpt.ndim < 2: + raise TypeError( + f"get_image_size requires a tensor with at least 2 dimensions, got {inpt.ndim}" + ) + return [inpt.shape[-1], inpt.shape[-2]] # [W, H] + raise TypeError(f"Unsupported input type: {type(inpt)}") + + +def get_dimensions(inpt: Image.Image | torch.Tensor) -> List[int]: + """ + Return the number of channels, height, and width of an image as + ``[channels, height, width]``. + + Mirrors ``torchvision.transforms.v2.functional.get_dimensions``. + + Parameters + ---------- + inpt : PIL Image or torch.Tensor + Input image. Tensors are expected in ``[H, W]`` or ``[…, C, H, W]`` layout + (leading batch dimensions are ignored). + + Returns + ------- + List[int] + ``[channels, height, width]``. For a PIL image the channel count is the number of + bands; a 2-D tensor is reported with ``channels == 1``. + + Raises + ------ + TypeError + If ``inpt`` is a tensor with fewer than 2 dimensions, or is neither a PIL image + nor a ``torch.Tensor``. + """ + if isinstance(inpt, Image.Image): + w, h = inpt.size + return [len(inpt.getbands()), h, w] + elif isinstance(inpt, torch.Tensor): + if inpt.ndim < 2: + raise TypeError( + f"get_dimensions requires a tensor with at least 2 dimensions, got {inpt.ndim}" + ) + # A 2-D tensor has no channel axis; it is reported as a single-channel image. + if inpt.ndim == 2: + return [1, inpt.shape[-2], inpt.shape[-1]] + return [inpt.shape[-3], inpt.shape[-2], inpt.shape[-1]] # [C, H, W] + raise TypeError(f"Unsupported input type: {type(inpt)}") diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py new file mode 100644 index 00000000000..01bf27b4037 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py @@ -0,0 +1,257 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numbers +from types import NoneType +from typing import Literal, Sequence, Union + +import nvidia.dali as dali +import nvidia.dali.fn as fn + +from .centercrop import CenterCrop +from .operator import ( + Operator, + _ArgumentValidateRule, + _ValidateIfNonNegative, + _ValidateSizeDescriptor, + get_HWC_from_layout_pipeline, +) +from .pad import PADDING_CLASS, _ValidatePaddingMode + + +class _ValidateCropSize(_ArgumentValidateRule): + """ + Verify RandomCrop size values. + """ + + @classmethod + def verify(cls, *, size, **_) -> None: + """ + Raise ``ValueError`` if ``size`` is a list or tuple holding a non-``int`` element. + + A scalar ``size`` is not inspected by this rule. + """ + if isinstance(size, (list, tuple)) and any(not isinstance(value, int) for value in size): + raise ValueError(f"Size values must be integers, got {size}") + + +class _ValidatePadding(_ArgumentValidateRule): + """ + Verify RandomCrop padding arguments.
+ """ + + @classmethod + def verify(cls, *, padding, pad_if_needed, padding_mode, **_) -> None: + """ + Validate ``padding``, ``pad_if_needed`` and ``padding_mode``. + + ``pad_if_needed`` must be a ``bool``. A non-``None`` ``padding`` must be an ``int`` or a + list/tuple of 1, 2 or 4 ``int`` values and is then passed to ``_ValidateIfNonNegative``. + ``padding_mode`` is checked with ``_ValidatePaddingMode`` only when padding can take + place, i.e. when ``pad_if_needed`` is set or ``padding`` is given. + + Raises + ------ + TypeError + If ``pad_if_needed`` is not a ``bool`` or ``padding`` has an unsupported type. + ValueError + If a padding sequence has an unsupported length or holds non-``int`` values. + """ + if not isinstance(pad_if_needed, bool): + raise TypeError(f"pad_if_needed must be bool, got {type(pad_if_needed)}") + + if padding is not None: + if not isinstance(padding, (int, list, tuple)): + raise TypeError( + f"Padding must be an int or a sequence of length 1, 2 or 4, " + f"got {type(padding)}" + ) + if isinstance(padding, (list, tuple)) and len(padding) not in (1, 2, 4): + raise ValueError(f"Padding sequence must have length 1, 2 or 4, got {len(padding)}") + if isinstance(padding, (list, tuple)) and any( + not isinstance(value, int) for value in padding + ): + raise ValueError(f"Padding values must be integers, got {padding}") + _ValidateIfNonNegative.verify(values=padding, name="padding") + + if pad_if_needed or padding is not None: + _ValidatePaddingMode.verify(padding_mode=padding_mode) + + +class _ValidateFill(_ArgumentValidateRule): + """ + Verify RandomCrop fill argument. + """ + + @classmethod + def _verify_fill_value(cls, fill) -> None: + """ + Accept ``None``, a number, or a list/tuple of numbers; raise ``TypeError`` otherwise. + """ + if fill is None or isinstance(fill, numbers.Number): + return + if isinstance(fill, (list, tuple)) and all( + isinstance(value, numbers.Number) for value in fill + ): + return + raise TypeError(f"fill must be a number, sequence of numbers, None or a dict, got {fill!r}") + + @classmethod + def verify(cls, *, fill, **_) -> None: + """ + Validate ``fill``. + + For a ``dict`` every key must be a type or a string and every value is checked with + ``_verify_fill_value``; any other ``fill`` is checked with ``_verify_fill_value`` directly. + """ + if isinstance(fill, dict): + for key, value in fill.items(): + if not isinstance(key, (type, str)): + raise TypeError(f"fill dictionary keys must be types or strings, got {key!r}") + cls._verify_fill_value(value) + else: + cls._verify_fill_value(fill) + + +class RandomCrop(Operator): + """ + Crop the input at a random location. + + If the input is a ``torch.Tensor`` it can have an arbitrary number of leading batch dimensions. + For example, the image tensor can have [..., C, H, W] shape. + + Parameters + ---------- + size : sequence or int + Desired output size of the crop.
If size is an int instead of sequence like (h, w), + a square crop (size, size) is made. If provided a sequence of length 1, it will be + interpreted as (size[0], size[0]). + padding : int or sequence, optional, default = None + Optional padding on each border of the image, applied before cropping. If a single int + or a sequence of length 1 is provided this is used to pad all borders. If sequence of + length 2 is provided this is the padding on left/right and top/bottom respectively. If + a sequence of length 4 is provided this is the padding for the left, top, right and + bottom borders respectively. + pad_if_needed : bool, optional, default = False + Pad the image if it is smaller than the desired size. + fill : number or tuple or dict, optional, default = 0 + Pixel fill value used when the padding_mode is constant. ``None`` is treated as 0. + NOTE(review): dict values pass validation, but ``_kernel`` forwards the stored fill + unchanged to ``fn.slice``; the accompanying tests mark dictionary fill as not yet + supported. + padding_mode : str, optional, default = "constant" + Type of padding. Should be: constant, edge, reflect or symmetric. + device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the crop. Can be ``"cpu"`` or ``"gpu"``.
+ """ + + arg_rules = [_ValidateSizeDescriptor, _ValidateCropSize, _ValidatePadding, _ValidateFill] + preprocess_data = get_HWC_from_layout_pipeline + + @classmethod + def adjust_size(cls, size: int | Sequence[int]) -> Sequence[int]: + """ + Normalize ``size`` by delegating to ``CenterCrop.adjust_size``. + + The result is unpacked as ``(crop_h, crop_w)`` by ``_kernel``. + """ + return CenterCrop.adjust_size(size) + + @classmethod + def adjust_padding(cls, padding: None | int | Sequence[int]) -> tuple[int, int, int, int]: + """ + Expand ``padding`` to a ``(left, top, right, bottom)`` tuple. + + ``None`` gives all zeros; an ``int`` or a length-1 list/tuple is used for all four borders; + a length-2 list/tuple ``(a, b)`` becomes ``(a, b, a, b)``; a length-4 list/tuple is + returned as a tuple. + + Raises + ------ + TypeError + For any other type, and also for a list/tuple of any other length. + """ + if padding is None: + return 0, 0, 0, 0 + if isinstance(padding, int): + return padding, padding, padding, padding + if isinstance(padding, (list, tuple)): + if len(padding) == 1: + return padding[0], padding[0], padding[0], padding[0] + if len(padding) == 2: + return padding[0], padding[1], padding[0], padding[1] + if len(padding) == 4: + return tuple(padding) + + raise TypeError( + f"Padding must be an int or a sequence of length 1, 2 or 4, got {type(padding)}" + ) + + @staticmethod + def adjust_fill(fill): + """ + Normalize ``fill``: ``None`` becomes 0, a number is returned unchanged, any other value + is converted with ``tuple``; a ``dict`` is normalized value by value. + """ + if isinstance(fill, dict): + return {key: RandomCrop.adjust_fill(value) for key, value in fill.items()} + if fill is None: + return 0 + if isinstance(fill, numbers.Number): + return fill + return tuple(fill) + + @staticmethod + def _randint(max_value): + """ + Build a DALI expression for a random INT32 offset between 0 and ``max_value``. + + A float is drawn with ``fn.random.uniform`` over the range ``(0, max_value + 1)`` and + floored. NOTE(review): this yields values up to ``max_value`` inclusive provided the + upper bound of the uniform range is exclusive - confirm against the DALI documentation. + """ + range_start = fn.cast(0, dtype=dali.types.FLOAT) + range_end = fn.cast(max_value + 1, dtype=dali.types.FLOAT) + value = dali.math.floor(fn.random.uniform(range=fn.stack(range_start, range_end))) + return fn.cast(value, dtype=dali.types.INT32) + + def __init__( + self, + size: int | Sequence[int], + padding: None | int | Sequence[int] = None, + pad_if_needed: bool = False, + fill: Union[ + int, + float, + Sequence[int], + Sequence[float], + None, + dict[ + type | str, + int + | float + | collections.abc.Sequence[int] + | collections.abc.Sequence[float] + | NoneType, + ], + ] = 0, + padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant", + device: Literal["cpu", "gpu"] = "cpu", + ): + # The raw arguments are forwarded to Operator.__init__; presumably that is where + # ``arg_rules`` are applied - confirm in ``.operator``. + super().__init__( + device=device, + size=size, + padding=padding, + pad_if_needed=pad_if_needed, + padding_mode=padding_mode,
+ fill=fill, + ) + + # Keep the arguments in normalized form for ``_kernel``. + self.size = RandomCrop.adjust_size(size) + self.padding = RandomCrop.adjust_padding(padding) + self.pad_if_needed = pad_if_needed + self.fill = RandomCrop.adjust_fill(fill) + self.padding_mode = padding_mode + # True when pad_if_needed is set or at least one border has non-zero padding. + self.needs_padding = pad_if_needed or any(self.padding) + + def _kernel(self, data_input): + """ + Applies the random crop to the input data. + + ``data_input`` is unpacked as ``(in_h, in_w, <unused>, tensor)``; presumably it is the + output of ``preprocess_data`` - confirm in ``.operator``. When padding is needed the + tensor is first enlarged with an out-of-bounds slice, then a ``crop_h`` x ``crop_w`` + window is cut at a random offset. + """ + in_h, in_w, _, tensor = data_input + crop_h, crop_w = self.size + pad_left, pad_top, pad_right, pad_bottom = self.padding + + if self.needs_padding: + padded_h = in_h + pad_top + pad_bottom + padded_w = in_w + pad_left + pad_right + + if self.pad_if_needed: + # The shortfall along an axis is added to BOTH borders of that axis, so a padded + # axis ends up larger than the crop and the offset along it is still random. + pad_h = dali.math.max(crop_h - padded_h, 0) + pad_w = dali.math.max(crop_w - padded_w, 0) + pad_top = pad_top + pad_h + pad_bottom = pad_bottom + pad_h + pad_left = pad_left + pad_w + pad_right = pad_right + pad_w + + # Padding is expressed as a slice that starts at the negated left/top padding and + # spans the input plus the total padding; anchor and shape are given in (W, H) order + # to match ``axis_names="WH"``. The border type selected by ``padding_mode`` + # presumably decides how the out-of-bounds area is filled. + tensor = fn.slice( + tensor, + fn.stack( + fn.cast(-pad_left, dtype=dali.types.INT64), + fn.cast(-pad_top, dtype=dali.types.INT64), + ), + fn.stack(in_w + pad_left + pad_right, in_h + pad_top + pad_bottom), + out_of_bounds_policy=PADDING_CLASS[self.padding_mode].border_type, + fill_values=self.fill, + device=self.device, + axis_names="WH", + ) + + in_h = in_h + pad_top + pad_bottom + in_w = in_w + pad_left + pad_right + + # Largest valid offsets of the crop window inside the (possibly padded) input. + # NOTE(review): if the input is smaller than the crop and no padding makes up for it, + # these values are negative - confirm how that case is reported to the user. + max_top = fn.cast(in_h, dtype=dali.types.INT32) - crop_h + max_left = fn.cast(in_w, dtype=dali.types.INT32) - crop_w + + top = RandomCrop._randint(max_top) + left = RandomCrop._randint(max_left) + + return fn.slice( + tensor, + fn.stack(left, top), + fn.stack(crop_w, crop_h), + device=self.device, + axis_names="WH", + ) diff --git a/dali/test/python/torchvision/test_tv_crop.py b/dali/test/python/torchvision/test_tv_crop.py new file mode 100644 index 00000000000..3648c6c07c7 --- /dev/null +++ b/dali/test/python/torchvision/test_tv_crop.py @@ -0,0 +1,148 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import unittest + +from nose2.tools import cartesian_params, params +from nose_utils import assert_raises +import numpy as np +from PIL import Image +import torch +import torchvision.transforms.v2.functional as tv_fn + +from nvidia.dali.experimental.torchvision.v2.functional import crop + + +def make_test_tensor(shape=(3, 8, 10), dtype=torch.uint8): + """Return a tensor of the given shape filled with consecutive values starting at 0.""" + return torch.arange(math.prod(shape), dtype=dtype).reshape(shape) + + +def _make_pil_image(mode, h=8, w=10, seed=42): + """Return a seeded random ``h`` x ``w`` PIL image in mode "L", "RGB" or "RGBA".""" + rng = np.random.default_rng(seed) + if mode == "L": + data = rng.integers(0, 256, (h, w), dtype=np.uint8) + elif mode == "RGB": + data = rng.integers(0, 256, (h, w, 3), dtype=np.uint8) + elif mode == "RGBA": + data = rng.integers(0, 256, (h, w, 4), dtype=np.uint8) + else: + raise ValueError(f"Unsupported mode: {mode}") + return Image.fromarray(data, mode=mode) + + +def _assert_crop_matches_torchvision(inpt, top, left, height, width, device="cpu"): + """ + Assert that DALI ``crop`` and torchvision ``crop`` agree on shape and content. + + For PIL inputs the DALI result must also be a PIL image with the same mode; both results + are converted to tensors before comparison. + """ + dali_out = crop(inpt, top, left, height, width, device=device) + tv_out = tv_fn.crop(inpt, top, left, height, width) + + if device == "gpu" and not isinstance(dali_out, Image.Image): + dali_out = dali_out.cpu() + if isinstance(tv_out, torch.Tensor): + tv_out = tv_out.cpu() + + if isinstance(inpt, Image.Image): + assert isinstance(dali_out, Image.Image), f"Expected PIL Image, got {type(dali_out)}" + assert dali_out.mode == tv_out.mode, f"Expected mode {tv_out.mode}, got {dali_out.mode}" + dali_out
= tv_fn.pil_to_tensor(dali_out) + tv_out = tv_fn.pil_to_tensor(tv_out) + + assert dali_out.shape == tv_out.shape, f"Shape mismatch: {dali_out.shape} != {tv_out.shape}" + assert torch.equal(dali_out, tv_out), "DALI crop output differs from torchvision" + + +def _skip_if_gpu_unavailable(device): + """Skip the current test when the "gpu" device is requested but CUDA is unavailable.""" + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + + +def _move_tensor_to_device(tensor, device): + """Return ``tensor`` moved to CUDA for the "gpu" device, otherwise unchanged.""" + if device == "gpu": + return tensor.cuda() + return tensor + + +# Cases are (top, left, height, width) on the default 8x10 test tensor. They include windows +# that start at negative offsets or extend past the image, which exercise the padded area. +@cartesian_params( + ("cpu", "gpu"), + ( + (1, 2, 4, 5), + (0, 0, 8, 10), + (3, 4, 2, 3), + (-1, -2, 6, 8), + (6, 8, 5, 6), + (0, 0, 12, 14), + ), +) +def test_crop_tensor(device, crop_case): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_test_tensor(), device) + _assert_crop_matches_torchvision(tensor, *crop_case, device=device) + + +# PIL inputs: one window inside the image and one that overhangs it on every side. +@cartesian_params( + ("cpu", "gpu"), + ("L", "RGB", "RGBA"), + ( + (1, 2, 4, 5), + (-2, -3, 12, 14), + ), +) +def test_crop_pil(device, mode, crop_case): + _skip_if_gpu_unavailable(device) + _assert_crop_matches_torchvision(_make_pil_image(mode), *crop_case, device=device) + + +# A 4-D input checks that the leading batch dimension is preserved by the crop. +@cartesian_params(("cpu", "gpu"), ((2, 3, 4, 5),)) +def test_crop_batched_tensor(device, crop_case): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_test_tensor(shape=(4, 3, 8, 10)), device) + _assert_crop_matches_torchvision(tensor, *crop_case, device=device) + + +@params(torch.float32, torch.int16, torch.int32) +def test_crop_preserves_tensor_dtype(dtype): + tensor = make_test_tensor(dtype=dtype) + dali_out = crop(tensor, top=1, left=1, height=4, width=5) + tv_out = tv_fn.crop(tensor, top=1, left=1, height=4, width=5) + + assert dali_out.dtype == tv_out.dtype, f"Expected dtype {tv_out.dtype}, got {dali_out.dtype}" + assert torch.equal(dali_out, tv_out), "DALI crop output differs from torchvision" + + +def test_crop_invalid_input_type(): + with assert_raises(TypeError): + _ = crop([1, 2, 3], top=0,
left=0, height=1, width=1) + + +# (height, width) pairs in which one value is zero, negative or not an int. +@params( + (0, 1), + (1, 0), + (-1, 1), + (1, -1), + (1.0, 1), + (1, 1.0), +) +def test_crop_invalid_output_size(height, width): + with assert_raises((TypeError, ValueError)): + _ = crop(make_test_tensor(), top=0, left=0, height=height, width=width) + + +# (top, left) pairs in which one coordinate is a float or a string. +@params( + (0.5, 0), + ("0", 0), + (0, 0.5), + (0, "0"), +) +def test_crop_invalid_coordinates(top, left): + with assert_raises(TypeError): + _ = crop(make_test_tensor(), top=top, left=left, height=1, width=1) diff --git a/dali/test/python/torchvision/test_tv_image_metadata.py b/dali/test/python/torchvision/test_tv_image_metadata.py new file mode 100644 index 00000000000..3f24ebcd359 --- /dev/null +++ b/dali/test/python/torchvision/test_tv_image_metadata.py @@ -0,0 +1,186 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import warnings +import unittest + +from nose2.tools import cartesian_params, params +from nose_utils import assert_raises +from PIL import Image +import torch +from torchvision import tv_tensors +import torchvision.transforms.v2.functional as fn_tv + +from nvidia.dali.experimental.torchvision.v2.functional import get_image_size, get_dimensions + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _tv_get_image_size(inpt): + """Call torchvision get_image_size while suppressing its deprecation warning.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + return fn_tv.get_image_size(inpt) + + +def _skip_if_gpu_unavailable(device): + """Skip the current test when the "gpu" device is requested but CUDA is unavailable.""" + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + + +def _move_tensor_to_device(tensor, device): + """Return ``tensor`` moved to CUDA for the "gpu" device, otherwise unchanged.""" + if device == "gpu": + return tensor.cuda() + return tensor + + +def _make_compatibility_input(input_kind, shape): + """ + Build a zero tensor of ``shape``, returned as is for "tensor" or wrapped in + ``tv_tensors.Image`` for "tv_image"; any other kind raises ``ValueError``. + """ + tensor = torch.zeros(*shape) + if input_kind == "tensor": + return tensor + if input_kind == "tv_image": + return tv_tensors.Image(tensor) + raise ValueError(f"Unsupported input kind: {input_kind}") + + +# PIL images with known exact dimensions (W x H) +PIL_CASES = [ + Image.new("RGB", (320, 240)), # 3 channels + Image.new("L", (100, 50)), # 1 channel, non-square + Image.new("RGBA", (64, 32)), # 4 channels + Image.new("RGB", (1, 1)), # minimal + Image.new("L", (512, 1)), # extreme aspect ratio +] + +# Tensors in CHW / NCHW layout — deliberately use H≠W to catch W/H swap bugs +TENSOR_CASES = [ + torch.zeros(3, 240, 320), # CHW + torch.zeros(1, 3, 240, 320), # NCHW, N=1 + torch.zeros(8, 3, 240, 320), # NCHW, N=8 + torch.zeros(1, 50, 100), # CHW, 1 channel + torch.zeros(4, 32, 64), # CHW, 4 channels + torch.zeros(10, 11, 12, 8, 3, 240, 320), # ...NCHW, N=8 +] + +# (input_kind, shape) pairs consumed by _make_compatibility_input +TORCHVISION_COMPATIBILITY_CASES = [ + ("tensor", (240,
320)), # HW, implicit single channel + ("tensor", (3, 240, 320)), # CHW + ("tensor", (8, 3, 240, 320)), # NCHW + ("tv_image", (240, 320)), # torchvision Image converts HW to 1HW + ("tv_image", (3, 240, 320)), # torchvision Image, CHW +] + + +# --------------------------------------------------------------------------- +# get_image_size — PIL +# --------------------------------------------------------------------------- + + +@params(*PIL_CASES) +def test_get_image_size_pil(img): + expected = _tv_get_image_size(img) + assert ( + get_image_size(img) == expected + ), f"mode={img.mode} size={img.size}: got {get_image_size(img)}, expected {expected}" + + +# --------------------------------------------------------------------------- +# get_image_size — tensors +# --------------------------------------------------------------------------- + + +@cartesian_params(("cpu", "gpu"), TENSOR_CASES) +def test_get_image_size_tensor(device, t): + _skip_if_gpu_unavailable(device) + t = _move_tensor_to_device(t, device) + expected = _tv_get_image_size(t) + assert ( + get_image_size(t) == expected + ), f"device={device} shape={t.shape}: got {get_image_size(t)}, expected {expected}" + + +# --------------------------------------------------------------------------- +# get_dimensions — PIL +# --------------------------------------------------------------------------- + + +@params(*PIL_CASES) +def test_get_dimensions_pil(img): + expected = fn_tv.get_dimensions(img) + assert ( + get_dimensions(img) == expected + ), f"mode={img.mode} size={img.size}: got {get_dimensions(img)}, expected {expected}" + + +# --------------------------------------------------------------------------- +# get_dimensions — tensors +# --------------------------------------------------------------------------- + + +@cartesian_params(("cpu", "gpu"), TENSOR_CASES) +def test_get_dimensions_tensor(device, t): + _skip_if_gpu_unavailable(device) + t = _move_tensor_to_device(t, device) + expected = fn_tv.get_dimensions(t) +
assert ( + get_dimensions(t) == expected + ), f"device={device} shape={t.shape}: got {get_dimensions(t)}, expected {expected}" + + +# --------------------------------------------------------------------------- +# Torchvision compatibility +# --------------------------------------------------------------------------- + + +# NOTE(review): this repeats the PIL checks of test_get_image_size_pil and +# test_get_dimensions_pil within a single test. +@params(*PIL_CASES) +def test_image_metadata_pil_matches_torchvision(img): + assert get_image_size(img) == _tv_get_image_size(img) + assert get_dimensions(img) == fn_tv.get_dimensions(img) + + +# Covers plain 2-D (HW) tensors and tv_tensors.Image inputs, which TENSOR_CASES does not. +@cartesian_params(("cpu", "gpu"), TORCHVISION_COMPATIBILITY_CASES) +def test_image_metadata_tensor_inputs_match_torchvision(device, input_case): + _skip_if_gpu_unavailable(device) + input_kind, shape = input_case + inpt = _move_tensor_to_device(_make_compatibility_input(input_kind, shape), device) + + assert get_image_size(inpt) == _tv_get_image_size(inpt) + assert get_dimensions(inpt) == fn_tv.get_dimensions(inpt) + + +# --------------------------------------------------------------------------- +# Error cases +# --------------------------------------------------------------------------- + + +def test_get_image_size_1d_tensor_raises(): + with assert_raises(TypeError): + get_image_size(torch.zeros(10)) + + +def test_get_dimensions_1d_tensor_raises(): + with assert_raises(TypeError): + get_dimensions(torch.zeros(10)) + + +def test_get_image_size_unsupported_type_raises(): + with assert_raises(TypeError): + get_image_size("not_an_image") + + +def test_get_dimensions_unsupported_type_raises(): + with assert_raises(TypeError): + get_dimensions("not_an_image") diff --git a/dali/test/python/torchvision/test_tv_randomcrop.py b/dali/test/python/torchvision/test_tv_randomcrop.py new file mode 100644 index 00000000000..618125215e1 --- /dev/null +++ b/dali/test/python/torchvision/test_tv_randomcrop.py @@ -0,0 +1,373 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import unittest + +from nose2.tools import cartesian_params, params +from nose_utils import assert_raises +import numpy as np +from PIL import Image +import torch +import torchvision.transforms.v2 as transforms +import torchvision.transforms.v2.functional as tv_fn + +from nvidia.dali.experimental.torchvision import Compose, RandomCrop +from nvidia.dali.experimental.torchvision.v2.operator import Operator + + +def make_tensor(shape=(3, 8, 10), dtype=torch.uint8): + """Return a tensor of the given shape filled with consecutive values starting at 0.""" + return torch.arange(math.prod(shape), dtype=dtype).reshape(shape) + + +def make_pil_image(mode="RGB", h=8, w=10, seed=42): + """Return a seeded random ``h`` x ``w`` PIL image in mode "L", "RGB" or "RGBA".""" + rng = np.random.default_rng(seed) + if mode == "L": + data = rng.integers(0, 256, (h, w), dtype=np.uint8) + elif mode == "RGB": + data = rng.integers(0, 256, (h, w, 3), dtype=np.uint8) + elif mode == "RGBA": + data = rng.integers(0, 256, (h, w, 4), dtype=np.uint8) + else: + raise ValueError(f"Unsupported mode: {mode}") + return Image.fromarray(data, mode=mode) + + +def _to_tensor(inpt): + """Convert a PIL image to a tensor; return any other input unchanged.""" + if isinstance(inpt, Image.Image): + return tv_fn.pil_to_tensor(inpt) + return inpt + + +def _assert_equal_to_torchvision(inpt, dali_transform, tv_transform, device="cpu"): + """ + Apply both transforms to ``inpt`` and assert equal shape and content. + + Only meaningful when the crop is deterministic, i.e. a single crop position is possible. + """ + out = dali_transform(inpt) + tv_out = tv_transform(inpt) + + out = _to_tensor(out) + tv_out = _to_tensor(tv_out) + if device == "gpu": + out = out.cpu() + if isinstance(tv_out, torch.Tensor): + tv_out = tv_out.cpu() + + assert out.shape == tv_out.shape, f"Shape
mismatch: {out.shape} != {tv_out.shape}" + assert torch.equal(out, tv_out), "DALI RandomCrop output differs from torchvision" + + +def _build_dali_random_crop(**kwargs): + """Wrap ``RandomCrop(**kwargs)`` in a ``Compose``; ``batch_size`` (default 1) goes to it.""" + batch_size = kwargs.pop("batch_size", 1) + return Compose([RandomCrop(**kwargs)], batch_size=batch_size) + + +def _skip_if_gpu_unavailable(device): + """Skip the current test when the "gpu" device is requested but CUDA is unavailable.""" + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + + +def _move_tensor_to_device(inpt, device): + """Return tensor inputs moved to CUDA for the "gpu" device; other inputs are unchanged.""" + if device == "gpu" and isinstance(inpt, torch.Tensor): + return inpt.cuda() + return inpt + + +def _possible_torchvision_random_crop_outputs(inpt, size, padding, fill=0, padding_mode="constant"): + """ + Enumerate every crop that a padded random crop of ``inpt`` could return. + + The explicit ``padding`` is applied first; if a padded side is still shorter than the crop, + the shortfall is added to both borders of that axis (the ``pad_if_needed`` behaviour). The + padded input is then cropped with torchvision at every valid ``(top, left)`` offset. + """ + crop_h, crop_w = RandomCrop.adjust_size(size) + pad_left, pad_top, pad_right, pad_bottom = RandomCrop.adjust_padding(padding) + + padded_h = inpt.shape[-2] + pad_top + pad_bottom + padded_w = inpt.shape[-1] + pad_left + pad_right + + if padded_h < crop_h: + diff = crop_h - padded_h + pad_top += diff + pad_bottom += diff + padded_h += 2 * diff + + if padded_w < crop_w: + diff = crop_w - padded_w + pad_left += diff + pad_right += diff + padded_w += 2 * diff + + padded = tv_fn.pad( + inpt, + padding=[pad_left, pad_top, pad_right, pad_bottom], + fill=fill, + padding_mode=padding_mode, + ) + + top_values = range(padded_h - crop_h + 1) if padded_h > crop_h else range(1) + left_values = range(padded_w - crop_w + 1) if padded_w > crop_w else range(1) + + return [ + tv_fn.crop(padded, top=top, left=left, height=crop_h, width=crop_w) + for top in top_values + for left in left_values + ] + + +def test_random_crop_is_operator(): + assert issubclass(RandomCrop, Operator) + + +# The crop size equals the input size, so only one crop position exists and the outputs of +# both libraries can be compared exactly. +@cartesian_params( + ("cpu", "gpu"), + ( + ("tensor", (3, 8, 10), (8, 10)), + ("tensor", (4, 3, 8, 10), (8, 10)), + ("pil", "L", (8, 10)), + ("pil", "RGB", (8, 10)), + ("pil", "RGBA", (8, 10)), + ), +) +def test_random_crop_identity_matches_torchvision(device, input_case): + _skip_if_gpu_unavailable(device) + input_type, input_arg, size =
input_case + inpt = make_pil_image(input_arg) if input_type == "pil" else make_tensor(shape=input_arg) + inpt = _move_tensor_to_device(inpt, device) + batch_size = inpt.shape[0] if isinstance(inpt, torch.Tensor) and inpt.ndim > 3 else 1 + _assert_equal_to_torchvision( + inpt, + _build_dali_random_crop(size=size, device=device, batch_size=batch_size), + transforms.RandomCrop(size=size), + device=device, + ) + + +# Cases are (padding, fill, padding_mode). The crop size is chosen to equal the padded size of +# the 4x5 input, so the crop is deterministic and covers the whole padded image. +@cartesian_params( + ("cpu", "gpu"), + ( + (None, 0, "constant"), + (1, 0, "constant"), + ([1], 0, "constant"), + ([1, 1], 0, "constant"), + ([1, 1, 1, 1], 0, "constant"), + (1, 7, "constant"), + (1, (1, 2, 3), "constant"), + (1, None, "constant"), + (1, 0, "edge"), + (1, 0, "reflect"), + (1, 0, "symmetric"), + ), +) +def test_random_crop_padding_matches_torchvision_tensor(device, padding_case): + _skip_if_gpu_unavailable(device) + padding, fill, padding_mode = padding_case + tensor = _move_tensor_to_device(make_tensor(shape=(3, 4, 5)), device) + size = (4, 5) if padding is None else (6, 7) + + _assert_equal_to_torchvision( + tensor, + _build_dali_random_crop( + size=size, + padding=padding, + fill=fill, + padding_mode=padding_mode, + device=device, + ), + transforms.RandomCrop( + size=size, + padding=padding, + fill=fill, + padding_mode=padding_mode, + ), + device=device, + ) + + +@cartesian_params(("cpu", "gpu"), ("L", "RGB", "RGBA")) +def test_random_crop_padding_matches_torchvision_pil(device, mode): + _skip_if_gpu_unavailable(device) + img = make_pil_image(mode=mode, h=4, w=5) + _assert_equal_to_torchvision( + img, + _build_dali_random_crop(size=(6, 7), padding=1, fill=3, device=device), + transforms.RandomCrop(size=(6, 7), padding=1, fill=3), + device=device, + ) + + +# Cases are (padding, size) with padding given as [left, top, right, bottom]. The crop position +# is random here, so the DALI output only has to equal one of the enumerated valid crops. +@cartesian_params( + ("cpu", "gpu"), + ( + ([0, 1, 2, 0], (7, 8)), + ([2, 0, 0, 1], (6, 9)), + ([1, 2, 0, 3], (10, 7)), + ), +) +def test_random_crop_asymmetric_padding_with_pad_if_needed(device, padding_case): + _skip_if_gpu_unavailable(device) + padding, size = padding_case + tensor =
make_tensor(shape=(3, 4, 5)) + expected_outputs = _possible_torchvision_random_crop_outputs( + tensor, + size=size, + padding=padding, + ) + + dali_out = _build_dali_random_crop( + size=size, + padding=padding, + pad_if_needed=True, + device=device, + )(_move_tensor_to_device(tensor, device)).cpu() + + assert any( + torch.equal(dali_out, expected) for expected in expected_outputs + ), "DALI RandomCrop output is not a valid torchvision crop" + + +# First checks the enumeration helper against torchvision itself (100 samples, all of which +# must be enumerated crops), then requires 20 DALI samples to be enumerated crops as well. +@cartesian_params(("cpu", "gpu")) +def test_random_crop_pad_if_needed_matches_torchvision_random_offsets(device): + _skip_if_gpu_unavailable(device) + tensor = make_tensor(shape=(3, 4, 5)) + size = (6, 7) + + expected_outputs = { + out.numpy().tobytes() + for out in _possible_torchvision_random_crop_outputs( + tensor, + size=size, + padding=None, + ) + } + tv_transform = transforms.RandomCrop(size=size, pad_if_needed=True) + tv_outputs = {tv_transform(tensor).numpy().tobytes() for _ in range(100)} + + assert len(tv_outputs) > 1, "Torchvision RandomCrop did not sample multiple offsets" + assert tv_outputs <= expected_outputs, "Torchvision produced an unexpected pad_if_needed crop" + + dali_tensor = _move_tensor_to_device(tensor, device) + dali_transform = _build_dali_random_crop(size=size, pad_if_needed=True, device=device) + dali_outputs = {dali_transform(dali_tensor).cpu().numpy().tobytes() for _ in range(20)} + + assert ( + dali_outputs <= expected_outputs + ), "DALI RandomCrop produced an invalid pad_if_needed crop" + + +""" +# TODO: Fill using dictionary pattern is currently not supported +def test_random_crop_fill_dict_matches_torchvision_tensor(): + tensor = make_tensor(shape=(3, 4, 5)) + fill = {torch.Tensor: 9} + _assert_equal_to_torchvision( + tensor, + _build_dali_random_crop(size=(6, 7), padding=1, fill=fill), + transforms.RandomCrop(size=(6, 7), padding=1, fill=fill), + ) + +def test_random_crop_fill_dict_matches_torchvision_pil(): + img = make_pil_image(mode="RGB", h=4, w=5) + fill = {Image.Image: (1, 2, 3)} +
_assert_equal_to_torchvision( + img, + _build_dali_random_crop(size=(6, 7), padding=1, fill=fill), + transforms.RandomCrop(size=(6, 7), padding=1, fill=fill), + ) +""" + + +@cartesian_params( + ("cpu", "gpu"), + ( + (4, (4, 4)), + ([4, 5], (4, 5)), + ), +) +def test_random_crop_tensor_shape(device, shape_case): + _skip_if_gpu_unavailable(device) + size, expected_hw = shape_case + tensor = _move_tensor_to_device(make_tensor(), device) + out = _build_dali_random_crop(size=size, device=device)(tensor) + + assert out.shape == (3, *expected_hw) + + +@cartesian_params(("cpu", "gpu")) +def test_random_crop_samples_different_offsets(device): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_tensor(), device) + transform = _build_dali_random_crop(size=(4, 5), device=device) + + outputs = {bytes(transform(tensor).cpu().numpy().tobytes()) for _ in range(20)} + + assert len(outputs) > 1, "RandomCrop produced the same crop for every run" + + +@params("cpu", "gpu") +def test_random_crop_pad_if_needed_shape(device): + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + + tensor = make_tensor(shape=(3, 4, 5)) + if device == "gpu": + tensor = tensor.cuda() + out = _build_dali_random_crop(size=(6, 7), pad_if_needed=True, device=device)(tensor) + + assert out.shape == (3, 6, 7) + + +@params( + [], + [0, 5], + [5, 0], + [1.0, 2], + [1, 2, 3], + -1, + 1.0, + {"bad": "value"}, +) +def test_random_crop_invalid_size(size): + with assert_raises((TypeError, ValueError)): + _ = RandomCrop(size=size) + + +@params( + -1, + [1, -1], + [1, 2, 3], + [1.0], + "bad", +) +def test_random_crop_invalid_padding(padding): + with assert_raises((TypeError, ValueError)): + _ = RandomCrop(size=3, padding=padding) + + +def test_random_crop_invalid_pad_if_needed(): + with assert_raises(TypeError): + _ = RandomCrop(size=3, pad_if_needed="yes") + + +@params( + object(), + "bad", + [1, object()], + {object(): 1}, + {torch.Tensor: 
object()}, +) +def test_random_crop_invalid_fill(fill): + with assert_raises(TypeError): + _ = RandomCrop(size=3, padding=1, fill=fill) + + +def test_random_crop_invalid_padding_mode_when_padding_is_used(): + with assert_raises(ValueError): + _ = RandomCrop(size=3, padding=1, padding_mode="bad") + + +def test_random_crop_invalid_padding_mode_when_pad_if_needed_is_used(): + with assert_raises(ValueError): + _ = RandomCrop(size=3, pad_if_needed=True, padding_mode="bad")