# gemma4-research/tooling/huggingface/transformers/image_processing_pil_gemma4.py

# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np

from ...image_processing_backends import PilBackend
from ...image_processing_utils import BatchFeature
from ...image_transforms import resize
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring, is_vision_available, logging


if is_vision_available():
    from ...image_utils import PILImageResampling


logger = logging.get_logger(__name__)

# Soft-token budgets accepted for `max_soft_tokens`. Any other value is rejected with a
# `ValueError`, both when the processor is constructed and when `_preprocess` runs.
_SUPPORTED_SOFT_TOKENS = (70, 140, 280, 560, 1120)


def get_aspect_ratio_preserving_size(
    height: int,
    width: int,
    patch_size: int,
    max_patches: int,
    pooling_kernel_size: int,
) -> tuple[int, int]:
    """
    Compute the target size of an aspect-ratio-preserving resize that fits within a patch budget.

    Target dimensions are the largest that:
    1) Produce at most `max_patches` patches when patchified with `patch_size`
    2) Have height and width divisible by `pooling_kernel_size * patch_size`

    For very elongated images, exactly one of the two ideal dimensions can be smaller than
    `pooling_kernel_size * patch_size` and would round down to 0. That dimension is then set to a
    single `pooling_kernel_size * patch_size` unit and the other one is derived from the integer
    aspect ratio, capped so the patch budget is still respected. In that case the aspect ratio is
    only approximately preserved.

    Args:
        height (`int`):
            Height of the input image in pixels. Must be positive.
        width (`int`):
            Width of the input image in pixels. Must be positive.
        patch_size (`int`):
            Side length of a square patch in pixels. Must be positive.
        max_patches (`int`):
            Maximum number of patches the resized image may produce.
        pooling_kernel_size (`int`):
            Pooling kernel size; both output dimensions are multiples of
            `pooling_kernel_size * patch_size`. Must be positive.

    Returns:
        `tuple[int, int]`: The `(target_height, target_width)` to resize to.

    Raises:
        ValueError: If an input dimension, `patch_size` or `pooling_kernel_size` is not positive, if
            both target dimensions round down to 0, or if the computed size exceeds the patch budget.
    """
    # Reject degenerate inputs up front; otherwise they surface as a bare ZeroDivisionError below.
    if height <= 0 or width <= 0:
        raise ValueError(f"Image height and width must be positive, got [{height}x{width}].")
    if patch_size <= 0 or pooling_kernel_size <= 0:
        raise ValueError(
            "`patch_size` and `pooling_kernel_size` must be positive, got "
            f"patch_size={patch_size} and pooling_kernel_size={pooling_kernel_size}."
        )

    total_px = height * width
    target_px = max_patches * (patch_size**2)
    # Uniform scale factor that brings the pixel count to exactly the budget.
    factor = math.sqrt(target_px / total_px)
    ideal_height = factor * height
    ideal_width = factor * width
    side_mult = pooling_kernel_size * patch_size

    # Round down to nearest multiple of side_mult (`math.floor` already returns an int)
    target_height = math.floor(ideal_height / side_mult) * side_mult
    target_width = math.floor(ideal_width / side_mult) * side_mult

    # Handle edge cases where one or both dimensions round to 0
    if target_height == 0 and target_width == 0:
        raise ValueError(
            "Attempting to resize to a 0 x 0 image. Resized height and width should be divisible by "
            f"`pooling_kernel_size * patch_size`={side_mult}."
        )

    # Longest side that still fits the budget when the other side is a single `side_mult` unit.
    max_side_length = (max_patches // pooling_kernel_size**2) * side_mult
    if target_height == 0:
        target_height = side_mult
        target_width = min(
            math.floor(width / height) * side_mult,
            max_side_length,
        )
    elif target_width == 0:
        target_width = side_mult
        target_height = min(
            math.floor(height / width) * side_mult,
            max_side_length,
        )

    # Final safety net: the chosen size must never exceed the pixel budget.
    if target_height * target_width > target_px:
        raise ValueError(
            f"Resizing [{height}x{width}] to [{target_height}x{target_width}] "
            f"but this exceeds {max_patches} patches with patch_size {patch_size}"
        )

    return target_height, target_width


# Adapted from transformers.models.siglip2.image_processing_pil_siglip2.convert_image_to_patches
def convert_image_to_patches(image: np.ndarray, patch_size: int) -> np.ndarray:
    """
    Split a channels-first image into flattened, non-overlapping square patches.

    Args:
        image (`np.ndarray`):
            3D array of shape `(num_channels, image_height, image_width)`.
        patch_size (`int`):
            Side length of each square patch in pixels.

    Returns:
        `np.ndarray`: 2D array of shape
        `(num_patches_height * num_patches_width, patch_size * patch_size * num_channels)`. Patches are
        ordered row by row; within a patch, values are ordered by pixel row, pixel column, then channel.
    """
    channels, height, width = image.shape
    rows, cols = height // patch_size, width // patch_size

    # (C, rows, p, cols, p) -> (rows, cols, p, p, C), so that each patch becomes contiguous
    grid = image.reshape(channels, rows, patch_size, cols, patch_size).transpose(1, 3, 2, 4, 0)

    # Flatten every patch into one row of the output.
    return grid.reshape(rows * cols, -1)


# Adapted from Siglip2 (mask -> position ids)
def pad_along_first_dim(image: np.ndarray, positions: np.ndarray, target_length: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Pad `image` and `positions` along their first dimension up to `target_length`.

    Args:
        image (`np.ndarray`):
            Array to pad; new entries along the first dimension are filled with 0.
        positions (`np.ndarray`):
            2D array padded alongside `image`; new rows are filled with -1.
        target_length (`int`):
            Desired size of the first dimension.

    Returns:
        `tuple[np.ndarray, np.ndarray]`: The padded `(image, positions)`. Both inputs are returned
        unchanged when `image` already has at least `target_length` entries along its first dimension.
    """
    missing = target_length - image.shape[0]
    if missing <= 0:
        # Nothing to add (inputs are never truncated).
        return image, positions

    # Only the first axis grows; every trailing axis keeps its size.
    image_pad_width = [(0, missing)] + [(0, 0)] * (image.ndim - 1)
    padded_image = np.pad(image, image_pad_width, mode="constant", constant_values=0)
    padded_positions = np.pad(positions, [(0, missing), (0, 0)], mode="constant", constant_values=-1)
    return padded_image, padded_positions


class Gemma4ImageProcessorKwargs(ImagesKwargs, total=False):
    """
    patch_size (`int`, *optional*):
        Side length, in pixels, of each square image patch.
    max_soft_tokens (`int`, *optional*):
        Maximum number of soft (vision) tokens per image.
        Must be one of {70, 140, 280, 560, 1120}.
    pooling_kernel_size (`int`, *optional*):
        Spatial pooling kernel size applied after patchification. Each soft token accounts for
        `pooling_kernel_size**2` patches, so the per-image patch budget is
        `max_soft_tokens * pooling_kernel_size**2`.
    """

    # NOTE(review): the docstring above follows the `name (type, *optional*): description` layout;
    # presumably it is consumed by the `auto_docstring` tooling, so keep that layout when editing.
    patch_size: int
    max_soft_tokens: int
    pooling_kernel_size: int


@auto_docstring(custom_intro="Constructs a Gemma4 image processor.")
class Gemma4ImageProcessorPil(PilBackend):
    # No hand-written class docstring: documentation is left to the `@auto_docstring` decorator.
    valid_kwargs = Gemma4ImageProcessorKwargs
    model_input_names = ["pixel_values", "image_position_ids", "num_soft_tokens_per_image"]

    # Class-level defaults for the preprocessing options.
    do_resize = True
    resample = PILImageResampling.BICUBIC
    do_rescale = True
    rescale_factor = 1 / 255
    # Normalization is off by default; the mean/std below are the identity transform anyway.
    do_normalize = False
    image_mean = [0.0, 0.0, 0.0]
    image_std = [1.0, 1.0, 1.0]
    do_convert_rgb = True
    patch_size = 16
    max_soft_tokens = 280
    pooling_kernel_size = 3

    def __init__(self, **kwargs: Unpack[Gemma4ImageProcessorKwargs]) -> None:
        super().__init__(**kwargs)

        # Fail fast at construction time when the configured token budget is unsupported.
        if self.max_soft_tokens not in _SUPPORTED_SOFT_TOKENS:
            raise ValueError(f"`max_soft_tokens` must be one of {_SUPPORTED_SOFT_TOKENS}, got {self.max_soft_tokens}.")

    def _validate_preprocess_kwargs(self, **kwargs):
        """
        Run the base-class kwargs validation with `do_resize` forced to `False`.

        Gemma4 resizes via `aspect_ratio_preserving_resize`, which is driven by `patch_size`,
        `max_soft_tokens` and `pooling_kernel_size` rather than the standard `size` parameter.
        Disabling `do_resize` for the duration of the check keeps the base validation from
        requiring `size` to be set.
        """
        validation_kwargs = {**kwargs, "do_resize": False}
        super()._validate_preprocess_kwargs(**validation_kwargs)

    # Thin pass-through to the base implementation; it is redefined so that `@auto_docstring`
    # and the Gemma4-specific kwargs type are attached to the public entry point.
    @auto_docstring
    def preprocess(
        self,
        images: ImageInput,
        **kwargs: Unpack[Gemma4ImageProcessorKwargs],
    ) -> BatchFeature:
        return super().preprocess(images, **kwargs)

    def aspect_ratio_preserving_resize(
        self,
        image: np.ndarray,
        patch_size: int,
        max_patches: int,
        pooling_kernel_size: int,
        resample: PILImageResampling,
    ) -> np.ndarray:
        """
        Resize `image` to the size chosen by `get_aspect_ratio_preserving_size`.

        Args:
            image (`np.ndarray`):
                Image whose last two dimensions are read as `(height, width)`.
            patch_size (`int`):
                Side length of a square patch in pixels.
            max_patches (`int`):
                Maximum number of patches the resized image may produce.
            pooling_kernel_size (`int`):
                Pooling kernel size used to derive the size multiple.
            resample (`PILImageResampling`):
                Resampling filter forwarded to `resize`.

        Returns:
            `np.ndarray`: The resized image, or the input itself when it already has the target size.
        """
        current_size = (image.shape[-2], image.shape[-1])
        target_height, target_width = get_aspect_ratio_preserving_size(
            height=current_size[0],
            width=current_size[1],
            patch_size=patch_size,
            max_patches=max_patches,
            pooling_kernel_size=pooling_kernel_size,
        )
        target_size = (target_height, target_width)

        # Skip the resampling call entirely when nothing would change.
        if target_size == current_size:
            return image

        return resize(image, size=target_size, resample=resample)

    def _preprocess(
        self,
        images: list[np.ndarray],
        do_resize: bool,
        resample: "PILImageResampling | int | None",
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: float | list[float] | None,
        image_std: float | list[float] | None,
        return_tensors: str | TensorType | None,
        max_soft_tokens: int | None = None,
        patch_size: int | None = None,
        pooling_kernel_size: int | None = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Resize, rescale, normalize, patchify and pad every image, then batch the results.

        Images are handled one at a time because different aspect ratios lead to different resized
        dimensions; patchification and padding therefore happen per image before stacking.

        Returns:
            `BatchFeature`: With keys `pixel_values` (stacked, padded patches), `image_position_ids`
            (stacked `(x, y)` patch coordinates, with -1 on padded rows) and
            `num_soft_tokens_per_image` (one integer per image).

        Raises:
            ValueError: If `max_soft_tokens` is not one of the supported values.
        """
        if max_soft_tokens not in _SUPPORTED_SOFT_TOKENS:
            raise ValueError(f"`max_soft_tokens` must be one of {_SUPPORTED_SOFT_TOKENS}, got {max_soft_tokens}.")

        # The patch budget follows from the soft-token budget: one soft token per pooling window.
        patches_per_soft_token = pooling_kernel_size**2
        max_patches = max_soft_tokens * patches_per_soft_token

        batch_patches = []
        batch_positions = []
        soft_token_counts = []

        for image in images:
            # Aspect-ratio-preserving resize into the patch budget
            if do_resize:
                image = self.aspect_ratio_preserving_resize(
                    image=image,
                    patch_size=patch_size,
                    max_patches=max_patches,
                    pooling_kernel_size=pooling_kernel_size,
                    resample=resample,
                )

            # Scale pixel values by `rescale_factor` (the class default 1 / 255 maps [0, 255] to [0, 1])
            if do_rescale:
                image = self.rescale(image=image, scale=rescale_factor)

            # Off by default: Gemma4 was trained with pixels in [0, 1], so the default mean/std are
            # the identity transform
            if do_normalize:
                image = self.normalize(image=image, mean=image_mean, std=image_std)

            # Patchify the (C, H, W) image:
            # (num_channels, height, width) -> (num_patches, patch_size * patch_size * num_channels)
            patches = convert_image_to_patches(image, patch_size)
            num_patches = patches.shape[0]
            soft_token_counts.append(num_patches // patches_per_soft_token)

            # (x, y) coordinate of every patch, in the same row-major order as `patches`
            grid_rows = image.shape[-2] // patch_size
            grid_cols = image.shape[-1] // patch_size
            xs, ys = np.meshgrid(np.arange(grid_cols), np.arange(grid_rows), indexing="xy")
            positions = np.stack([xs, ys], axis=-1).reshape(num_patches, 2)

            # Pad both arrays up to `max_patches` rows so that all images can be stacked
            patches, positions = pad_along_first_dim(patches, positions, max_patches)

            batch_patches.append(patches)
            batch_positions.append(positions)

        # Stack into batch arrays; tensor conversion is delegated to `BatchFeature`
        return BatchFeature(
            data={
                "pixel_values": np.stack(batch_patches, axis=0),  # (batch, max_patches, patch_pixels)
                "image_position_ids": np.stack(batch_positions, axis=0),  # (batch, max_patches, 2)
                "num_soft_tokens_per_image": soft_token_counts,
            },
            tensor_type=return_tensors,
        )


# Public API of this module: only the image processor class is exported.
__all__ = ["Gemma4ImageProcessorPil"]