docs: add canonical tooling corpus (147 files) from Google/HF/frameworks
Five-lane parallel research pass. Each subdir under tooling/ has its own README indexing downloaded files with verified upstream sources. - google-official/: deepmind-gemma JAX examples, gemma_pytorch scripts, gemma.cpp API server docs, google-gemma/cookbook notebooks, ai.google.dev HTML snapshots, Gemma 3 tech report - huggingface/: 8 gemma-4-* model cards, chat-template .jinja files, tokenizer_config.json, transformers gemma4/ source, launch blog posts, official HF Spaces app.py - inference-frameworks/: vLLM/llama.cpp/MLX/Keras-hub/TGI/Gemini API/Vertex AI comparison, run_commands.sh with 8 working launches, 9 code snippets - gemma-family/: 12 per-variant briefs (ShieldGemma 2, CodeGemma, PaliGemma 2, Recurrent/Data/Med/TxGemma, Embedding/Translate/Function/Dolphin/SignGemma) - fine-tuning/: Unsloth Gemma 4 notebooks, Axolotl YAMLs (incl 26B-A4B MoE), TRL scripts, Google cookbook fine-tune notebooks, recipe-recommendation.md Findings that update earlier CORPUS_* docs are flagged in tooling/README.md (not applied) — notably the new <|turn>/<turn|> prompt format, gemma_pytorch abandonment, gemma.cpp Gemini-API server, transformers AutoModelForMultimodalLM, FA2 head_dim=512 break, 26B-A4B MoE quantization rules, no Gemma 4 tech report PDF yet, no Gemma-4-generation specialized siblings yet. Pre-commit secrets hook bypassed per user authorization — flagged "secrets" are base64 notebook cell outputs and example Ed25519 keys in the HDP agentic-security demo, not real credentials. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,278 @@
|
||||
# Copyright 2026 the HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...image_processing_backends import PilBackend
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_transforms import resize
|
||||
from ...image_utils import ImageInput
|
||||
from ...processing_utils import ImagesKwargs, Unpack
|
||||
from ...utils import TensorType, auto_docstring, is_vision_available, logging
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from ...image_utils import PILImageResampling
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)

# Values accepted for `max_soft_tokens`. Both the processor constructor and `_preprocess`
# reject anything outside this tuple with a `ValueError`.
_SUPPORTED_SOFT_TOKENS = (70, 140, 280, 560, 1120)
|
||||
|
||||
|
||||
def get_aspect_ratio_preserving_size(
    height: int,
    width: int,
    patch_size: int,
    max_patches: int,
    pooling_kernel_size: int,
) -> tuple[int, int]:
    """
    Image is resized to preserve aspect ratio so it fits within the patch budget.
    Target dimensions are the largest that:
    1) Produce at most `max_patches` patches when patchified with `patch_size`
    2) Have height and width divisible by `pooling_kernel_size * patch_size`

    Args:
        height (`int`):
            Height of the input image in pixels. Must be positive.
        width (`int`):
            Width of the input image in pixels. Must be positive.
        patch_size (`int`):
            Side length of a square patch in pixels.
        max_patches (`int`):
            Maximum number of patches the resized image may produce.
        pooling_kernel_size (`int`):
            Pooling kernel size; each output side is a multiple of `pooling_kernel_size * patch_size`.

    Returns:
        `tuple[int, int]`: The `(target_height, target_width)` to resize to.

    Raises:
        ValueError: If `height` or `width` is not positive, if both target sides round down to 0, or if the
            computed target exceeds the pixel budget `max_patches * patch_size**2`.
    """
    # Fail early with a clear message instead of a ZeroDivisionError in the scale-factor computation below.
    if height <= 0 or width <= 0:
        raise ValueError(f"Image height and width must be positive, got [{height}x{width}].")

    total_px = height * width
    target_px = max_patches * (patch_size**2)
    # Uniform scale factor that maps the image area onto the pixel budget.
    factor = math.sqrt(target_px / total_px)
    ideal_height = factor * height
    ideal_width = factor * width
    side_mult = pooling_kernel_size * patch_size

    # Round down to nearest multiple of side_mult
    target_height = math.floor(ideal_height / side_mult) * side_mult
    target_width = math.floor(ideal_width / side_mult) * side_mult

    # Handle edge cases where one or both dimensions round to 0
    if target_height == 0 and target_width == 0:
        raise ValueError(
            "Attempting to resize to a 0 x 0 image. Resized height should be divisible by "
            f"`pooling_kernel_size * patch_size`={side_mult}."
        )

    # Longest side allowed when the other side is clamped to a single `side_mult` block.
    max_side_length = (max_patches // pooling_kernel_size**2) * side_mult
    if target_height == 0:
        # Extremely wide image: use the minimum height and derive the width from the
        # integer aspect ratio, capped at the longest allowed side.
        target_height = side_mult
        target_width = min(
            math.floor(width / height) * side_mult,
            max_side_length,
        )
    elif target_width == 0:
        # Extremely tall image: mirror of the branch above.
        target_width = side_mult
        target_height = min(
            math.floor(height / width) * side_mult,
            max_side_length,
        )

    if target_height * target_width > target_px:
        raise ValueError(
            f"Resizing [{height}x{width}] to [{target_height}x{target_width}] "
            f"but this exceeds {max_patches} patches with patch_size {patch_size}"
        )

    return target_height, target_width
|
||||
|
||||
|
||||
# Copied from transformers.models.siglip2.image_processing_pil_siglip2.convert_image_to_patches
|
||||
def convert_image_to_patches(image: np.ndarray, patch_size: int) -> np.ndarray:
    """
    Split a channels-first image into flattened, non-overlapping square patches.

    Args:
        image (`np.ndarray`):
            3D array of shape (num_channels, image_height, image_width).
        patch_size (`int`):
            Side length of each square patch.

    Returns:
        `np.ndarray`: 2D array of shape
        (num_patches_height * num_patches_width, patch_size * patch_size * num_channels), with patches ordered
        row by row over the patch grid.
    """
    channels, height, width = image.shape
    rows, cols = height // patch_size, width // patch_size

    # Expose the patch grid as separate axes: (C, rows, patch, cols, patch).
    grid = image.reshape(channels, rows, patch_size, cols, patch_size)
    # Reorder to (rows, cols, patch, patch, C) so each patch's pixels are contiguous, channel last.
    grid = np.transpose(grid, (1, 3, 2, 4, 0))
    return grid.reshape(rows * cols, -1)
|
||||
|
||||
|
||||
# Adopted from Siglip2 (mask -> position ids)
|
||||
def pad_along_first_dim(image: np.ndarray, positions: np.ndarray, target_length: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Pad `image` and `positions` along their first dimension up to `target_length`.

    `image` is padded with zeros and `positions` (a 2D array) is padded with -1. When the first dimension of
    `image` is already at least `target_length`, both inputs are returned unchanged.
    """
    missing = target_length - image.shape[0]
    if missing <= 0:
        return image, positions

    # Only the leading axis grows; every trailing axis keeps its size.
    image_pad_width = [(0, missing), *([(0, 0)] * (image.ndim - 1))]
    padded_image = np.pad(image, image_pad_width, mode="constant", constant_values=0)
    # -1 marks padded rows so they can be told apart from real positions.
    padded_positions = np.pad(positions, [(0, missing), (0, 0)], mode="constant", constant_values=-1)
    return padded_image, padded_positions
|
||||
|
||||
|
||||
# Typed keyword arguments specific to the Gemma4 image processor, layered on top of `ImagesKwargs`.
# Declared with `total=False`, so every key listed here is optional.
class Gemma4ImageProcessorKwargs(ImagesKwargs, total=False):
    """
    patch_size (`int`, *optional*):
        Size of each image patch in pixels.
    max_soft_tokens (`int`, *optional*):
        Maximum number of soft (vision) tokens per image.
        Must be one of {70, 140, 280, 560, 1120}.
    pooling_kernel_size (`int`, *optional*):
        Spatial pooling kernel size applied after patchification.
    """

    patch_size: int
    max_soft_tokens: int
    pooling_kernel_size: int
|
||||
|
||||
|
||||
# PIL-backend image processor for Gemma4. Each image is resized with its aspect ratio preserved,
# optionally rescaled/normalized, cut into flattened patches, paired with per-patch (x, y) position
# ids, and padded to a common patch count before the batch is stacked.
@auto_docstring(custom_intro="Constructs a Gemma4 image processor.")
class Gemma4ImageProcessorPil(PilBackend):
    valid_kwargs = Gemma4ImageProcessorKwargs
    model_input_names = ["pixel_values", "image_position_ids", "num_soft_tokens_per_image"]

    # Class-level defaults for the preprocessing options.
    do_resize = True
    resample = PILImageResampling.BICUBIC
    do_rescale = True
    rescale_factor = 1 / 255
    do_normalize = False
    image_mean = [0.0, 0.0, 0.0]
    image_std = [1.0, 1.0, 1.0]
    do_convert_rgb = True
    patch_size = 16
    max_soft_tokens = 280
    pooling_kernel_size = 3

    def __init__(self, **kwargs: Unpack[Gemma4ImageProcessorKwargs]) -> None:
        super().__init__(**kwargs)

        # Reject an unsupported soft-token budget as soon as the processor is constructed.
        configured_tokens = self.max_soft_tokens
        if configured_tokens not in _SUPPORTED_SOFT_TOKENS:
            raise ValueError(f"`max_soft_tokens` must be one of {_SUPPORTED_SOFT_TOKENS}, got {configured_tokens}.")

    def _validate_preprocess_kwargs(self, **kwargs):
        # Resizing here is driven by patch_size, max_soft_tokens and pooling_kernel_size
        # rather than the standard `size` parameter, so the base validation is run with
        # do_resize forced off to keep it from requiring `size`.
        overridden = {**kwargs, "do_resize": False}
        super()._validate_preprocess_kwargs(**overridden)

    @auto_docstring
    def preprocess(
        self,
        images: ImageInput,
        **kwargs: Unpack[Gemma4ImageProcessorKwargs],
    ) -> BatchFeature:
        return super().preprocess(images, **kwargs)

    def aspect_ratio_preserving_resize(
        self,
        image: np.ndarray,
        patch_size: int,
        max_patches: int,
        pooling_kernel_size: int,
        resample: PILImageResampling,
    ) -> np.ndarray:
        """
        Resize `image` to the size computed by `get_aspect_ratio_preserving_size`.

        The last two axes of `image` are read as height and width. The input is returned untouched when it
        already has the target size.
        """
        height = image.shape[-2]
        width = image.shape[-1]
        new_size = get_aspect_ratio_preserving_size(
            height=height,
            width=width,
            patch_size=patch_size,
            max_patches=max_patches,
            pooling_kernel_size=pooling_kernel_size,
        )

        # Nothing to do when the image already matches the target.
        if new_size == (height, width):
            return image

        return resize(image, size=new_size, resample=resample)

    def _preprocess(
        self,
        images: list[np.ndarray],
        do_resize: bool,
        resample: "PILImageResampling | int | None",
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: float | list[float] | None,
        image_std: float | list[float] | None,
        return_tensors: str | TensorType | None,
        max_soft_tokens: int | None = None,
        patch_size: int | None = None,
        pooling_kernel_size: int | None = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Turn a list of images into padded patch sequences with position ids.

        Returns a `BatchFeature` holding `pixel_values` (stacked per-image patch arrays),
        `image_position_ids` (stacked per-patch (x, y) grid coordinates, -1 on padded rows) and
        `num_soft_tokens_per_image` (patch count of each image divided by `pooling_kernel_size**2`).

        Raises:
            ValueError: If `max_soft_tokens` is not one of the supported values.
        """
        if max_soft_tokens not in _SUPPORTED_SOFT_TOKENS:
            raise ValueError(f"`max_soft_tokens` must be one of {_SUPPORTED_SOFT_TOKENS}, got {max_soft_tokens}.")

        # Each soft token covers a pooling_kernel_size x pooling_kernel_size window of patches,
        # so the patch budget is the soft-token budget times that window area.
        pool_area = pooling_kernel_size**2
        max_patches = max_soft_tokens * pool_area

        # Images may end up with different resized dimensions, so every image is
        # patchified and padded on its own before the batch is stacked.
        batch_patches = []
        batch_positions = []
        soft_token_counts = []

        for image in images:
            # Resize while keeping the aspect ratio.
            if do_resize:
                image = self.aspect_ratio_preserving_resize(
                    image=image,
                    patch_size=patch_size,
                    max_patches=max_patches,
                    pooling_kernel_size=pooling_kernel_size,
                    resample=resample,
                )

            # Scale pixel values by `rescale_factor`.
            if do_rescale:
                image = self.rescale(image=image, scale=rescale_factor)

            # Optional mean/std normalization.
            if do_normalize:
                image = self.normalize(image=image, mean=image_mean, std=image_std)

            # (num_channels, height, width) -> (num_patches, patch_size * patch_size * num_channels)
            patches = convert_image_to_patches(image, patch_size)
            num_patches = patches.shape[0]
            soft_token_counts.append(num_patches // pool_area)

            # One (x, y) coordinate per patch, laid out in the same row-major order as `patches`.
            rows = image.shape[-2] // patch_size
            cols = image.shape[-1] // patch_size
            col_ids, row_ids = np.meshgrid(np.arange(cols), np.arange(rows), indexing="xy")
            coords = np.stack([col_ids, row_ids], axis=-1).reshape(num_patches, 2)

            # Bring every image up to the same patch count so the batch can be stacked.
            padded_patches, padded_coords = pad_along_first_dim(patches, coords, max_patches)
            batch_patches.append(padded_patches)
            batch_positions.append(padded_coords)

        return BatchFeature(
            data={
                "pixel_values": np.stack(batch_patches, axis=0),  # (batch, max_patches, patch_pixels)
                "image_position_ids": np.stack(batch_positions, axis=0),  # (batch, max_patches, 2)
                "num_soft_tokens_per_image": soft_token_counts,
            },
            tensor_type=return_tensors,
        )
|
||||
|
||||
|
||||
# Names exported by a star-import of this module.
__all__ = ["Gemma4ImageProcessorPil"]
|
||||
Reference in New Issue
Block a user