Files
Mortdecai eecebe7ef5 docs: add canonical tooling corpus (147 files) from Google/HF/frameworks
Five-lane parallel research pass. Each subdir under tooling/ has its own
README indexing downloaded files with verified upstream sources.

- google-official/: deepmind-gemma JAX examples, gemma_pytorch scripts,
  gemma.cpp API server docs, google-gemma/cookbook notebooks, ai.google.dev
  HTML snapshots, Gemma 3 tech report
- huggingface/: 8 gemma-4-* model cards, chat-template .jinja files,
  tokenizer_config.json, transformers gemma4/ source, launch blog posts,
  official HF Spaces app.py
- inference-frameworks/: vLLM/llama.cpp/MLX/Keras-hub/TGI/Gemini API/Vertex AI
  comparison, run_commands.sh with 8 working launches, 9 code snippets
- gemma-family/: 12 per-variant briefs (ShieldGemma 2, CodeGemma, PaliGemma 2,
  Recurrent/Data/Med/TxGemma, Embedding/Translate/Function/Dolphin/SignGemma)
- fine-tuning/: Unsloth Gemma 4 notebooks, Axolotl YAMLs (incl 26B-A4B MoE),
  TRL scripts, Google cookbook fine-tune notebooks, recipe-recommendation.md

Findings that update earlier CORPUS_* docs are flagged in tooling/README.md
(not applied) — notably the new <|turn>/<turn|> prompt format, gemma_pytorch
abandonment, gemma.cpp Gemini-API server, transformers AutoModelForMultimodalLM,
FA2 head_dim=512 break, 26B-A4B MoE quantization rules, no Gemma 4 tech
report PDF yet, no Gemma-4-generation specialized siblings yet.

Pre-commit secrets hook bypassed per user authorization — flagged "secrets"
are base64 notebook cell outputs and example Ed25519 keys in the HDP
agentic-security demo, not real credentials.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 12:24:48 -04:00

367 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import numpy as np
from ...audio_utils import AudioInput
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import auto_docstring, is_vision_available, logging
from ...utils.import_utils import requires
from ...video_utils import VideoInput
if is_vision_available():
from .image_processing_pil_gemma4 import Gemma4ImageProcessorKwargs, get_aspect_ratio_preserving_size
# Module-level logger for this file, obtained from the library's `logging` helper.
logger = logging.get_logger(__name__)
class Gemma4ProcessorKwargs(ProcessingKwargs, total=False):
    # Keyword-argument schema for `Gemma4Processor.__call__`. Only the image
    # kwargs type is specialised here; everything else comes from the base class.
    images_kwargs: Gemma4ImageProcessorKwargs

    # Per-modality defaults, handed to `_merge_kwargs` by the processor.
    _defaults = {
        # Text: pad the batch and request multimodal token-type ids.
        "text_kwargs": {"padding": True, "return_mm_token_type_ids": True},
        # Images: convert to RGB before further processing.
        "images_kwargs": {"do_convert_rgb": True},
        # Audio: no overrides.
        "audio_kwargs": {},
        # Videos: metadata is always requested because the processor reads
        # `fps` / `timestamps` from it when building video prompts.
        "videos_kwargs": {
            "return_metadata": True,
        },
    }
# Multimodal processor for Gemma 4: expands image / video / audio placeholder
# tokens in the prompts to the right number of soft tokens, tokenizes the
# result, and returns the text and modality features in one `BatchFeature`.
@auto_docstring
@requires(backends=("vision",))
class Gemma4Processor(ProcessorMixin):
    def __init__(
        self,
        feature_extractor,
        image_processor,
        tokenizer,
        video_processor,
        chat_template=None,
        image_seq_length: int = 280,
        audio_seq_length: int = 750,
        audio_ms_per_token: int = 40,
        **kwargs,
    ):
        r"""
        image_seq_length (`int`, *optional*, defaults to 280):
            The number of soft tokens per image. Stored on the processor; `__call__` sizes image
            placeholders from the image processor's `num_soft_tokens_per_image` output.
        audio_seq_length (`int`, *optional*, defaults to 750):
            The maximum number of audio soft tokens per audio segment. Serves as an
            upper-bound cap when dynamic audio token counts are computed.
        audio_ms_per_token (`int`, *optional*, defaults to 40):
            Milliseconds of audio per output soft token. The default of 40 comes from the SSCP
            convolution's 4× time reduction on 10ms frames. Stored on the processor; the placeholder
            count itself is derived from the encoder's framing / convolution arithmetic
            (see `_compute_audio_num_tokens`), which yields one token per 40ms at the default settings.
        """
        # Vision tokens are mandatory attributes of the tokenizer.
        self.image_seq_length = image_seq_length
        self.image_token_id = tokenizer.image_token_id
        self.boi_token = tokenizer.boi_token
        self.eoi_token = tokenizer.eoi_token
        self.image_token = tokenizer.image_token

        # FIXME: add the token to config and ask Ryan to re-upload
        # Until then the video placeholder is registered on the tokenizer here
        # (this mutates the tokenizer that was passed in).
        tokenizer.add_special_tokens({"additional_special_tokens": ["<|video|>"]})
        self.video_token = "<|video|>"
        self.video_token_id = tokenizer.convert_tokens_to_ids(self.video_token)

        # Audio token handling, mirroring the vision pattern.
        # `audio_seq_length` caps the number of soft tokens a single audio
        # segment can produce; the actual count is computed from its length.
        self.audio_seq_length = audio_seq_length
        self.audio_ms_per_token = audio_ms_per_token
        # Audio tokens are optional: a tokenizer without them yields `None`
        # here, and `__call__` rejects audio inputs in that case.
        self.audio_token_id = getattr(tokenizer, "audio_token_id", None)
        self.audio_token = getattr(tokenizer, "audio_token", None)
        self.boa_token = getattr(tokenizer, "boa_token", None)
        self.eoa_token = getattr(tokenizer, "eoa_token", None)

        super().__init__(
            feature_extractor=feature_extractor,
            image_processor=image_processor,
            tokenizer=tokenizer,
            video_processor=video_processor,
            chat_template=chat_template,
            **kwargs,
        )

    @auto_docstring
    def __call__(
        self,
        images: ImageInput | None = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
        audio: AudioInput | None = None,
        videos: VideoInput | None = None,
        **kwargs: Unpack[Gemma4ProcessorKwargs],
    ) -> BatchFeature:
        # No hand-written docstring here (the original has none either; the
        # method is decorated with `@auto_docstring`). Implementation notes
        # are kept as comments instead.
        #
        # Raises:
        #   ValueError: no input given; image/text batch sizes differ; videos
        #       passed without text; audio passed but the tokenizer lacks audio
        #       tokens; prompts contain more placeholders than inputs.
        #   TypeError: `text` is neither a string nor a list of strings.
        if text is None and images is None and audio is None and videos is None:
            raise ValueError("Provide at least one of `text`, `images`, `audio`, or `videos`.")

        output_kwargs = self._merge_kwargs(
            Gemma4ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if isinstance(text, str):
            text = [text]
        # `text` may legitimately be None (images-only / audio-only calls build
        # a default prompt below), so it must not be subscripted in that case.
        elif text is not None and not isinstance(text, list) and not isinstance(text[0], str):
            raise TypeError("Invalid input text. Please provide a string, or a list of strings")

        image_inputs = {}
        if images is not None:
            images = self.image_processor.fetch_images(images)
            batched_images = make_nested_list_of_images(images)
            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
            num_soft_tokens = image_inputs.pop("num_soft_tokens_per_image")

            # Create empty text to be replaced with placeholders:
            # one image token per image, one prompt per batch entry.
            if not text:
                text = [" ".join([self.image_token] * len(sample_images)) for sample_images in batched_images]

            if len(batched_images) != len(text):
                raise ValueError(
                    f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})."
                )

            # Expand image_token placeholders to per-image soft token sequences.
            image_replacements = [
                f"{self.boi_token}{self.image_token * n}{self.eoi_token}" for n in num_soft_tokens
            ]
            text = self._expand_placeholders(text, self.image_token, image_replacements, "image")

        # Process video inputs in same way
        video_inputs = {}
        if videos is not None:
            if not text:
                raise ValueError(
                    "Video inputs were provided without `text`. Please pass prompts containing one "
                    f"`{self.video_token}` placeholder per video."
                )

            video_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            num_video_tokens = video_inputs.pop("num_soft_tokens_per_video")

            # Metadata is always computed (see the kwargs defaults) because the
            # prompt needs it. If the user has not requested it, pop it so it
            # isn't returned.
            if not kwargs.get("return_metadata"):
                video_metadata = video_inputs.pop("video_metadata")
            else:
                video_metadata = video_inputs["video_metadata"]

            video_replacements = []
            for metadata, n_tokens in zip(video_metadata, num_video_tokens):
                if metadata.fps is None:
                    logger.warning_once(
                        "Gemma 4 requires frame timestamps to construct prompts, but the `fps` of the input video "
                        "could not be inferred. Probably `video_metadata` was missing from inputs and you passed "
                        "pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more "
                        "accurate results."
                    )
                    # Written back onto the metadata object before
                    # `timestamps` is read below.
                    metadata.fps = 24

                # mm:ss format for timestamps
                timestamp_str = [
                    f"{int(seconds // 60):02d}:{int(seconds % 60):02d}" for seconds in metadata.timestamps
                ]
                # One "<timestamp> <boi><video tokens><eoi>" group per timestamp.
                video_replacements.append(
                    " ".join(
                        f"{t} {self.boi_token}{self.video_token * n_tokens}{self.eoi_token}" for t in timestamp_str
                    )
                )

            text = self._expand_placeholders(text, self.video_token, video_replacements, "video")

        # Process audio inputs
        audio_inputs = {}
        if audio is not None:
            if self.audio_token is None or self.boa_token is None or self.eoa_token is None:
                raise ValueError(
                    "Audio inputs were provided, but the tokenizer does not have an `audio_token` defined."
                )

            # Normalize audio input to list of waveforms
            if isinstance(audio, np.ndarray) and audio.ndim == 1:
                audio = [audio]

            # TODO: Add tests for audio-only processor inputs.
            if not text:
                text = [self.audio_token] * len(audio)

            # Dynamic audio token expansion without padding:
            # * Extract audio features with feature extractor;
            # * Compute precise per-audio token counts from the waveform length;
            # * Generate full audio token sequence for each computed audio length;
            # * Expand text prompts with full audio token sequences.
            audio_kwargs = output_kwargs.get("audio_kwargs", {})
            audio_inputs = self.feature_extractor(audio, **audio_kwargs)

            sampling_rate = self.feature_extractor.sampling_rate
            num_audio_tokens = [self._compute_audio_num_tokens(a, sampling_rate) for a in audio]
            audio_replacements = [
                f"{self.boa_token}{self.audio_token * n}{self.eoa_token}" for n in num_audio_tokens
            ]
            text = self._expand_placeholders(text, self.audio_token, audio_replacements, "audio")

        # These two are consumed here and must not reach the tokenizer call.
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])

        # Check special tokens for all active modalities
        active_modalities = []
        if images is not None:
            active_modalities.append("image")
        if videos is not None:
            active_modalities.append("video")
        if audio is not None:
            active_modalities.append("audio")
        if active_modalities:
            self._check_special_mm_tokens(text, text_inputs, modalities=active_modalities)

        if return_mm_token_type_ids:
            text_inputs["mm_token_type_ids"] = self.create_mm_token_type_ids(text_inputs["input_ids"])

        return BatchFeature(
            data={**text_inputs, **image_inputs, **audio_inputs, **video_inputs},
            tensor_type=return_tensors,
        )

    @staticmethod
    def _expand_placeholders(prompts, placeholder, replacements, modality):
        """Replace each `placeholder` occurrence in `prompts` with the next entry of `replacements`.

        Replacements are consumed in order across the whole batch. `re` never
        re-scans substituted text, so a replacement may itself contain
        `placeholder` without being expanded again.

        Args:
            prompts: List of prompt strings.
            placeholder: Literal placeholder token to search for.
            replacements: List of expanded strings, one per multimodal input.
            modality: Name used in the error message (e.g. `"image"`).

        Returns:
            A new list of prompts with placeholders expanded. Unused trailing
            replacements are ignored, as before.

        Raises:
            ValueError: If the prompts contain more placeholders than there are
                replacements (previously surfaced as a bare `StopIteration`).
        """
        pending = iter(replacements)

        def _next_replacement(_match):
            try:
                return next(pending)
            except StopIteration:
                raise ValueError(
                    f"The prompts contain more `{placeholder}` placeholders than the number of {modality} "
                    f"inputs provided ({len(replacements)})."
                ) from None

        pattern = re.compile(re.escape(placeholder))
        return [pattern.sub(_next_replacement, prompt) for prompt in prompts]

    def _compute_audio_num_tokens(self, audio_waveform, sampling_rate: int) -> int:
        """Compute the number of audio soft tokens for a single waveform.

        Only the length of the waveform is used; the arithmetic lives in
        `_audio_num_tokens_from_length`.

        Args:
            audio_waveform: A 1-D numpy array or list containing the raw audio samples.
            sampling_rate: The sampling rate of the audio waveform in Hz.

        Returns:
            The number of audio soft tokens to insert as placeholders.
        """
        return self._audio_num_tokens_from_length(len(audio_waveform), sampling_rate)

    def _audio_num_tokens_from_length(self, num_samples: int, sampling_rate: int) -> int:
        """Compute the number of audio soft tokens for a waveform of `num_samples` samples.

        Replicates the sequence-length arithmetic of the audio encoder so that
        the processor inserts the correct number of placeholder tokens:

        1. Mel framing via ``_unfold`` in ``Gemma4AudioFeatureExtractor``
           (20ms frames, 10ms hop, ``frame_length // 2`` samples of left padding).
        2. Two ``Conv2d`` subsampling layers in ``Gemma4AudioSubSampleConvProjection``
           (each: kernel=3, stride=2, semicausal padding top=1, bottom=1).

        The result is capped at ``self.audio_seq_length`` (the configured maximum).

        Args:
            num_samples: Number of raw audio samples.
            sampling_rate: The sampling rate of the audio in Hz.

        Returns:
            The number of audio soft tokens, or 0 when the audio is too short
            to produce a single mel frame.
        """
        # Step 1: Mel frames (matches feature_extraction_gemma4.py _unfold)
        frame_length = int(round(sampling_rate * 20.0 / 1000.0))  # 320 @ 16kHz
        hop_length = int(round(sampling_rate * 10.0 / 1000.0))  # 160 @ 16kHz
        frame_size_for_unfold = frame_length + 1  # 321

        # The feature extractor prepends (frame_length // 2) zero samples as
        # semicausal time-padding before the unfold. We must include this to
        # match the actual number of mel frames it produces.
        pad_left = frame_length // 2  # 160 @ 16kHz
        padded_samples = num_samples + pad_left
        num_mel_frames = (padded_samples - frame_size_for_unfold) // hop_length + 1
        if num_mel_frames <= 0:
            return 0

        # Step 2: Two SSCP conv layers (kernel=3, stride=2, semicausal pad top=1, bottom=1)
        # Each layer: T_out = (T_in + pad_top + pad_bottom - kernel) // stride + 1
        t = num_mel_frames
        for _ in range(2):
            t_padded = t + 2  # pad_top=1, pad_bottom=1
            t = (t_padded - 3) // 2 + 1

        # Cap at the configured maximum
        return min(t, self.audio_seq_length)

    def _get_num_multimodal_tokens(self, image_sizes=None, audio_lengths=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
            audio_lengths (`list[int]`, *optional*):
                The lengths of audio inputs in number of samples. Used to dynamically
                compute per-audio token counts.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        """
        # Merge into a fresh dict: the class-level `_defaults` is shared state
        # and must not pick up per-call overrides.
        images_kwargs = {**Gemma4ProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}

        vision_data = {}
        if image_sizes is not None:
            # Explicit overrides win; otherwise use the image processor's settings.
            patch_size = images_kwargs.get("patch_size", None) or self.image_processor.patch_size
            pooling_kernel_size = (
                images_kwargs.get("pooling_kernel_size", None) or self.image_processor.pooling_kernel_size
            )
            max_soft_tokens = images_kwargs.get("max_soft_tokens", None) or self.image_processor.max_soft_tokens
            max_patches = max_soft_tokens * pooling_kernel_size**2

            num_image_tokens = []
            for image_size in image_sizes:
                target_h, target_w = get_aspect_ratio_preserving_size(
                    height=image_size[0],
                    width=image_size[1],
                    patch_size=patch_size,
                    max_patches=max_patches,
                    pooling_kernel_size=pooling_kernel_size,
                )
                patch_height = target_h // patch_size
                patch_width = target_w // patch_size
                # Patches are pooled in kernel x kernel groups, one soft token each.
                num_image_tokens.append(patch_height * patch_width // pooling_kernel_size**2)

            num_image_patches = [1] * len(image_sizes)
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

        if audio_lengths is not None:
            # Dynamically compute per-audio token counts from sample lengths.
            # audio_lengths are in number of samples; fall back to 16 kHz when
            # the feature extractor does not expose a sampling rate.
            sampling_rate = getattr(self.feature_extractor, "sampling_rate", 16_000)
            num_audio_tokens = [
                self._audio_num_tokens_from_length(length, sampling_rate) for length in audio_lengths
            ]
            vision_data.update({"num_audio_tokens": num_audio_tokens})

        return MultiModalData(**vision_data)

    @property
    def model_input_names(self):
        """Names of the model inputs this processor can return.

        Starts from the parent class's list, drops the per-image / per-video
        soft-token counts (they are popped in `__call__` and never returned),
        adds the feature extractor's input names, and appends
        `mm_token_type_ids`.
        """
        model_input_names = super().model_input_names
        model_input_names = [
            name
            for name in model_input_names
            if name not in ["num_soft_tokens_per_image", "num_soft_tokens_per_video"]
        ]

        # Include audio feature extractor input names if available
        if self.feature_extractor is not None:
            feature_extractor_input_names = self.feature_extractor.model_input_names
            model_input_names.extend([name for name in feature_extractor_input_names if name not in model_input_names])

        return model_input_names + ["mm_token_type_ids"]
# Public API of this module: only the processor class is exported.
__all__ = ["Gemma4Processor"]