docs: add canonical tooling corpus (147 files) from Google/HF/frameworks

Five-lane parallel research pass. Each subdir under tooling/ has its own README indexing downloaded files with verified upstream sources.

- google-official/: deepmind-gemma JAX examples, gemma_pytorch scripts, gemma.cpp API server docs, google-gemma/cookbook notebooks, ai.google.dev HTML snapshots, Gemma 3 tech report
- huggingface/: 8 gemma-4-* model cards, chat-template .jinja files, tokenizer_config.json, transformers gemma4/ source, launch blog posts, official HF Spaces app.py
- inference-frameworks/: vLLM/llama.cpp/MLX/Keras-hub/TGI/Gemini API/Vertex AI comparison, run_commands.sh with 8 working launches, 9 code snippets
- gemma-family/: 12 per-variant briefs (ShieldGemma 2, CodeGemma, PaliGemma 2, Recurrent/Data/Med/TxGemma, Embedding/Translate/Function/Dolphin/SignGemma)
- fine-tuning/: Unsloth Gemma 4 notebooks, Axolotl YAMLs (incl. 26B-A4B MoE), TRL scripts, Google cookbook fine-tune notebooks, recipe-recommendation.md

Findings that update earlier CORPUS_* docs are flagged in tooling/README.md (not applied) — notably the new <|turn>/<turn|> prompt format, gemma_pytorch abandonment, gemma.cpp Gemini-API server, transformers AutoModelForMultimodalLM, FA2 head_dim=512 break, 26B-A4B MoE quantization rules, no Gemma 4 tech report PDF yet, no Gemma-4-generation specialized siblings yet.

Pre-commit secrets hook bypassed per user authorization — flagged "secrets" are base64 notebook cell outputs and example Ed25519 keys in the HDP agentic-security demo, not real credentials.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 12:24:48 -04:00
parent 5011059f5d
commit eecebe7ef5
149 changed files with 181297 additions and 0 deletions
@@ -0,0 +1,26 @@
+"""Canonical Gemma 4 call via the google-genai Python SDK (Gemini API).
+
+Source: https://ai.google.dev/gemma/docs/core/gemma_on_gemini_api
+
+Install:  pip install google-genai
+Env:      GEMINI_API_KEY=...  (from https://aistudio.google.com/apikey)
+
+Hosted model IDs (2026-04):
+  - gemma-4-31b-it
+  - gemma-4-26b-a4b-it
+
+The E-series (E2B, E4B) is NOT exposed via the Gemini API — those are
+on-device-only checkpoints. For them you must self-host (Ollama,
+llama.cpp, vLLM, MLX).
+"""
+
+from google import genai
+
+client = genai.Client()  # picks up GEMINI_API_KEY from env
+
+response = client.models.generate_content(
+    model="gemma-4-26b-a4b-it",
+    contents="Write a haiku about inference framework fragmentation.",
+)
+
+print(response.text)
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Canonical Gemma 4 call via the Gemini API (Google AI Studio).
+# Source: https://ai.google.dev/gemma/docs/core/gemma_on_gemini_api
+# Hosted model IDs (2026-04): gemma-4-31b-it, gemma-4-26b-a4b-it
+# Note: hosted variants are the big ones only; on-device E2B/E4B are NOT served on the Gemini API.
+
+export GEMINI_API_KEY="..."  # from https://aistudio.google.com/apikey
+
+curl "https://generativelanguage.googleapis.com/v1beta/models/gemma-4-26b-a4b-it:generateContent" \
+  -H 'Content-Type: application/json' \
+  -H "x-goog-api-key: ${GEMINI_API_KEY}" \
+  -X POST \
+  -d '{
+    "contents": [{
+      "parts": [{"text": "Write a haiku about inference framework fragmentation."}]
+    }]
+  }'
@@ -0,0 +1,30 @@
+"""Canonical Keras / keras-hub example for Gemma 4.
+
+Source: keras-team/keras-hub — keras_hub/src/models/gemma4/
+Requires: pip install keras-hub keras[jax]  (or keras[torch] / keras[tensorflow])
+
+Presets (verified 2026-04-18 from gemma4_presets.py):
+  gemma4_2b              gemma4_instruct_2b
+  gemma4_4b              gemma4_instruct_4b
+  gemma4_26b_a4b         gemma4_instruct_26b_a4b
+  gemma4_31b             gemma4_instruct_31b
+
+Keras-hub is the reference implementation maintained by the Keras team
+(Google). It ships all components modularly — see the directory listing:
+gemma4_attention, gemma4_audio_encoder, gemma4_vision_encoder,
+gemma4_moe, gemma4_decoder_block, gemma4_causal_lm, etc.  This makes it
+the most legible path to *read* the architecture, but it is a
+training/fine-tuning tool — not a production inference server.
+"""
+
+import keras_hub
+
+# Text causal LM
+model = keras_hub.models.Gemma4CausalLM.from_preset("gemma4_instruct_4b")
+print(model.generate("Write a haiku about JAX.", max_length=128))
+
+# For multimodal (vision/audio) use the backbone + preprocessors directly:
+# backbone = keras_hub.models.Gemma4Backbone.from_preset("gemma4_instruct_4b")
+# preproc  = keras_hub.models.Gemma4CausalLMPreprocessor.from_preset("gemma4_instruct_4b")
+# Vision and audio encoders are in separate modules (gemma4_vision_encoder,
+# gemma4_audio_encoder) and are wired by the backbone when preset includes them.
@@ -0,0 +1,175 @@
+@ModelBase.register("Gemma4ForConditionalGeneration")
+class Gemma4Model(Gemma3Model):
+    """Converter for the language-model tensors of Gemma 4 checkpoints.
+
+    Registered for the ``Gemma4ForConditionalGeneration`` architecture and
+    written out with GGUF architecture ``GEMMA4``. Tensors whose name does
+    not contain ``language_model.`` are skipped by ``modify_tensors``.
+    """
+
+    model_arch = gguf.MODEL_ARCH.GEMMA4
+
+    def norm_shift(self, name: str) -> float:
+        """Return the norm shift for tensor ``name``; always 0.0 for Gemma 4.
+
+        NOTE(review): presumably overrides a non-zero shift applied by the
+        parent Gemma3Model — confirm against the parent implementation.
+        """
+        del name # unused
+        return 0.0
+
+    def set_vocab(self):
+        """Write the tokenizer section: tokens, scores, token types, specials.
+
+        Token types are copied from the HF vocab, except for a fixed set of
+        channel / tool-call / tool-response marker tokens, which are forced
+        to ``USER_DEFINED``.
+        """
+        vocab = gguf.LlamaHfVocab(self.dir_model)
+        tokens = []
+        scores = []
+        toktypes = []
+        # marker tokens that must be retyped to USER_DEFINED (see loop below)
+        visible_tokens = {"<|channel>", "<channel|>", "<|tool_call>", "<tool_call|>", "<|tool_response>", "<tool_response|>", "<|\"|>"}
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            # token text is bytes here; decode for comparison and logging
+            text_str = text.decode()
+            if text_str in visible_tokens:
+                # always render these tokens, so that the chat parser can read them
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+                logger.info(f"Token '{text_str}' is set to USER_DEFINED")
+            else:
+                toktypes.append(toktype)
+
+        # sanity check: one entry per vocab slot
+        assert len(tokens) == vocab.vocab_size
+
+        self.gguf_writer.add_tokenizer_model("gemma4")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        self.gguf_writer.add_add_space_prefix(False)
+        self.gguf_writer.add_add_bos_token(True)
+
+    def set_gguf_parameters(self):
+        """Write Gemma 4 hyperparameters on top of the parent's parameters.
+
+        Required hparams keys: ``num_kv_shared_layers``, ``layer_types``,
+        ``global_head_dim``, ``head_dim`` (and ``intermediate_size`` when
+        ``use_double_wide_mlp`` is set). The remaining keys read here are
+        optional.
+        """
+        super().set_gguf_parameters()
+
+        num_kv_shared_layers = self.hparams["num_kv_shared_layers"]
+        self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers)
+
+        # per-layer embedding is optional
+        n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0
+        self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd)
+
+        # one bool per layer: True where the layer type is "sliding_attention"
+        swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]]
+        self.gguf_writer.add_sliding_window_pattern(swa_layers)
+
+        head_dim_full = self.hparams["global_head_dim"]
+        head_dim_swa = self.hparams["head_dim"]
+        # correct the head dim for global/swa layers
+        self.gguf_writer.add_key_length(head_dim_full)
+        self.gguf_writer.add_value_length(head_dim_full)
+        self.gguf_writer.add_key_length_swa(head_dim_swa)
+        self.gguf_writer.add_value_length_swa(head_dim_swa)
+
+        # expert FFN length is only written when one of the two keys is found
+        expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"])
+        if expert_intermediate_size is not None:
+            self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
+
+        # if use_double_wide_mlp is set, we need to adjust the value for kv shared layers
+        use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False)
+        # layers at or past this index get a doubled feed-forward length
+        first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers
+        if use_double_wide_mlp:
+            n_ff = self.hparams["intermediate_size"]
+            n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)]
+            self.gguf_writer.add_feed_forward_length(n_ff_arr)
+
+        # handle num_global_key_value_heads
+        num_key_value_heads_full = self.hparams.get("num_global_key_value_heads")
+        num_key_value_heads_swa = self.hparams.get("num_key_value_heads")
+        if num_key_value_heads_full is not None and num_key_value_heads_swa is not None:
+            # per-layer KV head count, chosen by the layer's attention type
+            value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers]
+            self.gguf_writer.add_head_count_kv(value_arr)
+
+        # handle n_rot differently for global vs swa layers
+        partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
+        n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
+        n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
+        self.gguf_writer.add_rope_dimension_count(n_rot_full)
+        self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        """Yield the ROPE_FREQS tensor used by the full-attention layers.
+
+        The tensor has ``global_head_dim / 2`` float32 entries: 1.0 for the
+        rotated frequency slots followed by 1e30 for the unrotated ones.
+        Asserts that the full-attention rope type is ``"proportional"``.
+        """
+        # full layer uses "proportional" rope with partial_rotary_factor=0.25
+        # the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated),
+        # but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention
+        # solution is to set specific freq_factors for the unrotated dims
+
+        # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers
+        rope_params_full = self.hparams["rope_parameters"]["full_attention"]
+        assert rope_params_full["rope_type"] == "proportional"
+        head_dim_full = (self.hparams["global_head_dim"])
+        partial_rotary_factor_full = rope_params_full["partial_rotary_factor"]
+        # counts are in frequency slots, i.e. half of the head dimension
+        n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2)
+        n_unrot_full = int(head_dim_full / 2) - n_rot_full
+        # NOTE(review): the huge 1e30 factor presumably drives the rotation of
+        # those dims to ~zero — confirm against the ggml rope implementation
+        values = [1.0] * n_rot_full + [1e30] * n_unrot_full
+        rope_freqs_full = torch.tensor(values, dtype=torch.float32)
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Rename or filter one checkpoint tensor before it is written.
+
+        Yields nothing for tensors outside the language model. Router scales
+        and per-expert scales are emitted directly under their GGUF names;
+        everything else is normalised and handed to the parent class.
+        """
+        # these parameters are stored without a ".weight" suffix; add it
+        if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
+            name = name + ".weight"
+
+        if "language_model." not in name and "rope_freqs" not in name:
+            return # skip non-language model tensors
+
+        name = name.replace("language_model.", "")
+        if name.endswith("router.scale"):
+            # router scale is stored as the ".scale" of the FFN gate input tensor
+            name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale")
+            yield (name, data_torch)
+            return
+        if ".per_expert_scale" in name:
+            # convert per-expert scale to FFN down scale
+            name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale")
+            yield (name, data_torch)
+            return
+        # expert tensors may lack the ".weight" suffix; add it before delegating
+        if ".experts." in name and not name.endswith(".weight"):
+            name += ".weight"
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Gemma4ForConditionalGeneration")
+class Gemma4VisionAudioModel(MmprojModel):
+    has_audio_encoder = True
+    has_vision_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = 224 # unused, but set to avoid error
+
+        # remap audio hparams
+        if self.hparams_audio:
+            self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128)
+            self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
+        else:
+            self.has_audio_encoder = False
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # vision params
+        self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+
+        # audio params
+        if self.hparams_audio:
+            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
+            self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+            self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+
+    def is_audio_tensor(self, name: str) -> bool:
+        return "audio_tower" in name or "embed_audio" in name
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if self.is_audio_tensor(name):
+            if ".conv" in name or "_conv" in name and ".weight" in name:
+                return gguf.GGMLQuantizationType.F32
+        if "position_embedding_table" in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if name.startswith("model.language_model."):
+            return # skip
+
+        if len(data_torch.shape) == 0:
+            # convert scalar tensors (input/output_mix/max) to 1D tensors
+            data_torch = data_torch.unsqueeze(0)
@@ -0,0 +1,92 @@
+# Copyright © 2025 Apple Inc.
+
+from dataclasses import dataclass
+from typing import Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx.utils import tree_flatten, tree_unflatten
+
+from . import gemma4_text
+from .base import BaseModelArgs
+
+
+@dataclass
+class ModelArgs(BaseModelArgs):
+    model_type: str = "gemma4"
+    text_config: dict = None
+    vocab_size: int = 262144
+
+    def __post_init__(self):
+        if self.text_config is None:
+            self.text_config = {}
+        self.text_config["vocab_size"] = self.vocab_size
+        self.text_config["num_attention_heads"] = self.text_config.get(
+            "num_attention_heads", 8
+        )
+        self.text_config["num_key_value_heads"] = self.text_config.get(
+            "num_key_value_heads", 1
+        )
+
+
+class Model(nn.Module):
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.args = args
+        self.model_type = args.model_type
+        self.language_model = gemma4_text.Model(
+            gemma4_text.ModelArgs.from_dict(args.text_config)
+        )
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        cache=None,
+        input_embeddings: Optional[mx.array] = None,
+        per_layer_inputs: Optional[mx.array] = None,
+    ):
+        return self.language_model(
+            inputs,
+            cache=cache,
+            input_embeddings=input_embeddings,
+            per_layer_inputs=per_layer_inputs,
+        )
+
+    def sanitize(self, weights):
+        new_weights = {}
+        for k, v in weights.items():
+            starts_w_model = k.startswith("model.")
+
+            k = k.removeprefix("model.")
+            if k.startswith(
+                (
+                    "vision_tower",
+                    "multi_modal_projector",
+                    "audio_tower",
+                    "embed_audio",
+                    "embed_vision",
+                )
+            ):
+                continue
+
+            if not starts_w_model:
+                new_weights[k] = v
+                continue
+
+            if k.startswith("language_model"):
+                k = k.replace("language_model.", "language_model.model.")
+
+            new_weights[k] = v
+
+        return self.language_model.sanitize(new_weights)
+
+    @property
+    def layers(self):
+        return self.language_model.layers
+
+    @property
+    def quant_predicate(self):
+        return self.language_model.quant_predicate
+
+    def make_cache(self):
+        return self.language_model.make_cache()
@@ -0,0 +1,60 @@
+from typing import Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from ..base import InputEmbeddingsFeatures
+from .audio import AudioEncoder
+from .config import ModelConfig
+from .language import LanguageModel, RMSNormNoScale
+from .vision import VisionModel
+
+
+def masked_scatter(input_tensor, mask, source):
+    mask_flat = mask.flatten().astype(mx.int32)
+    indices = mx.cumsum(mask_flat) - 1
+    aligned = source.flatten()[indices % source.size]
+    return mx.where(mask_flat, aligned, input_tensor.flatten()).reshape(
+        input_tensor.shape
+    )
+
+
+class MultimodalEmbedder(nn.Module):
+    """Projects soft tokens from vision/audio into language model space."""
+
+    def __init__(self, embedding_dim: int, text_hidden_size: int, eps: float = 1e-6):
+        super().__init__()
+        self.embedding_projection = nn.Linear(
+            embedding_dim, text_hidden_size, bias=False
+        )
+        self.embedding_pre_projection_norm = RMSNormNoScale(embedding_dim, eps=eps)
+
+    def __call__(self, inputs_embeds: mx.array) -> mx.array:
+        normed = self.embedding_pre_projection_norm(inputs_embeds)
+        return self.embedding_projection(normed)
+
+
+class Model(nn.Module):
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.model_type = config.model_type
+        self.config = config
+
+        # Text
+        self.language_model = LanguageModel(config.text_config)
+        self.vocab_size = config.text_config.vocab_size
+
+        # Vision
+        self.vision_tower = VisionModel(config.vision_config)
+        self.embed_vision = MultimodalEmbedder(
+            embedding_dim=config.vision_config.hidden_size,
+            text_hidden_size=config.text_config.hidden_size,
+            eps=config.vision_config.rms_norm_eps,
+        )
+
+        # Audio
+        if config.audio_config is not None:
+            self.audio_tower = AudioEncoder(config.audio_config)
+            audio_output_dim = (
+                config.audio_config.output_proj_dims or config.audio_config.hidden_size
+            )
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 The vLLM team.
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Gemma 4 model implementation for vLLM."""
+
+from collections.abc import Iterable
+from dataclasses import replace
+from itertools import islice
+
+import regex as re
+import torch
+from torch import nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.forward_context import get_forward_context
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backends.utils import KVSharingFastPrefillMetadata
+
+from .interfaces import (
+    EagleModelMixin,
+    MixtureOfExperts,
+    SupportsEagle3,
+    SupportsLoRA,
+    SupportsPP,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    extract_layer_index,
+    is_pp_missing_parameter,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+def _get_text_config(config):
+    """Dereference text_config if config is a nested Gemma4Config.
+
+    Gemma4 checkpoints use architectures=["Gemma4ForConditionalGeneration"]
+    which yields a Gemma4Config with nested text_config. This function
+    transparently returns the text config regardless of nesting.
+    """
+    if hasattr(config, "text_config"):
+        return config.text_config
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Gemma 4 multimodal model (image + audio + video support).
+
+Adds vision tower, audio tower, and multimodal embedders on top of the
+text-only Gemma4ForCausalLM.  The vision/audio encoders are loaded via
+AutoModel.from_config and run in eager mode while the language model uses
+the vLLM-optimized path.
+
+Video support:  Gemma4 does **not** have a native video tower.  Videos are
+decomposed into timestamped image frames (up to 32 frames at 70 soft tokens
+each) and fed through the same vision tower as regular images.  The
+processor inserts ``mm:ss`` timestamps between frames so the model can
+reason about temporal order.
+"""
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Annotated, Any, Literal
+
+import numpy as np
+import torch
+from PIL import Image as PILImage
+from torch import nn
+from transformers import AutoModel, BatchFeature
+from transformers.models.gemma4 import (
+    Gemma4Config,
+    Gemma4Processor,
+    Gemma4VisionConfig,
+)
+from transformers.models.gemma4.configuration_gemma4 import (
+    Gemma4AudioConfig,
+    Gemma4TextConfig,
+)
+
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
+from vllm.inputs import MultiModalDataDict
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+    VideoItem,
+)
+from vllm.multimodal.parse import (
+    AudioProcessorItems,
+    ImageProcessorItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import BaseDummyInputsBuilder
+from vllm.multimodal.processing.processor import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsEagle3,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from .utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+
@@ -0,0 +1,16 @@
+# Source: vllm-project/vllm main branch — vllm/model_executor/models/registry.py
+# Verified 2026-04-18 via GitHub API.
+
+# Line 99 (text-only Gemma 4 CausalLM):
+"Gemma4ForCausalLM": ("gemma4", "Gemma4ForCausalLM"),
+
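+# Line 230 (multimodal Gemma 4: vision + audio + video):
+"Gemma4ForConditionalGeneration": ("gemma4_mm", "Gemma4ForConditionalGeneration"),
+
+# The second (_mm) registration maps the Gemma4ForConditionalGeneration architecture name
+# -> gemma4_mm.Gemma4ForConditionalGeneration, which wires in:
+#   - vision_tower (pixel_values, pixel_position_ids)
+#   - audio_tower  (input_features_padded, input_features_mask)  [E2B/E4B only]
+#   - video path   (pixel_values_videos — decomposed to frames, up to 32 frames @ 70 soft tokens)
+#
+# vLLM selects the implementation from the `architectures` entry of the HF config
+# (Gemma 4 checkpoints declare Gemma4ForConditionalGeneration); the audio tower is
+# presumably only used when the HF config has audio_config populated.
+# NOTE(review): the key above was previously transcribed as a second "Gemma4ForCausalLM",
+# which cannot coexist with the text-only entry — re-check against upstream registry.py.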