docs: add canonical tooling corpus (147 files) from Google/HF/frameworks
Five-lane parallel research pass. Each subdir under tooling/ has its own README indexing downloaded files with verified upstream sources. - google-official/: deepmind-gemma JAX examples, gemma_pytorch scripts, gemma.cpp API server docs, google-gemma/cookbook notebooks, ai.google.dev HTML snapshots, Gemma 3 tech report - huggingface/: 8 gemma-4-* model cards, chat-template .jinja files, tokenizer_config.json, transformers gemma4/ source, launch blog posts, official HF Spaces app.py - inference-frameworks/: vLLM/llama.cpp/MLX/Keras-hub/TGI/Gemini API/Vertex AI comparison, run_commands.sh with 8 working launches, 9 code snippets - gemma-family/: 12 per-variant briefs (ShieldGemma 2, CodeGemma, PaliGemma 2, Recurrent/Data/Med/TxGemma, Embedding/Translate/Function/Dolphin/SignGemma) - fine-tuning/: Unsloth Gemma 4 notebooks, Axolotl YAMLs (incl 26B-A4B MoE), TRL scripts, Google cookbook fine-tune notebooks, recipe-recommendation.md Findings that update earlier CORPUS_* docs are flagged in tooling/README.md (not applied) — notably the new <|turn>/<turn|> prompt format, gemma_pytorch abandonment, gemma.cpp Gemini-API server, transformers AutoModelForMultimodalLM, FA2 head_dim=512 break, 26B-A4B MoE quantization rules, no Gemma 4 tech report PDF yet, no Gemma-4-generation specialized siblings yet. Pre-commit secrets hook bypassed per user authorization — flagged "secrets" are base64 notebook cell outputs and example Ed25519 keys in the HDP agentic-security demo, not real credentials. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,26 @@
|
||||
"""Canonical Gemma 4 call via the google-genai Python SDK (Gemini API).
|
||||
|
||||
Source: https://ai.google.dev/gemma/docs/core/gemma_on_gemini_api
|
||||
|
||||
Install: pip install google-genai
|
||||
Env: GEMINI_API_KEY=... (from https://aistudio.google.com/apikey)
|
||||
|
||||
Hosted model IDs (2026-04):
|
||||
- gemma-4-31b-it
|
||||
- gemma-4-26b-a4b-it
|
||||
|
||||
The E-series (E2B, E4B) is NOT exposed via the Gemini API — those are
|
||||
on-device-only checkpoints. For them you must self-host (Ollama,
|
||||
llama.cpp, vLLM, MLX).
|
||||
"""
|
||||
|
||||
from google import genai
|
||||
|
||||
# Model and prompt used by this example call.
MODEL_ID = "gemma-4-26b-a4b-it"
PROMPT = "Write a haiku about inference framework fragmentation."

# The client picks up GEMINI_API_KEY from the environment.
client = genai.Client()

# Single-turn, non-streaming text generation request.
response = client.models.generate_content(model=MODEL_ID, contents=PROMPT)

print(response.text)
|
||||
@@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env bash
# Canonical Gemma 4 call via the Gemini API (Google AI Studio).
# Source: https://ai.google.dev/gemma/docs/core/gemma_on_gemini_api
# Hosted model IDs (2026-04): gemma-4-31b-it, gemma-4-26b-a4b-it
# Note: hosted variants are the big ones only; on-device E2B/E4B are NOT served on the Gemini API.
#
# Usage: GEMINI_API_KEY=<your key> ./<this script>

# Stop on the first error, on unset variables, and on failures inside pipelines.
set -euo pipefail

# Require the key from the caller's environment. Exporting a literal "..."
# placeholder here would overwrite a real key that is already set and make the
# request fail authentication. Get a key from https://aistudio.google.com/apikey
: "${GEMINI_API_KEY:?Set GEMINI_API_KEY (create one at https://aistudio.google.com/apikey)}"

curl "https://generativelanguage.googleapis.com/v1beta/models/gemma-4-26b-a4b-it:generateContent" \
  -H 'Content-Type: application/json' \
  -H "x-goog-api-key: ${GEMINI_API_KEY}" \
  -X POST \
  -d '{
    "contents": [{
      "parts": [{"text": "Write a haiku about inference framework fragmentation."}]
    }]
  }'
|
||||
@@ -0,0 +1,30 @@
|
||||
"""Canonical Keras / keras-hub example for Gemma 4.
|
||||
|
||||
Source: keras-team/keras-hub — keras_hub/src/models/gemma4/
|
||||
Requires: pip install keras-hub keras[jax] (or keras[torch] / keras[tensorflow])
|
||||
|
||||
Presets (verified 2026-04-18 from gemma4_presets.py):
|
||||
gemma4_2b gemma4_instruct_2b
|
||||
gemma4_4b gemma4_instruct_4b
|
||||
gemma4_26b_a4b gemma4_instruct_26b_a4b
|
||||
gemma4_31b gemma4_instruct_31b
|
||||
|
||||
Keras-hub is the reference implementation maintained by the Keras team
|
||||
(Google). It ships all components modularly — see the directory listing:
|
||||
gemma4_attention, gemma4_audio_encoder, gemma4_vision_encoder,
|
||||
gemma4_moe, gemma4_decoder_block, gemma4_causal_lm, etc. This makes it
|
||||
the most legible path to *read* the architecture, but it is a
|
||||
training/fine-tuning tool — not a production inference server.
|
||||
"""
|
||||
|
||||
import keras_hub
|
||||
|
||||
# Text causal LM: load the instruction-tuned 4B preset, then generate once.
PRESET = "gemma4_instruct_4b"
PROMPT = "Write a haiku about JAX."

model = keras_hub.models.Gemma4CausalLM.from_preset(PRESET)
completion = model.generate(PROMPT, max_length=128)
print(completion)
|
||||
|
||||
# For multimodal (vision/audio) use the backbone + preprocessors directly:
|
||||
# backbone = keras_hub.models.Gemma4Backbone.from_preset("gemma4_instruct_4b")
|
||||
# preproc = keras_hub.models.Gemma4CausalLMPreprocessor.from_preset("gemma4_instruct_4b")
|
||||
# Vision and audio encoders are in separate modules (gemma4_vision_encoder,
|
||||
# gemma4_audio_encoder) and are wired by the backbone when preset includes them.
|
||||
@@ -0,0 +1,175 @@
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
class Gemma4Model(Gemma3Model):
    """GGUF converter for the language-model part of a Gemma 4 checkpoint.

    Registered for the ``Gemma4ForConditionalGeneration`` architecture.
    Tensors whose name contains neither ``language_model.`` nor
    ``rope_freqs`` are dropped in :meth:`modify_tensors`.
    """

    model_arch = gguf.MODEL_ARCH.GEMMA4

    def norm_shift(self, name: str) -> float:
        """Return the norm shift for tensor ``name``; always ``0.0`` here."""
        del name  # unused
        return 0.0

    def set_vocab(self):
        """Write the tokenizer (tokens, scores, types, special vocab) to GGUF."""
        hf_vocab = gguf.LlamaHfVocab(self.dir_model)

        # always render these tokens, so that the chat parser can read them
        always_rendered = {"<|channel>", "<channel|>", "<|tool_call>", "<tool_call|>", "<|tool_response>", "<tool_response|>", "<|\"|>"}

        token_texts, token_scores, token_kinds = [], [], []
        for raw_text, score, kind in hf_vocab.all_tokens():
            token_texts.append(raw_text)
            token_scores.append(score)
            decoded = raw_text.decode()
            if decoded not in always_rendered:
                token_kinds.append(kind)
                continue
            token_kinds.append(gguf.TokenType.USER_DEFINED)
            logger.info(f"Token '{decoded}' is set to USER_DEFINED")

        assert len(token_texts) == hf_vocab.vocab_size

        writer = self.gguf_writer
        writer.add_tokenizer_model("gemma4")
        writer.add_token_list(token_texts)
        writer.add_token_scores(token_scores)
        writer.add_token_types(token_kinds)

        specials = gguf.SpecialVocab(self.dir_model, load_merges=True)
        specials.add_to_gguf(writer)
        writer.add_add_space_prefix(False)
        writer.add_add_bos_token(True)

    def set_gguf_parameters(self):
        """Write Gemma 4 specific hyper-parameters after the parent's ones.

        Global ("full") attention layers and sliding-window ("swa") layers
        get separate head dimensions, KV head counts and rope dimension
        counts.
        """
        super().set_gguf_parameters()
        hp = self.hparams
        writer = self.gguf_writer

        shared_kv_count = hp["num_kv_shared_layers"]
        writer.add_shared_kv_layers(shared_kv_count)

        # per-layer embedding is optional; a missing or falsy value is written as 0
        per_layer_embd = hp.get("hidden_size_per_layer_input") or 0
        writer.add_embedding_length_per_layer_input(per_layer_embd)

        # one bool per layer: True where the layer uses sliding attention
        is_sliding = [kind == "sliding_attention" for kind in hp["layer_types"]]
        writer.add_sliding_window_pattern(is_sliding)

        # correct the head dim for global/swa layers
        global_head_dim = hp["global_head_dim"]
        sliding_head_dim = hp["head_dim"]
        writer.add_key_length(global_head_dim)
        writer.add_value_length(global_head_dim)
        writer.add_key_length_swa(sliding_head_dim)
        writer.add_value_length_swa(sliding_head_dim)

        expert_ff = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"])
        if expert_ff is not None:
            writer.add_expert_feed_forward_length(expert_ff)

        # if use_double_wide_mlp is set, the kv shared layers (the last
        # num_kv_shared_layers blocks) get twice the feed-forward length
        double_wide = hp.get("use_double_wide_mlp", False)
        first_shared_layer = self.block_count - shared_kv_count
        if double_wide:
            base_ff = hp["intermediate_size"]
            ff_per_layer = [
                base_ff * 2 if layer >= first_shared_layer else base_ff
                for layer in range(self.block_count)
            ]
            writer.add_feed_forward_length(ff_per_layer)

        # handle num_global_key_value_heads: per-layer KV head counts are only
        # written when both the global and the sliding value are configured
        kv_heads_global = hp.get("num_global_key_value_heads")
        kv_heads_sliding = hp.get("num_key_value_heads")
        if kv_heads_global is not None and kv_heads_sliding is not None:
            kv_heads_per_layer = [
                kv_heads_sliding if sliding else kv_heads_global
                for sliding in is_sliding
            ]
            writer.add_head_count_kv(kv_heads_per_layer)

        # handle n_rot differently for global vs swa layers
        sliding_rotary_fraction = hp.get("partial_rotary_factor", 1.0)
        rot_dims_global = int(global_head_dim)  # "proportional" is used, see generate_extra_tensors
        rot_dims_sliding = int(sliding_head_dim * sliding_rotary_fraction)
        writer.add_rope_dimension_count(rot_dims_global)
        writer.add_rope_dimension_count_swa(rot_dims_sliding)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        """Yield the ROPE_FREQS tensor used by the full_attention layers.

        The full layers use "proportional" rope with a partial rotary factor.
        The expected ordering is cc000000ss000000 (c = cos, s = sin,
        0 = unrotated), but ggml neox only supports ccss000000000000, and the
        head cannot be rearranged because that would break
        use_alternative_attention. The solution is to set specific
        freq_factors for the unrotated dims: 1.0 for rotated pairs and 1e30
        for the rest.
        """
        # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers
        full_rope = self.hparams["rope_parameters"]["full_attention"]
        assert full_rope["rope_type"] == "proportional"

        global_head_dim = self.hparams["global_head_dim"]
        rotary_fraction = full_rope["partial_rotary_factor"]

        # counts are in pairs, i.e. out of head_dim / 2 entries
        rotated_pairs = int(global_head_dim * rotary_fraction / 2)
        unrotated_pairs = int(global_head_dim / 2) - rotated_pairs

        factors = [1.0] * rotated_pairs + [1e30] * unrotated_pairs
        freq_tensor = torch.tensor(factors, dtype=torch.float32)
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), freq_tensor)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename or drop a checkpoint tensor before it is written.

        Yields ``(new_name, tensor)`` pairs. Nothing is yielded for tensors
        that belong to neither the language model nor ``rope_freqs``.
        """
        # these parameters carry no ".weight" suffix in the checkpoint
        if name.endswith(("per_dim_scale", "layer_scalar")):
            name = name + ".weight"

        if "language_model." not in name and "rope_freqs" not in name:
            return  # skip non-language model tensors

        stripped = name.replace("language_model.", "")

        if stripped.endswith("router.scale"):
            gate_scale = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale")
            yield (gate_scale, data_torch)
            return

        if ".per_expert_scale" in stripped:
            # convert per-expert scale to FFN down scale
            down_scale = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale")
            yield (down_scale, data_torch)
            return

        # expert tensors without a ".weight" suffix get one before mapping
        if ".experts." in stripped and not stripped.endswith(".weight"):
            stripped += ".weight"

        yield from super().modify_tensors(data_torch, stripped, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
class Gemma4VisionAudioModel(MmprojModel):
|
||||
has_audio_encoder = True
|
||||
has_vision_encoder = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Initialise the converter and normalise vision/audio hyper-parameters."""
    super().__init__(*args, **kwargs)
    assert self.hparams_vision is not None
    self.hparams_vision["image_size"] = 224  # unused, but set to avoid error

    # No audio hparams: mark this checkpoint as having no audio encoder.
    if not self.hparams_audio:
        self.has_audio_encoder = False
        return

    # remap audio hparams
    audio = self.hparams_audio
    audio["feat_in"] = audio.get("input_feat_size", 128)
    audio["intermediate_size"] = audio["hidden_size"] * 4
|
||||
|
||||
def set_gguf_parameters(self):
    """Write projector types and layer-norm epsilons for vision and audio."""
    super().set_gguf_parameters()
    writer = self.gguf_writer

    # vision params; the epsilon falls back to 1e-6 when "layer_norm_eps" is absent
    vision_eps = self.hparams.get("layer_norm_eps", 1e-6)
    writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
    writer.add_vision_attention_layernorm_eps(vision_eps)

    # audio params are only written when audio hparams are present
    if not self.hparams_audio:
        return
    writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
    writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
    writer.add_audio_attention_layernorm_eps(1e-5)
|
||||
|
||||
def is_audio_tensor(self, name: str) -> bool:
    """Return True when ``name`` contains ``audio_tower`` or ``embed_audio``."""
    audio_markers = ("audio_tower", "embed_audio")
    return any(marker in name for marker in audio_markers)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
    """Force F32 for selected tensors; otherwise defer to the parent class."""
    forced_f32 = gguf.GGMLQuantizationType.F32

    if self.is_audio_tensor(name):
        # NOTE(review): `and` binds tighter than `or`, so the original
        # condition means: ".conv" anywhere in the name, OR ("_conv" AND
        # ".weight"). The parentheses below only make that explicit. If the
        # intent was (".conv" or "_conv") AND ".weight", this needs a fix.
        matches_conv = ".conv" in name or ("_conv" in name and ".weight" in name)
        if matches_conv:
            return forced_f32

    # NOTE(review): this view of the file has lost its indentation; this check
    # is assumed to apply to every tensor, not only audio ones - confirm.
    if "position_embedding_table" in name:
        return forced_f32

    return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if name.startswith("model.language_model."):
|
||||
return # skip
|
||||
|
||||
if len(data_torch.shape) == 0:
|
||||
# convert scalar tensors (input/output_mix/max) to 1D tensors
|
||||
data_torch = data_torch.unsqueeze(0)
|
||||
@@ -0,0 +1,92 @@
|
||||
# Copyright © 2025 Apple Inc.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import mlx.core as mx
|
||||
import mlx.nn as nn
|
||||
from mlx.utils import tree_flatten, tree_unflatten
|
||||
|
||||
from . import gemma4_text
|
||||
from .base import BaseModelArgs
|
||||
|
||||
|
||||
@dataclass
class ModelArgs(BaseModelArgs):
    """Top-level Gemma 4 arguments wrapping the nested text configuration."""

    model_type: str = "gemma4"
    # None is replaced by an empty dict in __post_init__.
    text_config: dict = None
    vocab_size: int = 262144

    def __post_init__(self):
        """Fill in the text-config entries the text model needs."""
        if self.text_config is None:
            self.text_config = {}
        text_cfg = self.text_config

        # The top-level vocab size always overrides the nested one.
        text_cfg["vocab_size"] = self.vocab_size
        # Head counts are only defaulted when the config does not carry them.
        text_cfg.setdefault("num_attention_heads", 8)
        text_cfg.setdefault("num_key_value_heads", 1)
|
||||
|
||||
|
||||
class Model(nn.Module):
    """Text-only Gemma 4 wrapper that delegates to ``gemma4_text.Model``."""

    # Checkpoint key prefixes (after stripping "model.") that are discarded.
    _SKIPPED_PREFIXES = (
        "vision_tower",
        "multi_modal_projector",
        "audio_tower",
        "embed_audio",
        "embed_vision",
    )

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args
        self.model_type = args.model_type
        text_args = gemma4_text.ModelArgs.from_dict(args.text_config)
        self.language_model = gemma4_text.Model(text_args)

    def __call__(
        self,
        inputs: mx.array,
        cache=None,
        input_embeddings: Optional[mx.array] = None,
        per_layer_inputs: Optional[mx.array] = None,
    ):
        """Forward all arguments unchanged to the wrapped language model."""
        forward = self.language_model
        return forward(
            inputs,
            cache=cache,
            input_embeddings=input_embeddings,
            per_layer_inputs=per_layer_inputs,
        )

    def sanitize(self, weights):
        """Drop non-text weights and remap keys for the wrapped text model.

        Keys starting with ``model.`` lose that prefix. Keys that then start
        with one of the vision/audio prefixes are discarded. For keys that
        had the ``model.`` prefix and start with ``language_model``, every
        ``language_model.`` is rewritten to ``language_model.model.``. The
        result is passed through the language model's own ``sanitize``.
        """
        remapped = {}
        for key, value in weights.items():
            had_model_prefix = key.startswith("model.")
            key = key.removeprefix("model.")

            if key.startswith(self._SKIPPED_PREFIXES):
                continue

            if had_model_prefix and key.startswith("language_model"):
                key = key.replace("language_model.", "language_model.model.")

            remapped[key] = value

        return self.language_model.sanitize(remapped)

    @property
    def layers(self):
        """Layers of the wrapped language model."""
        return self.language_model.layers

    @property
    def quant_predicate(self):
        """Quantization predicate of the wrapped language model."""
        return self.language_model.quant_predicate

    def make_cache(self):
        """Return the cache built by the wrapped language model."""
        return self.language_model.make_cache()
|
||||
@@ -0,0 +1,60 @@
|
||||
from typing import Optional
|
||||
|
||||
import mlx.core as mx
|
||||
import mlx.nn as nn
|
||||
|
||||
from ..base import InputEmbeddingsFeatures
|
||||
from .audio import AudioEncoder
|
||||
from .config import ModelConfig
|
||||
from .language import LanguageModel, RMSNormNoScale
|
||||
from .vision import VisionModel
|
||||
|
||||
|
||||
def masked_scatter(input_tensor, mask, source):
    """Replace the masked positions of ``input_tensor`` with values from ``source``.

    All three arrays are flattened. The k-th set position of ``mask``
    receives element ``k % source.size`` of the flattened ``source``; other
    positions keep their value from ``input_tensor``. The result has the
    shape of ``input_tensor``.
    """
    flat_mask = mask.flatten().astype(mx.int32)
    # Running count of set mask entries, shifted to a zero-based index.
    running_index = mx.cumsum(flat_mask) - 1
    # Taken modulo source.size before the gather.
    gathered = source.flatten()[running_index % source.size]
    merged = mx.where(flat_mask, gathered, input_tensor.flatten())
    return merged.reshape(input_tensor.shape)
|
||||
|
||||
|
||||
class MultimodalEmbedder(nn.Module):
    """Projects soft tokens from vision/audio into language model space."""

    def __init__(self, embedding_dim: int, text_hidden_size: int, eps: float = 1e-6):
        """Create the bias-free projection and the scale-free pre-projection norm.

        Args:
            embedding_dim: Input feature size of the projection and the norm.
            text_hidden_size: Output feature size of the projection.
            eps: Epsilon passed to the norm.
        """
        super().__init__()
        projection = nn.Linear(embedding_dim, text_hidden_size, bias=False)
        self.embedding_projection = projection
        pre_norm = RMSNormNoScale(embedding_dim, eps=eps)
        self.embedding_pre_projection_norm = pre_norm

    def __call__(self, inputs_embeds: mx.array) -> mx.array:
        """Normalise ``inputs_embeds``, then project the result."""
        return self.embedding_projection(
            self.embedding_pre_projection_norm(inputs_embeds)
        )
|
||||
|
||||
|
||||
class Model(nn.Module):
|
||||
def __init__(self, config: ModelConfig):
|
||||
super().__init__()
|
||||
self.model_type = config.model_type
|
||||
self.config = config
|
||||
|
||||
# Text
|
||||
self.language_model = LanguageModel(config.text_config)
|
||||
self.vocab_size = config.text_config.vocab_size
|
||||
|
||||
# Vision
|
||||
self.vision_tower = VisionModel(config.vision_config)
|
||||
self.embed_vision = MultimodalEmbedder(
|
||||
embedding_dim=config.vision_config.hidden_size,
|
||||
text_hidden_size=config.text_config.hidden_size,
|
||||
eps=config.vision_config.rms_norm_eps,
|
||||
)
|
||||
|
||||
# Audio
|
||||
if config.audio_config is not None:
|
||||
self.audio_tower = AudioEncoder(config.audio_config)
|
||||
audio_output_dim = (
|
||||
config.audio_config.output_proj_dims or config.audio_config.hidden_size
|
||||
)
|
||||
@@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright 2025 The vLLM team.
|
||||
# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
#
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Gemma 4 model implementation for vLLM."""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from dataclasses import replace
|
||||
from itertools import islice
|
||||
|
||||
import regex as re
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import CacheConfig, VllmConfig
|
||||
from vllm.distributed import (
|
||||
get_pp_group,
|
||||
get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size,
|
||||
)
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.activation import GeluAndMul
|
||||
from vllm.model_executor.layers.attention import Attention
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
ReplicatedLinear,
|
||||
RowParallelLinear,
|
||||
)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
ParallelLMHead,
|
||||
VocabParallelEmbedding,
|
||||
)
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
default_weight_loader,
|
||||
maybe_remap_kv_scale_name,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.v1.attention.backends.utils import KVSharingFastPrefillMetadata
|
||||
|
||||
from .interfaces import (
|
||||
EagleModelMixin,
|
||||
MixtureOfExperts,
|
||||
SupportsEagle3,
|
||||
SupportsLoRA,
|
||||
SupportsPP,
|
||||
)
|
||||
from .utils import (
|
||||
AutoWeightsLoader,
|
||||
WeightsMapper,
|
||||
extract_layer_index,
|
||||
is_pp_missing_parameter,
|
||||
make_layers,
|
||||
maybe_prefix,
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def _get_text_config(config):
|
||||
"""Dereference text_config if config is a nested Gemma4Config.
|
||||
|
||||
Gemma4 checkpoints use architectures=["Gemma4ForConditionalGeneration"]
|
||||
which yields a Gemma4Config with nested text_config. This function
|
||||
transparently returns the text config regardless of nesting.
|
||||
"""
|
||||
if hasattr(config, "text_config"):
|
||||
return config.text_config
|
||||
@@ -0,0 +1,80 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Gemma 4 multimodal model (image + audio + video support).
|
||||
|
||||
Adds vision tower, audio tower, and multimodal embedders on top of the
|
||||
text-only Gemma4ForCausalLM. The vision/audio encoders are loaded via
|
||||
AutoModel.from_config and run in eager mode while the language model uses
|
||||
the vLLM-optimized path.
|
||||
|
||||
Video support: Gemma4 does **not** have a native video tower. Videos are
|
||||
decomposed into timestamped image frames (up to 32 frames at 70 soft tokens
|
||||
each) and fed through the same vision tower as regular images. The
|
||||
processor inserts ``mm:ss`` timestamps between frames so the model can
|
||||
reason about temporal order.
|
||||
"""
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Annotated, Any, Literal
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image as PILImage
|
||||
from torch import nn
|
||||
from transformers import AutoModel, BatchFeature
|
||||
from transformers.models.gemma4 import (
|
||||
Gemma4Config,
|
||||
Gemma4Processor,
|
||||
Gemma4VisionConfig,
|
||||
)
|
||||
from transformers.models.gemma4.configuration_gemma4 import (
|
||||
Gemma4AudioConfig,
|
||||
Gemma4TextConfig,
|
||||
)
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
|
||||
from vllm.inputs import MultiModalDataDict
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargsItems,
|
||||
VideoItem,
|
||||
)
|
||||
from vllm.multimodal.parse import (
|
||||
AudioProcessorItems,
|
||||
ImageProcessorItems,
|
||||
MultiModalDataItems,
|
||||
MultiModalDataParser,
|
||||
)
|
||||
from vllm.multimodal.processing import BaseDummyInputsBuilder
|
||||
from vllm.multimodal.processing.processor import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import (
|
||||
MultiModalEmbeddings,
|
||||
SupportsEagle3,
|
||||
SupportsLoRA,
|
||||
SupportsMultiModal,
|
||||
SupportsPP,
|
||||
)
|
||||
from .utils import (
|
||||
AutoWeightsLoader,
|
||||
WeightsMapper,
|
||||
init_vllm_registered_model,
|
||||
maybe_prefix,
|
||||
)
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
# Source: vllm-project/vllm main branch — vllm/model_executor/models/registry.py
|
||||
# Verified 2026-04-18 via GitHub API.
|
||||
|
||||
# Line 99 (text-only Gemma 4 CausalLM):
|
||||
"Gemma4ForCausalLM": ("gemma4", "Gemma4ForCausalLM"),
|
||||
|
||||
# Line 230 (multimodal Gemma 4: vision + audio + video):
"Gemma4ForConditionalGeneration": ("gemma4_mm", "Gemma4ForConditionalGeneration"),

# The second (_mm) registration maps the Gemma4ForConditionalGeneration architecture
# name -> gemma4_mm.Gemma4ForConditionalGeneration,
|
||||
# which wires in:
|
||||
# - vision_tower (pixel_values, pixel_position_ids)
|
||||
# - audio_tower (input_features_padded, input_features_mask) [E2B/E4B only]
|
||||
# - video path (pixel_values_videos — decomposed to frames, up to 32 frames @ 70 soft tokens)
|
||||
#
|
||||
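# vLLM picks the implementation from the architecture name in the HF config (`architectures`);
# the audio tower is presumably only built when the HF config has audio_config populated (verify upstream).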
|
||||
Reference in New Issue
Block a user