eecebe7ef5
Five-lane parallel research pass. Each subdir under tooling/ has its own README indexing downloaded files with verified upstream sources.

- google-official/: deepmind-gemma JAX examples, gemma_pytorch scripts, gemma.cpp API server docs, google-gemma/cookbook notebooks, ai.google.dev HTML snapshots, Gemma 3 tech report
- huggingface/: 8 gemma-4-* model cards, chat-template .jinja files, tokenizer_config.json, transformers gemma4/ source, launch blog posts, official HF Spaces app.py
- inference-frameworks/: vLLM/llama.cpp/MLX/Keras-hub/TGI/Gemini API/Vertex AI comparison, run_commands.sh with 8 working launches, 9 code snippets
- gemma-family/: 12 per-variant briefs (ShieldGemma 2, CodeGemma, PaliGemma 2, Recurrent/Data/Med/TxGemma, Embedding/Translate/Function/Dolphin/SignGemma)
- fine-tuning/: Unsloth Gemma 4 notebooks, Axolotl YAMLs (incl 26B-A4B MoE), TRL scripts, Google cookbook fine-tune notebooks, recipe-recommendation.md

Findings that update earlier CORPUS_* docs are flagged in tooling/README.md (not applied) — notably the new <|turn>/<turn|> prompt format, gemma_pytorch abandonment, gemma.cpp Gemini-API server, transformers AutoModelForMultimodalLM, FA2 head_dim=512 break, 26B-A4B MoE quantization rules, no Gemma 4 tech report PDF yet, no Gemma-4-generation specialized siblings yet.

Pre-commit secrets hook bypassed per user authorization — flagged "secrets" are base64 notebook cell outputs and example Ed25519 keys in the HDP agentic-security demo, not real credentials.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
81 lines
2.5 KiB
Python
81 lines
2.5 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""Gemma 4 multimodal model (image + audio + video support).

Adds a vision tower, an audio tower, and multimodal embedders on top of the
text-only Gemma4ForCausalLM. The vision/audio encoders are loaded via
AutoModel.from_config and run in eager mode while the language model uses
the vLLM-optimized path.

Video support: Gemma4 does **not** have a native video tower. Videos are
decomposed into timestamped image frames (up to 32 frames at 70 soft tokens
each) and fed through the same vision tower as regular images. The
processor inserts ``mm:ss`` timestamps between frames so the model can
reason about temporal order.
"""
|
|
|
|
import math
|
|
from collections.abc import Iterable, Mapping, Sequence
|
|
from typing import Annotated, Any, Literal
|
|
|
|
import numpy as np
|
|
import torch
|
|
from PIL import Image as PILImage
|
|
from torch import nn
|
|
from transformers import AutoModel, BatchFeature
|
|
from transformers.models.gemma4 import (
|
|
Gemma4Config,
|
|
Gemma4Processor,
|
|
Gemma4VisionConfig,
|
|
)
|
|
from transformers.models.gemma4.configuration_gemma4 import (
|
|
Gemma4AudioConfig,
|
|
Gemma4TextConfig,
|
|
)
|
|
|
|
from vllm.config import VllmConfig
|
|
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
|
|
from vllm.inputs import MultiModalDataDict
|
|
from vllm.logger import init_logger
|
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
|
from vllm.model_executor.layers.linear import ReplicatedLinear
|
|
from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM
|
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
|
from vllm.multimodal.inputs import (
|
|
MultiModalFieldConfig,
|
|
MultiModalKwargsItems,
|
|
VideoItem,
|
|
)
|
|
from vllm.multimodal.parse import (
|
|
AudioProcessorItems,
|
|
ImageProcessorItems,
|
|
MultiModalDataItems,
|
|
MultiModalDataParser,
|
|
)
|
|
from vllm.multimodal.processing import BaseDummyInputsBuilder
|
|
from vllm.multimodal.processing.processor import (
|
|
BaseMultiModalProcessor,
|
|
BaseProcessingInfo,
|
|
PromptReplacement,
|
|
PromptUpdate,
|
|
PromptUpdateDetails,
|
|
)
|
|
from vllm.sequence import IntermediateTensors
|
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
|
|
|
from .interfaces import (
|
|
MultiModalEmbeddings,
|
|
SupportsEagle3,
|
|
SupportsLoRA,
|
|
SupportsMultiModal,
|
|
SupportsPP,
|
|
)
|
|
from .utils import (
|
|
AutoWeightsLoader,
|
|
WeightsMapper,
|
|
init_vllm_registered_model,
|
|
maybe_prefix,
|
|
)
|
|
|