gemma4-research/tooling/inference-frameworks/run_commands.sh

#!/usr/bin/env bash
# Canonical one-liners to serve Gemma 4 across inference frameworks.
# Verified against upstream repos / model cards on 2026-04-18.
# Not meant to be executed as a script — each block is a standalone example.

### 1. vLLM — full multimodal (text + vision + audio + video) ###
# Text-only 31B dense, tensor-parallel across 2 GPUs:
vllm serve google/gemma-4-31b-it --tensor-parallel-size 2
# Multimodal E4B (vision + audio). Current vLLM documents --limit-mm-per-prompt
# as a JSON string (the older image=4,audio=1 key=value form is legacy), so the
# value is single-quoted to stop the shell from interpreting braces/quotes:
vllm serve google/gemma-4-E4B-it --limit-mm-per-prompt '{"image": 4, "audio": 1}'
# NVFP4-quantized 31B on Blackwell/H100 (NVIDIA's official quant):
vllm serve nvidia/Gemma-4-31B-IT-NVFP4 --quantization modelopt --tensor-parallel-size 8

### 2. llama.cpp — official ggml-org GGUFs ###
# -hf shortcut: auto-downloads the model (default = Q4_K_M if multiple present).
# If the repo ships a multimodal projector (mmproj), -hf fetches it as well:
llama-server -hf ggml-org/gemma-4-E4B-it-GGUF
# Text-only: skip the automatic projector download/load:
llama-server -hf ggml-org/gemma-4-E4B-it-GGUF --no-mmproj
# Choose a specific quant:
llama-server -hf ggml-org/gemma-4-26b-a4b-it-GGUF:Q4_K_M
# Vision (+ audio for E-series) with an explicit projector. --mmproj takes a
# LOCAL file path (an HF "repo/file" string is not resolved), so download the
# projector first and point at it on disk:
llama-server -hf ggml-org/gemma-4-E4B-it-GGUF \
  --mmproj /path/to/mmproj-gemma-4-E4B-it-Q8_0.gguf
# Convert a new HF checkpoint to GGUF yourself:
python convert_hf_to_gguf.py /path/to/google/gemma-4-31b-it --outfile gemma-4-31b.gguf

### 3. Apple MLX — text via mlx-lm, multimodal via mlx-vlm (community) ###
# mlx-lm (Apple, first-party) covers plain text generation:
mlx_lm.generate \
  --prompt 'Hello' \
  --model mlx-community/gemma-4-E4B-it-4bit
# mlx-vlm (community, Prince Canuma) covers vision/audio inputs:
mlx_vlm.generate \
  --prompt 'Describe this image.' \
  --image 'https://example.com/cat.jpg' \
  --model mlx-community/gemma-4-E4B-it-8bit

### 4. Keras / keras-hub — reference implementation, training-focused ###
# python:
# import keras_hub
# model = keras_hub.models.Gemma4CausalLM.from_preset("gemma4_instruct_4b")
# model.generate("Hello", max_length=128)
# Presets: gemma4_{2b,4b,26b_a4b,31b} and gemma4_instruct_{...}

### 5. Text Generation Inference (TGI) — NO native Gemma 4 support as of 2026-04-18 ###
# The upstream supported_models list ends at Gemma 3 / Gemma 3 Text, so the
# launch below is the unoptimized fallback: TGI will try AutoModelForCausalLM
# without its optimized kernels. Expect degraded throughput, and do not count
# on the vision/audio paths working.
text-generation-launcher --model-id=google/gemma-4-31b-it

### 6. TensorRT-LLM — NOT supported ###
# Support matrix (2026-04) lists Gemma2 and Gemma3{ForCausalLM,ForConditionalGeneration}
# but NOT Gemma4. NVIDIA's own nvidia/Gemma-4-31B-IT-NVFP4 card points users to vLLM.
# Issue #12764 tracks DGX Spark runtime skew. Avoid for production Gemma 4.

### 7. Gemini API (Google AI Studio) — hosted Gemma 4 ###
# Sends a single text prompt to the generateContent endpoint as a JSON POST.
# The API key is read from $GEMINI_API_KEY and passed in the x-goog-api-key header.
curl --request POST \
  --header 'Content-Type: application/json' \
  --header "x-goog-api-key: ${GEMINI_API_KEY}" \
  --data '{"contents":[{"parts":[{"text":"Your prompt here"}]}]}' \
  'https://generativelanguage.googleapis.com/v1beta/models/gemma-4-26b-a4b-it:generateContent'
# Python SDK (google-genai):
# from google import genai
# client = genai.Client()
# resp = client.models.generate_content(model="gemma-4-26b-a4b-it", contents="Hi")
# print(resp.text)
# Hosted model IDs: gemma-4-31b-it, gemma-4-26b-a4b-it

### 8. Vertex AI Model Garden — one-click deploy ###
# Console: https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemma4
# CLI (new model-garden command): list Model Garden models and keep only the
# output lines that mention gemma-4.
gcloud ai model-garden models list \
  | grep -F -- 'gemma-4'
# Python SDK (google-cloud-aiplatform, `vertexai` namespace):
# from vertexai import model_garden
# model = model_garden.OpenModel("google/gemma4@gemma-4-31b-it")
# endpoint = model.deploy()   # spins up Vertex endpoint with backing GPUs