#!/usr/bin/env bash
# Canonical one-liners to serve Gemma 4 across inference frameworks.
# Verified against upstream repos / model cards on 2026-04-18.
# Not meant to be executed as a script — each block is a standalone example.
# Copy the single command you need into your shell; do not run this file
# top to bottom (the serve commands block in the foreground).

### 1. vLLM — full multimodal (text + vision + audio + video) ###
# Text-only 31B dense:
vllm serve google/gemma-4-31b-it --tensor-parallel-size 2
# Multimodal E4B (vision + audio).
# --limit-mm-per-prompt is documented as a JSON object mapping modality to the
# max item count per prompt; the deprecated `image=4,audio=1` list form is
# not used here:
vllm serve google/gemma-4-E4B-it --limit-mm-per-prompt '{"image":4,"audio":1}'
# NVFP4-quantized 31B on Blackwell/H100 (NVIDIA's official quant):
vllm serve nvidia/Gemma-4-31B-IT-NVFP4 --quantization modelopt --tensor-parallel-size 8

### 2. llama.cpp — official ggml-org GGUFs ###
# Text-only via -hf shortcut (auto-download, default = Q4_K_M if multiple present):
llama-server -hf ggml-org/gemma-4-E4B-it-GGUF
# Choose a specific quant:
llama-server -hf ggml-org/gemma-4-26b-a4b-it-GGUF:Q4_K_M
# Vision (+ audio for E-series) — point at the projector explicitly.
# --mmproj only accepts a local file path, so a Hugging Face repo path must be
# passed as a URL via --mmproj-url. (With -hf, llama-server also fetches the
# repo's projector automatically when one is published; --no-mmproj disables
# that.)
llama-server -hf ggml-org/gemma-4-E4B-it-GGUF \
  --mmproj-url https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF/resolve/main/mmproj-gemma-4-E4B-it-Q8_0.gguf
# Convert a new HF checkpoint to GGUF yourself:
python convert_hf_to_gguf.py /path/to/google/gemma-4-31b-it --outfile gemma-4-31b.gguf

### 3. Apple MLX — text via mlx-lm, multimodal via mlx-vlm (community) ###
# Text generation (mlx-lm, first-party Apple):
mlx_lm.generate --model mlx-community/gemma-4-E4B-it-4bit --prompt "Hello"
# Vision/audio (mlx-vlm, Prince Canuma / community):
mlx_vlm.generate --model mlx-community/gemma-4-E4B-it-8bit \
  --image https://example.com/cat.jpg --prompt "Describe this image."

### 4. Keras / keras-hub — reference implementation, training-focused ###
# python:
#   import keras_hub
#   model = keras_hub.models.Gemma4CausalLM.from_preset("gemma4_instruct_4b")
#   model.generate("Hello", max_length=128)
# Presets: gemma4_{2b,4b,26b_a4b,31b} and gemma4_instruct_{...}

### 5. Text Generation Inference (TGI) — NO native Gemma 4 support as of 2026-04-18 ###
# Upstream supported_models list stops at Gemma 3 / Gemma 3 Text.
# Fallback: TGI will try AutoModelForCausalLM without optimized kernels —
# expect degraded throughput and no guarantee of vision/audio paths.
text-generation-launcher --model-id google/gemma-4-31b-it  # unoptimized fallback

### 6. TensorRT-LLM — NOT supported ###
# Support matrix (2026-04) lists Gemma2 and Gemma3{ForCausalLM,ForConditionalGeneration}
# but NOT Gemma4. NVIDIA's own nvidia/Gemma-4-31B-IT-NVFP4 card points users to vLLM.
# Issue #12764 tracks DGX Spark runtime skew. Avoid for production Gemma 4.

### 7. Gemini API (Google AI Studio) — hosted Gemma 4 ###
# ${GEMINI_API_KEY:?...} aborts the command with a message when the key is
# unset or empty, instead of sending a request with a blank API-key header.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemma-4-26b-a4b-it:generateContent" \
  -H 'Content-Type: application/json' \
  -H "x-goog-api-key: ${GEMINI_API_KEY:?set GEMINI_API_KEY first}" \
  -X POST \
  -d '{"contents":[{"parts":[{"text":"Your prompt here"}]}]}'
# Python SDK (google-genai):
#   from google import genai
#   client = genai.Client()
#   resp = client.models.generate_content(model="gemma-4-26b-a4b-it", contents="Hi")
#   print(resp.text)
# Hosted model IDs: gemma-4-31b-it, gemma-4-26b-a4b-it

### 8. Vertex AI Model Garden — one-click deploy ###
# Console: https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemma4
# CLI (new model-garden command):
gcloud ai model-garden models list | grep gemma-4
# Python SDK (vertex-ai-model-garden):
#   from google.cloud.aiplatform import model_garden
#   model = model_garden.OpenModel("google/gemma4@gemma-4-31b-it")
#   endpoint = model.deploy()  # spins up Vertex endpoint with backing GPUs