From 22af59756f62ae1c839f3bc9b23dc8817707012b Mon Sep 17 00:00:00 2001 From: Mortdecai Date: Mon, 20 Apr 2026 05:47:41 -0400 Subject: [PATCH] docs: remove V100 from GPU bakeoff V100 data was degraded by SDXL co-residence on CT 167 (31/32 GB VRAM occupied, Gemma 4 models forced 95% onto CPU). Rather than ship a prominent caveat, drop the V100 column entirely so the doc reports only apples-to-apples measurements. V100 can be added back once an isolated run is possible. Removed: V100 column from TL;DR and per-model tables, hardware row, caveat section, and associated raw JSONs under runs/pve197/. Harness config keeps pve197 in HOSTS for future re-runs. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- docs/reference/gpu-bakeoff-2026-04-20.md | 125 +++++------------- .../runs/pve197/gemma4-26b/long.json | 81 ------------ .../runs/pve197/gemma4-26b/short.json | 81 ------------ .../runs/pve197/gemma4-31b/long.json | 81 ------------ .../runs/pve197/gemma4-31b/short.json | 81 ------------ 6 files changed, 31 insertions(+), 420 deletions(-) delete mode 100644 scripts/gpu-bakeoff/runs/pve197/gemma4-26b/long.json delete mode 100644 scripts/gpu-bakeoff/runs/pve197/gemma4-26b/short.json delete mode 100644 scripts/gpu-bakeoff/runs/pve197/gemma4-31b/long.json delete mode 100644 scripts/gpu-bakeoff/runs/pve197/gemma4-31b/short.json diff --git a/README.md b/README.md index 99ce858..306126a 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Research corpus and implementation guidance for Google Gemma 4, based on product | `docs/openwebui-setup.md` | How to configure Gemma 4 inside OpenWebUI — per-setting reference, two ready-to-bake Workspace Model profiles (chat + extract), and a symptom→cause troubleshooting table mapped back to GOTCHAS.md. Assumes Ollama + OpenWebUI are already running. 
| When setting up or debugging a Gemma 4 model in OpenWebUI, or handing the front-end config to someone else | | `docs/reference/bakeoff-2026-04-18.md` | CLI-coding-agent bakeoff on 3090 Ti. **Rounds 1/2 misidentified the cause; Round 3 (the correct one): `think: false` silent-stops gemma4:26b at certain multi-turn states on 32K context.** 31B and Qwen3-Coder robust to the flag. Harness at `scripts/bakeoff/` | When deciding which model to back a CLI agent with, writing a custom agent payload, or debugging a silent tool-call halt | | `docs/reference/mort-bakeoff-2026-04-18.md` | mort-bot-specific `think=true` vs `think=false` bakeoff on mort's actual loop shape (gemma4:26b, num_ctx=8192). **Thinking does NOT accumulate in context on Ollama 0.20.4** — strips it from serialized history. Both settings behave identically on step counts, tool counts, wall clock. Harness at `scripts/mort-bakeoff/` | When deciding mort-bot's THINK env var, or when someone claims "think=true eats context" without pinning an Ollama version | -| `docs/reference/gpu-bakeoff-2026-04-20.md` | Cross-GPU throughput bakeoff: steel141 RTX 3090 Ti vs pve197 V100 vs matt-strix (AMD Strix Halo). **3090 Ti wins decode decisively (128 tok/s on 26B MoE). Strix gets ~42% of that on ~25% of the bandwidth. V100 numbers are degraded because SDXL on CT 167 occupies 31/32 GB of its VRAM.** Also quantifies the MoE vs dense gap: 26B decodes ~4.7× faster than 31B on every card. Harness at `scripts/gpu-bakeoff/` | When choosing which host to run a Gemma 4 workload on, or deciding whether the V100 needs isolated for a given job | +| `docs/reference/gpu-bakeoff-2026-04-20.md` | Cross-GPU throughput bakeoff: steel141 RTX 3090 Ti vs matt-strix (AMD Strix Halo). **3090 Ti wins decode decisively (128 tok/s on 26B MoE). Strix gets ~42% of that on ~25% of the bandwidth.** Also quantifies the MoE vs dense gap: 26B decodes ~4.7–5.1× faster than 31B across the two cards. 
Harness at `scripts/gpu-bakeoff/` | When choosing which host to run a Gemma 4 workload on | | `tooling/` | **Canonical upstream tooling** — real scripts, notebooks, model cards, and configs pulled from Google / HF / framework maintainers (147 files). Subdirs: `google-official/`, `huggingface/`, `inference-frameworks/`, `gemma-family/`, `fine-tuning/`. See `tooling/README.md` for index and findings that update the older `CORPUS_*` docs | When you need authoritative source material — model cards, chat templates, fine-tuning recipes, serving commands for vLLM / llama.cpp / MLX, or to scope a specialized sibling (ShieldGemma, EmbeddingGemma, etc.) | ## Source Projects diff --git a/docs/reference/gpu-bakeoff-2026-04-20.md b/docs/reference/gpu-bakeoff-2026-04-20.md index 5f1e6f7..2706d9d 100644 --- a/docs/reference/gpu-bakeoff-2026-04-20.md +++ b/docs/reference/gpu-bakeoff-2026-04-20.md @@ -1,7 +1,7 @@ -# GPU Bakeoff — Gemma 4 Throughput Across Three Architectures +# GPU Bakeoff — Gemma 4 Throughput: 3090 Ti vs Strix Halo **Date:** 2026-04-20 -**Host matrix:** steel141 (RTX 3090 Ti) · pve197 CT 105 (Tesla V100) · matt-strix (AMD Strix Halo iGPU) +**Host matrix:** steel141 (RTX 3090 Ti) · matt-strix (AMD Strix Halo iGPU) **Models:** `gemma4:26b` (MoE Q4_K_M) · `gemma4:31b-it-q4_K_M` (dense Q4_K_M) **Harness:** `scripts/gpu-bakeoff/harness.py` **Raw data:** `scripts/gpu-bakeoff/runs/` @@ -14,12 +14,6 @@ |-----|------------------|--------------------|-----------------------| | **RTX 3090 Ti** (steel141) | **128 tok/s** | **27 tok/s** | **23,849 tok/s** | | **AMD Strix Halo iGPU** (matt-strix) | 54 tok/s (42%) | 11 tok/s (39%) | 14,326 tok/s (60%) | -| **Tesla V100** (pve197) ⚠ | 8 tok/s (6%) | 1.6 tok/s (6%) | 2,696 tok/s (11%) | - -> ⚠ **V100 numbers reflect degraded conditions — SDXL on CT 167 occupies -> 31.7 / 32.7 GB VRAM, forcing Ollama's Gemma 4 models 95% onto CPU.** -> Under isolation, V100 should land between 3090 Ti and Strix based on -> raw specs (HBM2 ~900 
GB/s). See § "V100 caveat" for the evidence. ### Headline findings @@ -33,10 +27,6 @@ 3. **Strix Halo punches above its bandwidth.** Gets 42 % of 3090 Ti decode speed on only ~25 % of the memory bandwidth (~256 GB/s vs ~1008 GB/s) — good SIMD utilization, especially on the MoE model. -4. **V100 is held back by shared VRAM.** Its spec should put it closer - to 3090 Ti than to Strix, but coresident SDXL crowds out Ollama's - layer offload. The V100 column in this doc is an *as-is* reading, - not a *peak-capability* reading. --- @@ -45,7 +35,6 @@ | Host | GPU | VRAM | Bandwidth | Compute cap | Notes | |------|-----|------|-----------|-------------|-------| | steel141 | RTX 3090 Ti | 24 GB GDDR6X | ~1008 GB/s | 8.6 (Ampere) | Seth's workstation. Also has a GTX 1660 SUPER as aux display card — not used for inference. Ollama on 127.0.0.1:11434. | -| pve197 CT 105 | Tesla V100-PCIE-32GB | 32 GB HBM2 | ~900 GB/s | 7.0 (Volta) | LXC with GPU passthrough. Ollama on 192.168.0.179:11434. **Coresident with CT 167 ai-visualizer (SDXL) which consumes most of the VRAM.** | | matt-strix | AMD Strix Halo (Radeon 890M iGPU + XDNA 2 NPU) | Shared LPDDR5X | ~256 GB/s | — | Unified memory lets it fit models a 24 GB card can't. Ollama on 100.117.155.64:11434 via Tailscale. | --- @@ -65,10 +54,6 @@ jitter are excluded from the rates. - Median of the 3 measurement runs is reported in tables; min/max are in the raw JSON. -- **No network-introduced variance** — all three hosts exposed HTTP - Ollama endpoints (matt-strix via Tailscale). The timings reported are - computed server-side from `prompt_eval_count / prompt_eval_duration` - and `eval_count / eval_duration`. --- @@ -79,10 +64,10 @@ Decode is the metric that matters most for interactive LLM use — it's the speed of token generation after the prompt has been processed. 
-| Model | 3090 Ti | V100 ⚠ | Strix Halo | -|-------|---------|-------|------------| -| gemma4:26b (MoE, ~4 B active) | **128.20** | 8.34 | 53.86 | -| gemma4:31b (dense, 31.3 B active) | **27.15** | 1.55 | 10.64 | +| Model | 3090 Ti | Strix Halo | +|-------|---------|------------| +| gemma4:26b (MoE, ~4 B active) | **128.20** | 53.86 | +| gemma4:31b (dense, 31.3 B active) | **27.15** | 10.64 | ### Prefill rate (tok/s, long ~500-token prompt, median) @@ -91,52 +76,20 @@ before decode begins. Batched per-token, so short-prompt prefill numbers are noisy (dominated by fixed overhead — see raw JSON for those); the long-prompt numbers below are the ones to reason from. -| Model | 3090 Ti | V100 ⚠ | Strix Halo | -|-------|---------|-------|------------| -| gemma4:26b (long) | **23,849** | 2,696 | 14,326 | -| gemma4:31b (long) | **7,716** | 436 | 3,278 | +| Model | 3090 Ti | Strix Halo | +|-------|---------|------------| +| gemma4:26b (long) | **23,849** | 14,326 | +| gemma4:31b (long) | **7,716** | 3,278 | ### Short-prompt prefill (for reference) On a 15-token prompt, prefill tokens/sec is meaningless — prompt is too small to amortize overhead. Included only to confirm no regression. -| Model | 3090 Ti | V100 ⚠ | Strix Halo | -|-------|---------|-------|------------| -| gemma4:26b (short) | 2,063 | 240 | 1,276 | -| gemma4:31b (short) | 661 | 41 | 292 | - ---- - -## V100 caveat — why the numbers are degraded - -Mid-bakeoff I probed `GET /api/ps` on pve197 while the V100's Q8 MoE was -loaded: - -``` -gemma4:26b-a4b-it-q8_0 size: 30.5 GB size_vram: 1.57 GB -``` - -**Only 1.57 GB of the 30.5 GB model is actually resident on the V100;** -the other 28.9 GB is running on CPU via Ollama's CPU-offload fallback. -`nvidia-smi` corroborated: 31,754 / 32,768 MiB used, 0 % utilization -at probe time. That remaining ~29 GB of VRAM isn't free — it's held by -the SDXL pipeline on CT 167 (claude-avatar + ai-visualizer). 
- -Impact on every V100 number in this doc: -- `gemma4:26b` Q4_K_M is 18 GB — doesn't fit in the ~1 GB of headroom - SDXL leaves, so it runs largely on CPU. Observed 8.3 tok/s is - consistent with CPU inference of a MoE 26B Q4 model. -- `gemma4:31b` Q4_K_M is 19.9 GB — same fate. Observed 1.55 tok/s is - consistent with dense 31B on CPU (dense kills you on CPU; only - ~4 B params activate on the MoE, so the MoE suffers less). -- The Q8 variant (28 GB) never had a chance on the V100 while SDXL is - loaded. Bakeoff did not attempt it. - -**To get isolated V100 numbers**, stop SDXL on CT 167 (or stop CT 167 -entirely) and re-run `scripts/gpu-bakeoff/harness.py --host pve197`. -Left as a follow-up — whether that's worth the ai-visualizer -interruption is a judgment call. See "Open questions" below. +| Model | 3090 Ti | Strix Halo | +|-------|---------|------------| +| gemma4:26b (short) | 2,063 | 1,276 | +| gemma4:31b (short) | 661 | 292 | --- @@ -162,10 +115,10 @@ smaller. For interactive chat this is decisive: Seth's `mort-bot` running `gemma4:26b` gets ~4.7× the responsiveness it would on `gemma4:31b`, even though the models are near-equal in total params. -Why the ratio holds on every GPU: **memory bandwidth is the bottleneck** -across all three cards. Strix gets 42 % of 3090 Ti on 26B and 39 % of -3090 Ti on 31B — identical ratios — because it has ~25 % of the -bandwidth and matches or exceeds proportionally. +Why the ratio holds on both GPUs: **memory bandwidth is the bottleneck** +on both cards. Strix gets 42 % of 3090 Ti decode on 26B and 39 % on 31B — +nearly identical ratios — because with ~25 % of the bandwidth it matches +or slightly exceeds its bandwidth-proportional share on both models. --- @@ -176,44 +129,31 @@ bandwidth and matches or exceeds proportionally. comfortable for real-time responses. - Fallback: **Strix Halo** — 54 tok/s is usable. Benefit is unified memory can host larger models the 24 GB 3090 Ti can't. 
- - Avoid: V100 *while SDXL is coresident.* Without SDXL it should be - competitive. **Long-context / prompt-heavy workloads (prefill-heavy).** - Primary: **3090 Ti** again — 23,849 tok/s prefill means a 500-token prompt ingests in ~21 ms. - Strix at 14,326 tok/s is ~35 ms — still interactive. -**Running models that don't fit elsewhere.** - - Strix Halo. Unified LPDDR5X can hold 80 GB+ models that 24 GB and - 32 GB discrete cards can't — at the cost of lower bandwidth. +**Running models that don't fit on discrete cards.** + - Strix Halo. Unified LPDDR5X can hold 80 GB+ models that a 24 GB + 3090 Ti can't — at the cost of lower bandwidth. - The largest model tested here (`gemma4:31b` Q4 at 19.9 GB) fits - all three. Q8 variants (28 GB+) only fit the V100 and Strix. + both. Q8 variants (28 GB+) only fit Strix in this matrix. **Fine-tuning / training.** - Not measured here. 3090 Ti's 24 GB limits batch size on 20 B+ - models; V100's 32 GB HBM2 is much more forgiving *if* isolated. + models. --- ## Open questions / follow-ups -1. **Isolated V100 re-run.** Stop SDXL, re-run the harness. Expected - outcome: V100 decode lands between 3090 Ti and Strix (probably - ~70-90 tok/s on 26B given HBM2 bandwidth ~900 GB/s vs 3090 Ti's - ~1008 GB/s). That would settle the V100's actual rank. -2. **V100 Q8 baseline.** `gemma4:26b-a4b-it-q8_0` (28 GB) is the Q8 - MoE variant Seth pulled on pve197 — worth measuring once isolated. - Q8 vs Q4 quality/speed tradeoff for the same model would be useful. -3. **Strix max-model fit.** Strix can probably host models that - wouldn't fit the discrete cards. A follow-up would pull a larger - model (70 B+ quantized) on matt-strix and see the Strix-only - performance ceiling. -4. **Contention behavior.** The V100 finding generalizes — whenever - the homelab is running coresident AI workloads, Gemma 4 inference - falls off a cliff. 
A "contention-aware routing" decision (don't - send latency-sensitive Ollama traffic to a card with SDXL running) - may be worth building into the mort-bot / openwebui gateway. +1. **Strix max-model fit.** Strix can host models that wouldn't fit the + 3090 Ti. A follow-up would pull a larger model (70 B+ quantized) on + matt-strix and measure the Strix-only performance ceiling. +2. **Q8 vs Q4 on Strix.** Same model, two quantizations — quality/speed + tradeoff characterization. --- @@ -225,15 +165,10 @@ All per-run JSON traces are under `scripts/gpu-bakeoff/runs/`: runs/ ├── steel141/ │ ├── gemma4-26b/{short,long}.json -│ ├── gemma4-31b/{short,long}.json -│ └── gemma4-26b-q8/{short,long}.json # skipped — model not on host -├── pve197/ -│ ├── gemma4-26b/{short,long}.json # ⚠ degraded, see caveat -│ └── gemma4-31b/{short,long}.json # ⚠ degraded, see caveat +│ └── gemma4-31b/{short,long}.json └── matt-strix/ ├── gemma4-26b/{short,long}.json - ├── gemma4-31b/{short,long}.json - └── gemma4-26b-q8/{short,long}.json # skipped — model not on host + └── gemma4-31b/{short,long}.json ``` Each JSON contains the warmup call and all 3 measurement calls with diff --git a/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/long.json b/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/long.json deleted file mode 100644 index ae93a45..0000000 --- a/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/long.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "host": "pve197", - "gpu": "Tesla V100-PCIE-32GB", - "vram_gb": 32, - "model_alias": "gemma4:26b", - "model_tag": "gemma4:26b", - "prompt_key": "long", - "prompt_chars": 1614, - "num_predict": 256, - "num_ctx": 4096, - "runs": [ - { - "prompt_tokens": 318, - "prompt_eval_ms": 118.0, - "prefill_tok_per_s": 2695.59, - "output_tokens": 256, - "eval_ms": 32720.5, - "decode_tok_per_s": 7.82, - "load_ms": 475.8, - "total_ms": 33548.1, - "harness_wall_s": 33.555, - "done_reason": "length" - }, - { - "prompt_tokens": 318, - "prompt_eval_ms": 118.3, - "prefill_tok_per_s": 
2689.01, - "output_tokens": 256, - "eval_ms": 31273.0, - "decode_tok_per_s": 8.19, - "load_ms": 492.5, - "total_ms": 32116.6, - "harness_wall_s": 32.123, - "done_reason": "length" - }, - { - "prompt_tokens": 318, - "prompt_eval_ms": 117.3, - "prefill_tok_per_s": 2711.41, - "output_tokens": 256, - "eval_ms": 33434.9, - "decode_tok_per_s": 7.66, - "load_ms": 496.0, - "total_ms": 34298.7, - "harness_wall_s": 34.305, - "done_reason": "length" - } - ], - "warmup": { - "prompt_tokens": 318, - "prompt_eval_ms": 3562.7, - "prefill_tok_per_s": 89.26, - "output_tokens": 256, - "eval_ms": 32215.7, - "decode_tok_per_s": 7.95, - "load_ms": 491.7, - "total_ms": 36521.3, - "harness_wall_s": 36.529, - "done_reason": "length" - }, - "summary": { - "prefill_tok_per_s": { - "min": 2689.01, - "median": 2695.59, - "max": 2711.41, - "n": 3 - }, - "decode_tok_per_s": { - "min": 7.66, - "median": 7.82, - "max": 8.19, - "n": 3 - }, - "total_ms": { - "min": 32116.6, - "median": 33548.1, - "max": 34298.7, - "n": 3 - } - } -} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/short.json b/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/short.json deleted file mode 100644 index 985a74d..0000000 --- a/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/short.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "host": "pve197", - "gpu": "Tesla V100-PCIE-32GB", - "vram_gb": 32, - "model_alias": "gemma4:26b", - "model_tag": "gemma4:26b", - "prompt_key": "short", - "prompt_chars": 78, - "num_predict": 256, - "num_ctx": 4096, - "runs": [ - { - "prompt_tokens": 27, - "prompt_eval_ms": 112.5, - "prefill_tok_per_s": 240.05, - "output_tokens": 256, - "eval_ms": 30919.5, - "decode_tok_per_s": 8.28, - "load_ms": 531.1, - "total_ms": 31828.4, - "harness_wall_s": 31.832, - "done_reason": "length" - }, - { - "prompt_tokens": 27, - "prompt_eval_ms": 113.6, - "prefill_tok_per_s": 237.6, - "output_tokens": 256, - "eval_ms": 30399.9, - "decode_tok_per_s": 8.42, - "load_ms": 479.4, - "total_ms": 31242.1, 
- "harness_wall_s": 31.246, - "done_reason": "length" - }, - { - "prompt_tokens": 27, - "prompt_eval_ms": 111.0, - "prefill_tok_per_s": 243.16, - "output_tokens": 256, - "eval_ms": 30712.9, - "decode_tok_per_s": 8.34, - "load_ms": 483.2, - "total_ms": 31552.8, - "harness_wall_s": 31.557, - "done_reason": "length" - } - ], - "warmup": { - "prompt_tokens": 27, - "prompt_eval_ms": 843.7, - "prefill_tok_per_s": 32.0, - "output_tokens": 256, - "eval_ms": 30499.4, - "decode_tok_per_s": 8.39, - "load_ms": 5877.7, - "total_ms": 37664.4, - "harness_wall_s": 37.668, - "done_reason": "length" - }, - "summary": { - "prefill_tok_per_s": { - "min": 237.6, - "median": 240.05, - "max": 243.16, - "n": 3 - }, - "decode_tok_per_s": { - "min": 8.28, - "median": 8.34, - "max": 8.42, - "n": 3 - }, - "total_ms": { - "min": 31242.1, - "median": 31552.8, - "max": 31828.4, - "n": 3 - } - } -} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/long.json b/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/long.json deleted file mode 100644 index ecab413..0000000 --- a/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/long.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "host": "pve197", - "gpu": "Tesla V100-PCIE-32GB", - "vram_gb": 32, - "model_alias": "gemma4:31b", - "model_tag": "gemma4:31b-it-q4_K_M", - "prompt_key": "long", - "prompt_chars": 1614, - "num_predict": 256, - "num_ctx": 4096, - "runs": [ - { - "prompt_tokens": 318, - "prompt_eval_ms": 728.7, - "prefill_tok_per_s": 436.37, - "output_tokens": 256, - "eval_ms": 163511.0, - "decode_tok_per_s": 1.57, - "load_ms": 495.0, - "total_ms": 164970.4, - "harness_wall_s": 164.977, - "done_reason": "length" - }, - { - "prompt_tokens": 318, - "prompt_eval_ms": 682.8, - "prefill_tok_per_s": 465.71, - "output_tokens": 256, - "eval_ms": 168727.1, - "decode_tok_per_s": 1.52, - "load_ms": 545.3, - "total_ms": 170207.4, - "harness_wall_s": 170.214, - "done_reason": "length" - }, - { - "prompt_tokens": 318, - "prompt_eval_ms": 950.0, - 
"prefill_tok_per_s": 334.75, - "output_tokens": 256, - "eval_ms": 163102.9, - "decode_tok_per_s": 1.57, - "load_ms": 507.9, - "total_ms": 164801.8, - "harness_wall_s": 164.809, - "done_reason": "length" - } - ], - "warmup": { - "prompt_tokens": 318, - "prompt_eval_ms": 3883.3, - "prefill_tok_per_s": 81.89, - "output_tokens": 256, - "eval_ms": 172199.4, - "decode_tok_per_s": 1.49, - "load_ms": 528.0, - "total_ms": 176864.8, - "harness_wall_s": 176.871, - "done_reason": "length" - }, - "summary": { - "prefill_tok_per_s": { - "min": 334.75, - "median": 436.37, - "max": 465.71, - "n": 3 - }, - "decode_tok_per_s": { - "min": 1.52, - "median": 1.57, - "max": 1.57, - "n": 3 - }, - "total_ms": { - "min": 164801.8, - "median": 164970.4, - "max": 170207.4, - "n": 3 - } - } -} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/short.json b/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/short.json deleted file mode 100644 index 7abf1e3..0000000 --- a/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/short.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "host": "pve197", - "gpu": "Tesla V100-PCIE-32GB", - "vram_gb": 32, - "model_alias": "gemma4:31b", - "model_tag": "gemma4:31b-it-q4_K_M", - "prompt_key": "short", - "prompt_chars": 78, - "num_predict": 256, - "num_ctx": 4096, - "runs": [ - { - "prompt_tokens": 27, - "prompt_eval_ms": 665.6, - "prefill_tok_per_s": 40.56, - "output_tokens": 256, - "eval_ms": 164631.1, - "decode_tok_per_s": 1.55, - "load_ms": 512.6, - "total_ms": 166062.7, - "harness_wall_s": 166.067, - "done_reason": "length" - }, - { - "prompt_tokens": 27, - "prompt_eval_ms": 660.3, - "prefill_tok_per_s": 40.89, - "output_tokens": 256, - "eval_ms": 159594.3, - "decode_tok_per_s": 1.6, - "load_ms": 523.6, - "total_ms": 161012.3, - "harness_wall_s": 161.016, - "done_reason": "length" - }, - { - "prompt_tokens": 27, - "prompt_eval_ms": 887.8, - "prefill_tok_per_s": 30.41, - "output_tokens": 256, - "eval_ms": 167584.3, - "decode_tok_per_s": 1.53, - 
"load_ms": 486.8, - "total_ms": 169188.9, - "harness_wall_s": 169.194, - "done_reason": "length" - } - ], - "warmup": { - "prompt_tokens": 27, - "prompt_eval_ms": 6642.4, - "prefill_tok_per_s": 4.06, - "output_tokens": 256, - "eval_ms": 173530.1, - "decode_tok_per_s": 1.48, - "load_ms": 20142.1, - "total_ms": 200836.5, - "harness_wall_s": 200.841, - "done_reason": "length" - }, - "summary": { - "prefill_tok_per_s": { - "min": 30.41, - "median": 40.56, - "max": 40.89, - "n": 3 - }, - "decode_tok_per_s": { - "min": 1.53, - "median": 1.55, - "max": 1.6, - "n": 3 - }, - "total_ms": { - "min": 161012.3, - "median": 166062.7, - "max": 169188.9, - "n": 3 - } - } -} \ No newline at end of file