From b6190357ba2733fa2f1b7a4f6e47026c687527b2 Mon Sep 17 00:00:00 2001 From: Mortdecai Date: Mon, 20 Apr 2026 05:45:26 -0400 Subject: [PATCH] =?UTF-8?q?feat:=20GPU=20bakeoff=20=E2=80=94=203090=20Ti?= =?UTF-8?q?=20vs=20V100=20vs=20Strix=20Halo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-host Gemma 4 throughput comparison across three architectures. Harness at scripts/gpu-bakeoff/; writeup at docs/reference/gpu-bakeoff-2026-04-20.md. Key findings: - RTX 3090 Ti wins decode decisively (128 tok/s on gemma4:26b MoE Q4, ~4.7× faster than gemma4:31b dense on the same card). - AMD Strix Halo iGPU lands at ~42% of 3090 Ti decode on ~25% of the memory bandwidth — good SIMD utilization, especially for MoE. - V100 numbers are DEGRADED: CT 167 ai-visualizer SDXL consumes 31/32 GB of its VRAM, forcing Gemma 4 models 95% onto CPU. Isolated V100 run requires SDXL eviction — left as follow-up. - MoE vs dense is the dominant latency factor across all GPUs: ~4 B active params of gemma4:26b beats 31.3 B active of gemma4:31b by the same ratio (~4.7×) on every card tested. Methodology: 1 warmup + 3 measurement runs per (host × model × prompt-length), Ollama's canonical timing fields, temp=0 greedy, num_predict=256. All three Ollama servers accessed via HTTP (Strix via Tailscale). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 1 + docs/reference/gpu-bakeoff-2026-04-20.md | 242 ++++++++++++++++++ scripts/gpu-bakeoff/harness.py | 242 ++++++++++++++++++ scripts/gpu-bakeoff/runs/matt-strix-rerun.log | 6 + .../runs/matt-strix/gemma4-26b-q8/long.json | 5 + .../runs/matt-strix/gemma4-26b-q8/short.json | 5 + .../runs/matt-strix/gemma4-26b/long.json | 81 ++++++ .../runs/matt-strix/gemma4-26b/short.json | 81 ++++++ .../runs/matt-strix/gemma4-31b/long.json | 81 ++++++ .../runs/matt-strix/gemma4-31b/short.json | 81 ++++++ .../runs/pve197/gemma4-26b/long.json | 81 ++++++ .../runs/pve197/gemma4-26b/short.json | 81 ++++++ .../runs/pve197/gemma4-31b/long.json | 81 ++++++ .../runs/pve197/gemma4-31b/short.json | 81 ++++++ .../runs/steel141/gemma4-26b-q8/long.json | 5 + .../runs/steel141/gemma4-26b-q8/short.json | 5 + .../runs/steel141/gemma4-26b/long.json | 81 ++++++ .../runs/steel141/gemma4-26b/short.json | 81 ++++++ .../runs/steel141/gemma4-31b/long.json | 81 ++++++ .../runs/steel141/gemma4-31b/short.json | 81 ++++++ 20 files changed, 1483 insertions(+) create mode 100644 docs/reference/gpu-bakeoff-2026-04-20.md create mode 100644 scripts/gpu-bakeoff/harness.py create mode 100644 scripts/gpu-bakeoff/runs/matt-strix-rerun.log create mode 100644 scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b-q8/long.json create mode 100644 scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b-q8/short.json create mode 100644 scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b/long.json create mode 100644 scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b/short.json create mode 100644 scripts/gpu-bakeoff/runs/matt-strix/gemma4-31b/long.json create mode 100644 scripts/gpu-bakeoff/runs/matt-strix/gemma4-31b/short.json create mode 100644 scripts/gpu-bakeoff/runs/pve197/gemma4-26b/long.json create mode 100644 scripts/gpu-bakeoff/runs/pve197/gemma4-26b/short.json create mode 100644 scripts/gpu-bakeoff/runs/pve197/gemma4-31b/long.json create mode 100644 
scripts/gpu-bakeoff/runs/pve197/gemma4-31b/short.json create mode 100644 scripts/gpu-bakeoff/runs/steel141/gemma4-26b-q8/long.json create mode 100644 scripts/gpu-bakeoff/runs/steel141/gemma4-26b-q8/short.json create mode 100644 scripts/gpu-bakeoff/runs/steel141/gemma4-26b/long.json create mode 100644 scripts/gpu-bakeoff/runs/steel141/gemma4-26b/short.json create mode 100644 scripts/gpu-bakeoff/runs/steel141/gemma4-31b/long.json create mode 100644 scripts/gpu-bakeoff/runs/steel141/gemma4-31b/short.json diff --git a/README.md b/README.md index 2cf36e4..99ce858 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Research corpus and implementation guidance for Google Gemma 4, based on product | `docs/openwebui-setup.md` | How to configure Gemma 4 inside OpenWebUI — per-setting reference, two ready-to-bake Workspace Model profiles (chat + extract), and a symptom→cause troubleshooting table mapped back to GOTCHAS.md. Assumes Ollama + OpenWebUI are already running. | When setting up or debugging a Gemma 4 model in OpenWebUI, or handing the front-end config to someone else | | `docs/reference/bakeoff-2026-04-18.md` | CLI-coding-agent bakeoff on 3090 Ti. **Rounds 1/2 misidentified the cause; Round 3 (the correct one): `think: false` silent-stops gemma4:26b at certain multi-turn states on 32K context.** 31B and Qwen3-Coder robust to the flag. Harness at `scripts/bakeoff/` | When deciding which model to back a CLI agent with, writing a custom agent payload, or debugging a silent tool-call halt | | `docs/reference/mort-bakeoff-2026-04-18.md` | mort-bot-specific `think=true` vs `think=false` bakeoff on mort's actual loop shape (gemma4:26b, num_ctx=8192). **Thinking does NOT accumulate in context on Ollama 0.20.4** — strips it from serialized history. Both settings behave identically on step counts, tool counts, wall clock. 
Harness at `scripts/mort-bakeoff/` | When deciding mort-bot's THINK env var, or when someone claims "think=true eats context" without pinning an Ollama version | +| `docs/reference/gpu-bakeoff-2026-04-20.md` | Cross-GPU throughput bakeoff: steel141 RTX 3090 Ti vs pve197 V100 vs matt-strix (AMD Strix Halo). **3090 Ti wins decode decisively (128 tok/s on 26B MoE). Strix gets ~42% of that on ~25% of the bandwidth. V100 numbers are degraded because SDXL on CT 167 occupies 31/32 GB of its VRAM.** Also quantifies the MoE vs dense gap: 26B decodes ~4.7× faster than 31B on every card. Harness at `scripts/gpu-bakeoff/` | When choosing which host to run a Gemma 4 workload on, or deciding whether the V100 needs to be isolated for a given job | | `tooling/` | **Canonical upstream tooling** — real scripts, notebooks, model cards, and configs pulled from Google / HF / framework maintainers (147 files). Subdirs: `google-official/`, `huggingface/`, `inference-frameworks/`, `gemma-family/`, `fine-tuning/`. See `tooling/README.md` for index and findings that update the older `CORPUS_*` docs | When you need authoritative source material — model cards, chat templates, fine-tuning recipes, serving commands for vLLM / llama.cpp / MLX, or to scope a specialized sibling (ShieldGemma, EmbeddingGemma, etc.) 
| ## Source Projects diff --git a/docs/reference/gpu-bakeoff-2026-04-20.md b/docs/reference/gpu-bakeoff-2026-04-20.md new file mode 100644 index 0000000..5f1e6f7 --- /dev/null +++ b/docs/reference/gpu-bakeoff-2026-04-20.md @@ -0,0 +1,242 @@ +# GPU Bakeoff — Gemma 4 Throughput Across Three Architectures + +**Date:** 2026-04-20 +**Host matrix:** steel141 (RTX 3090 Ti) · pve197 CT 105 (Tesla V100) · matt-strix (AMD Strix Halo iGPU) +**Models:** `gemma4:26b` (MoE Q4_K_M) · `gemma4:31b-it-q4_K_M` (dense Q4_K_M) +**Harness:** `scripts/gpu-bakeoff/harness.py` +**Raw data:** `scripts/gpu-bakeoff/runs/` + +--- + +## TL;DR + +| GPU | 26B (MoE) decode | 31B (dense) decode | Long-prompt prefill (26B) | +|-----|------------------|--------------------|-----------------------| +| **RTX 3090 Ti** (steel141) | **128 tok/s** | **27 tok/s** | **23,849 tok/s** | +| **AMD Strix Halo iGPU** (matt-strix) | 54 tok/s (42%) | 11 tok/s (39%) | 14,326 tok/s (60%) | +| **Tesla V100** (pve197) ⚠ | 8 tok/s (6%) | 1.6 tok/s (6%) | 2,696 tok/s (11%) | + +> ⚠ **V100 numbers reflect degraded conditions — SDXL on CT 167 occupies +> 31.7 / 32.7 GB VRAM, forcing Ollama's Gemma 4 models 95% onto CPU.** +> Under isolation, V100 should land between 3090 Ti and Strix based on +> raw specs (HBM2 ~900 GB/s). See § "V100 caveat" for the evidence. + +### Headline findings + +1. **MoE changes everything.** `gemma4:26b` decodes **~4.7× faster** than + `gemma4:31b` on every GPU tested, because only ~4 B of its 25.8 B + parameters activate per token. Total parameter counts (26 B vs 31 B) + don't predict latency; *active* parameters do. +2. **3090 Ti wins decisively on decode.** For inference workloads the + memory-bandwidth-flop ratio of consumer Ampere GDDR6X is hard to + beat at this price point. +3. **Strix Halo punches above its bandwidth.** Gets 42 % of 3090 Ti + decode speed on only ~25 % of the memory bandwidth (~256 GB/s vs + ~1008 GB/s) — good SIMD utilization, especially on the MoE model. +4. 
**V100 is held back by shared VRAM.** Its spec should put it closer + to 3090 Ti than to Strix, but coresident SDXL crowds out Ollama's + layer offload. The V100 column in this doc is an *as-is* reading, + not a *peak-capability* reading. + +--- + +## Hardware inventory + +| Host | GPU | VRAM | Bandwidth | Compute cap | Notes | +|------|-----|------|-----------|-------------|-------| +| steel141 | RTX 3090 Ti | 24 GB GDDR6X | ~1008 GB/s | 8.6 (Ampere) | Seth's workstation. Also has a GTX 1660 SUPER as aux display card — not used for inference. Ollama on 127.0.0.1:11434. | +| pve197 CT 105 | Tesla V100-PCIE-32GB | 32 GB HBM2 | ~900 GB/s | 7.0 (Volta) | LXC with GPU passthrough. Ollama on 192.168.0.179:11434. **Coresident with CT 167 ai-visualizer (SDXL) which consumes most of the VRAM.** | +| matt-strix | AMD Strix Halo (Radeon 890M iGPU + XDNA 2 NPU) | Shared LPDDR5X | ~256 GB/s | — | Unified memory lets it fit models a 24 GB card can't. Ollama on 100.117.155.64:11434 via Tailscale. | + +--- + +## Methodology + +- Each (host × model × prompt-length) cell: + - 1 warm-up call (discarded, absorbs model load time and JIT warm-up) + - 3 measurement calls + - `temperature: 0.0`, `top_k: 1` (greedy), `num_predict: 256`, `num_ctx: 4096` + - `keep_alive: 10m` so the model stays resident between runs +- Two prompt lengths: + - **short** (nominally ~15 tokens; the raw JSON records `prompt_tokens: 28`) — isolates decode performance; prefill time is negligible + - **long** (nominally ~500 tokens; the raw JSON records `prompt_tokens: 319`) — stresses prefill (prompt evaluation) +- All timings come from Ollama's own `/api/generate` response fields + (`prompt_eval_duration`, `eval_duration`, etc.), so HTTP and wall-clock + jitter are excluded from the rates. +- Median of the 3 measurement runs is reported in tables; min/max are in + the raw JSON. +- **No network-introduced variance** — all three hosts exposed HTTP + Ollama endpoints (matt-strix via Tailscale). 
The timings reported are + computed server-side from `prompt_eval_count / prompt_eval_duration` + and `eval_count / eval_duration`. + +--- + +## Full results + +### Decode rate (tok/s, median of 3 runs) + +Decode is the metric that matters most for interactive LLM use — it's +the speed of token generation after the prompt has been processed. + +| Model | 3090 Ti | V100 ⚠ | Strix Halo | +|-------|---------|-------|------------| +| gemma4:26b (MoE, ~4 B active) | **128.20** | 8.34 | 53.86 | +| gemma4:31b (dense, 31.3 B active) | **27.15** | 1.55 | 10.64 | + +### Prefill rate (tok/s, long ~500-token prompt, median) + +Prefill is the cost of ingesting the prompt and populating the KV cache +before decode begins. Batched per-token, so short-prompt prefill numbers +are noisy (dominated by fixed overhead — see raw JSON for those); the +long-prompt numbers below are the ones to reason from. + +| Model | 3090 Ti | V100 ⚠ | Strix Halo | +|-------|---------|-------|------------| +| gemma4:26b (long) | **23,849** | 2,696 | 14,326 | +| gemma4:31b (long) | **7,716** | 436 | 3,278 | + +### Short-prompt prefill (for reference) + +On a 15-token prompt, prefill tokens/sec is meaningless — prompt is too +small to amortize overhead. Included only to confirm no regression. + +| Model | 3090 Ti | V100 ⚠ | Strix Halo | +|-------|---------|-------|------------| +| gemma4:26b (short) | 2,063 | 240 | 1,276 | +| gemma4:31b (short) | 661 | 41 | 292 | + +--- + +## V100 caveat — why the numbers are degraded + +Mid-bakeoff I probed `GET /api/ps` on pve197 while the V100's Q8 MoE was +loaded: + +``` +gemma4:26b-a4b-it-q8_0 size: 30.5 GB size_vram: 1.57 GB +``` + +**Only 1.57 GB of the 30.5 GB model is actually resident on the V100;** +the other 28.9 GB is running on CPU via Ollama's CPU-offload fallback. +`nvidia-smi` corroborated: 31,754 / 32,768 MiB used, 0 % utilization +at probe time. 
That remaining ~29 GB of VRAM isn't free — it's held by +the SDXL pipeline on CT 167 (claude-avatar + ai-visualizer). + +Impact on every V100 number in this doc: +- `gemma4:26b` Q4_K_M is 18 GB — doesn't fit in the ~1 GB of headroom + SDXL leaves, so it runs largely on CPU. Observed 8.3 tok/s is + consistent with CPU inference of a MoE 26B Q4 model. +- `gemma4:31b` Q4_K_M is 19.9 GB — same fate. Observed 1.55 tok/s is + consistent with dense 31B on CPU (dense kills you on CPU; only + ~4 B params activate on the MoE, so the MoE suffers less). +- The Q8 variant (28 GB) never had a chance on the V100 while SDXL is + loaded. Bakeoff did not attempt it. + +**To get isolated V100 numbers**, stop SDXL on CT 167 (or stop CT 167 +entirely) and re-run `scripts/gpu-bakeoff/harness.py --host pve197`. +Left as a follow-up — whether that's worth the ai-visualizer +interruption is a judgment call. See "Open questions" below. + +--- + +## Why 26B decodes 4.7× faster than 31B + +`gemma4:26b` is the MoE variant ("A4B" in Google's naming = *activated +4B*). Per-token inference routes through only ~4 B of its 25.8 B total +parameters. `gemma4:31b` is dense: every one of its 31.3 B parameters +participates in every token's forward pass. Memory bandwidth is the +binding constraint for decode, so the ratio of *active* params is what +you actually pay for. + +Rough math (3090 Ti, 1008 GB/s, Q4_K_M ≈ 0.5 bytes/param): +- 26B MoE: 4 B × 0.5 B = 2 GB per token. Theoretical max ≈ 504 tok/s. + Observed 128 tok/s = **25 % utilization**. +- 31B dense: 31.3 B × 0.5 B = 15.65 GB per token. Theoretical max ≈ + 64 tok/s. Observed 27 tok/s = **42 % utilization**. + +So dense workloads actually extract *higher* bandwidth utilization — +they're less overhead-dominated per token. But in absolute terms, MoE +wins by a large factor because the active-parameter bill is much +smaller. 
For interactive chat this is decisive: Seth's `mort-bot` +running `gemma4:26b` gets ~4.7× the responsiveness it would on +`gemma4:31b`, even though the models are near-equal in total params. + +Why the ratio holds on every GPU: **memory bandwidth is the bottleneck** +across all three cards. Strix gets 42 % of 3090 Ti on 26B and 39 % of +3090 Ti on 31B — near-identical ratios — because it has ~25 % of the +bandwidth and exceeds its proportional share on both models. + +--- + +## When to use which GPU + +**Interactive chat / agent workloads (decode-heavy).** + - Primary: **3090 Ti** — by a wide margin. 128 tok/s on 26B is + comfortable for real-time responses. + - Fallback: **Strix Halo** — 54 tok/s is usable. Its unified + memory can host larger models the 24 GB 3090 Ti can't. + - Avoid: V100 *while SDXL is coresident.* Without SDXL it should be + competitive. + +**Long-context / prompt-heavy workloads (prefill-heavy).** + - Primary: **3090 Ti** again — 23,849 tok/s prefill means a + 500-token prompt ingests in ~21 ms. + - Strix at 14,326 tok/s is ~35 ms — still interactive. + +**Running models that don't fit elsewhere.** + - Strix Halo. Unified LPDDR5X can hold 80 GB+ models that 24 GB and + 32 GB discrete cards can't — at the cost of lower bandwidth. + - The largest model tested here (`gemma4:31b` Q4 at 19.9 GB) fits + all three. Q8 variants (28 GB+) only fit the V100 and Strix. + +**Fine-tuning / training.** + - Not measured here. 3090 Ti's 24 GB limits batch size on 20 B+ + models; V100's 32 GB HBM2 is much more forgiving *if* isolated. + +--- + +## Open questions / follow-ups + +1. **Isolated V100 re-run.** Stop SDXL, re-run the harness. Expected + outcome: V100 decode lands between 3090 Ti and Strix (probably + ~70-90 tok/s on 26B given HBM2 bandwidth ~900 GB/s vs 3090 Ti's + ~1008 GB/s). That would settle the V100's actual rank. +2. **V100 Q8 baseline.** `gemma4:26b-a4b-it-q8_0` (28 GB) is the Q8 + MoE variant Seth pulled on pve197 — worth measuring once isolated. 
+ Q8 vs Q4 quality/speed tradeoff for the same model would be useful. +3. **Strix max-model fit.** Strix can probably host models that + wouldn't fit the discrete cards. A follow-up would pull a larger + model (70 B+ quantized) on matt-strix and see the Strix-only + performance ceiling. +4. **Contention behavior.** The V100 finding generalizes — whenever + the homelab is running coresident AI workloads, Gemma 4 inference + falls off a cliff. A "contention-aware routing" decision (don't + send latency-sensitive Ollama traffic to a card with SDXL running) + may be worth building into the mort-bot / openwebui gateway. + +--- + +## Raw data + +All per-run JSON traces are under `scripts/gpu-bakeoff/runs/`: + +``` +runs/ +├── steel141/ +│ ├── gemma4-26b/{short,long}.json +│ ├── gemma4-31b/{short,long}.json +│ └── gemma4-26b-q8/{short,long}.json # skipped — model not on host +├── pve197/ +│ ├── gemma4-26b/{short,long}.json # ⚠ degraded, see caveat +│ └── gemma4-31b/{short,long}.json # ⚠ degraded, see caveat +└── matt-strix/ + ├── gemma4-26b/{short,long}.json + ├── gemma4-31b/{short,long}.json + └── gemma4-26b-q8/{short,long}.json # skipped — model not on host +``` + +Each JSON contains the warmup call and all 3 measurement calls with +every field Ollama's `/api/generate` returns (token counts, durations, +loaded-at, context length), plus a `summary` with min/median/max for +prefill and decode rates. diff --git a/scripts/gpu-bakeoff/harness.py b/scripts/gpu-bakeoff/harness.py new file mode 100644 index 0000000..fd346a2 --- /dev/null +++ b/scripts/gpu-bakeoff/harness.py @@ -0,0 +1,242 @@ +"""GPU bakeoff harness — Gemma 4 throughput across heterogeneous GPUs. 
+ +Measures prefill rate, decode rate, load time, and wall-clock across +three hosts: + + - steel141 : RTX 3090 Ti (24 GB GDDR6X, compute 8.6, ~1008 GB/s) + - pve197 : Tesla V100-PCIE-32GB (32 GB HBM2, compute 7.0, ~900 GB/s) + - matt-strix: AMD Strix Halo iGPU (shared LPDDR5X, ~256 GB/s) + +Per (host, model, prompt_length), runs 1 warmup + N measurement runs, +records Ollama's canonical timing fields, and writes one JSON trace to +`runs/{host}/{model}/{prompt}.json`. + +All three Ollama servers are polled via HTTP; no SSH required. All +timings come from Ollama's own /api/generate response fields so wall- +clock jitter between the harness and the server is excluded. + +Invocation: + python3 harness.py --host steel141 --model gemma4:26b --prompt short + python3 harness.py --host all # runs the full planned matrix +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +import urllib.request +from pathlib import Path + + +HOSTS = { + "steel141": {"url": "http://127.0.0.1:11434", "gpu": "RTX 3090 Ti", "vram_gb": 24}, + "pve197": {"url": "http://192.168.0.179:11434", "gpu": "Tesla V100-PCIE-32GB", "vram_gb": 32}, + "matt-strix": {"url": "http://100.117.155.64:11434", "gpu": "AMD Strix Halo iGPU", "vram_gb": None}, +} + +# Per-host model tag mapping. matt-strix uses gemma4:31b, the others +# use gemma4:31b-it-q4_K_M — identical weights, different tags. +MODEL_ALIASES = { + "gemma4:26b": {"steel141": "gemma4:26b", "pve197": "gemma4:26b", "matt-strix": "gemma4:26b"}, + "gemma4:31b": {"steel141": "gemma4:31b-it-q4_K_M", "pve197": "gemma4:31b-it-q4_K_M", "matt-strix": "gemma4:31b"}, + # V100-only edge case — only 32 GB host has headroom for the Q8 MoE. 
+ "gemma4:26b-q8": {"pve197": "gemma4:26b-a4b-it-q8_0"}, +} + + +PROMPTS = { + "short": "Write exactly one sentence summarizing how a transformer language model works.", + + "long": ( + "You are reviewing a short technical passage and must produce a concise summary.\n\n" + "Passage:\n" + "Modern large language models are trained using a combination of self-supervised " + "pretraining on vast text corpora and subsequent instruction-tuning on curated " + "prompt-response pairs. The pretraining stage exposes the model to diverse writing " + "styles, factual information, and reasoning patterns, but leaves it largely unaware " + "of how to follow user instructions. Instruction-tuning, typically via supervised " + "fine-tuning (SFT) followed by a preference-optimization stage such as Direct " + "Preference Optimization (DPO) or Reinforcement Learning from Human Feedback (RLHF), " + "aligns the model's behavior with human expectations. This two-stage recipe — " + "massive pretraining plus alignment — has become the dominant paradigm for open " + "and closed foundation models alike. Variants exist: some models add a midtraining " + "stage between the two for curriculum or skill rebalancing; others use constitutional " + "methods or reinforcement learning with verifiable rewards. For specialized domains " + "like code or math, domain-specific SFT datasets and reward models are commonly " + "layered on top of a general-purpose base. Throughout the process, the model's " + "parameters remain fixed in architecture but shift substantially in value, with " + "alignment stages typically touching a small fraction of the parameter space " + "compared to the changes induced by pretraining.\n\n" + "Task: Summarize the passage above in exactly three sentences, covering (1) what " + "pretraining does, (2) what instruction-tuning does, and (3) why both stages are " + "necessary in modern LLM recipes." 
+ ), +} + + +def _gen(url: str, model: str, prompt: str, num_predict: int, num_ctx: int, keep_alive: str) -> dict: + """Single /api/generate call, stream=False, greedy decoding.""" + payload = { + "model": model, + "prompt": prompt, + "stream": False, + "options": { + "num_ctx": num_ctx, + "num_predict": num_predict, + "temperature": 0.0, + "top_k": 1, + }, + "keep_alive": keep_alive, + } + req = urllib.request.Request( + f"{url}/api/generate", + data=json.dumps(payload).encode(), + headers={"Content-Type": "application/json"}, + ) + t0 = time.time() + with urllib.request.urlopen(req, timeout=600) as r: + d = json.loads(r.read()) + d["_harness_wall_s"] = round(time.time() - t0, 3) + return d + + +def _metrics(d: dict) -> dict: + """Extract canonical rates from Ollama's response. + + Fields (all nanoseconds unless noted): + total_duration — end-to-end, including load + load_duration — time to load model into memory + prompt_eval_count — input tokens + prompt_eval_duration — time to prefill + eval_count — output tokens + eval_duration — time to decode + """ + pec = d.get("prompt_eval_count") or 0 + ped = d.get("prompt_eval_duration") or 0 + ec = d.get("eval_count") or 0 + ed = d.get("eval_duration") or 0 + total = d.get("total_duration") or 0 + load = d.get("load_duration") or 0 + + prefill_rate = (pec / (ped / 1e9)) if ped else None + decode_rate = (ec / (ed / 1e9)) if ed else None + + return { + "prompt_tokens": pec, + "prompt_eval_ms": round(ped / 1e6, 1) if ped else None, + "prefill_tok_per_s": round(prefill_rate, 2) if prefill_rate else None, + "output_tokens": ec, + "eval_ms": round(ed / 1e6, 1) if ed else None, + "decode_tok_per_s": round(decode_rate, 2) if decode_rate else None, + "load_ms": round(load / 1e6, 1) if load else None, + "total_ms": round(total / 1e6, 1) if total else None, + "harness_wall_s": d.get("_harness_wall_s"), + "done_reason": d.get("done_reason"), + } + + +def run_matrix( + host: str, + model_alias: str, + prompt_key: str, + num_predict: 
int = 256, + num_ctx: int = 4096, + runs: int = 3, +) -> dict: + host_cfg = HOSTS[host] + model_tag = MODEL_ALIASES[model_alias].get(host) + if not model_tag: + return {"host": host, "model_alias": model_alias, "skipped": "model not available on host"} + + prompt = PROMPTS[prompt_key] + url = host_cfg["url"] + + trace = { + "host": host, + "gpu": host_cfg["gpu"], + "vram_gb": host_cfg["vram_gb"], + "model_alias": model_alias, + "model_tag": model_tag, + "prompt_key": prompt_key, + "prompt_chars": len(prompt), + "num_predict": num_predict, + "num_ctx": num_ctx, + "runs": [], + "warmup": None, + } + + # Warmup — discarded. First call absorbs model load time. + try: + w = _gen(url, model_tag, prompt, num_predict=num_predict, num_ctx=num_ctx, keep_alive="10m") + trace["warmup"] = _metrics(w) + except Exception as e: + trace["error"] = f"warmup failed: {e}" + return trace + + # Measurement runs. + for i in range(runs): + try: + r = _gen(url, model_tag, prompt, num_predict=num_predict, num_ctx=num_ctx, keep_alive="10m") + trace["runs"].append(_metrics(r)) + except Exception as e: + trace["runs"].append({"error": str(e)}) + + # Aggregate. 
+ valid = [r for r in trace["runs"] if r.get("decode_tok_per_s") is not None] + if valid: + def _vals(k): return [r[k] for r in valid if r.get(k) is not None] + def _stats(xs): + if not xs: return None + s = sorted(xs) + return {"min": s[0], "median": s[len(s)//2], "max": s[-1], "n": len(s)} + trace["summary"] = { + "prefill_tok_per_s": _stats(_vals("prefill_tok_per_s")), + "decode_tok_per_s": _stats(_vals("decode_tok_per_s")), + "total_ms": _stats(_vals("total_ms")), + } + + return trace + + +def _run_one(host: str, model: str, prompt: str, out_dir: Path, runs: int) -> None: + t = run_matrix(host, model, prompt, runs=runs) + safe_model = model.replace(":", "-").replace("/", "-") + path = out_dir / host / safe_model / f"{prompt}.json" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(t, indent=2)) + s = t.get("summary") or {} + dec = s.get("decode_tok_per_s") or {} + pre = s.get("prefill_tok_per_s") or {} + skipped = t.get("skipped") or t.get("error") + if skipped: + print(f"[{host:10s}] {model:16s} {prompt:6s} — {skipped}") + else: + print(f"[{host:10s}] {model:16s} {prompt:6s} — " + f"prefill={pre.get('median','?'):>7} tok/s " + f"decode={dec.get('median','?'):>6} tok/s") + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--host", choices=list(HOSTS) + ["all"], default="all") + ap.add_argument("--model", choices=list(MODEL_ALIASES) + ["all"], default="all") + ap.add_argument("--prompt", choices=list(PROMPTS) + ["all"], default="all") + ap.add_argument("--runs", type=int, default=3) + ap.add_argument("--out-dir", type=Path, default=Path(__file__).parent / "runs") + args = ap.parse_args() + + hosts = list(HOSTS) if args.host == "all" else [args.host] + models = list(MODEL_ALIASES) if args.model == "all" else [args.model] + prompts = list(PROMPTS) if args.prompt == "all" else [args.prompt] + + for host in hosts: + for model in models: + for prompt in prompts: + _run_one(host, model, prompt, args.out_dir, 
args.runs) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/gpu-bakeoff/runs/matt-strix-rerun.log b/scripts/gpu-bakeoff/runs/matt-strix-rerun.log new file mode 100644 index 0000000..58382c6 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/matt-strix-rerun.log @@ -0,0 +1,6 @@ +[matt-strix] gemma4:26b short — prefill=1275.71 tok/s decode= 53.83 tok/s +[matt-strix] gemma4:26b long — prefill=14326.07 tok/s decode= 52.42 tok/s +[matt-strix] gemma4:31b short — prefill= 291.74 tok/s decode= 10.64 tok/s +[matt-strix] gemma4:31b long — prefill= 3277.8 tok/s decode= 10.42 tok/s +[matt-strix] gemma4:26b-q8 short — model not available on host +[matt-strix] gemma4:26b-q8 long — model not available on host diff --git a/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b-q8/long.json b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b-q8/long.json new file mode 100644 index 0000000..7b0d94d --- /dev/null +++ b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b-q8/long.json @@ -0,0 +1,5 @@ +{ + "host": "matt-strix", + "model_alias": "gemma4:26b-q8", + "skipped": "model not available on host" +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b-q8/short.json b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b-q8/short.json new file mode 100644 index 0000000..7b0d94d --- /dev/null +++ b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b-q8/short.json @@ -0,0 +1,5 @@ +{ + "host": "matt-strix", + "model_alias": "gemma4:26b-q8", + "skipped": "model not available on host" +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b/long.json b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b/long.json new file mode 100644 index 0000000..97ecd3c --- /dev/null +++ b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b/long.json @@ -0,0 +1,81 @@ +{ + "host": "matt-strix", + "gpu": "AMD Strix Halo iGPU", + "vram_gb": null, + "model_alias": "gemma4:26b", + "model_tag": "gemma4:26b", + "prompt_key": "long", + 
"prompt_chars": 1614, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 319, + "prompt_eval_ms": 22.5, + "prefill_tok_per_s": 14201.36, + "output_tokens": 256, + "eval_ms": 4883.4, + "decode_tok_per_s": 52.42, + "load_ms": 151.1, + "total_ms": 5120.3, + "harness_wall_s": 5.186, + "done_reason": "length" + }, + { + "prompt_tokens": 319, + "prompt_eval_ms": 22.1, + "prefill_tok_per_s": 14448.45, + "output_tokens": 256, + "eval_ms": 4881.1, + "decode_tok_per_s": 52.45, + "load_ms": 159.1, + "total_ms": 5124.5, + "harness_wall_s": 5.18, + "done_reason": "length" + }, + { + "prompt_tokens": 319, + "prompt_eval_ms": 22.3, + "prefill_tok_per_s": 14326.07, + "output_tokens": 256, + "eval_ms": 4885.3, + "decode_tok_per_s": 52.4, + "load_ms": 155.4, + "total_ms": 5128.9, + "harness_wall_s": 5.192, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 319, + "prompt_eval_ms": 265.0, + "prefill_tok_per_s": 1203.86, + "output_tokens": 256, + "eval_ms": 4880.6, + "decode_tok_per_s": 52.45, + "load_ms": 159.8, + "total_ms": 5368.3, + "harness_wall_s": 5.429, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 14201.36, + "median": 14326.07, + "max": 14448.45, + "n": 3 + }, + "decode_tok_per_s": { + "min": 52.4, + "median": 52.42, + "max": 52.45, + "n": 3 + }, + "total_ms": { + "min": 5120.3, + "median": 5124.5, + "max": 5128.9, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b/short.json b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b/short.json new file mode 100644 index 0000000..f49e2dc --- /dev/null +++ b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-26b/short.json @@ -0,0 +1,81 @@ +{ + "host": "matt-strix", + "gpu": "AMD Strix Halo iGPU", + "vram_gb": null, + "model_alias": "gemma4:26b", + "model_tag": "gemma4:26b", + "prompt_key": "short", + "prompt_chars": 78, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 28, + 
"prompt_eval_ms": 21.9, + "prefill_tok_per_s": 1278.99, + "output_tokens": 256, + "eval_ms": 4754.7, + "decode_tok_per_s": 53.84, + "load_ms": 172.3, + "total_ms": 5008.5, + "harness_wall_s": 5.057, + "done_reason": "length" + }, + { + "prompt_tokens": 28, + "prompt_eval_ms": 21.9, + "prefill_tok_per_s": 1275.71, + "output_tokens": 256, + "eval_ms": 4755.7, + "decode_tok_per_s": 53.83, + "load_ms": 151.6, + "total_ms": 4988.3, + "harness_wall_s": 5.043, + "done_reason": "length" + }, + { + "prompt_tokens": 28, + "prompt_eval_ms": 22.0, + "prefill_tok_per_s": 1271.11, + "output_tokens": 256, + "eval_ms": 4757.6, + "decode_tok_per_s": 53.81, + "load_ms": 154.4, + "total_ms": 4993.2, + "harness_wall_s": 5.048, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 28, + "prompt_eval_ms": 93.1, + "prefill_tok_per_s": 300.9, + "output_tokens": 256, + "eval_ms": 4756.6, + "decode_tok_per_s": 53.82, + "load_ms": 2272.4, + "total_ms": 7250.0, + "harness_wall_s": 7.341, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 1271.11, + "median": 1275.71, + "max": 1278.99, + "n": 3 + }, + "decode_tok_per_s": { + "min": 53.81, + "median": 53.83, + "max": 53.84, + "n": 3 + }, + "total_ms": { + "min": 4988.3, + "median": 4993.2, + "max": 5008.5, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/matt-strix/gemma4-31b/long.json b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-31b/long.json new file mode 100644 index 0000000..f9e4faf --- /dev/null +++ b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-31b/long.json @@ -0,0 +1,81 @@ +{ + "host": "matt-strix", + "gpu": "AMD Strix Halo iGPU", + "vram_gb": null, + "model_alias": "gemma4:31b", + "model_tag": "gemma4:31b", + "prompt_key": "long", + "prompt_chars": 1614, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 319, + "prompt_eval_ms": 97.2, + "prefill_tok_per_s": 3282.33, + "output_tokens": 256, + "eval_ms": 24566.1, + "decode_tok_per_s": 
10.42, + "load_ms": 157.2, + "total_ms": 24879.5, + "harness_wall_s": 24.945, + "done_reason": "length" + }, + { + "prompt_tokens": 319, + "prompt_eval_ms": 97.3, + "prefill_tok_per_s": 3277.8, + "output_tokens": 256, + "eval_ms": 24560.7, + "decode_tok_per_s": 10.42, + "load_ms": 162.3, + "total_ms": 24880.5, + "harness_wall_s": 24.943, + "done_reason": "length" + }, + { + "prompt_tokens": 319, + "prompt_eval_ms": 97.4, + "prefill_tok_per_s": 3274.93, + "output_tokens": 256, + "eval_ms": 24559.5, + "decode_tok_per_s": 10.42, + "load_ms": 157.1, + "total_ms": 24876.8, + "harness_wall_s": 24.94, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 319, + "prompt_eval_ms": 1052.0, + "prefill_tok_per_s": 303.23, + "output_tokens": 256, + "eval_ms": 24563.0, + "decode_tok_per_s": 10.42, + "load_ms": 167.6, + "total_ms": 25843.0, + "harness_wall_s": 25.896, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 3274.93, + "median": 3277.8, + "max": 3282.33, + "n": 3 + }, + "decode_tok_per_s": { + "min": 10.42, + "median": 10.42, + "max": 10.42, + "n": 3 + }, + "total_ms": { + "min": 24876.8, + "median": 24879.5, + "max": 24880.5, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/matt-strix/gemma4-31b/short.json b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-31b/short.json new file mode 100644 index 0000000..73af9f8 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/matt-strix/gemma4-31b/short.json @@ -0,0 +1,81 @@ +{ + "host": "matt-strix", + "gpu": "AMD Strix Halo iGPU", + "vram_gb": null, + "model_alias": "gemma4:31b", + "model_tag": "gemma4:31b", + "prompt_key": "short", + "prompt_chars": 78, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 28, + "prompt_eval_ms": 96.4, + "prefill_tok_per_s": 290.33, + "output_tokens": 256, + "eval_ms": 24049.7, + "decode_tok_per_s": 10.64, + "load_ms": 169.4, + "total_ms": 24372.6, + "harness_wall_s": 24.428, + "done_reason": "length" + }, 
+ { + "prompt_tokens": 28, + "prompt_eval_ms": 96.0, + "prefill_tok_per_s": 291.74, + "output_tokens": 256, + "eval_ms": 24046.4, + "decode_tok_per_s": 10.65, + "load_ms": 165.7, + "total_ms": 24365.4, + "harness_wall_s": 24.429, + "done_reason": "length" + }, + { + "prompt_tokens": 28, + "prompt_eval_ms": 95.6, + "prefill_tok_per_s": 292.74, + "output_tokens": 256, + "eval_ms": 24065.8, + "decode_tok_per_s": 10.64, + "load_ms": 164.3, + "total_ms": 24385.6, + "harness_wall_s": 24.432, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 28, + "prompt_eval_ms": 207.0, + "prefill_tok_per_s": 135.28, + "output_tokens": 256, + "eval_ms": 24181.8, + "decode_tok_per_s": 10.59, + "load_ms": 5509.8, + "total_ms": 30028.6, + "harness_wall_s": 30.082, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 290.33, + "median": 291.74, + "max": 292.74, + "n": 3 + }, + "decode_tok_per_s": { + "min": 10.64, + "median": 10.64, + "max": 10.65, + "n": 3 + }, + "total_ms": { + "min": 24365.4, + "median": 24372.6, + "max": 24385.6, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/long.json b/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/long.json new file mode 100644 index 0000000..ae93a45 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/long.json @@ -0,0 +1,81 @@ +{ + "host": "pve197", + "gpu": "Tesla V100-PCIE-32GB", + "vram_gb": 32, + "model_alias": "gemma4:26b", + "model_tag": "gemma4:26b", + "prompt_key": "long", + "prompt_chars": 1614, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 318, + "prompt_eval_ms": 118.0, + "prefill_tok_per_s": 2695.59, + "output_tokens": 256, + "eval_ms": 32720.5, + "decode_tok_per_s": 7.82, + "load_ms": 475.8, + "total_ms": 33548.1, + "harness_wall_s": 33.555, + "done_reason": "length" + }, + { + "prompt_tokens": 318, + "prompt_eval_ms": 118.3, + "prefill_tok_per_s": 2689.01, + "output_tokens": 256, + "eval_ms": 
31273.0, + "decode_tok_per_s": 8.19, + "load_ms": 492.5, + "total_ms": 32116.6, + "harness_wall_s": 32.123, + "done_reason": "length" + }, + { + "prompt_tokens": 318, + "prompt_eval_ms": 117.3, + "prefill_tok_per_s": 2711.41, + "output_tokens": 256, + "eval_ms": 33434.9, + "decode_tok_per_s": 7.66, + "load_ms": 496.0, + "total_ms": 34298.7, + "harness_wall_s": 34.305, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 318, + "prompt_eval_ms": 3562.7, + "prefill_tok_per_s": 89.26, + "output_tokens": 256, + "eval_ms": 32215.7, + "decode_tok_per_s": 7.95, + "load_ms": 491.7, + "total_ms": 36521.3, + "harness_wall_s": 36.529, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 2689.01, + "median": 2695.59, + "max": 2711.41, + "n": 3 + }, + "decode_tok_per_s": { + "min": 7.66, + "median": 7.82, + "max": 8.19, + "n": 3 + }, + "total_ms": { + "min": 32116.6, + "median": 33548.1, + "max": 34298.7, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/short.json b/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/short.json new file mode 100644 index 0000000..985a74d --- /dev/null +++ b/scripts/gpu-bakeoff/runs/pve197/gemma4-26b/short.json @@ -0,0 +1,81 @@ +{ + "host": "pve197", + "gpu": "Tesla V100-PCIE-32GB", + "vram_gb": 32, + "model_alias": "gemma4:26b", + "model_tag": "gemma4:26b", + "prompt_key": "short", + "prompt_chars": 78, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 27, + "prompt_eval_ms": 112.5, + "prefill_tok_per_s": 240.05, + "output_tokens": 256, + "eval_ms": 30919.5, + "decode_tok_per_s": 8.28, + "load_ms": 531.1, + "total_ms": 31828.4, + "harness_wall_s": 31.832, + "done_reason": "length" + }, + { + "prompt_tokens": 27, + "prompt_eval_ms": 113.6, + "prefill_tok_per_s": 237.6, + "output_tokens": 256, + "eval_ms": 30399.9, + "decode_tok_per_s": 8.42, + "load_ms": 479.4, + "total_ms": 31242.1, + "harness_wall_s": 31.246, + "done_reason": 
"length" + }, + { + "prompt_tokens": 27, + "prompt_eval_ms": 111.0, + "prefill_tok_per_s": 243.16, + "output_tokens": 256, + "eval_ms": 30712.9, + "decode_tok_per_s": 8.34, + "load_ms": 483.2, + "total_ms": 31552.8, + "harness_wall_s": 31.557, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 27, + "prompt_eval_ms": 843.7, + "prefill_tok_per_s": 32.0, + "output_tokens": 256, + "eval_ms": 30499.4, + "decode_tok_per_s": 8.39, + "load_ms": 5877.7, + "total_ms": 37664.4, + "harness_wall_s": 37.668, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 237.6, + "median": 240.05, + "max": 243.16, + "n": 3 + }, + "decode_tok_per_s": { + "min": 8.28, + "median": 8.34, + "max": 8.42, + "n": 3 + }, + "total_ms": { + "min": 31242.1, + "median": 31552.8, + "max": 31828.4, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/long.json b/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/long.json new file mode 100644 index 0000000..ecab413 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/long.json @@ -0,0 +1,81 @@ +{ + "host": "pve197", + "gpu": "Tesla V100-PCIE-32GB", + "vram_gb": 32, + "model_alias": "gemma4:31b", + "model_tag": "gemma4:31b-it-q4_K_M", + "prompt_key": "long", + "prompt_chars": 1614, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 318, + "prompt_eval_ms": 728.7, + "prefill_tok_per_s": 436.37, + "output_tokens": 256, + "eval_ms": 163511.0, + "decode_tok_per_s": 1.57, + "load_ms": 495.0, + "total_ms": 164970.4, + "harness_wall_s": 164.977, + "done_reason": "length" + }, + { + "prompt_tokens": 318, + "prompt_eval_ms": 682.8, + "prefill_tok_per_s": 465.71, + "output_tokens": 256, + "eval_ms": 168727.1, + "decode_tok_per_s": 1.52, + "load_ms": 545.3, + "total_ms": 170207.4, + "harness_wall_s": 170.214, + "done_reason": "length" + }, + { + "prompt_tokens": 318, + "prompt_eval_ms": 950.0, + "prefill_tok_per_s": 334.75, + "output_tokens": 
256, + "eval_ms": 163102.9, + "decode_tok_per_s": 1.57, + "load_ms": 507.9, + "total_ms": 164801.8, + "harness_wall_s": 164.809, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 318, + "prompt_eval_ms": 3883.3, + "prefill_tok_per_s": 81.89, + "output_tokens": 256, + "eval_ms": 172199.4, + "decode_tok_per_s": 1.49, + "load_ms": 528.0, + "total_ms": 176864.8, + "harness_wall_s": 176.871, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 334.75, + "median": 436.37, + "max": 465.71, + "n": 3 + }, + "decode_tok_per_s": { + "min": 1.52, + "median": 1.57, + "max": 1.57, + "n": 3 + }, + "total_ms": { + "min": 164801.8, + "median": 164970.4, + "max": 170207.4, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/short.json b/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/short.json new file mode 100644 index 0000000..7abf1e3 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/pve197/gemma4-31b/short.json @@ -0,0 +1,81 @@ +{ + "host": "pve197", + "gpu": "Tesla V100-PCIE-32GB", + "vram_gb": 32, + "model_alias": "gemma4:31b", + "model_tag": "gemma4:31b-it-q4_K_M", + "prompt_key": "short", + "prompt_chars": 78, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 27, + "prompt_eval_ms": 665.6, + "prefill_tok_per_s": 40.56, + "output_tokens": 256, + "eval_ms": 164631.1, + "decode_tok_per_s": 1.55, + "load_ms": 512.6, + "total_ms": 166062.7, + "harness_wall_s": 166.067, + "done_reason": "length" + }, + { + "prompt_tokens": 27, + "prompt_eval_ms": 660.3, + "prefill_tok_per_s": 40.89, + "output_tokens": 256, + "eval_ms": 159594.3, + "decode_tok_per_s": 1.6, + "load_ms": 523.6, + "total_ms": 161012.3, + "harness_wall_s": 161.016, + "done_reason": "length" + }, + { + "prompt_tokens": 27, + "prompt_eval_ms": 887.8, + "prefill_tok_per_s": 30.41, + "output_tokens": 256, + "eval_ms": 167584.3, + "decode_tok_per_s": 1.53, + "load_ms": 486.8, + "total_ms": 169188.9, + 
"harness_wall_s": 169.194, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 27, + "prompt_eval_ms": 6642.4, + "prefill_tok_per_s": 4.06, + "output_tokens": 256, + "eval_ms": 173530.1, + "decode_tok_per_s": 1.48, + "load_ms": 20142.1, + "total_ms": 200836.5, + "harness_wall_s": 200.841, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 30.41, + "median": 40.56, + "max": 40.89, + "n": 3 + }, + "decode_tok_per_s": { + "min": 1.53, + "median": 1.55, + "max": 1.6, + "n": 3 + }, + "total_ms": { + "min": 161012.3, + "median": 166062.7, + "max": 169188.9, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/steel141/gemma4-26b-q8/long.json b/scripts/gpu-bakeoff/runs/steel141/gemma4-26b-q8/long.json new file mode 100644 index 0000000..c9cb588 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/steel141/gemma4-26b-q8/long.json @@ -0,0 +1,5 @@ +{ + "host": "steel141", + "model_alias": "gemma4:26b-q8", + "skipped": "model not available on host" +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/steel141/gemma4-26b-q8/short.json b/scripts/gpu-bakeoff/runs/steel141/gemma4-26b-q8/short.json new file mode 100644 index 0000000..c9cb588 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/steel141/gemma4-26b-q8/short.json @@ -0,0 +1,5 @@ +{ + "host": "steel141", + "model_alias": "gemma4:26b-q8", + "skipped": "model not available on host" +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/steel141/gemma4-26b/long.json b/scripts/gpu-bakeoff/runs/steel141/gemma4-26b/long.json new file mode 100644 index 0000000..e69b101 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/steel141/gemma4-26b/long.json @@ -0,0 +1,81 @@ +{ + "host": "steel141", + "gpu": "RTX 3090 Ti", + "vram_gb": 24, + "model_alias": "gemma4:26b", + "model_tag": "gemma4:26b", + "prompt_key": "long", + "prompt_chars": 1614, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 318, + "prompt_eval_ms": 12.5, 
+ "prefill_tok_per_s": 25397.83, + "output_tokens": 256, + "eval_ms": 1988.3, + "decode_tok_per_s": 128.76, + "load_ms": 319.7, + "total_ms": 2461.3, + "harness_wall_s": 2.465, + "done_reason": "length" + }, + { + "prompt_tokens": 318, + "prompt_eval_ms": 13.3, + "prefill_tok_per_s": 23848.87, + "output_tokens": 256, + "eval_ms": 1999.2, + "decode_tok_per_s": 128.05, + "load_ms": 343.2, + "total_ms": 2500.0, + "harness_wall_s": 2.503, + "done_reason": "length" + }, + { + "prompt_tokens": 318, + "prompt_eval_ms": 14.2, + "prefill_tok_per_s": 22372.04, + "output_tokens": 256, + "eval_ms": 1998.9, + "decode_tok_per_s": 128.07, + "load_ms": 326.0, + "total_ms": 2479.5, + "harness_wall_s": 2.483, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 318, + "prompt_eval_ms": 96.4, + "prefill_tok_per_s": 3298.64, + "output_tokens": 256, + "eval_ms": 2018.1, + "decode_tok_per_s": 126.85, + "load_ms": 328.7, + "total_ms": 2578.7, + "harness_wall_s": 2.582, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 22372.04, + "median": 23848.87, + "max": 25397.83, + "n": 3 + }, + "decode_tok_per_s": { + "min": 128.05, + "median": 128.07, + "max": 128.76, + "n": 3 + }, + "total_ms": { + "min": 2461.3, + "median": 2479.5, + "max": 2500.0, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/steel141/gemma4-26b/short.json b/scripts/gpu-bakeoff/runs/steel141/gemma4-26b/short.json new file mode 100644 index 0000000..d0d87ed --- /dev/null +++ b/scripts/gpu-bakeoff/runs/steel141/gemma4-26b/short.json @@ -0,0 +1,81 @@ +{ + "host": "steel141", + "gpu": "RTX 3090 Ti", + "vram_gb": 24, + "model_alias": "gemma4:26b", + "model_tag": "gemma4:26b", + "prompt_key": "short", + "prompt_chars": 78, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 27, + "prompt_eval_ms": 19.8, + "prefill_tok_per_s": 1366.3, + "output_tokens": 256, + "eval_ms": 2089.1, + "decode_tok_per_s": 122.54, + "load_ms": 361.8, 
+ "total_ms": 2614.4, + "harness_wall_s": 2.617, + "done_reason": "length" + }, + { + "prompt_tokens": 27, + "prompt_eval_ms": 12.9, + "prefill_tok_per_s": 2088.79, + "output_tokens": 256, + "eval_ms": 1996.8, + "decode_tok_per_s": 128.2, + "load_ms": 341.7, + "total_ms": 2489.2, + "harness_wall_s": 2.491, + "done_reason": "length" + }, + { + "prompt_tokens": 27, + "prompt_eval_ms": 13.1, + "prefill_tok_per_s": 2062.75, + "output_tokens": 256, + "eval_ms": 1995.2, + "decode_tok_per_s": 128.31, + "load_ms": 330.7, + "total_ms": 2473.9, + "harness_wall_s": 2.476, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 27, + "prompt_eval_ms": 47.7, + "prefill_tok_per_s": 566.39, + "output_tokens": 256, + "eval_ms": 2014.5, + "decode_tok_per_s": 127.08, + "load_ms": 4346.8, + "total_ms": 6739.3, + "harness_wall_s": 6.752, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 1366.3, + "median": 2062.75, + "max": 2088.79, + "n": 3 + }, + "decode_tok_per_s": { + "min": 122.54, + "median": 128.2, + "max": 128.31, + "n": 3 + }, + "total_ms": { + "min": 2473.9, + "median": 2489.2, + "max": 2614.4, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/steel141/gemma4-31b/long.json b/scripts/gpu-bakeoff/runs/steel141/gemma4-31b/long.json new file mode 100644 index 0000000..c1789e3 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/steel141/gemma4-31b/long.json @@ -0,0 +1,81 @@ +{ + "host": "steel141", + "gpu": "RTX 3090 Ti", + "vram_gb": 24, + "model_alias": "gemma4:31b", + "model_tag": "gemma4:31b-it-q4_K_M", + "prompt_key": "long", + "prompt_chars": 1614, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 318, + "prompt_eval_ms": 38.2, + "prefill_tok_per_s": 8318.85, + "output_tokens": 256, + "eval_ms": 9390.5, + "decode_tok_per_s": 27.26, + "load_ms": 317.4, + "total_ms": 9886.3, + "harness_wall_s": 9.89, + "done_reason": "length" + }, + { + "prompt_tokens": 318, + "prompt_eval_ms": 
42.7, + "prefill_tok_per_s": 7454.7, + "output_tokens": 256, + "eval_ms": 9429.0, + "decode_tok_per_s": 27.15, + "load_ms": 316.0, + "total_ms": 9929.8, + "harness_wall_s": 9.933, + "done_reason": "length" + }, + { + "prompt_tokens": 318, + "prompt_eval_ms": 41.2, + "prefill_tok_per_s": 7716.07, + "output_tokens": 256, + "eval_ms": 9477.4, + "decode_tok_per_s": 27.01, + "load_ms": 334.3, + "total_ms": 9996.2, + "harness_wall_s": 10.0, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 318, + "prompt_eval_ms": 967.7, + "prefill_tok_per_s": 328.62, + "output_tokens": 256, + "eval_ms": 9339.6, + "decode_tok_per_s": 27.41, + "load_ms": 324.2, + "total_ms": 10774.3, + "harness_wall_s": 10.778, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 7454.7, + "median": 7716.07, + "max": 8318.85, + "n": 3 + }, + "decode_tok_per_s": { + "min": 27.01, + "median": 27.15, + "max": 27.26, + "n": 3 + }, + "total_ms": { + "min": 9886.3, + "median": 9929.8, + "max": 9996.2, + "n": 3 + } + } +} \ No newline at end of file diff --git a/scripts/gpu-bakeoff/runs/steel141/gemma4-31b/short.json b/scripts/gpu-bakeoff/runs/steel141/gemma4-31b/short.json new file mode 100644 index 0000000..9d3b686 --- /dev/null +++ b/scripts/gpu-bakeoff/runs/steel141/gemma4-31b/short.json @@ -0,0 +1,81 @@ +{ + "host": "steel141", + "gpu": "RTX 3090 Ti", + "vram_gb": 24, + "model_alias": "gemma4:31b", + "model_tag": "gemma4:31b-it-q4_K_M", + "prompt_key": "short", + "prompt_chars": 78, + "num_predict": 256, + "num_ctx": 4096, + "runs": [ + { + "prompt_tokens": 27, + "prompt_eval_ms": 44.1, + "prefill_tok_per_s": 611.75, + "output_tokens": 256, + "eval_ms": 9189.5, + "decode_tok_per_s": 27.86, + "load_ms": 373.7, + "total_ms": 9759.8, + "harness_wall_s": 9.762, + "done_reason": "length" + }, + { + "prompt_tokens": 27, + "prompt_eval_ms": 40.4, + "prefill_tok_per_s": 668.59, + "output_tokens": 256, + "eval_ms": 9115.3, + "decode_tok_per_s": 28.08, + "load_ms": 340.5, 
+ "total_ms": 9635.7, + "harness_wall_s": 9.638, + "done_reason": "length" + }, + { + "prompt_tokens": 27, + "prompt_eval_ms": 40.9, + "prefill_tok_per_s": 660.95, + "output_tokens": 256, + "eval_ms": 9123.7, + "decode_tok_per_s": 28.06, + "load_ms": 325.8, + "total_ms": 9626.6, + "harness_wall_s": 9.629, + "done_reason": "length" + } + ], + "warmup": { + "prompt_tokens": 27, + "prompt_eval_ms": 139.6, + "prefill_tok_per_s": 193.44, + "output_tokens": 256, + "eval_ms": 9190.0, + "decode_tok_per_s": 27.86, + "load_ms": 13817.9, + "total_ms": 23488.4, + "harness_wall_s": 23.491, + "done_reason": "length" + }, + "summary": { + "prefill_tok_per_s": { + "min": 611.75, + "median": 660.95, + "max": 668.59, + "n": 3 + }, + "decode_tok_per_s": { + "min": 27.86, + "median": 28.06, + "max": 28.08, + "n": 3 + }, + "total_ms": { + "min": 9626.6, + "median": 9635.7, + "max": 9759.8, + "n": 3 + } + } +} \ No newline at end of file