"""GPU bakeoff harness — Gemma 4 throughput across heterogeneous GPUs. Measures prefill rate, decode rate, load time, and wall-clock across three hosts: - steel141 : RTX 3090 Ti (24 GB GDDR6X, compute 8.6, ~1008 GB/s) - pve197 : Tesla V100-PCIE-32GB (32 GB HBM2, compute 7.0, ~900 GB/s) - matt-strix: AMD Strix Halo iGPU (shared LPDDR5X, ~256 GB/s) Per (host, model, prompt_length), runs 1 warmup + N measurement runs, records Ollama's canonical timing fields, and writes one JSON trace to `runs///.json`. All three Ollama servers are polled via HTTP; no SSH required. All timings come from Ollama's own /api/generate response fields so wall- clock jitter between the harness and the server is excluded. Invocation: python3 harness.py --host steel141 --model gemma4:26b --prompt short python3 harness.py all # runs the full planned matrix """ from __future__ import annotations import argparse import json import sys import time import urllib.request from pathlib import Path HOSTS = { "steel141": {"url": "http://127.0.0.1:11434", "gpu": "RTX 3090 Ti", "vram_gb": 24}, "pve197": {"url": "http://192.168.0.179:11434", "gpu": "Tesla V100-PCIE-32GB", "vram_gb": 32}, "matt-strix": {"url": "http://100.117.155.64:11434", "gpu": "AMD Strix Halo iGPU", "vram_gb": None}, } # Per-host model tag mapping. matt-strix uses gemma4:31b, the others # use gemma4:31b-it-q4_K_M — identical weights, different tags. MODEL_ALIASES = { "gemma4:26b": {"steel141": "gemma4:26b", "pve197": "gemma4:26b", "matt-strix": "gemma4:26b"}, "gemma4:31b": {"steel141": "gemma4:31b-it-q4_K_M", "pve197": "gemma4:31b-it-q4_K_M", "matt-strix": "gemma4:31b"}, # V100-only edge case — only 32 GB host has headroom for the Q8 MoE. "gemma4:26b-q8": {"pve197": "gemma4:26b-a4b-it-q8_0"}, } PROMPTS = { "short": "Write exactly one sentence summarizing how a transformer language model works.", "long": ( "You are reviewing a short technical passage and must produce a concise summary.\n\n" "Passage:\n" "Modern large language models are trained using a combination of self-supervised " "pretraining on vast text corpora and subsequent instruction-tuning on curated " "prompt-response pairs. The pretraining stage exposes the model to diverse writing " "styles, factual information, and reasoning patterns, but leaves it largely unaware " "of how to follow user instructions. Instruction-tuning, typically via supervised " "fine-tuning (SFT) followed by a preference-optimization stage such as Direct " "Preference Optimization (DPO) or Reinforcement Learning from Human Feedback (RLHF), " "aligns the model's behavior with human expectations. This two-stage recipe — " "massive pretraining plus alignment — has become the dominant paradigm for open " "and closed foundation models alike. Variants exist: some models add a midtraining " "stage between the two for curriculum or skill rebalancing; others use constitutional " "methods or reinforcement learning with verifiable rewards. For specialized domains " "like code or math, domain-specific SFT datasets and reward models are commonly " "layered on top of a general-purpose base. Throughout the process, the model's " "parameters remain fixed in architecture but shift substantially in value, with " "alignment stages typically touching a small fraction of the parameter space " "compared to the changes induced by pretraining.\n\n" "Task: Summarize the passage above in exactly three sentences, covering (1) what " "pretraining does, (2) what instruction-tuning does, and (3) why both stages are " "necessary in modern LLM recipes." ), } def _gen(url: str, model: str, prompt: str, num_predict: int, num_ctx: int, keep_alive: str) -> dict: """Single /api/generate call, stream=False, greedy decoding.""" payload = { "model": model, "prompt": prompt, "stream": False, "options": { "num_ctx": num_ctx, "num_predict": num_predict, "temperature": 0.0, "top_k": 1, }, "keep_alive": keep_alive, } req = urllib.request.Request( f"{url}/api/generate", data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"}, ) t0 = time.time() with urllib.request.urlopen(req, timeout=600) as r: d = json.loads(r.read()) d["_harness_wall_s"] = round(time.time() - t0, 3) return d def _metrics(d: dict) -> dict: """Extract canonical rates from Ollama's response. Fields (all nanoseconds unless noted): total_duration — end-to-end, including load load_duration — time to load model into memory prompt_eval_count — input tokens prompt_eval_duration — time to prefill eval_count — output tokens eval_duration — time to decode """ pec = d.get("prompt_eval_count") or 0 ped = d.get("prompt_eval_duration") or 0 ec = d.get("eval_count") or 0 ed = d.get("eval_duration") or 0 total = d.get("total_duration") or 0 load = d.get("load_duration") or 0 prefill_rate = (pec / (ped / 1e9)) if ped else None decode_rate = (ec / (ed / 1e9)) if ed else None return { "prompt_tokens": pec, "prompt_eval_ms": round(ped / 1e6, 1) if ped else None, "prefill_tok_per_s": round(prefill_rate, 2) if prefill_rate else None, "output_tokens": ec, "eval_ms": round(ed / 1e6, 1) if ed else None, "decode_tok_per_s": round(decode_rate, 2) if decode_rate else None, "load_ms": round(load / 1e6, 1) if load else None, "total_ms": round(total / 1e6, 1) if total else None, "harness_wall_s": d.get("_harness_wall_s"), "done_reason": d.get("done_reason"), } def run_matrix( host: str, model_alias: str, prompt_key: str, num_predict: int = 256, num_ctx: int = 4096, runs: int = 3, ) -> dict: host_cfg = HOSTS[host] model_tag = MODEL_ALIASES[model_alias].get(host) if not model_tag: return {"host": host, "model_alias": model_alias, "skipped": "model not available on host"} prompt = PROMPTS[prompt_key] url = host_cfg["url"] trace = { "host": host, "gpu": host_cfg["gpu"], "vram_gb": host_cfg["vram_gb"], "model_alias": model_alias, "model_tag": model_tag, "prompt_key": prompt_key, "prompt_chars": len(prompt), "num_predict": num_predict, "num_ctx": num_ctx, "runs": [], "warmup": None, } # Warmup — discarded. First call absorbs model load time. try: w = _gen(url, model_tag, prompt, num_predict=num_predict, num_ctx=num_ctx, keep_alive="10m") trace["warmup"] = _metrics(w) except Exception as e: trace["error"] = f"warmup failed: {e}" return trace # Measurement runs. for i in range(runs): try: r = _gen(url, model_tag, prompt, num_predict=num_predict, num_ctx=num_ctx, keep_alive="10m") trace["runs"].append(_metrics(r)) except Exception as e: trace["runs"].append({"error": str(e)}) # Aggregate. valid = [r for r in trace["runs"] if r.get("decode_tok_per_s") is not None] if valid: def _vals(k): return [r[k] for r in valid if r.get(k) is not None] def _stats(xs): if not xs: return None s = sorted(xs) return {"min": s[0], "median": s[len(s)//2], "max": s[-1], "n": len(s)} trace["summary"] = { "prefill_tok_per_s": _stats(_vals("prefill_tok_per_s")), "decode_tok_per_s": _stats(_vals("decode_tok_per_s")), "total_ms": _stats(_vals("total_ms")), } return trace def _run_one(host: str, model: str, prompt: str, out_dir: Path, runs: int) -> None: t = run_matrix(host, model, prompt, runs=runs) safe_model = model.replace(":", "-").replace("/", "-") path = out_dir / host / safe_model / f"{prompt}.json" path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(t, indent=2)) s = t.get("summary") or {} dec = s.get("decode_tok_per_s") or {} pre = s.get("prefill_tok_per_s") or {} skipped = t.get("skipped") or t.get("error") if skipped: print(f"[{host:10s}] {model:16s} {prompt:6s} — {skipped}") else: print(f"[{host:10s}] {model:16s} {prompt:6s} — " f"prefill={pre.get('median','?'):>7} tok/s " f"decode={dec.get('median','?'):>6} tok/s") def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--host", choices=list(HOSTS) + ["all"], default="all") ap.add_argument("--model", choices=list(MODEL_ALIASES) + ["all"], default="all") ap.add_argument("--prompt", choices=list(PROMPTS) + ["all"], default="all") ap.add_argument("--runs", type=int, default=3) ap.add_argument("--out-dir", type=Path, default=Path(__file__).parent / "runs") args = ap.parse_args() hosts = list(HOSTS) if args.host == "all" else [args.host] models = list(MODEL_ALIASES) if args.model == "all" else [args.model] prompts = list(PROMPTS) if args.prompt == "all" else [args.prompt] for host in hosts: for model in models: for prompt in prompts: _run_one(host, model, prompt, args.out_dir, args.runs) return 0 if __name__ == "__main__": sys.exit(main())