# gemma4-research/scripts/gpu-bakeoff/harness.py

"""GPU bakeoff harness — Gemma 4 throughput across heterogeneous GPUs.

Measures prefill rate, decode rate, load time, and wall-clock across
three hosts:

  - steel141  : RTX 3090 Ti (24 GB GDDR6X, compute 8.6, ~1008 GB/s)
  - pve197    : Tesla V100-PCIE-32GB (32 GB HBM2, compute 7.0, ~900 GB/s)
  - matt-strix: AMD Strix Halo iGPU (shared LPDDR5X, ~256 GB/s)

Per (host, model, prompt_length), runs 1 warmup + N measurement runs,
records Ollama's canonical timing fields, and writes one JSON trace to
`runs/<host>/<model>/<prompt_len>.json`.

All three Ollama servers are polled via HTTP; no SSH required. All
timings come from Ollama's own /api/generate response fields so wall-
clock jitter between the harness and the server is excluded.

Invocation:
    python3 harness.py --host steel141 --model gemma4:26b --prompt short
    python3 harness.py   # no flags: --host/--model/--prompt all default to "all" (full matrix)
"""

from __future__ import annotations

import argparse
import json
import statistics
import sys
import time
import urllib.request
from pathlib import Path


# Benchmark targets, keyed by host name (the values accepted by --host).
#   url     — base URL of that host's Ollama server; "/api/generate" is appended.
#   gpu     — human-readable GPU label, copied verbatim into each trace.
#   vram_gb — dedicated VRAM in GB, copied into each trace. None for matt-strix,
#             whose iGPU uses shared system memory rather than dedicated VRAM.
# NOTE(review): steel141 points at 127.0.0.1, so the harness presumably runs
# on steel141 itself — confirm before running it from another machine.
HOSTS = {
    "steel141":   {"url": "http://127.0.0.1:11434",       "gpu": "RTX 3090 Ti",           "vram_gb": 24},
    "pve197":     {"url": "http://192.168.0.179:11434",   "gpu": "Tesla V100-PCIE-32GB",  "vram_gb": 32},
    "matt-strix": {"url": "http://100.117.155.64:11434",  "gpu": "AMD Strix Halo iGPU",   "vram_gb": None},
}

# Per-host model tag mapping: alias (the value accepted by --model) ->
# {host name -> Ollama model tag to request on that host}.
# matt-strix uses gemma4:31b, the others use gemma4:31b-it-q4_K_M —
# identical weights, different tags.
# A host that is missing from an alias's inner dict is treated as "model not
# available on host" and that (host, model) combination is skipped.
MODEL_ALIASES = {
    "gemma4:26b":  {"steel141": "gemma4:26b",            "pve197": "gemma4:26b",            "matt-strix": "gemma4:26b"},
    "gemma4:31b":  {"steel141": "gemma4:31b-it-q4_K_M",  "pve197": "gemma4:31b-it-q4_K_M",  "matt-strix": "gemma4:31b"},
    # V100-only edge case — only 32 GB host has headroom for the Q8 MoE.
    "gemma4:26b-q8":  {"pve197": "gemma4:26b-a4b-it-q8_0"},
}


# Benchmark prompts, keyed by the name accepted by --prompt. The key is also
# used as the trace file name (<key>.json).
#   short — a single-sentence instruction.
#   long  — a multi-paragraph passage followed by a summarization task.
# The text is benchmark input: any edit changes the prompt token count and
# makes new traces incomparable with earlier ones.
PROMPTS = {
    "short": "Write exactly one sentence summarizing how a transformer language model works.",

    "long": (
        "You are reviewing a short technical passage and must produce a concise summary.\n\n"
        "Passage:\n"
        "Modern large language models are trained using a combination of self-supervised "
        "pretraining on vast text corpora and subsequent instruction-tuning on curated "
        "prompt-response pairs. The pretraining stage exposes the model to diverse writing "
        "styles, factual information, and reasoning patterns, but leaves it largely unaware "
        "of how to follow user instructions. Instruction-tuning, typically via supervised "
        "fine-tuning (SFT) followed by a preference-optimization stage such as Direct "
        "Preference Optimization (DPO) or Reinforcement Learning from Human Feedback (RLHF), "
        "aligns the model's behavior with human expectations. This two-stage recipe — "
        "massive pretraining plus alignment — has become the dominant paradigm for open "
        "and closed foundation models alike. Variants exist: some models add a midtraining "
        "stage between the two for curriculum or skill rebalancing; others use constitutional "
        "methods or reinforcement learning with verifiable rewards. For specialized domains "
        "like code or math, domain-specific SFT datasets and reward models are commonly "
        "layered on top of a general-purpose base. Throughout the process, the model's "
        "parameters remain fixed in architecture but shift substantially in value, with "
        "alignment stages typically touching a small fraction of the parameter space "
        "compared to the changes induced by pretraining.\n\n"
        "Task: Summarize the passage above in exactly three sentences, covering (1) what "
        "pretraining does, (2) what instruction-tuning does, and (3) why both stages are "
        "necessary in modern LLM recipes."
    ),
}


def _gen(
    url: str,
    model: str,
    prompt: str,
    num_predict: int,
    num_ctx: int,
    keep_alive: str,
    timeout: float = 600,
) -> dict:
    """Issue one blocking /api/generate call with greedy decoding.

    Args:
        url: Base URL of the Ollama server; "/api/generate" is appended.
        model: Model tag to request.
        prompt: Prompt text sent verbatim.
        num_predict: Value sent as the ``num_predict`` option.
        num_ctx: Value sent as the ``num_ctx`` option.
        keep_alive: Value sent as the request's ``keep_alive`` field.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        The decoded JSON response, with an added ``_harness_wall_s`` key
        holding the client-side elapsed seconds (rounded to milliseconds).

    Raises:
        urllib.error.URLError: The request could not be completed (includes
            HTTP error statuses via its HTTPError subclass).
        TimeoutError: The server did not answer within ``timeout`` seconds.
        json.JSONDecodeError: The response body was not valid JSON.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "num_ctx": num_ctx,
            "num_predict": num_predict,
            # temperature 0 + top_k 1 => greedy decoding, so repeated runs
            # are asked to generate the same way.
            "temperature": 0.0,
            "top_k": 1,
        },
        "keep_alive": keep_alive,
    }
    req = urllib.request.Request(
        f"{url}/api/generate",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic: unlike time.time() it cannot jump when the
    # system clock is adjusted (e.g. by NTP) in the middle of a long request.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        d = json.loads(r.read())
    d["_harness_wall_s"] = round(time.perf_counter() - t0, 3)
    return d


def _metrics(d: dict) -> dict:
    """Condense one Ollama /api/generate response into per-run figures.

    Response fields read (durations are nanoseconds):
      total_duration         — end-to-end, including load
      load_duration          — time to load model into memory
      prompt_eval_count      — input tokens
      prompt_eval_duration   — time to prefill
      eval_count             — output tokens
      eval_duration          — time to decode

    Durations are reported in milliseconds (1 decimal) and rates in tokens
    per second (2 decimals). A field that is absent, null, or zero counts as
    0; a zero duration or a zero rate is reported as None.
    """

    def _field(name):
        # Missing and null fields both collapse to 0.
        return d.get(name) or 0

    def _to_ms(ns):
        if not ns:
            return None
        return round(ns / 1e6, 1)

    def _tok_per_s(tokens, ns):
        if not ns:
            return None
        rate = tokens / (ns / 1e9)
        # A rate of exactly 0 (no tokens) is reported as None, not 0.0.
        if not rate:
            return None
        return round(rate, 2)

    prompt_tokens = _field("prompt_eval_count")
    prompt_ns = _field("prompt_eval_duration")
    output_tokens = _field("eval_count")
    eval_ns = _field("eval_duration")

    result = {}
    result["prompt_tokens"] = prompt_tokens
    result["prompt_eval_ms"] = _to_ms(prompt_ns)
    result["prefill_tok_per_s"] = _tok_per_s(prompt_tokens, prompt_ns)
    result["output_tokens"] = output_tokens
    result["eval_ms"] = _to_ms(eval_ns)
    result["decode_tok_per_s"] = _tok_per_s(output_tokens, eval_ns)
    result["load_ms"] = _to_ms(_field("load_duration"))
    result["total_ms"] = _to_ms(_field("total_duration"))
    result["harness_wall_s"] = d.get("_harness_wall_s")
    result["done_reason"] = d.get("done_reason")
    return result


def _summary_stats(xs: list) -> dict | None:
    """Return min / median / max / n for *xs*, or None when *xs* is empty."""
    if not xs:
        return None
    ordered = sorted(xs)
    return {
        "min": ordered[0],
        # A true median: for an even count this is the mean of the two middle
        # values rather than the upper one. For an odd count it is the middle
        # element itself. Rounded only to keep averaged values tidy in JSON.
        "median": round(statistics.median(ordered), 3),
        "max": ordered[-1],
        "n": len(ordered),
    }


def run_matrix(
    host: str,
    model_alias: str,
    prompt_key: str,
    num_predict: int = 256,
    num_ctx: int = 4096,
    runs: int = 3,
) -> dict:
    """Benchmark one (host, model, prompt) cell: 1 warmup + ``runs`` measured calls.

    Args:
        host: Key into HOSTS.
        model_alias: Key into MODEL_ALIASES; resolved to the host's model tag.
        prompt_key: Key into PROMPTS.
        num_predict: ``num_predict`` option passed on every call.
        num_ctx: ``num_ctx`` option passed on every call.
        runs: Number of measured calls made after the warmup.

    Returns:
        A trace dict. One of three shapes:
          - ``{"host", "model_alias", "skipped"}`` when the alias has no tag
            for this host;
          - the full trace with an ``"error"`` key and empty ``"runs"`` when
            the warmup call raised;
          - the full trace with ``"warmup"``, ``"runs"`` (one metrics dict or
            ``{"error": ...}`` per measured call) and, if at least one run
            produced a decode rate, a ``"summary"`` of min/median/max/n.

    Raises:
        KeyError: ``host``, ``model_alias`` or ``prompt_key`` is not a known key.
    """
    host_cfg = HOSTS[host]
    model_tag = MODEL_ALIASES[model_alias].get(host)
    if not model_tag:
        return {"host": host, "model_alias": model_alias, "skipped": "model not available on host"}

    prompt = PROMPTS[prompt_key]
    url = host_cfg["url"]

    trace = {
        "host": host,
        "gpu": host_cfg["gpu"],
        "vram_gb": host_cfg["vram_gb"],
        "model_alias": model_alias,
        "model_tag": model_tag,
        "prompt_key": prompt_key,
        "prompt_chars": len(prompt),
        "num_predict": num_predict,
        "num_ctx": num_ctx,
        "runs": [],
        "warmup": None,
    }

    # Warmup — kept in the trace but excluded from the summary. The first
    # call absorbs model load time. A failure here aborts the cell: nothing
    # useful can be measured if the model cannot be reached or loaded.
    # NOTE(review): the warmup sends the same prompt as the measured runs; if
    # the server reuses cached prompt state, measured prefill figures may
    # reflect cache hits rather than a cold prefill — confirm.
    try:
        w = _gen(url, model_tag, prompt, num_predict=num_predict, num_ctx=num_ctx, keep_alive="10m")
        trace["warmup"] = _metrics(w)
    except Exception as e:
        trace["error"] = f"warmup failed: {e}"
        return trace

    # Measurement runs. Best-effort: one failed call is recorded as an error
    # entry and the remaining runs still execute.
    for _ in range(runs):
        try:
            r = _gen(url, model_tag, prompt, num_predict=num_predict, num_ctx=num_ctx, keep_alive="10m")
            trace["runs"].append(_metrics(r))
        except Exception as e:
            trace["runs"].append({"error": str(e)})

    # Aggregate only runs that yielded a decode rate; error entries and runs
    # with no decode timing are ignored.
    valid = [r for r in trace["runs"] if r.get("decode_tok_per_s") is not None]
    if valid:
        def _vals(key):
            return [r[key] for r in valid if r.get(key) is not None]

        trace["summary"] = {
            "prefill_tok_per_s": _summary_stats(_vals("prefill_tok_per_s")),
            "decode_tok_per_s": _summary_stats(_vals("decode_tok_per_s")),
            "total_ms": _summary_stats(_vals("total_ms")),
        }

    return trace


def _run_one(host: str, model: str, prompt: str, out_dir: Path, runs: int) -> None:
    """Run one matrix cell, persist its trace, and print a one-line result.

    Args:
        host: Host name passed to run_matrix.
        model: Model alias passed to run_matrix; also used (with ":" and "/"
            replaced by "-") as a directory name.
        prompt: Prompt key passed to run_matrix; also the trace file stem.
        out_dir: Root directory; the trace is written to
            ``out_dir/<host>/<model>/<prompt>.json`` (parents are created).
        runs: Number of measured runs.

    The trace is always written, including for skipped or failed cells.
    """
    t = run_matrix(host, model, prompt, runs=runs)
    safe_model = model.replace(":", "-").replace("/", "-")
    path = out_dir / host / safe_model / f"{prompt}.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(t, indent=2))

    s = t.get("summary") or {}
    dec = s.get("decode_tok_per_s") or {}
    pre = s.get("prefill_tok_per_s") or {}

    problem = t.get("skipped") or t.get("error")
    if not problem and not s:
        # The warmup succeeded but no measured run produced a decode rate.
        # Without this branch the line would read "prefill=? decode=?" and
        # the failure would only be visible inside the JSON trace.
        run_errors = [r["error"] for r in t.get("runs", []) if r.get("error")]
        if run_errors:
            problem = f"no valid runs; first error: {run_errors[0]}"
        else:
            problem = "no valid measurements"

    if problem:
        print(f"[{host:10s}] {model:16s} {prompt:6s} — {problem}")
    else:
        print(f"[{host:10s}] {model:16s} {prompt:6s} — "
              f"prefill={pre.get('median','?'):>7} tok/s  "
              f"decode={dec.get('median','?'):>6} tok/s")


def main() -> int:
    """Parse command-line flags and run every selected matrix cell.

    --host, --model and --prompt each accept one known key or "all" (the
    default), so invoking the script with no flags runs the full matrix.
    Cells run sequentially, hosts outermost and prompts innermost.

    Returns:
        0, used as the process exit status.
    """
    parser = argparse.ArgumentParser()
    selectors = (("--host", HOSTS), ("--model", MODEL_ALIASES), ("--prompt", PROMPTS))
    for flag, table in selectors:
        parser.add_argument(flag, choices=list(table) + ["all"], default="all")
    parser.add_argument("--runs", type=int, default=3)
    parser.add_argument("--out-dir", type=Path, default=Path(__file__).parent / "runs")
    opts = parser.parse_args()

    def _expand(choice, table):
        # "all" means every key of the table, in definition order.
        if choice == "all":
            return list(table)
        return [choice]

    cells = [
        (h, m, p)
        for h in _expand(opts.host, HOSTS)
        for m in _expand(opts.model, MODEL_ALIASES)
        for p in _expand(opts.prompt, PROMPTS)
    ]
    for h, m, p in cells:
        _run_one(h, m, p, opts.out_dir, opts.runs)
    return 0


# Script entry point: run main() only when executed directly (not on import)
# and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())