91842f30cb
- Rename host alias matt-strix -> strix-halo (removes third-party name) - Move host URLs to env-var lookup (OLLAMA_*_URL), drop hardcoded IPs from harness source. Defaults: steel141 keeps localhost; pve197 and strix-halo require their env var to be set before use. - Update doc: remove the Tailscale IP and LAN-IP references, describe access paths without specific addresses. - Rename runs/matt-strix -> runs/strix-halo and patch the host field in each JSON. Harness still functional for the original author (set the env vars) and safe to share without leaking routable addresses. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
265 lines
10 KiB
Python
265 lines
10 KiB
Python
"""GPU bakeoff harness — Gemma 4 throughput across heterogeneous GPUs.
|
|
|
|
Measures prefill rate, decode rate, load time, and wall-clock across
|
|
three hosts:
|
|
|
|
- steel141 : RTX 3090 Ti (24 GB GDDR6X, compute 8.6, ~1008 GB/s)
|
|
- pve197 : Tesla V100-PCIE-32GB (32 GB HBM2, compute 7.0, ~900 GB/s)
|
|
- strix-halo: AMD Strix Halo iGPU (shared LPDDR5X, ~256 GB/s)
|
|
|
|
Per (host, model, prompt_length), runs 1 warmup + N measurement runs,
|
|
records Ollama's canonical timing fields, and writes one JSON trace to
|
|
`runs/<host>/<model>/<prompt_len>.json`.
|
|
|
|
All three Ollama servers are polled via HTTP; no SSH required. All
|
|
timings come from Ollama's own /api/generate response fields so wall-
|
|
clock jitter between the harness and the server is excluded.
|
|
|
|
Host URLs are resolved from environment variables so routable addresses
|
|
don't live in source. Set these before running against non-local hosts:
|
|
|
|
OLLAMA_STEEL141_URL=http://127.0.0.1:11434
|
|
OLLAMA_PVE197_URL=http://<lan-ip>:11434
|
|
OLLAMA_STRIX_URL=http://<tailscale-ip>:11434
|
|
|
|
Invocation:
|
|
python3 harness.py --host steel141 --model gemma4:26b --prompt short
|
|
python3 harness.py  # no flags: --host/--model/--prompt all default to "all", i.e. the full planned matrix
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
|
|
# Benchmark targets, keyed by the host alias used on the command line and in
# output paths. Each entry carries:
#   url_env     - name of the environment variable that supplies the base URL
#   default_url - fallback when that variable is unset (None = no fallback)
#   gpu         - label copied into every trace
#   vram_gb     - memory figure copied into every trace (None where not given)
HOSTS = {
    "steel141": {
        "url_env": "OLLAMA_STEEL141_URL",
        "default_url": "http://127.0.0.1:11434",
        "gpu": "RTX 3090 Ti",
        "vram_gb": 24,
    },
    "pve197": {
        "url_env": "OLLAMA_PVE197_URL",
        "default_url": None,
        "gpu": "Tesla V100-PCIE-32GB",
        "vram_gb": 32,
    },
    "strix-halo": {
        "url_env": "OLLAMA_STRIX_URL",
        "default_url": None,
        "gpu": "AMD Strix Halo iGPU",
        "vram_gb": None,
    },
}
|
|
|
|
|
|
def _host_url(host: str) -> str:
    """Resolve the Ollama base URL for *host*.

    The environment variable named by the host's ``url_env`` entry takes
    precedence; if it is unset, empty, or only whitespace, the host's
    ``default_url`` is used instead.

    The result is normalised so callers can append a path such as
    ``/api/generate`` directly: surrounding whitespace is dropped from the
    environment value and trailing slashes are removed, so a value like
    ``http://host:11434/`` does not turn into ``http://host:11434//api/...``.

    Args:
        host: A key of ``HOSTS``.

    Returns:
        The base URL without a trailing slash.

    Raises:
        KeyError: *host* is not a key of ``HOSTS``.
        RuntimeError: neither the environment nor the default provides a URL.
    """
    cfg = HOSTS[host]
    from_env = os.environ.get(cfg["url_env"], "").strip()
    url = (from_env or cfg["default_url"] or "").rstrip("/")
    if not url:
        raise RuntimeError(
            f"host {host!r} has no URL — set ${cfg['url_env']} in env"
        )
    return url
|
|
|
|
|
|
# Maps a harness-level model alias to the Ollama tag to request on each host.
# A host that is absent from an alias's mapping does not serve that model.
# For the 31b alias, strix-halo uses gemma4:31b while the others use
# gemma4:31b-it-q4_K_M — identical weights, different tags.
MODEL_ALIASES = {
    # Same tag everywhere.
    "gemma4:26b": dict.fromkeys(("steel141", "pve197", "strix-halo"), "gemma4:26b"),
    "gemma4:31b": {
        "steel141": "gemma4:31b-it-q4_K_M",
        "pve197": "gemma4:31b-it-q4_K_M",
        "strix-halo": "gemma4:31b",
    },
    # V100-only edge case — only 32 GB host has headroom for the Q8 MoE.
    "gemma4:26b-q8": {
        "pve197": "gemma4:26b-a4b-it-q8_0",
    },
}
|
|
|
|
|
|
# Benchmark prompts, keyed by the name accepted by --prompt and used as the
# output file stem. "short" is a single instruction; "long" embeds a passage
# followed by a summarisation task.
PROMPTS = {
    "short": "Write exactly one sentence summarizing how a transformer language model works.",
    "long": "".join((
        "You are reviewing a short technical passage and must produce a concise summary.\n\n",
        "Passage:\n",
        "Modern large language models are trained using a combination of self-supervised ",
        "pretraining on vast text corpora and subsequent instruction-tuning on curated ",
        "prompt-response pairs. The pretraining stage exposes the model to diverse writing ",
        "styles, factual information, and reasoning patterns, but leaves it largely unaware ",
        "of how to follow user instructions. Instruction-tuning, typically via supervised ",
        "fine-tuning (SFT) followed by a preference-optimization stage such as Direct ",
        "Preference Optimization (DPO) or Reinforcement Learning from Human Feedback (RLHF), ",
        "aligns the model's behavior with human expectations. This two-stage recipe — ",
        "massive pretraining plus alignment — has become the dominant paradigm for open ",
        "and closed foundation models alike. Variants exist: some models add a midtraining ",
        "stage between the two for curriculum or skill rebalancing; others use constitutional ",
        "methods or reinforcement learning with verifiable rewards. For specialized domains ",
        "like code or math, domain-specific SFT datasets and reward models are commonly ",
        "layered on top of a general-purpose base. Throughout the process, the model's ",
        "parameters remain fixed in architecture but shift substantially in value, with ",
        "alignment stages typically touching a small fraction of the parameter space ",
        "compared to the changes induced by pretraining.\n\n",
        "Task: Summarize the passage above in exactly three sentences, covering (1) what ",
        "pretraining does, (2) what instruction-tuning does, and (3) why both stages are ",
        "necessary in modern LLM recipes.",
    )),
}
|
|
|
|
|
|
def _gen(url: str, model: str, prompt: str, num_predict: int, num_ctx: int, keep_alive: str,
         timeout: float = 600) -> dict:
    """Single /api/generate call, stream=False, greedy decoding.

    Args:
        url: Base URL of the server; ``/api/generate`` is appended to it.
        model: Model tag placed in the request's ``model`` field.
        prompt: Prompt text to send.
        num_predict: Value sent as ``options.num_predict``.
        num_ctx: Value sent as ``options.num_ctx``.
        keep_alive: Value sent as the request's ``keep_alive`` field.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON response body, with one extra key
        ``_harness_wall_s``: the client-side elapsed seconds for the request
        and body read, rounded to milliseconds.

    Raises:
        urllib.error.URLError: the request fails or the server answers with
            an HTTP error status.
        json.JSONDecodeError: the response body is not valid JSON.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "num_ctx": num_ctx,
            "num_predict": num_predict,
            # temperature 0 + top_k 1 pins sampling to the single best token.
            "temperature": 0.0,
            "top_k": 1,
        },
        "keep_alive": keep_alive,
    }
    req = urllib.request.Request(
        f"{url}/api/generate",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments that happen during a long generation.
    started = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        body = json.loads(resp.read())
    body["_harness_wall_s"] = round(time.perf_counter() - started, 3)
    return body
|
|
|
|
|
|
def _metrics(d: dict) -> dict:
|
|
"""Extract canonical rates from Ollama's response.
|
|
|
|
Fields (all nanoseconds unless noted):
|
|
total_duration — end-to-end, including load
|
|
load_duration — time to load model into memory
|
|
prompt_eval_count — input tokens
|
|
prompt_eval_duration — time to prefill
|
|
eval_count — output tokens
|
|
eval_duration — time to decode
|
|
"""
|
|
pec = d.get("prompt_eval_count") or 0
|
|
ped = d.get("prompt_eval_duration") or 0
|
|
ec = d.get("eval_count") or 0
|
|
ed = d.get("eval_duration") or 0
|
|
total = d.get("total_duration") or 0
|
|
load = d.get("load_duration") or 0
|
|
|
|
prefill_rate = (pec / (ped / 1e9)) if ped else None
|
|
decode_rate = (ec / (ed / 1e9)) if ed else None
|
|
|
|
return {
|
|
"prompt_tokens": pec,
|
|
"prompt_eval_ms": round(ped / 1e6, 1) if ped else None,
|
|
"prefill_tok_per_s": round(prefill_rate, 2) if prefill_rate else None,
|
|
"output_tokens": ec,
|
|
"eval_ms": round(ed / 1e6, 1) if ed else None,
|
|
"decode_tok_per_s": round(decode_rate, 2) if decode_rate else None,
|
|
"load_ms": round(load / 1e6, 1) if load else None,
|
|
"total_ms": round(total / 1e6, 1) if total else None,
|
|
"harness_wall_s": d.get("_harness_wall_s"),
|
|
"done_reason": d.get("done_reason"),
|
|
}
|
|
|
|
|
|
def run_matrix(
|
|
host: str,
|
|
model_alias: str,
|
|
prompt_key: str,
|
|
num_predict: int = 256,
|
|
num_ctx: int = 4096,
|
|
runs: int = 3,
|
|
) -> dict:
|
|
host_cfg = HOSTS[host]
|
|
model_tag = MODEL_ALIASES[model_alias].get(host)
|
|
if not model_tag:
|
|
return {"host": host, "model_alias": model_alias, "skipped": "model not available on host"}
|
|
|
|
prompt = PROMPTS[prompt_key]
|
|
url = _host_url(host)
|
|
|
|
trace = {
|
|
"host": host,
|
|
"gpu": host_cfg["gpu"],
|
|
"vram_gb": host_cfg["vram_gb"],
|
|
"model_alias": model_alias,
|
|
"model_tag": model_tag,
|
|
"prompt_key": prompt_key,
|
|
"prompt_chars": len(prompt),
|
|
"num_predict": num_predict,
|
|
"num_ctx": num_ctx,
|
|
"runs": [],
|
|
"warmup": None,
|
|
}
|
|
|
|
# Warmup — discarded. First call absorbs model load time.
|
|
try:
|
|
w = _gen(url, model_tag, prompt, num_predict=num_predict, num_ctx=num_ctx, keep_alive="10m")
|
|
trace["warmup"] = _metrics(w)
|
|
except Exception as e:
|
|
trace["error"] = f"warmup failed: {e}"
|
|
return trace
|
|
|
|
# Measurement runs.
|
|
for i in range(runs):
|
|
try:
|
|
r = _gen(url, model_tag, prompt, num_predict=num_predict, num_ctx=num_ctx, keep_alive="10m")
|
|
trace["runs"].append(_metrics(r))
|
|
except Exception as e:
|
|
trace["runs"].append({"error": str(e)})
|
|
|
|
# Aggregate.
|
|
valid = [r for r in trace["runs"] if r.get("decode_tok_per_s") is not None]
|
|
if valid:
|
|
def _vals(k): return [r[k] for r in valid if r.get(k) is not None]
|
|
def _stats(xs):
|
|
if not xs: return None
|
|
s = sorted(xs)
|
|
return {"min": s[0], "median": s[len(s)//2], "max": s[-1], "n": len(s)}
|
|
trace["summary"] = {
|
|
"prefill_tok_per_s": _stats(_vals("prefill_tok_per_s")),
|
|
"decode_tok_per_s": _stats(_vals("decode_tok_per_s")),
|
|
"total_ms": _stats(_vals("total_ms")),
|
|
}
|
|
|
|
return trace
|
|
|
|
|
|
def _run_one(host: str, model: str, prompt: str, out_dir: Path, runs: int) -> None:
    """Run one matrix cell, write its trace to disk, and print a one-line result.

    The trace is written to ``<out_dir>/<host>/<model>/<prompt>.json`` with
    ``:`` and ``/`` in the model alias replaced by ``-``; parent directories
    are created as needed. A trace is written even when the cell was skipped
    or its warmup failed.

    Args:
        host: Host alias passed to ``run_matrix``.
        model: Model alias passed to ``run_matrix``.
        prompt: Prompt key passed to ``run_matrix`` (also the file stem).
        out_dir: Root directory for trace files.
        runs: Number of measurement runs.
    """
    t = run_matrix(host, model, prompt, runs=runs)

    safe_model = model.replace(":", "-").replace("/", "-")
    path = out_dir / host / safe_model / f"{prompt}.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(t, indent=2))

    skipped = t.get("skipped") or t.get("error")
    if skipped:
        print(f"[{host:10s}] {model:16s} {prompt:6s} — {skipped}")
        return

    s = t.get("summary") or {}
    dec = s.get("decode_tok_per_s") or {}
    pre = s.get("prefill_tok_per_s") or {}
    line = (f"[{host:10s}] {model:16s} {prompt:6s} — "
            f"prefill={pre.get('median','?'):>7} tok/s  "
            f"decode={dec.get('median','?'):>6} tok/s")

    # Failed measurement runs are only recorded inside the JSON trace, so
    # surface them here; otherwise a cell whose runs all failed would just
    # print "?" placeholders with no reason.
    all_runs = t.get("runs") or []
    failed = [r for r in all_runs if "error" in r]
    if failed:
        line += f"  ({len(failed)}/{len(all_runs)} runs failed: {failed[0]['error']})"
    print(line)
|
|
|
|
|
|
def main() -> int:
    """Parse command-line options and run the selected slice of the matrix.

    ``--host``, ``--model`` and ``--prompt`` each accept one known key or
    ``all`` (the default), so running with no flags executes every
    combination. Cells run sequentially, host by host.

    A host whose URL cannot be resolved is reported on stderr and skipped so
    that the remaining hosts still run, instead of aborting the whole matrix
    with a traceback part-way through.

    Returns:
        Process exit status: 0 when every selected host was runnable, 1 when
        at least one host was skipped for lack of a URL.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--host", choices=list(HOSTS) + ["all"], default="all")
    ap.add_argument("--model", choices=list(MODEL_ALIASES) + ["all"], default="all")
    ap.add_argument("--prompt", choices=list(PROMPTS) + ["all"], default="all")
    ap.add_argument("--runs", type=int, default=3)
    ap.add_argument("--out-dir", type=Path, default=Path(__file__).parent / "runs")
    args = ap.parse_args()

    hosts = list(HOSTS) if args.host == "all" else [args.host]
    models = list(MODEL_ALIASES) if args.model == "all" else [args.model]
    prompts = list(PROMPTS) if args.prompt == "all" else [args.prompt]

    status = 0
    for host in hosts:
        # Resolve the URL up front: it is per-host, so one check covers every
        # model/prompt cell for this host.
        try:
            _host_url(host)
        except RuntimeError as e:
            print(f"[{host:10s}] skipped — {e}", file=sys.stderr)
            status = 1
            continue
        for model in models:
            for prompt in prompts:
                _run_one(host, model, prompt, args.out_dir, args.runs)
    return status
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|