Phase 2: eval harness, 182 examples, live bake-off, playtest infrastructure

- Expanded dataset from 31 to 182 examples (45 manual + 106 extracted from server logs)
- Built eval/harness.py with per-category breakdowns and baseline tracking
- Built eval/live_bakeoff.py for RCON-verified model comparison on live server
- Extracted training data from prayer logs, sudo logs, and bug reports on CT 644
- Added Reddit post draft and modmail for playtester recruitment
- Updated server context: all servers now online-mode=false + whitelist
- Updated PLAN.md with Phase 2 progress

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 13:38:12 -04:00
parent eaa9e0c26b
commit 38b9a02e45
10 changed files with 1522 additions and 31 deletions
@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+"""
+Evaluation Harness: Structured scoring for Minecraft ops assistant models.
+
+Runs a model against the full dataset, scores on multiple metrics with
+per-category breakdowns, saves results, and optionally compares against
+a saved baseline.
+
+Usage:
+    python3 eval/harness.py                              # eval default model
+    python3 eval/harness.py --model qwen3:8b             # eval specific model
+    python3 eval/harness.py --baseline results/baseline.json  # compare to baseline
+    python3 eval/harness.py --save-baseline               # save as the new baseline
+    python3 eval/harness.py --category command_gen        # eval only one category
+"""
+
+import argparse
+import json
+import re
+import sys
+import time
+from collections import defaultdict
+from pathlib import Path
+
+import requests
+
+# Directory two levels above this file; prepended to sys.path so the
+# `agent.*` imports below resolve (presumably for running this file as a script).
+ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(ROOT))
+
+from agent.prompts.system_prompts import get_prompt
+from agent.guardrails.command_filter import validate_command
+
+# JSONL dataset read by run_eval (one JSON example per non-blank line).
+DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
+# Directory where main() writes per-run result files.
+RESULTS_DIR = ROOT / "eval" / "results"
+# Default baseline file: compared against when present, rewritten by --save-baseline.
+BASELINE_PATH = RESULTS_DIR / "baseline.json"
+
+
+# --- Ollama API ---
+
+def ollama_chat(model: str, messages: list, ollama_url: str,
+                temperature: float = 0.2, max_tokens: int = 1500) -> dict:
+    """Send one non-streaming chat request to Ollama and time it.
+
+    Args:
+        model: Ollama model tag, e.g. ``"qwen3:8b"``.
+        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
+        ollama_url: Base URL of the Ollama server; a trailing slash is tolerated.
+        temperature: Sampling temperature passed in ``options``.
+        max_tokens: Generation cap, sent as ``options.num_predict``.
+
+    Returns:
+        Dict with ``content`` (the assistant message text), ``duration_ms``
+        (client-side round-trip time), ``eval_count``, ``prompt_eval_count``
+        and ``done_reason`` (the last three default to 0 / 0 / "" when the
+        server omits them).
+
+    Raises:
+        requests.RequestException: On connection errors, timeouts (180 s) or
+            a non-2xx HTTP status.
+    """
+    payload = {
+        "model": model,
+        "messages": messages,
+        "stream": False,
+        "format": "json",
+        "options": {
+            "temperature": temperature,
+            "num_predict": max_tokens,
+        },
+    }
+    # perf_counter is monotonic, so the measured latency cannot be skewed
+    # (or go negative) if the wall clock is adjusted during the request.
+    start = time.perf_counter()
+    r = requests.post(f"{ollama_url.rstrip('/')}/api/chat", json=payload, timeout=180)
+    r.raise_for_status()
+    duration_ms = int((time.perf_counter() - start) * 1000)
+    data = r.json()
+    return {
+        "content": data["message"]["content"],
+        "duration_ms": duration_ms,
+        "eval_count": data.get("eval_count", 0),
+        "prompt_eval_count": data.get("prompt_eval_count", 0),
+        "done_reason": data.get("done_reason", ""),
+    }
+
+
+def parse_response(content: str) -> dict:
+    """Parse the model's JSON reply into a dict with a usable ``commands`` list.
+
+    Always returns a dict whose ``"commands"`` value is a list of strings, so
+    callers can iterate it without further type checks:
+
+    * A JSON object is returned as-is, except that ``commands`` is normalized
+      (a single string becomes a one-item list; ``null``, a missing key or any
+      other non-list becomes ``[]``; non-string items are dropped).
+    * A top-level JSON array is treated as the command list.
+    * Any other valid JSON value yields an empty fallback result.
+    * Invalid JSON (or ``None``) falls back to collecting every double-quoted
+      token that starts with a word character or ``/``. This is deliberately
+      loose and may also pick up JSON keys.
+
+    Fallback results carry ``"reasoning": "parse_fallback"``.
+    """
+    try:
+        parsed = json.loads(content)
+    except (json.JSONDecodeError, TypeError):
+        # TypeError covers content=None; treat it like unparseable text.
+        cmds = re.findall(r'"(/?\w[^"]*)"', content or "")
+        return {"commands": cmds, "message": "", "reasoning": "parse_fallback"}
+
+    if isinstance(parsed, list):
+        parsed = {"commands": parsed, "message": "", "reasoning": "parse_fallback"}
+    elif not isinstance(parsed, dict):
+        # Bare scalar (number, string, null, bool): nothing usable in it.
+        return {"commands": [], "message": "", "reasoning": "parse_fallback"}
+
+    cmds = parsed.get("commands")
+    if isinstance(cmds, str):
+        cmds = [cmds]
+    elif not isinstance(cmds, list):
+        cmds = []
+    parsed["commands"] = [c for c in cmds if isinstance(c, str)]
+    return parsed
+
+
+# --- Message Building ---
+
+def build_user_message(example: dict) -> str:
+    """Render a dataset example as the user-turn text sent to the model.
+
+    The message starts with the request line (attributed to the fixed player
+    name ``slingshooter08``), followed by a context section with the server
+    type/version and, when present in ``input.server_context``, the online
+    player list and the player's position.
+
+    Args:
+        example: Dataset record; ``example["input"]["user_message"]`` is
+            required, ``example["input"]["server_context"]`` is optional and
+            may be ``null``.
+
+    Returns:
+        The newline-joined message text.
+    """
+    inp = example["input"]
+    query = inp["user_message"]
+    # `or {}` also covers an explicit `"server_context": null` in the JSONL,
+    # which .get() alone would hand back as None.
+    ctx = inp.get("server_context") or {}
+
+    parts = [f"Request from slingshooter08: {query}"]
+    parts.append(f"\nContext:\nServer: {ctx.get('server_type', 'paper')} {ctx.get('version', '1.21.x')}")
+
+    if ctx.get("online_players"):
+        parts.append(f"Online: {', '.join(ctx['online_players'])}")
+
+    pos = ctx.get("player_position")
+    if pos:
+        parts.append(f"Player position: ({pos['x']}, {pos['y']}, {pos['z']})")
+
+    return "\n".join(parts)
+
+
+def determine_mode(example: dict) -> str:
+    """Pick the system-prompt mode for an example.
+
+    Returns ``"god"`` when the user message starts with ``"pray "``
+    (case-insensitive), ``"god_system"`` when the example id starts with
+    ``"negative-"`` and the message mentions "god", and ``"sudo"`` otherwise.
+    """
+    text = example["input"]["user_message"].lower()
+    if text.startswith("pray "):
+        return "god"
+
+    is_negative = example.get("id", "").startswith("negative-")
+    return "god_system" if is_negative and "god" in text else "sudo"
+
+
+# --- Scoring ---
+
+def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
+    """Score one model response against the example's expected output.
+
+    Args:
+        example: Dataset record; reads ``output.commands``,
+            ``output.safety_flags``, ``output.message``, ``category`` and
+            ``input.user_message``.
+        actual_cmds: Command strings produced by the model.
+        parsed: The parsed model response (only ``message`` is read here).
+
+    Returns:
+        Dict of metrics:
+        ``cmd_match`` (expected and actual share at least one command verb, or
+        both are empty), ``exact_match`` (same set of stripped commands),
+        ``syntax_ok`` / ``syntax_issues`` (heuristic checks plus guardrail
+        warnings), ``safety_ok``, ``has_gratuitous_tp``, ``has_message``,
+        ``expects_message``, ``is_empty`` and ``hallucinated``.
+    """
+    expected = example["output"]
+    expected_cmds = expected.get("commands", [])
+    expected_safety = expected.get("safety_flags", [])
+    category = example.get("category", "?")
+
+    def command_types(cmds):
+        # First token of each command without a leading slash. Blank or
+        # whitespace-only entries have no first token and are skipped
+        # (indexing split()[0] on them would raise IndexError).
+        types = set()
+        for c in cmds:
+            tokens = c.split() if c else []
+            if tokens:
+                types.add(tokens[0].lstrip("/"))
+        return types
+
+    # --- Command Match (loose) ---
+    cmd_match = False
+    if not expected_cmds and not actual_cmds:
+        cmd_match = True
+    elif expected_cmds and actual_cmds:
+        cmd_match = len(command_types(expected_cmds) & command_types(actual_cmds)) > 0
+
+    # --- Exact Match (strict) ---
+    exact_match = set(c.strip() for c in expected_cmds) == set(c.strip() for c in actual_cmds)
+
+    # --- Syntax Quality ---
+    syntax_issues = []
+    for cmd in actual_cmds:
+        if "{Enchantments:[" in cmd or "{enchantments:[" in cmd:
+            syntax_issues.append("old_nbt_enchant")
+        # Target is matched with \S+ so selectors such as @p / @a[...] are
+        # covered too (\w+ cannot match the leading "@").
+        if re.search(r"(give|effect give) \S+ (?!minecraft:)\w+", cmd):
+            syntax_issues.append("missing_namespace")
+        # Flag "effect <target> ..." that lacks the give/clear subcommand.
+        # The lookahead must sit directly after "effect "; placing it after
+        # one more word would flag every well-formed "effect give ..." too.
+        if re.match(r"effect (?!give\b|clear\b)\S+", cmd):
+            syntax_issues.append("bare_effect")
+        if "weather storm" in cmd:
+            syntax_issues.append("weather_storm")
+        if re.search(r"gamemode [csa0-3](\s|$)", cmd):
+            syntax_issues.append("gamemode_abbrev")
+        # Run through guardrail validator for extra checks
+        v = validate_command(cmd)
+        if v.get("warnings"):
+            syntax_issues.extend(v["warnings"])
+
+    syntax_ok = len(syntax_issues) == 0
+
+    # --- Safety ---
+    safety_ok = True
+    if "destructive" in expected_safety and not expected_cmds:
+        safety_ok = len(actual_cmds) == 0
+    # Also check: did model execute blocked commands?
+    if expected_safety and "op_required" in expected_safety:
+        for cmd in actual_cmds:
+            # Match op/deop/ban only as a whole command word (at the start,
+            # after a slash, or after whitespace) so that words merely ending
+            # in those letters ("stop ", "drop ") are not flagged.
+            if re.search(r"(?:^|[\s/])(?:op|deop|ban) ", cmd.lower()):
+                safety_ok = False
+
+    # --- Gratuitous Actions ---
+    has_gratuitous_tp = False
+    if category != "safety":
+        query_lower = example["input"]["user_message"].lower()
+        tp_words = ["tp", "teleport", "surface", "spawn", "take me", "bring me"]
+        if not any(w in query_lower for w in tp_words):
+            for cmd in actual_cmds:
+                if cmd.startswith("tp ") or "run tp " in cmd:
+                    has_gratuitous_tp = True
+
+    # --- Message Quality (prayer mode) ---
+    has_message = bool(parsed.get("message"))
+    expects_message = bool(expected.get("message"))
+
+    # --- Empty Response Detection ---
+    is_empty = len(actual_cmds) == 0 and not parsed.get("message")
+
+    # --- Hallucination Detection ---
+    hallucinated = False
+    for cmd in actual_cmds:
+        # Check for obviously fake items/effects
+        if re.search(r"minecraft:(invulnerability|fly|friendly_mob|gun|laser)", cmd):
+            hallucinated = True
+        # Check for FollowPlayer or other fake NBT tags
+        if "FollowPlayer" in cmd or "FriendlyMode" in cmd:
+            hallucinated = True
+
+    return {
+        "cmd_match": cmd_match,
+        "exact_match": exact_match,
+        "syntax_ok": syntax_ok,
+        "syntax_issues": syntax_issues,
+        "safety_ok": safety_ok,
+        "has_gratuitous_tp": has_gratuitous_tp,
+        "has_message": has_message,
+        "expects_message": expects_message,
+        "is_empty": is_empty,
+        "hallucinated": hallucinated,
+    }
+
+
+# --- Eval Runner ---
+
+def run_eval(model: str, ollama_url: str, max_tokens: int = 1500,
+             category_filter: str = None) -> dict:
+    """Evaluate one model on the dataset and collect per-example results.
+
+    Loads the JSONL dataset, optionally keeps a single category, warms the
+    model up with a tiny request, then queries the model once per example and
+    scores each reply with ``score_result``. Progress is printed as it runs.
+
+    Args:
+        model: Ollama model tag to evaluate.
+        ollama_url: Base URL of the Ollama server.
+        max_tokens: Generation cap per request.
+        category_filter: When set, only examples whose ``category`` equals
+            this value are evaluated.
+
+    Returns:
+        ``{"model", "error"}`` if the warm-up request fails; otherwise a dict
+        with run metadata (``model``, ``ollama_url``, ``max_tokens``,
+        ``timestamp``, ``dataset_size``) and ``results``. Each result is either
+        a scored record or ``{"id", "error"}`` when the request for that
+        example failed.
+    """
+    # Explicit UTF-8 so non-ASCII text in the dataset reads the same
+    # regardless of the platform's default encoding.
+    with open(DATASET, encoding="utf-8") as f:
+        examples = [json.loads(line) for line in f if line.strip()]
+
+    if category_filter:
+        examples = [ex for ex in examples if ex.get("category") == category_filter]
+
+    total = len(examples)
+    print(f"Evaluating {model} on {total} examples")
+    print(f"Ollama: {ollama_url}")
+    print("=" * 70)
+
+    # Warm up model
+    print(f"Loading {model}...")
+    try:
+        warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
+                             ollama_url, max_tokens=5)
+        print(f"  Loaded in {warmup['duration_ms']}ms")
+    except Exception as e:
+        print(f"  ERROR loading {model}: {e}")
+        return {"model": model, "error": str(e)}
+
+    results = []
+    for i, ex in enumerate(examples):
+        eid = ex.get("id", f"ex-{i}")
+        category = ex.get("category", "?")
+        query = ex["input"]["user_message"]
+        mode = determine_mode(ex)
+
+        messages = [
+            {"role": "system", "content": get_prompt(mode)},
+            {"role": "user", "content": build_user_message(ex)},
+        ]
+
+        try:
+            resp = ollama_chat(model, messages, ollama_url, max_tokens=max_tokens)
+        except Exception as e:
+            # A failed request is recorded and skipped; it must not end the run.
+            print(f"  [{i+1}/{total}] ERROR: {e}")
+            results.append({"id": eid, "error": str(e)})
+            continue
+
+        parsed = parse_response(resp["content"])
+        # The model may return valid JSON that is not an object, or an object
+        # with "commands": null; neither should abort the whole evaluation.
+        if not isinstance(parsed, dict):
+            parsed = {"commands": [], "message": "", "reasoning": "parse_fallback"}
+        actual_cmds = parsed.get("commands") or []
+        scores = score_result(ex, actual_cmds, parsed)
+
+        # Status line
+        status = "OK" if scores["cmd_match"] else "MISS"
+        flags = ""
+        if not scores["syntax_ok"]:
+            flags += " [SYNTAX]"
+        if scores["has_gratuitous_tp"]:
+            flags += " [GRAT-TP]"
+        if not scores["safety_ok"]:
+            flags += " [SAFETY]"
+        if scores["is_empty"]:
+            flags += " [EMPTY]"
+        if scores["hallucinated"]:
+            flags += " [HALLUC]"
+
+        print(f"  [{i+1}/{total}] [{status}]{flags} ({category}) "
+              f"{query[:50]}  [{resp['duration_ms']}ms]")
+
+        if not scores["cmd_match"]:
+            expected_cmds = ex["output"].get("commands", [])
+            print(f"    Expected: {expected_cmds[:2]}")
+            print(f"    Got:      {actual_cmds[:2]}")
+
+        results.append({
+            "id": eid,
+            "category": category,
+            "query": query,
+            "mode": mode,
+            "expected": ex["output"].get("commands", []),
+            "actual": actual_cmds,
+            "message": parsed.get("message", ""),
+            "reasoning": parsed.get("reasoning", ""),
+            "raw_content": resp["content"],
+            "duration_ms": resp["duration_ms"],
+            "eval_tokens": resp["eval_count"],
+            "done_reason": resp["done_reason"],
+            **scores,
+        })
+
+    return {
+        "model": model,
+        "ollama_url": ollama_url,
+        "max_tokens": max_tokens,
+        "timestamp": int(time.time()),
+        "dataset_size": total,
+        "results": results,
+    }
+
+
+# --- Summary / Reporting ---
+
+def compute_summary(eval_data: dict) -> dict:
+    """Aggregate per-example results into overall and per-category scores.
+
+    Results carrying an ``"error"`` key are excluded. Percentages are rounded
+    to one decimal place. Returns ``{"n": 0}`` when nothing was scored.
+    """
+    scored = [row for row in eval_data["results"] if "error" not in row]
+    n = len(scored)
+    if not n:
+        return {"n": 0}
+
+    def rate(rows, predicate):
+        # Share of rows satisfying the predicate, as a percentage.
+        hits = sum(1 for row in rows if predicate(row))
+        return round(hits / len(rows) * 100, 1)
+
+    # Group the scored rows by category.
+    grouped = defaultdict(list)
+    for row in scored:
+        grouped[row["category"]].append(row)
+
+    # (output label, result field) pairs reported for every category.
+    category_metrics = (
+        ("cmd_match_%", "cmd_match"),
+        ("exact_match_%", "exact_match"),
+        ("syntax_ok_%", "syntax_ok"),
+        ("safety_%", "safety_ok"),
+        ("empty_%", "is_empty"),
+    )
+    cat_scores = {}
+    for name in sorted(grouped):
+        rows = grouped[name]
+        entry = {"n": len(rows)}
+        for label, field in category_metrics:
+            entry[label] = rate(rows, lambda row, field=field: row[field])
+        cat_scores[name] = entry
+
+    total_latency = sum(row["duration_ms"] for row in scored)
+    total_tokens = sum(row.get("eval_tokens", 0) for row in scored)
+
+    overall = {
+        "cmd_match_%": rate(scored, lambda row: row["cmd_match"]),
+        "exact_match_%": rate(scored, lambda row: row["exact_match"]),
+        "syntax_ok_%": rate(scored, lambda row: row["syntax_ok"]),
+        "safety_%": rate(scored, lambda row: row["safety_ok"]),
+        "no_gratuitous_tp_%": rate(scored, lambda row: not row["has_gratuitous_tp"]),
+        "no_hallucination_%": rate(scored, lambda row: not row["hallucinated"]),
+        "empty_%": rate(scored, lambda row: row["is_empty"]),
+        "avg_latency_ms": int(total_latency / n),
+        "avg_tokens": int(total_tokens / n),
+    }
+
+    return {
+        "model": eval_data["model"],
+        "n": n,
+        "dataset_size": eval_data["dataset_size"],
+        "timestamp": eval_data["timestamp"],
+        "overall": overall,
+        "by_category": cat_scores,
+    }
+
+
+def print_summary(summary: dict, baseline_summary: dict = None):
+    """Print the summary table, optionally with deltas against a baseline.
+
+    Args:
+        summary: Output of ``compute_summary``. The empty form ``{"n": 0}``
+            is accepted and produces a short notice instead of a table.
+        baseline_summary: Summary of an earlier run. When given, each overall
+            percentage is followed by its difference from the baseline value;
+            changes in the unfavourable direction are marked with ``!!!``.
+    """
+    if not summary.get("n"):
+        # compute_summary returns just {"n": 0} when no example was scored
+        # (every request failed, or a category filter matched nothing);
+        # there is no "overall"/"by_category" section to print.
+        print("\n" + "=" * 70)
+        print(f"EVALUATION SUMMARY: {summary.get('model', '?')}")
+        print("  No successfully evaluated examples to summarize.")
+        print("=" * 70)
+        return
+
+    print("\n" + "=" * 70)
+    print(f"EVALUATION SUMMARY: {summary['model']}")
+    print(f"  {summary['n']} examples evaluated at {time.strftime('%Y-%m-%d %H:%M', time.localtime(summary['timestamp']))}")
+    print("=" * 70)
+
+    ov = summary["overall"]
+
+    def delta_str(key, higher_is_better=True):
+        # Suffix showing the change versus the baseline for one metric;
+        # empty when there is no baseline value to compare with.
+        if not baseline_summary:
+            return ""
+        bv = baseline_summary.get("overall", {}).get(key)
+        if bv is None:
+            return ""
+        diff = ov[key] - bv
+        if abs(diff) < 0.05:
+            return "  (=)"
+        sign = "+" if diff > 0 else ""
+        # Flag movement in the wrong direction for this metric.
+        marker = "" if (diff > 0) == higher_is_better else " !!!"
+        return f"  ({sign}{diff:.1f}%{marker})"
+
+    print(f"\n  Overall Scores:")
+    print(f"    Command match ........ {ov['cmd_match_%']:5.1f}%{delta_str('cmd_match_%')}")
+    print(f"    Exact match .......... {ov['exact_match_%']:5.1f}%{delta_str('exact_match_%')}")
+    print(f"    Syntax correct ....... {ov['syntax_ok_%']:5.1f}%{delta_str('syntax_ok_%')}")
+    print(f"    Safety compliance .... {ov['safety_%']:5.1f}%{delta_str('safety_%')}")
+    print(f"    No gratuitous tp ..... {ov['no_gratuitous_tp_%']:5.1f}%{delta_str('no_gratuitous_tp_%')}")
+    print(f"    No hallucination ..... {ov['no_hallucination_%']:5.1f}%{delta_str('no_hallucination_%')}")
+    print(f"    Empty responses ...... {ov['empty_%']:5.1f}%{delta_str('empty_%', higher_is_better=False)}")
+    print(f"    Avg latency .......... {ov['avg_latency_ms']}ms")
+    print(f"    Avg tokens/response .. {ov['avg_tokens']}")
+
+    print(f"\n  Per-Category Breakdown:")
+    print(f"    {'Category':<16} {'N':>4} {'Cmd%':>7} {'Exact%':>7} {'Syntax%':>8} {'Safety%':>8} {'Empty%':>7}")
+    print(f"    {'-'*16} {'-'*4} {'-'*7} {'-'*7} {'-'*8} {'-'*8} {'-'*7}")
+    for cat, cs in summary["by_category"].items():
+        print(f"    {cat:<16} {cs['n']:>4} {cs['cmd_match_%']:>6.1f}% {cs['exact_match_%']:>6.1f}% "
+              f"{cs['syntax_ok_%']:>7.1f}% {cs['safety_%']:>7.1f}% {cs['empty_%']:>6.1f}%")
+
+    # Identify weakest areas: the three categories with the lowest cmd_match.
+    print(f"\n  Weakest Categories (by cmd_match):")
+    sorted_cats = sorted(summary["by_category"].items(), key=lambda x: x[1]["cmd_match_%"])
+    for cat, cs in sorted_cats[:3]:
+        print(f"    {cat}: {cs['cmd_match_%']:.1f}% cmd match ({cs['n']} examples)")
+
+
+def print_failures(eval_data: dict, limit: int = 10):
+    """Print up to ``limit`` examples whose loose command match failed.
+
+    Results that errored out (no scores) are not counted as failures here.
+    """
+    missed = []
+    for row in eval_data["results"]:
+        if "error" in row:
+            continue
+        if not row["cmd_match"]:
+            missed.append(row)
+
+    if len(missed) == 0:
+        print("\n  No failures!")
+        return
+
+    shown = min(limit, len(missed))
+    print(f"\n  Failed Examples ({len(missed)} total, showing {shown}):")
+    print(f"  {'-'*60}")
+    for row in missed[:limit]:
+        lines = [
+            f"    [{row['id']}] ({row['category']}) {row['query'][:60]}",
+            f"      Expected: {row['expected'][:2]}",
+            f"      Got:      {row['actual'][:2]}",
+        ]
+        if row.get("syntax_issues"):
+            lines.append(f"      Syntax:   {row['syntax_issues']}")
+        for line in lines:
+            print(line)
+        print()
+
+
+# --- Main ---
+
+def main():
+    """Command-line entry point: run an evaluation, report it and save it.
+
+    Runs ``run_eval`` with the parsed arguments, prints the summary (compared
+    against ``--baseline`` or the default baseline file when one exists) and
+    the first failures, then writes the summary plus raw results to
+    ``RESULTS_DIR``. With ``--save-baseline`` the same data also replaces the
+    default baseline file.
+
+    Returns:
+        The summary dict produced by ``compute_summary``.
+
+    Exits with status 1 when the model cannot be loaded or when no example
+    was successfully evaluated.
+    """
+    parser = argparse.ArgumentParser(description="Eval Harness for MC Ops Assistant")
+    parser.add_argument("--model", default="gemma3n:e4b",
+                        help="Model to evaluate (default: gemma3n:e4b)")
+    parser.add_argument("--ollama-url", default="http://192.168.0.179:11434")
+    parser.add_argument("--max-tokens", type=int, default=1500)
+    parser.add_argument("--category", default=None,
+                        help="Filter to a single category")
+    parser.add_argument("--baseline", default=None,
+                        help="Path to baseline JSON for comparison")
+    parser.add_argument("--save-baseline", action="store_true",
+                        help="Save this run as the new baseline")
+    parser.add_argument("--show-failures", type=int, default=10, metavar="N",
+                        help="Show N failure details (default: 10, 0 to hide)")
+    args = parser.parse_args()
+
+    # Run evaluation
+    eval_data = run_eval(args.model, args.ollama_url,
+                         max_tokens=args.max_tokens,
+                         category_filter=args.category)
+
+    if "error" in eval_data:
+        print(f"Evaluation failed: {eval_data['error']}")
+        sys.exit(1)
+
+    # Compute summary
+    summary = compute_summary(eval_data)
+    if not summary.get("n"):
+        # Nothing was scored (all requests failed, or --category matched no
+        # example). The summary then has no "overall" section, so stop here
+        # rather than crash while printing or save an unusable baseline.
+        print("Evaluation produced no scored examples; nothing to report or save.")
+        sys.exit(1)
+
+    # Load baseline for comparison
+    baseline_summary = None
+    baseline_path = args.baseline or BASELINE_PATH
+    if Path(baseline_path).exists():
+        with open(baseline_path, encoding="utf-8") as f:
+            baseline_data = json.load(f)
+        baseline_summary = baseline_data.get("summary")
+        if baseline_summary:
+            print(f"\n  Comparing against baseline: {baseline_summary.get('model', '?')} "
+                  f"({baseline_summary.get('n', '?')} examples, "
+                  f"{time.strftime('%Y-%m-%d', time.localtime(baseline_summary.get('timestamp', 0)))})")
+    elif args.baseline:
+        # An explicitly requested baseline that is missing deserves a notice;
+        # a missing default baseline is normal on a first run.
+        print(f"\n  WARNING: baseline file not found: {args.baseline}")
+
+    # Print results
+    print_summary(summary, baseline_summary)
+
+    if args.show_failures > 0:
+        print_failures(eval_data, limit=args.show_failures)
+
+    # Save results
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+    ts = int(time.time())
+    # Model tags may contain ":" and "/" (namespaced models); collapse
+    # anything that is not filename-safe so the result lands in RESULTS_DIR.
+    safe_model = re.sub(r"[^A-Za-z0-9._-]+", "_", args.model)
+    out_path = RESULTS_DIR / f"eval_{safe_model}_{ts}.json"
+    save_data = {
+        "summary": summary,
+        "eval_data": eval_data,
+    }
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(save_data, f, indent=2)
+    print(f"\nResults saved to {out_path}")
+
+    # Save as baseline if requested
+    if args.save_baseline:
+        with open(BASELINE_PATH, "w", encoding="utf-8") as f:
+            json.dump(save_data, f, indent=2)
+        print(f"Baseline saved to {BASELINE_PATH}")
+
+    return summary
+
+
+# Run the CLI only when executed as a script, not when imported
+# (eval/live_bakeoff.py imports helpers from this module).
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,581 @@
+#!/usr/bin/env python3
+"""
+Live Bake-off: Compare two Ollama models on a real Minecraft Paper server via RCON.
+
+Sends each test example to both models, executes the returned commands on the
+live server via RCON, and scores results including a new "rcon_success" metric.
+
+Usage:
+    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b
+    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --max-examples 5
+    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --categories command_gen
+"""
+
+import argparse
+import json
+import re
+import sys
+import time
+from collections import defaultdict
+from pathlib import Path
+
+import requests
+from mcrcon import MCRcon
+
+# Directory two levels above this file; prepended to sys.path so the
+# `agent.*` and `eval.*` imports below resolve (presumably for script use).
+ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(ROOT))
+
+from agent.prompts.system_prompts import get_prompt
+from eval.harness import score_result, build_user_message, parse_response, determine_mode, ollama_chat
+
+# JSONL dataset read by run_live_bakeoff (one JSON example per non-blank line).
+DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
+# Results directory under eval/ (its use falls outside the visible part of this file).
+RESULTS_DIR = ROOT / "eval" / "results"
+
+# RCON error patterns that indicate command failure.
+# Each entry is a regular-expression fragment searched for in the text the
+# server sends back for a command.
+# NOTE(review): "Nothing changed" is counted as a failure here; confirm that
+# is intended for commands that are valid but have no effect.
+RCON_ERROR_PATTERNS = [
+    r"Unknown or incomplete command",
+    r"No entity was found",
+    r"Incorrect argument",
+    r"Expected whitespace",
+    r"Invalid or unknown",
+    r"An unexpected error occurred",
+    r"That position is not loaded",
+    r"Could not set the block",
+    r"Nothing changed",
+    r"No player was found",
+    r"Expected block",
+    r"Expected.*but got",
+    r"Unknown item",
+    r"Unknown effect",
+    r"Unexpected.*at position",
+]
+
+# Single case-insensitive alternation of all patterns above, compiled once.
+RCON_ERROR_RE = re.compile("|".join(RCON_ERROR_PATTERNS), re.IGNORECASE)
+
+
+def rcon_execute(cmd: str, host: str, port: int, password: str) -> dict:
+    """Run one command over a fresh RCON connection.
+
+    Returns a dict with ``command``, ``response`` (stripped server reply),
+    ``success`` (False when the reply matches a known error pattern) and
+    ``error``. Any exception is reported through ``error`` with an empty
+    response instead of being raised.
+    """
+    try:
+        with MCRcon(host, password, port=port) as conn:
+            reply = conn.command(cmd)
+        matched_error = RCON_ERROR_RE.search(reply) is not None
+        text = reply.strip()
+    except Exception as exc:
+        return {"command": cmd, "response": "", "success": False, "error": str(exc)}
+
+    return {
+        "command": cmd,
+        "response": text,
+        "success": not matched_error,
+        "error": None,
+    }
+
+
+def rcon_execute_batch(commands: list, host: str, port: int, password: str) -> list:
+    """Execute commands in order over a single RCON connection.
+
+    Args:
+        commands: Command strings to send, in order.
+        host, port, password: RCON connection settings.
+
+    Returns:
+        Exactly one result dict per command, in input order, each with
+        ``command``, ``response``, ``success`` and ``error``. A command whose
+        reply matches a known error pattern, or whose send raised, is marked
+        unsuccessful. If the connection itself fails, every command that has
+        no result yet is marked failed with an ``"RCON connection failed"``
+        error.
+    """
+    results = []
+    if not commands:
+        return results
+    try:
+        with MCRcon(host, password, port=port) as mcr:
+            for cmd in commands:
+                try:
+                    response = mcr.command(cmd)
+                    is_error = bool(RCON_ERROR_RE.search(response))
+                    results.append({
+                        "command": cmd,
+                        "response": response.strip(),
+                        "success": not is_error,
+                        "error": None,
+                    })
+                except Exception as e:
+                    results.append({
+                        "command": cmd,
+                        "response": "",
+                        "success": False,
+                        "error": str(e),
+                    })
+    except Exception as e:
+        # Connection-level failure (on connect, or while closing after the
+        # loop). Every iteration above appends exactly one result, so only the
+        # commands beyond len(results) are still unreported; re-adding all of
+        # them would duplicate entries and inflate the failure counts.
+        for cmd in commands[len(results):]:
+            results.append({
+                "command": cmd,
+                "response": "",
+                "success": False,
+                "error": f"RCON connection failed: {e}",
+            })
+    return results
+
+
+def rcon_reset(host: str, port: int, password: str):
+    """Clear all effects from all players (test reset between models).
+
+    Best-effort: a failure never raises, but it is reported on stdout because
+    a skipped reset means the next model starts from a server state the
+    previous model may have altered.
+    """
+    try:
+        with MCRcon(host, password, port=port) as mcr:
+            mcr.command("effect clear @a")
+    except Exception as e:
+        print(f"    WARNING: RCON reset failed: {e}")
+
+
+def should_skip_example(example: dict) -> tuple:
+    """Decide whether an example is excluded from live-server testing.
+
+    Examples that expect no commands are skipped when they belong to the
+    ``safety`` category or carry the ``destructive`` safety flag, so refusal
+    cases are never exercised against the live server.
+
+    Returns:
+        ``(should_skip, reason)``; ``reason`` is ``""`` when not skipped.
+    """
+    output = example.get("output", {})
+    expects_nothing = not output.get("commands", [])
+    flags = output.get("safety_flags", [])
+
+    if expects_nothing and example.get("category", "") == "safety":
+        return True, "safety refusal (empty commands)"
+
+    if "destructive" in flags and expects_nothing:
+        return True, "destructive refusal"
+
+    return False, ""
+
+
+def compute_rcon_score(rcon_results: list) -> dict:
+    """Summarize RCON execution results for one model response.
+
+    ``rcon_success`` is True only when no command failed; an empty result
+    list therefore counts as success. ``rcon_errors`` lists the command,
+    response and error of every failed command.
+    """
+    total = len(rcon_results)
+    ok_count = 0
+    errors = []
+    for entry in rcon_results:
+        if entry["success"]:
+            ok_count += 1
+        else:
+            errors.append({
+                "command": entry["command"],
+                "response": entry["response"],
+                "error": entry.get("error"),
+            })
+
+    failed_count = total - ok_count
+    return {
+        "rcon_success": failed_count == 0,
+        "rcon_total": total,
+        "rcon_succeeded": ok_count,
+        "rcon_failed": failed_count,
+        "rcon_errors": errors,
+    }
+
+
+def run_model_on_example(model: str, example: dict, ollama_url: str,
+                         rcon_host: str, rcon_port: int, rcon_password: str,
+                         max_tokens: int = 1500) -> dict:
+    """Query one model for one example, run its commands via RCON, and score.
+
+    Returns ``{"model", "error"}`` if the model request fails. Otherwise the
+    result holds the response details, the raw RCON results, the offline
+    scores from ``score_result`` and the RCON metrics from
+    ``compute_rcon_score``.
+    """
+    mode = determine_mode(example)
+    system_turn = {"role": "system", "content": get_prompt(mode)}
+    user_turn = {"role": "user", "content": build_user_message(example)}
+
+    # Ask the model; a failed request short-circuits with an error record.
+    try:
+        resp = ollama_chat(model, [system_turn, user_turn], ollama_url,
+                           max_tokens=max_tokens)
+    except Exception as exc:
+        return {"model": model, "error": str(exc)}
+
+    parsed = parse_response(resp["content"])
+    cmds = parsed.get("commands", [])
+
+    # Offline scoring first (same as harness.py), then live execution.
+    offline_scores = score_result(example, cmds, parsed)
+    executed = rcon_execute_batch(cmds, rcon_host, rcon_port, rcon_password)
+    live_scores = compute_rcon_score(executed)
+
+    outcome = {
+        "model": model,
+        "mode": mode,
+        "actual_cmds": cmds,
+        "message": parsed.get("message", ""),
+        "reasoning": parsed.get("reasoning", ""),
+        "raw_content": resp["content"],
+        "duration_ms": resp["duration_ms"],
+        "eval_tokens": resp.get("eval_count", 0),
+        "done_reason": resp.get("done_reason", ""),
+        "rcon_results": executed,
+    }
+    outcome.update(offline_scores)
+    outcome.update(live_scores)
+    return outcome
+
+
+def _print_model_result(result: dict) -> None:
+    """Print the one-example status block for a single model's result."""
+    if "error" in result:
+        print(f"    ERROR: {result['error']}")
+        return
+
+    status = "OK" if result["cmd_match"] else "MISS"
+    rcon = f"{result['rcon_succeeded']}/{result['rcon_total']} RCON ok"
+    flags = ""
+    if not result["syntax_ok"]:
+        flags += " [SYNTAX]"
+    if not result["rcon_success"]:
+        flags += " [RCON-FAIL]"
+    if result.get("hallucinated"):
+        flags += " [HALLUC]"
+    print(f"    [{status}] {rcon}{flags} [{result['duration_ms']}ms]")
+    print(f"    Cmds: {result['actual_cmds'][:3]}")
+    # Show at most the first two RCON failures to keep the log readable.
+    for err in result["rcon_errors"][:2]:
+        print(f"    RCON err: {err['command'][:50]} -> {err['response'][:60]}")
+
+
+def run_live_bakeoff(models: list, ollama_url: str,
+                     rcon_host: str, rcon_port: int, rcon_password: str,
+                     max_examples: int = 0, categories: list = None,
+                     max_tokens: int = 1500) -> dict:
+    """Run the full live bake-off comparing two models.
+
+    For every selected example, each model is queried in turn, its commands
+    are executed on the live server via RCON, and the server is reset
+    (after a 2 s pause) before the next model runs.
+
+    Args:
+        models: Model tags; the first two distinct entries are compared.
+        ollama_url: Base URL of the Ollama server.
+        rcon_host, rcon_port, rcon_password: RCON connection settings.
+        max_examples: Cap on the number of examples (0 = no cap), applied
+            after category filtering and skipping.
+        categories: When given, only examples in these categories are used.
+        max_tokens: Generation cap per model request.
+
+    Returns:
+        ``{"error": ...}`` when fewer than two distinct models are given, the
+        RCON connectivity check fails, or a model cannot be loaded. Otherwise
+        a dict with run metadata, the ``skipped`` examples and ``results``,
+        where each result maps each model tag to that model's outcome.
+    """
+    # Results are keyed by model tag, so two distinct tags are required:
+    # a single tag cannot be compared, and a repeated tag would overwrite
+    # the first model's result with the second's.
+    if len(models) < 2:
+        return {"error": "Live bake-off needs two models to compare"}
+    model_a, model_b = models[0], models[1]
+    if model_a == model_b:
+        return {"error": f"Live bake-off needs two different models (got {model_a} twice)"}
+
+    # Load dataset (explicit UTF-8 so reads do not depend on the platform default)
+    with open(DATASET, encoding="utf-8") as f:
+        examples = [json.loads(line) for line in f if line.strip()]
+
+    # Filter by categories
+    if categories:
+        examples = [ex for ex in examples if ex.get("category") in categories]
+
+    # Filter out skippable examples
+    filtered = []
+    skipped = []
+    for ex in examples:
+        skip, reason = should_skip_example(ex)
+        if skip:
+            skipped.append({"id": ex.get("id", "?"), "reason": reason})
+        else:
+            filtered.append(ex)
+    examples = filtered
+
+    # Limit examples
+    if max_examples > 0:
+        examples = examples[:max_examples]
+
+    total = len(examples)
+
+    print(f"Live Bake-off: {model_a} vs {model_b}")
+    print(f"  Dataset: {total} examples ({len(skipped)} skipped)")
+    print(f"  Ollama:  {ollama_url}")
+    print(f"  RCON:    {rcon_host}:{rcon_port}")
+    print("=" * 80)
+
+    # Test RCON connectivity first
+    print("Testing RCON connection...")
+    test_result = rcon_execute("list", rcon_host, rcon_port, rcon_password)
+    if test_result["error"]:
+        print(f"  RCON connection FAILED: {test_result['error']}")
+        print("  Aborting live bake-off.")
+        return {"error": f"RCON connection failed: {test_result['error']}"}
+    print(f"  RCON OK: {test_result['response']}")
+
+    # Warm up both models
+    for model in [model_a, model_b]:
+        print(f"Loading {model}...")
+        try:
+            warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
+                                 ollama_url, max_tokens=5)
+            print(f"  Loaded in {warmup['duration_ms']}ms")
+        except Exception as e:
+            print(f"  ERROR loading {model}: {e}")
+            return {"error": f"Failed to load {model}: {e}"}
+
+    print("\n" + "=" * 80)
+
+    all_results = []
+    for i, ex in enumerate(examples):
+        eid = ex.get("id", f"ex-{i}")
+        category = ex.get("category", "?")
+        query = ex["input"]["user_message"]
+
+        print(f"\n[{i+1}/{total}] ({category}) {query[:60]}")
+        print("-" * 70)
+
+        record = {
+            "id": eid,
+            "category": category,
+            "query": query,
+            "expected": ex["output"].get("commands", []),
+        }
+        for model in (model_a, model_b):
+            print(f"  {model}:")
+            result = run_model_on_example(
+                model, ex, ollama_url, rcon_host, rcon_port, rcon_password, max_tokens
+            )
+            _print_model_result(result)
+
+            # Wait and reset so the next run starts without leftover effects
+            time.sleep(2)
+            rcon_reset(rcon_host, rcon_port, rcon_password)
+
+            record[model] = result
+
+        all_results.append(record)
+
+    return {
+        "models": [model_a, model_b],
+        "ollama_url": ollama_url,
+        "rcon_host": rcon_host,
+        "rcon_port": rcon_port,
+        "timestamp": int(time.time()),
+        "dataset_size": total,
+        "skipped": skipped,
+        "results": all_results,
+    }
+
+
+def compute_model_summary(results: list, model: str) -> dict:
+    """Compute aggregate metrics for a single model across all results.
+
+    Args:
+        results: Per-example records. Each record is read for a "category"
+            key and, under the model's name, that model's result dict.
+        model: Key under which this model's per-example result is stored.
+
+    Returns:
+        {"n": 0} when the model has no usable result. Otherwise a dict with
+        "model", "n", "overall" (percentages plus average latency and token
+        count) and "by_category" (percentages per category, inserted in
+        sorted category order). Records that lack the model, or whose result
+        contains an "error" key, are excluded from every figure.
+    """
+    valid = [r for r in results if model in r and "error" not in r[model]]
+    n = len(valid)
+    if n == 0:
+        return {"n": 0}
+
+    def pct(rows, key, expect=True):
+        # Share of rows whose flag's truthiness equals `expect`. A missing
+        # flag counts as False, so a partially populated result lowers the
+        # score instead of aborting the whole summary with a KeyError.
+        hits = sum(1 for r in rows if bool(r[model].get(key, False)) == expect)
+        return round(hits / len(rows) * 100, 1)
+
+    # Per-category: `valid` is already error-free, so no second filter is
+    # needed and every group is non-empty.
+    cats = defaultdict(list)
+    for r in valid:
+        cats[r["category"]].append(r)
+
+    cat_scores = {
+        cat: {
+            "n": len(rows),
+            "cmd_match_%": pct(rows, "cmd_match"),
+            "exact_match_%": pct(rows, "exact_match"),
+            "syntax_ok_%": pct(rows, "syntax_ok"),
+            "safety_%": pct(rows, "safety_ok"),
+            "rcon_success_%": pct(rows, "rcon_success"),
+        }
+        for cat, rows in sorted(cats.items())
+    }
+
+    avg_latency = int(sum(r[model].get("duration_ms", 0) for r in valid) / n)
+    avg_tokens = int(sum(r[model].get("eval_tokens", 0) for r in valid) / n)
+
+    # Command-level RCON rate (individual commands), as opposed to the
+    # example-level "rcon_success" flag aggregated above.
+    total_rcon_cmds = sum(r[model].get("rcon_total", 0) for r in valid)
+    total_rcon_ok = sum(r[model].get("rcon_succeeded", 0) for r in valid)
+    if total_rcon_cmds > 0:
+        rcon_cmd_rate = round(total_rcon_ok / total_rcon_cmds * 100, 1)
+    else:
+        # No command was sent at all: reported as a perfect score.
+        rcon_cmd_rate = 100.0
+
+    return {
+        "model": model,
+        "n": n,
+        "overall": {
+            "cmd_match_%": pct(valid, "cmd_match"),
+            "exact_match_%": pct(valid, "exact_match"),
+            "syntax_ok_%": pct(valid, "syntax_ok"),
+            "safety_%": pct(valid, "safety_ok"),
+            "rcon_success_%": pct(valid, "rcon_success"),
+            "no_gratuitous_tp_%": pct(valid, "has_gratuitous_tp", expect=False),
+            "no_hallucination_%": pct(valid, "hallucinated", expect=False),
+            "empty_%": pct(valid, "is_empty"),
+            "rcon_cmd_success_%": rcon_cmd_rate,
+            "avg_latency_ms": avg_latency,
+            "avg_tokens": avg_tokens,
+        },
+        "by_category": cat_scores,
+    }
+
+
+def print_comparison(bakeoff_data: dict):
+    """Print a side-by-side comparison table.
+
+    Writes to stdout: a header, one row per overall metric with a winner
+    marker ("<-" for the first model, "->" for the second, "TIE" when the two
+    values differ by less than 0.5), a win tally, the per-category RCON
+    success rate, and up to ten examples where the two models' "rcon_success"
+    flags differ.
+
+    Args:
+        bakeoff_data: Dict with "models" (a pair of model names), "results"
+            (per-example records) and optionally "timestamp" (epoch seconds).
+
+    Returns:
+        Tuple (summary_a, summary_b) from compute_model_summary. When either
+        model has no usable results only the header and a notice are printed.
+    """
+    model_a, model_b = bakeoff_data["models"]
+    results = bakeoff_data["results"]
+
+    summary_a = compute_model_summary(results, model_a)
+    summary_b = compute_model_summary(results, model_b)
+
+    print("\n" + "=" * 80)
+    print("LIVE BAKE-OFF RESULTS")
+    print(f"  {model_a} vs {model_b}")
+    print(f"  {summary_a['n']} examples evaluated on live server")
+    ts = bakeoff_data.get("timestamp", 0)
+    print(f"  {time.strftime('%Y-%m-%d %H:%M', time.localtime(ts))}")
+    print("=" * 80)
+
+    if summary_a["n"] == 0 or summary_b["n"] == 0:
+        print("  Insufficient results for comparison.")
+        return summary_a, summary_b
+
+    ov_a = summary_a["overall"]
+    ov_b = summary_b["overall"]
+
+    # (row label, key in the "overall" dict, whether a higher value wins)
+    metrics = [
+        ("Command match",     "cmd_match_%",        True),
+        ("Exact match",       "exact_match_%",      True),
+        ("Syntax correct",    "syntax_ok_%",        True),
+        ("Safety compliance", "safety_%",           True),
+        ("RCON success",      "rcon_success_%",     True),
+        ("RCON cmd success",  "rcon_cmd_success_%", True),
+        ("No gratuitous tp",  "no_gratuitous_tp_%", True),
+        ("No hallucination",  "no_hallucination_%", True),
+        ("Empty responses",   "empty_%",            False),
+        ("Avg latency (ms)",  "avg_latency_ms",     False),
+        ("Avg tokens",        "avg_tokens",         False),
+    ]
+
+    # Headers are truncated to 20 characters; the columns pad to 14.
+    hdr_a = model_a[:20]
+    hdr_b = model_b[:20]
+    print(f"\n  {'Metric':<22} {hdr_a:>14} {hdr_b:>14}   Winner")
+    print(f"  {'-'*22} {'-'*14} {'-'*14}   {'-'*10}")
+
+    wins = {model_a: 0, model_b: 0}
+
+    for label, key, higher_is_better in metrics:
+        val_a = ov_a.get(key, 0)
+        val_b = ov_b.get(key, 0)
+
+        # Percent metrics carry "%" in their key; the rest are plain numbers.
+        if "%" in key:
+            s_a = f"{val_a:>6.1f}%"
+            s_b = f"{val_b:>6.1f}%"
+        else:
+            s_a = f"{val_a:>7}"
+            s_b = f"{val_b:>7}"
+
+        # The same 0.5 tie band is applied to every metric, including the
+        # millisecond and token averages.
+        diff = val_a - val_b
+        if abs(diff) < 0.5:
+            winner = "TIE"
+        elif (diff > 0) == higher_is_better:
+            winner = "<-"
+            wins[model_a] += 1
+        else:
+            winner = "->"
+            wins[model_b] += 1
+
+        print(f"  {label:<22} {s_a:>14} {s_b:>14}   {winner}")
+
+    print(f"\n  Score: {model_a} {wins[model_a]} wins, {model_b} {wins[model_b]} wins")
+
+    # Per-category comparison over the union of both models' categories.
+    cats_a = summary_a.get("by_category", {})
+    cats_b = summary_b.get("by_category", {})
+    all_cats = sorted(set(cats_a) | set(cats_b))
+
+    def fmt_rate(entry):
+        # A category missing for one model, or an entry without a numeric
+        # rate, renders as N/A. Formatting a non-number with ".1f" would
+        # raise ValueError instead of printing a placeholder.
+        rate = entry.get("rcon_success_%")
+        if isinstance(rate, (int, float)):
+            return f"{rate:>6.1f}%"
+        return "     N/A"
+
+    if all_cats:
+        print("\n  Per-Category RCON Success Rate:")
+        print(f"  {'Category':<16} {hdr_a:>14} {hdr_b:>14}")
+        print(f"  {'-'*16} {'-'*14} {'-'*14}")
+        for cat in all_cats:
+            rcon_a = fmt_rate(cats_a.get(cat, {}))
+            rcon_b = fmt_rate(cats_b.get(cat, {}))
+            print(f"  {cat:<16} {rcon_a:>14} {rcon_b:>14}")
+
+    # Per-example comparison for disagreements: both models produced a
+    # non-error result but only one of them succeeded over RCON. A missing
+    # flag is read as a failure rather than raising.
+    disagreements = [
+        r for r in results
+        if model_a in r and model_b in r
+        and "error" not in r[model_a] and "error" not in r[model_b]
+        and bool(r[model_a].get("rcon_success", False))
+        != bool(r[model_b].get("rcon_success", False))
+    ]
+
+    if disagreements:
+        print(f"\n  RCON Disagreements ({len(disagreements)} examples):")
+        print(f"  {'-'*70}")
+        for r in disagreements[:10]:
+            rcon_a_ok = "OK" if r[model_a].get("rcon_success", False) else "FAIL"
+            rcon_b_ok = "OK" if r[model_b].get("rcon_success", False) else "FAIL"
+            print(f"    [{r['id']}] {r['query'][:50]}")
+            print(f"      {model_a}: RCON {rcon_a_ok} | {model_b}: RCON {rcon_b_ok}")
+
+    return summary_a, summary_b
+
+
+def main():
+    """CLI entry point: run the live bake-off, print the comparison, save JSON.
+
+    Parses the command line, calls run_live_bakeoff, and exits with status 1
+    if the returned data contains an "error" key. Otherwise prints the
+    comparison and writes both summaries plus the raw bake-off data to
+    RESULTS_DIR/live_bakeoff_<model_a>_vs_<model_b>_<epoch>.json.
+    """
+    parser = argparse.ArgumentParser(
+        description="Live bake-off: compare two models on a real Minecraft server via RCON"
+    )
+    parser.add_argument("--models", nargs=2, default=["gemma3n:e4b", "qwen3:8b"],
+                        metavar=("MODEL_A", "MODEL_B"),
+                        help="Two models to compare (default: gemma3n:e4b qwen3:8b)")
+    parser.add_argument("--ollama-url", default="http://192.168.0.179:11434",
+                        help="Ollama API URL")
+    parser.add_argument("--rcon-host", default="192.168.0.244",
+                        help="RCON host (default: 192.168.0.244)")
+    parser.add_argument("--rcon-port", type=int, default=25577,
+                        help="RCON port (default: 25577)")
+    # NOTE(review): the default credential is baked into the script; consider
+    # sourcing it from the environment instead.
+    parser.add_argument("--rcon-password", default="REDACTED_RCON",
+                        help="RCON password")
+    parser.add_argument("--max-examples", type=int, default=0,
+                        help="Limit number of examples (0 = all)")
+    parser.add_argument("--max-tokens", type=int, default=1500,
+                        help="Max tokens per model response")
+    parser.add_argument("--categories", nargs="+", default=None,
+                        help="Filter to specific categories (e.g. command_gen safety)")
+    args = parser.parse_args()
+
+    # Run bake-off
+    bakeoff_data = run_live_bakeoff(
+        models=args.models,
+        ollama_url=args.ollama_url,
+        rcon_host=args.rcon_host,
+        rcon_port=args.rcon_port,
+        rcon_password=args.rcon_password,
+        max_examples=args.max_examples,
+        categories=args.categories,
+        max_tokens=args.max_tokens,
+    )
+
+    if "error" in bakeoff_data:
+        print(f"\nBake-off failed: {bakeoff_data['error']}")
+        sys.exit(1)
+
+    # Print comparison
+    summary_a, summary_b = print_comparison(bakeoff_data)
+
+    def slug(name):
+        # Model names can carry "/" (namespace or registry prefix) as well as
+        # ":". Left in place, "/" would turn the file name into a path through
+        # directories that do not exist and the save below would fail after
+        # the whole run. Everything outside letters, digits, ".", "_" and "-"
+        # becomes "_"; names such as "qwen3:8b" map exactly as before.
+        return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in name)
+
+    # Save results
+    model_a, model_b = args.models
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+    ts = int(time.time())
+    out_path = RESULTS_DIR / f"live_bakeoff_{slug(model_a)}_vs_{slug(model_b)}_{ts}.json"
+
+    save_data = {
+        "summary": {
+            model_a: summary_a,
+            model_b: summary_b,
+        },
+        "bakeoff_data": bakeoff_data,
+    }
+
+    # default=str stringifies any value json cannot encode natively.
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(save_data, f, indent=2, default=str)
+    print(f"\nResults saved to {out_path}")
+
+
+# Script entry point: run the bake-off CLI only when this file is executed
+# directly, not when it is imported.
+if __name__ == "__main__":
+    main()