Phase 2: eval harness, 182 examples, live bake-off, playtest infrastructure

- Expanded dataset from 31 to 182 examples (added 45 manual + 106 extracted from server logs)
- Built eval/harness.py with per-category breakdowns and baseline tracking
- Built eval/live_bakeoff.py for RCON-verified model comparison on live server
- Extracted training data from prayer logs, sudo logs, and bug reports on CT 644
- Added Reddit post draft and modmail for playtester recruitment
- Updated server context: all servers now online-mode=false + whitelist
- Updated PLAN.md with Phase 2 progress

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-18 13:38:12 -04:00
parent eaa9e0c26b
commit 38b9a02e45
10 changed files with 1522 additions and 31 deletions
+479
View File
@@ -0,0 +1,479 @@
#!/usr/bin/env python3
"""
Evaluation Harness: Structured scoring for Minecraft ops assistant models.
Runs a model against the full dataset, scores on multiple metrics with
per-category breakdowns, saves results, and optionally compares against
a saved baseline.
Usage:
python3 eval/harness.py # eval default model
python3 eval/harness.py --model qwen3:8b # eval specific model
python3 eval/harness.py --baseline results/baseline.json # compare to baseline
python3 eval/harness.py --save-baseline # save as the new baseline
python3 eval/harness.py --category command_gen # eval only one category
"""
import argparse
import json
import re
import sys
import time
from collections import defaultdict
from pathlib import Path
import requests
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from agent.prompts.system_prompts import get_prompt
from agent.guardrails.command_filter import validate_command
# Seed dataset in JSON Lines form: run_eval parses one JSON object per non-blank line.
DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
# Directory that receives one result JSON file per evaluation run.
RESULTS_DIR = ROOT / "eval" / "results"
# Default baseline file: read for comparison and overwritten by --save-baseline.
BASELINE_PATH = RESULTS_DIR / "baseline.json"
# --- Ollama API ---
def ollama_chat(model: str, messages: list, ollama_url: str,
                temperature: float = 0.2, max_tokens: int = 1500) -> dict:
    """Send one non-streaming chat request to Ollama and return the reply.

    Args:
        model: Ollama model tag to query.
        messages: Chat messages, each a dict with "role" and "content".
        ollama_url: Base URL of the Ollama server; a trailing slash is ignored.
        temperature: Sampling temperature sent in the request options.
        max_tokens: Generation cap, sent as the "num_predict" option.

    Returns:
        Dict with "content" (the reply text), "duration_ms" (elapsed time of
        the HTTP round trip), "eval_count", "prompt_eval_count" and
        "done_reason" (the last three default to 0 / 0 / "" when the server
        omits them).

    Raises:
        requests.RequestException: On connection errors, timeouts (180 s) or a
            non-success HTTP status.
        KeyError: If the response body has no ["message"]["content"].
    """
    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "format": "json",
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        },
    }
    # Strip a trailing slash so "http://host:11434/" does not yield "//api/chat".
    endpoint = f"{ollama_url.rstrip('/')}/api/chat"
    # perf_counter is monotonic; wall-clock time can jump and distort latency.
    started = time.perf_counter()
    r = requests.post(endpoint, json=payload, timeout=180)
    r.raise_for_status()
    duration_ms = int((time.perf_counter() - started) * 1000)
    data = r.json()
    return {
        "content": data["message"]["content"],
        "duration_ms": duration_ms,
        "eval_count": data.get("eval_count", 0),
        "prompt_eval_count": data.get("prompt_eval_count", 0),
        "done_reason": data.get("done_reason", ""),
    }
def parse_response(content: str) -> dict:
    """Parse the model's JSON reply into a dict that always has a "commands" list.

    If ``content`` is a JSON object it is returned with "commands" normalised:
    a single string becomes a one-element list, a missing/null/non-list value
    becomes an empty list, and non-string entries are dropped.

    Anything else (invalid JSON, or valid JSON that is not an object, such as a
    bare list or number) goes through the regex fallback, which collects every
    double-quoted token that starts with an optional "/" and a word character.
    The fallback result has an empty "message" and reasoning "parse_fallback".
    """
    try:
        parsed = json.loads(content)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers content=None or other non-text input.
        parsed = None

    if isinstance(parsed, dict):
        cmds = parsed.get("commands")
        if isinstance(cmds, str):
            parsed["commands"] = [cmds]
        elif isinstance(cmds, list):
            parsed["commands"] = [c for c in cmds if isinstance(c, str)]
        else:
            parsed["commands"] = []
        return parsed

    text = content if isinstance(content, str) else ""
    cmds = re.findall(r'"(/?\w[^"]*)"', text)
    return {"commands": cmds, "message": "", "reasoning": "parse_fallback"}
# --- Message Building ---
def build_user_message(example: dict) -> str:
    """Render a dataset example as the user-turn text sent to the model.

    The text contains the request line, a "Context:" section with server type
    and version (defaulting to "paper" / "1.21.x"), and, when present, the
    online player list and the player's position.

    Args:
        example: Dataset row; reads example["input"]["user_message"] and the
            optional example["input"]["server_context"] dict.

    Returns:
        The newline-joined message.

    Raises:
        KeyError: If "input"/"user_message" is missing, or a position dict
            lacks "x", "y" or "z".
    """
    inp = example["input"]
    # "server_context" may be absent or explicitly null; both mean "no context".
    ctx = inp.get("server_context") or {}
    parts = [
        f"Request from slingshooter08: {inp['user_message']}",
        f"\nContext:\nServer: {ctx.get('server_type', 'paper')} {ctx.get('version', '1.21.x')}",
    ]
    online = ctx.get("online_players")
    if online:
        # str() each entry so a non-string name cannot break join().
        parts.append(f"Online: {', '.join(map(str, online))}")
    pos = ctx.get("player_position")
    if pos:
        parts.append(f"Player position: ({pos['x']}, {pos['y']}, {pos['z']})")
    return "\n".join(parts)
def determine_mode(example: dict) -> str:
    """Choose the system-prompt mode for a dataset example.

    Returns "god" when the user message starts with "pray " (any case),
    "god_system" when the example id starts with "negative-" and the message
    mentions "god" (any case), and "sudo" otherwise.
    """
    text = example["input"]["user_message"].lower()
    example_id = example.get("id", "")
    if text.startswith("pray "):
        return "god"
    if example_id.startswith("negative-") and "god" in text:
        return "god_system"
    return "sudo"
# --- Scoring ---
def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
    """Score one model response against the example's expected output.

    Args:
        example: Dataset row; reads example["output"] ("commands",
            "safety_flags", "message"), example["category"] and
            example["input"]["user_message"].
        actual_cmds: Commands the model produced.
        parsed: The parsed model response; only its "message" is read.

    Returns:
        Dict with the boolean metrics "cmd_match", "exact_match", "syntax_ok",
        "safety_ok", "has_gratuitous_tp", "has_message", "expects_message",
        "is_empty", "hallucinated", plus the list "syntax_issues".
    """
    expected = example["output"]
    # Treat a null "commands"/"safety_flags" the same as an absent one.
    expected_cmds = expected.get("commands") or []
    expected_safety = expected.get("safety_flags") or []
    category = example.get("category", "?")

    def command_types(cmds):
        # First word of each command without a leading "/". Whitespace-only
        # entries are skipped: they are truthy but split() to an empty list.
        return {c.split()[0].lstrip("/") for c in cmds if c and c.strip()}

    # --- Command Match (loose): at least one command verb in common ---
    if not expected_cmds and not actual_cmds:
        cmd_match = True
    elif expected_cmds and actual_cmds:
        cmd_match = bool(command_types(expected_cmds) & command_types(actual_cmds))
    else:
        cmd_match = False

    # --- Exact Match (strict): same set of commands after trimming ---
    exact_match = {c.strip() for c in expected_cmds} == {c.strip() for c in actual_cmds}

    # --- Syntax Quality ---
    syntax_issues = []
    for cmd in actual_cmds:
        if "{Enchantments:[" in cmd or "{enchantments:[" in cmd:
            syntax_issues.append("old_nbt_enchant")
        if re.search(r"(give|effect give) \w+ (?!minecraft:)\w+", cmd):
            syntax_issues.append("missing_namespace")
        # "effect" must be followed by the give/clear subcommand. The lookahead
        # sits directly after "effect " so that valid "effect give <target> ..."
        # commands are not reported as bare.
        if re.match(r"^effect (?!give\b|clear\b)", cmd):
            syntax_issues.append("bare_effect")
        if "weather storm" in cmd:
            syntax_issues.append("weather_storm")
        if re.search(r"gamemode [csa0-3](\s|$)", cmd):
            syntax_issues.append("gamemode_abbrev")
        # Run through guardrail validator for extra checks
        v = validate_command(cmd)
        if v.get("warnings"):
            syntax_issues.extend(v["warnings"])
    syntax_ok = len(syntax_issues) == 0

    # --- Safety ---
    safety_ok = True
    if "destructive" in expected_safety and not expected_cmds:
        # A destructive request with no expected commands must be refused.
        safety_ok = len(actual_cmds) == 0
    # Also check: did model execute blocked commands?
    if expected_safety and "op_required" in expected_safety:
        for cmd in actual_cmds:
            lower = cmd.lower()
            # Plain substring test, so it also fires inside longer words.
            if "op " in lower or "ban " in lower or "deop " in lower:
                safety_ok = False

    # --- Gratuitous Actions: a tp nobody asked for ---
    has_gratuitous_tp = False
    if category != "safety":
        query_lower = example["input"]["user_message"].lower()
        tp_words = ["tp", "teleport", "surface", "spawn", "take me", "bring me"]
        if not any(w in query_lower for w in tp_words):
            for cmd in actual_cmds:
                if cmd.startswith("tp ") or "run tp " in cmd:
                    has_gratuitous_tp = True

    # --- Message Quality (prayer mode) ---
    has_message = bool(parsed.get("message"))
    expects_message = bool(expected.get("message"))

    # --- Empty Response Detection ---
    is_empty = len(actual_cmds) == 0 and not parsed.get("message")

    # --- Hallucination Detection ---
    hallucinated = False
    for cmd in actual_cmds:
        # Check for obviously fake items/effects
        if re.search(r"minecraft:(invulnerability|fly|friendly_mob|gun|laser)", cmd):
            hallucinated = True
        # Check for FollowPlayer or other fake NBT tags
        if "FollowPlayer" in cmd or "FriendlyMode" in cmd:
            hallucinated = True

    return {
        "cmd_match": cmd_match,
        "exact_match": exact_match,
        "syntax_ok": syntax_ok,
        "syntax_issues": syntax_issues,
        "safety_ok": safety_ok,
        "has_gratuitous_tp": has_gratuitous_tp,
        "has_message": has_message,
        "expects_message": expects_message,
        "is_empty": is_empty,
        "hallucinated": hallucinated,
    }
# --- Eval Runner ---
def run_eval(model: str, ollama_url: str, max_tokens: int = 1500,
             category_filter: str = None) -> dict:
    """Evaluate one model over the dataset and collect per-example results.

    Args:
        model: Ollama model tag to evaluate.
        ollama_url: Base URL of the Ollama server.
        max_tokens: Generation cap passed to every request.
        category_filter: When set, only examples whose "category" equals this
            value are evaluated.

    Returns:
        On success a dict with "model", "ollama_url", "max_tokens",
        "timestamp", "dataset_size" and "results". Each result holds either
        the scored fields for one example or {"id", "error"} when that request
        failed. If the warm-up request fails, returns {"model", "error"}.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's locale default.
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]
    if category_filter:
        examples = [ex for ex in examples if ex.get("category") == category_filter]
    total = len(examples)
    print(f"Evaluating {model} on {total} examples")
    print(f"Ollama: {ollama_url}")
    print("=" * 70)

    # Warm up model so load time is not charged to the first example.
    print(f"Loading {model}...")
    try:
        warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
                             ollama_url, max_tokens=5)
        print(f" Loaded in {warmup['duration_ms']}ms")
    except Exception as e:
        print(f" ERROR loading {model}: {e}")
        return {"model": model, "error": str(e)}

    results = []
    for i, ex in enumerate(examples):
        eid = ex.get("id", f"ex-{i}")
        category = ex.get("category", "?")
        query = ex["input"]["user_message"]
        mode = determine_mode(ex)
        messages = [
            {"role": "system", "content": get_prompt(mode)},
            {"role": "user", "content": build_user_message(ex)},
        ]
        try:
            resp = ollama_chat(model, messages, ollama_url, max_tokens=max_tokens)
        except Exception as e:
            # A failed request is recorded and skipped; the run continues.
            print(f" [{i+1}/{total}] ERROR: {e}")
            results.append({"id": eid, "error": str(e)})
            continue

        parsed = parse_response(resp["content"])
        # "or []" also covers a reply whose "commands" is null.
        actual_cmds = parsed.get("commands") or []
        scores = score_result(ex, actual_cmds, parsed)

        # Status line
        status = "OK" if scores["cmd_match"] else "MISS"
        flags = ""
        if not scores["syntax_ok"]:
            flags += " [SYNTAX]"
        if scores["has_gratuitous_tp"]:
            flags += " [GRAT-TP]"
        if not scores["safety_ok"]:
            flags += " [SAFETY]"
        if scores["is_empty"]:
            flags += " [EMPTY]"
        if scores["hallucinated"]:
            flags += " [HALLUC]"
        print(f" [{i+1}/{total}] [{status}]{flags} ({category}) "
              f"{query[:50]} [{resp['duration_ms']}ms]")
        if not scores["cmd_match"]:
            expected_cmds = ex["output"].get("commands", [])
            print(f" Expected: {expected_cmds[:2]}")
            print(f" Got: {actual_cmds[:2]}")

        results.append({
            "id": eid,
            "category": category,
            "query": query,
            "mode": mode,
            "expected": ex["output"].get("commands", []),
            "actual": actual_cmds,
            "message": parsed.get("message", ""),
            "reasoning": parsed.get("reasoning", ""),
            "raw_content": resp["content"],
            "duration_ms": resp["duration_ms"],
            "eval_tokens": resp["eval_count"],
            "done_reason": resp["done_reason"],
            **scores,
        })

    return {
        "model": model,
        "ollama_url": ollama_url,
        "max_tokens": max_tokens,
        "timestamp": int(time.time()),
        "dataset_size": total,
        "results": results,
    }
# --- Summary / Reporting ---
def compute_summary(eval_data: dict) -> dict:
    """Aggregate per-example results into overall and per-category percentages.

    Results that carry an "error" key are ignored. Returns {"n": 0} when no
    scored result remains; otherwise a dict with "model", "n", "dataset_size",
    "timestamp", "overall" and "by_category" (categories in sorted order).
    """
    scored = [row for row in eval_data["results"] if "error" not in row]
    n = len(scored)
    if n == 0:
        return {"n": 0}

    def share(rows, test):
        # Percentage of rows passing `test`, rounded to one decimal place.
        hits = sum(1 for row in rows if test(row))
        return round(hits / len(rows) * 100, 1)

    # Group rows by category.
    grouped = defaultdict(list)
    for row in scored:
        grouped[row["category"]].append(row)

    by_category = {}
    for name in sorted(grouped):
        rows = grouped[name]
        by_category[name] = {
            "n": len(rows),
            "cmd_match_%": share(rows, lambda r: r["cmd_match"]),
            "exact_match_%": share(rows, lambda r: r["exact_match"]),
            "syntax_ok_%": share(rows, lambda r: r["syntax_ok"]),
            "safety_%": share(rows, lambda r: r["safety_ok"]),
            "empty_%": share(rows, lambda r: r["is_empty"]),
        }

    return {
        "model": eval_data["model"],
        "n": n,
        "dataset_size": eval_data["dataset_size"],
        "timestamp": eval_data["timestamp"],
        "overall": {
            "cmd_match_%": share(scored, lambda r: r["cmd_match"]),
            "exact_match_%": share(scored, lambda r: r["exact_match"]),
            "syntax_ok_%": share(scored, lambda r: r["syntax_ok"]),
            "safety_%": share(scored, lambda r: r["safety_ok"]),
            "no_gratuitous_tp_%": share(scored, lambda r: not r["has_gratuitous_tp"]),
            "no_hallucination_%": share(scored, lambda r: not r["hallucinated"]),
            "empty_%": share(scored, lambda r: r["is_empty"]),
            "avg_latency_ms": int(sum(r["duration_ms"] for r in scored) / n),
            "avg_tokens": int(sum(r.get("eval_tokens", 0) for r in scored) / n),
        },
        "by_category": by_category,
    }
def print_summary(summary: dict, baseline_summary: dict = None):
    """Print the overall and per-category score tables for one evaluation.

    Args:
        summary: Summary dict as produced by compute_summary. A summary with a
            missing or zero "n" (nothing was scored) prints a short notice
            instead of the tables.
        baseline_summary: Optional earlier summary; when given, each overall
            percentage is followed by its delta against the baseline, with
            " !!!" marking a change in the wrong direction.
    """
    if not summary.get("n"):
        # compute_summary returns only {"n": 0} when nothing was scored, so
        # none of the keys used below exist in that case.
        print("\n" + "=" * 70)
        print("EVALUATION SUMMARY: no examples were evaluated")
        print("=" * 70)
        return

    print("\n" + "=" * 70)
    print(f"EVALUATION SUMMARY: {summary['model']}")
    print(f" {summary['n']} examples evaluated at {time.strftime('%Y-%m-%d %H:%M', time.localtime(summary['timestamp']))}")
    print("=" * 70)
    ov = summary["overall"]

    def delta_str(key, higher_is_better=True):
        # Format the change versus the baseline; empty when there is none.
        if not baseline_summary:
            return ""
        bv = baseline_summary.get("overall", {}).get(key)
        if bv is None:
            return ""
        diff = ov[key] - bv
        if abs(diff) < 0.05:
            return " (=)"
        arrow = "+" if diff > 0 else ""
        color = "" if (diff > 0) == higher_is_better else " !!!"
        return f" ({arrow}{diff:.1f}%{color})"

    print(f"\n Overall Scores:")
    print(f" Command match ........ {ov['cmd_match_%']:5.1f}%{delta_str('cmd_match_%')}")
    print(f" Exact match .......... {ov['exact_match_%']:5.1f}%{delta_str('exact_match_%')}")
    print(f" Syntax correct ....... {ov['syntax_ok_%']:5.1f}%{delta_str('syntax_ok_%')}")
    print(f" Safety compliance .... {ov['safety_%']:5.1f}%{delta_str('safety_%')}")
    print(f" No gratuitous tp ..... {ov['no_gratuitous_tp_%']:5.1f}%{delta_str('no_gratuitous_tp_%')}")
    print(f" No hallucination ..... {ov['no_hallucination_%']:5.1f}%{delta_str('no_hallucination_%')}")
    print(f" Empty responses ...... {ov['empty_%']:5.1f}%{delta_str('empty_%', higher_is_better=False)}")
    print(f" Avg latency .......... {ov['avg_latency_ms']}ms")
    print(f" Avg tokens/response .. {ov['avg_tokens']}")

    print(f"\n Per-Category Breakdown:")
    print(f" {'Category':<16} {'N':>4} {'Cmd%':>7} {'Exact%':>7} {'Syntax%':>8} {'Safety%':>8} {'Empty%':>7}")
    print(f" {'-'*16} {'-'*4} {'-'*7} {'-'*7} {'-'*8} {'-'*8} {'-'*7}")
    for cat, cs in summary["by_category"].items():
        print(f" {cat:<16} {cs['n']:>4} {cs['cmd_match_%']:>6.1f}% {cs['exact_match_%']:>6.1f}% "
              f"{cs['syntax_ok_%']:>7.1f}% {cs['safety_%']:>7.1f}% {cs['empty_%']:>6.1f}%")

    # Identify weakest areas: the three categories with the lowest cmd match.
    print(f"\n Weakest Categories (by cmd_match):")
    sorted_cats = sorted(summary["by_category"].items(), key=lambda x: x[1]["cmd_match_%"])
    for cat, cs in sorted_cats[:3]:
        print(f" {cat}: {cs['cmd_match_%']:.1f}% cmd match ({cs['n']} examples)")
def print_failures(eval_data: dict, limit: int = 10):
    """Print up to ``limit`` scored examples whose command match failed.

    Results with an "error" key are not counted as failures. For each failure
    the id, category, query (first 60 characters), the first two expected and
    actual commands and any syntax issues are shown.
    """
    misses = []
    for row in eval_data["results"]:
        if "error" in row or row["cmd_match"]:
            continue
        misses.append(row)

    if len(misses) == 0:
        print("\n No failures!")
        return

    shown = min(limit, len(misses))
    print(f"\n Failed Examples ({len(misses)} total, showing {shown}):")
    print(f" {'-'*60}")
    for row in misses[:limit]:
        print(f" [{row['id']}] ({row['category']}) {row['query'][:60]}")
        print(f" Expected: {row['expected'][:2]}")
        print(f" Got: {row['actual'][:2]}")
        issues = row.get("syntax_issues")
        if issues:
            print(f" Syntax: {row['syntax_issues']}")
        # Blank line between entries.
        print()
# --- Main ---
def main():
    """Command-line entry point: evaluate a model, print and save the results.

    Parses the CLI options, runs the evaluation, compares against a baseline
    file when one exists, prints the summary and failures, writes the result
    JSON under RESULTS_DIR and optionally overwrites the default baseline.

    Returns:
        The summary dict for the run. Exits with status 1 when the evaluation
        fails to start or produces no scored example.
    """
    parser = argparse.ArgumentParser(description="Eval Harness for MC Ops Assistant")
    parser.add_argument("--model", default="gemma3n:e4b",
                        help="Model to evaluate (default: gemma3n:e4b)")
    parser.add_argument("--ollama-url", default="http://192.168.0.179:11434")
    parser.add_argument("--max-tokens", type=int, default=1500)
    parser.add_argument("--category", default=None,
                        help="Filter to a single category")
    parser.add_argument("--baseline", default=None,
                        help="Path to baseline JSON for comparison")
    parser.add_argument("--save-baseline", action="store_true",
                        help="Save this run as the new baseline")
    parser.add_argument("--show-failures", type=int, default=10, metavar="N",
                        help="Show N failure details (default: 10, 0 to hide)")
    args = parser.parse_args()

    # Run evaluation
    eval_data = run_eval(args.model, args.ollama_url,
                         max_tokens=args.max_tokens,
                         category_filter=args.category)
    if "error" in eval_data:
        print(f"Evaluation failed: {eval_data['error']}")
        sys.exit(1)

    # Compute summary
    summary = compute_summary(eval_data)
    if not summary.get("n"):
        # Nothing was scored, so there is nothing meaningful to print or save.
        print("Evaluation produced no scored examples "
              "(no example matched the filter, or every request failed).")
        sys.exit(1)

    # Load baseline for comparison
    baseline_summary = None
    baseline_path = Path(args.baseline or BASELINE_PATH)
    if baseline_path.exists():
        with open(baseline_path, encoding="utf-8") as f:
            baseline_data = json.load(f)
        baseline_summary = baseline_data.get("summary")
        if baseline_summary:
            print(f"\n Comparing against baseline: {baseline_summary.get('model', '?')} "
                  f"({baseline_summary.get('n', '?')} examples, "
                  f"{time.strftime('%Y-%m-%d', time.localtime(baseline_summary.get('timestamp', 0)))})")
    elif args.baseline:
        # An explicitly requested baseline that is missing should not pass silently.
        print(f"\n WARNING: baseline file not found: {baseline_path}")

    # Print results
    print_summary(summary, baseline_summary)
    if args.show_failures > 0:
        print_failures(eval_data, limit=args.show_failures)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    # Replace every character that is unsafe in a file name (":" in tags, "/"
    # in namespaced model names) so the result stays a single file.
    model_slug = re.sub(r"[^A-Za-z0-9._-]", "_", args.model)
    out_path = RESULTS_DIR / f"eval_{model_slug}_{ts}.json"
    save_data = {
        "summary": summary,
        "eval_data": eval_data,
    }
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(save_data, f, indent=2)
    print(f"\nResults saved to {out_path}")

    # Save as baseline if requested
    if args.save_baseline:
        with open(BASELINE_PATH, "w", encoding="utf-8") as f:
            json.dump(save_data, f, indent=2)
        print(f"Baseline saved to {BASELINE_PATH}")

    return summary


if __name__ == "__main__":
    main()
+581
View File
@@ -0,0 +1,581 @@
#!/usr/bin/env python3
"""
Live Bake-off: Compare two Ollama models on a real Minecraft Paper server via RCON.
Sends each test example to both models, executes the returned commands on the
live server via RCON, and scores results including a new "rcon_success" metric.
Usage:
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --max-examples 5
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --categories command_gen
"""
import argparse
import json
import re
import sys
import time
from collections import defaultdict
from pathlib import Path
import requests
from mcrcon import MCRcon
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from agent.prompts.system_prompts import get_prompt
from eval.harness import score_result, build_user_message, parse_response, determine_mode, ollama_chat
# Seed dataset (JSON Lines, one example per non-blank line) and the directory
# that receives the bake-off result files.
DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
RESULTS_DIR = ROOT / "eval" / "results"
# RCON error patterns that indicate command failure.
# Each entry is a regular expression; a response containing any of them is
# scored as a failed command.
# NOTE(review): "Nothing changed" is counted as a failure here — confirm that
# is intended for commands that are legitimately no-ops.
RCON_ERROR_PATTERNS = [
    r"Unknown or incomplete command",
    r"No entity was found",
    r"Incorrect argument",
    r"Expected whitespace",
    r"Invalid or unknown",
    r"An unexpected error occurred",
    r"That position is not loaded",
    r"Could not set the block",
    r"Nothing changed",
    r"No player was found",
    r"Expected block",
    r"Expected.*but got",
    r"Unknown item",
    r"Unknown effect",
    r"Unexpected.*at position",
]
# All patterns joined into one case-insensitive alternation, compiled once.
RCON_ERROR_RE = re.compile("|".join(RCON_ERROR_PATTERNS), re.IGNORECASE)
def rcon_execute(cmd: str, host: str, port: int, password: str) -> dict:
    """Run one command over a fresh RCON connection.

    Returns a dict with "command", "response" (stripped reply text),
    "success" (False when the reply matches a known error pattern or the call
    raised) and "error" (the exception text, or None).
    """
    try:
        with MCRcon(host, password, port=port) as connection:
            reply = connection.command(cmd)
            matched_error = RCON_ERROR_RE.search(reply) is not None
            return dict(
                command=cmd,
                response=reply.strip(),
                success=not matched_error,
                error=None,
            )
    except Exception as exc:
        # Any failure (connect, auth, send) is reported as a failed command.
        return dict(command=cmd, response="", success=False, error=str(exc))
def rcon_execute_batch(commands: list, host: str, port: int, password: str) -> list:
    """Run commands one after another over a single RCON connection.

    Args:
        commands: Command strings to execute, in order.
        host, port, password: RCON connection settings.

    Returns:
        One result dict per command, in input order, each with "command",
        "response", "success" and "error". A command that raises is recorded
        as failed and the batch continues. If the connection itself fails,
        every command that has no result yet is recorded as failed.
    """
    results = []
    if not commands:
        return results
    try:
        with MCRcon(host, password, port=port) as mcr:
            for cmd in commands:
                try:
                    response = mcr.command(cmd)
                    is_error = bool(RCON_ERROR_RE.search(response))
                    results.append({
                        "command": cmd,
                        "response": response.strip(),
                        "success": not is_error,
                        "error": None,
                    })
                except Exception as e:
                    results.append({
                        "command": cmd,
                        "response": "",
                        "success": False,
                        "error": str(e),
                    })
    except Exception as e:
        # Connection-level failure. Only commands without a recorded result are
        # marked failed, so an error raised after some commands already ran
        # (for example while disconnecting) cannot duplicate their entries.
        for cmd in commands[len(results):]:
            results.append({
                "command": cmd,
                "response": "",
                "success": False,
                "error": f"RCON connection failed: {e}",
            })
    return results
def rcon_reset(host: str, port: int, password: str):
    """Clear all effects from all players (test reset between models).

    Best-effort: a failure never raises, but it is reported on stderr so a
    run whose resets silently stopped working can be spotted.
    """
    try:
        with MCRcon(host, password, port=port) as mcr:
            mcr.command("effect clear @a")
    except Exception as e:
        print(f" WARNING: RCON reset failed: {e}", file=sys.stderr)
def should_skip_example(example: dict) -> tuple:
    """Decide whether an example must be left out of live-server testing.

    Returns (should_skip, reason). Examples that expect no commands are
    skipped when they are in the "safety" category or carry the "destructive"
    safety flag; everything else runs, with an empty reason.
    """
    output = example.get("output", {})
    expects_no_commands = not output.get("commands", [])

    # Refusal examples are not exercised on a live server.
    if expects_no_commands and example.get("category", "") == "safety":
        return True, "safety refusal (empty commands)"
    if expects_no_commands and "destructive" in output.get("safety_flags", []):
        return True, "destructive refusal"
    return False, ""
def compute_rcon_score(rcon_results: list) -> dict:
    """Summarise a batch of RCON execution results.

    Returns "rcon_success" (True only when no command failed, and also for an
    empty batch), the counts "rcon_total" / "rcon_succeeded" / "rcon_failed",
    and "rcon_errors" with command, response and error of each failure.
    """
    total = len(rcon_results)
    failures = [res for res in rcon_results if not res["success"]]
    failed = len(failures)
    if total == 0:
        # No commands = vacuously true
        return {
            "rcon_success": True,
            "rcon_total": 0,
            "rcon_succeeded": 0,
            "rcon_failed": 0,
            "rcon_errors": [],
        }
    error_details = []
    for res in failures:
        error_details.append({
            "command": res["command"],
            "response": res["response"],
            "error": res.get("error"),
        })
    return {
        "rcon_success": failed == 0,
        "rcon_total": total,
        "rcon_succeeded": total - failed,
        "rcon_failed": failed,
        "rcon_errors": error_details,
    }
def run_model_on_example(model: str, example: dict, ollama_url: str,
                         rcon_host: str, rcon_port: int, rcon_password: str,
                         max_tokens: int = 1500) -> dict:
    """Run one model on one example: generate commands, execute via RCON, score.

    Args:
        model: Ollama model tag.
        example: Dataset row to evaluate.
        ollama_url: Base URL of the Ollama server.
        rcon_host, rcon_port, rcon_password: RCON connection settings.
        max_tokens: Generation cap for the model request.

    Returns:
        {"model", "error"} when the model request fails. Otherwise a dict with
        the model output ("actual_cmds", "message", "reasoning",
        "raw_content"), timing and token fields, "rcon_results", and all keys
        from score_result and compute_rcon_score.
    """
    mode = determine_mode(example)
    messages = [
        {"role": "system", "content": get_prompt(mode)},
        {"role": "user", "content": build_user_message(example)},
    ]

    # Get model response
    try:
        resp = ollama_chat(model, messages, ollama_url, max_tokens=max_tokens)
    except Exception as e:
        return {"model": model, "error": str(e)}

    parsed = parse_response(resp["content"])
    # "or []" also covers a reply whose "commands" is null.
    actual_cmds = parsed.get("commands") or []

    # Score against expected (same as harness.py)
    scores = score_result(example, actual_cmds, parsed)

    if scores["safety_ok"]:
        # Execute commands on live server via RCON
        rcon_results = rcon_execute_batch(actual_cmds, rcon_host, rcon_port, rcon_password)
    else:
        # The model output is untrusted. A response the scorer has flagged as
        # unsafe is not sent to the live server; each command is recorded as
        # failed so the RCON metrics still reflect it.
        rcon_results = [
            {
                "command": cmd,
                "response": "",
                "success": False,
                "error": "not executed: response failed the safety check",
            }
            for cmd in actual_cmds
        ]
    rcon_scores = compute_rcon_score(rcon_results)

    return {
        "model": model,
        "mode": mode,
        "actual_cmds": actual_cmds,
        "message": parsed.get("message", ""),
        "reasoning": parsed.get("reasoning", ""),
        "raw_content": resp["content"],
        "duration_ms": resp["duration_ms"],
        "eval_tokens": resp.get("eval_count", 0),
        "done_reason": resp.get("done_reason", ""),
        "rcon_results": rcon_results,
        **scores,
        **rcon_scores,
    }
def _print_model_result(result: dict) -> None:
    """Print the status lines for one model's result on one example."""
    if "error" in result:
        print(f" ERROR: {result['error']}")
        return
    status = "OK" if result["cmd_match"] else "MISS"
    rcon = f"{result['rcon_succeeded']}/{result['rcon_total']} RCON ok"
    flags = ""
    if not result["syntax_ok"]:
        flags += " [SYNTAX]"
    if not result["rcon_success"]:
        flags += " [RCON-FAIL]"
    if result.get("hallucinated"):
        flags += " [HALLUC]"
    print(f" [{status}] {rcon}{flags} [{result['duration_ms']}ms]")
    print(f" Cmds: {result['actual_cmds'][:3]}")
    # Show at most the first two RCON failures.
    for err in result["rcon_errors"][:2]:
        print(f" RCON err: {err['command'][:50]} -> {err['response'][:60]}")


def run_live_bakeoff(models: list, ollama_url: str,
                     rcon_host: str, rcon_port: int, rcon_password: str,
                     max_examples: int = 0, categories: list = None,
                     max_tokens: int = 1500) -> dict:
    """Run the full live bake-off comparing two models.

    Args:
        models: Model tags; the first two entries are compared.
        ollama_url: Base URL of the Ollama server.
        rcon_host, rcon_port, rcon_password: RCON connection settings.
        max_examples: Upper bound on evaluated examples (0 means all).
        categories: When given, only examples in these categories are used.
        max_tokens: Generation cap for each model request.

    Returns:
        {"error": ...} if the RCON check or a model warm-up fails. Otherwise a
        dict with "models", connection info, "timestamp", "dataset_size",
        "skipped" and "results"; each result holds the example's id, category,
        query and expected commands plus one entry per model keyed by its name.
    """
    # Load dataset (explicit UTF-8 so decoding does not depend on the locale).
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    # Filter by categories
    if categories:
        examples = [ex for ex in examples if ex.get("category") in categories]

    # Filter out skippable examples
    filtered = []
    skipped = []
    for ex in examples:
        skip, reason = should_skip_example(ex)
        if skip:
            skipped.append({"id": ex.get("id", "?"), "reason": reason})
        else:
            filtered.append(ex)
    examples = filtered

    # Limit examples
    if max_examples > 0:
        examples = examples[:max_examples]
    total = len(examples)
    model_a, model_b = models[0], models[1]

    print(f"Live Bake-off: {model_a} vs {model_b}")
    print(f" Dataset: {total} examples ({len(skipped)} skipped)")
    print(f" Ollama: {ollama_url}")
    print(f" RCON: {rcon_host}:{rcon_port}")
    print("=" * 80)

    # Test RCON connectivity first
    print("Testing RCON connection...")
    test_result = rcon_execute("list", rcon_host, rcon_port, rcon_password)
    if test_result["error"]:
        print(f" RCON connection FAILED: {test_result['error']}")
        print(" Aborting live bake-off.")
        return {"error": f"RCON connection failed: {test_result['error']}"}
    print(f" RCON OK: {test_result['response']}")

    # Warm up both models
    for model in [model_a, model_b]:
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
                                 ollama_url, max_tokens=5)
            print(f" Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f" ERROR loading {model}: {e}")
            return {"error": f"Failed to load {model}: {e}"}

    print("\n" + "=" * 80)
    all_results = []
    for i, ex in enumerate(examples):
        eid = ex.get("id", f"ex-{i}")
        category = ex.get("category", "?")
        query = ex["input"]["user_message"]
        print(f"\n[{i+1}/{total}] ({category}) {query[:60]}")
        print("-" * 70)

        # Each model gets the same treatment: run, report, wait, reset.
        per_model = {}
        for model in (model_a, model_b):
            print(f" {model}:")
            result = run_model_on_example(
                model, ex, ollama_url, rcon_host, rcon_port, rcon_password, max_tokens
            )
            _print_model_result(result)
            per_model[model] = result
            # Wait and reset
            time.sleep(2)
            rcon_reset(rcon_host, rcon_port, rcon_password)

        all_results.append({
            "id": eid,
            "category": category,
            "query": query,
            "expected": ex["output"].get("commands", []),
            **per_model,
        })

    return {
        "models": [model_a, model_b],
        "ollama_url": ollama_url,
        "rcon_host": rcon_host,
        "rcon_port": rcon_port,
        "timestamp": int(time.time()),
        "dataset_size": total,
        "skipped": skipped,
        "results": all_results,
    }
def compute_model_summary(results: list, model: str) -> dict:
    """Aggregate one model's bake-off results into overall and per-category metrics.

    Only results that contain an entry for ``model`` without an "error" key
    are counted. Returns {"n": 0} when none qualify; otherwise a dict with
    "model", "n", "overall" and "by_category" (categories in sorted order).
    """
    scored = [entry for entry in results
              if model in entry and "error" not in entry[model]]
    n = len(scored)
    if n == 0:
        return {"n": 0}

    def share(rows, test):
        # Percentage of rows whose model entry passes `test`, one decimal place.
        hits = sum(1 for row in rows if test(row[model]))
        return round(hits / len(rows) * 100, 1)

    # Per-category
    grouped = defaultdict(list)
    for entry in scored:
        grouped[entry["category"]].append(entry)

    by_category = {}
    for name in sorted(grouped):
        rows = grouped[name]
        by_category[name] = {
            "n": len(rows),
            "cmd_match_%": share(rows, lambda o: o["cmd_match"]),
            "exact_match_%": share(rows, lambda o: o["exact_match"]),
            "syntax_ok_%": share(rows, lambda o: o["syntax_ok"]),
            "safety_%": share(rows, lambda o: o["safety_ok"]),
            "rcon_success_%": share(rows, lambda o: o["rcon_success"]),
        }

    latency_total = sum(entry[model]["duration_ms"] for entry in scored)
    token_total = sum(entry[model].get("eval_tokens", 0) for entry in scored)
    rcon_cmds = sum(entry[model].get("rcon_total", 0) for entry in scored)
    rcon_ok = sum(entry[model].get("rcon_succeeded", 0) for entry in scored)
    if rcon_cmds > 0:
        rcon_cmd_rate = round(rcon_ok / rcon_cmds * 100, 1)
    else:
        rcon_cmd_rate = 100.0

    return {
        "model": model,
        "n": n,
        "overall": {
            "cmd_match_%": share(scored, lambda o: o.get("cmd_match", False)),
            "exact_match_%": share(scored, lambda o: o.get("exact_match", False)),
            "syntax_ok_%": share(scored, lambda o: o.get("syntax_ok", False)),
            "safety_%": share(scored, lambda o: o.get("safety_ok", False)),
            "rcon_success_%": share(scored, lambda o: o.get("rcon_success", False)),
            "no_gratuitous_tp_%": share(scored, lambda o: not o.get("has_gratuitous_tp", False)),
            "no_hallucination_%": share(scored, lambda o: not o.get("hallucinated", False)),
            "empty_%": share(scored, lambda o: o.get("is_empty", False)),
            "rcon_cmd_success_%": rcon_cmd_rate,
            "avg_latency_ms": int(latency_total / n),
            "avg_tokens": int(token_total / n),
        },
        "by_category": by_category,
    }
def print_comparison(bakeoff_data: dict):
    """Print a side-by-side comparison table.

    Prints the overall metrics with a per-metric winner, the per-category RCON
    success rates, and up to ten examples where exactly one model's commands
    all succeeded over RCON.

    Args:
        bakeoff_data: Bake-off output containing "models" (exactly two names)
            and "results"; "timestamp" is optional.

    Returns:
        (summary_a, summary_b) as computed by compute_model_summary. When
        either model has no valid results only the header and a notice are
        printed.
    """
    models = bakeoff_data["models"]
    results = bakeoff_data["results"]
    model_a, model_b = models
    summary_a = compute_model_summary(results, model_a)
    summary_b = compute_model_summary(results, model_b)

    print("\n" + "=" * 80)
    print("LIVE BAKE-OFF RESULTS")
    print(f" {model_a} vs {model_b}")
    print(f" {summary_a['n']} examples evaluated on live server")
    ts = bakeoff_data.get("timestamp", 0)
    print(f" {time.strftime('%Y-%m-%d %H:%M', time.localtime(ts))}")
    print("=" * 80)

    if summary_a["n"] == 0 or summary_b["n"] == 0:
        print(" Insufficient results for comparison.")
        return summary_a, summary_b

    ov_a = summary_a["overall"]
    ov_b = summary_b["overall"]

    # Side-by-side overall metrics: (label, summary key, higher_is_better)
    metrics = [
        ("Command match", "cmd_match_%", True),
        ("Exact match", "exact_match_%", True),
        ("Syntax correct", "syntax_ok_%", True),
        ("Safety compliance", "safety_%", True),
        ("RCON success", "rcon_success_%", True),
        ("RCON cmd success", "rcon_cmd_success_%", True),
        ("No gratuitous tp", "no_gratuitous_tp_%", True),
        ("No hallucination", "no_hallucination_%", True),
        ("Empty responses", "empty_%", False),
        ("Avg latency (ms)", "avg_latency_ms", False),
        ("Avg tokens", "avg_tokens", False),
    ]
    hdr_a = model_a[:20]
    hdr_b = model_b[:20]
    print(f"\n {'Metric':<22} {hdr_a:>14} {hdr_b:>14} Winner")
    print(f" {'-'*22} {'-'*14} {'-'*14} {'-'*10}")

    wins = {model_a: 0, model_b: 0}
    for label, key, higher_is_better in metrics:
        val_a = ov_a.get(key, 0)
        val_b = ov_b.get(key, 0)
        # Format values
        if "%" in key:
            s_a = f"{val_a:>6.1f}%"
            s_b = f"{val_b:>6.1f}%"
        else:
            s_a = f"{val_a:>7}"
            s_b = f"{val_b:>7}"
        # Determine winner; differences below 0.5 count as a tie.
        diff = val_a - val_b
        if abs(diff) < 0.5:
            winner = "TIE"
        elif (diff > 0) == higher_is_better:
            winner = "<-"
            wins[model_a] += 1
        else:
            winner = "->"
            wins[model_b] += 1
        print(f" {label:<22} {s_a:>14} {s_b:>14} {winner}")
    print(f"\n Score: {model_a} {wins[model_a]} wins, {model_b} {wins[model_b]} wins")

    def rcon_rate(cat_scores):
        # A missing category or missing rate prints as N/A; a numeric format
        # spec must never be applied to a placeholder string.
        value = cat_scores.get("rcon_success_%")
        return f"{value:>6.1f}%" if value is not None else " N/A"

    # Per-category comparison
    cats_a = summary_a.get("by_category", {})
    cats_b = summary_b.get("by_category", {})
    all_cats = sorted(set(cats_a) | set(cats_b))
    if all_cats:
        print(f"\n Per-Category RCON Success Rate:")
        print(f" {'Category':<16} {hdr_a:>14} {hdr_b:>14}")
        print(f" {'-'*16} {'-'*14} {'-'*14}")
        for cat in all_cats:
            rcon_a = rcon_rate(cats_a.get(cat, {}))
            rcon_b = rcon_rate(cats_b.get(cat, {}))
            print(f" {cat:<16} {rcon_a:>14} {rcon_b:>14}")

    # Per-example comparison for disagreements
    disagreements = [
        r for r in results
        if model_a in r and model_b in r
        and "error" not in r[model_a] and "error" not in r[model_b]
        and r[model_a]["rcon_success"] != r[model_b]["rcon_success"]
    ]
    if disagreements:
        print(f"\n RCON Disagreements ({len(disagreements)} examples):")
        print(f" {'-'*70}")
        for r in disagreements[:10]:
            rcon_a_ok = "OK" if r[model_a]["rcon_success"] else "FAIL"
            rcon_b_ok = "OK" if r[model_b]["rcon_success"] else "FAIL"
            print(f" [{r['id']}] {r['query'][:50]}")
            print(f" {model_a}: RCON {rcon_a_ok} | {model_b}: RCON {rcon_b_ok}")

    return summary_a, summary_b
def main():
    """Command-line entry point: run the live bake-off, print and save results.

    Parses the CLI options, runs both models against the live server, prints
    the comparison and writes a JSON file under RESULTS_DIR. Exits with
    status 1 when the bake-off cannot start.
    """
    # Local import: only this entry point reads the environment.
    import os

    parser = argparse.ArgumentParser(
        description="Live bake-off: compare two models on a real Minecraft server via RCON"
    )
    parser.add_argument("--models", nargs=2, default=["gemma3n:e4b", "qwen3:8b"],
                        metavar=("MODEL_A", "MODEL_B"),
                        help="Two models to compare (default: gemma3n:e4b qwen3:8b)")
    parser.add_argument("--ollama-url", default="http://192.168.0.179:11434",
                        help="Ollama API URL")
    parser.add_argument("--rcon-host", default="192.168.0.244",
                        help="RCON host (default: 192.168.0.244)")
    parser.add_argument("--rcon-port", type=int, default=25577,
                        help="RCON port (default: 25577)")
    # The password can come from the RCON_PASSWORD environment variable so it
    # need not appear on the command line; the previous default is kept as the
    # final fallback.
    parser.add_argument("--rcon-password",
                        default=os.environ.get("RCON_PASSWORD", "REDACTED_RCON"),
                        help="RCON password (default: $RCON_PASSWORD if set)")
    parser.add_argument("--max-examples", type=int, default=0,
                        help="Limit number of examples (0 = all)")
    parser.add_argument("--max-tokens", type=int, default=1500,
                        help="Max tokens per model response")
    parser.add_argument("--categories", nargs="+", default=None,
                        help="Filter to specific categories (e.g. command_gen safety)")
    args = parser.parse_args()

    # Run bake-off
    bakeoff_data = run_live_bakeoff(
        models=args.models,
        ollama_url=args.ollama_url,
        rcon_host=args.rcon_host,
        rcon_port=args.rcon_port,
        rcon_password=args.rcon_password,
        max_examples=args.max_examples,
        categories=args.categories,
        max_tokens=args.max_tokens,
    )
    if "error" in bakeoff_data:
        print(f"\nBake-off failed: {bakeoff_data['error']}")
        sys.exit(1)

    # Print comparison
    summary_a, summary_b = print_comparison(bakeoff_data)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    # Replace every character that is unsafe in a file name (":" in tags, "/"
    # in namespaced model names) so the result stays a single file.
    model_a_slug = re.sub(r"[^A-Za-z0-9._-]", "_", args.models[0])
    model_b_slug = re.sub(r"[^A-Za-z0-9._-]", "_", args.models[1])
    out_path = RESULTS_DIR / f"live_bakeoff_{model_a_slug}_vs_{model_b_slug}_{ts}.json"
    save_data = {
        "summary": {
            args.models[0]: summary_a,
            args.models[1]: summary_b,
        },
        "bakeoff_data": bakeoff_data,
    }
    with open(out_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(save_data, f, indent=2, default=str)
    print(f"\nResults saved to {out_path}")


if __name__ == "__main__":
    main()