#!/usr/bin/env python3
"""
Live Bake-off: Compare two Ollama models on a real Minecraft Paper server via RCON.

Sends each test example to both models, executes the returned commands on the
live server via RCON, and scores results including a new "rcon_success" metric.

Usage:
    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b
    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --max-examples 5
    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --categories command_gen
"""

import argparse
import json
import os
import re
import sys
import time
from collections import defaultdict
from pathlib import Path

import requests
from mcrcon import MCRcon

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from agent.prompts.system_prompts import get_prompt
from eval.harness import (
    score_result,
    build_user_message,
    parse_response,
    determine_mode,
    ollama_chat,
)

DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
RESULTS_DIR = ROOT / "eval" / "results"

# RCON error patterns that indicate command failure
RCON_ERROR_PATTERNS = [
    r"Unknown or incomplete command",
    r"No entity was found",
    r"Incorrect argument",
    r"Expected whitespace",
    r"Invalid or unknown",
    r"An unexpected error occurred",
    r"That position is not loaded",
    r"Could not set the block",
    r"Nothing changed",
    r"No player was found",
    r"Expected block",
    r"Expected.*but got",
    r"Unknown item",
    r"Unknown effect",
    r"Unexpected.*at position",
]
RCON_ERROR_RE = re.compile("|".join(RCON_ERROR_PATTERNS), re.IGNORECASE)


def _rcon_ok(cmd: str, response: str) -> dict:
    """Build the result record for a command the server answered.

    ``success`` is False when the response text matches RCON_ERROR_RE.
    """
    return {
        "command": cmd,
        "response": response.strip(),
        "success": not RCON_ERROR_RE.search(response),
        "error": None,
    }


def _rcon_failed(cmd: str, error: str) -> dict:
    """Build the result record for a command that raised instead of answering."""
    return {
        "command": cmd,
        "response": "",
        "success": False,
        "error": error,
    }


def _exc_text(exc: Exception) -> str:
    """Return a non-empty description of *exc* (some exceptions have an empty str())."""
    return str(exc) or repr(exc)


def rcon_execute(cmd: str, host: str, port: int, password: str) -> dict:
    """Execute a single command via RCON.

    Opens a fresh connection for the command. Returns a dict with keys
    ``command``, ``response`` (stripped text), ``success`` and ``error``
    (``None`` unless connecting or sending raised).
    """
    try:
        with MCRcon(host, password, port=port) as mcr:
            response = mcr.command(cmd)
        return _rcon_ok(cmd, response)
    except Exception as e:
        return _rcon_failed(cmd, _exc_text(e))


def rcon_execute_batch(commands: list, host: str, port: int, password: str) -> list:
    """Execute a list of commands via RCON sequentially.

    All commands share one connection. A command that raises is recorded as
    failed and the remaining commands are still attempted. If the connection
    itself cannot be established, every command is recorded as failed.

    Returns a list of result dicts (same shape as ``rcon_execute``), one per
    command; an empty list when *commands* is empty.
    """
    results = []
    if not commands:
        return results

    try:
        with MCRcon(host, password, port=port) as mcr:
            for cmd in commands:
                try:
                    results.append(_rcon_ok(cmd, mcr.command(cmd)))
                except Exception as e:
                    results.append(_rcon_failed(cmd, _exc_text(e)))
    except Exception as e:
        # Connection-level failure: mark all commands as failed
        for cmd in commands:
            results.append(_rcon_failed(cmd, f"RCON connection failed: {e}"))
    return results


def rcon_reset(host: str, port: int, password: str):
    """Clear all effects from all players (test reset between models)."""
    try:
        with MCRcon(host, password, port=port) as mcr:
            mcr.command("effect clear @a")
    except Exception:
        pass  # Best-effort reset: a failed reset must not abort the run


def should_skip_example(example: dict) -> tuple:
    """Determine if an example should be skipped for live testing.

    Returns (should_skip: bool, reason: str).
    """
    output = example.get("output", {})
    expected_cmds = output.get("commands", [])
    if expected_cmds:
        return False, ""

    # Skip safety examples where expected output is empty commands
    # (we don't want to test destructive refusals on a live server)
    if example.get("category", "") == "safety":
        return True, "safety refusal (empty commands)"

    # Skip safety examples with destructive flags
    if "destructive" in output.get("safety_flags", []):
        return True, "destructive refusal"

    return False, ""


def compute_rcon_score(rcon_results: list) -> dict:
    """Compute RCON success metrics from execution results.

    Returns a dict with ``rcon_success`` (True when nothing failed, including
    the no-commands case), ``rcon_total``, ``rcon_succeeded``, ``rcon_failed``
    and ``rcon_errors`` (one entry per failed command).
    """
    errors = [
        {"command": r["command"], "response": r["response"], "error": r.get("error")}
        for r in rcon_results
        if not r["success"]
    ]
    total = len(rcon_results)
    failed = len(errors)
    return {
        "rcon_success": failed == 0,  # No commands = vacuously true
        "rcon_total": total,
        "rcon_succeeded": total - failed,
        "rcon_failed": failed,
        "rcon_errors": errors,
    }


def run_model_on_example(model: str, example: dict, ollama_url: str,
                         rcon_host: str, rcon_port: int, rcon_password: str,
                         max_tokens: int = 1500) -> dict:
    """Run one model on one example: generate commands, execute via RCON, score.

    Returns ``{"model": ..., "error": ...}`` if the model call raised;
    otherwise a dict combining the model output, the harness scores and the
    RCON metrics from ``compute_rcon_score``.
    """
    mode = determine_mode(example)
    messages = [
        {"role": "system", "content": get_prompt(mode)},
        {"role": "user", "content": build_user_message(example)},
    ]

    # Get model response
    try:
        resp = ollama_chat(model, messages, ollama_url, max_tokens=max_tokens)
    except Exception as e:
        return {"model": model, "error": str(e)}

    parsed = parse_response(resp["content"])
    actual_cmds = parsed.get("commands", [])

    # Score against expected (same as harness.py)
    scores = score_result(example, actual_cmds, parsed)

    # Execute commands on live server via RCON
    rcon_results = rcon_execute_batch(actual_cmds, rcon_host, rcon_port, rcon_password)
    rcon_scores = compute_rcon_score(rcon_results)

    return {
        "model": model,
        "mode": mode,
        "actual_cmds": actual_cmds,
        "message": parsed.get("message", ""),
        "reasoning": parsed.get("reasoning", ""),
        "raw_content": resp["content"],
        "duration_ms": resp["duration_ms"],
        "eval_tokens": resp.get("eval_count", 0),
        "done_reason": resp.get("done_reason", ""),
        "rcon_results": rcon_results,
        **scores,
        **rcon_scores,
    }


def _run_and_report_model(model: str, example: dict, ollama_url: str,
                          rcon_host: str, rcon_port: int, rcon_password: str,
                          max_tokens: int) -> dict:
    """Run *model* on *example*, print a one-model status report, then reset the server."""
    print(f" {model}:")
    result = run_model_on_example(
        model, example, ollama_url, rcon_host, rcon_port, rcon_password, max_tokens
    )
    if "error" in result:
        print(f" ERROR: {result['error']}")
    else:
        status = "OK" if result["cmd_match"] else "MISS"
        rcon = f"{result['rcon_succeeded']}/{result['rcon_total']} RCON ok"
        flags = ""
        if not result["syntax_ok"]:
            flags += " [SYNTAX]"
        if not result["rcon_success"]:
            flags += " [RCON-FAIL]"
        if result.get("hallucinated"):
            flags += " [HALLUC]"
        print(f" [{status}] {rcon}{flags} [{result['duration_ms']}ms]")
        print(f" Cmds: {result['actual_cmds'][:3]}")
        for err in result["rcon_errors"][:2]:
            # Exception-level failures carry their text in "error", not "response"
            detail = err["response"] or err.get("error") or ""
            print(f" RCON err: {err['command'][:50]} -> {detail[:60]}")

    # Wait and reset
    time.sleep(2)
    rcon_reset(rcon_host, rcon_port, rcon_password)
    return result


def run_live_bakeoff(models: list, ollama_url: str, rcon_host: str, rcon_port: int,
                     rcon_password: str, max_examples: int = 0,
                     categories: list = None, max_tokens: int = 1500) -> dict:
    """Run the full live bake-off comparing two models.

    Args:
        models: Model names; the first two are compared and must differ.
        ollama_url: Base URL passed to ``ollama_chat``.
        rcon_host, rcon_port, rcon_password: RCON connection settings.
        max_examples: Keep only the first N examples after filtering (0 = all).
        categories: If given, keep only examples whose category is listed.
        max_tokens: Token limit passed to each model call.

    Returns:
        ``{"error": ...}`` if the models are invalid, RCON is unreachable or a
        model fails to warm up; otherwise a dict with run metadata, the
        skipped examples and one entry per example under ``"results"``.
    """
    # Results are keyed by model name, so identical names would overwrite each other
    if len(models) < 2 or models[0] == models[1]:
        return {"error": "two distinct model names are required"}
    model_a, model_b = models[0], models[1]

    # Load dataset
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    # Filter by categories
    if categories:
        examples = [ex for ex in examples if ex.get("category") in categories]

    # Filter out skippable examples
    filtered = []
    skipped = []
    for ex in examples:
        skip, reason = should_skip_example(ex)
        if skip:
            skipped.append({"id": ex.get("id", "?"), "reason": reason})
        else:
            filtered.append(ex)
    examples = filtered

    # Limit examples
    if max_examples > 0:
        examples = examples[:max_examples]

    total = len(examples)

    print(f"Live Bake-off: {model_a} vs {model_b}")
    print(f" Dataset: {total} examples ({len(skipped)} skipped)")
    print(f" Ollama: {ollama_url}")
    print(f" RCON: {rcon_host}:{rcon_port}")
    print("=" * 80)

    # Test RCON connectivity first
    print("Testing RCON connection...")
    test_result = rcon_execute("list", rcon_host, rcon_port, rcon_password)
    if test_result["error"] is not None:
        print(f" RCON connection FAILED: {test_result['error']}")
        print(" Aborting live bake-off.")
        return {"error": f"RCON connection failed: {test_result['error']}"}
    print(f" RCON OK: {test_result['response']}")

    # Warm up both models
    for model in [model_a, model_b]:
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
                                 ollama_url, max_tokens=5)
            print(f" Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f" ERROR loading {model}: {e}")
            return {"error": f"Failed to load {model}: {e}"}

    print("\n" + "=" * 80)

    all_results = []
    for i, ex in enumerate(examples):
        eid = ex.get("id", f"ex-{i}")
        category = ex.get("category", "?")
        query = ex["input"]["user_message"]

        print(f"\n[{i+1}/{total}] ({category}) {query[:60]}")
        print("-" * 70)

        # Model A runs first, then model B; the server is reset after each
        result_a = _run_and_report_model(
            model_a, ex, ollama_url, rcon_host, rcon_port, rcon_password, max_tokens
        )
        result_b = _run_and_report_model(
            model_b, ex, ollama_url, rcon_host, rcon_port, rcon_password, max_tokens
        )

        all_results.append({
            "id": eid,
            "category": category,
            "query": query,
            "expected": ex["output"].get("commands", []),
            model_a: result_a,
            model_b: result_b,
        })

    return {
        "models": [model_a, model_b],
        "ollama_url": ollama_url,
        "rcon_host": rcon_host,
        "rcon_port": rcon_port,
        "timestamp": int(time.time()),
        "dataset_size": total,
        "skipped": skipped,
        "results": all_results,
    }


def compute_model_summary(results: list, model: str) -> dict:
    """Compute aggregate metrics for a single model across all results.

    Only results where *model* produced a non-error entry are counted.
    Returns ``{"n": 0}`` when there are none; otherwise a dict with ``model``,
    ``n``, ``overall`` metrics and ``by_category`` metrics.
    """
    valid = [r for r in results if model in r and "error" not in r[model]]
    n = len(valid)
    if n == 0:
        return {"n": 0}

    def pct(rows, key, negate=False):
        """Percentage of *rows* whose flag *key* is truthy (or falsy if *negate*)."""
        hits = sum(1 for r in rows if bool(r[model].get(key, False)) != negate)
        return round(hits / len(rows) * 100, 1)

    # Per-category
    cats = defaultdict(list)
    for r in valid:
        cats[r["category"]].append(r)

    cat_scores = {}
    for cat in sorted(cats):
        rows = cats[cat]
        cat_scores[cat] = {
            "n": len(rows),
            "cmd_match_%": pct(rows, "cmd_match"),
            "exact_match_%": pct(rows, "exact_match"),
            "syntax_ok_%": pct(rows, "syntax_ok"),
            "safety_%": pct(rows, "safety_ok"),
            "rcon_success_%": pct(rows, "rcon_success"),
        }

    avg_latency = int(sum(r[model]["duration_ms"] for r in valid) / n)
    avg_tokens = int(sum(r[model].get("eval_tokens", 0) for r in valid) / n)
    total_rcon_cmds = sum(r[model].get("rcon_total", 0) for r in valid)
    total_rcon_ok = sum(r[model].get("rcon_succeeded", 0) for r in valid)

    # With no commands executed at all, per-command success is reported as 100%
    if total_rcon_cmds > 0:
        rcon_cmd_success = round(total_rcon_ok / total_rcon_cmds * 100, 1)
    else:
        rcon_cmd_success = 100.0

    return {
        "model": model,
        "n": n,
        "overall": {
            "cmd_match_%": pct(valid, "cmd_match"),
            "exact_match_%": pct(valid, "exact_match"),
            "syntax_ok_%": pct(valid, "syntax_ok"),
            "safety_%": pct(valid, "safety_ok"),
            "rcon_success_%": pct(valid, "rcon_success"),
            "no_gratuitous_tp_%": pct(valid, "has_gratuitous_tp", negate=True),
            "no_hallucination_%": pct(valid, "hallucinated", negate=True),
            "empty_%": pct(valid, "is_empty"),
            "rcon_cmd_success_%": rcon_cmd_success,
            "avg_latency_ms": avg_latency,
            "avg_tokens": avg_tokens,
        },
        "by_category": cat_scores,
    }


def _format_category_rcon(cat_scores: dict) -> str:
    """Format a category's RCON success rate, or N/A when it has no score."""
    value = cat_scores.get("rcon_success_%")
    if value is None:
        return " N/A"
    return f"{value:>6.1f}%"


def print_comparison(bakeoff_data: dict):
    """Print a side-by-side comparison table.

    Returns the two per-model summaries as ``(summary_a, summary_b)``.
    """
    models = bakeoff_data["models"]
    results = bakeoff_data["results"]
    model_a, model_b = models

    summary_a = compute_model_summary(results, model_a)
    summary_b = compute_model_summary(results, model_b)

    print("\n" + "=" * 80)
    print("LIVE BAKE-OFF RESULTS")
    print(f" {model_a} vs {model_b}")
    print(f" {summary_a['n']} examples evaluated on live server")
    ts = bakeoff_data.get("timestamp", 0)
    print(f" {time.strftime('%Y-%m-%d %H:%M', time.localtime(ts))}")
    print("=" * 80)

    if summary_a["n"] == 0 or summary_b["n"] == 0:
        print(" Insufficient results for comparison.")
        return summary_a, summary_b

    ov_a = summary_a["overall"]
    ov_b = summary_b["overall"]

    # Side-by-side overall metrics: (label, key, higher_is_better)
    metrics = [
        ("Command match", "cmd_match_%", True),
        ("Exact match", "exact_match_%", True),
        ("Syntax correct", "syntax_ok_%", True),
        ("Safety compliance", "safety_%", True),
        ("RCON success", "rcon_success_%", True),
        ("RCON cmd success", "rcon_cmd_success_%", True),
        ("No gratuitous tp", "no_gratuitous_tp_%", True),
        ("No hallucination", "no_hallucination_%", True),
        ("Empty responses", "empty_%", False),
        ("Avg latency (ms)", "avg_latency_ms", False),
        ("Avg tokens", "avg_tokens", False),
    ]

    hdr_a = model_a[:20]
    hdr_b = model_b[:20]
    print(f"\n {'Metric':<22} {hdr_a:>14} {hdr_b:>14} Winner")
    print(f" {'-'*22} {'-'*14} {'-'*14} {'-'*10}")

    wins = {model_a: 0, model_b: 0}
    for label, key, higher_is_better in metrics:
        val_a = ov_a.get(key, 0)
        val_b = ov_b.get(key, 0)

        # Format values
        if "%" in key:
            s_a = f"{val_a:>6.1f}%"
            s_b = f"{val_b:>6.1f}%"
        else:
            s_a = f"{val_a:>7}"
            s_b = f"{val_b:>7}"

        # Determine winner; differences under 0.5 count as a tie
        diff = val_a - val_b
        if abs(diff) < 0.5:
            winner = "TIE"
        elif (diff > 0) == higher_is_better:
            winner = "<-"
            wins[model_a] += 1
        else:
            winner = "->"
            wins[model_b] += 1

        print(f" {label:<22} {s_a:>14} {s_b:>14} {winner}")

    print(f"\n Score: {model_a} {wins[model_a]} wins, {model_b} {wins[model_b]} wins")

    # Per-category comparison
    cats_a = summary_a.get("by_category", {})
    cats_b = summary_b.get("by_category", {})
    all_cats = sorted(set(cats_a) | set(cats_b))
    if all_cats:
        print("\n Per-Category RCON Success Rate:")
        print(f" {'Category':<16} {hdr_a:>14} {hdr_b:>14}")
        print(f" {'-'*16} {'-'*14} {'-'*14}")
        for cat in all_cats:
            rcon_a = _format_category_rcon(cats_a.get(cat, {}))
            rcon_b = _format_category_rcon(cats_b.get(cat, {}))
            print(f" {cat:<16} {rcon_a:>14} {rcon_b:>14}")

    # Per-example comparison for disagreements
    disagreements = [
        r for r in results
        if model_a in r and model_b in r
        and "error" not in r[model_a] and "error" not in r[model_b]
        and r[model_a]["rcon_success"] != r[model_b]["rcon_success"]
    ]
    if disagreements:
        print(f"\n RCON Disagreements ({len(disagreements)} examples):")
        print(f" {'-'*70}")
        for r in disagreements[:10]:
            rcon_a_ok = "OK" if r[model_a]["rcon_success"] else "FAIL"
            rcon_b_ok = "OK" if r[model_b]["rcon_success"] else "FAIL"
            print(f" [{r['id']}] {r['query'][:50]}")
            print(f" {model_a}: RCON {rcon_a_ok} | {model_b}: RCON {rcon_b_ok}")

    return summary_a, summary_b


def _model_slug(name: str) -> str:
    """Turn a model name into a filename-safe token (e.g. ``a:b`` -> ``a_b``)."""
    # Namespaced model names can contain "/", which would otherwise create
    # a path into a directory that does not exist.
    return re.sub(r"[^A-Za-z0-9._-]+", "_", name)


def main():
    """Parse CLI arguments, run the bake-off, print the comparison and save JSON results."""
    parser = argparse.ArgumentParser(
        description="Live bake-off: compare two models on a real Minecraft server via RCON"
    )
    parser.add_argument("--models", nargs=2, default=["gemma3n:e4b", "qwen3:8b"],
                        metavar=("MODEL_A", "MODEL_B"),
                        help="Two models to compare (default: gemma3n:e4b qwen3:8b)")
    parser.add_argument("--ollama-url", default="http://192.168.0.179:11434",
                        help="Ollama API URL")
    parser.add_argument("--rcon-host", default="192.168.0.244",
                        help="RCON host (default: 192.168.0.244)")
    parser.add_argument("--rcon-port", type=int, default=25577,
                        help="RCON port (default: 25577)")
    # Prefer the environment over the built-in fallback so the secret need not
    # live in source or shell history.
    parser.add_argument("--rcon-password",
                        default=os.environ.get("RCON_PASSWORD", "REDACTED_RCON"),
                        help="RCON password (default: $RCON_PASSWORD if set)")
    parser.add_argument("--max-examples", type=int, default=0,
                        help="Limit number of examples (0 = all)")
    parser.add_argument("--max-tokens", type=int, default=1500,
                        help="Max tokens per model response")
    parser.add_argument("--categories", nargs="+", default=None,
                        help="Filter to specific categories (e.g. command_gen safety)")
    args = parser.parse_args()

    # Run bake-off
    bakeoff_data = run_live_bakeoff(
        models=args.models,
        ollama_url=args.ollama_url,
        rcon_host=args.rcon_host,
        rcon_port=args.rcon_port,
        rcon_password=args.rcon_password,
        max_examples=args.max_examples,
        categories=args.categories,
        max_tokens=args.max_tokens,
    )

    if "error" in bakeoff_data:
        print(f"\nBake-off failed: {bakeoff_data['error']}")
        sys.exit(1)

    # Print comparison
    summary_a, summary_b = print_comparison(bakeoff_data)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    model_a_slug = _model_slug(args.models[0])
    model_b_slug = _model_slug(args.models[1])
    out_path = RESULTS_DIR / f"live_bakeoff_{model_a_slug}_vs_{model_b_slug}_{ts}.json"

    save_data = {
        "summary": {
            args.models[0]: summary_a,
            args.models[1]: summary_b,
        },
        "bakeoff_data": bakeoff_data,
    }
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(save_data, f, indent=2, default=str)
    print(f"\nResults saved to {out_path}")


if __name__ == "__main__":
    main()