9d789d2524
Eval harness: - Mode-aware scoring: sudo=strict (exact match), pray/god=soft (category match, in-character, appropriate intensity) - New metrics: cmd_category_match, appropriate_intensity, scoring_mode breakdown - Eval defaults to steel141 (192.168.0.141) — prod GPU reserved for serving Dataset (213 examples): - Added 31 boundary/adversarial examples (safety edges, abstention, near-boundary) - Updated pray example reasoning: character-driven logic, not prescriptive outputs - Tagged pray examples with scoring_mode=soft Playtest tooling: - whitelist.sh: add/remove/list across all 3 servers - FRIENDS_INVITE.md + Discord version: playtester recruitment docs - Server addresses and implementation details for both training servers PLAN.md: - Three-tier constraint model documented (sudo/pray/god_system) - Success criteria split by scoring mode - All session decisions logged Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
582 lines
21 KiB
Python
582 lines
21 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Live Bake-off: Compare two Ollama models on a real Minecraft Paper server via RCON.
|
|
|
|
Sends each test example to both models, executes the returned commands on the
|
|
live server via RCON, and scores results including a new "rcon_success" metric.
|
|
|
|
Usage:
|
|
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b
|
|
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --max-examples 5
|
|
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --categories command_gen
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from mcrcon import MCRcon
|
|
|
|
ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from agent.prompts.system_prompts import get_prompt
|
|
from eval.harness import score_result, build_user_message, parse_response, determine_mode, ollama_chat
|
|
|
|
# Evaluation examples are read from DATASET (one JSON object per line);
# bake-off reports are written under RESULTS_DIR.
DATASET = ROOT.joinpath("data", "processed", "seed_dataset.jsonl")
RESULTS_DIR = ROOT.joinpath("eval", "results")
|
|
|
|
# Regex fragments that, when found in an RCON reply, mark the command as failed.
RCON_ERROR_PATTERNS = [
    r"Unknown or incomplete command",
    r"No entity was found",
    r"Incorrect argument",
    r"Expected whitespace",
    r"Invalid or unknown",
    r"An unexpected error occurred",
    r"That position is not loaded",
    r"Could not set the block",
    r"Nothing changed",
    r"No player was found",
    r"Expected block",
    r"Expected.*but got",
    r"Unknown item",
    r"Unknown effect",
    r"Unexpected.*at position",
]

# Every fragment OR-ed into one case-insensitive pattern, so classifying a
# reply is a single search.
_RCON_ERROR_ALTERNATION = "|".join(RCON_ERROR_PATTERNS)
RCON_ERROR_RE = re.compile(_RCON_ERROR_ALTERNATION, flags=re.IGNORECASE)
|
|
|
|
|
|
def rcon_execute(cmd: str, host: str, port: int, password: str) -> dict:
    """Run one command over a fresh RCON connection.

    Returns a dict with the keys "command", "response", "success" and "error".
    "success" is False when the reply matches RCON_ERROR_RE or when any
    exception is raised; in the exception case "response" is empty and
    "error" holds the exception text.
    """
    outcome = {"command": cmd, "response": "", "success": False, "error": None}
    try:
        with MCRcon(host, password, port=port) as mcr:
            reply = mcr.command(cmd)
            matched = RCON_ERROR_RE.search(reply)
            outcome["response"] = reply.strip()
            outcome["success"] = matched is None
    except Exception as exc:
        # Connection, auth and transport problems all end up here; report
        # them in the result instead of raising.
        outcome.update(response="", success=False, error=str(exc))
    return outcome
|
|
|
|
|
|
def rcon_execute_batch(commands: list, host: str, port: int, password: str) -> list:
    """Execute commands sequentially over a single RCON connection.

    Args:
        commands: Command strings to run, in order.
        host: RCON host.
        port: RCON port.
        password: RCON password.

    Returns:
        One result dict per command, in input order, each with the keys
        "command", "response", "success" and "error". A command fails when
        its reply matches RCON_ERROR_RE or when sending it raises. An empty
        ``commands`` list yields an empty list without opening a connection.
    """
    results = []
    if not commands:
        return results
    try:
        with MCRcon(host, password, port=port) as mcr:
            for cmd in commands:
                try:
                    response = mcr.command(cmd)
                    is_error = bool(RCON_ERROR_RE.search(response))
                    results.append({
                        "command": cmd,
                        "response": response.strip(),
                        "success": not is_error,
                        "error": None,
                    })
                except Exception as e:
                    # A failure on one command must not abort the batch.
                    results.append({
                        "command": cmd,
                        "response": "",
                        "success": False,
                        "error": str(e),
                    })
    except Exception as e:
        # Connection-level failure. Only commands that have no result yet are
        # marked failed: if the error is raised after the loop ran (e.g. while
        # closing the connection), the recorded outcomes must be kept and not
        # duplicated, so the result list always has one entry per command.
        for cmd in commands[len(results):]:
            results.append({
                "command": cmd,
                "response": "",
                "success": False,
                "error": f"RCON connection failed: {e}",
            })
    return results
|
|
|
|
|
|
def rcon_reset(host: str, port: int, password: str):
    """Send "effect clear @a" over RCON as a reset between model runs.

    Any exception is swallowed: the reset is best-effort and must never
    interrupt a bake-off.
    """
    reset_command = "effect clear @a"
    try:
        with MCRcon(host, password, port=port) as connection:
            connection.command(reset_command)
    except Exception:
        # Best-effort reset
        return
|
|
|
|
|
|
def should_skip_example(example: dict) -> tuple:
    """Decide whether an example is excluded from live-server testing.

    Returns a ``(should_skip, reason)`` pair. An example is skipped only when
    it expects no commands and is either in the "safety" category or carries
    the "destructive" safety flag; otherwise ``(False, "")`` is returned.
    """
    output = example.get("output", {})
    expects_nothing = not output.get("commands", [])
    flags = output.get("safety_flags", [])

    # Refusal examples expect an empty command list; they are not exercised
    # against the live server.
    if expects_nothing and example.get("category", "") == "safety":
        return True, "safety refusal (empty commands)"

    flagged_destructive = "destructive" in flags
    if flagged_destructive and expects_nothing:
        return True, "destructive refusal"

    return False, ""
|
|
|
|
|
|
def compute_rcon_score(rcon_results: list) -> dict:
    """Summarise per-command RCON results for one example.

    Returns a dict with "rcon_success" (True when nothing failed, including
    the case of no commands at all), the counts "rcon_total",
    "rcon_succeeded" and "rcon_failed", and "rcon_errors": the command,
    response and error of every failed entry.
    """
    failures = [entry for entry in rcon_results if not entry["success"]]
    total = len(rcon_results)
    failed = len(failures)
    return {
        # With no commands this is vacuously true.
        "rcon_success": failed == 0,
        "rcon_total": total,
        "rcon_succeeded": total - failed,
        "rcon_failed": failed,
        "rcon_errors": [
            {
                "command": entry["command"],
                "response": entry["response"],
                "error": entry.get("error"),
            }
            for entry in failures
        ],
    }
|
|
|
|
|
|
def run_model_on_example(model: str, example: dict, ollama_url: str,
                         rcon_host: str, rcon_port: int, rcon_password: str,
                         max_tokens: int = 1500) -> dict:
    """Generate commands for one example with one model, run them, score them.

    The model is prompted with the system prompt for the example's mode and
    the example's user message. Its parsed commands are scored against the
    expected output and then executed on the server via RCON.

    Returns ``{"model", "error"}`` if the model call raises; otherwise a
    record with the generation details, the raw RCON results, and the keys
    of the offline scores followed by the RCON scores merged in.
    """
    mode = determine_mode(example)
    system_turn = {"role": "system", "content": get_prompt(mode)}
    user_turn = {"role": "user", "content": build_user_message(example)}

    # Ask the model; a failed call is reported, not raised.
    try:
        resp = ollama_chat(model, [system_turn, user_turn], ollama_url,
                           max_tokens=max_tokens)
    except Exception as exc:
        return {"model": model, "error": str(exc)}

    parsed = parse_response(resp["content"])
    actual_cmds = parsed.get("commands", [])

    # Offline scoring against the expected output.
    scores = score_result(example, actual_cmds, parsed)

    # Live execution and its success metrics.
    rcon_results = rcon_execute_batch(actual_cmds, rcon_host, rcon_port, rcon_password)
    rcon_scores = compute_rcon_score(rcon_results)

    record = {
        "model": model,
        "mode": mode,
        "actual_cmds": actual_cmds,
        "message": parsed.get("message", ""),
        "reasoning": parsed.get("reasoning", ""),
        "raw_content": resp["content"],
        "duration_ms": resp["duration_ms"],
        "eval_tokens": resp.get("eval_count", 0),
        "done_reason": resp.get("done_reason", ""),
        "rcon_results": rcon_results,
    }
    # Merge order matters: score keys first, RCON keys last (later wins).
    record.update(scores)
    record.update(rcon_scores)
    return record
|
|
|
|
|
|
def _print_model_result(result: dict) -> None:
    """Print the console report for one model's result on one example."""
    if "error" in result:
        print(f" ERROR: {result['error']}")
        return

    status = "OK" if result["cmd_match"] else "MISS"
    rcon = f"{result['rcon_succeeded']}/{result['rcon_total']} RCON ok"
    flags = ""
    if not result["syntax_ok"]:
        flags += " [SYNTAX]"
    if not result["rcon_success"]:
        flags += " [RCON-FAIL]"
    if result.get("hallucinated"):
        flags += " [HALLUC]"
    print(f" [{status}] {rcon}{flags} [{result['duration_ms']}ms]")
    print(f" Cmds: {result['actual_cmds'][:3]}")
    if result["rcon_errors"]:
        # Show at most two failures to keep the per-example output short.
        for err in result["rcon_errors"][:2]:
            print(f" RCON err: {err['command'][:50]} -> {err['response'][:60]}")


def run_live_bakeoff(models: list, ollama_url: str,
                     rcon_host: str, rcon_port: int, rcon_password: str,
                     max_examples: int = 0, categories: list = None,
                     max_tokens: int = 1500) -> dict:
    """Run the full live bake-off comparing two models.

    Args:
        models: Model names; the first two entries are compared.
        ollama_url: Base URL passed to ``ollama_chat``.
        rcon_host: RCON host of the server the commands run on.
        rcon_port: RCON port.
        rcon_password: RCON password.
        max_examples: Keep only the first N examples after filtering
            (0 or less keeps all).
        categories: If given, only examples whose "category" is listed.
        max_tokens: Generation limit passed to each model call.

    Returns:
        ``{"error": ...}`` if the RCON connectivity check or a model warm-up
        fails. Otherwise a dict with "models", "ollama_url", "rcon_host",
        "rcon_port", "timestamp", "dataset_size", "skipped" and "results",
        where each result holds the example id, category, query, expected
        commands and one entry per model keyed by model name.
    """
    # Load dataset (JSON Lines; blank lines ignored).
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    # Filter by categories
    if categories:
        examples = [ex for ex in examples if ex.get("category") in categories]

    # Filter out skippable examples, remembering why each was dropped.
    filtered = []
    skipped = []
    for ex in examples:
        skip, reason = should_skip_example(ex)
        if skip:
            skipped.append({"id": ex.get("id", "?"), "reason": reason})
        else:
            filtered.append(ex)
    examples = filtered

    # Limit examples
    if max_examples > 0:
        examples = examples[:max_examples]

    total = len(examples)
    model_a, model_b = models[0], models[1]

    print(f"Live Bake-off: {model_a} vs {model_b}")
    print(f" Dataset: {total} examples ({len(skipped)} skipped)")
    print(f" Ollama: {ollama_url}")
    print(f" RCON: {rcon_host}:{rcon_port}")
    print("=" * 80)

    # Test RCON connectivity first so a dead server aborts before any
    # model time is spent.
    print("Testing RCON connection...")
    test_result = rcon_execute("list", rcon_host, rcon_port, rcon_password)
    if test_result["error"]:
        print(f" RCON connection FAILED: {test_result['error']}")
        print(" Aborting live bake-off.")
        return {"error": f"RCON connection failed: {test_result['error']}"}
    print(f" RCON OK: {test_result['response']}")

    # Warm up both models
    for model in [model_a, model_b]:
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
                                 ollama_url, max_tokens=5)
            print(f" Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f" ERROR loading {model}: {e}")
            return {"error": f"Failed to load {model}: {e}"}

    print("\n" + "=" * 80)

    all_results = []
    for i, ex in enumerate(examples):
        eid = ex.get("id", f"ex-{i}")
        category = ex.get("category", "?")
        query = ex["input"]["user_message"]

        print(f"\n[{i+1}/{total}] ({category}) {query[:60]}")
        print("-" * 70)

        # Same procedure for each model: generate + execute, report, then
        # pause and reset before the next run.
        per_model = {}
        for model in (model_a, model_b):
            print(f" {model}:")
            result = run_model_on_example(
                model, ex, ollama_url, rcon_host, rcon_port, rcon_password, max_tokens
            )
            _print_model_result(result)

            # Wait and reset
            time.sleep(2)
            rcon_reset(rcon_host, rcon_port, rcon_password)

            per_model[model] = result

        all_results.append({
            "id": eid,
            "category": category,
            "query": query,
            "expected": ex["output"].get("commands", []),
            **per_model,
        })

    return {
        "models": [model_a, model_b],
        "ollama_url": ollama_url,
        "rcon_host": rcon_host,
        "rcon_port": rcon_port,
        "timestamp": int(time.time()),
        "dataset_size": total,
        "skipped": skipped,
        "results": all_results,
    }
|
|
|
|
|
|
def compute_model_summary(results: list, model: str) -> dict:
    """Compute aggregate metrics for a single model across all results.

    Only results that contain an entry for ``model`` without an "error" key
    are counted.

    Args:
        results: Per-example result dicts; each has a "category" and one
            entry per model keyed by model name.
        model: Model name to summarise.

    Returns:
        ``{"n": 0}`` when no valid result exists. Otherwise a dict with
        "model", "n", "overall" (percentages rounded to one decimal, plus
        integer average latency and token count) and "by_category"
        (per-category percentages, categories in sorted order).
    """
    valid = [r for r in results if model in r and "error" not in r[model]]
    n = len(valid)
    if n == 0:
        return {"n": 0}

    def share(rows, predicate):
        """Percentage (one decimal) of rows whose model entry satisfies predicate."""
        return round(sum(1 for r in rows if predicate(r[model])) / len(rows) * 100, 1)

    def has(key):
        # Overall metrics tolerate a missing key (counts as False).
        return lambda entry: entry.get(key, False)

    def lacks(key):
        return lambda entry: not entry.get(key, False)

    def must_have(key):
        # Per-category metrics index directly, so a missing key raises KeyError.
        return lambda entry: entry[key]

    # Group by category. Every row comes from ``valid``, so no group is empty
    # and none contains an errored entry.
    cats = defaultdict(list)
    for r in valid:
        cats[r["category"]].append(r)

    cat_scores = {}
    for cat, rows in sorted(cats.items()):
        cat_scores[cat] = {
            "n": len(rows),
            "cmd_match_%": share(rows, must_have("cmd_match")),
            "exact_match_%": share(rows, must_have("exact_match")),
            "syntax_ok_%": share(rows, must_have("syntax_ok")),
            "safety_%": share(rows, must_have("safety_ok")),
            "rcon_success_%": share(rows, must_have("rcon_success")),
        }

    avg_latency = int(sum(r[model]["duration_ms"] for r in valid) / n)
    avg_tokens = int(sum(r[model].get("eval_tokens", 0) for r in valid) / n)

    # Command-level success pools every executed command, unlike
    # "rcon_success_%" which is counted per example.
    total_rcon_cmds = sum(r[model].get("rcon_total", 0) for r in valid)
    total_rcon_ok = sum(r[model].get("rcon_succeeded", 0) for r in valid)

    return {
        "model": model,
        "n": n,
        "overall": {
            "cmd_match_%": share(valid, has("cmd_match")),
            "exact_match_%": share(valid, has("exact_match")),
            "syntax_ok_%": share(valid, has("syntax_ok")),
            "safety_%": share(valid, has("safety_ok")),
            "rcon_success_%": share(valid, has("rcon_success")),
            "no_gratuitous_tp_%": share(valid, lacks("has_gratuitous_tp")),
            "no_hallucination_%": share(valid, lacks("hallucinated")),
            "empty_%": share(valid, has("is_empty")),
            # No commands executed at all counts as 100%.
            "rcon_cmd_success_%": round(total_rcon_ok / total_rcon_cmds * 100, 1) if total_rcon_cmds > 0 else 100.0,
            "avg_latency_ms": avg_latency,
            "avg_tokens": avg_tokens,
        },
        "by_category": cat_scores,
    }
|
|
|
|
|
|
def print_comparison(bakeoff_data: dict):
    """Print a side-by-side comparison table.

    Prints the overall metrics with a per-metric winner, the per-category
    RCON success rate, and up to ten examples where exactly one model's
    commands all succeeded on the server.

    Args:
        bakeoff_data: Dict with "models" (exactly two names), "results" and
            optionally "timestamp", as returned by ``run_live_bakeoff``.

    Returns:
        ``(summary_a, summary_b)``: the ``compute_model_summary`` output for
        each model. Returned early, after a notice, if either model has no
        valid results.
    """
    models = bakeoff_data["models"]
    results = bakeoff_data["results"]
    model_a, model_b = models

    summary_a = compute_model_summary(results, model_a)
    summary_b = compute_model_summary(results, model_b)

    print("\n" + "=" * 80)
    print("LIVE BAKE-OFF RESULTS")
    print(f" {model_a} vs {model_b}")
    print(f" {summary_a['n']} examples evaluated on live server")
    ts = bakeoff_data.get("timestamp", 0)
    print(f" {time.strftime('%Y-%m-%d %H:%M', time.localtime(ts))}")
    print("=" * 80)

    if summary_a["n"] == 0 or summary_b["n"] == 0:
        print(" Insufficient results for comparison.")
        return summary_a, summary_b

    ov_a = summary_a["overall"]
    ov_b = summary_b["overall"]

    # Side-by-side overall metrics: (label, key, higher_is_better)
    metrics = [
        ("Command match", "cmd_match_%", True),
        ("Exact match", "exact_match_%", True),
        ("Syntax correct", "syntax_ok_%", True),
        ("Safety compliance", "safety_%", True),
        ("RCON success", "rcon_success_%", True),
        ("RCON cmd success", "rcon_cmd_success_%", True),
        ("No gratuitous tp", "no_gratuitous_tp_%", True),
        ("No hallucination", "no_hallucination_%", True),
        ("Empty responses", "empty_%", False),
        ("Avg latency (ms)", "avg_latency_ms", False),
        ("Avg tokens", "avg_tokens", False),
    ]

    hdr_a = model_a[:20]
    hdr_b = model_b[:20]
    print(f"\n {'Metric':<22} {hdr_a:>14} {hdr_b:>14} Winner")
    print(f" {'-'*22} {'-'*14} {'-'*14} {'-'*10}")

    wins = {model_a: 0, model_b: 0}

    for label, key, higher_is_better in metrics:
        val_a = ov_a.get(key, 0)
        val_b = ov_b.get(key, 0)

        # Format values
        if "%" in key:
            s_a = f"{val_a:>6.1f}%"
            s_b = f"{val_b:>6.1f}%"
        else:
            s_a = f"{val_a:>7}"
            s_b = f"{val_b:>7}"

        # Determine winner; differences under 0.5 count as a tie.
        diff = val_a - val_b
        if abs(diff) < 0.5:
            winner = "TIE"
        elif (diff > 0) == higher_is_better:
            winner = "<-"
            wins[model_a] += 1
        else:
            winner = "->"
            wins[model_b] += 1

        print(f" {label:<22} {s_a:>14} {s_b:>14} {winner}")

    print(f"\n Score: {model_a} {wins[model_a]} wins, {model_b} {wins[model_b]} wins")

    # Per-category comparison
    by_cat_a = summary_a.get("by_category", {})
    by_cat_b = summary_b.get("by_category", {})
    all_cats = sorted(set(by_cat_a) | set(by_cat_b))

    def rcon_rate_cell(cat_summary: dict) -> str:
        # A category absent for this model, or lacking the metric, is shown
        # as N/A; a non-numeric placeholder must never reach the float format.
        rate = cat_summary.get("rcon_success_%")
        return f"{rate:>6.1f}%" if rate is not None else " N/A"

    if all_cats:
        print("\n Per-Category RCON Success Rate:")
        print(f" {'Category':<16} {hdr_a:>14} {hdr_b:>14}")
        print(f" {'-'*16} {'-'*14} {'-'*14}")
        for cat in all_cats:
            rcon_a = rcon_rate_cell(by_cat_a.get(cat, {}))
            rcon_b = rcon_rate_cell(by_cat_b.get(cat, {}))
            print(f" {cat:<16} {rcon_a:>14} {rcon_b:>14}")

    # Per-example comparison for disagreements
    disagreements = [
        r for r in results
        if model_a in r and model_b in r
        and "error" not in r[model_a] and "error" not in r[model_b]
        and r[model_a]["rcon_success"] != r[model_b]["rcon_success"]
    ]

    if disagreements:
        print(f"\n RCON Disagreements ({len(disagreements)} examples):")
        print(f" {'-'*70}")
        for r in disagreements[:10]:
            rcon_a_ok = "OK" if r[model_a]["rcon_success"] else "FAIL"
            rcon_b_ok = "OK" if r[model_b]["rcon_success"] else "FAIL"
            print(f" [{r['id']}] {r['query'][:50]}")
            print(f" {model_a}: RCON {rcon_a_ok} | {model_b}: RCON {rcon_b_ok}")

    return summary_a, summary_b
|
|
|
|
|
|
def _model_slug(model_name: str) -> str:
    """Turn a model name into a string that is safe as a filename component."""
    # Model names can contain ":" (tag) and "/" (namespace); a "/" left in
    # place would point the output path at a non-existent subdirectory.
    return re.sub(r"[^A-Za-z0-9._-]+", "_", model_name)


def main():
    """Command-line entry point: run the bake-off, print it, save it as JSON.

    Exits with status 1 if the bake-off reports an error. Otherwise the
    per-model summaries and the raw bake-off data are written to a
    timestamped file under RESULTS_DIR.
    """
    parser = argparse.ArgumentParser(
        description="Live bake-off: compare two models on a real Minecraft server via RCON"
    )
    parser.add_argument("--models", nargs=2, default=["gemma3n:e4b", "qwen3:8b"],
                        metavar=("MODEL_A", "MODEL_B"),
                        help="Two models to compare (default: gemma3n:e4b qwen3:8b)")
    parser.add_argument("--ollama-url", default="http://192.168.0.141:11434",
                        help="Ollama API URL")
    parser.add_argument("--rcon-host", default="192.168.0.244",
                        help="RCON host (default: 192.168.0.244)")
    parser.add_argument("--rcon-port", type=int, default=25577,
                        help="RCON port (default: 25577)")
    # NOTE(review): the default password lives in source; consider taking it
    # from the environment instead.
    parser.add_argument("--rcon-password", default="REDACTED_RCON",
                        help="RCON password")
    parser.add_argument("--max-examples", type=int, default=0,
                        help="Limit number of examples (0 = all)")
    parser.add_argument("--max-tokens", type=int, default=1500,
                        help="Max tokens per model response")
    parser.add_argument("--categories", nargs="+", default=None,
                        help="Filter to specific categories (e.g. command_gen safety)")
    args = parser.parse_args()

    # Run bake-off
    bakeoff_data = run_live_bakeoff(
        models=args.models,
        ollama_url=args.ollama_url,
        rcon_host=args.rcon_host,
        rcon_port=args.rcon_port,
        rcon_password=args.rcon_password,
        max_examples=args.max_examples,
        categories=args.categories,
        max_tokens=args.max_tokens,
    )

    if "error" in bakeoff_data:
        print(f"\nBake-off failed: {bakeoff_data['error']}")
        sys.exit(1)

    # Print comparison
    summary_a, summary_b = print_comparison(bakeoff_data)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    model_a_slug = _model_slug(args.models[0])
    model_b_slug = _model_slug(args.models[1])
    out_path = RESULTS_DIR / f"live_bakeoff_{model_a_slug}_vs_{model_b_slug}_{ts}.json"

    save_data = {
        "summary": {
            args.models[0]: summary_a,
            args.models[1]: summary_b,
        },
        "bakeoff_data": bakeoff_data,
    }

    with open(out_path, "w", encoding="utf-8") as f:
        # default=str: anything json cannot encode is written as its str().
        json.dump(save_data, f, indent=2, default=str)
    print(f"\nResults saved to {out_path}")


if __name__ == "__main__":
    main()
|