f5118505b1
Bake-off (0.5.0 vs 0.4.0): - Overall: 46.8% vs 45.2% (+1.6%), 0 errors vs 2 - Enchantments: +47% (20% → 67%) - EssentialsX: +60% (0% → 60%) - Effects: +25% (0% → 25%) - Regressions: fill_build -67%, world -20% Knowledge Lookup Tools (4 new): - plugin.docs_lookup: WorldGuard, WorldEdit, CoreProtect, EssentialsX, LuckPerms docs - minecraft.changelog_lookup: version history from Minecraft Wiki - paper.docs_lookup: Paper server-specific documentation - Wired into gateway model-driven tool loop and exploration self-play Exploration Self-Play: - General (vanilla MC) and plugins focus modes - Wiki-grounded: model researches before acting, validates through RCON - 2,243 exploration examples generated, 150 kept after quality filtering Training Progress Chart: - SVG chart showing training examples and inverse loss across versions - Added to MODEL_CARD.md for Gitea display Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
298 lines
10 KiB
Python
298 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Bake-off — compare model versions on standard test prompts with RCON validation.
|
|
|
|
Runs the same prompts through multiple models, executes via RCON, and scores
|
|
success rate, response quality, and speed.
|
|
|
|
Usage:
|
|
python3 bakeoff.py --models mortdecai:0.4.0,mortdecai:0.5.0 \
|
|
--ollama-url http://localhost:11434 --rcon-host 192.168.0.244
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import random
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Directory three levels above this file. It is pushed to the front of
# sys.path before the project-local `agent` import further down; presumably
# that package lives under this root (confirm against the repository layout).
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
|
|
|
|
import requests
|
|
from agent.tools.persistent_rcon import get_rcon
|
|
|
|
# Directory that main() creates on demand and writes the JSON result file into.
OUTPUT_DIR = PROJECT_ROOT / "training" / "bakeoff_results"

# Standard test prompts across categories.
# Maps a category name to the list of player requests sent to every model.
# The category names are also the row labels of the per-category breakdown
# printed by print_summary().
TEST_PROMPTS = {
    "basic_give": [
        "sudo give me a diamond sword",
        "sudo give me 64 golden apples",
        "sudo give me full netherite armor",
        "sudo give me a stack of oak logs",
    ],
    "enchantments": [
        "sudo give me a sword with sharpness 5 and mending",
        "sudo give me a bow with power 5 and infinity",
        "sudo give me boots with feather falling 4 and depth strider 3",
        "sudo give me a trident with loyalty 3 and channeling",
    ],
    "effects": [
        "sudo give me speed 2 for 5 minutes",
        "sudo make me invisible for 60 seconds",
        "sudo give me night vision forever",
        "sudo give everyone resistance 3",
    ],
    "world": [
        "sudo set time to day",
        "sudo clear the weather",
        "sudo kill all zombies",
        "sudo summon 3 cows near me",
    ],
    "teleport": [
        "sudo tp me to 0 100 0",
        "sudo tp me 50 blocks up",
    ],
    "fill_build": [
        "sudo fill a 5x5 gold platform under me",
        "sudo place a beacon at 0 64 0",
    ],
    "complex": [
        "sudo give me a mace with density 5 and wind burst 3",
        "sudo give me a decorated pot",
        "sudo spawn a warden 10 blocks away",
        "sudo create a team called red with red color",
    ],
    "plugins_worldguard": [
        "sudo create a region called test-region",
        "sudo set pvp deny in the test-region",
        "sudo list all regions",
    ],
    "plugins_coreprotect": [
        "sudo check coreprotect status",
        "sudo lookup block changes in the last hour",
    ],
    "plugins_essentials": [
        "sudo set spawn here",
        "sudo create a warp called bakeoff-test",
        "sudo heal me",
    ],
    "plugins_luckperms": [
        "sudo create a group called testers",
        "sudo list all permission groups",
    ],
    "error_prone": [
        "sudo give me a bed",
        "sudo give me cooked beef",
        "sudo effect give me speed",
        "sudo fill with stone 10",
    ],
}

# Player name that query_model() prefixes to every prompt ("Player <name>: ...").
PLAYER = "slingshooter08"
|
|
|
|
|
|
def query_model(prompt, model, ollama_url, timeout=60):
    """Query a model via Ollama's chat endpoint and return its parsed reply.

    Args:
        prompt: Player request text; sent as ``"Player {PLAYER}: {prompt}"``.
        model: Model name placed in the request's ``model`` field.
        ollama_url: Base URL of the Ollama server; ``/api/chat`` is appended.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        A dict with the keys:

        * ``commands``  -- always a list (a bare string reply is wrapped, any
          other non-list value becomes ``[]``),
        * ``reasoning`` -- the reply's ``reasoning`` field or ``""``,
        * ``elapsed``   -- seconds spent on the request, rounded to 2 places,
        * ``error``     -- ``None`` on success, otherwise the failure text
          truncated to 200 characters.

        Request, HTTP and parse failures are reported through ``error``
        rather than raised, so one bad reply cannot abort a whole run.
    """
    system = (
        "/no_think\n"
        "You are a Minecraft 1.21 command translator for a Paper server with plugins: "
        "FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n"
        "PERMISSION LEVEL: 4 (generous).\n"
        "Return JSON: {\"commands\": [...], \"reasoning\": \"...\"}"
    )

    # perf_counter is monotonic, so the measured duration is not skewed by
    # wall-clock adjustments during the request.
    start = time.perf_counter()
    try:
        r = requests.post(f"{ollama_url}/api/chat", json={
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": f"Player {PLAYER}: {prompt}"},
            ],
            "stream": False, "format": "json",
            "options": {"temperature": 0.2, "num_predict": 500},
        }, timeout=timeout)
        elapsed = time.perf_counter() - start

        # Surface HTTP failures (e.g. unknown model) with their status and
        # body instead of a bare KeyError on the missing "message" field.
        if not r.ok:
            raise RuntimeError(f"HTTP {r.status_code} from Ollama: {r.text.strip()}")

        content = r.json()["message"]["content"]
        # Drop any <think>...</think> section emitted ahead of the JSON.
        content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            raise ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )

        # Callers slice and len() this value, so guarantee a list.
        commands = parsed.get("commands", [])
        if isinstance(commands, str):
            commands = [commands]
        elif not isinstance(commands, list):
            commands = []

        return {
            "commands": commands,
            "reasoning": parsed.get("reasoning", ""),
            "elapsed": round(elapsed, 2),
            "error": None,
        }
    except Exception as e:
        # Deliberate catch-all boundary: every failure mode becomes a scored
        # "error" result for this prompt rather than a crash.
        return {
            "commands": [],
            "reasoning": "",
            "elapsed": round(time.perf_counter() - start, 2),
            "error": str(e)[:200],
        }
|
|
|
|
|
|
def validate_commands(commands, rcon):
    """Execute commands through RCON and report per-command success.

    Args:
        commands: List (or tuple) of command strings. A single bare string is
            treated as one command; any other non-sequence value yields ``[]``.
            Only the first 8 entries are considered; entries that are not
            strings or are blank are skipped.
        rcon: Object exposing ``command(str)`` that returns the server's
            textual response.

    Returns:
        A list of dicts ``{"cmd", "result", "ok"}``. ``ok`` is ``False`` when
        the response contains one of the known error markers or when
        ``rcon.command`` raises (``result`` then holds the exception text).
        Successful responses are truncated to 200 characters.
    """
    if isinstance(commands, str):
        # Slicing a string would otherwise run each character as a command.
        commands = [commands]
    elif not isinstance(commands, (list, tuple)):
        return []

    error_markers = ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid")
    results = []
    for cmd in commands[:8]:
        if not isinstance(cmd, str) or not cmd.strip():
            continue
        cmd = cmd.strip()
        # Remove only the chat-style leading slash. Double-slash commands
        # (WorldEdit's "//set") must keep their second slash.
        if cmd.startswith("/"):
            cmd = cmd[1:]
        try:
            result = rcon.command(cmd)
            is_err = any(marker in result for marker in error_markers)
            results.append({"cmd": cmd, "result": result[:200], "ok": not is_err})
        except Exception as e:
            # A transport or server failure counts as a failed command.
            results.append({"cmd": cmd, "result": str(e), "ok": False})
    return results
|
|
|
|
|
|
def run_bakeoff(models, ollama_url, rcon):
    """Send every prompt in TEST_PROMPTS to every model and tally the outcome.

    Args:
        models: Iterable of model names to compare.
        ollama_url: Base URL forwarded to query_model().
        rcon: RCON client forwarded to validate_commands().

    Returns:
        Dict keyed by model name. Each value holds the running counters
        (``total``, ``cmd_success``, ``cmd_fail``, ``cmd_total``,
        ``no_commands``, ``errors``, ``total_time``) plus ``details``, a list
        with one record per prompt.
    """
    results = {}
    for name in models:
        results[name] = {
            "total": 0, "cmd_success": 0, "cmd_fail": 0, "cmd_total": 0,
            "no_commands": 0, "errors": 0, "total_time": 0, "details": [],
        }

    total_prompts = sum(map(len, TEST_PROMPTS.values()))
    print(f"Running {total_prompts} prompts x {len(models)} models = {total_prompts * len(models)} tests\n")

    for category, prompts in TEST_PROMPTS.items():
        print(f"── {category} ──")
        for prompt in prompts:
            print(f" {prompt[:65]}")
            for model in models:
                resp = query_model(prompt, model, ollama_url)
                stats = results[model]
                stats["total"] += 1
                stats["total_time"] += resp["elapsed"]

                # Only a reply that actually carries commands reaches RCON.
                rcon_results = []
                if resp["error"]:
                    stats["errors"] += 1
                    status = "ERR"
                elif not resp["commands"]:
                    stats["no_commands"] += 1
                    status = "EMPTY"
                else:
                    rcon_results = validate_commands(resp["commands"], rcon)
                    ok = len([rr for rr in rcon_results if rr["ok"]])
                    fail = len([rr for rr in rcon_results if not rr["ok"]])
                    stats["cmd_success"] += ok
                    stats["cmd_fail"] += fail
                    stats["cmd_total"] += ok + fail
                    if fail:
                        status = f"{ok}/{ok+fail}"
                    else:
                        status = f"{ok}✓"

                model_short = model.split(":")[-1]
                print(f" {model_short:8s} {status:8s} {resp['elapsed']:.1f}s {len(resp['commands'])} cmds")

                record = {
                    "category": category,
                    "prompt": prompt,
                    "commands": resp["commands"],
                    "rcon_results": rcon_results,
                    "elapsed": resp["elapsed"],
                    "error": resp["error"],
                }
                stats["details"].append(record)
        # Blank line between categories.
        print()

    return results
|
|
|
|
|
|
def print_summary(results, models):
    """Print the overall metric table and the per-category success rates.

    Args:
        results: Mapping of model name to the counters and ``details`` list
            produced by run_bakeoff().
        models: Model names, in the column order to print.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70
    # Columns are labelled with whatever follows the last ':' in the name.
    column_titles = "".join(f" {m.split(':')[-1]:>12s}" for m in models)

    print(heavy_rule)
    print("BAKE-OFF RESULTS")
    print(heavy_rule)

    print(f"{'Metric':<30s}" + column_titles)
    print(light_rule)

    def _success_rate(r):
        return f"{100*r['cmd_success']/max(r['cmd_total'],1):.1f}%"

    def _avg_time(r):
        return f"{r['total_time']/max(r['total'],1):.2f}s"

    def _total_time(r):
        return f"{r['total_time']:.1f}s"

    metrics = [
        ("Prompts tested", lambda r: r["total"]),
        ("Commands generated", lambda r: r["cmd_total"]),
        ("Commands succeeded", lambda r: r["cmd_success"]),
        ("Commands failed", lambda r: r["cmd_fail"]),
        ("Success rate", _success_rate),
        ("Empty responses", lambda r: r["no_commands"]),
        ("Errors", lambda r: r["errors"]),
        ("Avg response time", _avg_time),
        ("Total time", _total_time),
    ]

    for label, compute in metrics:
        cells = [f" {str(compute(results[m])):>12s}" for m in models]
        print(f"{label:<30s}" + "".join(cells))

    print(heavy_rule)

    # Category breakdown
    print("\nCATEGORY BREAKDOWN (success rate):")
    print(light_rule)
    print(f"{'Category':<25s}" + column_titles)

    for cat in list(TEST_PROMPTS.keys()):
        line = f"{cat:<25s}"
        for m in models:
            # Pool every RCON outcome recorded for this model and category.
            outcomes = [
                rr
                for d in results[m]["details"]
                if d["category"] == cat
                for rr in d["rcon_results"]
            ]
            cat_total = len(outcomes)
            cat_ok = len([rr for rr in outcomes if rr["ok"]])
            if cat_total > 0:
                line += f" {100*cat_ok/cat_total:>10.0f}%"
            else:
                line += f" {'N/A':>12s}"
        print(line)

    print()
|
|
|
|
|
|
def main():
    """Parse CLI options, run the bake-off, print the summary and save JSON.

    The result file is written to OUTPUT_DIR as
    ``bakeoff_<model>-vs-<model>_<unix time>.json``. It contains the model
    list, a UTC timestamp, the per-model counters under ``results`` and the
    per-prompt records under ``details``.
    """
    parser = argparse.ArgumentParser(description="Model bake-off")
    parser.add_argument("--models", default="mortdecai:0.4.0,mortdecai:0.5.0")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    args = parser.parse_args()

    # Ignore blank entries such as a trailing comma in --models.
    models = [m.strip() for m in args.models.split(",") if m.strip()]
    if not models:
        parser.error("--models must name at least one model")

    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)

    print(f"Bake-off: {' vs '.join(models)}")
    print(f"Ollama: {args.ollama_url}")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
    print()

    results = run_bakeoff(models, args.ollama_url, rcon)
    print_summary(results, models)

    # Save results
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Model names may contain ':' or '/' (registry-style names); neither is
    # safe inside a single file name, so map such characters to '_'.
    safe_names = [re.sub(r"[^A-Za-z0-9._-]", "_", m) for m in models]
    out_path = OUTPUT_DIR / f"bakeoff_{'-vs-'.join(safe_names)}_{int(time.time())}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "models": models,
            # The trailing 'Z' denotes UTC, so format from gmtime() rather
            # than from local time.
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "results": {m: {k: v for k, v in r.items() if k != "details"} for m, r in results.items()},
            "details": {m: r["details"] for m, r in results.items()},
        }, f, indent=2, default=str)
    print(f"Results saved to {out_path}")
|
|
|
|
|
|
# Run the bake-off only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|