Files
Mortdecai f5118505b1 0.5.0 bake-off results, knowledge lookup tools, training progress chart
Bake-off (0.5.0 vs 0.4.0):
- Overall: 46.8% vs 45.2% (+1.6%), 0 errors vs 2
- Enchantments: +47% (20% → 67%)
- EssentialsX: +60% (0% → 60%)
- Effects: +25% (0% → 25%)
- Regressions: fill_build -67%, world -20%

Knowledge Lookup Tools (4 new):
- plugin.docs_lookup: WorldGuard, WorldEdit, CoreProtect, EssentialsX, LuckPerms docs
- minecraft.changelog_lookup: version history from Minecraft Wiki
- paper.docs_lookup: Paper server-specific documentation
- Wired into gateway model-driven tool loop and exploration self-play

Exploration Self-Play:
- General (vanilla MC) and plugins focus modes
- Wiki-grounded: model researches before acting, validates through RCON
- 2,243 exploration examples generated, 150 kept after quality filtering

Training Progress Chart:
- SVG chart showing training examples and inverse loss across versions
- Added to MODEL_CARD.md for Gitea display

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 15:28:09 -04:00

298 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Bake-off — compare model versions on standard test prompts with RCON validation.
Runs the same prompts through multiple models, executes via RCON, and scores
success rate, response quality, and speed.
Usage:
python3 bakeoff.py --models mortdecai:0.4.0,mortdecai:0.5.0 \
--ollama-url http://localhost:11434 --rcon-host 192.168.0.244
"""
import argparse
import json
import random
import re
import sys
import time
from pathlib import Path
# Three `.parent` hops from this file's resolved path, i.e. two directories
# above the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# Must run before the project-local import below: puts PROJECT_ROOT first on
# sys.path (presumably so the `agent` package resolves when this file is
# executed directly as a script — confirm against the repository layout).
sys.path.insert(0, str(PROJECT_ROOT))
import requests
from agent.tools.persistent_rcon import get_rcon
# Directory that main() creates on demand and writes the JSON report into.
OUTPUT_DIR = PROJECT_ROOT / "training" / "bakeoff_results"
# Standard test prompts, grouped by category.
# Keys are the category names used in the per-category report; insertion order
# is the order in which categories are run and printed.
TEST_PROMPTS = {
    # --- vanilla item giving ---
    "basic_give": [
        "sudo give me a diamond sword",
        "sudo give me 64 golden apples",
        "sudo give me full netherite armor",
        "sudo give me a stack of oak logs",
    ],
    # --- enchanted items ---
    "enchantments": [
        "sudo give me a sword with sharpness 5 and mending",
        "sudo give me a bow with power 5 and infinity",
        "sudo give me boots with feather falling 4 and depth strider 3",
        "sudo give me a trident with loyalty 3 and channeling",
    ],
    # --- status effects ---
    "effects": [
        "sudo give me speed 2 for 5 minutes",
        "sudo make me invisible for 60 seconds",
        "sudo give me night vision forever",
        "sudo give everyone resistance 3",
    ],
    # --- world state and entities ---
    "world": [
        "sudo set time to day",
        "sudo clear the weather",
        "sudo kill all zombies",
        "sudo summon 3 cows near me",
    ],
    # --- teleportation ---
    "teleport": [
        "sudo tp me to 0 100 0",
        "sudo tp me 50 blocks up",
    ],
    # --- block placement ---
    "fill_build": [
        "sudo fill a 5x5 gold platform under me",
        "sudo place a beacon at 0 64 0",
    ],
    # --- newer or multi-part requests ---
    "complex": [
        "sudo give me a mace with density 5 and wind burst 3",
        "sudo give me a decorated pot",
        "sudo spawn a warden 10 blocks away",
        "sudo create a team called red with red color",
    ],
    # --- plugin-specific requests ---
    "plugins_worldguard": [
        "sudo create a region called test-region",
        "sudo set pvp deny in the test-region",
        "sudo list all regions",
    ],
    "plugins_coreprotect": [
        "sudo check coreprotect status",
        "sudo lookup block changes in the last hour",
    ],
    "plugins_essentials": [
        "sudo set spawn here",
        "sudo create a warp called bakeoff-test",
        "sudo heal me",
    ],
    "plugins_luckperms": [
        "sudo create a group called testers",
        "sudo list all permission groups",
    ],
    # --- underspecified or ambiguous requests ---
    "error_prone": [
        "sudo give me a bed",
        "sudo give me cooked beef",
        "sudo effect give me speed",
        "sudo fill with stone 10",
    ],
}

# Player name inserted into every user message sent to the model.
PLAYER = "slingshooter08"
def query_model(prompt, model, ollama_url, timeout=60):
    """Send one prompt to the Ollama chat endpoint and parse the JSON reply.

    Args:
        prompt: Player request text; sent as ``"Player {PLAYER}: {prompt}"``.
        model: Model tag, passed through as the request's ``model`` field.
        ollama_url: Base URL; ``/api/chat`` is appended to it.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        A dict with keys:
            ``commands``  -- always a list (possibly empty),
            ``reasoning`` -- the reply's ``reasoning`` value or ``""``,
            ``elapsed``   -- wall-clock seconds, rounded to 2 places,
            ``error``     -- ``None`` on success, otherwise the exception
                             text truncated to 200 characters.
        This function never raises; every failure is reported via ``error``.
    """
    system = (
        "/no_think\n"
        "You are a Minecraft 1.21 command translator for a Paper server with plugins: "
        "FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n"
        "PERMISSION LEVEL: 4 (generous).\n"
        "Return JSON: {\"commands\": [...], \"reasoning\": \"...\"}"
    )
    start = time.time()
    try:
        r = requests.post(f"{ollama_url}/api/chat", json={
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": f"Player {PLAYER}: {prompt}"},
            ],
            "stream": False, "format": "json",
            "options": {"temperature": 0.2, "num_predict": 500},
        }, timeout=timeout)
        elapsed = time.time() - start
        # Report HTTP failures as such instead of a later KeyError on "message".
        r.raise_for_status()
        content = r.json()["message"]["content"]
        # Drop any <think>...</think> section the model emitted before its JSON.
        content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
        # Callers slice and len() this value, so it must be a real list:
        # a bare string becomes a single command, anything else (null,
        # number, object) is treated as "no commands".
        commands = parsed.get("commands", [])
        if isinstance(commands, str):
            commands = [commands]
        elif not isinstance(commands, list):
            commands = []
        return {
            "commands": commands,
            "reasoning": parsed.get("reasoning", ""),
            "elapsed": round(elapsed, 2),
            "error": None,
        }
    except Exception as e:
        # Deliberately broad: one failed request or malformed reply must be
        # recorded as an error for this prompt, not abort the whole run.
        return {
            "commands": [],
            "reasoning": "",
            "elapsed": round(time.time() - start, 2),
            "error": str(e)[:200],
        }
def validate_commands(commands, rcon):
    """Run model-proposed commands over RCON and record each outcome.

    Args:
        commands: List of command strings. A bare string is accepted and
            treated as a single command. Only the first 8 entries are
            considered; non-string and blank entries are skipped.
        rcon: Object exposing ``command(str)`` that returns the server's
            text reply.

    Returns:
        A list of ``{"cmd", "result", "ok"}`` dicts, one per command that was
        actually sent. ``result`` is the reply truncated to 200 characters
        (or the exception text), and ``ok`` is False when the reply contains
        one of the known error markers or the call raised.
    """
    # Without this, slicing a string would send it one character at a time.
    if isinstance(commands, str):
        commands = [commands]
    results = []
    for cmd in commands[:8]:
        if not isinstance(cmd, str) or not cmd.strip():
            continue
        cmd = cmd.strip().lstrip("/")
        if not cmd:
            # Input was only slashes; there is nothing to send.
            continue
        try:
            result = rcon.command(cmd)
            is_err = any(e in result for e in ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid"))
            results.append({"cmd": cmd, "result": result[:200], "ok": not is_err})
        except Exception as e:
            # Connection or protocol failures count as a failed command.
            results.append({"cmd": cmd, "result": str(e), "ok": False})
    return results
def run_bakeoff(models, ollama_url, rcon):
    """Send every prompt in TEST_PROMPTS to every model and tally the outcomes.

    For each (prompt, model) pair the model is queried, any returned commands
    are executed through ``validate_commands``, and a progress line is
    printed.

    Args:
        models: Model names; each gets its own stats entry and output column.
        ollama_url: Base URL forwarded to ``query_model``.
        rcon: RCON handle forwarded to ``validate_commands``.

    Returns:
        Dict mapping model name to a stats dict with the counters ``total``,
        ``cmd_success``, ``cmd_fail``, ``cmd_total``, ``no_commands``,
        ``errors``, ``total_time`` and a ``details`` list holding one record
        per prompt.
    """
    results = {}
    for name in models:
        results[name] = {
            "total": 0, "cmd_success": 0, "cmd_fail": 0, "cmd_total": 0,
            "no_commands": 0, "errors": 0, "total_time": 0, "details": [],
        }

    total_prompts = sum(map(len, TEST_PROMPTS.values()))
    print(f"Running {total_prompts} prompts x {len(models)} models = {total_prompts * len(models)} tests\n")

    for category, prompts in TEST_PROMPTS.items():
        print(f"── {category} ──")
        for prompt in prompts:
            print(f" {prompt[:65]}")
            for model in models:
                reply = query_model(prompt, model, ollama_url)
                stats = results[model]
                stats["total"] += 1
                stats["total_time"] += reply["elapsed"]

                # Only the "commands present" branch talks to the server.
                executed = []
                if reply["error"]:
                    stats["errors"] += 1
                    status = "ERR"
                elif not reply["commands"]:
                    stats["no_commands"] += 1
                    status = "EMPTY"
                else:
                    executed = validate_commands(reply["commands"], rcon)
                    passed = len([item for item in executed if item["ok"]])
                    failed = len(executed) - passed
                    stats["cmd_success"] += passed
                    stats["cmd_fail"] += failed
                    stats["cmd_total"] += passed + failed
                    if failed:
                        status = f"{passed}/{passed+failed}"
                    else:
                        status = f"{passed}"

                # Show only the tag after the last ":" to keep the line short.
                model_short = model.split(":")[-1]
                print(f" {model_short:8s} {status:8s} {reply['elapsed']:.1f}s {len(reply['commands'])} cmds")

                record = {
                    "category": category,
                    "prompt": prompt,
                    "commands": reply["commands"],
                    "rcon_results": executed,
                    "elapsed": reply["elapsed"],
                    "error": reply["error"],
                }
                stats["details"].append(record)
        print()
    return results
def print_summary(results, models):
    """Print the overall comparison table and a per-category success table.

    Args:
        results: Mapping of model name to the stats dict produced by
            ``run_bakeoff`` (counters plus a ``details`` list).
        models: Model names, in the order the columns should appear.

    Returns:
        None. All output goes to stdout.
    """
    # One right-aligned 12-character cell per model, labelled with the part
    # of the model name after the last ":"; shared by both table headers.
    model_columns = "".join(f" {m.split(':')[-1]:>12s}" for m in models)

    print("=" * 70)
    print("BAKE-OFF RESULTS")
    print("=" * 70)
    print(f"{'Metric':<30s}" + model_columns)
    print("-" * 70)
    metrics = [
        ("Prompts tested", lambda r: r["total"]),
        ("Commands generated", lambda r: r["cmd_total"]),
        ("Commands succeeded", lambda r: r["cmd_success"]),
        ("Commands failed", lambda r: r["cmd_fail"]),
        # max(..., 1) avoids division by zero when nothing was executed.
        ("Success rate", lambda r: f"{100*r['cmd_success']/max(r['cmd_total'],1):.1f}%"),
        ("Empty responses", lambda r: r["no_commands"]),
        ("Errors", lambda r: r["errors"]),
        ("Avg response time", lambda r: f"{r['total_time']/max(r['total'],1):.2f}s"),
        ("Total time", lambda r: f"{r['total_time']:.1f}s"),
    ]
    for label, fn in metrics:
        row = f"{label:<30s}"
        for m in models:
            val = fn(results[m])
            row += f" {str(val):>12s}"
        print(row)
    print("=" * 70)

    # Category breakdown
    print("\nCATEGORY BREAKDOWN (success rate):")
    print("-" * 70)
    print(f"{'Category':<25s}" + model_columns)
    for cat in TEST_PROMPTS:
        row = f"{cat:<25s}"
        for m in models:
            # Flatten every executed command's outcome for this category.
            outcomes = [
                rr["ok"]
                for d in results[m]["details"]
                if d["category"] == cat
                for rr in d["rcon_results"]
            ]
            cat_total = len(outcomes)
            cat_ok = sum(1 for ok in outcomes if ok)
            if cat_total > 0:
                # 11-wide number plus "%" fills the same 12-character cell
                # as the header and "N/A" entries, keeping columns aligned.
                row += f" {100*cat_ok/cat_total:>11.0f}%"
            else:
                row += f" {'N/A':>12s}"
        print(row)
    print()
def main():
    """Parse CLI options, run the bake-off, print the summary, save a report.

    The report is written to ``OUTPUT_DIR`` as
    ``bakeoff_<model>-vs-<model>_<unix time>.json`` and contains the model
    list, a UTC timestamp, the per-model counters, and the per-prompt details.
    """
    parser = argparse.ArgumentParser(description="Model bake-off")
    parser.add_argument("--models", default="mortdecai:0.4.0,mortdecai:0.5.0")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    args = parser.parse_args()

    # Drop blank entries so a trailing or doubled comma does not add an
    # empty model name to the run.
    models = [m.strip() for m in args.models.split(",") if m.strip()]
    if not models:
        parser.error("--models must name at least one model")

    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)
    print(f"Bake-off: {' vs '.join(models)}")
    print(f"Ollama: {args.ollama_url}")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
    print()
    results = run_bakeoff(models, args.ollama_url, rcon)
    print_summary(results, models)

    # Save results
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Model names may contain ":" or "/" (e.g. "user/model:tag"); collapse
    # anything that is not filename-safe so the report always lands directly
    # inside OUTPUT_DIR.
    name_tag = "-vs-".join(re.sub(r"[^A-Za-z0-9._-]+", "_", m) for m in models)
    out_path = OUTPUT_DIR / f"bakeoff_{name_tag}_{int(time.time())}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "models": models,
            # The trailing "Z" denotes UTC, so format gmtime(), not local time.
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "results": {m: {k: v for k, v in r.items() if k != "details"} for m, r in results.items()},
            "details": {m: r["details"] for m, r in results.items()},
        }, f, indent=2, default=str)
    print(f"Results saved to {out_path}")


if __name__ == "__main__":
    main()