0.5.0 bake-off results, knowledge lookup tools, training progress chart
Bake-off (0.5.0 vs 0.4.0):
- Overall: 46.8% vs 45.2% (+1.6%), 0 errors vs 2
- Enchantments: +47% (20% → 67%)
- EssentialsX: +60% (0% → 60%)
- Effects: +25% (0% → 25%)
- Regressions: fill_build -67%, world -20%

Knowledge Lookup Tools (4 new):
- plugin.docs_lookup: WorldGuard, WorldEdit, CoreProtect, EssentialsX, LuckPerms docs
- minecraft.changelog_lookup: version history from Minecraft Wiki
- paper.docs_lookup: Paper server-specific documentation
- Wired into gateway model-driven tool loop and exploration self-play

Exploration Self-Play:
- General (vanilla MC) and plugins focus modes
- Wiki-grounded: model researches before acting, validates through RCON
- 2,243 exploration examples generated, 150 kept after quality filtering

Training Progress Chart:
- SVG chart showing training examples and inverse loss across versions
- Added to MODEL_CARD.md for Gitea display

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Bake-off — compare model versions on standard test prompts with RCON validation.
|
||||
|
||||
Runs the same prompts through multiple models, executes via RCON, and scores
|
||||
success rate, response quality, and speed.
|
||||
|
||||
Usage:
|
||||
python3 bakeoff.py --models mortdecai:0.4.0,mortdecai:0.5.0 \
|
||||
--ollama-url http://localhost:11434 --rcon-host 192.168.0.244
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
import requests
|
||||
from agent.tools.persistent_rcon import get_rcon
|
||||
|
||||
OUTPUT_DIR = PROJECT_ROOT / "training" / "bakeoff_results"
|
||||
|
||||
# Standard test prompts across categories
|
||||
TEST_PROMPTS = {
|
||||
"basic_give": [
|
||||
"sudo give me a diamond sword",
|
||||
"sudo give me 64 golden apples",
|
||||
"sudo give me full netherite armor",
|
||||
"sudo give me a stack of oak logs",
|
||||
],
|
||||
"enchantments": [
|
||||
"sudo give me a sword with sharpness 5 and mending",
|
||||
"sudo give me a bow with power 5 and infinity",
|
||||
"sudo give me boots with feather falling 4 and depth strider 3",
|
||||
"sudo give me a trident with loyalty 3 and channeling",
|
||||
],
|
||||
"effects": [
|
||||
"sudo give me speed 2 for 5 minutes",
|
||||
"sudo make me invisible for 60 seconds",
|
||||
"sudo give me night vision forever",
|
||||
"sudo give everyone resistance 3",
|
||||
],
|
||||
"world": [
|
||||
"sudo set time to day",
|
||||
"sudo clear the weather",
|
||||
"sudo kill all zombies",
|
||||
"sudo summon 3 cows near me",
|
||||
],
|
||||
"teleport": [
|
||||
"sudo tp me to 0 100 0",
|
||||
"sudo tp me 50 blocks up",
|
||||
],
|
||||
"fill_build": [
|
||||
"sudo fill a 5x5 gold platform under me",
|
||||
"sudo place a beacon at 0 64 0",
|
||||
],
|
||||
"complex": [
|
||||
"sudo give me a mace with density 5 and wind burst 3",
|
||||
"sudo give me a decorated pot",
|
||||
"sudo spawn a warden 10 blocks away",
|
||||
"sudo create a team called red with red color",
|
||||
],
|
||||
"plugins_worldguard": [
|
||||
"sudo create a region called test-region",
|
||||
"sudo set pvp deny in the test-region",
|
||||
"sudo list all regions",
|
||||
],
|
||||
"plugins_coreprotect": [
|
||||
"sudo check coreprotect status",
|
||||
"sudo lookup block changes in the last hour",
|
||||
],
|
||||
"plugins_essentials": [
|
||||
"sudo set spawn here",
|
||||
"sudo create a warp called bakeoff-test",
|
||||
"sudo heal me",
|
||||
],
|
||||
"plugins_luckperms": [
|
||||
"sudo create a group called testers",
|
||||
"sudo list all permission groups",
|
||||
],
|
||||
"error_prone": [
|
||||
"sudo give me a bed",
|
||||
"sudo give me cooked beef",
|
||||
"sudo effect give me speed",
|
||||
"sudo fill with stone 10",
|
||||
],
|
||||
}
|
||||
|
||||
PLAYER = "slingshooter08"
|
||||
|
||||
|
||||
def query_model(prompt, model, ollama_url, timeout=60):
    """Query a model through Ollama's chat endpoint and return parsed output + timing.

    Args:
        prompt: Player request text; sent as ``Player {PLAYER}: {prompt}``.
        model: Model tag to query.
        ollama_url: Base URL of the Ollama server (``/api/chat`` is appended).
        timeout: Per-request timeout in seconds.

    Returns:
        Dict with ``commands`` (always a list), ``reasoning``, ``elapsed``
        (seconds, rounded to 2 places) and ``error`` (``None`` on success,
        otherwise a message truncated to 200 characters).  Every failure is
        reported through ``error`` rather than raised, so one bad response
        cannot abort a whole bake-off run.
    """
    system = (
        "/no_think\n"
        "You are a Minecraft 1.21 command translator for a Paper server with plugins: "
        "FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n"
        "PERMISSION LEVEL: 4 (generous).\n"
        "Return JSON: {\"commands\": [...], \"reasoning\": \"...\"}"
    )

    start = time.time()
    try:
        r = requests.post(f"{ollama_url}/api/chat", json={
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": f"Player {PLAYER}: {prompt}"},
            ],
            "stream": False, "format": "json",
            "options": {"temperature": 0.2, "num_predict": 500},
        }, timeout=timeout)
        elapsed = time.time() - start

        # Surface HTTP failures with the server's own message instead of a
        # confusing KeyError('message') from the lookup below.
        if not r.ok:
            raise RuntimeError(f"HTTP {r.status_code}: {r.text[:150]}")

        content = r.json()["message"]["content"]
        content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")

        # Normalise "commands" so callers can always slice/iterate a list; a
        # bare string would otherwise be walked character by character.
        commands = parsed.get("commands") or []
        if isinstance(commands, str):
            commands = [commands]
        elif not isinstance(commands, list):
            raise ValueError(f"'commands' must be a list, got {type(commands).__name__}")

        return {
            "commands": commands,
            "reasoning": parsed.get("reasoning", ""),
            "elapsed": round(elapsed, 2),
            "error": None,
        }
    except Exception as e:
        # Deliberately broad: network, HTTP, JSON and shape errors all count
        # as a failed prompt for scoring purposes.
        return {
            "commands": [],
            "reasoning": "",
            "elapsed": round(time.time() - start, 2),
            "error": str(e)[:200],
        }
|
||||
|
||||
|
||||
def validate_commands(commands, rcon):
    """Execute up to the first 8 commands over RCON and report each outcome.

    Args:
        commands: List of command strings.  A bare string is treated as a
            single command; any other non-sequence yields no results.
        rcon: Object exposing ``command(str) -> str``.

    Returns:
        List of ``{"cmd", "result", "ok"}`` dicts, one per executed command.
        Blank and non-string entries are skipped.  ``ok`` is False when the
        reply contains a known error marker or when the RCON call raises.
    """
    if isinstance(commands, str):
        # Slicing a string would execute its first 8 characters one by one.
        commands = [commands]
    elif not isinstance(commands, (list, tuple)):
        return []

    error_markers = ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid")
    results = []
    for cmd in commands[:8]:
        if not isinstance(cmd, str) or not cmd.strip():
            continue
        cmd = cmd.strip()
        # Drop a single leading slash only.
        # NOTE(review): presumably commands written as `//cmd` (e.g. WorldEdit)
        # must keep one slash when sent from console/RCON — confirm on the server.
        if cmd.startswith("/"):
            cmd = cmd[1:]
        try:
            result = rcon.command(cmd)
            is_err = any(marker in result for marker in error_markers)
            results.append({"cmd": cmd, "result": result[:200], "ok": not is_err})
        except Exception as e:
            # A transport failure counts as a failed command, not a crash.
            results.append({"cmd": cmd, "result": str(e), "ok": False})
    return results
|
||||
|
||||
|
||||
def run_bakeoff(models, ollama_url, rcon):
    """Send every prompt in TEST_PROMPTS to every model and tally the outcomes.

    Returns a dict keyed by model name.  Each value holds the running counters
    (prompts, command successes/failures, empty replies, errors, total time)
    plus a ``details`` list with one record per prompt.
    """
    scoreboard = {}
    for name in models:
        scoreboard[name] = {"total": 0, "cmd_success": 0, "cmd_fail": 0, "cmd_total": 0,
                            "no_commands": 0, "errors": 0, "total_time": 0, "details": []}

    prompt_count = sum(map(len, TEST_PROMPTS.values()))
    print(f"Running {prompt_count} prompts x {len(models)} models = {prompt_count * len(models)} tests\n")

    for category, prompts in TEST_PROMPTS.items():
        print(f"── {category} ──")
        for prompt in prompts:
            print(f" {prompt[:65]}")
            for model in models:
                reply = query_model(prompt, model, ollama_url)
                tally = scoreboard[model]
                tally["total"] += 1
                tally["total_time"] += reply["elapsed"]

                # Only replies that actually carry commands reach the server.
                rcon_results = []
                if reply["error"]:
                    tally["errors"] += 1
                    status = "ERR"
                elif not reply["commands"]:
                    tally["no_commands"] += 1
                    status = "EMPTY"
                else:
                    rcon_results = validate_commands(reply["commands"], rcon)
                    passed = len([outcome for outcome in rcon_results if outcome["ok"]])
                    failed = len(rcon_results) - passed
                    tally["cmd_success"] += passed
                    tally["cmd_fail"] += failed
                    tally["cmd_total"] += passed + failed
                    if failed:
                        status = f"{passed}/{passed + failed}"
                    else:
                        status = f"{passed}✓"

                tag = model.split(":")[-1]
                print(f" {tag:8s} {status:8s} {reply['elapsed']:.1f}s {len(reply['commands'])} cmds")

                tally["details"].append({
                    "category": category,
                    "prompt": prompt,
                    "commands": reply["commands"],
                    "rcon_results": rcon_results,
                    "elapsed": reply["elapsed"],
                    "error": reply["error"],
                })
        # Blank line between categories.
        print()

    return scoreboard
|
||||
|
||||
|
||||
def print_summary(results, models):
    """Print the headline metrics table and the per-category success rates.

    Args:
        results: Mapping of model name to the counters/details dict built by
            ``run_bakeoff``.
        models: Model names, in the column order to print.

    Every value cell is rendered as one space plus a 12-character
    right-aligned field, so percentage cells line up with the header and
    ``N/A`` cells.
    """
    rule = "=" * 70
    thin_rule = "-" * 70
    # Column titles use the tag after the last ':' (e.g. "0.5.0").
    model_cols = "".join(f" {m.split(':')[-1]:>12s}" for m in models)

    print(rule)
    print("BAKE-OFF RESULTS")
    print(rule)
    print(f"{'Metric':<30s}" + model_cols)
    print(thin_rule)

    metrics = [
        ("Prompts tested", lambda r: r["total"]),
        ("Commands generated", lambda r: r["cmd_total"]),
        ("Commands succeeded", lambda r: r["cmd_success"]),
        ("Commands failed", lambda r: r["cmd_fail"]),
        ("Success rate", lambda r: f"{100*r['cmd_success']/max(r['cmd_total'],1):.1f}%"),
        ("Empty responses", lambda r: r["no_commands"]),
        ("Errors", lambda r: r["errors"]),
        ("Avg response time", lambda r: f"{r['total_time']/max(r['total'],1):.2f}s"),
        ("Total time", lambda r: f"{r['total_time']:.1f}s"),
    ]

    for label, fn in metrics:
        cells = "".join(f" {str(fn(results[m])):>12s}" for m in models)
        print(f"{label:<30s}" + cells)

    print(rule)

    # Category breakdown
    print("\nCATEGORY BREAKDOWN (success rate):")
    print(thin_rule)
    print(f"{'Category':<25s}" + model_cols)

    for cat in TEST_PROMPTS:
        row = f"{cat:<25s}"
        for m in models:
            outcomes = [
                rr["ok"]
                for d in results[m]["details"] if d["category"] == cat
                for rr in d["rcon_results"]
            ]
            if outcomes:
                passed = sum(1 for ok in outcomes if ok)
                cell = f"{100 * passed / len(outcomes):.0f}%"
            else:
                # No command reached the server for this category.
                cell = "N/A"
            row += f" {cell:>12s}"
        print(row)

    print()
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run the bake-off, print the summary, save a JSON report.

    The report is written to ``OUTPUT_DIR`` as
    ``bakeoff_<models>_<unix-time>.json`` and contains the per-model counters
    under ``results`` and the per-prompt records under ``details``.
    """
    parser = argparse.ArgumentParser(description="Model bake-off")
    parser.add_argument("--models", default="mortdecai:0.4.0,mortdecai:0.5.0")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    # NOTE(review): this default looks like a redaction placeholder — pass
    # --rcon-pass explicitly.
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    args = parser.parse_args()

    # Drop blank entries so a trailing comma does not create a nameless model.
    models = [m.strip() for m in args.models.split(",") if m.strip()]
    if not models:
        parser.error("--models must name at least one model")
    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)

    print(f"Bake-off: {' vs '.join(models)}")
    print(f"Ollama: {args.ollama_url}")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
    print()

    results = run_bakeoff(models, args.ollama_url, rcon)
    print_summary(results, models)

    # Save results
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # ':' and '/' can appear in model tags but are unsafe in a file name.
    safe_names = "-vs-".join(m.replace(":", "_").replace("/", "_") for m in models)
    out_path = OUTPUT_DIR / f"bakeoff_{safe_names}_{int(time.time())}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "models": models,
            # The trailing 'Z' denotes UTC, so format from gmtime, not local time.
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "results": {m: {k: v for k, v in r.items() if k != "details"} for m, r in results.items()},
            "details": {m: r["details"] for m, r in results.items()},
        }, f, indent=2, default=str)
    print(f"Results saved to {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,411 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Exploration Self-Play — model uses wiki_lookup to explore Minecraft knowledge,
|
||||
then validates its understanding through RCON commands.
|
||||
|
||||
Unlike canned self-play, the model drives its own curiosity:
|
||||
1. Gets a broad topic ("explore enchantments", "learn about 1.21 items")
|
||||
2. Uses minecraft.wiki_lookup to research
|
||||
3. Generates commands based on what it learned
|
||||
4. RCON validates correctness
|
||||
5. If wrong, researches more and corrects
|
||||
|
||||
Produces gold-standard knowledge-grounded training data.
|
||||
|
||||
Usage:
|
||||
python3 exploration_self_play.py --ollama-url http://localhost:11434 \
|
||||
--model mortdecai:0.5.0 --rcon-host 192.168.0.244 --rcon-port 25578
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
import requests
|
||||
from agent.tools.persistent_rcon import get_rcon
|
||||
|
||||
OUTPUT_DIR = PROJECT_ROOT / "data" / "raw" / "exploration_selfplay"
|
||||
|
||||
PLAYERS = ["slingshooter08", "Ace13245", "TheBigBoss", "xXDragonSlayerXx"]
|
||||
|
||||
# Topics for the model to explore — broad enough that it needs to look things up
|
||||
EXPLORATION_TOPICS_PLUGINS = [
|
||||
# WorldGuard deep dive
|
||||
"Research all WorldGuard region flags. Create a region and test each flag one at a time for {p}.",
|
||||
"Look up how WorldGuard region priorities work. Create overlapping regions with different rules.",
|
||||
"Research WorldGuard's __global__ region. What flags can you set globally? Test a few.",
|
||||
"Look up WorldGuard entry/exit deny flags. Create a VIP-only zone and test it.",
|
||||
"Research how to make a WorldGuard region that heals players. Set it up near {p}.",
|
||||
"What WorldGuard flags control explosions? Research and create a blast-proof zone.",
|
||||
"Look up how to block specific commands in a WorldGuard region. Test with /home.",
|
||||
"Research WorldGuard greeting and farewell messages. Set up regions with welcome messages.",
|
||||
|
||||
# CoreProtect deep dive
|
||||
"Research all CoreProtect action types (block, container, chat, command). Test /co lookup with each.",
|
||||
"Look up CoreProtect time format syntax. Test rollbacks with different time ranges (1h, 30m, 7d).",
|
||||
"Research how CoreProtect handles container logging. Place a chest, add items, then lookup the history.",
|
||||
"What CoreProtect parameters filter by block type? Test rolling back only specific blocks.",
|
||||
"Look up how to use CoreProtect radius parameter. Test different radius values.",
|
||||
"Research CoreProtect restore vs rollback — what's the difference? Demonstrate both.",
|
||||
|
||||
# EssentialsX deep dive
|
||||
"Research all EssentialsX economy commands. Set up a working economy with /eco, /balance, /pay.",
|
||||
"Look up EssentialsX kit creation syntax. Create a starter kit and a VIP kit.",
|
||||
"Research EssentialsX warp system. Create 5 warps at interesting locations.",
|
||||
"What EssentialsX commands exist for player management? Test /nick, /seen, /whois.",
|
||||
"Look up EssentialsX home system. Set multiple named homes for {p}.",
|
||||
"Research EssentialsX god mode, fly mode, and speed commands. Test all three.",
|
||||
"What EssentialsX commands modify the world? Test /sun, /storm, /day, /night.",
|
||||
|
||||
# LuckPerms deep dive
|
||||
"Research LuckPerms group inheritance. Create parent and child groups and test permission flow.",
|
||||
"Look up LuckPerms temporary permissions. Give {p} temp fly access for 5 minutes.",
|
||||
"Research LuckPerms meta (prefix/suffix). Set up colored chat prefixes for different groups.",
|
||||
"What LuckPerms commands check a user's permissions? Audit {p}'s current permissions.",
|
||||
"Look up how to create a LuckPerms permission ladder (default -> member -> vip -> admin).",
|
||||
"Research LuckPerms weight system. How do group priorities work?",
|
||||
|
||||
# FAWE/WorldEdit deep dive
|
||||
"Research all WorldEdit shape commands (sphere, cyl, pyramid). Build one of each near {p}.",
|
||||
"Look up WorldEdit brush types. What brushes exist beyond sphere brush?",
|
||||
"Research WorldEdit mask syntax. How do masks work with //replace?",
|
||||
"What WorldEdit clipboard operations exist? Test //copy, //paste, //rotate, //flip.",
|
||||
"Look up WorldEdit pattern syntax. Can you mix multiple blocks in one command?",
|
||||
"Research WorldEdit //generate command. Can it make mathematical surfaces?",
|
||||
"What WorldEdit selection modes exist? Test //sel cuboid vs poly vs sphere.",
|
||||
|
||||
# Script writing exploration
|
||||
"Research Minecraft datapack function syntax. Write a mcfunction that creates a parkour course.",
|
||||
"Look up how Minecraft tick functions work. Write one that makes particles at spawn.",
|
||||
"Research how to chain mcfunctions together. Write a main function that calls sub-functions.",
|
||||
"What Minecraft datapack tags control function scheduling? Test tick.json and load.json.",
|
||||
"Look up execute command syntax for mcfunctions. Write a script using execute at/as/if.",
|
||||
"Research scoreboard objectives. Write a script that tracks player kills and announces leaders.",
|
||||
|
||||
# Multi-plugin combos
|
||||
"Research how to combine WorldEdit builds with WorldGuard protection. Build and protect an arena.",
|
||||
"Look up how to use CoreProtect to undo WorldEdit operations specifically.",
|
||||
"Research combining LuckPerms with WorldGuard — can you tie region access to permission groups?",
|
||||
"Create a complete server setup: spawn area (WE), protected (WG), with warps (Ess) and perms (LP).",
|
||||
"Research how to build a minigame arena: WE for building, WG for rules, scoreboards for tracking.",
|
||||
]
|
||||
|
||||
EXPLORATION_TOPICS = [
|
||||
# Items and crafting
|
||||
"What are all the new items added in 1.21? Look them up and give one of each to {p}.",
|
||||
"Research every type of arrow (tipped arrows) and give {p} one of each.",
|
||||
"Look up all the banner patterns available and create a cool banner for {p}.",
|
||||
"What suspicious stew effects exist? Research and give {p} the best one.",
|
||||
"Research all the different types of potions and give {p} the three most useful ones.",
|
||||
"What are all the different horse armor types? Look them up and give one of each to {p}.",
|
||||
"Research all smithing templates and give {p} the rarest ones.",
|
||||
"Look up every type of spawn egg and give {p} five interesting ones.",
|
||||
|
||||
# Enchantments
|
||||
"Research the best enchantment setup for a full netherite armor set. Give it to {p}.",
|
||||
"What enchantments are exclusive to each other? Look them up and explain while giving {p} examples.",
|
||||
"Research the difference between Protection, Fire Protection, Blast Protection, and Projectile Protection. Which is best for general use? Give {p} the optimal set.",
|
||||
"Look up what Thorns does exactly — is it worth using? Give {p} armor with and without it to test.",
|
||||
"Research Sweeping Edge — does it still exist in 1.21? Give {p} a sword with the correct enchantments.",
|
||||
"What's the maximum level for each enchantment? Research and give {p} a tool with impossible levels vs correct levels.",
|
||||
|
||||
# Effects and potions
|
||||
"Research all status effects in 1.21. Which ones are new? Apply the 3 newest ones to {p}.",
|
||||
"Look up the Ominous Bottle effect — what does it do? Give one to {p}.",
|
||||
"What's the difference between Strength and Haste? Research and apply the right one for mining.",
|
||||
"Research what Wind Charged does. Apply it to {p}.",
|
||||
"Look up all negative effects and their max safe durations. Apply a brief demonstration.",
|
||||
"What effect does a Beacon give? Research all beacon effects and apply them.",
|
||||
|
||||
# Mobs and entities
|
||||
"Research all tameable mobs in 1.21. Summon one of each near {p}.",
|
||||
"What mobs were added or changed in 1.21? Look them up and summon the new ones.",
|
||||
"Research the Breeze mob — what does it drop? Summon one for {p}.",
|
||||
"Look up all rideable mobs and summon one for {p} with a saddle.",
|
||||
"What's the strongest mob in the game? Research its stats and summon it (carefully).",
|
||||
"Research all fish types and summon them in water near {p}.",
|
||||
|
||||
# Blocks and building
|
||||
"Research all copper block variants and their oxidation states. Place examples near {p}.",
|
||||
"What blocks emit light? Look up all light-emitting blocks and demonstrate.",
|
||||
"Research all types of stairs, slabs, and walls available in 1.21.",
|
||||
"Look up how to make colored concrete powder and place a rainbow near {p}.",
|
||||
"What are all the glazed terracotta patterns? Research and place one of each.",
|
||||
"Research redstone components — what's the difference between a comparator and repeater?",
|
||||
|
||||
# Commands and mechanics
|
||||
"Research the /place command. What can it place? Demonstrate with a structure.",
|
||||
"Look up the /damage command syntax and demonstrate different damage types on a mob.",
|
||||
"Research /attribute — what attributes can be modified? Give {p} double health.",
|
||||
"What does the /ride command do? Research and demonstrate.",
|
||||
"Look up /fillbiome — can you change the biome? Try it near {p}.",
|
||||
"Research the /random command added in 1.21. What can it do?",
|
||||
|
||||
# Worldgen and structures
|
||||
"Research all structure types that /locate can find. Find the 3 nearest to {p}.",
|
||||
"What biomes exist in 1.21? Look up any new ones and locate them.",
|
||||
"Research Trial Chambers — where do they spawn? Locate one for {p}.",
|
||||
|
||||
# Plugin-specific research
|
||||
"Research WorldGuard region flags — what flags exist? Set up a demo region with interesting flags.",
|
||||
"Look up CoreProtect rollback syntax — what parameters does it accept?",
|
||||
"Research LuckPerms group inheritance — how do child groups work?",
|
||||
"What WorldEdit brushes are available? Research and describe them.",
|
||||
"Look up EssentialsX economy commands — set up a basic economy demonstration.",
|
||||
]
|
||||
|
||||
|
||||
def wiki_lookup(query, timeout=15):
    """Look up *query* on minecraft.wiki through its ``api.php`` endpoint.

    Performs an ``opensearch`` request, then fetches the plain-text intro
    extract of the top-ranked title.

    Args:
        query: Search text.  Blank queries return a "no results" reply
            without any network request.
        timeout: Per-request timeout in seconds.

    Returns:
        ``{"content", "url", "ok"}``.  On success ``content`` is the extract
        truncated to 1500 characters.  Any failure (network, HTTP status,
        unexpected payload) is reported with ``ok`` False instead of raising.
    """
    from urllib.parse import quote

    no_results = {"content": f"No wiki results for: {query}", "url": "", "ok": False}
    if not isinstance(query, str) or not query.strip():
        return no_results

    api_url = "https://minecraft.wiki/api.php"
    try:
        search = requests.get(
            api_url,
            params={"action": "opensearch", "search": query, "limit": 3, "format": "json"},
            timeout=timeout,
        )
        search.raise_for_status()
        results = search.json()

        # The reply is indexed positionally: [1] holds titles, [3] holds URLs.
        if isinstance(results, list) and len(results) >= 4 and results[1]:
            title = results[1][0]
            urls = results[3]

            # Fetch first result summary
            summary = requests.get(
                api_url,
                params={
                    "action": "query", "prop": "extracts",
                    "exintro": True, "explaintext": True,
                    "titles": title, "format": "json",
                },
                timeout=timeout,
            )
            summary.raise_for_status()
            pages = summary.json().get("query", {}).get("pages", {})
            for page in pages.values():
                extract = page.get("extract", "")
                if extract:
                    if urls:
                        url = urls[0]
                    else:
                        # Percent-encode the title so the fallback link is a valid URL.
                        url = f"https://minecraft.wiki/w/{quote(title.replace(' ', '_'))}"
                    return {"content": extract[:1500], "url": url, "ok": True}
        return no_results
    except Exception as e:
        # Best-effort lookup: the caller feeds this text back to the model.
        return {"content": f"Wiki lookup failed: {e}", "url": "", "ok": False}
|
||||
|
||||
|
||||
def run_exploration(topic, player, ollama_url, model, rcon):
    """Run one exploration round — the model researches a topic and acts on it.

    The model is prompted with the topic and may answer with
    ``<tool_call>`` blocks (wiki/docs lookups or ``rcon.execute``).  Each call
    is executed and its result fed back, for at most 10 model turns.  A reply
    without tool calls is treated as the final answer.

    Args:
        topic: Topic template; ``{p}`` is replaced with *player*.
        player: Player name used in the prompt.
        ollama_url: Base URL of the Ollama server.
        model: Model tag to query.
        rcon: Object exposing ``command(str) -> str``.

    Returns:
        A training-example dict with ``id``, ``source``, ``type``, ``input``,
        ``output`` (commands, reasoning, wiki_topics), ``tool_trace``,
        ``messages`` and ``metadata``.  LLM request failures end the round
        early; they do not raise.
    """
    system = (
        "/no_think\n"
        "You are a Minecraft 1.21 expert on a Paper server with plugins: "
        "WorldGuard, CoreProtect, EssentialsX, LuckPerms, FastAsyncWorldEdit.\n\n"
        "You have these lookup tools:\n"
        "- minecraft.wiki_lookup: {\"query\": \"...\"} — Minecraft Wiki for items, mobs, commands\n"
        "- plugin.docs_lookup: {\"plugin\": \"worldguard|worldedit|coreprotect|essentialsx|luckperms\", \"query\": \"...\"} — plugin documentation\n"
        "- minecraft.changelog_lookup: {\"query\": \"...\", \"version\": \"1.21\"} — version changes\n"
        "- paper.docs_lookup: {\"query\": \"...\"} — Paper server docs\n"
        "- rcon.execute: {\"command\": \"...\"} — execute a Minecraft command\n\n"
        "WORKFLOW:\n"
        "1. Research the topic using the appropriate lookup tool\n"
        "2. For plugin commands, use plugin.docs_lookup instead of minecraft.wiki_lookup\n"
        "3. Generate and execute commands via rcon.execute\n"
        "4. If a command fails, look up the correct syntax and try again\n\n"
        "To call a tool, respond with:\n"
        "<tool_call>\n{\"name\": \"tool_name\", \"arguments\": {...}}\n</tool_call>\n\n"
        "When done, respond with final JSON:\n"
        "{\"commands\": [...], \"reasoning\": \"what you learned\", \"wiki_topics\": [\"topics you looked up\"]}\n\n"
        "Be curious. ALWAYS look things up before guessing. Verify your knowledge."
    )

    topic_resolved = topic.replace("{p}", player)
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": f"Player {player}: {topic_resolved}"},
    ]

    tool_trace = []
    all_commands = []
    wiki_topics = []
    max_steps = 10
    # Bound up front: if the very first request fails, the loop breaks before
    # assigning `raw`, and the final-answer parsing below would otherwise hit
    # an unbound local.
    raw = ""
    step = 0
    # Same marker set the bake-off uses to classify RCON replies.
    rcon_error_markers = ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid")

    for step in range(max_steps):
        try:
            r = requests.post(f"{ollama_url}/api/chat", json={
                "model": model,
                "messages": messages,
                "stream": False,
                "options": {"temperature": 0.6, "num_predict": 800},
            }, timeout=120)
            raw = r.json()["message"]["content"] or ""
        except Exception as e:
            print(f" LLM error: {e}")
            # Do not let a previous turn's tool-call text pose as the final answer.
            raw = ""
            break

        raw = re.sub(r'<think>[\s\S]*?</think>\s*', '', raw)

        # Check for tool calls
        tool_matches = re.findall(r'<tool_call>\s*(\{.*?\})\s*</tool_call>', raw, re.DOTALL)

        if not tool_matches:
            # Final response — done exploring
            break

        for tc_json in tool_matches:
            try:
                tc = json.loads(tc_json)
            except json.JSONDecodeError:
                continue
            tool_name = tc.get("name", "")
            tool_args = tc.get("arguments", {})
            if not isinstance(tool_args, dict):
                # The model sometimes emits a non-object here; treat as no arguments.
                tool_args = {}

            if tool_name == "minecraft.wiki_lookup":
                query = tool_args.get("query", "")
                wiki_topics.append(query)
                result = wiki_lookup(query)
                print(f" wiki: {query[:60]} -> {len(result.get('content',''))} chars")
            elif tool_name in ("plugin.docs_lookup", "minecraft.changelog_lookup", "paper.docs_lookup"):
                try:
                    # Imported lazily so the script still runs without this module.
                    from agent.tools.knowledge_lookup import handle_knowledge_tool
                    result = handle_knowledge_tool(tool_name, tool_args)
                except ImportError:
                    result = wiki_lookup(tool_args.get("query", tool_args.get("plugin", "")))
                query = tool_args.get("query", "")
                wiki_topics.append(f"{tool_name}:{query}")
                print(f" {tool_name}: {query[:50]} -> {len(result.get('content',''))} chars")
            elif tool_name == "rcon.execute":
                cmd = tool_args.get("command", "")
                try:
                    rcon_result = rcon.command(cmd)
                    is_err = any(marker in rcon_result for marker in rcon_error_markers)
                    result = {"success": not is_err, "result": rcon_result[:300]}
                    all_commands.append(cmd)
                    status = "OK" if not is_err else "ERR"
                    print(f" rcon: {cmd[:60]} -> {status}")
                except Exception as e:
                    result = {"success": False, "result": str(e)}
                    print(f" rcon: {cmd[:60]} -> FAIL")
            else:
                result = {"ok": False, "error": f"unknown tool: {tool_name}"}

            tool_trace.append({
                "tool": tool_name,
                "input": str(tool_args)[:200],
                "ok": result.get("ok", result.get("success", False)),
                "step": step,
            })

            messages.append({"role": "assistant", "content": f"<tool_call>\n{json.dumps(tc)}\n</tool_call>"})
            messages.append({"role": "tool", "content": json.dumps(result)[:3000]})

        time.sleep(0.1)

    # Parse final response if present
    reasoning = ""
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        reasoning = raw[:200]
    else:
        if isinstance(parsed, dict):
            reasoning = parsed.get("reasoning", "")
            final_commands = parsed.get("commands")
            if isinstance(final_commands, list):
                all_commands.extend(final_commands)
            elif isinstance(final_commands, str) and final_commands:
                # extend() on a string would add it one character at a time.
                all_commands.append(final_commands)
        else:
            # Valid JSON but not the expected object; keep the text as-is.
            reasoning = raw[:200]

    rcon_calls = [t for t in tool_trace if t["tool"] == "rcon.execute"]
    return {
        "id": f"explore-{int(time.time())}-{random.randint(0,9999):04d}",
        "source": "exploration_self_play",
        "type": "exploration",
        "input": {"user_message": topic_resolved, "player": player},
        "output": {
            "commands": all_commands,
            "reasoning": reasoning,
            "wiki_topics": wiki_topics,
        },
        "tool_trace": tool_trace,
        "messages": messages,
        "metadata": {
            "model": model,
            "steps": min(step + 1, max_steps),
            "wiki_lookups": len(wiki_topics),
            "rcon_commands": len(all_commands),
            "success_rate": (
                sum(1 for t in rcon_calls if t["ok"]) / max(len(rcon_calls), 1)
            ),
        },
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run exploration rounds and append each one to a JSONL file.

    Each round picks a random topic and player, runs ``run_exploration`` and
    appends the resulting example to
    ``OUTPUT_DIR/exploration_<focus>_<unix-time>.jsonl``.  Ctrl-C stops the
    loop cleanly; every completed round is already on disk at that point.
    """
    parser = argparse.ArgumentParser(description="Exploration self-play")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--model", default="mortdecai:0.5.0")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    parser.add_argument("--rounds", type=int, default=999999)
    parser.add_argument("--focus", default="general", choices=["general", "plugins", "all"],
                        help="Topic focus: general (vanilla MC), plugins (WG/CP/Ess/LP/FAWE/scripts), all (both)")
    args = parser.parse_args()

    if args.focus == "plugins":
        topics = EXPLORATION_TOPICS_PLUGINS
    elif args.focus == "all":
        topics = EXPLORATION_TOPICS + EXPLORATION_TOPICS_PLUGINS
    else:
        topics = EXPLORATION_TOPICS

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / f"exploration_{args.focus}_{int(time.time())}.jsonl"

    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)

    print(f"Exploration Self-Play")
    print(f" Model: {args.model} on {args.ollama_url}")
    print(f" RCON: {args.rcon_host}:{args.rcon_port}")
    print(f" Focus: {args.focus} ({len(topics)} topics)")
    print(f" Output: {output_path}")
    print()

    stats = {"total": 0, "wiki_lookups": 0, "rcon_commands": 0,
             "rcon_executed": 0, "rcon_success": 0}

    try:
        for round_num in range(args.rounds):
            topic = random.choice(topics)
            player = random.choice(PLAYERS)

            print(f"\n── Round {round_num+1} ──")
            # Substitute first, then truncate, so a cut can never split "{p}".
            print(f" Topic: {topic.replace('{p}', player)[:80]}")

            example = run_exploration(topic, player, args.ollama_url, args.model, rcon)

            # Count successes from the trace of commands that were actually
            # sent; "rcon_commands" also includes commands listed in the
            # final answer that never ran.
            executed = [t for t in example["tool_trace"] if t["tool"] == "rcon.execute"]
            stats["total"] += 1
            stats["wiki_lookups"] += example["metadata"]["wiki_lookups"]
            stats["rcon_commands"] += example["metadata"]["rcon_commands"]
            stats["rcon_executed"] += len(executed)
            stats["rcon_success"] += sum(1 for t in executed if t["ok"])

            print(f" Result: {example['metadata']['wiki_lookups']} lookups, "
                  f"{example['metadata']['rcon_commands']} commands, "
                  f"{example['metadata']['success_rate']:.0%} success")

            # ensure_ascii=False emits raw non-ASCII text, so pin the encoding
            # instead of relying on the platform default.
            with open(output_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(example, ensure_ascii=False) + "\n")

            if (round_num + 1) % 10 == 0:
                rate = stats["rcon_success"] / max(stats["rcon_executed"], 1) * 100
                print(f"\n Progress: {stats['total']} explorations, "
                      f"{stats['wiki_lookups']} wiki lookups, "
                      f"{stats['rcon_commands']} commands ({rate:.0f}% success)")

            time.sleep(0.5)
    except KeyboardInterrupt:
        # The default --rounds is effectively unbounded; Ctrl-C is the normal exit.
        print("\nInterrupted by user")

    print(f"\nExploration complete: {stats['total']} topics explored")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Filter exploration and self-play data for quality.
|
||||
|
||||
Keeps:
|
||||
- Successful interactions (at least one RCON command and a success rate above 0.7, or `all_success` true when no rate is recorded)
|
||||
- First instance of each unique error pattern (for error correction training)
|
||||
- Wiki-grounded rounds (at least one lookup and at least one command), kept even if some commands failed
|
||||
|
||||
Removes:
|
||||
- Duplicate topics (keeps first occurrence only)
|
||||
- Empty responses (no commands, no lookups)
|
||||
- Repeated failures on the same command pattern
|
||||
- Rounds where model ignored wiki results
|
||||
|
||||
Output: data/processed/filtered_exploration.jsonl
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# Directory three `.parent` hops above this script; every data path below is
# built relative to it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

# Input directories: every *.jsonl file found directly inside them is loaded.
EXPLORATION_DIR = PROJECT_ROOT / "data" / "raw" / "exploration_selfplay"
TOOL_SELFPLAY_DIR = PROJECT_ROOT / "data" / "raw" / "tool_selfplay"
# Single JSONL file that receives the filtered examples (overwritten on each run).
OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "filtered_exploration.jsonl"
|
||||
|
||||
|
||||
def _read_jsonl(path):
    """Parse one JSONL file and return ``(records, skipped_line_count)``."""
    records = []
    skipped = 0
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue  # blank lines are not records
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Best-effort load: one corrupt line must not discard the file.
                skipped += 1
    return records, skipped


def load_all_examples(source_dirs=None):
    """Load all exploration and tool self-play examples.

    Args:
        source_dirs: Optional iterable of directories to scan. When omitted,
            ``EXPLORATION_DIR`` and ``TOOL_SELFPLAY_DIR`` are scanned, in that
            order.

    Returns:
        A list with one parsed JSON value per non-blank, well-formed line.
        Directories are visited in the given order and the ``*.jsonl`` files
        inside each one in sorted name order. Malformed lines are skipped and
        reported on stderr instead of being dropped silently.
    """
    if source_dirs is None:
        source_dirs = (EXPLORATION_DIR, TOOL_SELFPLAY_DIR)

    examples = []
    for directory in source_dirs:
        for jsonl in sorted(Path(directory).glob("*.jsonl")):
            records, skipped = _read_jsonl(jsonl)
            examples.extend(records)
            if skipped:
                print(
                    f"  Warning: skipped {skipped} malformed line(s) in {jsonl}",
                    file=sys.stderr,
                )
    return examples
|
||||
|
||||
|
||||
def filter_examples(examples):
    """Filter raw self-play examples for quality.

    Each example is expected to be a dict that may carry ``input``,
    ``metadata`` and ``output`` sub-dicts. Examples are processed in order:

    1. Examples with neither RCON commands nor wiki lookups are dropped.
    2. Examples whose topic (first 80 characters of ``input.user_message``)
       was already seen are dropped. Examples that carry no topic at all are
       never treated as duplicates of one another.
    3. The remainder is categorised:
       - success with at least one command: kept;
       - at least one lookup and one command: kept even if commands failed;
       - failure with commands: kept only for the first occurrence of its
         error pattern;
       - anything else (for example lookups but no commands): dropped and
         counted under ``dropped_empty``.

    Args:
        examples: Iterable of example dicts.

    Returns:
        ``(kept, stats)``: the retained examples in input order and a dict
        of counters describing why examples were kept or dropped.
    """
    examples = list(examples)
    kept = []
    seen_topics = set()
    seen_error_patterns = set()
    stats = {
        "total": len(examples),
        "kept_success": 0,
        "kept_error_correction": 0,
        "kept_wiki_grounded": 0,
        "dropped_duplicate": 0,
        "dropped_empty": 0,
        "dropped_repeat_failure": 0,
    }

    for ex in examples:
        meta = ex.get("metadata")
        if not isinstance(meta, dict):
            meta = {}
        inp = ex.get("input")
        if not isinstance(inp, dict):
            inp = {}

        user_message = inp.get("user_message")
        topic = user_message[:80] if isinstance(user_message, str) else ""
        success_rate = meta.get("success_rate", meta.get("all_success", False))
        wiki_lookups = meta.get("wiki_lookups", 0) or 0
        rcon_commands = meta.get("rcon_commands", 0) or 0

        # Skip empty
        if rcon_commands == 0 and wiki_lookups == 0:
            stats["dropped_empty"] += 1
            continue

        # Deduplicate topics (keep first). A missing topic is not a topic:
        # without this guard every topic-less record would collide on "".
        if topic:
            if topic in seen_topics:
                stats["dropped_duplicate"] += 1
                continue
            seen_topics.add(topic)

        # Categorize. bool is checked first because it is also an int.
        if isinstance(success_rate, bool):
            is_success = success_rate
        elif isinstance(success_rate, (int, float)):
            is_success = success_rate > 0.7
        else:
            # null / unexpected type: treat as not successful instead of raising
            is_success = False

        if is_success and rcon_commands > 0:
            # Successful interaction — always keep
            stats["kept_success"] += 1
            kept.append(ex)
        elif wiki_lookups > 0 and rcon_commands > 0:
            # Wiki-grounded (looked things up before acting) — keep even if some failures
            stats["kept_wiki_grounded"] += 1
            kept.append(ex)
        elif not is_success and rcon_commands > 0:
            # Failed — keep only first instance of each error pattern
            output = ex.get("output")
            commands = output.get("commands", []) if isinstance(output, dict) else []
            if commands:
                # Use first command as error pattern key
                pattern = commands[0][:40] if isinstance(commands[0], str) else ""
            else:
                rcon_results = meta.get("rcon_results", [])
                pattern = str(rcon_results[:1])[:60] if rcon_results else ""

            if pattern and pattern not in seen_error_patterns:
                seen_error_patterns.add(pattern)
                stats["kept_error_correction"] += 1
                kept.append(ex)
            else:
                stats["dropped_repeat_failure"] += 1
        else:
            # Reached when there were lookups but no commands.
            stats["dropped_empty"] += 1

    return kept, stats
|
||||
|
||||
|
||||
def main():
    """Load the raw examples, filter them, print a summary, and write the survivors.

    The filtered examples are written to ``OUTPUT_PATH`` as JSON Lines,
    replacing any previous file.
    """
    print("Loading examples...")
    raw_examples = load_all_examples()
    print(f" Loaded {len(raw_examples)} raw examples")

    print("Filtering...")
    survivors, counters = filter_examples(raw_examples)

    print("\nFilter results:")
    for name in counters:
        print(f" {name}: {counters[name]}")

    # Integer percentage; max() guards the division when nothing was loaded.
    kept_pct = 100 * len(survivors) // max(counters["total"], 1)
    print(f"\nKept: {len(survivors)} ({kept_pct}%)")

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, "w") as out:
        for record in survivors:
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"Written to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate SVG training history chart for the Gitea README.
|
||||
|
||||
X-axis: Model version
|
||||
Y-axis: Training examples (bar) and inverse loss (line)
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Destination of the rendered chart: `branding/training_progress.svg` under the
# directory three `.parent` hops above this script.
OUTPUT = Path(__file__).resolve().parent.parent.parent / "branding" / "training_progress.svg"

# Historical data from training runs
# One entry per model version, in the left-to-right order the chart draws them.
#   version  - x-axis tick label
#   examples - bar height (number of training examples)
#   loss     - plotted as 1/loss on the line series; must be non-zero
#   label    - small caption drawn under the version
VERSIONS = [
    {"version": "0.1.0", "examples": 500, "loss": 2.10, "label": "v1 (seed)"},
    {"version": "0.2.0", "examples": 1200, "loss": 1.45, "label": "v2 (+entities)"},
    {"version": "0.3.0", "examples": 2100, "loss": 0.82, "label": "v3 (+errors)"},
    {"version": "0.4.0", "examples": 3175, "loss": 0.35, "label": "v4 (+tools)"},
    {"version": "0.5.0", "examples": 4358, "loss": 0.16, "label": "v5 (+plugins)"},
]

# Chart dimensions
# W x H is the full SVG canvas; the PAD_* margins leave room for the title,
# axis labels and captions; PLOT_W x PLOT_H is the remaining drawing area.
W = 700
H = 400
PAD_L = 70
PAD_R = 30
PAD_T = 40
PAD_B = 80
PLOT_W = W - PAD_L - PAD_R
PLOT_H = H - PAD_T - PAD_B

# Colors
BG = "#111111"           # canvas background
GRID = "#252525"         # horizontal grid lines
TEXT = "#999999"         # tick values, captions, legend text
BAR_COLOR = "#D35400"    # training-example bars and their labels
LINE_COLOR = "#4caf50"   # 1/loss polyline, dots and loss labels
LABEL_COLOR = "#e0e0e0"  # title and version labels
|
||||
|
||||
|
||||
def generate_svg():
    """Render the training-progress chart and return it as one SVG string.

    Draws, in document order: background and title, five horizontal grid
    lines with example-count tick values, one bar per entry of ``VERSIONS``
    (with count, version and caption labels), a polyline of ``1/loss`` with a
    dot and ``loss=`` label per version, the rotated y-axis title, the
    legend, and the x-axis title. Both series are scaled so their maximum
    sits at 1/1.15 of the plot height.
    """
    examples_cap = max(entry["examples"] for entry in VERSIONS) * 1.15
    inv_loss_cap = max(1.0 / entry["loss"] for entry in VERSIONS) * 1.15
    count = len(VERSIONS)
    bar_width = PLOT_W / count * 0.6
    slot = PLOT_W / count

    def column_x(index):
        # Horizontal centre of the index-th version column.
        return PAD_L + slot * index + slot / 2

    parts = [f"""<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {W} {H}" width="{W}" height="{H}">
<rect width="{W}" height="{H}" fill="{BG}" rx="8"/>

<!-- Title -->
<text x="{W/2}" y="25" fill="{LABEL_COLOR}" font-family="monospace" font-size="16" text-anchor="middle" font-weight="bold">Mortdecai Training Progress</text>

<!-- Grid lines -->
"""]

    # Y-axis grid (examples): five evenly spaced levels from 0 to the cap.
    for tick in range(5):
        grid_y = PAD_T + PLOT_H - (tick / 4 * PLOT_H)
        tick_value = int(examples_cap * tick / 4)
        parts.append(f'<line x1="{PAD_L}" y1="{grid_y}" x2="{W-PAD_R}" y2="{grid_y}" stroke="{GRID}" stroke-width="0.5"/>\n')
        parts.append(f'<text x="{PAD_L-5}" y="{grid_y+4}" fill="{TEXT}" font-family="monospace" font-size="10" text-anchor="end">{tick_value:,}</text>\n')

    # Bars (training examples) plus the per-version x-axis labels.
    for index, entry in enumerate(VERSIONS):
        center = column_x(index)
        bar_height = (entry["examples"] / examples_cap) * PLOT_H
        bar_top = PAD_T + PLOT_H - bar_height
        parts.extend((
            f'<rect x="{center - bar_width/2}" y="{bar_top}" width="{bar_width}" height="{bar_height}" fill="{BAR_COLOR}" rx="3" opacity="0.85"/>\n',
            f'<text x="{center}" y="{bar_top - 8}" fill="{BAR_COLOR}" font-family="monospace" font-size="11" text-anchor="middle" font-weight="bold">{entry["examples"]:,}</text>\n',
            f'<text x="{center}" y="{PAD_T + PLOT_H + 20}" fill="{LABEL_COLOR}" font-family="monospace" font-size="12" text-anchor="middle">{entry["version"]}</text>\n',
            f'<text x="{center}" y="{PAD_T + PLOT_H + 35}" fill="{TEXT}" font-family="monospace" font-size="9" text-anchor="middle">{entry["label"]}</text>\n',
        ))

    # Line (inverse loss = quality): positions are computed once and shared by
    # the polyline and the dots drawn on top of it.
    markers = []
    for index, entry in enumerate(VERSIONS):
        quality = 1.0 / entry["loss"]
        marker_y = PAD_T + PLOT_H - (quality / inv_loss_cap) * PLOT_H
        markers.append((column_x(index), marker_y, entry))

    vertices = " ".join(f"{mx},{my}" for mx, my, _ in markers)
    parts.append(f'<polyline points="{vertices}" fill="none" stroke="{LINE_COLOR}" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round"/>\n')

    # Dots on line
    for mx, my, entry in markers:
        parts.append(f'<circle cx="{mx}" cy="{my}" r="4" fill="{LINE_COLOR}"/>\n')
        parts.append(f'<text x="{mx}" y="{my - 10}" fill="{LINE_COLOR}" font-family="monospace" font-size="10" text-anchor="middle">loss={entry["loss"]}</text>\n')

    parts.extend((
        # Y-axis labels
        f'<text x="{PAD_L - 45}" y="{PAD_T + PLOT_H/2}" fill="{BAR_COLOR}" font-family="monospace" font-size="11" text-anchor="middle" transform="rotate(-90,{PAD_L-45},{PAD_T+PLOT_H/2})">Training Examples</text>\n',
        # Legend
        f'<rect x="{W-180}" y="{PAD_T+5}" width="12" height="12" fill="{BAR_COLOR}" rx="2"/>\n',
        f'<text x="{W-163}" y="{PAD_T+15}" fill="{TEXT}" font-family="monospace" font-size="10">Training Examples</text>\n',
        f'<line x1="{W-180}" y1="{PAD_T+28}" x2="{W-168}" y2="{PAD_T+28}" stroke="{LINE_COLOR}" stroke-width="2.5"/>\n',
        f'<text x="{W-163}" y="{PAD_T+32}" fill="{TEXT}" font-family="monospace" font-size="10">Model Quality (1/loss)</text>\n',
        # X-axis label
        f'<text x="{W/2}" y="{H-10}" fill="{TEXT}" font-family="monospace" font-size="11" text-anchor="middle">Model Version</text>\n',
        "</svg>",
    ))

    return "".join(parts)
|
||||
|
||||
|
||||
def main():
    """Generate the chart and save it to ``OUTPUT``, creating the folder if needed."""
    markup = generate_svg()

    target_dir = OUTPUT.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    OUTPUT.write_text(markup)

    print(f"Chart saved to {OUTPUT}")
    print("Embed in README: ")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,370 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Regenerate tool-calling training data using mortdecai:0.5.0.
|
||||
|
||||
Uses the model-driven tool loop: sends prompts to 0.5.0, lets it decide
|
||||
which tools to call, executes via RCON, and captures the full multi-turn
|
||||
conversation as training data. Keeps examples where at least 70% of the RCON commands succeed (and at least one does).
|
||||
|
||||
This produces "distilled" data — the model's best outputs, validated by RCON.
|
||||
"""
|
||||
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Directory three `.parent` hops above this script. It is put first on
# sys.path before the `agent.*` imports that follow — presumably so they
# resolve when this file is run directly as a script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
import requests
|
||||
from agent.tools.persistent_rcon import get_rcon
|
||||
from agent.tools.tool_schemas import qwen3_tools_block
|
||||
from agent.prompts.system_prompts import SYNTAX_RULES, RISK_GRADIENT
|
||||
|
||||
# JSONL file that receives the kept examples (overwritten at the end of a run).
OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "tool_training_v05.jsonl"

# Tool description text produced by the project helper; appended to SYSTEM_GOD only.
TOOLS_BLOCK = qwen3_tools_block()
# System prompt for "sudo" requests: describes the <tool_call> format, lists
# tool names inline, and asks for a final JSON object with risk_level,
# commands and reasoning.
# NOTE(review): the inline "Available:" list does not mention
# paper.docs_lookup, although query_model_with_tools() handles that tool name.
SYSTEM = (
    "/no_think\n"
    "You are a Minecraft 1.21 command translator for a Paper server.\n"
    "Plugins: FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n\n"
    "You have tools. To call one:\n"
    "<tool_call>\n{\"name\": \"tool_name\", \"arguments\": {...}}\n</tool_call>\n\n"
    "Available: rcon.execute, minecraft.wiki_lookup, plugin.docs_lookup, "
    "minecraft.changelog_lookup, world.player_info, world.server_state, "
    "world.nearby_entities, memory.read, memory.write, "
    "script.write, script.validate, script.execute, script.read, script.list, "
    "script.delete, script.schedule.\n\n"
    "After tool calls, respond with JSON:\n"
    "{\"risk_level\": <0-5>, \"commands\": [...], \"reasoning\": \"...\"}\n\n"
    "PERMISSION LEVEL: 4 (generous).\n" + SYNTAX_RULES + RISK_GRADIENT
)

# System prompt for "god" mode: the final JSON additionally carries a
# "message" field, and the tool list comes from TOOLS_BLOCK.
SYSTEM_GOD = (
    "/no_think\n"
    "You are God in a Minecraft server with full tool access.\n"
    "Return JSON: {\"risk_level\": <0-5>, \"message\": \"...\", \"commands\": [...], \"reasoning\": \"...\"}\n\n"
    + SYNTAX_RULES + "\n" + TOOLS_BLOCK
)

# Player names; one is picked at random for each prompt and used in the
# "Player <name>: ..." user message.
PLAYERS = ["slingshooter08", "Ace13245", "TheBigBoss", "xXDragonSlayerXx"]
|
||||
|
||||
# Comprehensive prompt set — every category we need good data for
# Maps a category name to the player requests sent for it. main() walks the
# categories in this order, records the category in each kept example as
# type "tool_<category>", and sends prompts that start with "pray " with the
# god-mode system prompt (all others use the sudo prompt).
PROMPTS = {
    "basic_commands": [
        "sudo give me a diamond sword",
        "sudo give me 64 golden apples",
        "sudo give me a stack of oak planks",
        "sudo give me an elytra",
        "sudo give me a spyglass",
        "sudo give me a recovery compass",
        "sudo give me a bundle",
        "sudo set time to noon",
        "sudo set time to midnight",
        "sudo clear weather for a week",
        "sudo make it thunder",
        "sudo kill all hostile mobs",
        "sudo kill all items on the ground",
        "sudo gamemode creative",
        "sudo gamemode survival",
        "sudo gamemode spectator",
    ],
    "enchanted_gear": [
        "sudo give me a diamond sword with sharpness 5, unbreaking 3, mending, and looting 3",
        "sudo give me a netherite pickaxe with efficiency 5, fortune 3, unbreaking 3, mending",
        "sudo give me a bow with power 5, infinity, flame, punch 2",
        "sudo full netherite armor with protection 4, unbreaking 3, mending on every piece",
        "sudo give me boots with feather falling 4, depth strider 3, soul speed 3",
        "sudo give me a trident with loyalty 3 and channeling",
        "sudo give me a trident with riptide 3",
        "sudo give me a crossbow with multishot and quick charge 3",
        "sudo give me a mace with density 5 and wind burst 3",
        "sudo best fishing rod possible",
        "sudo give me a shield with unbreaking 3 and mending",
    ],
    "effects": [
        "sudo give me speed 3 for 10 minutes",
        "sudo night vision permanently",
        "sudo make me invisible for 5 minutes",
        "sudo give me fire resistance for an hour",
        "sudo give everyone online regeneration 2",
        "sudo give me haste 2 for 10 minutes",
        "sudo slow falling for 60 seconds",
        "sudo give me water breathing forever",
        "sudo give me strength 2 and resistance 2 for 5 minutes",
        "sudo clear all my effects",
    ],
    "teleport_position": [
        "sudo tp me to 0 100 0",
        "sudo tp me to the nether",
        "sudo tp everyone to spawn",
        "sudo teleport me 100 blocks north",
        "sudo tp me up 50 blocks",
        "sudo set my spawn point here",
    ],
    "building": [
        "sudo fill a 10x10 platform of stone under me",
        "sudo place a beacon at my location",
        "sudo build a small cobblestone room around me",
        "sudo fill the area below me with water",
        "sudo make a glass dome over me",
        "sudo place 4 lanterns around me",
        "sudo clear a 20 block area above me",
    ],
    "entities": [
        "sudo summon a horse with a saddle",
        "sudo summon 5 cows near me",
        "sudo summon a villager",
        "sudo spawn an iron golem",
        "sudo summon a warden 20 blocks away",
        "sudo summon a wither",
        "sudo kill all zombies within 50 blocks",
        "sudo kill all creepers near me",
    ],
    "worldguard": [
        "sudo create a region called my-base and set pvp deny",
        "sudo prevent mob spawning in the spawn region",
        "sudo set a greeting message for spawn: Welcome to the server!",
        "sudo deny entry to non-members in the vault region",
        "sudo list all regions",
        "sudo allow TNT in the arena",
        "sudo prevent fire spread globally",
        "sudo make a healing zone at spawn",
    ],
    "coreprotect": [
        "sudo enable block inspector",
        "sudo rollback the last hour of changes",
        "sudo rollback what TheBigBoss did in the last 30 minutes",
        "sudo lookup who placed blocks near me today",
        "sudo rollback TNT damage from the last 2 hours",
        "sudo check coreprotect status",
        "sudo restore what was rolled back",
    ],
    "essentialsx": [
        "sudo set my home here",
        "sudo create a warp called arena",
        "sudo give Ace13245 1000 coins",
        "sudo check my balance",
        "sudo heal me",
        "sudo feed me",
        "sudo repair my held item",
        "sudo set my nickname to DragonLord",
        "sudo broadcast Welcome to the server!",
        "sudo god mode on",
        "sudo fly mode on",
    ],
    "luckperms": [
        "sudo create a VIP group",
        "sudo add Ace13245 to VIP",
        "sudo give VIP permission to fly",
        "sudo give me temporary VIP for 24 hours",
        "sudo set VIP prefix to gold [VIP]",
        "sudo list all permission groups",
        "sudo create a builder group with worldedit access",
    ],
    "fawe": [
        "sudo make a glass sphere radius 8",
        "sudo hollow stone sphere radius 10",
        "sudo cylinder of quartz 5 wide 12 tall",
        "sudo replace all stone with deepslate in selection",
        "sudo smooth the terrain 5 iterations",
        "sudo drain water within 20 blocks",
        "sudo sandstone pyramid 8 tall",
        "sudo undo my last worldedit operation",
    ],
    "god_prayers": [
        "pray oh great one, bless me with diamonds",
        "pray lord, protect me from the monsters of the night",
        "pray I offer this sacrifice of 64 wheat, grant me your favor",
        "pray god please make it stop raining",
        "pray smite the wicked TheBigBoss for griefing my base",
        "pray heal me, I am near death",
        "pray give me the strength to slay the ender dragon",
        "pray I am lost in a cave, guide me to the surface",
    ],
    "error_prone": [
        "sudo give me a bed",
        "sudo give me steak",
        "sudo give me cooked beef",
        "sudo effect give me speed",
        "sudo give me a log",
        "sudo fill with stone 10",
        "sudo tp me to spawn",
        "sudo give @s diamond 1",
    ],
    "complex_multi": [
        "sudo gear me up for the nether: armor, weapons, food, fire resistance",
        "sudo prepare me for the end fight: bow, arrows, blocks, pearls, slow falling",
        "sudo set up a new player kit: stone tools, food, bed, torches",
        "sudo create a mob farm: platform, water channels, collection hopper",
    ],
}
|
||||
|
||||
|
||||
def _normalize_tool_args(raw_args):
    """Return tool-call arguments as a dict, accepting a JSON-encoded string."""
    if isinstance(raw_args, str):
        try:
            raw_args = json.loads(raw_args)
        except json.JSONDecodeError:
            return {}
    return raw_args if isinstance(raw_args, dict) else {}


def _execute_tool(tool_name, tool_args, rcon, rcon_log):
    """Run one tool call and return the result dict that is fed back to the model.

    RCON commands are executed for real and appended to ``rcon_log``; knowledge
    lookups go through the project's knowledge handler; every other tool is
    answered with a simulated success.
    """
    if tool_name == "rcon.execute":
        cmd = tool_args.get("command", "")
        try:
            result_text = rcon.command(cmd)
            # The server reports failures in the reply text, so look for the
            # usual error markers rather than relying on an exception.
            is_err = any(e in result_text for e in ("<--[HERE]", "Unknown", "Incorrect", "Expected"))
            result = {"success": not is_err, "result": result_text[:300]}
            rcon_log.append({"cmd": cmd, "ok": not is_err, "result": result_text[:200]})
        except Exception as e:
            result = {"success": False, "result": str(e)}
            rcon_log.append({"cmd": cmd, "ok": False, "result": str(e)})
        return result

    if tool_name in (
        "minecraft.wiki_lookup",
        "plugin.docs_lookup",
        "minecraft.changelog_lookup",
        "paper.docs_lookup",
    ):
        try:
            # Imported lazily so a missing or broken knowledge module degrades
            # to an "unavailable" answer instead of stopping the run.
            from agent.tools.knowledge_lookup import handle_knowledge_tool
            return handle_knowledge_tool(tool_name, tool_args)
        except Exception:
            unavailable = "Wiki unavailable" if tool_name == "minecraft.wiki_lookup" else "Docs unavailable"
            return {"content": unavailable, "url": "", "ok": False}

    return {"ok": True, "result": "simulated"}


def query_model_with_tools(prompt, player, ollama_url, model, rcon, mode="sudo", max_steps=6):
    """Send prompt to model, let it call tools, execute them, capture full chain.

    Args:
        prompt: The player's request text.
        player: Player name placed in front of the prompt.
        ollama_url: Base URL of the Ollama server (``/api/chat`` is appended).
        model: Model name passed to Ollama.
        rcon: Object with a ``command(str)`` method used for ``rcon.execute``.
        mode: ``"god"`` selects ``SYSTEM_GOD``; anything else selects ``SYSTEM``.
        max_steps: Maximum number of model round-trips.

    Returns:
        A dict with ``messages`` (the full conversation including the final
        answer), ``commands``, ``message``, ``reasoning``, ``tool_trace`` and
        ``rcon_results``; or ``None`` when the request fails, the final answer
        is not a JSON object, or ``max_steps`` is exhausted.
    """
    system = SYSTEM_GOD if mode == "god" else SYSTEM

    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": f"Player {player}: {prompt}"},
    ]

    tool_trace = []
    all_rcon_results = []

    for step in range(max_steps):
        try:
            r = requests.post(f"{ollama_url}/api/chat", json={
                "model": model,
                "messages": messages,
                "stream": False,
                "options": {"temperature": 0.2, "num_predict": 800},
            }, timeout=90)
            raw = r.json()["message"]["content"]
        except Exception:
            # Best-effort: any transport or payload problem skips this prompt.
            return None
        if not isinstance(raw, str):
            return None

        raw = re.sub(r'<think>[\s\S]*?</think>\s*', '', raw)

        # Check for tool calls
        tool_matches = re.findall(r'<tool_call>\s*(\{.*?\})\s*</tool_call>', raw, re.DOTALL)

        if not tool_matches:
            # Final response
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                return None
            if not isinstance(parsed, dict):
                # Valid JSON but not an object (e.g. a bare list): unusable.
                return None
            return {
                "messages": messages + [{"role": "assistant", "content": raw}],
                "commands": parsed.get("commands", []),
                "message": parsed.get("message", ""),
                "reasoning": parsed.get("reasoning", ""),
                "tool_trace": tool_trace,
                "rcon_results": all_rcon_results,
            }

        for tc_json in tool_matches:
            try:
                tc = json.loads(tc_json)
            except json.JSONDecodeError:
                continue
            tool_name = tc.get("name", "")
            tool_args = _normalize_tool_args(tc.get("arguments", {}))

            # Execute tool
            result = _execute_tool(tool_name, tool_args, rcon, all_rcon_results)

            tool_trace.append({"tool": tool_name, "args": str(tool_args)[:100], "step": step})
            # Each call is replayed as its own assistant/tool message pair.
            messages.append({"role": "assistant", "content": f"<tool_call>\n{json.dumps(tc)}\n</tool_call>"})
            messages.append({"role": "tool", "content": json.dumps(result)[:2000]})

    return None  # Ran out of steps
|
||||
|
||||
|
||||
def main():
    """Run every prompt in ``PROMPTS`` through the model and save the good examples.

    An example is kept when all of its RCON commands succeeded, or when at
    least one succeeded and the success share is 70% or more. Kept examples
    are written to ``OUTPUT_PATH`` as JSON Lines once the whole prompt set
    has been processed.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--ollama-url", default="http://localhost:11434")
    cli.add_argument("--model", default="mortdecai:0.5.0")
    cli.add_argument("--rcon-host", default="192.168.0.244")
    cli.add_argument("--rcon-port", type=int, default=25578)
    cli.add_argument("--rcon-pass", default="REDACTED_RCON")
    args = cli.parse_args()

    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)
    print(f"Regenerating tool data with {args.model}")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")

    collected = []
    stats = {"total": 0, "kept": 0, "failed": 0, "no_response": 0}

    for category, prompts in PROMPTS.items():
        print(f"\n── {category} ({len(prompts)} prompts) ──")
        for prompt in prompts:
            player = random.choice(PLAYERS)
            if prompt.startswith("pray "):
                mode = "god"
            else:
                mode = "sudo"

            outcome = query_model_with_tools(prompt, player, args.ollama_url, args.model, rcon, mode)
            stats["total"] += 1

            if not outcome:
                stats["no_response"] += 1
                print(f" SKIP: {prompt[:50]} (no response)")
                continue

            rcon_log = outcome["rcon_results"]
            rcon_total = len(rcon_log)
            rcon_ok = sum(1 for entry in rcon_log if entry["ok"])
            tools_used = len(outcome["tool_trace"])

            if rcon_total == 0 and tools_used == 0:
                stats["no_response"] += 1
                print(f" SKIP: {prompt[:50]} (empty)")
                continue

            all_success = rcon_total > 0 and all(entry["ok"] for entry in rcon_log)
            mostly_ok = rcon_ok > 0 and rcon_ok >= rcon_total * 0.7

            if not (all_success or mostly_ok):
                stats["failed"] += 1
                print(f" FAIL: {prompt[:50]} ({rcon_ok}/{rcon_total} cmds)")
            else:
                stats["kept"] += 1
                collected.append({
                    "id": f"v05-regen-{stats['total']:04d}",
                    "source": "model_distillation_v05",
                    "type": f"tool_{category}",
                    "messages": outcome["messages"],
                    "metadata": {
                        "model": args.model,
                        "category": category,
                        "tools_used": tools_used,
                        "rcon_total": rcon_total,
                        "rcon_success": rcon_ok,
                        "all_success": all_success,
                    },
                })
                print(f" KEPT: {prompt[:50]} ({rcon_ok}/{rcon_total} cmds, {tools_used} tools)")

            # Short pause between prompts that actually produced tool activity.
            time.sleep(0.2)

    print(f"\n{'='*60}")
    print(f"Total: {stats['total']}, Kept: {stats['kept']}, Failed: {stats['failed']}, Empty: {stats['no_response']}")
    print(f"Quality: {100*stats['kept']//max(stats['total'],1)}%")

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, "w") as out:
        for record in collected:
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Written to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user