9c2c9a2310
Distilled Training Data (1,203 examples): - 341 initial gold (plugins, enchantments, builds, effects, god, errors) - 165 buildings + pipeline (100 structures built on dev, 65 request→query→act) - 24 safety-aware (worldborder, safe tp, intentional harm, gamemode checks) - 17 advanced logic (decanonized items, redstone gates, iterative builds) - 12 redstone mastery (NOT/OR/AND/XOR/RS-latch/T-flip-flop/comparator/clock) - 7 circuit verification and diagnosis - 1 compact comparator gates - 10 redstone methodology (build→test→save→recall→learn from mistakes) - 8 player journal usage - 29 creative+uncommon+pipeline+god with full tool chains Player Journal System: - agent/tools/player_journal.py — per-player text files (1-10 lines) - journal.read + journal.write tool schemas added - Cross-contaminated: God and Sudo share same journal per player - Includes sentiment, relationship, builds, preferences, skill level Redstone Engineering: - agent/prompts/redstone_rules.md — baked-in wall torch, dedicated lead, repeater rules - Learned from 4 iterations of 8-switch circuit: wall_torch on back face, not top - T-junction bypass prevention: dedicated lead wire between merge and NOT block - RCON limitation: can build circuits but cannot test them (lever toggle doesn't propagate) Training Data Cleaning: - 466 @s→@p fixes, 10 template commands removed - 12 outdated refusals replaced with correct plugin commands - Data de-duped across all sources Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
338 lines
12 KiB
Python
338 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Convert IGLU dataset to Mortdecai build training examples.
|
|
|
|
IGLU provides natural language instructions paired with block placement
|
|
coordinates. We convert these to:
|
|
1. Direct setblock/fill commands (for simple builds)
|
|
2. script.write + script.execute flows (for complex builds)
|
|
|
|
Source: microsoft/iglu-datasets singleturn dataset
|
|
Output: data/raw/iglu_build_training.jsonl
|
|
|
|
Usage:
|
|
python3 training/scripts/convert_iglu_to_training.py
|
|
"""
|
|
|
|
import csv
|
|
import json
|
|
import os
|
|
import random
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# Third ancestor of this file's resolved path; used as the repository root.
_SCRIPT_PATH = Path(__file__).resolve()
PROJECT_ROOT = _SCRIPT_PATH.parent.parent.parent

# Put the root first on the import path so the project-local imports that
# follow can be resolved when this file is run as a script.
sys.path.insert(0, os.fspath(PROJECT_ROOT))
|
|
|
|
from agent.tools.tool_schemas import qwen3_tools_block
|
|
from agent.prompts.system_prompts import SYNTAX_RULES, RISK_GRADIENT
|
|
|
|
# Input dataset directory and output JSONL file, both relative to the repo root.
IGLU_DIR = PROJECT_ROOT.joinpath("data", "external", "iglu-repo", "datasets", "singleturn")
OUTPUT_PATH = PROJECT_ROOT.joinpath("data", "raw", "iglu_build_training.jsonl")

# Tool schema text appended to the end of the system prompt.
TOOLS_BLOCK = qwen3_tools_block()

# Fixed opening of the system prompt: role, build-size policy, permission
# level and the JSON response contract.
_SYSTEM_HEADER = (
    "You are a Minecraft 1.21 command translator with script writing abilities.\n"
    "For complex builds (4+ blocks), write a mcfunction script. Validate first.\n"
    "For simple builds (1-3 blocks), use rcon.execute directly.\n"
    "PERMISSION LEVEL: 4 (generous).\n\n"
    'Return JSON: {"risk_level": <0-5>, "commands": [...], "reasoning": "..."}\n\n'
)

# Full system prompt: header, shared syntax rules, risk gradient, tool schemas.
SYSTEM = _SYSTEM_HEADER + SYNTAX_RULES + RISK_GRADIENT + "\n" + TOOLS_BLOCK
|
|
|
|
# IGLU identifies blocks by numeric color ID. Each ID maps to one Minecraft
# color name; the concrete block id is "minecraft:<color>_<material>".
_IGLU_COLOR_NAMES = {
    57: "blue",
    58: "light_blue",
    59: "green",
    60: "red",
    61: "orange",
    62: "purple",
    63: "yellow",
}


def _colored_blocks(material):
    """Return a fresh {color_id: block_id} table for one block material."""
    return {
        color_id: f"minecraft:{color}_{material}"
        for color_id, color in _IGLU_COLOR_NAMES.items()
    }


# Default mapping: IGLU color IDs to wool blocks.
IGLU_BLOCK_MAP = _colored_blocks("wool")

# For variety, the same colors are also available as concrete and terracotta.
BLOCK_VARIANTS = {
    material: _colored_blocks(material)
    for material in ("wool", "concrete", "terracotta")
}

# Player names sampled when composing the user message of an example.
PLAYERS = [
    "slingshooter08",
    "Ace13245",
    "TheBigBoss",
    "xXDragonSlayerXx",
    "CreeperKing99",
]
|
|
|
|
|
|
def sys_msg():
    """Build the system-role chat message carrying the shared SYSTEM prompt."""
    message = dict(role="system", content=SYSTEM)
    return message
|
|
|
|
def user_msg(text):
    """Wrap ``text`` as a user-role chat message."""
    message = dict(role="user", content=text)
    return message
|
|
|
|
def tool_call(name, args):
    """Build an assistant message that invokes tool ``name`` with ``args``.

    The call is serialized as a JSON object with ``name`` and ``arguments``
    keys, placed on its own line between ``<tool_call>`` tags.
    """
    payload = json.dumps({"name": name, "arguments": args})
    content = "\n".join(("<tool_call>", payload, "</tool_call>"))
    return {"role": "assistant", "content": content}
|
|
|
|
def tool_result(data):
    """Build a tool-role message whose content is ``data`` serialized as JSON."""
    serialized = json.dumps(data)
    return dict(role="tool", content=serialized)
|
|
|
|
def final_response(resp):
    """Build the closing assistant message with ``resp`` serialized as JSON."""
    serialized = json.dumps(resp)
    return dict(role="assistant", content=serialized)
|
|
|
|
|
|
def blocks_to_commands(blocks_to_place, blocks_to_remove, block_map, use_relative=True, offset=(0, 64, 0)):
    """Convert block coordinate lists to setblock/fill commands.

    Args:
        blocks_to_place: Iterable of ``(x, y, z, color_id)`` entries to place.
        blocks_to_remove: Iterable of ``(x, y, z, _)`` entries to clear; the
            fourth element is ignored.
        block_map: Mapping from color ID to block ID. Unknown IDs fall back
            to ``"minecraft:white_wool"``.
        use_relative: When true, emit ``~`` coordinates with the Y value
            shifted down by ``offset[1]``; otherwise emit the raw coordinates.
        offset: Only the Y component (``offset[1]``) is used, and only in
            relative mode.

    Returns:
        A list of command strings. Placements come first, grouped by block in
        first-seen order, followed by one ``setblock ... minecraft:air`` per
        removed entry.
    """
    y_base = offset[1]

    def position(x, y, z):
        # One place that knows how a coordinate triple is rendered.
        if use_relative:
            return f"~{x} ~{y - y_base} ~{z}"
        return f"{x} {y} {z}"

    # Group placed blocks by resolved block id so each group can be tested
    # for the fill optimization.
    by_block = defaultdict(list)
    for x, y, z, color_id in blocks_to_place:
        block = block_map.get(color_id, "minecraft:white_wool")
        by_block[block].append((x, y, z))

    commands = []
    for block, raw_coords in by_block.items():
        # Drop repeated positions (order preserved). Without this, duplicates
        # inflate the count and a sparse set can be mistaken for a solid box,
        # producing a fill that places blocks nobody asked for.
        coords = list(dict.fromkeys(raw_coords))

        if len(coords) > 3:
            xs, ys, zs = zip(*coords)
            low = (min(xs), min(ys), min(zs))
            high = (max(xs), max(ys), max(zs))
            box_volume = (
                (high[0] - low[0] + 1)
                * (high[1] - low[1] + 1)
                * (high[2] - low[2] + 1)
            )
            # Unique positions exactly filling their bounding box means the
            # group is a solid cuboid and one fill command covers it.
            if box_volume == len(coords):
                commands.append(f"fill {position(*low)} {position(*high)} {block}")
                continue

        # Small groups and non-box shapes: one setblock per position.
        commands.extend(f"setblock {position(x, y, z)} {block}" for x, y, z in coords)

    # Removals are emitted after all placements.
    for x, y, z, _ in blocks_to_remove:
        commands.append(f"setblock {position(x, y, z)} minecraft:air")

    return commands
|
|
|
|
|
|
def load_iglu_pairs():
    """Load instruction-to-build pairs from IGLU dataset.

    Reads ``clarifying_questions_train.csv`` under ``IGLU_DIR`` and keeps the
    rows that have a non-empty instruction, ``IsInstructionClear == 'Yes'``,
    an existing initial-world file, and at least one file in the matching
    ``target_world_states/builder-data/<game_id>`` directory.

    Returns:
        A list of dicts with keys ``instruction``, ``init_path``,
        ``target_path`` and ``game_id``. Returns an empty list (after
        printing a notice) when the CSV does not exist.
    """
    csv_path = IGLU_DIR / "clarifying_questions_train.csv"
    if not csv_path.exists():
        print(f"CSV not found: {csv_path}")
        return []

    # Index target-state files by game directory name.
    target_dir = IGLU_DIR / "target_world_states" / "builder-data"
    targets = {}
    if target_dir.exists():
        for game_dir in target_dir.iterdir():
            if not game_dir.is_dir():
                continue
            # iterdir() order is filesystem-dependent; sorting makes the file
            # chosen below the same on every run and machine.
            # NOTE(review): the first file by name is used as the target for
            # every row of a game — confirm that is the intended step.
            step_files = sorted(p for p in game_dir.iterdir() if p.is_file())
            if step_files:
                targets[game_dir.name] = step_files

    pairs = []
    # newline="" lets the csv module handle line breaks inside quoted fields;
    # the explicit encoding avoids depending on the platform default.
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            # Normalized for lookup against the target directory names.
            game_id = row['GameId'].lower().replace(' ', '')
            instruction = row['InputInstruction'].strip()
            if not instruction or row.get('IsInstructionClear') != 'Yes':
                continue

            init_path = IGLU_DIR / row['InitializedWorldPath']
            if game_id in targets and init_path.exists():
                pairs.append({
                    'instruction': instruction,
                    'init_path': str(init_path),
                    'target_path': str(targets[game_id][0]),
                    'game_id': game_id,
                })

    return pairs
|
|
|
|
|
|
def _read_world_blocks(path):
    """Return the set of block tuples under ``worldEndingState.blocks`` in a JSON file."""
    # World-state files are JSON; read them as UTF-8 rather than relying on
    # the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        state = json.load(f)
    return {tuple(b) for b in state.get('worldEndingState', {}).get('blocks', [])}


def convert_pair_to_example(pair, idx, block_variant="wool"):
    """Convert one IGLU pair to a training example.

    Args:
        pair: Dict with ``instruction``, ``init_path`` and ``target_path``.
        idx: Running index; used in the example id and the script name.
        block_variant: Key into ``BLOCK_VARIANTS``; unknown keys use "wool".

    Returns:
        A dict with ``id``, ``source``, ``type``, ``block_changes`` and
        ``messages``, or ``None`` when the initial and target worlds do not
        differ or no commands were produced.
    """
    # Largest number of block changes still handled by direct rcon.execute
    # calls; anything above goes through the script workflow.
    # NOTE(review): the system prompt text says scripts are for "4+ blocks",
    # while this cutoff sends 4-change builds down the direct path — confirm
    # which one is intended.
    direct_limit = 4

    block_map = BLOCK_VARIANTS.get(block_variant, BLOCK_VARIANTS["wool"])
    player = random.choice(PLAYERS)

    init_blocks = _read_world_blocks(pair['init_path'])
    target_blocks = _read_world_blocks(pair['target_path'])

    # The build is the difference between the two world states.
    to_place = sorted(target_blocks - init_blocks)
    to_remove = sorted(init_blocks - target_blocks)
    if not to_place and not to_remove:
        return None

    total_changes = len(to_place) + len(to_remove)
    commands = blocks_to_commands(to_place, to_remove, block_map)
    if not commands:
        return None

    instruction = pair['instruction']
    # Make it sound like a Minecraft player request
    prefixes = [
        f"sudo {instruction}",
        f"sudo can you {instruction.lower()}",
        f"sudo please {instruction.lower()}",
        f"sudo I need you to {instruction.lower()}",
    ]
    prompt = random.choice(prefixes)

    msgs = [sys_msg(), user_msg(f"Player {player}: {prompt}")]
    is_direct = total_changes <= direct_limit

    if is_direct:
        # Direct rcon.execute for small builds: one call/result pair per command.
        for cmd in commands:
            msgs.append(tool_call("rcon.execute", {"command": cmd}))
            msgs.append(tool_result({"success": True, "result": "Changed the block"}))
        reasoning = f"Direct block placement: {len(to_place)} placed, {len(to_remove)} removed."
        resp = {"risk_level": 3, "commands": commands, "reasoning": reasoning}
    else:
        # Script workflow for larger builds: validate, write, then execute.
        script_name = f"build_{idx:04d}"
        desc = instruction[:80]

        # Validate
        msgs.append(tool_call("script.validate", {"commands": commands}))
        msgs.append(tool_result({
            "valid": True, "total": len(commands),
            "passed": len(commands), "errors": [],
        }))

        # Write
        msgs.append(tool_call("script.write", {
            "name": script_name,
            "commands": commands,
            "description": desc,
        }))
        msgs.append(tool_result({
            "ok": True, "path": f"mortdecai:{script_name}",
            "lines": len(commands),
        }))

        # Execute at player
        msgs.append(tool_call("script.execute", {
            "name": script_name, "as_player": player,
        }))
        msgs.append(tool_result({
            "ok": True,
            "result": f"Executed {len(commands)} commands from function mortdecai:{script_name}",
        }))

        reasoning = (f"Complex build ({total_changes} block changes). "
                     f"Wrote script '{script_name}' with {len(commands)} commands. "
                     f"Placed {len(to_place)}, removed {len(to_remove)}.")
        resp = {
            "risk_level": 3,
            "commands": [f"function mortdecai:{script_name}"],
            "reasoning": reasoning,
        }

    msgs.append(final_response(resp))

    return {
        "id": f"iglu-build-{idx:05d}",
        "source": "iglu_dataset",
        "type": "build_direct" if is_direct else "build_script",
        "block_changes": total_changes,
        "messages": msgs,
    }
|
|
|
|
|
|
def main():
    """Convert all loadable IGLU pairs and write them to ``OUTPUT_PATH`` as JSON Lines.

    Block variants are assigned round-robin by pair index. Pairs for which
    ``convert_pair_to_example`` returns nothing are counted as skipped. A
    short summary is printed before the file is written.
    """
    print("Loading IGLU dataset...")
    pairs = load_iglu_pairs()
    print(f"Found {len(pairs)} instruction-build pairs")

    if not pairs:
        print("No data found. Make sure iglu-repo is cloned in data/external/")
        return

    examples = []
    skipped = 0

    # Process with variety — use different block variants
    variants = list(BLOCK_VARIANTS)

    for idx, pair in enumerate(pairs):
        variant = variants[idx % len(variants)]
        ex = convert_pair_to_example(pair, idx, variant)
        if ex:
            examples.append(ex)
        else:
            skipped += 1

        if (idx + 1) % 500 == 0:
            print(f"  Processed {idx+1}/{len(pairs)}, generated {len(examples)}")

    # Stats
    direct = sum(1 for e in examples if e['type'] == 'build_direct')
    script = sum(1 for e in examples if e['type'] == 'build_script')
    # max(..., 1) avoids dividing by zero when every pair was skipped.
    avg_blocks = sum(e['block_changes'] for e in examples) / max(len(examples), 1)

    print(f"\nGenerated {len(examples)} examples (skipped {skipped} empty)")
    print(f"  Direct (1-4 blocks): {direct}")
    print(f"  Script (5+ blocks): {script}")
    print(f"  Avg block changes: {avg_blocks:.1f}")

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False can emit non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of using the platform default.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    print(f"\nWritten to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|