1200+ distilled gold examples, journal system, redstone mastery, safety awareness
Distilled Training Data (1,203 examples):
- 341 initial gold (plugins, enchantments, builds, effects, god, errors)
- 165 buildings + pipeline (100 structures built on dev, 65 request→query→act)
- 24 safety-aware (worldborder, safe tp, intentional harm, gamemode checks)
- 17 advanced logic (decanonized items, redstone gates, iterative builds)
- 12 redstone mastery (NOT/OR/AND/XOR/RS-latch/T-flip-flop/comparator/clock)
- 7 circuit verification and diagnosis
- 1 compact comparator gates
- 10 redstone methodology (build→test→save→recall→learn from mistakes)
- 8 player journal usage
- 29 creative+uncommon+pipeline+god with full tool chains

Player Journal System:
- agent/tools/player_journal.py — per-player text files (1-10 lines)
- journal.read + journal.write tool schemas added
- Cross-contaminated: God and Sudo share same journal per player
- Includes sentiment, relationship, builds, preferences, skill level

Redstone Engineering:
- agent/prompts/redstone_rules.md — baked-in wall torch, dedicated lead, repeater rules
- Learned from 4 iterations of 8-switch circuit: wall_torch on back face, not top
- T-junction bypass prevention: dedicated lead wire between merge and NOT block
- RCON limitation: can build circuits but cannot test them (lever toggle doesn't propagate)

Training Data Cleaning:
- 466 @s→@p fixes, 10 template commands removed
- 12 outdated refusals replaced with correct plugin commands
- Data de-duped across all sources

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,449 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dataset merge pipeline for Mortdecai training.
|
||||
|
||||
Normalizes all dataset formats into the two schemas the trainer expects:
|
||||
1. `conversations` — [{role, content}, ...] for simple command gen
|
||||
2. `messages` + `qwen3_text` — multi-turn tool-calling with pre-formatted text
|
||||
|
||||
Handles deduplication, mix ratios, and outputs a single training-ready JSONL.
|
||||
|
||||
Usage:
|
||||
# Default merge with recommended ratios
|
||||
python3 merge_datasets.py
|
||||
|
||||
# Custom ratios (multipliers per source)
|
||||
python3 merge_datasets.py --ratios seed=2.0,tool=1.0,iglu=0.5
|
||||
|
||||
# Dry run — show stats without writing
|
||||
python3 merge_datasets.py --dry-run
|
||||
|
||||
# Include chat app exports
|
||||
python3 merge_datasets.py --include-chat-logs
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import hashlib
|
||||
import random
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from agent.prompts.system_prompts import get_prompt, SYNTAX_RULES, RISK_GRADIENT
|
||||
|
||||
# ── Dataset sources ───────────────────────────────────────────────────────────
|
||||
|
||||
# Named dataset sources, merged in the order listed here.  Every entry holds:
#   path           JSONL file, relative to PROJECT_ROOT
#   format         key into CONVERTERS selecting the record parser
#   default_ratio  sampling multiplier (>1 oversamples, <1 downsamples)
#   description    label shown in the stats printout
#   optional       (only where present) a missing file is skipped silently
SOURCES = {
    "seed": {
        "path": "data/processed/seed_dataset.jsonl",
        "format": "seed",
        "default_ratio": 2.0,  # Oversample — keep seed dominant
        "description": "Core command gen with pos/neg pairs",
    },
    "tool": {
        "path": "data/processed/tool_training.jsonl",
        "format": "tool_messages",
        "default_ratio": 1.0,
        "description": "Multi-turn tool-calling examples",
    },
    "tool_v05": {
        "path": "data/processed/tool_training_v05.jsonl",
        "format": "tool_messages",
        "default_ratio": 1.5,  # High quality, oversample
        "description": "0.5.0 quality tool examples",
    },
    "iglu": {
        "path": "data/raw/iglu_build_training.jsonl",
        "format": "tool_messages",
        "default_ratio": 0.8,
        "description": "IGLU building dataset",
    },
    "plugin": {
        "path": "data/raw/plugin_training.jsonl",
        "format": "tool_messages",
        "default_ratio": 1.5,
        "description": "Plugin command examples",
    },
    "exploration": {
        "path": "data/processed/filtered_exploration.jsonl",
        "format": "exploration",
        "default_ratio": 1.0,
        "description": "Wiki-grounded exploration",
    },
    "self_play": {
        "path": "data/processed/self_play.jsonl",
        "format": "self_play",
        "default_ratio": 0.6,  # Large set, don't let it dominate
        "description": "Self-play generations",
    },
    "audit": {
        "path": "data/processed/filtered_audit.jsonl",
        "format": "audit",
        "default_ratio": 0.5,  # Large set, needs dilution
        "description": "Filtered audit log data",
    },
    "distilled": {
        "path": "data/processed/distilled.jsonl",
        "format": "seed",
        "default_ratio": 1.5,  # Gold standard from Claude
        "description": "Claude-distilled examples",
    },
    "chat_logs": {
        "path": "data/chat_logs/training_export.jsonl",
        "format": "audit",
        "default_ratio": 2.0,  # Hand-curated via chat app
        "description": "Chat app training exports",
        "optional": True,
    },
}
|
||||
|
||||
# Topic-specific raw training files, merged in addition to SOURCES.  Only the
# file stems are spelled out; every file lives in data/raw/ and ends in .jsonl.
RAW_TRAINING_FILES = [
    f"data/raw/{stem}.jsonl"
    for stem in (
        "advanced_commands_training",
        "biome_dimension_training",
        "chaos_event_training",
        "chaos_gaps_training",
        "command_reference_training",
        "cosmetic_xp_training",
        "dangerous_effects_training",
        "death_environment_training",
        "distance_projectile_training",
        "distance_scale_training",
        "enchant_order_errors",
        "enchantment_training",
        "entity_mob_training",
        "entity_targeting_training",
        "error_correction_training",
        "event_trigger_training",
        "execute_chain_training",
        "fall_safety_training",
        "gamerule_training",
        "kill_radius_training",
        "memory_training",
        "multiplayer_training",
        "multistep_training",
        "paper_training",
        "prod_pattern_fixes",
        "quantity_training",
        "recipe_training",
        "redstone_training",
        "revert_and_drops_training",
        "revert_format_training",
        "risk_hierarchy_training",
        "script_tool_training",
        "suffocation_training",
        "worldedit_training",
    )
]
|
||||
|
||||
# ── Format converters ─────────────────────────────────────────────────────────
|
||||
|
||||
# System prompts for the "sudo" and "god" modes, resolved once at import time
# and shared by every converter below.
SUDO_SYSTEM, GOD_SYSTEM = get_prompt("sudo"), get_prompt("god")
|
||||
|
||||
|
||||
def _seed_to_conversations(record: dict) -> dict:
    """Convert a seed_dataset record into the ``conversations`` schema.

    Reads ``input.user_message`` together with ``output.commands`` and
    ``output.reasoning`` and returns a system/user/assistant triple whose
    assistant turn is the JSON-encoded response.  A user message starting
    with "pray " (case-insensitive) selects the god prompt and additionally
    carries ``output.message``; anything else uses the sudo prompt.
    """
    source_in = record.get("input", {})
    source_out = record.get("output", {})
    user_msg = source_in.get("user_message", "")

    reply = {
        "commands": source_out.get("commands", []),
        "reasoning": source_out.get("reasoning", ""),
    }

    # The mode is inferred from the message prefix, not from a record field.
    is_god = user_msg.lower().startswith("pray ")
    if is_god:
        reply["message"] = source_out.get("message", "")
    system_prompt = GOD_SYSTEM if is_god else SUDO_SYSTEM

    return {
        "conversations": [
            {"role": "system", "content": "/no_think\n" + system_prompt},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": json.dumps(reply)},
        ]
    }
|
||||
|
||||
|
||||
def _audit_to_conversations(record: dict) -> dict:
    """Convert an audit-log record into the ``conversations`` schema.

    The prompt follows the record's ``mode`` field: "god" selects the god
    prompt, any other value (or no field at all) the sudo prompt.  Commands
    are taken from ``output.commands_generated``, falling back to
    ``output.commands`` when that is missing or empty.  ``message`` is added
    to the response only when it is non-empty.
    """
    payload_in = record.get("input", {})
    payload_out = record.get("output", {})

    reply = {
        "commands": payload_out.get("commands_generated", [])
        or payload_out.get("commands", []),
    }
    text = payload_out.get("message", "")
    if text:
        reply["message"] = text

    if record.get("mode", "sudo") == "god":
        system_prompt = GOD_SYSTEM
    else:
        system_prompt = SUDO_SYSTEM

    return {
        "conversations": [
            {"role": "system", "content": "/no_think\n" + system_prompt},
            {"role": "user", "content": payload_in.get("user_message", "")},
            {"role": "assistant", "content": json.dumps(reply)},
        ]
    }
|
||||
|
||||
|
||||
def _self_play_to_conversations(record: dict) -> dict:
    """Convert a self-play record into the ``conversations`` schema.

    The response always carries ``commands`` and ``reasoning``.  ``message``
    is looked up in ``output`` first, then at the top level of the record,
    and is included only when non-empty.  A user message starting with
    "pray " (case-insensitive) selects the god prompt, otherwise sudo.
    """
    source_in = record.get("input", {})
    source_out = record.get("output", {})
    user_msg = source_in.get("user_message", "")

    reply = {
        "commands": source_out.get("commands", []),
        "reasoning": source_out.get("reasoning", ""),
    }
    text = source_out.get("message", record.get("message", ""))

    system_prompt = (
        GOD_SYSTEM if user_msg.lower().startswith("pray ") else SUDO_SYSTEM
    )

    if text:
        reply["message"] = text

    return {
        "conversations": [
            {"role": "system", "content": "/no_think\n" + system_prompt},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": json.dumps(reply)},
        ]
    }
|
||||
|
||||
|
||||
def _exploration_to_conversations(record: dict) -> dict:
    """Convert an exploration record into the ``conversations`` schema.

    ``input`` may be a dict holding ``user_message`` or any other value, in
    which case its ``str()`` form is used as the user turn.  These records
    always use the sudo prompt.
    """
    source_in = record.get("input", {})
    source_out = record.get("output", {})

    if isinstance(source_in, dict):
        user_msg = source_in.get("user_message", "")
    else:
        user_msg = str(source_in)

    reply = {
        "commands": source_out.get("commands", []),
        "reasoning": source_out.get("reasoning", ""),
    }

    system_turn = {"role": "system", "content": "/no_think\n" + SUDO_SYSTEM}
    user_turn = {"role": "user", "content": user_msg}
    assistant_turn = {"role": "assistant", "content": json.dumps(reply)}
    return {"conversations": [system_turn, user_turn, assistant_turn]}
|
||||
|
||||
|
||||
def _tool_messages_passthrough(record: dict) -> dict:
|
||||
"""Tool training already has messages — pass through or use qwen3_text."""
|
||||
if "qwen3_text" in record:
|
||||
return {"text": record["qwen3_text"]}
|
||||
if "messages" in record:
|
||||
return {"conversations": record["messages"]}
|
||||
return None
|
||||
|
||||
|
||||
def _raw_training_to_conversations(record: dict) -> dict:
    """Convert a raw training record; these share the seed record layout."""
    converted = _seed_to_conversations(record)
    return converted
|
||||
|
||||
|
||||
# Dispatch table: a source's "format" name -> function that turns one parsed
# JSONL record into a training record (or None to drop it).
CONVERTERS = {
    "seed": _seed_to_conversations,
    "tool_messages": _tool_messages_passthrough,
    "audit": _audit_to_conversations,
    "self_play": _self_play_to_conversations,
    "exploration": _exploration_to_conversations,
    "raw_training": _raw_training_to_conversations,
}
|
||||
|
||||
|
||||
# ── Pipeline ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _dedup_text(value) -> str:
    """Coerce a message/text field to a string usable for hashing."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    # Structured content (e.g. a list of parts) is hashed via a stable dump.
    return json.dumps(value, sort_keys=True, default=str)


def dedup_key(record: dict) -> str:
    """Return an MD5 hex digest identifying a training record's content.

    * ``text`` records hash the first 500 characters of the text.
    * ``conversations`` records hash ``<user>|<assistant>``, where ``<user>``
      is the first 200 characters of the last user turn and ``<assistant>``
      is the first 200 characters of the first assistant turn with non-empty
      content.  The system prompt is not part of the key.
    * Any other record hashes the first 500 characters of its JSON dump.

    Turns whose ``content`` is None, missing or not a string (as happens for
    tool-call turns) and turns without a ``role`` are tolerated rather than
    raising, so a single odd record cannot abort a whole merge.
    """
    if "text" in record:
        content = _dedup_text(record["text"])[:500]
    elif "conversations" in record:
        user = ""
        asst = ""
        for msg in record["conversations"] or []:
            if not isinstance(msg, dict):
                continue
            role = msg.get("role")
            if role == "user":
                user = _dedup_text(msg.get("content"))[:200]
            elif role == "assistant" and not asst:
                asst = _dedup_text(msg.get("content"))[:200]
        content = user + "|" + asst
    else:
        content = json.dumps(record)[:500]
    return hashlib.md5(content.encode()).hexdigest()
|
||||
|
||||
|
||||
def load_and_convert(source_name: str, meta: dict, ratio: float) -> list:
    """Load one JSONL source, convert each record, and apply a sampling ratio.

    Args:
        source_name: Label used in warning messages.
        meta: Source description with ``path`` (relative to PROJECT_ROOT),
            ``format`` (a key of CONVERTERS) and optionally ``optional``.
        ratio: Sampling multiplier.  ``> 1`` oversamples (whole copies plus a
            random sample for the fractional part), ``< 1`` downsamples to at
            least one record, ``<= 0`` excludes the source, ``1`` keeps the
            records unchanged.

    Returns:
        The converted (and resampled) records; an empty list when the file
        is missing, yields no usable records, or the ratio is not positive.

    Blank lines are ignored.  Lines that are not valid JSON, or that the
    converter cannot handle, are skipped and reported in a single warning.
    Sampling uses the global ``random`` state, so seed it for reproducibility.

    NOTE: oversampling works by repeating records, so a caller that
    de-duplicates the returned list afterwards undoes the oversampling.
    """
    path = PROJECT_ROOT / meta["path"]
    if not path.exists():
        # Optional sources are allowed to be absent without any noise.
        if not meta.get("optional"):
            print(f" WARNING: {path} not found, skipping {source_name}")
        return []

    converter = CONVERTERS[meta["format"]]
    records = []
    skipped = 0

    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                converted = converter(json.loads(line))
            except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
                # AttributeError covers records whose input/output is not a dict.
                skipped += 1
                continue
            if converted:
                records.append(converted)

    if skipped:
        print(f" WARNING: skipped {skipped} unreadable record(s) in {source_name}")

    # Nothing to sample from, or the source was explicitly switched off.
    if not records or ratio <= 0:
        return []

    if ratio > 1.0:
        full_copies = int(ratio)
        extra_count = int(len(records) * (ratio - full_copies))
        oversampled = records * full_copies
        if extra_count:
            oversampled.extend(random.sample(records, extra_count))
        records = oversampled
    elif ratio < 1.0:
        # Keep at least one record so a small source never vanishes entirely.
        keep = max(1, int(len(records) * ratio))
        records = random.sample(records, keep)

    return records
|
||||
|
||||
|
||||
def _parse_ratio_overrides(spec: str) -> dict:
    """Parse a ``name=value,name=value`` string into ``{name: float}``.

    Empty segments (for example from a trailing comma) are ignored.  Raises
    ValueError when a segment has no ``=``, no name, or a non-numeric value.
    """
    overrides = {}
    for segment in spec.split(","):
        segment = segment.strip()
        if not segment:
            continue
        name, sep, value = segment.partition("=")
        if not sep or not name.strip():
            raise ValueError(f"expected name=value, got {segment!r}")
        overrides[name.strip()] = float(value)
    return overrides


def _unique_records(records: list, seen: set) -> list:
    """Return the records whose dedup key is not yet in *seen*, updating it."""
    fresh = []
    for record in records:
        key = dedup_key(record)
        if key not in seen:
            seen.add(key)
            fresh.append(record)
    return fresh


def _apply_mix_ratio(records: list, ratio: float) -> list:
    """Over- or down-sample *records* by *ratio* using the global RNG."""
    if not records or ratio <= 0:
        return []
    if ratio > 1.0:
        whole = int(ratio)
        extra = int(len(records) * (ratio - whole))
        return records * whole + random.sample(records, extra)
    if ratio < 1.0:
        # Keep at least one record so a small source never vanishes entirely.
        return random.sample(records, max(1, int(len(records) * ratio)))
    return list(records)


def _count_records(path: Path) -> int:
    """Count the non-blank lines of *path*; 0 when the file does not exist."""
    if not path.exists():
        return 0
    with open(path, "rb") as f:
        return sum(1 for line in f if line.strip())


def main():
    """Merge every dataset source into one shuffled, training-ready JSONL.

    Steps: parse the CLI, load and convert each named source and (unless
    ``--no-include-raw`` is given) each raw training file, drop duplicates
    across all sources, apply the per-source mix ratio, shuffle, and write
    the result to ``--output`` (skipped with ``--dry-run``).

    De-duplication runs BEFORE the mix ratios are applied.  Oversampling
    works by repeating records, so de-duplicating afterwards would strip
    every repeated copy and silently cancel any ratio above 1.
    """
    parser = argparse.ArgumentParser(description="Merge datasets for Mortdecai training")
    parser.add_argument("--output", type=Path,
                        default=PROJECT_ROOT / "data" / "processed" / "merged_training_v06.jsonl")
    parser.add_argument("--ratios", default="",
                        help="Override ratios: seed=2.0,tool=1.0,iglu=0.5")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print stats without writing output")
    parser.add_argument("--include-chat-logs", action="store_true",
                        help="Include chat app training exports")
    parser.add_argument("--include-raw", action="store_true", default=True,
                        help="Include raw training files (default: true)")
    # --include-raw defaults to True and is store_true, so on its own it can
    # never be switched off; this companion flag provides the off switch.
    parser.add_argument("--no-include-raw", dest="include_raw", action="store_false",
                        help="Skip the raw training files")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducibility")
    args = parser.parse_args()

    random.seed(args.seed)

    # Parse ratio overrides
    try:
        ratio_overrides = _parse_ratio_overrides(args.ratios)
    except ValueError as exc:
        parser.error(f"invalid --ratios value: {exc}")

    # Filter sources
    active_sources = dict(SOURCES)
    if not args.include_chat_logs:
        active_sources.pop("chat_logs", None)

    print("Mortdecai Dataset Merge Pipeline")
    print("=" * 60)
    print()

    for name in ratio_overrides:
        if name not in active_sources:
            print(f" WARNING: --ratios names unknown or inactive source '{name}', ignoring")

    merged = []
    seen = set()          # dedup keys already accepted, shared by all sources
    converted_total = 0   # records converted, before dedup and ratios
    unique_total = 0      # records left after dedup, before ratios

    # Load named sources.  Each is loaded unsampled (ratio 1.0), de-duplicated
    # against everything accepted so far, and only then resampled.
    for name, meta in active_sources.items():
        ratio = ratio_overrides.get(name, meta["default_ratio"])
        converted = load_and_convert(name, meta, 1.0)
        unique = _unique_records(converted, seen)
        sampled = _apply_mix_ratio(unique, ratio)
        raw_count = _count_records(PROJECT_ROOT / meta["path"])

        converted_total += len(converted)
        unique_total += len(unique)
        merged.extend(sampled)
        print(f" {name:<20s} {raw_count:>6} raw x{ratio:.1f} = {len(sampled):>7} ({meta['description']})")

    # Load raw training files (always at ratio 1.0)
    if args.include_raw:
        raw_converter = CONVERTERS["raw_training"]
        raw_total = 0
        raw_kept = 0
        for filepath in RAW_TRAINING_FILES:
            path = PROJECT_ROOT / filepath
            if not path.exists():
                continue
            converted = []
            with open(path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = raw_converter(json.loads(line))
                    except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
                        # Skip unreadable lines, but let Ctrl-C and real
                        # programming errors propagate.
                        continue
                    if record:
                        converted.append(record)
            unique = _unique_records(converted, seen)
            raw_total += len(converted)
            raw_kept += len(unique)
            merged.extend(unique)
        converted_total += raw_total
        unique_total += raw_kept
        print(f" {'raw_files':<20s} {raw_total:>6} raw x1.0 = {raw_kept:>7} ({len(RAW_TRAINING_FILES)} files)")

    print()
    print(f" Total before dedup: {converted_total}")
    print(f" Duplicates removed: {converted_total - unique_total}")
    print(f" Total after dedup: {unique_total}")
    print(f" Total after ratios: {len(merged)}")

    # Count format split
    text_count = sum(1 for r in merged if "text" in r)
    conv_count = sum(1 for r in merged if "conversations" in r)
    print(f" Format: {conv_count} conversations, {text_count} pre-formatted text")

    # Shuffle
    random.shuffle(merged)

    if args.dry_run:
        print("\n [DRY RUN] No output written.")
        return

    # Write as UTF-8 explicitly: records are dumped with ensure_ascii=False,
    # which can fail under a non-UTF-8 locale default encoding.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        for r in merged:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print(f"\n Wrote {len(merged)} examples to {args.output}")
    print(f" File size: {args.output.stat().st_size / 1e6:.1f} MB")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user