#!/usr/bin/env python3
"""
Unified training data validator for Mortdecai 0.6.0.

Validates ALL training data files:
  - RCON command syntax (live server when available, pattern matching always)
  - System prompt correctness (24-tool list)
  - Format consistency (messages[] chat format)
  - Known bad patterns (@s, enchantment syntax, leading slashes, generic items)
  - Tool call schema compliance
  - Duplicate detection

Modes:
  --check   Dry run, report issues only (default)
  --fix     Auto-fix known issues in place
  --rcon    Enable live RCON validation (requires dev server)

Usage:
  python3 training/scripts/validate_all_training.py --check
  python3 training/scripts/validate_all_training.py --fix --rcon
"""

import argparse
import glob
import json
import logging
import os
import re
import socket
import struct
import sys
import time
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

log = logging.getLogger(__name__)

PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# --- Current 24-tool list ---
CURRENT_TOOLS = [
    "rcon.execute", "minecraft.lookup", "plugin.docs_lookup",
    "world.player_info", "world.server_state", "world.nearby_entities",
    "world.scan_area", "world.redstone_trace", "world.render",
    "server.config",
    "memory.read", "memory.write",
    "journal.read", "journal.write",
    "log.query", "user.ask",
    "script.write", "script.validate", "script.execute", "script.read",
    "script.list", "script.delete", "script.schedule",
    "training.save",
]

# Legacy tool names that should be updated
LEGACY_TOOLS = {
    "minecraft.wiki_lookup": "minecraft.lookup",
    "minecraft.changelog_lookup": "minecraft.lookup",
    "paper.docs_lookup": "minecraft.lookup",
}

# Tools added after the legacy prompts were written; a prompt that lists tools
# at all (i.e. mentions rcon.execute) is expected to mention every one of these.
REQUIRED_NEW_TOOLS = (
    "world.scan_area", "world.redstone_trace", "world.render",
    "training.save", "server.config",
)

# Markers that wrap a tool invocation inside an assistant message.
# NOTE(review): these literals were missing from the copy of the file under
# review (angle-bracket markup had been stripped); confirm they match the
# markers used by the training data.
TOOL_CALL_OPEN = "<tool_call>"
TOOL_CALL_CLOSE = "</tool_call>"

# Known bad patterns in commands: (regex, issue name, human description)
BAD_PATTERNS = [
    (re.compile(r'^/'), "leading_slash", "Commands should not start with /"),
    (re.compile(r'@s\b'), "at_s",
     "@s invalid via RCON (no executor). Auto-fix: @p (imprecise — selects nearest player, not necessarily the requester)"),
    (re.compile(r'\[enchantments=\{'), "enchantment_syntax",
     "Paper RCON rejects [enchantments={...}] component syntax"),
    (re.compile(r'\[potion_contents='), "potion_syntax",
     "Paper RCON rejects [potion_contents={...}] syntax"),
    (re.compile(r'give \S+ minecraft:bed\b'), "generic_bed",
     "minecraft:bed doesn't exist, use minecraft:white_bed"),
    (re.compile(r'give \S+ minecraft:log\b'), "generic_log",
     "minecraft:log doesn't exist, use minecraft:oak_log"),
    (re.compile(r'give \S+ minecraft:wood\b'), "generic_wood",
     "minecraft:wood doesn't exist, use minecraft:oak_planks"),
    (re.compile(r'give \S+ minecraft:boat\b'), "generic_boat",
     "minecraft:boat doesn't exist, use minecraft:oak_boat"),
    (re.compile(r'give \S+ minecraft:steak\b'), "generic_steak",
     "minecraft:steak doesn't exist, use minecraft:cooked_beef"),
    # NOTE(review): this pattern contains literal double quotes, so it can only
    # match a command string that itself includes the quotes. Extracted
    # commands normally do not, so this check may never fire — confirm intent.
    (re.compile(r'"weather clear"'), "weather_no_world",
     "Paper needs world name: weather devworld clear"),
    (re.compile(r'template (search|pick|build)'), "template_cmd",
     "Template commands removed in 0.5.0+"),
]

# Issues whose commands cannot be meaningfully replayed against a live server.
_RCON_SKIP_ISSUES = frozenset({"enchantment_syntax", "potion_syntax", "template_cmd"})

# Player-targeted commands are not sent to RCON (bots not guaranteed online).
_PLAYER_CMD_PREFIXES = ("give ", "tp ", "effect ", "tellraw ", "execute at ")


def _token_swap(old: str, new: str):
    """Return a fixer replacing *old* with *new* only at a word boundary.

    A plain ``str.replace`` would also rewrite longer identifiers that merely
    start with *old* (``minecraft:bed`` inside ``minecraft:bedrock``).
    """
    pattern = re.compile(re.escape(old) + r'\b')
    return lambda cmd: pattern.sub(new, cmd)


# Fix map for auto-repair: issue name -> function(command) -> repaired command
FIXES = {
    "leading_slash": lambda cmd: cmd.lstrip("/"),
    "at_s": _token_swap("@s", "@p"),
    "generic_bed": _token_swap("minecraft:bed", "minecraft:white_bed"),
    "generic_log": _token_swap("minecraft:log", "minecraft:oak_log"),
    "generic_wood": _token_swap("minecraft:wood", "minecraft:oak_planks"),
    "generic_boat": _token_swap("minecraft:boat", "minecraft:oak_boat"),
    "generic_steak": _token_swap("minecraft:steak", "minecraft:cooked_beef"),
}

# ``.*?\}\]`` (rather than ``[^}]+``) lets the component body contain nested
# braces such as ``{levels:{sharpness:5}}``.
_ENCHANTED_GIVE_RE = re.compile(
    r'(give \S+ minecraft:[^\s\[]+)\[enchantments=\{.*?\}\]\s*(\d*)')
_POTION_GIVE_RE = re.compile(
    r'(give \S+ minecraft:[^\s\[]+)\[potion_contents=\{.*?\}\]\s*(\d*)')


# Enchantment fix: replace enchanted give with plain give
def fix_enchantment(cmd: str) -> List[str]:
    """Strip unsupported item components from a ``give`` command.

    * ``give <target> <item>[enchantments={...}] [n]`` becomes
      ``give <target> <item> <n>``.
    * ``give <target> <item>[potion_contents={...}] [n]`` becomes
      ``give <target> minecraft:potion <n>``.

    The count defaults to ``1`` when absent. Commands that match neither shape
    are returned unchanged.

    Returns:
        A list of replacement commands (currently always one element).
    """
    m = _ENCHANTED_GIVE_RE.match(cmd)
    if m:
        return [f"{m.group(1)} {m.group(2) or '1'}"]

    m = _POTION_GIVE_RE.match(cmd)
    if m:
        # Drop only the item token; the target may itself contain brackets
        # (e.g. "@p[distance=..5]") and must be preserved intact.
        give_and_target = m.group(1).rsplit(' ', 1)[0]
        return [f"{give_and_target} minecraft:potion {m.group(2) or '1'}"]

    return [cmd]


# --- RCON helper ---
# NOTE(review): the packet framing below was partly missing from the copy of
# the file under review (the struct format strings had been stripped). It was
# reconstructed from the surviving fragments (12-byte header, two trailing NUL
# bytes, ``len(data) + 8``) and the Source RCON protocol: little-endian
# int32 length / request id / type, auth = type 3, command = type 2.

_RCON_TYPE_AUTH = 3
_RCON_TYPE_COMMAND = 2
_RCON_MAX_PACKET = 65536  # sanity bound; real packets are ~4 KiB at most


def _rcon_send(sock: socket.socket, req_id: int, ptype: int, body: str) -> None:
    """Send one RCON packet carrying *body*."""
    data = body.encode() + b'\x00\x00'
    sock.sendall(struct.pack('<iii', len(data) + 8, req_id, ptype) + data)


def _rcon_recv_exact(sock: socket.socket, size: int) -> bytes:
    """Read exactly *size* bytes; raise ConnectionError if the peer closes."""
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = sock.recv(remaining)
        if not chunk:
            raise ConnectionError("RCON connection closed by server")
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)


def _rcon_recv_packet(sock: socket.socket) -> Tuple[int, int, str]:
    """Read one RCON packet and return ``(request_id, type, body_text)``."""
    (length,) = struct.unpack('<i', _rcon_recv_exact(sock, 4))
    if length < 10 or length > _RCON_MAX_PACKET:
        raise ValueError(f"malformed RCON packet length: {length}")
    payload = _rcon_recv_exact(sock, length)
    req_id, ptype = struct.unpack('<ii', payload[:8])
    return req_id, ptype, payload[8:-2].decode('utf-8', errors='replace')


def rcon_connect(host: str, port: int, password: str) -> Optional[socket.socket]:
    """Open and authenticate an RCON connection.

    Returns:
        The connected socket, or ``None`` when the connection or the
        authentication fails (the socket is closed in that case).
    """
    s = None
    try:
        s = socket.socket()
        s.settimeout(10)
        s.connect((host, port))
        _rcon_send(s, 1, _RCON_TYPE_AUTH, password)
        req_id, _ptype, _body = _rcon_recv_packet(s)
        # The server answers a rejected password with request id -1.
        if req_id == -1:
            raise PermissionError("RCON authentication rejected")
        return s
    except (OSError, ValueError, struct.error) as e:
        log.warning("RCON connect to %s:%s failed: %s", host, port, e)
        if s is not None:
            s.close()
        return None


def rcon_cmd(sock: socket.socket, cmd: str) -> str:
    """Run *cmd* over RCON and return the response text.

    Only the first response packet is read. Any failure is reported as a
    string starting with ``ERROR:`` instead of being raised, so a single bad
    command does not abort a validation run.
    """
    try:
        _rcon_send(sock, 2, _RCON_TYPE_COMMAND, cmd)
        _req_id, _ptype, body = _rcon_recv_packet(sock)
        return body
    except Exception as e:
        return f'ERROR: {e}'


def rcon_validate_cmd(sock: socket.socket, cmd: str) -> Tuple[bool, str]:
    """Validate a command via RCON. Returns (valid, result)."""
    result = rcon_cmd(sock, cmd)
    bad_words = ["unknown", "invalid", "incorrect", "expected whitespace", "error"]
    benign = ["no player", "no entity", "not loaded", "not online"]
    result_lower = result.lower()
    if any(b in result_lower for b in benign):
        return True, result  # command syntax is fine, just no target
    if any(b in result_lower for b in bad_words):
        return False, result
    return True, result


# --- Extract commands from training examples ---

def _assistant_contents(rec: dict):
    """Yield the string content of every assistant message in *rec*."""
    messages = rec.get("messages")
    if not isinstance(messages, list):
        return
    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        content = msg.get("content", "")
        if isinstance(content, str):
            yield content


def _parse_tool_call(content: str) -> Optional[dict]:
    """Return the first tool-call payload in *content* as a dict, else None."""
    if TOOL_CALL_OPEN not in content:
        return None
    inner = content.split(TOOL_CALL_OPEN, 1)[1].split(TOOL_CALL_CLOSE, 1)[0].strip()
    try:
        tc = json.loads(inner)
    except ValueError:
        return None
    return tc if isinstance(tc, dict) else None


def extract_commands(rec: dict) -> List[str]:
    """Extract all RCON commands from a training example (any format).

    Sources, in order: ``rcon.execute`` tool calls and final-answer JSON
    ``commands`` arrays in ``messages[]``, then the old dict format's
    ``output.commands`` (or ``output.commands_generated``). Only non-empty
    string commands are returned; malformed content is skipped.
    """
    commands = []

    # Format 1: messages[] chat format
    for content in _assistant_contents(rec):
        if TOOL_CALL_OPEN in content:
            # Tool calls with rcon.execute
            tc = _parse_tool_call(content)
            if tc is None or tc.get("name") != "rcon.execute":
                continue
            arguments = tc.get("arguments")
            cmd = arguments.get("command", "") if isinstance(arguments, dict) else ""
            if isinstance(cmd, str) and cmd:
                commands.append(cmd)
            continue

        # Final JSON with commands array
        try:
            parsed = json.loads(content)
        except ValueError:
            continue
        # Plain-text answers such as "42" parse to non-dict JSON values.
        if not isinstance(parsed, dict):
            continue
        listed = parsed.get("commands", [])
        if isinstance(listed, list):
            commands.extend(c for c in listed if isinstance(c, str) and c)

    # Format 2: old dict format
    output = rec.get("output", {})
    if isinstance(output, dict):
        listed = output.get("commands", output.get("commands_generated", []))
        if isinstance(listed, list):
            commands.extend(c for c in listed if isinstance(c, str) and c)

    return commands


def extract_tool_calls(rec: dict) -> List[str]:
    """Extract tool names called in a training example.

    A tool call without a ``name`` key is reported as ``"unknown"``; tool-call
    blocks that are not valid JSON objects are skipped.
    """
    tools = []
    for content in _assistant_contents(rec):
        tc = _parse_tool_call(content)
        if tc is not None:
            tools.append(tc.get("name", "unknown"))
    return tools


def check_system_prompt(rec: dict) -> List[str]:
    """Check if system prompt has correct tool list.

    Returns:
        ``[]`` when the record has no messages, ``["no_system_prompt"]`` when
        no non-empty system message exists, otherwise a list of
        ``legacy_tool:<name>`` / ``missing_tool:<name>`` issues.
    """
    issues = []
    messages = rec.get("messages", [])
    if not messages:
        return issues

    sys_content = ""
    for msg in messages:
        if isinstance(msg, dict) and msg.get("role") == "system":
            sys_content = msg.get("content", "")
            break

    if not sys_content:
        return ["no_system_prompt"]

    # Check for outdated tool names
    for old_name in LEGACY_TOOLS:
        if old_name in sys_content:
            issues.append(f"legacy_tool:{old_name}")

    # Check for missing new tools.
    # Only flag if the prompt lists tools at all.
    if "rcon.execute" in sys_content:
        for tool in REQUIRED_NEW_TOOLS:
            if tool not in sys_content:
                issues.append(f"missing_tool:{tool}")

    return issues


# --- Main validation ---

def _apply_command_fixes(cmd: str, issue_names: List[str]) -> Tuple[List[str], int]:
    """Apply every available fix for *issue_names* to *cmd*, in order.

    Fixes are chained: each one operates on the output of the previous one, so
    a command with several issues ends up with all of them repaired.

    Returns:
        ``(replacement_commands, number_of_fixes_that_changed_something)``.
    """
    fixed = [cmd]
    applied = 0
    for name in issue_names:
        if name in FIXES:
            candidate = [FIXES[name](c) for c in fixed]
        elif name in ("enchantment_syntax", "potion_syntax"):
            candidate = [out for c in fixed for out in fix_enchantment(c)]
        else:
            continue  # no automatic repair for this issue
        if candidate != fixed:
            fixed = candidate
            applied += 1
    return fixed, applied


def validate_file(filepath: str, fix_mode: bool, rcon_sock: Optional[socket.socket]) -> dict:
    """Validate a single JSONL file. Returns stats dict.

    Args:
        filepath: Path of the JSONL file to validate.
        fix_mode: When true, repair known command issues and rewrite the file
            in place if at least one fix was applied. Lines that are not JSON
            objects are not carried over into the rewritten file.
        rcon_sock: Authenticated RCON socket for live validation, or ``None``
            to run pattern checks only.
    """
    stats = {
        "file": os.path.basename(filepath),
        "total": 0,
        "valid": 0,
        "issues": Counter(),
        "cmd_issues": Counter(),
        "format_issues": Counter(),
        "fixed": 0,
        "examples_with_issues": 0,
        "rcon_tested": 0,
        "rcon_passed": 0,
        "rcon_failed": 0,
    }

    records = []
    # Read as UTF-8 explicitly: the file is written back as UTF-8 below.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # blank lines are not records
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                stats["format_issues"]["bad_json"] += 1
                continue
            if not isinstance(rec, dict):
                stats["format_issues"]["not_object"] += 1
                continue
            records.append(rec)

    output_records = []
    seen_ids = set()

    for rec in records:
        stats["total"] += 1
        example_issues = []

        # Duplicate check (only hashable scalar ids can be tracked)
        rec_id = rec.get("id", "")
        if rec_id and isinstance(rec_id, (str, int)):
            if rec_id in seen_ids:
                example_issues.append("duplicate_id")
                stats["issues"]["duplicate_id"] += 1
            seen_ids.add(rec_id)

        # Format check
        has_messages = "messages" in rec and isinstance(rec.get("messages"), list)
        has_old_format = "input" in rec and "output" in rec
        if not has_messages and not has_old_format:
            example_issues.append("unknown_format")
            stats["format_issues"]["unknown_format"] += 1
        if has_old_format and not has_messages:
            stats["format_issues"]["old_format"] += 1

        # System prompt check
        for issue in check_system_prompt(rec):
            stats["issues"][issue] += 1
            example_issues.append(issue)

        # Tool call check
        for tool_name in extract_tool_calls(rec):
            if tool_name in LEGACY_TOOLS:
                stats["issues"][f"legacy_tool_call:{tool_name}"] += 1
                example_issues.append(f"legacy_tool_call:{tool_name}")

        # Command validation
        commands = extract_commands(rec)
        fixed_commands = {}  # index → fixed command(s)

        for i, cmd in enumerate(commands):
            matched_issues = [
                name for pattern, name, _desc in BAD_PATTERNS if pattern.search(cmd)
            ]
            for name in matched_issues:
                stats["cmd_issues"][name] += 1
                example_issues.append(name)

            # Auto-fix if in fix mode
            if fix_mode and matched_issues:
                fixed, applied = _apply_command_fixes(cmd, matched_issues)
                if applied:
                    fixed_commands[i] = fixed
                    stats["fixed"] += applied

            # RCON validation — skip commands with unfixable syntax issues
            skip_rcon = any(name in _RCON_SKIP_ISSUES for name in matched_issues)
            if rcon_sock and not skip_rcon:
                test_cmd = fixed_commands[i][0] if i in fixed_commands else cmd
                # Skip player-targeted commands (bots not guaranteed online)
                if not test_cmd.startswith(_PLAYER_CMD_PREFIXES):
                    valid, result = rcon_validate_cmd(rcon_sock, test_cmd)
                    stats["rcon_tested"] += 1
                    if valid:
                        stats["rcon_passed"] += 1
                    else:
                        stats["rcon_failed"] += 1
                        example_issues.append(f"rcon_fail:{result[:60]}")

        if example_issues:
            stats["examples_with_issues"] += 1
        else:
            stats["valid"] += 1

        # Apply fixes to the record if in fix mode
        if fix_mode and fixed_commands:
            rec = _apply_fixes_to_record(rec, commands, fixed_commands)

        output_records.append(rec)

    # Write back if fixing and changes were made
    if fix_mode and stats["fixed"] > 0:
        dropped = (stats["format_issues"]["bad_json"]
                   + stats["format_issues"]["not_object"])
        if dropped:
            log.warning("Dropping %d unparseable line(s) while rewriting %s",
                        dropped, filepath)
        # Write to a sibling temp file and swap it in, so an interrupted run
        # cannot leave a truncated training file behind.
        tmp_path = f"{filepath}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(rec, ensure_ascii=True) + '\n'
                         for rec in output_records)
        os.replace(tmp_path, filepath)
        log.info("Wrote %d fixed examples to %s", stats["fixed"], filepath)

    return stats


def _replace_commands(cmds: list, fix_map: Dict[str, List[str]]) -> list:
    """Return *cmds* with every string found in *fix_map* replaced."""
    replaced = []
    for cmd in cmds:
        if isinstance(cmd, str) and cmd in fix_map:
            replaced.extend(fix_map[cmd])
        else:
            replaced.append(cmd)
    return replaced


def _rewrite_tool_call(content: str, fix_map: Dict[str, List[str]]) -> str:
    """Return *content* with a fixed command in its rcon.execute tool call.

    Text before and after the tool-call block is preserved. *content* is
    returned unchanged when there is nothing to fix.
    """
    tc = _parse_tool_call(content)
    if tc is None or tc.get("name") != "rcon.execute":
        return content
    arguments = tc.get("arguments")
    if not isinstance(arguments, dict):
        return content
    cmd = arguments.get("command", "")
    if not isinstance(cmd, str) or cmd not in fix_map:
        return content

    # A tool call carries a single command, so only the first replacement fits.
    arguments["command"] = fix_map[cmd][0]
    before, _, rest = content.partition(TOOL_CALL_OPEN)
    _inner, _, after = rest.partition(TOOL_CALL_CLOSE)
    return f'{before}{TOOL_CALL_OPEN}\n{json.dumps(tc)}\n{TOOL_CALL_CLOSE}{after}'


def _rewrite_final_json(content: str, fix_map: Dict[str, List[str]]) -> str:
    """Return *content* with fixed entries in its JSON ``commands`` array.

    *content* is returned unchanged when it is not a JSON object with a
    ``commands`` list, or when no command needed replacing.
    """
    try:
        parsed = json.loads(content)
    except ValueError:
        return content
    if not isinstance(parsed, dict) or not isinstance(parsed.get("commands"), list):
        return content
    new_cmds = _replace_commands(parsed["commands"], fix_map)
    if new_cmds == parsed["commands"]:
        return content
    parsed["commands"] = new_cmds
    return json.dumps(parsed, ensure_ascii=True)


def _apply_fixes_to_record(rec: dict, original_commands: List[str],
                           fixed_commands: Dict[int, List[str]]) -> dict:
    """Apply command fixes to all locations in a training record.

    Args:
        rec: The training record; it is modified in place and returned.
        original_commands: Commands as returned by ``extract_commands(rec)``.
        fixed_commands: Maps an index into *original_commands* to the
            replacement command(s) for that entry.
    """
    # Build old→new mapping
    fix_map = {
        original_commands[i]: new_cmds
        for i, new_cmds in fixed_commands.items()
        if i < len(original_commands)
    }
    if not fix_map:
        return rec

    # Fix messages[] format
    messages = rec.get("messages")
    if isinstance(messages, list):
        for msg in messages:
            if not isinstance(msg, dict) or msg.get("role") != "assistant":
                continue
            content = msg.get("content", "")
            if not isinstance(content, str):
                continue
            if TOOL_CALL_OPEN in content:
                # Fix tool_call blocks
                if "rcon.execute" in content:
                    msg["content"] = _rewrite_tool_call(content, fix_map)
            else:
                # Fix final JSON commands array
                msg["content"] = _rewrite_final_json(content, fix_map)

    # Fix old dict format
    output = rec.get("output", {})
    if isinstance(output, dict):
        for key in ("commands", "commands_generated", "commands_executed"):
            if isinstance(output.get(key), list):
                output[key] = _replace_commands(output[key], fix_map)

    return rec


def _pct(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole* (0.0 when *whole* is 0)."""
    return part / whole * 100 if whole else 0.0


def main():
    """Command-line entry point: validate files and print a report."""
    parser = argparse.ArgumentParser(description="Validate all training data")
    parser.add_argument("--check", action="store_true", default=True, help="Dry run (default)")
    parser.add_argument("--fix", action="store_true", help="Auto-fix known issues")
    parser.add_argument("--rcon", action="store_true", help="Enable live RCON validation")
    parser.add_argument("--files", nargs="*", help="Specific files to validate (default: all)")
    # Connection settings default to the dev server the script was written for.
    parser.add_argument("--rcon-host", default="192.168.0.244", help="RCON host")
    parser.add_argument("--rcon-port", type=int, default=25578, help="RCON port")
    parser.add_argument("--rcon-password",
                        default=os.environ.get("RCON_PASSWORD", "REDACTED_RCON"),
                        help="RCON password (default: $RCON_PASSWORD)")
    args = parser.parse_args()

    # Without a handler the log.info/log.warning calls above are never shown.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    # Find all training files
    if args.files:
        files = args.files
    else:
        files = sorted(
            glob.glob(str(PROJECT_ROOT / "data/raw/*.jsonl")) +
            glob.glob(str(PROJECT_ROOT / "data/processed/*.jsonl"))
        )
        # Exclude quarantine and queue files
        files = [f for f in files if "quarantine" not in f and "queue" not in f]

    print(f"Validating {len(files)} files...")

    # RCON connection
    rcon_sock = None
    if args.rcon:
        print(f"Connecting to dev RCON ({args.rcon_host}:{args.rcon_port})...")
        rcon_sock = rcon_connect(args.rcon_host, args.rcon_port, args.rcon_password)
        if rcon_sock:
            print(" RCON connected")
        else:
            print(" RCON connection failed — running without live validation")

    # Validate each file
    all_stats = []
    total_issues = Counter()
    total_cmd_issues = Counter()

    try:
        for filepath in files:
            stats = validate_file(filepath, args.fix, rcon_sock)
            all_stats.append(stats)
            total_issues.update(stats["issues"])
            total_cmd_issues.update(stats["cmd_issues"])
    finally:
        # Close RCON even when a file fails to validate
        if rcon_sock:
            try:
                rcon_sock.close()
            except OSError:
                pass

    # Report
    print(f"\n{'='*70}")
    print("VALIDATION REPORT")
    print(f"{'='*70}")

    total_examples = sum(s["total"] for s in all_stats)
    total_valid = sum(s["valid"] for s in all_stats)
    total_with_issues = sum(s["examples_with_issues"] for s in all_stats)
    total_fixed = sum(s["fixed"] for s in all_stats)

    print(f"\nTotal examples: {total_examples}")
    print(f"Valid: {total_valid} ({_pct(total_valid, total_examples):.1f}%)")
    print(f"With issues: {total_with_issues} ({_pct(total_with_issues, total_examples):.1f}%)")
    if args.fix:
        print(f"Fixed: {total_fixed}")

    # Per-file summary
    print(f"\n{'File':<45} {'Total':>6} {'Valid':>6} {'Issues':>6} {'Rate':>6}")
    print("-" * 75)
    for s in sorted(all_stats, key=lambda x: -x["examples_with_issues"]):
        if s["total"] == 0:
            continue
        rate = f"{s['valid']/s['total']*100:.0f}%"
        print(f"{s['file']:<45} {s['total']:>6} {s['valid']:>6} {s['examples_with_issues']:>6} {rate:>6}")

    # Issue breakdown
    if total_cmd_issues:
        print("\nCommand issues:")
        for issue, count in total_cmd_issues.most_common():
            print(f" {count:>5} {issue}")

    if total_issues:
        print("\nOther issues:")
        for issue, count in total_issues.most_common(20):
            print(f" {count:>5} {issue}")

    # RCON stats
    rcon_tested = sum(s["rcon_tested"] for s in all_stats)
    if rcon_tested:
        rcon_passed = sum(s["rcon_passed"] for s in all_stats)
        rcon_failed = sum(s["rcon_failed"] for s in all_stats)
        print(f"\nRCON validation: {rcon_tested} tested, {rcon_passed} passed, {rcon_failed} failed")

    # Format breakdown
    format_issues = Counter()
    for s in all_stats:
        format_issues.update(s["format_issues"])
    if format_issues:
        print("\nFormat issues:")
        for issue, count in format_issues.most_common():
            print(f" {count:>5} {issue}")


if __name__ == "__main__":
    main()