#!/usr/bin/env python3
"""
validate_distilled.py — Execute distilled Claude responses on a live server via RCON.

Takes distilled.jsonl, executes each example's commands on the dev server,
captures RCON results, and writes validated training pairs to output.

This creates the strongest training signal: input → Claude's commands → actual server result.

Usage:
    python3 training/scripts/validate_distilled.py                    # run all
    python3 training/scripts/validate_distilled.py --dry-run          # preview
    python3 training/scripts/validate_distilled.py --max 10           # first 10 only
    python3 training/scripts/validate_distilled.py --rcon-host 192.168.0.244 --rcon-port 25578
    python3 training/scripts/validate_distilled.py --no-reset-between # keep effects between examples
"""

import argparse
import json
import re
import sys
import time
from pathlib import Path

try:
    from mcrcon import MCRcon
except ImportError:
    # --dry-run and the pure classification helpers do not need RCON, so the
    # missing dependency is only reported once a connection is required.
    MCRcon = None

ROOT = Path(__file__).resolve().parent.parent.parent
DISTILLED = ROOT / "data" / "processed" / "distilled.jsonl"
OUTPUT = ROOT / "data" / "processed" / "validated_distilled.jsonl"

# RCON error patterns
RCON_ERRORS = [
    re.compile(r'Unknown or incomplete command', re.I),
    re.compile(r'Incorrect argument', re.I),
    re.compile(r'Expected .+ at position', re.I),
    re.compile(r'Unknown item', re.I),
    re.compile(r'Unknown item component', re.I),
    re.compile(r'Invalid or unknown', re.I),
    re.compile(r"Can't find element", re.I),
    re.compile(r'Expected whitespace', re.I),
]

# Expected "failures" that are actually fine (no player online, no entity, unloaded chunks)
BENIGN_ERRORS = [
    re.compile(r'No player was found', re.I),
    re.compile(r'No entity was found', re.I),
    re.compile(r'That position is not loaded', re.I),
]


def is_real_error(result: str) -> bool:
    """Check if RCON result is a real syntax/command error (not just missing player).

    A result that matches any BENIGN_ERRORS pattern is never a real error,
    even when it also matches one of the RCON_ERRORS patterns.
    """
    if is_benign_error(result):
        return False
    return any(pat.search(result) for pat in RCON_ERRORS)


def is_benign_error(result: str) -> bool:
    """Check if error is benign (would work with a player online)."""
    return any(bp.search(result) for bp in BENIGN_ERRORS)


def _result_record(command, result, real_error, benign_error):
    """Build one per-command result dict; the result text is capped at 200 chars."""
    return {
        "command": command,
        "result": result[:200],
        "success": not real_error,
        "benign_error": benign_error,
        "real_error": real_error,
    }


def execute_commands(commands: list, rcon_host: str, rcon_port: int, rcon_pass: str) -> list:
    """Execute commands via RCON and return one result dict per command.

    Each dict has the keys "command", "result" (truncated to 200 characters),
    "success", "benign_error" and "real_error". This function does not raise
    for RCON problems: a command that raises is recorded as a real error, and
    a failure to open the connection appends a single "(connection failed)"
    record.
    """
    if MCRcon is None:
        return [_result_record("(connection failed)",
                               "the 'mcrcon' package is not installed", True, False)]

    results = []
    try:
        with MCRcon(rcon_host, rcon_pass, port=rcon_port) as rcon:
            for cmd in commands:
                try:
                    result = rcon.command(cmd)
                    results.append(_result_record(
                        cmd, result, is_real_error(result), is_benign_error(result)))
                    # Small pause so the server is not flooded with commands.
                    time.sleep(0.2)
                except Exception as e:
                    results.append(_result_record(cmd, str(e), True, False))
    except Exception as e:
        results.append(_result_record("(connection failed)", str(e), True, False))
    return results


def _build_parser():
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(description="Validate distilled responses via RCON")
    parser.add_argument("--input", default=str(DISTILLED))
    parser.add_argument("--output", default=str(OUTPUT))
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    parser.add_argument("--max", type=int, default=0, help="Max examples to process (0=all)")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--reset-between", action="store_true", default=True,
                        help="Clear effects between examples")
    # --reset-between defaults to True, so it needs an explicit off switch.
    parser.add_argument("--no-reset-between", dest="reset_between", action="store_false",
                        help="Do not clear effects between examples")
    return parser


def _load_examples(path, limit):
    """Read non-blank JSONL lines from path; keep the first `limit` when limit > 0."""
    with open(path, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]
    return examples[:limit] if limit > 0 else examples


def _get_commands(example):
    """Return the example's command list, or [] when it is missing or null."""
    output = example.get("output")
    if not isinstance(output, dict):
        return []
    return output.get("commands") or []


def _get_user_message(example):
    """Return the example's user message, or "" when it is missing or not a string."""
    payload = example.get("input")
    message = payload.get("user_message") if isinstance(payload, dict) else None
    return message if isinstance(message, str) else ""


def _summarize(results):
    """Return (status, real_errors, benign_errors, successes) for one example."""
    real_errors = sum(1 for r in results if r["real_error"])
    benign = sum(1 for r in results if r["benign_error"])
    successes = sum(1 for r in results if r["success"])
    if real_errors == 0:
        status = "valid"
    elif real_errors < len(results):
        status = "partial"
    else:
        status = "invalid"
    return status, real_errors, benign, successes


def _reset_effects(host, port, password):
    """Best-effort `effect clear @a`; a failure is reported but never aborts the run."""
    try:
        with MCRcon(host, password, port=port) as rcon:
            rcon.command("effect clear @a")
    except Exception as e:
        print(f" [warn] effect reset failed: {e}")
    time.sleep(0.5)


def _print_summary(stats, output):
    """Print the end-of-run statistics."""
    print(f"\n{'='*60}")
    print(f"Validation complete")
    print(f" Total: {stats['total']}")
    print(f" All valid: {stats['all_success']} ({stats['all_success']/max(stats['total'],1)*100:.1f}%)")
    print(f" Partial: {stats['partial']}")
    print(f" All failed: {stats['all_fail']}")
    print(f" No commands: {stats['no_cmds']}")
    print(f" Real errors: {stats['real_errors']}")
    print(f" Benign errors: {stats['benign_errors']} (would work with player online)")
    print(f" Output: {output}")


def main(argv=None):
    """Validate every distilled example over RCON and write the tagged examples.

    Args:
        argv: Optional list of command-line arguments; None means sys.argv[1:].

    Exits with status 1 when the RCON connection test fails or the mcrcon
    package is unavailable. With --dry-run nothing is executed or written.
    """
    args = _build_parser().parse_args(argv)
    examples = _load_examples(args.input, args.max)

    print(f"Validating {len(examples)} distilled examples")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
    print(f"Output: {args.output}")

    if args.dry_run:
        total_cmds = sum(len(_get_commands(ex)) for ex in examples)
        print(f"\n[DRY RUN] Would execute {total_cmds} commands across {len(examples)} examples")
        return

    if MCRcon is None:
        print("RCON FAILED: the 'mcrcon' package is not installed")
        sys.exit(1)

    # Test RCON
    try:
        with MCRcon(args.rcon_host, args.rcon_pass, port=args.rcon_port) as rcon:
            print(f"RCON OK: {rcon.command('list')}")
    except Exception as e:
        print(f"RCON FAILED: {e}")
        sys.exit(1)

    validated = []
    stats = {"total": 0, "all_success": 0, "partial": 0, "all_fail": 0,
             "no_cmds": 0, "real_errors": 0, "benign_errors": 0}
    status_counter = {"valid": "all_success", "partial": "partial", "invalid": "all_fail"}

    for i, ex in enumerate(examples):
        commands = _get_commands(ex)
        full_msg = _get_user_message(ex)
        msg = full_msg[:50]  # truncated for display only
        # Detect the mode on the full message so a late "pray" is not missed.
        mode = "god" if "pray" in full_msg.lower() or ex.get("source") == "prayer_log" else "sudo"

        stats["total"] += 1

        if not commands:
            stats["no_cmds"] += 1
            # Still valid — refusal or info-only response
            ex["rcon_validation"] = {"status": "no_commands", "results": []}
            validated.append(ex)
            print(f" [{i+1}/{len(examples)}] ({mode}) {msg:50} [no cmds — kept]")
            continue

        # Execute
        results = execute_commands(commands, args.rcon_host, args.rcon_port, args.rcon_pass)
        status, real_errors, benign, successes = _summarize(results)

        stats["real_errors"] += real_errors
        stats["benign_errors"] += benign
        stats[status_counter[status]] += 1

        # Tag the example with validation results
        ex["rcon_validation"] = {
            "status": status,
            "results": results,
            "real_errors": real_errors,
            "benign_errors": benign,
            "successes": successes,
        }
        validated.append(ex)

        flag = ""
        if real_errors > 0:
            flag = f" [FAIL:{real_errors}]"
            # Show first real error
            first_bad = next(r for r in results if r["real_error"])
            flag += f" {first_bad['command'][:30]}→{first_bad['result'][:40]}"
        elif benign > 0:
            flag = f" [benign:{benign}]"

        print(f" [{i+1}/{len(examples)}] ({mode}) {msg:50} [{successes}/{len(results)} ok]{flag}")

        # Reset effects between examples
        if args.reset_between and mode == "god":
            _reset_effects(args.rcon_host, args.rcon_port, args.rcon_pass)

        time.sleep(0.3)

    # Write output
    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be explicit.
    with open(out_path, "w", encoding="utf-8") as f:
        for ex in validated:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    _print_summary(stats, args.output)


if __name__ == "__main__":
    main()