#!/usr/bin/env python3
"""
self_play.py — Multi-tier self-play training data generator.

Three tiers of self-play, each teaching different skills:

Tier 1 — Command drills:
    Feed known prompts, execute commands via RCON, validate syntax.
    Teaches: accurate command generation.
    Usage: --tier 1 --rounds 50

Tier 2 — Single-shot self-critique:
    Model generates BOTH the prompt AND response in one call.
    Teaches: edge-case awareness, self-evaluation.
    Usage: --tier 2 --rounds 50

Tier 3 — Adversarial self-play:
    Session A generates a challenging prompt. Fresh Session B responds.
    RCON validates. Model can't cheat by knowing both sides.
    Teaches: robustness, generalization, error correction.
    Usage: --tier 3 --rounds 50

All tiers: --tier all --rounds 50 (runs ~17 rounds of each)

No API cost — runs entirely on the local model with RCON as ground truth.
"""

import argparse
import functools
import json
import os
import re
import random
import socket
import struct
import sys
import time
from pathlib import Path

try:
    import requests
except ImportError:  # main() refuses to run without it; pure helpers still import
    requests = None

ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))

OUTPUT = ROOT / "data" / "processed" / "self_play.jsonl"

# Greedy "first { to last }" capture used to salvage a JSON object from
# model output that wraps it in extra prose.
_JSON_OBJECT_RE = re.compile(r'\{[\s\S]*\}')


# --- RCON ---

# NOTE(review): Minecraft RCON carries no status code, so success has to be
# inferred from the reply text. This marker list is a reconstruction (the
# original function body was lost) — confirm against the server's messages.
_RCON_ERROR_MARKERS = (
    "unknown or incomplete command",
    "incorrect argument for command",
    "unknown command",
    "<--[here]",
    "an unexpected error occurred",
)


def rcon_command(cmd, host, port, password):
    """Execute via RCON, return (success, result_text).

    NOTE(review): everything after the opening ``struct.pack("<`` of the
    original implementation was lost; this body is rebuilt from the standard
    Source-RCON framing (little-endian length, request id, type, payload,
    two NUL bytes) and needs to be checked against the original behaviour.

    Transport problems (refused connection, timeout, bad frame, failed
    login) are reported as ``(False, <description>)`` rather than raised.
    """
    try:
        # The context manager closes the socket on every path.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(5)
            s.connect((host, port))

            def send(rid, ptype, payload):
                data = (
                    struct.pack("<ii", rid, ptype)
                    + payload.encode("utf-8")
                    + b"\x00\x00"
                )
                s.sendall(struct.pack("<i", len(data)) + data)

            def recv_exact(count):
                chunks = []
                while count > 0:
                    chunk = s.recv(count)
                    if not chunk:
                        raise ConnectionError("RCON connection closed")
                    chunks.append(chunk)
                    count -= len(chunk)
                return b"".join(chunks)

            def recv():
                (length,) = struct.unpack("<i", recv_exact(4))
                if length < 10:  # id + type + two terminators
                    raise ValueError(f"malformed RCON frame (length {length})")
                data = recv_exact(length)
                rid, _ptype = struct.unpack("<ii", data[:8])
                return rid, data[8:-2].decode("utf-8", errors="replace")

            send(1, 3, password)  # type 3 = login
            rid, _ = recv()
            if rid == -1:  # the server answers a bad password with id -1
                return False, "RCON authentication failed"

            send(2, 2, cmd)  # type 2 = run command
            # Only the first reply frame is read; very long outputs that the
            # server splits across frames are truncated.
            _, text = recv()
    except (OSError, struct.error, ValueError) as e:
        return False, f"RCON error: {e}"

    lowered = text.lower()
    success = not any(marker in lowered for marker in _RCON_ERROR_MARKERS)
    return success, text


def llm_call(model, system, user, ollama_url, temperature=0.7, max_tokens=512, fmt=None):
    """Send one system+user exchange to the local model and return its text.

    NOTE(review): the original definition was lost; only its final two
    statements (strip the think block, ``return content.strip()``) survived.
    The parameter names and order come from the call sites in this file.
    The defaults, the ``/api/chat`` endpoint, the payload layout and the
    timeout are assumptions — confirm before relying on them.

    Raises whatever the HTTP layer raises; callers treat any exception as a
    failed generation.
    """
    if requests is None:
        raise RuntimeError("the 'requests' package is required for llm_call")

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": max_tokens},
    }
    if fmt:
        payload["format"] = fmt

    resp = requests.post(f"{ollama_url.rstrip('/')}/api/chat", json=payload, timeout=120)
    resp.raise_for_status()
    content = (resp.json().get("message") or {}).get("content") or ""

    # Drop the model's thinking preamble. NOTE(review): the tag names were
    # stripped from the source; <think>…</think> is assumed.
    content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
    return content.strip()


# --- Prompt generation categories ---

_SUDO_OR_PRAY = (
    'Every message must start with "sudo " or "pray ". '
    "Return a JSON array of strings."
)
_SUDO_ONLY = (
    'Every message must start with "sudo ". '
    "Return a JSON array of strings."
)

EXPLORATION_CATEGORIES = {
    "enchantment_combos": {
        "instruction": (
            "Generate 5 Minecraft chat messages that test unusual or edge-case "
            "enchantment combinations. "
            "Include: mutually exclusive enchants, max level exceeded, enchants "
            "on wrong items, multi-enchant syntax. "
            + _SUDO_OR_PRAY
        ),
        "temperature": 1.0,
    },
    "entity_nbt": {
        "instruction": (
            "Generate 5 Minecraft chat messages that test entity spawning with "
            "unusual NBT data. "
            "Include: custom names, baby variants, colored sheep, armored mobs, "
            "riding/passengers, powered creepers. "
            + _SUDO_OR_PRAY
        ),
        "temperature": 1.0,
    },
    "execute_chains": {
        "instruction": (
            "Generate 5 Minecraft chat messages that require complex execute "
            "command chains. "
            "Include: nested execute, conditional execution, store results, "
            "dimension switching, targeting by gamemode/team. "
            + _SUDO_ONLY
        ),
        "temperature": 1.0,
    },
    "edge_items": {
        "instruction": (
            "Generate 5 Minecraft chat messages requesting obscure or "
            "easily-confused items. "
            "Include: items with color variants, items that changed names "
            "between versions, items with underscores, items people misspell "
            '(like "wooden_sword" vs "wood_sword", "cooked_beef" vs "steak"). '
            + _SUDO_ONLY
        ),
        "temperature": 1.0,
    },
    "worldedit": {
        "instruction": (
            "Generate 5 Minecraft chat messages requesting WorldEdit operations. "
            "Include: shapes, selections, replacements, brushes, stacking, "
            "clipboard operations. "
            + _SUDO_ONLY
        ),
        "temperature": 1.0,
    },
    "multiplayer": {
        "instruction": (
            "Generate 5 Minecraft chat messages involving multiple players or "
            "complex selectors. "
            "Include: @a with exclusions, team commands, scoreboard operations, "
            "targeting by distance/gamemode. "
            "Use player names like: slingshooter08, SwiftWolf, DarkWolf, BraveWolf. "
            + _SUDO_ONLY
        ),
        "temperature": 1.0,
    },
    "boundary_testing": {
        "instruction": (
            "Generate 5 Minecraft chat messages that test safety boundaries. "
            "Include: requests for forbidden items, mass destruction, OP "
            "commands, but also requests that SEEM dangerous but are actually "
            "fine (like giving TNT to yourself, or killing your own mobs). "
            + _SUDO_OR_PRAY
        ),
        "temperature": 1.0,
    },
    "natural_language": {
        "instruction": (
            "Generate 5 Minecraft chat messages phrased in unusual or creative "
            "natural language. "
            "Include: typos, slang, roleplay, indirect requests, questions, "
            "sarcasm, mixed languages. "
            "The AI should still be able to figure out what the player wants. "
            + _SUDO_OR_PRAY
        ),
        "temperature": 1.2,
    },
    "cosmetic_effects": {
        "instruction": (
            "Generate 5 Minecraft chat messages requesting cosmetic or dramatic "
            "effects. "
            "Include: particles, sounds, titles, tellraw formatting, fireworks, "
            "combination effects. "
            + _SUDO_OR_PRAY
        ),
        "temperature": 1.0,
    },
}

# System prompts
# NOTE(review): "effect give minecraft: ." looks like angle-bracket
# placeholders were stripped from this text; it is kept exactly as found
# because the original wording cannot be recovered from this file.
SUDO_SYSTEM = (
    "You are a Minecraft 1.21 command translator. "
    'Return JSON: {"commands": ["cmd1", ...], "reasoning": "why"} '
    "Commands use minecraft: prefix. "
    "Enchantments: item[enchantments={name:level}]. "
    "Effects: effect give minecraft: . "
    "Do NOT start commands with /. "
    "Player name: slingshooter08."
)

GOD_SYSTEM = (
    "You are God in a Minecraft server. "
    'Return JSON: {"message": "dramatic response", "commands": ["cmd1", ...], '
    '"reasoning": "why"} '
    "Commands use minecraft: prefix. "
    "Be dramatic but use valid 1.21 syntax. "
    "Player: slingshooter08."
)

RETRY_SYSTEM = (
    "You are a Minecraft 1.21 command translator. "
    "Your previous command failed with an error. "
    "Analyze the error and return a corrected command. "
    'Return JSON: {"commands": ["corrected_cmd"], '
    '"reasoning": "what was wrong and how you fixed it"}'
)


# --- Shared helpers ---

def _parse_json_object(raw):
    """Return the JSON object found in *raw*, or None if there is none.

    Tries the whole string first, then the outermost ``{...}`` span. Any
    JSON value that is not an object (list, string, number) counts as a
    failure so callers can use ``.get`` on the result safely.
    """
    if not isinstance(raw, str):
        return None
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        match = _JSON_OBJECT_RE.search(raw)
        if not match:
            return None
        try:
            parsed = json.loads(match.group())
        except json.JSONDecodeError:
            return None
    return parsed if isinstance(parsed, dict) else None


def _as_command_list(value):
    """Normalise a model-supplied ``commands`` value to a list of strings.

    A bare string becomes a one-element list (iterating it would otherwise
    send one RCON command per character); non-string entries are dropped.
    """
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [c for c in value if isinstance(c, str) and c.strip()]


def _as_text(value):
    """Return *value* as a string, mapping any falsy value to ''."""
    if not value:
        return ""
    return value if isinstance(value, str) else str(value)


def _execute_commands(commands, rcon_host, rcon_port, rcon_pass):
    """Run each command via RCON; return (per-command results, all_success)."""
    rcon_results = []
    for cmd in commands:
        success, rcon_result = rcon_command(cmd, rcon_host, rcon_port, rcon_pass)
        rcon_results.append({"command": cmd, "success": success, "result": rcon_result})
    return rcon_results, all(r["success"] for r in rcon_results)


# --- Tier 1: Command drills ---

@functools.lru_cache(maxsize=1)
def _load_drill_examples(seed_path):
    """Load the ``command_gen`` seed examples that carry a usable prompt.

    Cached so the seed file is parsed once per run instead of once per
    drill. Unreadable files and malformed lines yield fewer (or zero)
    examples rather than an exception; a read failure is reported once.
    """
    examples = []
    try:
        with open(seed_path, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    ex = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Only drill command_gen examples
                if not isinstance(ex, dict) or ex.get("category") != "command_gen":
                    continue
                inp = ex.get("input")
                prompt = inp.get("user_message") if isinstance(inp, dict) else None
                if isinstance(prompt, str) and prompt.strip():
                    examples.append(ex)
    except OSError as e:
        print(f" [!] Could not read seed dataset {seed_path}: {e}")
    return tuple(examples)


def run_tier1_drill(model, ollama_url, rcon_host, rcon_port, rcon_pass, max_retries=2):
    """Pick a random prompt from seed dataset, generate commands, validate via RCON.

    Only ``command_gen`` examples are drilled. The choice is made among
    those examples directly, so a drill is skipped (returns None) only when
    the seed dataset is missing or contains no drillable example.

    Returns the trace from ``attempt_command`` with ``tier`` set to 1 and
    the seed example's reference commands under ``original_commands``.
    """
    seed_path = ROOT / "data" / "processed" / "seed_dataset.jsonl"
    candidates = _load_drill_examples(seed_path)
    if not candidates:
        return None

    ex = random.choice(candidates)
    prompt = ex["input"]["user_message"]

    trace = attempt_command(model, ollama_url, prompt, rcon_host, rcon_port, rcon_pass, max_retries)
    trace["tier"] = 1
    expected = ex.get("output")
    trace["original_commands"] = expected.get("commands", []) if isinstance(expected, dict) else []
    return trace


# --- Tier 2: Single-shot self-critique ---

# NOTE(review): same apparently-stripped "effect give minecraft: ." text as
# in SUDO_SYSTEM; kept exactly as found.
SELF_CRITIQUE_SYSTEM = (
    "You are a Minecraft 1.21 AI training data generator AND command translator. "
    "Your task: generate a challenging Minecraft player request, then respond "
    "to it yourself. "
    "Focus on edge cases you might get wrong: unusual items, complex "
    "enchantments, execute chains, ambiguous phrasing. "
    "Return JSON: { "
    '"generated_prompt": "the player request you invented '
    "(must start with 'sudo ' or 'pray ')\", "
    '"difficulty": "what makes this tricky", '
    '"commands": ["cmd1", "cmd2"], '
    '"reasoning": "why these commands are correct", '
    '"message": "God message if pray, empty string if sudo" '
    "} "
    "Commands use minecraft: prefix. "
    "Enchantments: item[enchantments={name:level}]. "
    "Effects: effect give minecraft: . "
    "Player: slingshooter08. "
    "Do NOT start commands with /."
)


def run_tier2_selfcritique(model, ollama_url, rcon_host, rcon_port, rcon_pass, category=None):
    """Model generates a prompt AND responds in one shot, then RCON validates.

    Returns a trace dict (``tier`` 2, one attempt, never self-corrected), or
    None when the model call fails, its output holds no JSON object, or no
    prompt was produced. A response with no commands is recorded as a
    success without touching RCON.
    """
    focus = ""
    if category:
        focus = f"\nFocus area: {category}. Generate a prompt specifically testing {category}."

    try:
        raw = llm_call(
            model=model,
            system=SELF_CRITIQUE_SYSTEM + focus,
            user="Generate one challenging Minecraft request and your response. Be creative — pick something you might get wrong.",
            ollama_url=ollama_url,
            temperature=0.9,
            max_tokens=500,
            fmt="json",
        )
    except Exception:  # best-effort: any LLM/transport failure skips this sample
        return None

    result = _parse_json_object(raw)
    if result is None:
        return None

    prompt = result.get("generated_prompt", "")
    if not isinstance(prompt, str) or not prompt:
        return None

    commands = _as_command_list(result.get("commands"))
    message = _as_text(result.get("message"))
    reasoning = _as_text(result.get("reasoning"))
    difficulty = _as_text(result.get("difficulty"))

    trace = {
        "prompt": prompt,
        "mode": "god" if prompt.lower().startswith("pray ") else "sudo",
        "tier": 2,
        "difficulty_note": difficulty,
        "attempts": [],
        "final_success": False,
        "self_corrected": False,
    }

    if commands:
        # Validate via RCON
        rcon_results, all_success = _execute_commands(commands, rcon_host, rcon_port, rcon_pass)
    else:
        rcon_results, all_success = [], True

    trace["attempts"].append({
        "commands": commands,
        "reasoning": reasoning,
        "message": message,
        "rcon_results": rcon_results,
        "all_success": all_success,
    })
    trace["final_success"] = all_success
    return trace


# --- Tier 3: Adversarial self-play (original generate_prompts + attempt_command) ---

def generate_prompts(model, ollama_url, category=None):
    """Use the model to generate edge-case prompts for itself.

    With *category* set, only that category is used; otherwise up to three
    categories are sampled at random. Returns a list of
    ``{"prompt": str, "category": str}`` dicts; a category whose generation
    or parsing fails is reported and contributes nothing.
    """
    if category:
        chosen = [category]
    else:
        # Pick 2-3 random categories per round
        chosen = random.sample(list(EXPLORATION_CATEGORIES.keys()), min(3, len(EXPLORATION_CATEGORIES)))

    prompts = []
    for cat_name in chosen:
        try:
            cat_config = EXPLORATION_CATEGORIES[cat_name]
            raw = llm_call(
                model=model,
                system="You are a Minecraft test case generator. Generate diverse edge cases for an AI training pipeline.",
                user=cat_config["instruction"],
                ollama_url=ollama_url,
                temperature=cat_config["temperature"],
                max_tokens=400,
            )
            # Parse JSON array
            cleaned = raw.replace("```json", "").replace("```", "").strip()
            match = re.search(r'\[[\s\S]*\]', cleaned)
            if not match:
                continue
            prompts.extend(
                {"prompt": item.strip(), "category": cat_name}
                for item in json.loads(match.group())
                if isinstance(item, str) and item.strip()
            )
        except Exception as e:  # one bad category must not abort the round
            print(f" [!] Prompt generation failed for {cat_name}: {e}")

    return prompts


def attempt_command(model, ollama_url, prompt, rcon_host, rcon_port, rcon_pass, max_retries=2):
    """
    Model generates a command for the prompt, executes via RCON.
    On error, model self-corrects up to max_retries times.
    Returns the full interaction trace.

    The trace holds ``prompt``, ``mode`` ("god" for prompts starting with
    "pray ", else "sudo"), the list of ``attempts``, ``final_success`` and
    ``self_corrected``. A response with no commands counts as a valid
    refusal/info answer. LLM or parsing failures on the first attempt are
    recorded as an attempt carrying an ``error`` key; on a retry they simply
    end the correction loop.
    """
    mode = "god" if prompt.lower().startswith("pray ") else "sudo"
    system = GOD_SYSTEM if mode == "god" else SUDO_SYSTEM

    trace = {
        "prompt": prompt,
        "mode": mode,
        "attempts": [],
        "final_success": False,
        "self_corrected": False,
    }

    # First attempt
    try:
        raw = llm_call(model, system, prompt, ollama_url, temperature=0.3, max_tokens=300, fmt="json")
    except Exception as e:  # boundary around network + third-party code
        trace["attempts"].append({"commands": [], "error": f"LLM failed: {e}"})
        return trace

    result = _parse_json_object(raw)
    if result is None:
        trace["attempts"].append({
            "commands": [],
            "error": "JSON parse failed: no JSON object in model output",
        })
        return trace

    commands = _as_command_list(result.get("commands"))
    message = _as_text(result.get("message"))
    reasoning = _as_text(result.get("reasoning"))

    if not commands:
        trace["attempts"].append({
            "commands": [],
            "reasoning": reasoning,
            "message": message,
            "rcon_results": [],
            "all_success": True,
        })
        trace["final_success"] = True  # Refusal/info is valid
        return trace

    # Execute commands via RCON
    rcon_results, all_success = _execute_commands(commands, rcon_host, rcon_port, rcon_pass)
    trace["attempts"].append({
        "commands": commands,
        "reasoning": reasoning,
        "message": message,
        "rcon_results": rcon_results,
        "all_success": all_success,
    })

    if all_success:
        trace["final_success"] = True
        return trace

    # Self-correction loop
    for retry in range(max_retries):
        # Build error context for the model
        error_context = "\n".join(
            f"Command: {r['command']}\nError: {r['result']}"
            for r in rcon_results
            if not r["success"]
        )
        retry_prompt = f"Original request: {prompt}\n\nFailed commands:\n{error_context}\n\nPlease fix the commands."

        # A failed call must end the loop. Falling back to the previous
        # response here would re-run the commands that just failed.
        try:
            raw = llm_call(model, RETRY_SYSTEM, retry_prompt, ollama_url, temperature=0.2, max_tokens=300, fmt="json")
        except Exception:
            break

        result = _parse_json_object(raw)
        if result is None:
            break

        commands = _as_command_list(result.get("commands"))
        reasoning = _as_text(result.get("reasoning"))
        if not commands:
            break

        # Execute corrected commands
        rcon_results, all_success = _execute_commands(commands, rcon_host, rcon_port, rcon_pass)
        trace["attempts"].append({
            "commands": commands,
            "reasoning": reasoning,
            "rcon_results": rcon_results,
            "all_success": all_success,
            "retry": retry + 1,
        })

        if all_success:
            trace["final_success"] = True
            trace["self_corrected"] = True
            break

    return trace


def _example_id(prefix):
    """Build an example id: prefix, current epoch second, 3 random digits."""
    return f"{prefix}-{int(time.time())}-{random.randint(0,999):03d}"


def _tool_exchange(rcon_results):
    """Render RCON results as alternating assistant tool-call / tool messages.

    The call payload is serialised with ``json.dumps`` so commands that
    contain quotes or backslashes (common in NBT) still yield valid JSON.
    NOTE(review): the payload is wrapped only in newlines; wrapper tags may
    have been stripped from the source — confirm the expected chat format.
    """
    messages = []
    for r in rcon_results:
        call = json.dumps(
            {"name": "rcon.execute", "arguments": {"command": r["command"]}},
            ensure_ascii=False,
        )
        messages.append({"role": "assistant", "content": f"\n{call}\n"})
        messages.append({
            "role": "tool",
            "content": json.dumps({"success": r["success"], "result": r["result"]}),
        })
    return messages


def trace_to_training(trace):
    """Convert a self-play trace to training examples.

    * One successful attempt → a single prompt/commands training pair.
    * A self-corrected trace (two or more attempts) → one multi-turn
      example replaying the first (failed) attempt and the final
      (successful) attempt as tool calls, followed by the final answer.
    * Anything else (no attempts, or an unrecovered failure) → ``[]``.
    """
    attempts = trace["attempts"]
    if not attempts:
        return []

    prompt = trace["prompt"]
    mode = trace["mode"]

    # Single successful attempt → standard training pair
    if trace["final_success"] and len(attempts) == 1:
        att = attempts[0]
        return [{
            "id": _example_id("selfplay"),
            "source": "self_play",
            "category": "command_gen",
            "input": {
                "user_message": prompt,
                "server_context": {"server_type": "paper", "version": "1.21.x"},
            },
            "output": {
                "reasoning": att.get("reasoning", ""),
                "commands": att.get("commands", []),
                "message": att.get("message", "") if mode == "god" else "",
                "safety_flags": [],
            },
            "metadata": {
                "difficulty": "medium",
                "validated": True,
                "risk_level": 3,
                "rcon_verified": True,
                "self_play": True,
            },
        }]

    # Self-corrected → multi-turn tool-calling training example
    if trace["self_corrected"] and len(attempts) >= 2:
        first, last = attempts[0], attempts[-1]

        messages = [
            {"role": "system", "content": GOD_SYSTEM if mode == "god" else SUDO_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        messages.extend(_tool_exchange(first.get("rcon_results", [])))  # failed attempt
        messages.extend(_tool_exchange(last.get("rcon_results", [])))   # successful retry

        # Final response
        final_response = {
            "commands": last.get("commands", []),
            "reasoning": f"Self-corrected: {first.get('reasoning', '')} → {last.get('reasoning', '')}",
        }
        if mode == "god":
            # Retry attempts carry no message, so the first attempt's is used.
            final_response["message"] = first.get("message", "")
        messages.append({"role": "assistant", "content": json.dumps(final_response)})

        return [{
            "id": _example_id("selfplay-correction"),
            "source": "self_play",
            "type": "error_correction",
            "messages": messages,
            "metadata": {
                "self_play": True,
                "rcon_verified": True,
                "attempts": len(attempts),
            },
        }]

    return []


# --- CLI driver ---

def _append_examples(output_path, examples):
    """Append *examples* to the JSONL file at *output_path* (UTF-8)."""
    if not examples:
        return
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: records are written with ensure_ascii=False and can
    # contain characters (e.g. "→") the platform default codec rejects.
    with open(output_path, "a", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in examples)


def _tally(trace, stats, failure_note=""):
    """Update the global counters for *trace*, print its verdict, return it.

    The verdict is "ok" (succeeded first try), "corrected" or "failed".
    """
    if trace.get("self_corrected"):
        stats["self_corrected"] += 1
        print(f" CORRECTED ({len(trace['attempts'])} attempts)")
        return "corrected"
    if trace.get("final_success"):
        stats["success_first_try"] += 1
        n_cmds = len(trace["attempts"][0].get("commands", []))
        print(f" OK ({n_cmds} cmds)")
        return "ok"
    stats["failed"] += 1
    print(f" FAILED{failure_note}")
    return "failed"


def _persist(trace, stats, output_path):
    """Convert *trace* to training examples and append them right away."""
    examples = trace_to_training(trace)
    _append_examples(output_path, examples)
    stats["training_examples"] += len(examples)


def _run_tier1_round(args, stats, output_path):
    """Command drill: pick random seed examples, try to execute them."""
    if args.dry_run:
        print(" [DRY RUN] Would drill a random seed prompt via RCON")
        return

    for _ in range(5):  # 5 drills per round
        trace = run_tier1_drill(
            args.model, args.ollama_url,
            args.rcon_host, args.rcon_port, args.rcon_pass,
            args.max_retries,
        )
        if trace is None:
            continue

        stats["attempts"] += 1
        stats["by_tier"][1] += 1
        prompt = trace["prompt"]
        print(f" [drill] {prompt[:55]:55}", end="")
        _tally(trace, stats)
        _persist(trace, stats, output_path)
        time.sleep(0.5)


def _run_tier2_round(args, stats, output_path):
    """Self-critique: model generates prompt + response, RCON validates."""
    if args.focus:
        cats = [args.focus]
    else:
        cats = random.sample(list(EXPLORATION_CATEGORIES.keys()), min(3, len(EXPLORATION_CATEGORIES)))

    for cat in cats:
        if args.dry_run:
            print(f" [DRY RUN] Would self-critique on {cat}")
            continue

        trace = run_tier2_selfcritique(
            args.model, args.ollama_url,
            args.rcon_host, args.rcon_port, args.rcon_pass,
            category=cat,
        )
        if trace is None:
            continue

        stats["attempts"] += 1
        stats["by_tier"][2] += 1
        prompt = trace["prompt"]
        diff = trace.get("difficulty_note", "")[:30]
        print(f" [self-critique:{cat[:12]}] {prompt[:40]:40} ({diff})", end="")
        _tally(trace, stats, failure_note=" (self-generated bad commands)")
        _persist(trace, stats, output_path)
        time.sleep(1)


def _run_tier3_round(args, stats, output_path):
    """Adversarial: generate prompts in Session A, respond in fresh Session B."""
    prompts = generate_prompts(args.model, args.ollama_url, args.focus)
    if not prompts:
        print(" No prompts generated, skipping round")
        return

    stats["prompts_generated"] += len(prompts)
    print(f" Generated {len(prompts)} adversarial prompts")

    for p in prompts:
        prompt = p["prompt"]
        cat = p["category"]
        cat_stats = stats["by_category"].setdefault(cat, {"total": 0, "success": 0, "corrected": 0})
        cat_stats["total"] += 1
        stats["attempts"] += 1
        stats["by_tier"][3] += 1

        print(f" [adversarial:{cat[:12]}] {prompt[:48]:48}", end="")

        if args.dry_run:
            print(" [DRY RUN]")
            continue

        trace = attempt_command(
            args.model, args.ollama_url, prompt,
            args.rcon_host, args.rcon_port, args.rcon_pass,
            max_retries=args.max_retries,
        )

        verdict = _tally(trace, stats)
        if verdict == "ok":
            cat_stats["success"] += 1
        elif verdict == "corrected":
            cat_stats["corrected"] += 1

        _persist(trace, stats, output_path)
        time.sleep(1)


def _print_summary(stats, output):
    """Print the end-of-run statistics."""
    print(f"\n{'='*60}")
    print(f"Self-play complete")
    print(f" Rounds: {stats['rounds']}")
    print(f" Prompts generated:{stats['prompts_generated']}")
    print(f" Attempts: {stats['attempts']}")
    print(f" Success (1st try):{stats['success_first_try']}")
    print(f" Self-corrected: {stats['self_corrected']}")
    print(f" Failed: {stats['failed']}")
    print(f" Training examples:{stats['training_examples']}")

    labels = {1: "Command drills", 2: "Self-critique", 3: "Adversarial"}
    print(f"\n By tier:")
    for t in sorted(stats["by_tier"]):
        print(f" Tier {t} ({labels[t]:16}): {stats['by_tier'][t]} attempts")

    if stats["by_category"]:
        print(f"\n By category:")
        for cat, s in sorted(stats["by_category"].items()):
            total = s["total"]
            ok = s["success"]
            corr = s["corrected"]
            fail = total - ok - corr
            print(f" {cat:25} total={total} ok={ok} corrected={corr} failed={fail}")

    print(f"\n Output: {output}")


def main():
    """Parse CLI options, run the requested self-play rounds, print a summary.

    Rounds cycle through the selected tiers. Training examples are appended
    to the output file as soon as each trace is converted, so an
    interrupted run keeps what it already produced.
    """
    parser = argparse.ArgumentParser(description="Self-play training data generator")
    parser.add_argument("--model", default="qwen3-8b-mc-lora-v3")
    parser.add_argument("--ollama-url", default="http://192.168.0.141:11434")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    parser.add_argument("--rounds", type=int, default=50)
    parser.add_argument("--tier", default="all", choices=["1", "2", "3", "all"])
    parser.add_argument("--focus", default=None, choices=list(EXPLORATION_CATEGORIES.keys()))
    parser.add_argument("--output", default=str(OUTPUT))
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--max-retries", type=int, default=2)
    args = parser.parse_args()

    # Fail fast: inside the tiers a missing HTTP client would only surface
    # as a stream of "LLM failed" traces.
    if requests is None:
        parser.error("the 'requests' package is required (pip install requests)")

    tiers = [1, 2, 3] if args.tier == "all" else [int(args.tier)]

    print(f"Self-play training data generator")
    print(f" Model: {args.model}")
    print(f" RCON: {args.rcon_host}:{args.rcon_port}")
    print(f" Rounds: {args.rounds}")
    print(f" Tiers: {tiers}")
    print(f" Focus: {args.focus or 'all categories'}")
    print(f" Max retries: {args.max_retries}")
    print(f" Output: {args.output}")

    stats = {
        "rounds": 0,
        "prompts_generated": 0,
        "attempts": 0,
        "success_first_try": 0,
        "self_corrected": 0,
        "failed": 0,
        "training_examples": 0,
        "by_tier": {1: 0, 2: 0, 3: 0},
        "by_category": {},
    }

    run_round = {1: _run_tier1_round, 2: _run_tier2_round, 3: _run_tier3_round}
    output_path = Path(args.output)

    try:
        for round_num in range(args.rounds):
            tier = tiers[round_num % len(tiers)]
            print(f"\n--- Round {round_num + 1}/{args.rounds} [Tier {tier}] ---")
            run_round[tier](args, stats, output_path)
            stats["rounds"] += 1
    except KeyboardInterrupt:
        print("\n [!] Interrupted — examples written so far are kept")

    # Summary
    _print_summary(stats, args.output)


if __name__ == "__main__":
    main()