Add baseline assistant with tools, guardrails, and system prompts (Phase 1.4)

- agent/serve.py: CLI assistant with interactive, single-query, and eval modes (Ollama + qwen3-coder)
- agent/tools/rcon_tool.py: RCON execute, server status, player info
- agent/tools/knowledge_tool.py: TF-IDF RAG search, command reference lookup, server context
- agent/guardrails/command_filter.py: 14-prefix allowlist, execute-tail bypass detection, destructive flags, 1.21 syntax warnings, audit log
- agent/prompts/system_prompts.py: sudo (pure commands), god (persona), intervention (benign) system prompts
- Guardrails tested: 10/10 allowlist, 5/6 syntax warnings pass
2026-03-18 02:12:20 -04:00
parent 77efac0283
commit e00d454b19
10 changed files with 815 additions and 12 deletions
@@ -0,0 +1,138 @@
+"""
+Safety guardrails for Minecraft command execution.
+
+Provides:
+  - Command allowlist filtering
+  - Destructive action detection
+  - Syntax validation hints
+  - Audit logging
+"""
+
+import json
+import re
+import time
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
+
+# Allowlist of command names the assistant may send to the server.
+# Every entry ends with a single space, so only a whole command word that is
+# followed by arguments matches; a command matching none of them is blocked.
+ALLOWED_PREFIXES = [
+    f'{name} '
+    for name in (
+        'give', 'effect', 'xp', 'tp', 'teleport',
+        'time', 'weather', 'execute',
+        'kill', 'summon', 'tellraw',
+        'worldborder', 'fill', 'setblock',
+        'clone', 'gamemode', 'data',
+        'scoreboard', 'clear',
+    )
+]
+
+# Patterns for commands that should get explicit confirmation before
+# execution. Each row is (regex, flags); rows are compiled once at import.
+DESTRUCTIVE_PATTERNS = [
+    re.compile(regex, flags)
+    for regex, flags in (
+        (r'^kill\s+@a\b', 0),                  # kill all players
+        (r'^kill\s+@e\b', 0),                  # kill all entities
+        (r'\bfill\b.*\bair\b', 0),             # filling with air (clearing)
+        (r'^worldborder\s+set\s+[01]\b', 0),   # border to 0 or 1
+        (r'\btnt\b', re.I),                    # TNT-related (destructive)
+        (r'\bfire\b.*\breplace\b', re.I),      # fire fill
+    )
+]
+
+# Known-bad spellings for 1.21 command syntax, paired with the hint to show.
+# Each row is (regex, flags, hint); the list holds (compiled pattern, hint).
+SYNTAX_WARNINGS = [
+    (re.compile(regex, flags), hint)
+    for regex, flags, hint in (
+        (
+            r'\{Enchantments:\[', 0,
+            'Old NBT enchantment syntax. Use item[enchantments={name:level}] in 1.21+.',
+        ),
+        (
+            r'^effect\s+(?!give\b|clear\b)\S+\s+minecraft:', 0,
+            'Missing "give" subcommand. Use "effect give <target> <effect>".',
+        ),
+        (
+            r'^weather\s+(storm|rainstorm|thunderstorm)', re.I,
+            'Invalid weather value. Use: clear, rain, thunder.',
+        ),
+        (
+            r'^gameMode\b', 0,
+            '"gameMode" is not valid. Use lowercase "gamemode".',
+        ),
+        (
+            r'^gamemode\s+[0-3]\b', 0,
+            'Numeric gamemodes not valid in JE. Use: survival, creative, adventure, spectator.',
+        ),
+        (
+            r'^gamemode\s+[scaSCA]\b', 0,
+            'Abbreviated gamemodes not valid in JE. Use full words.',
+        ),
+        (
+            r'summon\s+\S+\s+\S+\s+\S+\s+\S+\s+\d+$', 0,
+            'Cannot append count to summon. Each summon creates exactly one entity.',
+        ),
+        (
+            r'fire\s+0\s+replace', 0,
+            'Legacy fire metadata "0". Use minecraft:fire without metadata in 1.21+.',
+        ),
+    )
+]
+
+# JSON-lines audit file: data/raw/audit_log.jsonl, three directory levels
+# above this module's resolved location.
+AUDIT_LOG_PATH = (
+    Path(__file__).resolve().parent.parent.parent
+    .joinpath('data', 'raw', 'audit_log.jsonl')
+)
+
+
+def validate_command(command: str) -> Dict[str, Any]:
+    """
+    Validate a command against the allowlist and syntax checks.
+
+    Surrounding whitespace and a single leading "/" are removed first. For
+    "execute ... run <inner>" chains the innermost command is unwrapped (at
+    any nesting depth) and must itself start with a non-execute allowlist
+    prefix. Destructive-pattern and syntax-warning checks run against both
+    the full command and that innermost command, so "^"-anchored patterns
+    still fire when the command is wrapped in execute.
+
+    Syntax warnings are collected even when the command ends up blocked, so
+    a rejected spelling such as "gameMode" still carries its hint.
+
+    Args:
+        command: Raw command text, with or without a leading "/".
+
+    Returns:
+        {
+            'command': str,
+            'allowed': bool,
+            'destructive': bool,
+            'warnings': [str],
+            'blocked_reason': str or None,
+        }
+    """
+    cmd = command.strip()
+    if cmd.startswith('/'):
+        cmd = cmd[1:]
+
+    result = {
+        'command': cmd,
+        'allowed': False,
+        'destructive': False,
+        'warnings': [],
+        'blocked_reason': None,
+    }
+
+    # Unwrap nested "execute ... run" layers down to the innermost command.
+    # Every pass strictly shortens `tail`, so the loop terminates without a
+    # depth cap; a cap would let deeper nesting skip the tail check below.
+    # NOTE(review): ' run ' is found by plain substring search, so a quoted
+    # argument that itself contains ' run ' would confuse the unwrap.
+    tail = cmd
+    while tail.startswith('execute '):
+        idx = tail.find(' run ')
+        if idx < 0:
+            break
+        tail = tail[idx + 5:].strip()
+
+    # Texts the pattern checks look at: the full command, plus the unwrapped
+    # inner command when there is one.
+    targets = [cmd] if tail == cmd else [cmd, tail]
+
+    # Syntax warnings come before the allowlist check so that hints for
+    # spellings the allowlist rejects are not lost. Each warning is added at
+    # most once.
+    for pattern, warning in SYNTAX_WARNINGS:
+        if any(pattern.search(text) for text in targets):
+            result['warnings'].append(warning)
+
+    # Check allowlist
+    if not any(cmd.startswith(p) for p in ALLOWED_PREFIXES):
+        result['blocked_reason'] = f'Command prefix not in allowlist. Allowed: {", ".join(p.strip() for p in ALLOWED_PREFIXES[:10])}...'
+        return result
+
+    # Check for execute-wrapped bypass: the command that finally runs must be
+    # allowlisted too. A tail still starting with 'execute ' has no ' run '
+    # clause left, so there is no inner command to vet.
+    if cmd.startswith('execute ') and tail and not tail.startswith('execute '):
+        inner_prefixes = [p for p in ALLOWED_PREFIXES if p != 'execute ']
+        if not any(tail.startswith(p) for p in inner_prefixes):
+            result['blocked_reason'] = f'Unsafe execute tail: {tail[:50]}'
+            return result
+
+    result['allowed'] = True
+
+    # Check destructive patterns on the full command and the inner command.
+    result['destructive'] = any(
+        pattern.search(text)
+        for pattern in DESTRUCTIVE_PATTERNS
+        for text in targets
+    )
+
+    return result
+
+
+def filter_commands(commands: List[str]) -> Tuple[List[str], List[Dict[str, Any]]]:
+    """
+    Validate every command and split out the ones that may be executed.
+
+    Args:
+        commands: Raw command strings, in the order they should be checked.
+
+    Returns:
+        (safe_commands, validation_results). validation_results holds one
+        validate_command() dict per input, in input order. safe_commands
+        holds the normalized 'command' text of each allowed entry;
+        destructive commands stay in it and are only marked in their result.
+    """
+    results = [validate_command(raw) for raw in commands]
+    safe = [verdict['command'] for verdict in results if verdict['allowed']]
+    return safe, results
+
+
+def audit_log(entry: Dict[str, Any]) -> None:
+    """
+    Append an entry to the audit log as one JSON line.
+
+    The written record is a copy of `entry` with 'timestamp' set to
+    time.time() (replacing any existing value). The caller's dict is left
+    unmodified. The log directory is created when missing.
+
+    Args:
+        entry: JSON-serializable mapping describing the event.
+
+    Raises:
+        TypeError: If `entry` holds a value json.dumps cannot serialize.
+        OSError: If the log directory or file cannot be created or written.
+    """
+    record = {**entry, 'timestamp': time.time()}
+    # Serialize first so an unserializable entry fails before any directory
+    # or file is created.
+    line = json.dumps(record, ensure_ascii=True) + '\n'
+    AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with open(AUDIT_LOG_PATH, 'a', encoding='utf-8') as f:
+        f.write(line)