Add baseline assistant with tools, guardrails, and system prompts (Phase 1.4)

- agent/serve.py: CLI assistant with interactive, single-query, and eval modes (Ollama + qwen3-coder)
- agent/tools/rcon_tool.py: RCON execute, server status, player info
- agent/tools/knowledge_tool.py: TF-IDF RAG search, command reference lookup, server context
- agent/guardrails/command_filter.py: 19-prefix allowlist, execute-tail bypass detection, destructive flags, 1.21 syntax warnings, audit log
- agent/prompts/system_prompts.py: sudo (pure commands), god (persona), intervention (benign) system prompts
- Guardrails tested: 10/10 allowlist, 5/6 syntax warnings pass
This commit is contained in:
2026-03-18 02:12:20 -04:00
parent 77efac0283
commit e00d454b19
10 changed files with 815 additions and 12 deletions
View File
+138
View File
@@ -0,0 +1,138 @@
"""
Safety guardrails for Minecraft command execution.
Provides:
- Command allowlist filtering
- Destructive action detection
- Syntax validation hints
- Audit logging
"""
import json
import re
import time
from pathlib import Path
from typing import Dict, Any, List, Tuple
# Allowlist of command prefixes the assistant may execute; any command that
# does not start with one of these is blocked. Each prefix ends with a space,
# so a prefix match requires the command word to be followed by a space.
ALLOWED_PREFIXES = [
    # Player items, effects, experience, movement
    'give ',
    'effect ',
    'xp ',
    'tp ',
    'teleport ',
    # World state and command wrapping
    'time ',
    'weather ',
    'execute ',
    # Entities and messaging
    'kill ',
    'summon ',
    'tellraw ',
    # World border and block editing
    'worldborder ',
    'fill ',
    'setblock ',
    'clone ',
    # Player mode and data
    'gamemode ',
    'data ',
    'scoreboard ',
    'clear ',
]
# Commands that require explicit confirmation before execution.
# The kill/worldborder patterns match either at the start of the command or
# directly after a "run" keyword, so wrapping the command in
# "execute ... run" does not hide it from destructive detection.
DESTRUCTIVE_PATTERNS = [
    re.compile(r'(?:^|\brun\s+)kill\s+@a\b'),  # kill all players
    re.compile(r'(?:^|\brun\s+)kill\s+@e\b'),  # kill all entities
    re.compile(r'\bfill\b.*\bair\b'),  # filling with air (clearing)
    re.compile(r'(?:^|\brun\s+)worldborder\s+set\s+[01]\b'),  # border to 0 or 1
    re.compile(r'\btnt\b', re.I),  # TNT-related (destructive)
    re.compile(r'\bfire\b.*\breplace\b', re.I),  # fire fill
]
# (pattern, hint) pairs: when a pattern is found in a command, the paired
# hint describes what is invalid in 1.21 syntax and what to use instead.
SYNTAX_WARNINGS = [
    # Pre-component NBT enchantment list
    (
        re.compile(r'\{Enchantments:\['),
        'Old NBT enchantment syntax. Use item[enchantments={name:level}] in 1.21+.',
    ),
    # "effect <target> minecraft:..." without the give/clear subcommand
    (
        re.compile(r'^effect\s+(?!give\b|clear\b)\S+\s+minecraft:'),
        'Missing "give" subcommand. Use "effect give <target> <effect>".',
    ),
    # Weather names that are not accepted (matched case-insensitively)
    (
        re.compile(r'^weather\s+(storm|rainstorm|thunderstorm)', re.I),
        'Invalid weather value. Use: clear, rain, thunder.',
    ),
    # Camel-cased command name
    (
        re.compile(r'^gameMode\b'),
        '"gameMode" is not valid. Use lowercase "gamemode".',
    ),
    # Numeric gamemode argument
    (
        re.compile(r'^gamemode\s+[0-3]\b'),
        'Numeric gamemodes not valid in JE. Use: survival, creative, adventure, spectator.',
    ),
    # Single-letter gamemode argument
    (
        re.compile(r'^gamemode\s+[scaSCA]\b'),
        'Abbreviated gamemodes not valid in JE. Use full words.',
    ),
    # summon <entity> <x> <y> <z> followed by a trailing number
    (
        re.compile(r'summon\s+\S+\s+\S+\s+\S+\s+\S+\s+\d+$'),
        'Cannot append count to summon. Each summon creates exactly one entity.',
    ),
    # Legacy "fire 0 replace" block metadata
    (
        re.compile(r'fire\s+0\s+replace'),
        'Legacy fire metadata "0". Use minecraft:fire without metadata in 1.21+.',
    ),
]
# Audit log file: data/raw/audit_log.jsonl under the directory two levels
# above the directory that contains this module.
AUDIT_LOG_PATH = Path(__file__).resolve().parent.parent.parent.joinpath(
    'data', 'raw', 'audit_log.jsonl'
)
def validate_command(command: str) -> Dict[str, Any]:
    """
    Validate a command against the allowlist and syntax checks.

    Surrounding whitespace and one leading '/' are removed before checking.
    Nested "execute ... run" wrappers are unwrapped to any depth, and the
    innermost command must itself start with a non-execute allowlisted
    prefix. Syntax warnings are collected for every command, including
    blocked ones, so the hint is still available when the allowlist rejects
    the command.

    Returns:
    {
    'command': str,            # normalized command text
    'allowed': bool,
    'destructive': bool,       # only evaluated for allowed commands
    'warnings': [str],
    'blocked_reason': str or None,
    }
    """
    cmd = command.strip()
    if cmd.startswith('/'):
        cmd = cmd[1:]

    result = {
        'command': cmd,
        'allowed': False,
        'destructive': False,
        # Collected up front: hints such as the "gameMode" casing warning
        # apply to commands that the case-sensitive allowlist rejects, so
        # they would never be reported if computed after the allowlist check.
        'warnings': [warning for pattern, warning in SYNTAX_WARNINGS if pattern.search(cmd)],
        'blocked_reason': None,
    }

    # Check allowlist
    if not cmd.startswith(tuple(ALLOWED_PREFIXES)):
        result['blocked_reason'] = f'Command prefix not in allowlist. Allowed: {", ".join(p.strip() for p in ALLOWED_PREFIXES[:10])}...'
        return result
    result['allowed'] = True

    # Check for execute-wrapped bypass. Unwrap every "execute ... run" layer
    # rather than a fixed number of them: with a depth cap, one extra layer
    # of nesting would leave the tail starting with "execute " and skip the
    # inner check entirely. The loop terminates because each pass strictly
    # shortens the tail.
    tail = cmd
    while tail.startswith('execute '):
        idx = tail.find(' run ')
        if idx < 0:
            # No "run" clause: nothing is executed by this wrapper.
            break
        tail = tail[idx + 5:].strip()
    if tail and not tail.startswith('execute '):
        inner_prefixes = tuple(p for p in ALLOWED_PREFIXES if p != 'execute ')
        if not tail.startswith(inner_prefixes):
            result['allowed'] = False
            result['blocked_reason'] = f'Unsafe execute tail: {tail[:50]}'
            return result

    # Check destructive patterns
    result['destructive'] = any(pattern.search(cmd) for pattern in DESTRUCTIVE_PATTERNS)
    return result
def filter_commands(commands: List[str]) -> Tuple[List[str], List[Dict[str, Any]]]:
    """
    Validate every command and pick out the ones that may run.

    Returns (safe_commands, validation_results). validation_results holds one
    validate_command() result per input, in input order. safe_commands holds
    the 'command' value of each result whose 'allowed' flag is true;
    destructive commands are kept in that list and are only flagged in their
    validation result.
    """
    validations = [validate_command(raw) for raw in commands]
    permitted = [check['command'] for check in validations if check['allowed']]
    return permitted, validations
def audit_log(entry: Dict[str, Any]) -> None:
    """
    Append an entry to the audit log.

    The entry is written to AUDIT_LOG_PATH as one JSON line with a
    'timestamp' key set to time.time() (replacing any 'timestamp' already
    present). The parent directory is created if it does not exist.
    The caller's dict is left unmodified.
    """
    # Stamp a copy so the caller's dict is not mutated as a side effect.
    record = {**entry, 'timestamp': time.time()}
    AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding keeps the file independent of the platform default.
    with open(AUDIT_LOG_PATH, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=True) + '\n')