Add baseline assistant with tools, guardrails, and system prompts (Phase 1.4)
- agent/serve.py: CLI assistant with interactive, single-query, and eval modes (Ollama + qwen3-coder)
- agent/tools/rcon_tool.py: RCON execute, server status, player info
- agent/tools/knowledge_tool.py: TF-IDF RAG search, command reference lookup, server context
- agent/guardrails/command_filter.py: 19-prefix allowlist, execute-tail bypass detection, destructive flags, 1.21 syntax warnings, audit log
- agent/prompts/system_prompts.py: sudo (pure commands), god (persona), intervention (benign) system prompts
- Guardrails tested: 10/10 allowlist, 5/6 syntax warnings pass
This commit is contained in:
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
Safety guardrails for Minecraft command execution.
|
||||
|
||||
Provides:
|
||||
- Command allowlist filtering
|
||||
- Destructive action detection
|
||||
- Syntax validation hints
|
||||
- Audit logging
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List, Tuple
|
||||
|
||||
# Allowlist of command prefixes the assistant may execute; any command that
# does not start with one of these is blocked. Each entry keeps its trailing
# space so that a bare prefix match cannot accept a longer, different command
# name. Order is significant: the first ten entries are echoed back in the
# "blocked" message.
ALLOWED_PREFIXES = [
    # player state
    'give ',
    'effect ',
    'xp ',
    'tp ',
    'teleport ',
    # world state
    'time ',
    'weather ',
    # command wrapper (its inner command is validated separately)
    'execute ',
    # entities and messaging
    'kill ',
    'summon ',
    'tellraw ',
    # world editing
    'worldborder ',
    'fill ',
    'setblock ',
    'clone ',
    # misc
    'gamemode ',
    'data ',
    'scoreboard ',
    'clear ',
]
|
||||
|
||||
# Patterns marking a command as destructive, i.e. one that should be
# explicitly confirmed before execution. A single match is enough.
DESTRUCTIVE_PATTERNS = [
    # kill all players
    re.compile(r'^kill\s+@a\b'),
    # kill all entities
    re.compile(r'^kill\s+@e\b'),
    # filling with air (clearing)
    re.compile(r'\bfill\b.*\bair\b'),
    # border to 0 or 1
    re.compile(r'^worldborder\s+set\s+[01]\b'),
    # TNT-related (destructive)
    re.compile(r'\btnt\b', re.IGNORECASE),
    # fire fill
    re.compile(r'\bfire\b.*\breplace\b', re.IGNORECASE),
]
|
||||
|
||||
# (pattern, hint) pairs for command spellings that are not valid 1.21 syntax.
# Every matching pattern contributes its hint to the validation result.
SYNTAX_WARNINGS = [
    (
        re.compile(r'\{Enchantments:\['),
        'Old NBT enchantment syntax. Use item[enchantments={name:level}] in 1.21+.',
    ),
    (
        re.compile(r'^effect\s+(?!give\b|clear\b)\S+\s+minecraft:'),
        'Missing "give" subcommand. Use "effect give <target> <effect>".',
    ),
    (
        re.compile(r'^weather\s+(storm|rainstorm|thunderstorm)', re.IGNORECASE),
        'Invalid weather value. Use: clear, rain, thunder.',
    ),
    (
        re.compile(r'^gameMode\b'),
        '"gameMode" is not valid. Use lowercase "gamemode".',
    ),
    (
        re.compile(r'^gamemode\s+[0-3]\b'),
        'Numeric gamemodes not valid in JE. Use: survival, creative, adventure, spectator.',
    ),
    (
        re.compile(r'^gamemode\s+[scaSCA]\b'),
        'Abbreviated gamemodes not valid in JE. Use full words.',
    ),
    (
        re.compile(r'summon\s+\S+\s+\S+\s+\S+\s+\S+\s+\d+$'),
        'Cannot append count to summon. Each summon creates exactly one entity.',
    ),
    (
        re.compile(r'fire\s+0\s+replace'),
        'Legacy fire metadata "0". Use minecraft:fire without metadata in 1.21+.',
    ),
]
|
||||
|
||||
# JSON-lines audit file, located at <three directories above this file>/data/raw/.
AUDIT_LOG_PATH = Path(__file__).resolve().parent.parent.parent.joinpath(
    'data', 'raw', 'audit_log.jsonl'
)
|
||||
|
||||
|
||||
def validate_command(command: str) -> Dict[str, Any]:
    """
    Validate a command against the allowlist and syntax checks.

    The command is stripped of surrounding whitespace and of one leading
    '/'. If it is an ``execute`` chain, every ``execute ... run`` layer is
    peeled off (no depth limit) and the innermost command must itself be on
    the allowlist. Destructive patterns and syntax hints are matched against
    both the full command and that innermost command, so wrapping a command
    in ``execute`` does not hide it from the '^'-anchored patterns.

    Syntax hints are reported even for blocked commands, so that a
    misspelling which is the very reason for the block (e.g. "gameMode")
    still comes with its hint.

    Args:
        command: Raw command text, with or without a leading '/'.

    Returns:
        {
            'command': str,             # normalized command text
            'allowed': bool,            # passed the allowlist (outer and inner)
            'destructive': bool,        # matched a destructive pattern
            'warnings': [str],          # syntax hints, possibly empty
            'blocked_reason': str or None,
        }
    """
    cmd = command.strip()
    if cmd.startswith('/'):
        cmd = cmd[1:]

    result = {
        'command': cmd,
        'allowed': False,
        'destructive': False,
        'warnings': [],
        'blocked_reason': None,
    }

    # Peel off every "execute ... run" layer to reach the innermost command.
    # Each pass strictly shortens `tail`, so the loop terminates; there is
    # deliberately no depth cap, because a cap lets a deeply nested chain
    # reach the end of the loop with its real command still unchecked.
    # NOTE(review): the split is purely textual. A ' run ' inside a quoted
    # selector/NBT string is treated as a run clause; closing that gap
    # needs a quote-aware tokenizer.
    tail = cmd
    while tail.startswith('execute '):
        idx = tail.find(' run ')
        if idx < 0:
            break
        tail = tail[idx + 5:].strip()

    # Texts the pattern tables are applied to: the full command, plus the
    # unwrapped inner command when it differs.
    targets = (cmd,) if tail == cmd else (cmd, tail)

    # Collect syntax hints before the allowlist decision so that blocked
    # commands carry them too. One hint per pattern, in table order.
    result['warnings'] = [
        warning
        for pattern, warning in SYNTAX_WARNINGS
        if any(pattern.search(text) for text in targets)
    ]

    # Check allowlist
    if not any(cmd.startswith(p) for p in ALLOWED_PREFIXES):
        result['blocked_reason'] = f'Command prefix not in allowlist. Allowed: {", ".join(p.strip() for p in ALLOWED_PREFIXES[:10])}...'
        return result

    # Check for execute-wrapped bypass: the innermost command must be an
    # allowlisted, non-execute command. A tail that still starts with
    # 'execute ' here has no run clause at all and executes nothing.
    if cmd.startswith('execute ') and tail and not tail.startswith('execute '):
        inner_prefixes = [p for p in ALLOWED_PREFIXES if p != 'execute ']
        if not any(tail.startswith(p) for p in inner_prefixes):
            result['blocked_reason'] = f'Unsafe execute tail: {tail[:50]}'
            return result

    result['allowed'] = True

    # Check destructive patterns against the outer and the inner command.
    result['destructive'] = any(
        pattern.search(text)
        for pattern in DESTRUCTIVE_PATTERNS
        for text in targets
    )

    return result
|
||||
|
||||
|
||||
def filter_commands(commands: List[str]) -> Tuple[List[str], List[Dict[str, Any]]]:
    """
    Validate each command and split out the ones that may run.

    Returns a pair ``(safe_commands, validation_results)``:

    - ``validation_results`` holds one ``validate_command`` result per input,
      in input order.
    - ``safe_commands`` holds the normalized text of every command whose
      result has ``allowed`` set. Destructive commands are not removed; they
      are only flagged in their validation result.
    """
    verdicts = [validate_command(raw) for raw in commands]
    permitted = [verdict['command'] for verdict in verdicts if verdict['allowed']]
    return permitted, verdicts
|
||||
|
||||
|
||||
def audit_log(entry: Dict[str, Any]):
    """
    Append an entry to the audit log as one JSON line.

    The current time (``time.time()``) is stored under ``'timestamp'`` in
    ``entry`` itself, so the caller's dict is modified. The log directory is
    created if it does not exist yet.
    """
    entry['timestamp'] = time.time()

    log_dir = AUDIT_LOG_PATH.parent
    log_dir.mkdir(parents=True, exist_ok=True)

    with open(AUDIT_LOG_PATH, 'a') as sink:
        serialized = json.dumps(entry, ensure_ascii=True)
        sink.write(serialized + '\n')
|
||||
Reference in New Issue
Block a user