Files
Seth 5b28002001 0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline
Major changes from this session:

Training:
- 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL
- 7,256 merged training examples (up from 3,183)
- New training data: failure modes (85), midloop messaging (27),
  prompt injection defense (29), personality (32), gold from quarantine
  bank (232), new tool examples (30), Claude's own experience (10)
- All training data RCON-validated at 100% pass rate
- Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56%

Oracle Bot (Mind's Eye):
- Invisible spectator bot (mineflayer) streams world state via WebSocket
- HTML5 Canvas frontend at mind.mortdec.ai
- Real-time tool trace visualization with expandable entries
- Streaming model tokens during inference
- Gateway integration: fire-and-forget POST /trace on every tool call

Reinforcement Learning:
- Gymnasium environment wrapping mineflayer bot (minecraft_env.py)
- PPO training via Stable Baselines3 (10K param policy network)
- Behavioral cloning pretraining (97.5% accuracy on expert policy)
- Infinite training loop with auto-restart and checkpoint resume
- Bot learns combat, survival, navigation from raw experience

Bot Army:
- 8-soldier marching formation with autonomous combat
- Combat bots using mineflayer-pvp, pathfinder, armor-manager
- Multilingual prayer bots via translategemma:27b (18 languages)
- Frame-based AI architecture: LLM planner + reactive micro-scripts

Infrastructure:
- Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser)
- Billing gateway now tracks all LAN traffic (LAN auto-auth)
- Gateway fallback for empty god-mode responses
- Updated mortdec.ai landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 20:22:50 -04:00

343 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
minecraft_env.py — Gymnasium environment wrapping a mineflayer bot.
The bot runs in a Node.js subprocess, communicating via stdin/stdout JSON.
The Python Gym env sends actions and receives observations at ~600ms ticks.
Usage:
    from minecraft_env import MinecraftCombatEnv
    env = MinecraftCombatEnv()
    obs, info = env.reset()
    while True:
        action = env.action_space.sample()  # or policy(obs)
        obs, reward, terminated, truncated, info = env.step(action)
"""
import json
import subprocess
import time
import os
import signal
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from pathlib import Path
# Directory containing the Node.js bot script (rl_bot.js): the "ingame" folder
# inside the directory reached by three .parent hops from this file's
# resolved path.
INGAME_DIR = Path(__file__).resolve().parent.parent.parent / "ingame"
class MinecraftCombatEnv(gym.Env):
    """Minecraft combat survival environment via mineflayer bot.

    The bot is a Node.js subprocess (``rl_bot.js``).  Commands are written to
    its stdin as single text lines; the bot's stdout is scanned for lines that
    parse as JSON objects.  An object containing an ``"hp"`` key is treated as
    a state snapshot and converted into a 13-float observation.
    """

    metadata = {"render_modes": ["human"], "render_fps": 2}

    # Discrete actions: the list index is the Gym action id, the string is the
    # command line sent to the bot.
    ACTIONS = ["forward", "fight", "flee", "eat", "sprint", "idle"]

    # Named action ids used by the reward shaping below.
    _FIGHT = ACTIONS.index("fight")
    _FLEE = ACTIONS.index("flee")
    _EAT = ACTIONS.index("eat")
    _IDLE = ACTIONS.index("idle")

    # Hostile mob types for reward calculation.
    # NOTE(review): not referenced inside this class -- hostility is read from
    # each mob's "hostile" flag in the bot's JSON instead.
    HOSTILE = {
        "zombie", "husk", "skeleton", "creeper", "spider", "cave_spider",
        "witch", "enderman", "drowned", "stray", "phantom", "parched",
        "camel_husk", "slime", "magma_cube",
    }

    # Normalisation constants for the observation vector.
    _OBS_SIZE = 13
    _MAX_HP = 20.0
    _MAX_FOOD = 20.0
    _MAX_MOB_DIST = 24.0
    _MAX_MOB_COUNT = 10.0
    _MAX_Y = 320.0
    _NEARBY_DIST = 6.0  # a hostile closer than this counts as a threat
    _FOOD_KEYWORDS = ("beef", "bread", "pork", "chicken", "apple", "potato", "cod")

    def __init__(
        self,
        host="192.168.0.244",
        port=25568,
        username="RLBot",
        max_steps=600,   # 600 ticks x 0.6s = 6 minutes per episode
        tick_rate=0.6,   # seconds per tick (sword cooldown rate)
        render_mode=None,
    ):
        """Configure the environment; the bot is not started until reset().

        Args:
            host: Server address passed to the bot script.
            port: Server port passed to the bot script.
            username: Bot username passed to the bot script.
            max_steps: Steps after which an episode is truncated.
            tick_rate: Seconds slept after each action is sent.
            render_mode: ``"human"`` prints a status line every step.
        """
        super().__init__()
        self.host = host
        self.port = port
        self.username = username
        self.max_steps = max_steps
        self.tick_rate = tick_rate
        self.render_mode = render_mode

        # Observation space: 13 floats normalized to [0, 1]
        # [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
        #  has_sword, has_armor, has_food, is_day, on_water,
        #  y_level_norm, damage_taken_this_tick, is_fleeing]
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(self._OBS_SIZE,), dtype=np.float32
        )
        # Action space: 6 discrete actions
        self.action_space = spaces.Discrete(len(self.ACTIONS))

        # Internal state
        self.proc = None
        self.step_count = 0
        self.total_reward = 0
        self.kills = 0
        self.prev_hp = 20.0
        self.prev_food = 20
        self.alive = False
        self.last_obs = None

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _num(value, default):
        """Return ``value`` as a float, or ``default`` if it is unusable.

        Unlike ``value or default`` this keeps a legitimate ``0`` (dead,
        starving, y == 0) instead of replacing it with the default.
        """
        if value is None:
            return float(default)
        try:
            number = float(value)
        except (TypeError, ValueError):
            return float(default)
        return number if np.isfinite(number) else float(default)

    def _hostile_distances(self, data):
        """Return the distance of every mob flagged hostile in ``data``.

        A hostile mob without a usable ``"dist"`` is reported as infinitely
        far away, so it still counts as a mob but never as "nearby".
        """
        mobs = data.get("mobs") or []
        return [
            self._num(mob.get("dist"), float("inf"))
            for mob in mobs
            if isinstance(mob, dict) and mob.get("hostile", False)
        ]

    # ------------------------------------------------------------------
    # Subprocess management
    # ------------------------------------------------------------------
    def _start_bot(self):
        """Start the mineflayer bot subprocess, replacing any previous one."""
        # Tear down the old process object even if it has already exited so
        # its pipes are closed rather than leaked.
        if self.proc is not None:
            self._stop_bot()
        bot_script = INGAME_DIR / "rl_bot.js"
        self.proc = subprocess.Popen(
            ["node", str(bot_script), self.host, str(self.port), self.username],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,  # line buffered
        )

    def _stop_bot(self):
        """Stop the bot subprocess (best effort) and release its pipes."""
        proc, self.proc = self.proc, None
        if proc is None:
            return
        try:
            if proc.poll() is None:
                # Ask politely first; the bot gets 3 seconds to exit.
                proc.stdin.write("quit\n")
                proc.stdin.flush()
                proc.wait(timeout=3)
        except (OSError, ValueError, subprocess.TimeoutExpired):
            # Pipe already broken/closed or the bot ignored "quit": force it.
            try:
                proc.kill()
                proc.wait(timeout=3)  # reap, so no zombie is left behind
            except (OSError, subprocess.TimeoutExpired):
                pass
        finally:
            for pipe in (proc.stdin, proc.stdout):
                if pipe is None:
                    continue
                try:
                    pipe.close()
                except (OSError, ValueError):
                    pass

    def _read_json(self, timeout, accept=None):
        """Read stdout lines until a JSON object is found.

        Args:
            timeout: Seconds after which the search is abandoned.  The
                deadline is only checked between lines; ``readline`` itself
                blocks until the bot writes a line or closes its stdout.
            accept: Optional predicate; objects for which it returns false
                are skipped.

        Returns:
            The first accepted JSON object (a dict), or ``None`` on timeout
            or when the bot's stdout reaches end-of-file.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            raw = self.proc.stdout.readline()
            if raw == "":
                # EOF: the bot exited.  Without this check the loop would
                # spin at full speed until the deadline.
                return None
            line = raw.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue  # plain log output, not a message
            if not isinstance(data, dict):
                continue  # valid JSON but not a message object
            if accept is None or accept(data):
                return data
        return None

    def _send(self, cmd):
        """Send a command to the bot and read the JSON response.

        Returns:
            The first JSON object printed after the command, or ``None`` if
            there is no bot process, the pipe is broken/closed, the bot
            exited, or nothing arrived within 5 seconds.
        """
        proc = self.proc
        if proc is None or proc.stdin is None or proc.stdout is None:
            return None
        try:
            proc.stdin.write(cmd + "\n")
            proc.stdin.flush()
            return self._read_json(5.0)
        except (OSError, ValueError):
            # OSError covers BrokenPipeError; ValueError is raised for I/O on
            # an already-closed pipe object.
            return None

    # ------------------------------------------------------------------
    # Observation and reward
    # ------------------------------------------------------------------
    def _parse_observation(self, data):
        """Convert JSON bot state to numpy observation vector.

        Args:
            data: Bot state dict, or ``None``.

        Returns:
            A float32 array of 13 values in [0, 1]; all zeros when ``data``
            is empty or has no ``"hp"`` key.
        """
        if not data or "hp" not in data:
            return np.zeros(self._OBS_SIZE, dtype=np.float32)

        current_hp = self._num(data.get("hp"), 0.0)
        hp = current_hp / self._MAX_HP
        food = self._num(data.get("food"), 0.0) / self._MAX_FOOD

        # Nearest hostile mob
        hostile = self._hostile_distances(data)
        if hostile:
            nearest_dist = min(min(hostile) / self._MAX_MOB_DIST, 1.0)
        else:
            nearest_dist = 1.0  # no mob = max distance
        # The angle is not derived from the bot data yet; 0.5 is a fixed
        # "straight ahead" placeholder.
        nearest_angle = 0.5
        mob_count = min(len(hostile) / self._MAX_MOB_COUNT, 1.0)

        # Inventory flags ("inv" is tested with `in`; presumably a string of
        # item names -- verify against rl_bot.js)
        inv = data.get("inv") or ""
        has_sword = 1.0 if "sword" in inv else 0.0
        has_armor = 1.0 if (data.get("armor") or "none") != "none" else 0.0
        has_food = 1.0 if any(kw in inv for kw in self._FOOD_KEYWORDS) else 0.0

        # World state
        is_day = 1.0 if data.get("day", True) else 0.0
        on_water = 1.0 if data.get("below", "") == "water" else 0.0

        # Y level, clamped to 0..320 and scaled to 0..1 (default 64)
        pos = data.get("pos")
        y = self._num(pos.get("y") if isinstance(pos, dict) else None, 64.0)
        y_norm = min(max(y, 0.0), self._MAX_Y) / self._MAX_Y

        # Damage taken this tick, relative to the HP stored by the last
        # reset()/_calc_reward() call
        prev_hp = self._num(self.prev_hp, self._MAX_HP)
        damage = max(0.0, prev_hp - current_hp) / self._MAX_HP

        # Low-health flag (HP < 5)
        is_fleeing = 1.0 if current_hp < 5 else 0.0

        obs = np.array([
            hp, food, nearest_dist, nearest_angle, mob_count,
            has_sword, has_armor, has_food, is_day, on_water,
            y_norm, damage, is_fleeing,
        ], dtype=np.float32)
        # Keep every component inside the declared Box(0, 1).
        return np.clip(obs, 0.0, 1.0)

    def _calc_reward(self, data, action):
        """Calculate reward from state transition.

        Also updates ``self.kills``, ``self.prev_hp`` and ``self.prev_food``
        so the next call measures the next transition.

        Args:
            data: Bot state dict observed after the action, or ``None``.
            action: Index into ``ACTIONS`` of the action that was taken.

        Returns:
            The shaped reward; ``-100.0`` when ``data`` is unusable.
        """
        if not data or "hp" not in data:
            return -100.0  # lost connection = death equivalent

        hp = self._num(data.get("hp"), 0.0)
        food = int(self._num(data.get("food"), 20))
        prev_hp = self._num(self.prev_hp, self._MAX_HP)
        prev_food = int(self._num(self.prev_food, 20))

        # Survival reward: +1 per tick alive
        reward = 1.0

        # Damage penalty: -2 per HP lost
        damage = max(0.0, prev_hp - hp)
        if damage > 0:
            reward -= damage * 2.0

        # Death penalty
        if hp <= 0 or data.get("died", False):
            reward -= 100.0

        # Kill reward.  A response without "kills" keeps the running count
        # instead of resetting it to 0, which would pay out the same kills
        # again once the counter reappears.
        new_kills = int(self._num(data.get("kills"), self.kills))
        kills_this_tick = new_kills - self.kills
        if kills_this_tick > 0:
            reward += kills_this_tick * 10.0
        self.kills = new_kills

        if action == self._EAT:
            # Eating when hungry: good
            if prev_food < 14 and food > prev_food:
                reward += 5.0
            # Eating when full: wasted action
            if prev_food >= 18:
                reward -= 1.0

        hostile_nearby = any(
            dist < self._NEARBY_DIST for dist in self._hostile_distances(data)
        )
        # Fighting when no mobs nearby: wasted
        if action == self._FIGHT and not hostile_nearby:
            reward -= 0.5
        # Fleeing when HP is low and mobs nearby: good decision
        if action == self._FLEE and hp < 8 and hostile_nearby:
            reward += 3.0
        # Idle penalty (doing nothing when threats exist)
        if action == self._IDLE and hostile_nearby:
            reward -= 2.0

        # Update state
        self.prev_hp = hp
        self.prev_food = food
        return reward

    # ------------------------------------------------------------------
    # Gym API
    # ------------------------------------------------------------------
    def reset(self, seed=None, options=None):
        """Reset the environment: reconnect bot and start new episode.

        Returns:
            ``(obs, info)`` where ``info["raw"]`` is the bot message the
            observation was built from (``None`` if the bot never answered).
        """
        super().reset(seed=seed)
        self._start_bot()

        def spawn_signal(msg):
            return msg.get("event") == "ready" or "hp" in msg

        # Wait for bot to spawn
        try:
            data = self._read_json(30.0, accept=spawn_signal)
        except (OSError, ValueError):
            data = None

        if not data:
            # Fallback: give the bot a moment, then ask for state explicitly
            time.sleep(3)
            data = self._send("observe")
        elif "hp" not in data:
            # A bare "ready" event carries no state; request a snapshot so
            # the first observation is not all zeros.
            data = self._send("observe") or data

        self.step_count = 0
        self.total_reward = 0
        self.kills = int(self._num(data.get("kills"), 0)) if data else 0
        # Set before parsing so the first observation reports zero damage.
        self.prev_hp = self._num(data.get("hp"), 20.0) if data else 20.0
        self.prev_food = int(self._num(data.get("food"), 20)) if data else 20
        self.alive = True

        obs = self._parse_observation(data)
        self.last_obs = obs
        info = {"raw": data}
        return obs, info

    def step(self, action):
        """Execute one action and return (obs, reward, terminated, truncated, info).

        The episode terminates when the bot reports ``hp <= 0`` or ``died``,
        and also when no usable state could be obtained (lost connection),
        so the -100 penalty for that case is applied once rather than on
        every remaining step.
        """
        self.step_count += 1
        action = int(action)
        action_name = self.ACTIONS[action]

        # Send action to bot
        data = self._send(action_name)
        # Wait for game tick
        time.sleep(self.tick_rate)

        # Use the action's own response if it already carries state,
        # otherwise ask for a snapshot.
        if data is None or "hp" not in data:
            obs_data = self._send("observe")
        else:
            obs_data = data

        connection_lost = not obs_data or "hp" not in obs_data

        # Observation first: it reads prev_hp, which _calc_reward overwrites.
        obs = self._parse_observation(obs_data)
        reward = self._calc_reward(obs_data, action)
        self.total_reward += reward

        # Check termination
        if connection_lost:
            terminated = True
        else:
            terminated = (
                self._num(obs_data.get("hp"), 0.0) <= 0
                or bool(obs_data.get("died", False))
            )
        if terminated:
            self.alive = False

        # Check truncation (max steps)
        truncated = self.step_count >= self.max_steps

        info = {
            "raw": obs_data,
            "step": self.step_count,
            "total_reward": self.total_reward,
            "kills": self.kills,
            "alive": self.alive,
            "connection_lost": connection_lost,
        }
        self.last_obs = obs

        if self.render_mode == "human":
            self.render()

        return obs, reward, terminated, truncated, info

    def render(self):
        """Print current state (a one-line summary of the last observation)."""
        if self.last_obs is None:
            return
        hp = self.last_obs[0] * 20
        food = self.last_obs[1] * 20
        mob_dist = self.last_obs[2] * 24
        # round() guards against float32 error turning e.g. 7 into 6.
        mob_count = int(round(float(self.last_obs[4]) * 10))
        print(f" Step {self.step_count}: HP={hp:.0f} Food={food:.0f} "
              f"Mobs={mob_count}@{mob_dist:.0f}b Kills={self.kills} "
              f"R={self.total_reward:.1f}")

    def close(self):
        """Clean up."""
        self._stop_bot()