5b28002001
Major changes from this session: Training: - 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL - 7,256 merged training examples (up from 3,183) - New training data: failure modes (85), midloop messaging (27), prompt injection defense (29), personality (32), gold from quarantine bank (232), new tool examples (30), claude's own experience (10) - All training data RCON-validated at 100% pass rate - Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56% Oracle Bot (Mind's Eye): - Invisible spectator bot (mineflayer) streams world state via WebSocket - HTML5 Canvas frontend at mind.mortdec.ai - Real-time tool trace visualization with expandable entries - Streaming model tokens during inference - Gateway integration: fire-and-forget POST /trace on every tool call Reinforcement Learning: - Gymnasium environment wrapping mineflayer bot (minecraft_env.py) - PPO training via Stable Baselines3 (10K param policy network) - Behavioral cloning pretraining (97.5% accuracy on expert policy) - Infinite training loop with auto-restart and checkpoint resume - Bot learns combat, survival, navigation from raw experience Bot Army: - 8-soldier marching formation with autonomous combat - Combat bots using mineflayer-pvp, pathfinder, armor-manager - Multilingual prayer bots via translategemma:27b (18 languages) - Frame-based AI architecture: LLM planner + reactive micro-scripts Infrastructure: - Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser) - Billing gateway now tracks all LAN traffic (LAN auto-auth) - Gateway fallback for empty god-mode responses - Updated mortdec.ai landing page Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
343 lines
11 KiB
Python
343 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
minecraft_env.py — Gymnasium environment wrapping a mineflayer bot.
|
||
|
||
The bot runs in a Node.js subprocess, communicating via stdin/stdout JSON.
|
||
The Python Gym env sends actions and receives observations at ~600ms ticks.
|
||
|
||
Usage:
|
||
from minecraft_env import MinecraftCombatEnv
|
||
env = MinecraftCombatEnv()
|
||
obs, info = env.reset()
|
||
while True:
|
||
action = env.action_space.sample() # or policy(obs)
|
||
obs, reward, terminated, truncated, info = env.step(action)
|
||
"""
|
||
|
||
import json
|
||
import subprocess
|
||
import time
|
||
import os
|
||
import signal
|
||
import numpy as np
|
||
import gymnasium as gym
|
||
from gymnasium import spaces
|
||
from pathlib import Path
|
||
|
||
# Directory holding the Node.js bot scripts: the "ingame" folder that sits
# two levels above the directory containing this module.
INGAME_DIR = Path(__file__).resolve().parent.parent.parent.joinpath("ingame")
|
||
|
||
|
||
class MinecraftCombatEnv(gym.Env):
    """Minecraft combat survival environment via mineflayer bot.

    The bot is a Node.js subprocess (``rl_bot.js`` under ``INGAME_DIR``).
    Commands are written to its stdin one per line (an action name from
    ``ACTIONS``, ``"observe"`` or ``"quit"``) and JSON objects are read back
    from its stdout one per line.

    Fields read from the bot's JSON state: ``hp``, ``food``, ``mobs`` (a list
    of objects with ``hostile`` and ``dist``), ``inv``, ``armor``, ``day``,
    ``below``, ``pos.y``, ``kills``, ``died`` and ``event``.
    """

    metadata = {"render_modes": ["human"], "render_fps": 2}

    # Discrete actions
    ACTIONS = ["forward", "fight", "flee", "eat", "sprint", "idle"]

    # Indices into ACTIONS used by the reward shaping, derived from the list
    # so they cannot drift out of sync with it.
    _ACT_FIGHT = ACTIONS.index("fight")
    _ACT_FLEE = ACTIONS.index("flee")
    _ACT_EAT = ACTIONS.index("eat")
    _ACT_IDLE = ACTIONS.index("idle")

    # Hostile mob types for reward calculation.
    # NOTE: the methods of this class do not reference this set; they rely on
    # the per-mob "hostile" flag sent by the bot.
    HOSTILE = {
        "zombie", "husk", "skeleton", "creeper", "spider", "cave_spider",
        "witch", "enderman", "drowned", "stray", "phantom", "parched",
        "camel_husk", "slime", "magma_cube",
    }

    # Substrings that mark an inventory entry as edible.
    _FOOD_KEYWORDS = ("beef", "bread", "pork", "chicken", "apple", "potato", "cod")

    # Length of the observation vector.
    _OBS_SIZE = 13

    def __init__(
        self,
        host="192.168.0.244",
        port=25568,
        username="RLBot",
        max_steps=600,  # 600 ticks × 0.6s = 6 minutes per episode
        tick_rate=0.6,  # seconds per tick (sword cooldown rate)
        render_mode=None,
    ):
        """Configure the environment; the bot itself is started by reset().

        Args:
            host: Server address passed to the bot script.
            port: Server port passed to the bot script.
            username: Bot username passed to the bot script.
            max_steps: Number of steps after which an episode is truncated.
            tick_rate: Seconds slept in every step() after sending the action.
            render_mode: ``"human"`` makes step() print a status line.
        """
        super().__init__()
        self.host = host
        self.port = port
        self.username = username
        self.max_steps = max_steps
        self.tick_rate = tick_rate
        self.render_mode = render_mode

        # Observation space: 13 floats normalized to [0, 1]
        # [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
        #  has_sword, has_armor, has_food, is_day, on_water,
        #  y_level_norm, damage_taken_this_tick, is_fleeing]
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(self._OBS_SIZE,), dtype=np.float32
        )

        # Action space: 6 discrete actions
        self.action_space = spaces.Discrete(len(self.ACTIONS))

        # Internal state
        self.proc = None
        self.step_count = 0
        self.total_reward = 0
        self.kills = 0
        self.prev_hp = 20.0
        self.prev_food = 20
        self.alive = False
        self.last_obs = None

    def _start_bot(self):
        """Start the mineflayer bot subprocess, replacing any previous one."""
        # Always tear down the old handle, even if the process already
        # exited, so its pipes are closed instead of leaked.
        self._stop_bot()

        bot_script = INGAME_DIR / "rl_bot.js"
        self.proc = subprocess.Popen(
            ["node", str(bot_script), self.host, str(self.port), self.username],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,  # line buffered
        )

    def _stop_bot(self):
        """Stop the bot subprocess (best effort) and release its pipes.

        Sends ``quit`` and waits up to 3 seconds; if the process is still
        running afterwards it is killed and reaped. Never raises for an
        already-dead process or a broken pipe.
        """
        proc, self.proc = self.proc, None
        if proc is None:
            return

        try:
            if proc.poll() is None:
                proc.stdin.write("quit\n")
                proc.stdin.flush()
                proc.wait(timeout=3)
        except (OSError, ValueError, subprocess.TimeoutExpired):
            # Broken/closed pipe or the bot ignored "quit": force-kill below.
            pass

        if proc.poll() is None:
            try:
                proc.kill()
                # Reap the child so it does not linger as a zombie.
                proc.wait(timeout=3)
            except (OSError, subprocess.TimeoutExpired):
                pass

        for pipe in (proc.stdin, proc.stdout):
            if pipe is None:
                continue
            try:
                pipe.close()
            except OSError:
                # Closing stdin may flush into a pipe the bot already closed.
                pass

    def _read_message(self, timeout, accept=None):
        """Read stdout lines until a JSON object arrives.

        Args:
            timeout: Seconds after which to give up.
            accept: Optional predicate on the decoded dict; messages for
                which it returns a falsy value are skipped.

        Returns:
            The decoded dict, or None on timeout, end of stream, or a
            closed/broken pipe.

        NOTE: readline() blocks, so the timeout is only evaluated between
        lines; a bot that is alive but silent can still stall this call.
        """
        proc = self.proc
        if proc is None or proc.stdout is None:
            return None

        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                raw = proc.stdout.readline()
            except (OSError, ValueError):
                return None
            if raw == "":
                # End of stream: the bot exited. Without this check the loop
                # would spin at full speed until the deadline.
                return None
            line = raw.strip()
            if not line:
                continue
            try:
                message = json.loads(line)
            except json.JSONDecodeError:
                continue  # non-JSON output; keep reading
            if not isinstance(message, dict):
                continue  # callers index the result by key
            if accept is None or accept(message):
                return message
        return None

    def _send(self, cmd):
        """Send a command to the bot and read the JSON response.

        Returns:
            The first JSON object the bot prints within 5 seconds, or None
            if no bot is running, the pipe is broken, or nothing arrives.
        """
        proc = self.proc
        if proc is None or proc.stdin is None:
            return None
        try:
            proc.stdin.write(cmd + "\n")
            proc.stdin.flush()
        except (OSError, ValueError):
            return None
        return self._read_message(5.0)

    @staticmethod
    def _as_float(value, default):
        """Return *value* as a float, or *default* if missing or non-numeric.

        Unlike ``value or default`` this keeps a legitimate ``0``.
        """
        try:
            return float(value)
        except (TypeError, ValueError):
            return float(default)

    @staticmethod
    def _hostile_distances(data):
        """Return the distance of every mob the bot flagged as hostile.

        A hostile mob without a usable ``dist`` is reported at infinity, so
        it still counts as a mob but never as the nearest or a nearby one.
        """
        distances = []
        for mob in data.get("mobs") or []:
            if not isinstance(mob, dict) or not mob.get("hostile"):
                continue
            try:
                distances.append(float(mob.get("dist")))
            except (TypeError, ValueError):
                distances.append(float("inf"))
        return distances

    def _parse_observation(self, data):
        """Convert JSON bot state to numpy observation vector.

        Returns:
            A float32 array of 13 values clamped to [0, 1]; all zeros when
            *data* is empty or has no ``hp`` field.
        """
        if not data or "hp" not in data:
            return np.zeros(self._OBS_SIZE, dtype=np.float32)

        current_hp = self._as_float(data.get("hp"), 0.0)
        food = self._as_float(data.get("food"), 0.0)
        prev_hp = self._as_float(self.prev_hp, 20.0)

        # Nearest hostile mob (distance normalized by 24, capped at 1)
        hostile = self._hostile_distances(data)
        nearest_dist = min(min(hostile) / 24.0, 1.0) if hostile else 1.0
        # Placeholder: the angle is not derived from the bot state, so this
        # component is always 0.5.
        nearest_angle = 0.5
        mob_count = min(len(hostile) / 10.0, 1.0)

        # Inventory flags
        inv = data.get("inv") or ""
        has_sword = 1.0 if "sword" in inv else 0.0
        has_armor = 1.0 if (data.get("armor") or "none") != "none" else 0.0
        has_food = 1.0 if any(k in inv for k in self._FOOD_KEYWORDS) else 0.0

        # World state
        is_day = 1.0 if data.get("day", True) else 0.0
        on_water = 1.0 if data.get("below", "") == "water" else 0.0

        # Y level (normalize: 0=bedrock, 320=max → 0-1)
        pos = data.get("pos")
        y = self._as_float(pos.get("y") if isinstance(pos, dict) else None, 64.0)
        y_norm = min(max(y, 0.0), 320.0) / 320.0

        # Damage taken this tick, relative to the HP stored by the previous
        # reward calculation.
        damage = max(0.0, prev_hp - current_hp) / 20.0

        # Is currently fleeing (HP < 5)
        is_fleeing = 1.0 if current_hp < 5 else 0.0

        obs = np.array([
            current_hp / 20.0, food / 20.0, nearest_dist, nearest_angle, mob_count,
            has_sword, has_armor, has_food, is_day, on_water,
            y_norm, damage, is_fleeing,
        ], dtype=np.float32)

        # Keep every component inside the declared Box(0, 1), e.g. when the
        # bot reports more than 20 HP.
        return np.clip(obs, 0.0, 1.0)

    def _calc_reward(self, data, action):
        """Calculate reward from state transition.

        Also updates ``self.kills``, ``self.prev_hp`` and ``self.prev_food``
        so the next call measures the change since this state.

        Returns:
            The shaped reward, or -100.0 when *data* is empty or lacks ``hp``.
        """
        if not data or "hp" not in data:
            return -100.0  # lost connection = death equivalent

        hp = self._as_float(data.get("hp"), 0.0)
        food = int(self._as_float(data.get("food"), 20))
        prev_hp = self._as_float(self.prev_hp, 20.0)
        prev_food = int(self._as_float(self.prev_food, 20))

        # Survival reward: +1 per tick alive
        reward = 1.0

        # Damage penalty: -2 per HP lost
        damage = max(0.0, prev_hp - hp)
        if damage > 0:
            reward -= damage * 2.0

        # Death penalty
        if hp <= 0 or data.get("died", False):
            reward -= 100.0

        # Kill reward. A message without "kills" keeps the current count;
        # defaulting it to 0 would re-reward every earlier kill next tick.
        new_kills = data.get("kills")
        if new_kills is None:
            new_kills = self.kills
        kills_this_tick = new_kills - self.kills
        if kills_this_tick > 0:
            reward += kills_this_tick * 10.0
        self.kills = new_kills

        if action == self._ACT_EAT:
            # Eating when hungry: good
            if prev_food < 14 and food > prev_food:
                reward += 5.0
            # Eating when full: wasted action
            if prev_food >= 18:
                reward -= 1.0

        hostile_nearby = any(d < 6 for d in self._hostile_distances(data))

        # Fighting when no mobs nearby: wasted
        if action == self._ACT_FIGHT and not hostile_nearby:
            reward -= 0.5

        # Fleeing when HP is low and mobs nearby: good decision
        if action == self._ACT_FLEE and hp < 8 and hostile_nearby:
            reward += 3.0

        # Idle penalty (doing nothing when threats exist)
        if action == self._ACT_IDLE and hostile_nearby:
            reward -= 2.0

        # Update state
        self.prev_hp = hp
        self.prev_food = food

        return reward

    def reset(self, seed=None, options=None):
        """Reset the environment — reconnect bot and start new episode.

        Restarts the bot process and waits up to 30 seconds for either a
        ``{"event": "ready"}`` message or a state message containing ``hp``.

        Returns:
            ``(obs, info)`` where ``info["raw"]`` is the message received
            (None if the bot produced nothing).
        """
        super().reset(seed=seed)

        self._start_bot()

        # Wait for bot to spawn
        data = self._read_message(
            30.0,
            accept=lambda msg: msg.get("event") == "ready" or "hp" in msg,
        )

        if not data and self.proc is not None and self.proc.poll() is None:
            # Fallback: the bot is running but silent, so ask for its state.
            time.sleep(3)
            data = self._send("observe")

        self.step_count = 0
        self.total_reward = 0
        self.kills = (data.get("kills") or 0) if data else 0
        self.prev_hp = self._as_float(data.get("hp") if data else None, 20.0)
        self.prev_food = int(
            self._as_float(data.get("food") if data else None, 20)
        )
        # Only report the bot as alive if it actually answered.
        self.alive = bool(data)

        obs = self._parse_observation(data)
        self.last_obs = obs
        info = {"raw": data}

        return obs, info

    def step(self, action):
        """Execute one action and return (obs, reward, terminated, truncated, info).

        The episode terminates when the bot reports death (``hp <= 0`` or
        ``died``) or when no usable state can be obtained from the bot; the
        latter is flagged in ``info["connection_lost"]``. It is truncated
        once ``max_steps`` steps have been taken.
        """
        action = int(action)  # accept numpy integer scalars from policies
        self.step_count += 1
        action_name = self.ACTIONS[action]

        # Send action to bot
        data = self._send(action_name)

        # Wait for game tick
        time.sleep(self.tick_rate)

        # Use the action's reply as the state, or ask explicitly if it did
        # not carry one.
        if data is None or "hp" not in data:
            obs_data = self._send("observe")
        else:
            obs_data = data

        connection_lost = not obs_data or "hp" not in obs_data

        obs = self._parse_observation(obs_data)
        reward = self._calc_reward(obs_data, action)
        self.total_reward += reward

        # Check termination. A lost connection ends the episode too, so the
        # caller resets (restarting the bot) instead of stepping a dead
        # process until max_steps.
        if connection_lost:
            terminated = True
        else:
            hp = self._as_float(obs_data.get("hp"), 0.0)
            terminated = bool(hp <= 0 or obs_data.get("died", False))
        if terminated:
            self.alive = False

        # Check truncation (max steps)
        truncated = self.step_count >= self.max_steps

        info = {
            "raw": obs_data,
            "step": self.step_count,
            "total_reward": self.total_reward,
            "kills": self.kills,
            "alive": self.alive,
            "connection_lost": connection_lost,
        }

        self.last_obs = obs

        if self.render_mode == "human":
            self.render()

        return obs, reward, terminated, truncated, info

    def render(self):
        """Print current state."""
        if self.last_obs is None:
            return
        hp = self.last_obs[0] * 20
        food = self.last_obs[1] * 20
        mob_dist = self.last_obs[2] * 24
        # Round rather than truncate: the float32 value scaled back up can
        # land just below the integer it encodes.
        mob_count = int(round(float(self.last_obs[4]) * 10))
        print(f"  Step {self.step_count}: HP={hp:.0f} Food={food:.0f} "
              f"Mobs={mob_count}@{mob_dist:.0f}b Kills={self.kills} "
              f"R={self.total_reward:.1f}")

    def close(self):
        """Clean up."""
        self._stop_bot()
|