0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline

Major changes from this session: Training: - 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL - 7,256 merged training examples (up from 3,183) - New training data: failure modes (85), midloop messaging (27), prompt injection defense (29), personality (32), gold from quarantine bank (232), new tool examples (30), claude's own experience (10) - All training data RCON-validated at 100% pass rate - Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56% Oracle Bot (Mind's Eye): - Invisible spectator bot (mineflayer) streams world state via WebSocket - HTML5 Canvas frontend at mind.mortdec.ai - Real-time tool trace visualization with expandable entries - Streaming model tokens during inference - Gateway integration: fire-and-forget POST /trace on every tool call Reinforcement Learning: - Gymnasium environment wrapping mineflayer bot (minecraft_env.py) - PPO training via Stable Baselines3 (10K param policy network) - Behavioral cloning pretraining (97.5% accuracy on expert policy) - Infinite training loop with auto-restart and checkpoint resume - Bot learns combat, survival, navigation from raw experience Bot Army: - 8-soldier marching formation with autonomous combat - Combat bots using mineflayer-pvp, pathfinder, armor-manager - Multilingual prayer bots via translategemma:27b (18 languages) - Frame-based AI architecture: LLM planner + reactive micro-scripts Infrastructure: - Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser) - Billing gateway now tracks all LAN traffic (LAN auto-auth) - Gateway fallback for empty god-mode responses - Updated mortdec.ai landing page Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 20:22:50 -04:00
parent baab24f8b1
commit 5b28002001
44 changed files with 20873 additions and 4352 deletions
@@ -0,0 +1,342 @@
+#!/usr/bin/env python3
+"""
+minecraft_env.py — Gymnasium environment wrapping a mineflayer bot.
+
+The bot runs in a Node.js subprocess, communicating via stdin/stdout JSON.
+The Python Gym env sends actions and receives observations at ~600ms ticks.
+
+Usage:
+    from minecraft_env import MinecraftCombatEnv
+    env = MinecraftCombatEnv()
+    obs, info = env.reset()
+    while True:
+        action = env.action_space.sample()  # or policy(obs)
+        obs, reward, terminated, truncated, info = env.step(action)
+        if terminated or truncated:
+            obs, info = env.reset()
+"""
+
+import json
+import subprocess
+import time
+import os
+import signal
+import numpy as np
+import gymnasium as gym
+from gymnasium import spaces
+from pathlib import Path
+
+# "ingame" directory located three directory levels above this file's
+# resolved path; _start_bot() expects to find rl_bot.js inside it.
+INGAME_DIR = Path(__file__).resolve().parent.parent.parent / "ingame"
+
+
+class MinecraftCombatEnv(gym.Env):
+    """Minecraft combat survival environment via mineflayer bot."""
+
+    # Gymnasium metadata: render() only supports the text-printing "human" mode.
+    metadata = {"render_modes": ["human"], "render_fps": 2}
+
+    # Discrete actions. The list index is the action id used by action_space;
+    # the string is the command line step() sends to the bot.
+    ACTIONS = ["forward", "fight", "flee", "eat", "sprint", "idle"]
+
+    # Hostile mob types for reward calculation.
+    # NOTE(review): no method of this class reads HOSTILE; the observation and
+    # reward code rely on each mob's own "hostile" flag instead.
+    HOSTILE = {
+        "zombie", "husk", "skeleton", "creeper", "spider", "cave_spider",
+        "witch", "enderman", "drowned", "stray", "phantom", "parched",
+        "camel_husk", "slime", "magma_cube",
+    }
+
+    def __init__(
+        self,
+        host="192.168.0.244",
+        port=25568,
+        username="RLBot",
+        max_steps=600,       # 600 ticks × 0.6s = 6 minutes per episode
+        tick_rate=0.6,       # seconds per tick (sword cooldown rate)
+        render_mode=None,
+    ):
+        super().__init__()
+        self.host = host
+        self.port = port
+        self.username = username
+        self.max_steps = max_steps
+        self.tick_rate = tick_rate
+        self.render_mode = render_mode
+
+        # Observation space: 13 floats normalized to [0, 1]
+        # [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
+        #  has_sword, has_armor, has_food, is_day, on_water,
+        #  y_level_norm, damage_taken_this_tick, is_fleeing]
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(13,), dtype=np.float32
+        )
+
+        # Action space: 6 discrete actions
+        self.action_space = spaces.Discrete(len(self.ACTIONS))
+
+        # Internal state
+        self.proc = None
+        self.step_count = 0
+        self.total_reward = 0
+        self.kills = 0
+        self.prev_hp = 20.0
+        self.prev_food = 20
+        self.alive = False
+        self.last_obs = None
+
+    def _start_bot(self):
+        """Start the mineflayer bot subprocess."""
+        if self.proc and self.proc.poll() is None:
+            self._stop_bot()
+
+        bot_script = INGAME_DIR / "rl_bot.js"
+        self.proc = subprocess.Popen(
+            ["node", str(bot_script), self.host, str(self.port), self.username],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+            text=True,
+            bufsize=1,  # line buffered
+        )
+
+    def _stop_bot(self):
+        """Stop the bot subprocess."""
+        if self.proc:
+            try:
+                self.proc.stdin.write("quit\n")
+                self.proc.stdin.flush()
+                self.proc.wait(timeout=3)
+            except Exception:
+                try:
+                    self.proc.kill()
+                except Exception:
+                    pass
+            self.proc = None
+
+    def _send(self, cmd):
+        """Send a command to the bot and read the JSON response."""
+        try:
+            self.proc.stdin.write(cmd + "\n")
+            self.proc.stdin.flush()
+
+            # Read lines until we get a valid JSON observation
+            deadline = time.time() + 5.0
+            while time.time() < deadline:
+                line = self.proc.stdout.readline().strip()
+                if not line:
+                    continue
+                try:
+                    data = json.loads(line)
+                    return data
+                except json.JSONDecodeError:
+                    continue
+            return None
+        except (BrokenPipeError, OSError):
+            return None
+
+    def _parse_observation(self, data):
+        """Convert JSON bot state to numpy observation vector."""
+        if not data or "hp" not in data:
+            return np.zeros(13, dtype=np.float32)
+
+        hp = (data.get("hp") or 0) / 20.0  # normalize to [0, 1]
+        food = (data.get("food") or 0) / 20.0
+        mobs = data.get("mobs", [])
+
+        # Nearest hostile mob
+        hostile_mobs = [m for m in mobs if m.get("hostile", False)]
+        if hostile_mobs:
+            nearest = min(hostile_mobs, key=lambda m: m["dist"])
+            nearest_dist = min(nearest["dist"] / 24.0, 1.0)  # normalize
+            # Angle: approximate from relative position if available
+            nearest_angle = 0.5  # default forward
+        else:
+            nearest_dist = 1.0  # no mob = max distance
+            nearest_angle = 0.5
+
+        mob_count = min(len(hostile_mobs) / 10.0, 1.0)
+
+        # Inventory flags
+        inv = data.get("inv", "")
+        has_sword = 1.0 if "sword" in inv else 0.0
+        has_armor = 1.0 if data.get("armor", "none") != "none" else 0.0
+        has_food = 1.0 if any(f in inv for f in ["beef", "bread", "pork", "chicken", "apple", "potato", "cod"]) else 0.0
+
+        # World state
+        is_day = 1.0 if data.get("day", True) else 0.0
+        on_water = 1.0 if data.get("below", "") == "water" else 0.0
+
+        # Y level (normalize: 0=bedrock, 320=max → 0-1)
+        y = (data.get("pos") or {}).get("y", 64) or 64
+        y_norm = min(max(y, 0), 320) / 320.0
+
+        # Damage taken this tick
+        current_hp = float(data.get("hp") or 20)
+        prev = float(self.prev_hp or 20)
+        damage = max(0, prev - current_hp) / 20.0
+
+        # Is currently fleeing (HP < 5)
+        is_fleeing = 1.0 if current_hp < 5 else 0.0
+
+        obs = np.array([
+            hp, food, nearest_dist, nearest_angle, mob_count,
+            has_sword, has_armor, has_food, is_day, on_water,
+            y_norm, damage, is_fleeing,
+        ], dtype=np.float32)
+
+        return obs
+
+    def _calc_reward(self, data, action):
+        """Calculate reward from state transition."""
+        if not data or "hp" not in data:
+            return -100.0  # lost connection = death equivalent
+
+        reward = 0.0
+        hp = float(data.get("hp") or 0)
+        food = int(data.get("food") or 20)
+
+        # Survival reward: +1 per tick alive
+        reward += 1.0
+
+        # Damage penalty
+        damage = max(0, float(self.prev_hp or 20) - hp)
+        if damage > 0:
+            reward -= damage * 2.0  # -2 per HP lost
+
+        # Death penalty
+        if hp <= 0 or data.get("died", False):
+            reward -= 100.0
+
+        # Kill reward
+        new_kills = data.get("kills", 0)
+        kills_this_tick = new_kills - self.kills
+        if kills_this_tick > 0:
+            reward += kills_this_tick * 10.0
+        self.kills = new_kills
+
+        # Eating when hungry: good
+        prev_food = int(self.prev_food or 20)
+        if action == 3 and prev_food < 14 and food > prev_food:
+            reward += 5.0
+
+        # Eating when full: wasted action
+        if action == 3 and prev_food >= 18:
+            reward -= 1.0
+
+        # Fighting when no mobs nearby: wasted
+        mobs = data.get("mobs", [])
+        hostile_nearby = [m for m in mobs if m.get("hostile") and m["dist"] < 6]
+        if action == 1 and not hostile_nearby:
+            reward -= 0.5
+
+        # Fleeing when HP is low and mobs nearby: good decision
+        if action == 2 and hp < 8 and hostile_nearby:
+            reward += 3.0
+
+        # Idle penalty (doing nothing when threats exist)
+        if action == 5 and hostile_nearby:
+            reward -= 2.0
+
+        # Update state
+        self.prev_hp = hp
+        self.prev_food = food
+
+        return reward
+
+    def reset(self, seed=None, options=None):
+        """Reset the environment — reconnect bot and start new episode."""
+        super().reset(seed=seed)
+
+        self._start_bot()
+
+        # Wait for bot to spawn
+        deadline = time.time() + 30.0
+        data = None
+        while time.time() < deadline:
+            line = self.proc.stdout.readline().strip()
+            if not line:
+                continue
+            try:
+                d = json.loads(line)
+                if d.get("event") == "ready":
+                    data = d
+                    break
+                if "hp" in d:
+                    data = d
+                    break
+            except json.JSONDecodeError:
+                continue
+
+        if not data:
+            # Fallback: send observe
+            time.sleep(3)
+            data = self._send("observe")
+
+        self.step_count = 0
+        self.total_reward = 0
+        self.kills = data.get("kills", 0) if data else 0
+        self.prev_hp = data.get("hp", 20) if data else 20
+        self.prev_food = data.get("food", 20) if data else 20
+        self.alive = True
+
+        obs = self._parse_observation(data)
+        self.last_obs = obs
+        info = {"raw": data}
+
+        return obs, info
+
+    def step(self, action):
+        """Execute one action and return (obs, reward, terminated, truncated, info)."""
+        self.step_count += 1
+        action_name = self.ACTIONS[action]
+
+        # Send action to bot
+        data = self._send(action_name)
+
+        # Wait for game tick
+        time.sleep(self.tick_rate)
+
+        # Get observation after action
+        if data is None or "hp" not in data:
+            obs_data = self._send("observe")
+        else:
+            obs_data = data
+
+        obs = self._parse_observation(obs_data)
+        reward = self._calc_reward(obs_data, action)
+        self.total_reward += reward
+
+        # Check termination
+        terminated = False
+        if obs_data and (obs_data.get("hp", 0) <= 0 or obs_data.get("died", False)):
+            terminated = True
+            self.alive = False
+
+        # Check truncation (max steps)
+        truncated = self.step_count >= self.max_steps
+
+        info = {
+            "raw": obs_data,
+            "step": self.step_count,
+            "total_reward": self.total_reward,
+            "kills": self.kills,
+            "alive": self.alive,
+        }
+
+        self.last_obs = obs
+
+        if self.render_mode == "human":
+            self.render()
+
+        return obs, reward, terminated, truncated, info
+
+    def render(self):
+        """Print current state."""
+        if self.last_obs is not None:
+            hp = self.last_obs[0] * 20
+            food = self.last_obs[1] * 20
+            mob_dist = self.last_obs[2] * 24
+            mob_count = int(self.last_obs[4] * 10)
+            print(f"  Step {self.step_count}: HP={hp:.0f} Food={food:.0f} "
+                  f"Mobs={mob_count}@{mob_dist:.0f}b Kills={self.kills} "
+                  f"R={self.total_reward:.1f}")
+
+    def close(self):
+        """Clean up by stopping the bot subprocess via _stop_bot()."""
+        self._stop_bot()
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""
+pretrain_policy.py — Give the RL policy a head start via behavioral cloning.
+
+Generates synthetic expert demonstrations from our hand-coded survival rules,
+then trains the policy network to imitate them. The resulting weights become
+the starting point for PPO (instead of random initialization).
+
+Usage:
+    python3 training/rl/pretrain_policy.py
+    # Then run train_combat.py — it will load the pretrained checkpoint
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+from pathlib import Path
+from stable_baselines3 import PPO
+from gymnasium import spaces
+
+# ROOT is three directory levels above this file's resolved path.
+ROOT = Path(__file__).resolve().parent.parent.parent
+# Directory where pretrain() saves combat_ppo_pretrained.zip.
+CKPT_DIR = ROOT / "training" / "rl" / "checkpoints"
+
+# Actions: 0=forward, 1=fight, 2=flee, 3=eat, 4=sprint, 5=idle
+# Obs: [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
+#       has_sword, has_armor, has_food, is_day, on_water,
+#       y_level_norm, damage_taken, is_fleeing]
+
+def expert_action(obs):
+    """Hand-coded expert policy — the survival rules we discovered tonight."""
+    hp = obs[0]           # 0-1 (0=dead, 1=full)
+    food = obs[1]         # 0-1
+    mob_dist = obs[2]     # 0-1 (0=right here, 1=24+ blocks)
+    mob_count = obs[4]    # 0-1 (0=none, 1=10+)
+    has_sword = obs[5]    # 0 or 1
+    has_food = obs[7]     # 0 or 1
+    damage = obs[11]      # 0-1
+    is_fleeing = obs[12]  # 0 or 1
+
+    # PRIORITY 1: Flee if critical HP
+    if hp < 0.25:  # < 5 HP
+        if has_food and food < 0.7:
+            return 3  # eat
+        return 2  # flee
+
+    # PRIORITY 2: Flee if overwhelmed (3+ mobs and not full HP)
+    if mob_count > 0.3 and hp < 0.6:
+        return 2  # flee
+
+    # PRIORITY 3: Eat if hungry and have food
+    if food < 0.7 and has_food and hp < 0.8:
+        return 3  # eat
+
+    # PRIORITY 4: Fight if mob nearby and have sword
+    if mob_dist < 0.25 and has_sword:  # < 6 blocks
+        return 1  # fight
+
+    # PRIORITY 5: Approach mob if nearby but not in melee
+    if mob_dist < 0.5 and has_sword:  # < 12 blocks
+        return 0  # forward (approach)
+
+    # PRIORITY 6: Sprint if taking damage (dodge)
+    if damage > 0:
+        return 4  # sprint
+
+    # PRIORITY 7: Explore
+    if mob_dist > 0.8:  # no mobs nearby
+        return 0  # forward
+
+    # Default: idle
+    return 5
+
+
+def generate_expert_data(n_samples=50000):
+    """Generate diverse observations and expert actions."""
+    obs_list = []
+    act_list = []
+
+    for _ in range(n_samples):
+        # Random observation (covering the full state space)
+        obs = np.zeros(13, dtype=np.float32)
+        obs[0] = np.random.beta(2, 1)        # hp: skew toward higher
+        obs[1] = np.random.beta(2, 1)        # food: skew toward higher
+        obs[2] = np.random.uniform(0, 1)     # mob distance
+        obs[3] = np.random.uniform(0, 1)     # mob angle
+        obs[4] = np.random.beta(1, 3)        # mob count: skew toward fewer
+        obs[5] = float(np.random.random() > 0.3)  # has_sword: 70% chance
+        obs[6] = float(np.random.random() > 0.4)  # has_armor: 60% chance
+        obs[7] = float(np.random.random() > 0.3)  # has_food: 70% chance
+        obs[8] = float(np.random.random() > 0.4)  # is_day: 60% chance
+        obs[9] = float(np.random.random() > 0.85) # on_water: 15% chance
+        obs[10] = np.random.uniform(0.15, 0.3)    # y_level: surface range
+        obs[11] = np.random.beta(1, 5)             # damage: skew toward low
+        obs[12] = float(obs[0] < 0.25)             # is_fleeing
+
+        action = expert_action(obs)
+        obs_list.append(obs)
+        act_list.append(action)
+
+    return np.array(obs_list), np.array(act_list)
+
+
+def pretrain():
+    print("Generating 50,000 expert demonstrations...")
+    obs_data, act_data = generate_expert_data(50000)
+
+    # Show action distribution
+    unique, counts = np.unique(act_data, return_counts=True)
+    action_names = ["forward", "fight", "flee", "eat", "sprint", "idle"]
+    print("\nExpert action distribution:")
+    for a, c in zip(unique, counts):
+        print(f"  {action_names[a]:10} {c:6} ({c/len(act_data)*100:.1f}%)")
+
+    # Create a PPO model with the same architecture
+    import gymnasium as gym
+    class DummyMCEnv(gym.Env):
+        metadata = {"render_modes": []}
+        def __init__(self):
+            self.observation_space = spaces.Box(low=0, high=1, shape=(13,), dtype=np.float32)
+            self.action_space = spaces.Discrete(6)
+        def reset(self, **kw):
+            return np.zeros(13, dtype=np.float32), {}
+        def step(self, a):
+            return np.zeros(13, dtype=np.float32), 0, True, False, {}
+
+    dummy_env = DummyMCEnv()
+
+    model = PPO(
+        "MlpPolicy", dummy_env, verbose=0,
+        policy_kwargs={"net_arch": [64, 64]},
+    )
+
+    # Extract the policy network and train via supervised learning
+    policy = model.policy
+    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3)
+    criterion = nn.CrossEntropyLoss()
+
+    obs_tensor = torch.FloatTensor(obs_data)
+    act_tensor = torch.LongTensor(act_data)
+
+    print(f"\nPretraining policy ({sum(p.numel() for p in policy.parameters()):,} params)...")
+    batch_size = 256
+    n_epochs = 20
+
+    for epoch in range(n_epochs):
+        # Shuffle
+        perm = torch.randperm(len(obs_tensor))
+        total_loss = 0
+        correct = 0
+        n_batches = 0
+
+        for i in range(0, len(obs_tensor), batch_size):
+            idx = perm[i:i+batch_size]
+            batch_obs = obs_tensor[idx]
+            batch_act = act_tensor[idx]
+
+            # Forward through policy network
+            features = policy.extract_features(batch_obs, policy.pi_features_extractor)
+            latent = policy.mlp_extractor.forward_actor(features)
+            logits = policy.action_net(latent)
+
+            loss = criterion(logits, batch_act)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            total_loss += loss.item()
+            correct += (logits.argmax(dim=1) == batch_act).sum().item()
+            n_batches += 1
+
+        accuracy = correct / len(obs_tensor) * 100
+        avg_loss = total_loss / n_batches
+        print(f"  Epoch {epoch+1:2d}/{n_epochs}: loss={avg_loss:.4f} accuracy={accuracy:.1f}%")
+
+    # Save the pretrained model
+    CKPT_DIR.mkdir(parents=True, exist_ok=True)
+    save_path = CKPT_DIR / "combat_ppo_pretrained.zip"
+    model.save(str(save_path))
+    print(f"\nPretrained model saved to {save_path}")
+    print("PPO will resume from this checkpoint and improve via RL.")
+
+
+if __name__ == "__main__":
+    pretrain()
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+train_combat.py — Train a small policy network for Minecraft combat via PPO.
+
+The agent learns to fight, flee, eat, and survive in a hostile Minecraft world.
+Uses the MinecraftCombatEnv gymnasium wrapper which controls a mineflayer bot.
+
+Usage:
+    # Install deps first:
+    pip install gymnasium stable-baselines3 torch
+
+    # Train (on steel141 with mc-train conda env):
+    python3 training/rl/train_combat.py
+
+    # Train with custom settings:
+    python3 training/rl/train_combat.py --timesteps 50000 --host 192.168.0.244 --port 25568
+
+    # Evaluate a trained model:
+    python3 training/rl/train_combat.py --eval --model training/rl/checkpoints/combat_ppo.zip
+"""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+# Add project root to path: ROOT is three directory levels above this file,
+# which lets train() and evaluate() import "training.rl.minecraft_env".
+ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(ROOT))
+
+
+def train(args):
+    from stable_baselines3 import PPO
+    from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
+    from training.rl.minecraft_env import MinecraftCombatEnv
+
+    print(f"=== Minecraft RL Combat Training ===")
+    print(f"Host: {args.host}:{args.port}")
+    print(f"Timesteps: {args.timesteps}")
+    print(f"Policy: MlpPolicy (3-layer MLP)")
+    print()
+
+    # Create environment
+    env = MinecraftCombatEnv(
+        host=args.host,
+        port=args.port,
+        username=f"RLBot_{os.getpid() % 100}",
+        max_steps=args.max_steps,
+        render_mode="human" if args.verbose else None,
+    )
+
+    # Checkpointing
+    ckpt_dir = ROOT / "training" / "rl" / "checkpoints"
+    ckpt_dir.mkdir(parents=True, exist_ok=True)
+
+    checkpoint_cb = CheckpointCallback(
+        save_freq=args.save_freq,
+        save_path=str(ckpt_dir),
+        name_prefix="combat_ppo",
+    )
+
+    # Check for existing checkpoint to resume from
+    latest_ckpt = None
+    if ckpt_dir.exists():
+        ckpts = sorted(ckpt_dir.glob("combat_ppo_*.zip"), key=lambda p: p.stat().st_mtime)
+        if ckpts:
+            latest_ckpt = str(ckpts[-1])
+            print(f"RESUMING from: {latest_ckpt}")
+
+    if latest_ckpt:
+        # Load existing model and continue training
+        model = PPO.load(
+            latest_ckpt,
+            env=env,
+            tensorboard_log=str(ckpt_dir / "tb_logs"),
+        )
+        model.learning_rate = 3e-4  # can adjust between runs
+    else:
+        # Fresh model
+        model = PPO(
+            "MlpPolicy",
+            env,
+            verbose=1,
+            learning_rate=3e-4,
+            n_steps=256,           # collect 256 steps before update
+            batch_size=64,
+            n_epochs=4,
+            gamma=0.99,            # discount factor
+            gae_lambda=0.95,
+            clip_range=0.2,
+            ent_coef=0.01,         # entropy bonus for exploration
+            policy_kwargs={
+                "net_arch": [64, 64],  # 2 hidden layers of 64 units
+            },
+            tensorboard_log=str(ckpt_dir / "tb_logs"),
+        )
+
+    print(f"Policy network params: {sum(p.numel() for p in model.policy.parameters()):,}")
+    print(f"Training for {args.timesteps} timesteps...")
+    print()
+
+    try:
+        model.learn(
+            total_timesteps=args.timesteps,
+            callback=checkpoint_cb,
+            progress_bar=True,
+        )
+    except KeyboardInterrupt:
+        print("\nTraining interrupted.")
+
+    # Save final model
+    final_path = ckpt_dir / "combat_ppo_final.zip"
+    model.save(str(final_path))
+    print(f"\nModel saved to {final_path}")
+
+    env.close()
+
+
+def evaluate(args):
+    from stable_baselines3 import PPO
+    from training.rl.minecraft_env import MinecraftCombatEnv
+
+    print(f"=== Evaluating {args.model} ===")
+
+    env = MinecraftCombatEnv(
+        host=args.host,
+        port=args.port,
+        username="RLBot_eval",
+        max_steps=args.max_steps,
+        render_mode="human",
+    )
+
+    model = PPO.load(args.model)
+
+    total_reward = 0
+    total_kills = 0
+    episodes = args.eval_episodes
+
+    for ep in range(episodes):
+        obs, info = env.reset()
+        ep_reward = 0
+        done = False
+
+        while not done:
+            action, _ = model.predict(obs, deterministic=True)
+            obs, reward, terminated, truncated, info = env.step(action)
+            ep_reward += reward
+            done = terminated or truncated
+
+        total_reward += ep_reward
+        total_kills += info.get("kills", 0)
+        print(f"  Episode {ep+1}: reward={ep_reward:.1f} kills={info.get('kills', 0)} steps={info.get('step', 0)}")
+
+    print(f"\nAverage: reward={total_reward/episodes:.1f} kills={total_kills/episodes:.1f}")
+    env.close()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Minecraft RL Combat Training")
+    parser.add_argument("--host", default="192.168.0.244")
+    parser.add_argument("--port", type=int, default=25568)
+    parser.add_argument("--timesteps", type=int, default=10000)
+    parser.add_argument("--max-steps", type=int, default=300)
+    parser.add_argument("--save-freq", type=int, default=2000)
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--eval", action="store_true")
+    parser.add_argument("--eval-episodes", type=int, default=5)
+    parser.add_argument("--model", default="training/rl/checkpoints/combat_ppo_final.zip")
+    args = parser.parse_args()
+
+    if args.eval:
+        evaluate(args)
+    else:
+        train(args)
+
+
+if __name__ == "__main__":
+    main()