# Mortdecai/training/rl/minecraft_env.py

#!/usr/bin/env python3
"""
minecraft_env.py — Gymnasium environment wrapping a mineflayer bot.

The bot runs in a Node.js subprocess, communicating via stdin/stdout JSON.
The Python Gym env sends actions and receives observations at ~600ms ticks.

Usage:
    from minecraft_env import MinecraftCombatEnv
    env = MinecraftCombatEnv()
    obs, info = env.reset()
    while True:
        action = env.action_space.sample()  # or policy(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            obs, info = env.reset()  # restarts the bot for a new episode
"""

import json
import subprocess
import time
import os
import signal
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from pathlib import Path

# Directory named "ingame" located two levels above the directory that holds
# this file (symlinks resolved). _start_bot() loads "rl_bot.js" from here.
INGAME_DIR = Path(__file__).resolve().parent.parent.parent / "ingame"


class MinecraftCombatEnv(gym.Env):
    """Minecraft combat survival environment via mineflayer bot.

    Each instance drives one bot subprocess: ``reset()`` (re)starts it, and
    ``step()`` sends one of ``ACTIONS`` by name, then converts the bot's JSON
    reply into a 13-float observation and a shaped reward.
    """

    # Gymnasium metadata; the "human" render mode prints a status line.
    metadata = {"render_modes": ["human"], "render_fps": 2}

    # Discrete actions. The integer action given to step() is an index into
    # this list, and the selected string is the command sent to the bot.
    ACTIONS = ["forward", "fight", "flee", "eat", "sprint", "idle"]

    # Hostile mob types for reward calculation.
    # NOTE(review): no method in this class reads HOSTILE; observations and
    # rewards rely on each mob's "hostile" flag in the bot's JSON instead.
    # Presumably mirrors a list in the bot script — confirm before removing.
    HOSTILE = {
        "zombie", "husk", "skeleton", "creeper", "spider", "cave_spider",
        "witch", "enderman", "drowned", "stray", "phantom", "parched",
        "camel_husk", "slime", "magma_cube",
    }

    def __init__(
        self,
        host="192.168.0.244",
        port=25568,
        username="RLBot",
        max_steps=600,       # 600 ticks × 0.6s = 6 minutes per episode
        tick_rate=0.6,       # seconds per tick (sword cooldown rate)
        render_mode=None,
    ):
        """Record settings and declare the Gym spaces; no bot is started here.

        Args:
            host: Server address handed to the bot script.
            port: Server port handed to the bot script.
            username: Account name the bot logs in with.
            max_steps: Step count after which ``step()`` reports truncation.
            tick_rate: Seconds ``step()`` sleeps after sending an action.
            render_mode: ``"human"`` makes ``step()`` call ``render()``.
        """
        super().__init__()

        # Connection parameters (passed to rl_bot.js on the command line).
        self.host, self.port, self.username = host, port, username

        # Episode pacing and rendering.
        self.max_steps, self.tick_rate = max_steps, tick_rate
        self.render_mode = render_mode

        # One discrete action per entry in ACTIONS.
        self.action_space = spaces.Discrete(len(self.ACTIONS))

        # 13 floats, each normalized to [0, 1], in this order:
        #   0 hp                 1 food               2 nearest_mob_dist
        #   3 nearest_mob_angle  4 mob_count          5 has_sword
        #   6 has_armor          7 has_food           8 is_day
        #   9 on_water          10 y_level_norm      11 damage_taken_this_tick
        #  12 is_fleeing
        obs_dim = 13
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(obs_dim,), dtype=np.float32
        )

        # Subprocess handle and per-episode bookkeeping.
        self.proc = None
        self.alive = False
        self.last_obs = None
        self.step_count = 0
        self.kills = 0
        self.total_reward = 0
        self.prev_hp = 20.0
        self.prev_food = 20

    def _start_bot(self):
        """Launch a new ``rl_bot.js`` Node.js process and keep it in ``self.proc``.

        If a previously started process is still running it is shut down
        first through ``_stop_bot``. The child talks over text-mode,
        line-buffered stdin/stdout pipes; its stderr is discarded.
        """
        previous = self.proc
        if previous and previous.poll() is None:
            self._stop_bot()

        command = [
            "node",
            str(INGAME_DIR / "rl_bot.js"),
            self.host,
            str(self.port),
            self.username,
        ]
        self.proc = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,  # line buffered
        )

    def _stop_bot(self):
        """Stop the bot subprocess (best effort) and release its resources.

        Sends ``quit`` and waits up to 3 s for a clean exit. If that fails
        (pipe already broken, pipe closed, or timeout) the process is killed
        and then waited on so it does not linger as a zombie. The stdin and
        stdout pipes are always closed, and ``self.proc`` is always cleared.
        Does nothing when no process has been started.
        """
        proc, self.proc = self.proc, None
        if proc is None:
            return

        try:
            proc.stdin.write("quit\n")
            proc.stdin.flush()
            proc.wait(timeout=3)
        except (OSError, ValueError, subprocess.TimeoutExpired):
            # Graceful shutdown failed: OSError covers a broken pipe,
            # ValueError a pipe that was already closed.
            try:
                proc.kill()
            except OSError:
                pass  # process is already gone
            try:
                # Reap the child so it does not stay around as a zombie.
                proc.wait(timeout=3)
            except (OSError, subprocess.TimeoutExpired):
                pass
        finally:
            for pipe in (proc.stdin, proc.stdout):
                if pipe is None:
                    continue
                try:
                    pipe.close()
                except (OSError, ValueError):
                    # Closing flushes; a dead peer can still raise here.
                    pass

    def _send(self, cmd):
        """Send one command line to the bot and return its next JSON object.

        Lines on the bot's stdout that are blank, are not valid JSON, or
        decode to something other than a JSON object are skipped.

        Args:
            cmd: Command text; a newline is appended before writing.

        Returns:
            The decoded ``dict``, or ``None`` if the bot's stdout reached
            end-of-file, no JSON object arrived before the ~5 s deadline, or
            the pipe raised an I/O error.

        Note:
            ``readline()`` blocks, so the deadline is only checked between
            lines; a bot that stays silent without exiting can still stall
            this call.
        """
        try:
            self.proc.stdin.write(cmd + "\n")
            self.proc.stdin.flush()

            # Monotonic clock: immune to wall-clock adjustments.
            deadline = time.monotonic() + 5.0
            while time.monotonic() < deadline:
                raw = self.proc.stdout.readline()
                if raw == "":
                    # End-of-file: the bot exited. Without this check the
                    # loop would spin on "" until the deadline.
                    return None
                line = raw.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue  # plain log output, not a reply
                if isinstance(data, dict):
                    return data
                # Valid JSON but not an object (e.g. a bare number): ignore.
            return None
        except (OSError, ValueError):
            # OSError includes BrokenPipeError; ValueError is raised when a
            # pipe has already been closed.
            return None

    def _parse_observation(self, data):
        """Convert the bot's JSON state into the 13-float observation vector.

        Layout (every entry clamped to [0, 1]):
            0 hp/20, 1 food/20, 2 nearest hostile distance/24 (1.0 if none),
            3 nearest hostile angle (constant 0.5 for now), 4 hostile count/10,
            5 has_sword, 6 has_armor, 7 has_food, 8 is_day, 9 on_water,
            10 y/320, 11 damage since ``self.prev_hp``/20, 12 hp < 5 flag.

        Args:
            data: Decoded bot state, or ``None``.

        Returns:
            ``np.ndarray`` of shape ``(13,)`` and dtype ``float32``; all zeros
            when ``data`` is empty or has no ``"hp"`` key.
        """
        if not data or "hp" not in data:
            return np.zeros(13, dtype=np.float32)

        raw_hp = data.get("hp")
        hp = (raw_hp or 0) / 20.0  # normalize to [0, 1]
        food = (data.get("food") or 0) / 20.0

        # Hostile mobs; tolerate a null "mobs" field and mobs without "dist".
        hostile_mobs = [
            m for m in (data.get("mobs") or []) if m.get("hostile", False)
        ]
        distances = [
            m["dist"] for m in hostile_mobs if m.get("dist") is not None
        ]
        # No (measurable) hostile mob = maximum distance.
        nearest_dist = min(min(distances) / 24.0, 1.0) if distances else 1.0
        # The relative angle is not derived yet; 0.5 stands for "forward".
        nearest_angle = 0.5
        mob_count = min(len(hostile_mobs) / 10.0, 1.0)

        # Inventory flags (substring checks, so "pork" matches "porkchop").
        # Assumes "inv" is a string of item names — TODO confirm with the bot.
        inv = data.get("inv") or ""
        food_items = ("beef", "bread", "pork", "chicken", "apple", "potato", "cod")
        has_sword = 1.0 if "sword" in inv else 0.0
        has_armor = 1.0 if data.get("armor", "none") != "none" else 0.0
        has_food = 1.0 if any(item in inv for item in food_items) else 0.0

        # World state
        is_day = 1.0 if data.get("day", True) else 0.0
        on_water = 1.0 if data.get("below", "") == "water" else 0.0

        # Y level, clamped to 0..320. Only a missing value falls back to 64;
        # a genuine y of 0 must stay 0 (a plain `or 64` would turn it into 64).
        y = (data.get("pos") or {}).get("y")
        if y is None:
            y = 64
        y_norm = min(max(y, 0), 320) / 320.0

        # Damage taken this tick. A reported hp of 0 is a real value (death),
        # so only None falls back to full health; `or 20` would hide the
        # lethal hit and the low-HP flag.
        current_hp = 20.0 if raw_hp is None else float(raw_hp)
        prev = 20.0 if self.prev_hp is None else float(self.prev_hp)
        damage = max(0.0, prev - current_hp) / 20.0

        # Low-HP flag (HP < 5)
        is_fleeing = 1.0 if current_hp < 5 else 0.0

        obs = np.array([
            hp, food, nearest_dist, nearest_angle, mob_count,
            has_sword, has_armor, has_food, is_day, on_water,
            y_norm, damage, is_fleeing,
        ], dtype=np.float32)

        # Keep the vector inside the declared Box(0, 1) even if the bot
        # reports out-of-range values (e.g. hp above 20 or negative distance).
        return np.clip(obs, 0.0, 1.0)

    def _calc_reward(self, data, action):
        """Compute the shaped reward for one transition and update trackers.

        Terms: +1 per tick, -2 per HP lost, -100 on death, +10 per new kill,
        +5 for eating that raises food from below 14, -1 for eating at food
        >= 18, -0.5 for fighting with no hostile within 6 blocks, +3 for
        fleeing at HP < 8 with a hostile nearby, -2 for idling with a hostile
        nearby.

        Side effects: updates ``self.kills``, ``self.prev_hp`` and
        ``self.prev_food`` (not touched when ``data`` is unusable).

        Args:
            data: Decoded bot state, or ``None``.
            action: Integer index into ``ACTIONS``.

        Returns:
            The reward as a float; ``-100.0`` if ``data`` is empty or has no
            ``"hp"`` key.
        """
        if not data or "hp" not in data:
            return -100.0  # lost connection = death equivalent

        # Look the indices up so they cannot drift from ACTIONS.
        fight, flee, eat, idle = (
            self.ACTIONS.index(name) for name in ("fight", "flee", "eat", "idle")
        )

        # 0 is a meaningful value for hp and food (dead / starving), so only
        # None falls back to a default; `value or 20` would turn 0 into 20.
        raw_hp = data.get("hp")
        raw_food = data.get("food")
        hp = 0.0 if raw_hp is None else float(raw_hp)
        food = 20 if raw_food is None else int(raw_food)
        prev_hp = 20.0 if self.prev_hp is None else float(self.prev_hp)
        prev_food = 20 if self.prev_food is None else int(self.prev_food)

        # Survival reward: +1 per tick alive
        reward = 1.0

        # Damage penalty: -2 per HP lost
        damage = max(0.0, prev_hp - hp)
        if damage > 0:
            reward -= damage * 2.0

        # Death penalty
        if hp <= 0 or data.get("died", False):
            reward -= 100.0

        # Kill reward. A reply without "kills" keeps the running counter;
        # resetting it to 0 would re-reward old kills on the next full reply.
        new_kills = data.get("kills")
        if new_kills is None:
            new_kills = self.kills
        kills_this_tick = new_kills - self.kills
        if kills_this_tick > 0:
            reward += kills_this_tick * 10.0
        self.kills = new_kills

        if action == eat:
            # Eating when hungry: good
            if prev_food < 14 and food > prev_food:
                reward += 5.0
            # Eating when full: wasted action
            if prev_food >= 18:
                reward -= 1.0

        # Hostile mobs within 6 blocks; mobs without a distance are skipped.
        hostile_nearby = [
            m for m in (data.get("mobs") or [])
            if m.get("hostile") and m.get("dist") is not None and m["dist"] < 6
        ]

        # Fighting when no mobs nearby: wasted
        if action == fight and not hostile_nearby:
            reward -= 0.5

        # Fleeing when HP is low and mobs nearby: good decision
        if action == flee and hp < 8 and hostile_nearby:
            reward += 3.0

        # Idle penalty (doing nothing when threats exist)
        if action == idle and hostile_nearby:
            reward -= 2.0

        # Remember this tick's values for the next transition.
        self.prev_hp = hp
        self.prev_food = food

        return reward

    def reset(self, seed=None, options=None):
        """Start a new episode: relaunch the bot and return the first observation.

        The bot subprocess is restarted, then its stdout is read for up to
        ~30 s until a JSON object arrives that is either a ``"ready"`` event
        or already contains ``"hp"``. If nothing usable arrives (and the bot
        is still running) the method sleeps 3 s and asks for ``observe``. If
        the ``"ready"`` event carries no ``"hp"``, an ``observe`` reply is
        requested as well, mirroring what ``step()`` does, so the first
        observation is not an all-zero placeholder.

        Args:
            seed: Forwarded to ``gym.Env.reset``.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(obs, info)``: the 13-float observation (all zeros if no state
            could be obtained) and ``{"raw": <bot dict or None>}``.

        Note:
            ``readline()`` blocks, so the deadline is only checked between
            lines of bot output.
        """
        super().reset(seed=seed)

        self._start_bot()

        # Wait for bot to spawn
        deadline = time.monotonic() + 30.0
        data = None
        bot_exited = False
        while time.monotonic() < deadline:
            raw = self.proc.stdout.readline()
            if raw == "":
                # End-of-file: the bot process exited. Stop instead of
                # spinning on "" until the deadline.
                bot_exited = True
                break
            line = raw.strip()
            if not line:
                continue
            try:
                d = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Ignore valid JSON that is not an object (e.g. a bare number).
            if isinstance(d, dict) and (d.get("event") == "ready" or "hp" in d):
                data = d
                break

        if data is None:
            if not bot_exited:
                # Fallback: give the bot a moment, then ask explicitly.
                time.sleep(3)
                data = self._send("observe")
        elif "hp" not in data:
            # "ready" without state: fetch a real observation if possible.
            observed = self._send("observe")
            if observed is not None and "hp" in observed:
                data = observed

        self.step_count = 0
        self.total_reward = 0
        self.kills = data.get("kills", 0) if data else 0
        self.prev_hp = data.get("hp", 20) if data else 20
        self.prev_food = data.get("food", 20) if data else 20
        self.alive = True

        obs = self._parse_observation(data)
        self.last_obs = obs
        info = {"raw": data}

        return obs, info

    def step(self, action):
        """Execute one action and return (obs, reward, terminated, truncated, info).

        The action's name is sent to the bot, the method sleeps for
        ``tick_rate`` seconds, and the bot's reply is used as the new state.
        If that reply is missing or has no ``"hp"``, an ``observe`` reply is
        requested instead.

        Args:
            action: Integer index into ``ACTIONS``.

        Returns:
            obs: 13-float observation.
            reward: Shaped reward from ``_calc_reward``.
            terminated: ``True`` when the bot died, or when no usable state
                could be obtained (the reward treats that as death-equivalent,
                so the episode ends instead of collecting -100 every step).
            truncated: ``True`` once ``max_steps`` steps have been taken.
            info: ``raw``, ``step``, ``total_reward``, ``kills``, ``alive``
                and ``connection_lost``.
        """
        self.step_count += 1
        action_name = self.ACTIONS[action]

        # Send action to bot
        data = self._send(action_name)

        # Wait for game tick
        time.sleep(self.tick_rate)

        # Fall back to an explicit observation if the reply carried no state.
        if data is None or "hp" not in data:
            obs_data = self._send("observe")
        else:
            obs_data = data

        obs = self._parse_observation(obs_data)
        reward = self._calc_reward(obs_data, action)
        self.total_reward += reward

        # Check termination
        connection_lost = not obs_data or "hp" not in obs_data
        terminated = False
        if connection_lost:
            terminated = True
        elif (obs_data.get("hp") or 0) <= 0 or obs_data.get("died", False):
            # `or 0` keeps a null hp from raising TypeError in the comparison.
            terminated = True
        if terminated:
            self.alive = False

        # Check truncation (max steps)
        truncated = self.step_count >= self.max_steps

        info = {
            "raw": obs_data,
            "step": self.step_count,
            "total_reward": self.total_reward,
            "kills": self.kills,
            "alive": self.alive,
            "connection_lost": connection_lost,
        }

        self.last_obs = obs

        if self.render_mode == "human":
            self.render()

        return obs, reward, terminated, truncated, info

    def render(self):
        """Print a one-line summary of the latest observation.

        Values are de-normalized from ``self.last_obs`` (HP and food ×20,
        nearest-mob distance ×24, mob count ×10). Prints nothing before the
        first observation exists.
        """
        if self.last_obs is None:
            return

        hp = self.last_obs[0] * 20
        food = self.last_obs[1] * 20
        mob_dist = self.last_obs[2] * 24
        # Round rather than truncate: the count is stored as float32 tenths,
        # and e.g. 0.7 can come back as 6.9999999, which int() would show as 6.
        mob_count = int(round(float(self.last_obs[4]) * 10))
        print(f"  Step {self.step_count}: HP={hp:.0f} Food={food:.0f} "
              f"Mobs={mob_count}@{mob_dist:.0f}b Kills={self.kills} "
              f"R={self.total_reward:.1f}")

    def close(self):
        """Clean up by stopping the bot subprocess via ``_stop_bot``."""
        self._stop_bot()