0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline
Major changes from this session:

Training:
- 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL
- 7,256 merged training examples (up from 3,183)
- New training data: failure modes (85), midloop messaging (27), prompt injection defense (29), personality (32), gold from quarantine bank (232), new tool examples (30), Claude's own experience (10)
- All training data RCON-validated at 100% pass rate
- Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56%

Oracle Bot (Mind's Eye):
- Invisible spectator bot (mineflayer) streams world state via WebSocket
- HTML5 Canvas frontend at mind.mortdec.ai
- Real-time tool trace visualization with expandable entries
- Streaming model tokens during inference
- Gateway integration: fire-and-forget POST /trace on every tool call

Reinforcement Learning:
- Gymnasium environment wrapping mineflayer bot (minecraft_env.py)
- PPO training via Stable Baselines3 (10K param policy network)
- Behavioral cloning pretraining (97.5% accuracy on expert policy)
- Infinite training loop with auto-restart and checkpoint resume
- Bot learns combat, survival, navigation from raw experience

Bot Army:
- 8-soldier marching formation with autonomous combat
- Combat bots using mineflayer-pvp, pathfinder, armor-manager
- Multilingual prayer bots via translategemma:27b (18 languages)
- Frame-based AI architecture: LLM planner + reactive micro-scripts

Infrastructure:
- Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser)
- Billing gateway now tracks all LAN traffic (LAN auto-auth)
- Gateway fallback for empty god-mode responses
- Updated mortdec.ai landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,342 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
minecraft_env.py — Gymnasium environment wrapping a mineflayer bot.
|
||||
|
||||
The bot runs in a Node.js subprocess, communicating via stdin/stdout JSON.
|
||||
The Python Gym env sends actions and receives observations at ~600ms ticks.
|
||||
|
||||
Usage:
|
||||
from minecraft_env import MinecraftCombatEnv
|
||||
env = MinecraftCombatEnv()
|
||||
obs, info = env.reset()
|
||||
while True:
|
||||
action = env.action_space.sample() # or policy(obs)
|
||||
obs, reward, terminated, truncated, info = env.step(action)
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
import os
|
||||
import signal
|
||||
import numpy as np
|
||||
import gymnasium as gym
|
||||
from gymnasium import spaces
|
||||
from pathlib import Path
|
||||
|
||||
# Directory holding the Node.js bot scripts: the "ingame" folder reached by
# going three `.parent` hops up from this file's resolved path.
INGAME_DIR = Path(__file__).resolve().parent.parent.parent / "ingame"
|
||||
|
||||
|
||||
class MinecraftCombatEnv(gym.Env):
    """Minecraft combat survival environment driven through a mineflayer bot.

    The bot is a Node.js subprocess (``rl_bot.js`` under ``INGAME_DIR``). This
    class writes one command per line to its stdin and reads newline-delimited
    JSON replies from its stdout.

    Observation: 13 float32 values in [0, 1] (see ``_parse_observation``).
    Action: an index into ``ACTIONS``.
    """

    metadata = {"render_modes": ["human"], "render_fps": 2}

    # Discrete actions; the list index is the action id sent by the policy.
    ACTIONS = ["forward", "fight", "flee", "eat", "sprint", "idle"]

    # Hostile mob type names. Not referenced inside this class: observation and
    # reward code rely on the per-mob "hostile" flag reported by the bot.
    HOSTILE = {
        "zombie", "husk", "skeleton", "creeper", "spider", "cave_spider",
        "witch", "enderman", "drowned", "stray", "phantom", "parched",
        "camel_husk", "slime", "magma_cube",
    }

    # Substrings that mark an inventory entry as edible.
    FOOD_KEYWORDS = ("beef", "bread", "pork", "chicken", "apple", "potato", "cod")

    # Length of the observation vector.
    OBS_SIZE = 13

    def __init__(
        self,
        host="192.168.0.244",
        port=25568,
        username="RLBot",
        max_steps=600,       # 600 ticks × 0.6s = 6 minutes per episode
        tick_rate=0.6,       # seconds per tick (sword cooldown rate)
        render_mode=None,
    ):
        """Configure the environment; the bot itself is only started by ``reset``.

        Args:
            host: Server host passed to the bot script.
            port: Server port passed to the bot script.
            username: Player name passed to the bot script.
            max_steps: Number of steps after which an episode is truncated.
            tick_rate: Seconds slept in every ``step`` after sending the action.
            render_mode: ``"human"`` prints a status line after every step.
        """
        super().__init__()
        self.host = host
        self.port = port
        self.username = username
        self.max_steps = max_steps
        self.tick_rate = tick_rate
        self.render_mode = render_mode

        # Observation space: 13 floats normalized to [0, 1]
        # [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
        #  has_sword, has_armor, has_food, is_day, on_water,
        #  y_level_norm, damage_taken_this_tick, is_fleeing]
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(self.OBS_SIZE,), dtype=np.float32
        )

        # Action space: 6 discrete actions
        self.action_space = spaces.Discrete(len(self.ACTIONS))

        # Internal state
        self.proc = None
        self.step_count = 0
        self.total_reward = 0
        self.kills = 0
        self.prev_hp = 20.0
        self.prev_food = 20
        self.alive = False
        self.last_obs = None

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _num(value, default):
        """Return ``value`` as a float, or ``default`` if it is None/non-numeric.

        Unlike ``value or default`` this keeps a legitimate 0 (dead, starving,
        y == 0) instead of silently replacing it with the default.
        """
        try:
            return float(value)
        except (TypeError, ValueError):
            return float(default)

    def _read_json(self, timeout, accept=None):
        """Read stdout lines until one is a JSON object that ``accept`` allows.

        Returns the decoded dict, or None on EOF or once ``timeout`` seconds have
        passed. ``readline()`` blocks, so the deadline is only checked between
        lines; it bounds how long noise is skipped, not a silent bot.
        """
        if self.proc is None or self.proc.stdout is None:
            return None
        deadline = time.time() + timeout
        while time.time() < deadline:
            raw = self.proc.stdout.readline()
            if raw == "":
                # EOF: the bot closed stdout (most likely exited). Without this
                # check the loop would spin on empty reads until the deadline.
                return None
            raw = raw.strip()
            if not raw:
                continue
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                continue  # log noise, not a protocol message
            # Callers index the reply like a dict, so skip bare JSON scalars/lists.
            if isinstance(msg, dict) and (accept is None or accept(msg)):
                return msg
        return None

    # ------------------------------------------------------- subprocess control

    def _start_bot(self):
        """Start a fresh bot subprocess, cleaning up any previous one first."""
        if self.proc is not None:
            # Also covers a process that already exited: it still has to be
            # reaped and its pipes closed.
            self._stop_bot()

        bot_script = INGAME_DIR / "rl_bot.js"
        self.proc = subprocess.Popen(
            ["node", str(bot_script), self.host, str(self.port), self.username],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,  # line buffered
        )

    def _stop_bot(self):
        """Stop the bot: ask it to quit, kill it if needed, then release pipes."""
        proc, self.proc = self.proc, None
        if proc is None:
            return

        try:
            if proc.poll() is None:
                proc.stdin.write("quit\n")
                proc.stdin.flush()
                proc.wait(timeout=3)
        except (OSError, ValueError, subprocess.TimeoutExpired):
            pass  # best effort; fall through to the hard kill below

        if proc.poll() is None:
            try:
                proc.kill()
                proc.wait(timeout=3)  # reap it so no zombie is left behind
            except (OSError, subprocess.TimeoutExpired):
                pass

        for pipe in (proc.stdin, proc.stdout):
            if pipe is None:
                continue
            try:
                pipe.close()
            except (OSError, ValueError):
                pass  # closing a broken pipe may raise; nothing left to do

    def _send(self, cmd):
        """Send one command line to the bot and return its JSON reply dict.

        Returns None when there is no bot, the pipe is broken, the bot hit EOF,
        or no JSON object arrived within the 5 second window.
        """
        if self.proc is None:
            return None
        try:
            self.proc.stdin.write(cmd + "\n")
            self.proc.stdin.flush()
            return self._read_json(5.0)
        except (OSError, ValueError):  # BrokenPipeError is an OSError
            return None

    # ------------------------------------------------------ observation / reward

    def _parse_observation(self, data):
        """Convert the bot's JSON state into the 13-element observation vector.

        Returns all zeros when ``data`` is missing or has no ``"hp"`` key. The
        result is clipped to [0, 1] so it always lies in ``observation_space``.
        """
        if not data or "hp" not in data:
            return np.zeros(self.OBS_SIZE, dtype=np.float32)

        current_hp = self._num(data.get("hp"), 0.0)
        food = self._num(data.get("food"), 0.0)

        # Distances of hostile mobs; a mob without "dist" counts as far away.
        hostile_dists = [
            self._num(mob.get("dist"), 24.0)
            for mob in (data.get("mobs") or [])
            if isinstance(mob, dict) and mob.get("hostile", False)
        ]
        nearest_dist = min(hostile_dists) / 24.0 if hostile_dists else 1.0
        # The angle is not derived from the bot state; it is a constant 0.5.
        nearest_angle = 0.5
        mob_count = len(hostile_dists) / 10.0

        # Inventory flags. The inventory is searched as text; a list/tuple of
        # entries is joined first so substring matching works for both forms.
        inv = data.get("inv") or ""
        if isinstance(inv, (list, tuple)):
            inv = " ".join(str(item) for item in inv)
        elif not isinstance(inv, str):
            inv = str(inv)
        has_sword = 1.0 if "sword" in inv else 0.0
        has_armor = 1.0 if (data.get("armor") or "none") != "none" else 0.0
        has_food = 1.0 if any(word in inv for word in self.FOOD_KEYWORDS) else 0.0

        # World state
        is_day = 1.0 if data.get("day", True) else 0.0
        on_water = 1.0 if data.get("below", "") == "water" else 0.0

        # Y level (normalize: 0=bedrock, 320=max → 0-1); missing y defaults to 64.
        pos = data.get("pos")
        y = self._num(pos.get("y"), 64.0) if isinstance(pos, dict) else 64.0
        y_norm = y / 320.0

        # Damage taken since the previous step's HP.
        damage = max(0.0, self._num(self.prev_hp, 20.0) - current_hp) / 20.0

        # Is currently fleeing (HP < 5)
        is_fleeing = 1.0 if current_hp < 5 else 0.0

        obs = np.array([
            current_hp / 20.0, food / 20.0, nearest_dist, nearest_angle, mob_count,
            has_sword, has_armor, has_food, is_day, on_water,
            y_norm, damage, is_fleeing,
        ], dtype=np.float32)

        return np.clip(obs, 0.0, 1.0)

    def _calc_reward(self, data, action):
        """Compute the reward for one transition and update the previous state.

        Returns -100.0 when ``data`` is missing or has no ``"hp"`` key. Otherwise
        combines survival, damage, death, kill, eating, fighting, fleeing and
        idling terms, then stores the new HP/food in ``prev_hp``/``prev_food``.
        """
        if not data or "hp" not in data:
            return -100.0  # lost connection = death equivalent

        action_name = self.ACTIONS[int(action)]
        hp = self._num(data.get("hp"), 0.0)
        food = self._num(data.get("food"), 20.0)
        prev_hp = self._num(self.prev_hp, 20.0)
        prev_food = self._num(self.prev_food, 20.0)

        # Survival reward: +1 per tick alive
        reward = 1.0

        # Damage penalty: -2 per HP lost
        reward -= max(0.0, prev_hp - hp) * 2.0

        # Death penalty
        if hp <= 0 or data.get("died", False):
            reward -= 100.0

        # Kill reward; a reply without "kills" keeps the current counter.
        new_kills = int(self._num(data.get("kills"), self.kills))
        kills_this_tick = new_kills - self.kills
        if kills_this_tick > 0:
            reward += kills_this_tick * 10.0
        self.kills = new_kills

        if action_name == "eat":
            # Eating when hungry: good
            if prev_food < 14 and food > prev_food:
                reward += 5.0
            # Eating when full: wasted action
            if prev_food >= 18:
                reward -= 1.0

        hostile_nearby = any(
            isinstance(mob, dict)
            and mob.get("hostile")
            and self._num(mob.get("dist"), 24.0) < 6
            for mob in (data.get("mobs") or [])
        )

        # Fighting when no mobs nearby: wasted
        if action_name == "fight" and not hostile_nearby:
            reward -= 0.5

        # Fleeing when HP is low and mobs nearby: good decision
        if action_name == "flee" and hp < 8 and hostile_nearby:
            reward += 3.0

        # Idle penalty (doing nothing when threats exist)
        if action_name == "idle" and hostile_nearby:
            reward -= 2.0

        # Update state
        self.prev_hp = hp
        self.prev_food = food

        return reward

    # ------------------------------------------------------------- gym interface

    def reset(self, seed=None, options=None):
        """Restart the bot and begin a new episode.

        Returns ``(obs, info)`` where ``info["raw"]`` is the bot's JSON state, or
        None if the bot never reported anything.
        """
        super().reset(seed=seed)

        self._start_bot()

        def is_spawn_signal(msg):
            return msg.get("event") == "ready" or "hp" in msg

        # Wait for bot to spawn
        try:
            data = self._read_json(30.0, accept=is_spawn_signal)
        except (OSError, ValueError):
            data = None

        if data is None:
            # Fallback: give the bot a moment, then ask for its state.
            time.sleep(3)
            data = self._send("observe")
        elif "hp" not in data:
            # A bare "ready" event carries no state; request one so the first
            # observation is not all zeros.
            state = self._send("observe")
            if state and "hp" in state:
                data = state

        self.step_count = 0
        self.total_reward = 0
        self.kills = int(self._num(data.get("kills"), 0)) if data else 0
        self.prev_hp = self._num(data.get("hp"), 20.0) if data else 20.0
        self.prev_food = self._num(data.get("food"), 20.0) if data else 20.0
        self.alive = bool(data)

        obs = self._parse_observation(data)
        self.last_obs = obs
        info = {"raw": data}

        return obs, info

    def step(self, action):
        """Execute one action and return (obs, reward, terminated, truncated, info).

        The episode terminates when the bot reports death or when no usable
        state could be obtained (lost connection), so the caller's next
        ``reset`` restarts the bot.
        """
        action = int(action)  # accept numpy scalars/0-d arrays from a policy
        self.step_count += 1
        action_name = self.ACTIONS[action]

        # Send action to bot
        data = self._send(action_name)

        # Wait for game tick
        time.sleep(self.tick_rate)

        # Use the action's reply as the state; ask explicitly if it had none.
        if data is None or "hp" not in data:
            obs_data = self._send("observe")
        else:
            obs_data = data

        obs = self._parse_observation(obs_data)
        reward = self._calc_reward(obs_data, action)
        self.total_reward += reward

        # Check termination
        connection_lost = not obs_data or "hp" not in obs_data
        died = not connection_lost and (
            self._num(obs_data.get("hp"), 0.0) <= 0
            or bool(obs_data.get("died", False))
        )
        terminated = connection_lost or died
        if terminated:
            self.alive = False

        # Check truncation (max steps)
        truncated = self.step_count >= self.max_steps

        info = {
            "raw": obs_data,
            "step": self.step_count,
            "total_reward": self.total_reward,
            "kills": self.kills,
            "alive": self.alive,
            "connection_lost": connection_lost,
        }

        self.last_obs = obs

        if self.render_mode == "human":
            self.render()

        return obs, reward, terminated, truncated, info

    def render(self):
        """Print a one-line summary of the most recent observation, if any."""
        if self.last_obs is not None:
            hp = self.last_obs[0] * 20
            food = self.last_obs[1] * 20
            mob_dist = self.last_obs[2] * 24
            mob_count = int(self.last_obs[4] * 10)
            print(f"  Step {self.step_count}: HP={hp:.0f} Food={food:.0f} "
                  f"Mobs={mob_count}@{mob_dist:.0f}b Kills={self.kills} "
                  f"R={self.total_reward:.1f}")

    def close(self):
        """Stop the bot subprocess."""
        self._stop_bot()
|
||||
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
pretrain_policy.py — Give the RL policy a head start via behavioral cloning.
|
||||
|
||||
Generates synthetic expert demonstrations from our hand-coded survival rules,
|
||||
then trains the policy network to imitate them. The resulting weights become
|
||||
the starting point for PPO (instead of random initialization).
|
||||
|
||||
Usage:
|
||||
python3 training/rl/pretrain_policy.py
|
||||
# Then run train_combat.py — it will load the pretrained checkpoint
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from pathlib import Path
|
||||
from stable_baselines3 import PPO
|
||||
from gymnasium import spaces
|
||||
|
||||
# Base directory for checkpoints: three `.parent` hops up from this file's
# resolved path.
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
# Directory where pretrain() writes combat_ppo_pretrained.zip.
CKPT_DIR = ROOT / "training" / "rl" / "checkpoints"
|
||||
|
||||
# Actions: 0=forward, 1=fight, 2=flee, 3=eat, 4=sprint, 5=idle
|
||||
# Obs: [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
|
||||
# has_sword, has_armor, has_food, is_day, on_water,
|
||||
# y_level_norm, damage_taken, is_fleeing]
|
||||
|
||||
def expert_action(obs):
    """Return the hand-coded survival policy's action index for one observation.

    ``obs`` is indexable with the 13-field layout listed above this function.
    The rules are checked in priority order and the first match wins:
    critical HP, overwhelmed, hungry, melee range, approach, dodge, explore,
    and finally idle.
    """
    FORWARD, FIGHT, FLEE, EAT, SPRINT, IDLE = range(6)

    # Pull out the fields the rules look at (index 12, is_fleeing, is read but
    # plays no part in the decision).
    (health, satiety, threat_range, threat_density,
     armed, carrying_food, hurt, _fleeing) = (
        obs[i] for i in (0, 1, 2, 4, 5, 7, 11, 12)
    )

    can_eat_now = bool(satiety < 0.7 and carrying_food)

    # 1. Critical HP (< 5 HP): eat if that is possible, otherwise run.
    if health < 0.25:
        return EAT if can_eat_now else FLEE

    # 2. Overwhelmed: many mobs while not at comfortable HP.
    if threat_density > 0.3 and health < 0.6:
        return FLEE

    # 3. Hungry with food on hand and some HP missing.
    if can_eat_now and health < 0.8:
        return EAT

    # 4/5. Armed: strike inside ~6 blocks, close in inside ~12 blocks.
    if armed:
        if threat_range < 0.25:
            return FIGHT
        if threat_range < 0.5:
            return FORWARD

    # 6. Taking damage: sprint to dodge.
    if hurt > 0:
        return SPRINT

    # 7. Nothing close: explore; otherwise stay put.
    return FORWARD if threat_range > 0.8 else IDLE
|
||||
|
||||
|
||||
def generate_expert_data(n_samples=50000, seed=None):
    """Sample random observations and label each with ``expert_action``.

    Args:
        n_samples: Number of (observation, action) pairs; values below zero are
            treated as zero.
        seed: Optional seed for ``numpy.random.default_rng`` to make the data
            reproducible. ``None`` draws fresh entropy on every call.

    Returns:
        ``(obs, actions)``: a float32 array of shape ``(n_samples, 13)`` and an
        int64 array of shape ``(n_samples,)``. The shapes hold for
        ``n_samples == 0`` too.
    """
    n = max(int(n_samples), 0)
    rng = np.random.default_rng(seed)

    # Fill one column per feature instead of looping over samples in Python.
    obs = np.zeros((n, 13), dtype=np.float32)
    obs[:, 0] = rng.beta(2, 1, size=n)        # hp: skew toward higher
    obs[:, 1] = rng.beta(2, 1, size=n)        # food: skew toward higher
    obs[:, 2] = rng.uniform(0, 1, size=n)     # mob distance
    obs[:, 3] = rng.uniform(0, 1, size=n)     # mob angle
    obs[:, 4] = rng.beta(1, 3, size=n)        # mob count: skew toward fewer
    obs[:, 5] = rng.random(n) > 0.3           # has_sword: 70% chance
    obs[:, 6] = rng.random(n) > 0.4           # has_armor: 60% chance
    obs[:, 7] = rng.random(n) > 0.3           # has_food: 70% chance
    obs[:, 8] = rng.random(n) > 0.4           # is_day: 60% chance
    obs[:, 9] = rng.random(n) > 0.85          # on_water: 15% chance
    obs[:, 10] = rng.uniform(0.15, 0.3, size=n)  # y_level: surface range
    obs[:, 11] = rng.beta(1, 5, size=n)       # damage: skew toward low
    # is_fleeing is derived from the (float32) hp column, not sampled.
    obs[:, 12] = obs[:, 0] < 0.25

    # The expert is a scalar rule chain, so it is applied row by row.
    actions = np.fromiter(
        (expert_action(row) for row in obs), dtype=np.int64, count=n
    )
    return obs, actions
|
||||
|
||||
|
||||
def pretrain(n_samples=50000, batch_size=256, n_epochs=20, learning_rate=1e-3):
    """Behaviour-clone the expert rules into a PPO policy and save it.

    Generates expert demonstrations, builds a PPO ``MlpPolicy`` around a dummy
    environment with the same spaces as the real one, trains the actor head
    with cross-entropy against the expert actions, and saves the model to
    ``CKPT_DIR / "combat_ppo_pretrained.zip"``.

    Args:
        n_samples: Number of synthetic demonstrations to generate.
        batch_size: Mini-batch size for supervised training.
        n_epochs: Number of passes over the demonstrations.
        learning_rate: Adam learning rate for the supervised phase.

    Raises:
        ValueError: If no demonstrations were generated or ``batch_size`` < 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print(f"Generating {n_samples:,} expert demonstrations...")
    obs_data, act_data = generate_expert_data(n_samples)
    if len(obs_data) == 0:
        raise ValueError("n_samples must be positive to pretrain")

    # Show action distribution
    unique, counts = np.unique(act_data, return_counts=True)
    action_names = ["forward", "fight", "flee", "eat", "sprint", "idle"]
    print("\nExpert action distribution:")
    for a, c in zip(unique, counts):
        print(f"  {action_names[a]:10} {c:6} ({c/len(act_data)*100:.1f}%)")

    # Create a PPO model with the same architecture
    import gymnasium as gym

    class DummyMCEnv(gym.Env):
        """Stand-in env exposing only the observation/action spaces."""

        metadata = {"render_modes": []}

        def __init__(self):
            self.observation_space = spaces.Box(low=0, high=1, shape=(13,), dtype=np.float32)
            self.action_space = spaces.Discrete(6)

        def reset(self, **kw):
            return np.zeros(13, dtype=np.float32), {}

        def step(self, a):
            return np.zeros(13, dtype=np.float32), 0, True, False, {}

    dummy_env = DummyMCEnv()

    model = PPO(
        "MlpPolicy", dummy_env, verbose=0,
        policy_kwargs={"net_arch": [64, 64]},
    )

    # Extract the policy network and train via supervised learning
    policy = model.policy
    optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # The policy may have been placed on an accelerator; the training tensors
    # must live on the same device or the forward pass fails.
    device = policy.device
    obs_tensor = torch.as_tensor(obs_data, dtype=torch.float32, device=device)
    act_tensor = torch.as_tensor(act_data, dtype=torch.long, device=device)
    n_total = len(obs_tensor)

    print(f"\nPretraining policy ({sum(p.numel() for p in policy.parameters()):,} params)...")

    for epoch in range(n_epochs):
        # Shuffle
        perm = torch.randperm(n_total, device=device)
        total_loss = 0
        correct = 0
        n_batches = 0

        for i in range(0, n_total, batch_size):
            idx = perm[i:i+batch_size]
            batch_obs = obs_tensor[idx]
            batch_act = act_tensor[idx]

            # Forward through the actor path only: features -> actor latent -> logits
            features = policy.extract_features(batch_obs, policy.pi_features_extractor)
            latent = policy.mlp_extractor.forward_actor(features)
            logits = policy.action_net(latent)

            loss = criterion(logits, batch_act)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (logits.argmax(dim=1) == batch_act).sum().item()
            n_batches += 1

        accuracy = correct / n_total * 100
        avg_loss = total_loss / n_batches
        print(f"  Epoch {epoch+1:2d}/{n_epochs}: loss={avg_loss:.4f} accuracy={accuracy:.1f}%")

    # Save the pretrained model
    CKPT_DIR.mkdir(parents=True, exist_ok=True)
    save_path = CKPT_DIR / "combat_ppo_pretrained.zip"
    model.save(str(save_path))
    print(f"\nPretrained model saved to {save_path}")
    print("PPO will resume from this checkpoint and improve via RL.")
|
||||
|
||||
|
||||
# Run the behavioural-cloning pretraining when executed as a script.
if __name__ == "__main__":
|
||||
pretrain()
|
||||
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
train_combat.py — Train a small policy network for Minecraft combat via PPO.
|
||||
|
||||
The agent learns to fight, flee, eat, and survive in a hostile Minecraft world.
|
||||
Uses the MinecraftCombatEnv gymnasium wrapper which controls a mineflayer bot.
|
||||
|
||||
Usage:
|
||||
# Install deps first:
|
||||
pip install gymnasium stable-baselines3 torch
|
||||
|
||||
# Train (on steel141 with mc-train conda env):
|
||||
python3 training/rl/train_combat.py
|
||||
|
||||
# Train with custom settings:
|
||||
python3 training/rl/train_combat.py --timesteps 50000 --host 192.168.0.244 --port 25568
|
||||
|
||||
# Evaluate a trained model:
|
||||
python3 training/rl/train_combat.py --eval --model training/rl/checkpoints/combat_ppo.zip
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to path
|
||||
# Three `.parent` hops up from this file's resolved path; also the base of
# the checkpoint directory used by train().
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
# Put ROOT first on the import path so `training.rl.minecraft_env` can be
# imported by train()/evaluate() when this file is run as a script.
sys.path.insert(0, str(ROOT))
|
||||
|
||||
|
||||
def train(args):
    """Train (or resume training) the PPO combat policy.

    Reads ``host``, ``port``, ``timesteps``, ``max_steps``, ``save_freq`` and
    ``verbose`` from ``args``. If any ``combat_ppo_*.zip`` exists in the
    checkpoint directory, the most recently modified one is loaded and training
    continues from it; otherwise a fresh model is created. The final model is
    written to ``combat_ppo_final.zip`` and the environment is always closed.
    """
    from stable_baselines3 import PPO
    from stable_baselines3.common.callbacks import CheckpointCallback
    from training.rl.minecraft_env import MinecraftCombatEnv

    print("=== Minecraft RL Combat Training ===")
    print(f"Host: {args.host}:{args.port}")
    print(f"Timesteps: {args.timesteps}")
    print("Policy: MlpPolicy (3-layer MLP)")
    print()

    # Create environment
    env = MinecraftCombatEnv(
        host=args.host,
        port=args.port,
        username=f"RLBot_{os.getpid() % 100}",
        max_steps=args.max_steps,
        render_mode="human" if args.verbose else None,
    )

    try:
        # Checkpointing
        ckpt_dir = ROOT / "training" / "rl" / "checkpoints"
        ckpt_dir.mkdir(parents=True, exist_ok=True)
        tb_log_dir = str(ckpt_dir / "tb_logs")

        checkpoint_cb = CheckpointCallback(
            save_freq=args.save_freq,
            save_path=str(ckpt_dir),
            name_prefix="combat_ppo",
        )

        # Check for existing checkpoint to resume from (newest by mtime)
        latest_ckpt = None
        ckpts = sorted(ckpt_dir.glob("combat_ppo_*.zip"), key=lambda p: p.stat().st_mtime)
        if ckpts:
            latest_ckpt = str(ckpts[-1])
            print(f"RESUMING from: {latest_ckpt}")
        resuming = latest_ckpt is not None

        if resuming:
            # Load existing model and continue training. The learning rate is
            # overridden through custom_objects so it is in place before the
            # model builds its schedule; assigning model.learning_rate after
            # loading would not change the schedule PPO reads during updates.
            model = PPO.load(
                latest_ckpt,
                env=env,
                tensorboard_log=tb_log_dir,
                custom_objects={"learning_rate": 3e-4},  # can adjust between runs
            )
        else:
            # Fresh model
            model = PPO(
                "MlpPolicy",
                env,
                verbose=1,
                learning_rate=3e-4,
                n_steps=256,          # collect 256 steps before update
                batch_size=64,
                n_epochs=4,
                gamma=0.99,           # discount factor
                gae_lambda=0.95,
                clip_range=0.2,
                ent_coef=0.01,        # entropy bonus for exploration
                policy_kwargs={
                    "net_arch": [64, 64],  # 2 hidden layers of 64 units
                },
                tensorboard_log=tb_log_dir,
            )

        print(f"Policy network params: {sum(p.numel() for p in model.policy.parameters()):,}")
        print(f"Training for {args.timesteps} timesteps...")
        print()

        try:
            model.learn(
                total_timesteps=args.timesteps,
                callback=checkpoint_cb,
                progress_bar=True,
                # Keep the step counter when resuming so checkpoint file names
                # continue instead of restarting and overwriting earlier ones.
                reset_num_timesteps=not resuming,
            )
        except KeyboardInterrupt:
            print("\nTraining interrupted.")

        # Save final model
        final_path = ckpt_dir / "combat_ppo_final.zip"
        model.save(str(final_path))
        print(f"\nModel saved to {final_path}")
    finally:
        # Always stop the bot subprocess, even if loading or training failed.
        env.close()
|
||||
|
||||
|
||||
def evaluate(args):
    """Run a saved PPO model for ``args.eval_episodes`` episodes and report.

    Reads ``model``, ``host``, ``port``, ``max_steps`` and ``eval_episodes``
    from ``args``. Prints per-episode reward, kills and steps, then the
    averages. The environment is always closed, and a non-positive episode
    count prints a notice instead of dividing by zero.
    """
    from stable_baselines3 import PPO
    from training.rl.minecraft_env import MinecraftCombatEnv

    print(f"=== Evaluating {args.model} ===")

    env = MinecraftCombatEnv(
        host=args.host,
        port=args.port,
        username="RLBot_eval",
        max_steps=args.max_steps,
        render_mode="human",
    )

    try:
        model = PPO.load(args.model)

        episodes = args.eval_episodes
        if episodes <= 0:
            print("No evaluation episodes requested.")
            return

        total_reward = 0
        total_kills = 0

        for ep in range(episodes):
            obs, info = env.reset()
            ep_reward = 0
            done = False

            while not done:
                action, _ = model.predict(obs, deterministic=True)
                # predict() returns a numpy value; hand the env a plain int.
                obs, reward, terminated, truncated, info = env.step(int(action))
                ep_reward += reward
                done = terminated or truncated

            total_reward += ep_reward
            total_kills += info.get("kills", 0)
            print(f"  Episode {ep+1}: reward={ep_reward:.1f} kills={info.get('kills', 0)} steps={info.get('step', 0)}")

        print(f"\nAverage: reward={total_reward/episodes:.1f} kills={total_kills/episodes:.1f}")
    finally:
        # Stop the bot subprocess even if loading or an episode failed.
        env.close()
|
||||
|
||||
|
||||
def main():
    """Parse the command line and dispatch to ``evaluate`` or ``train``.

    ``--eval`` selects evaluation; without it the script trains.
    """
    cli = argparse.ArgumentParser(description="Minecraft RL Combat Training")

    # One (flag, add_argument keyword options) entry per command-line option.
    option_table = (
        ("--host", {"default": "192.168.0.244"}),
        ("--port", {"type": int, "default": 25568}),
        ("--timesteps", {"type": int, "default": 10000}),
        ("--max-steps", {"type": int, "default": 300}),
        ("--save-freq", {"type": int, "default": 2000}),
        ("--verbose", {"action": "store_true"}),
        ("--eval", {"action": "store_true"}),
        ("--eval-episodes", {"type": int, "default": 5}),
        ("--model", {"default": "training/rl/checkpoints/combat_ppo_final.zip"}),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    args = cli.parse_args()

    entry_point = evaluate if args.eval else train
    entry_point(args)
|
||||
|
||||
|
||||
# Parse CLI arguments and train or evaluate when executed as a script.
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user