0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline
Major changes from this session:

Training:
- 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL
- 7,256 merged training examples (up from 3,183)
- New training data: failure modes (85), midloop messaging (27), prompt injection defense (29), personality (32), gold from quarantine bank (232), new tool examples (30), Claude's own experience (10)
- All training data RCON-validated at 100% pass rate
- Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56%

Oracle Bot (Mind's Eye):
- Invisible spectator bot (mineflayer) streams world state via WebSocket
- HTML5 Canvas frontend at mind.mortdec.ai
- Real-time tool trace visualization with expandable entries
- Streaming model tokens during inference
- Gateway integration: fire-and-forget POST /trace on every tool call

Reinforcement Learning:
- Gymnasium environment wrapping mineflayer bot (minecraft_env.py)
- PPO training via Stable Baselines3 (10K param policy network)
- Behavioral cloning pretraining (97.5% accuracy on expert policy)
- Infinite training loop with auto-restart and checkpoint resume
- Bot learns combat, survival, navigation from raw experience

Bot Army:
- 8-soldier marching formation with autonomous combat
- Combat bots using mineflayer-pvp, pathfinder, armor-manager
- Multilingual prayer bots via translategemma:27b (18 languages)
- Frame-based AI architecture: LLM planner + reactive micro-scripts

Infrastructure:
- Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser)
- Billing gateway now tracks all LAN traffic (LAN auto-auth)
- Gateway fallback for empty god-mode responses
- Updated mortdec.ai landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
train_combat.py — Train a small policy network for Minecraft combat via PPO.
|
||||
|
||||
The agent learns to fight, flee, eat, and survive in a hostile Minecraft world.
|
||||
Uses the MinecraftCombatEnv gymnasium wrapper which controls a mineflayer bot.
|
||||
|
||||
Usage:
|
||||
# Install deps first:
|
||||
pip install gymnasium stable-baselines3 torch
|
||||
|
||||
# Train (on steel141 with mc-train conda env):
|
||||
python3 training/rl/train_combat.py
|
||||
|
||||
# Train with custom settings:
|
||||
python3 training/rl/train_combat.py --timesteps 50000 --host 192.168.0.244 --port 25568
|
||||
|
||||
# Evaluate a trained model:
|
||||
python3 training/rl/train_combat.py --eval --model training/rl/checkpoints/combat_ppo_final.zip
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to path
|
||||
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
|
||||
def train(args):
    """Train, or resume training of, a PPO combat policy on a Minecraft server.

    Creates a ``MinecraftCombatEnv`` connected to ``args.host:args.port``. If a
    ``combat_ppo_*.zip`` file already exists under ``training/rl/checkpoints``
    (this pattern also matches ``combat_ppo_final.zip``), the most recently
    modified one is loaded and training continues from it; otherwise a fresh
    ``MlpPolicy`` PPO model is built. Checkpoints are written every
    ``args.save_freq`` steps, and ``combat_ppo_final.zip`` is written once
    ``model.learn`` returns or is interrupted with Ctrl-C.

    Args:
        args: Parsed CLI namespace. Reads ``host``, ``port``, ``timesteps``,
            ``max_steps``, ``save_freq`` and ``verbose``.
    """
    # Imported lazily so that importing this module does not require SB3/torch.
    from stable_baselines3 import PPO
    from stable_baselines3.common.callbacks import CheckpointCallback
    from training.rl.minecraft_env import MinecraftCombatEnv

    print("=== Minecraft RL Combat Training ===")
    print(f"Host: {args.host}:{args.port}")
    print(f"Timesteps: {args.timesteps}")
    print("Policy: MlpPolicy (3-layer MLP)")
    print()

    # Create environment. The PID-derived suffix varies the bot username
    # between processes.
    env = MinecraftCombatEnv(
        host=args.host,
        port=args.port,
        username=f"RLBot_{os.getpid() % 100}",
        max_steps=args.max_steps,
        render_mode="human" if args.verbose else None,
    )

    # Everything after the environment exists runs under try/finally so the
    # environment is closed even when loading, training or saving raises.
    try:
        # Checkpointing
        ckpt_dir = ROOT / "training" / "rl" / "checkpoints"
        ckpt_dir.mkdir(parents=True, exist_ok=True)
        tb_log_dir = str(ckpt_dir / "tb_logs")

        checkpoint_cb = CheckpointCallback(
            save_freq=args.save_freq,
            save_path=str(ckpt_dir),
            name_prefix="combat_ppo",
        )

        # Check for an existing checkpoint to resume from (newest by mtime).
        newest = max(
            ckpt_dir.glob("combat_ppo_*.zip"),
            key=lambda p: p.stat().st_mtime,
            default=None,
        )
        latest_ckpt = str(newest) if newest is not None else None

        if latest_ckpt:
            print(f"RESUMING from: {latest_ckpt}")
            # Load the existing model and continue training. The learning rate
            # is passed as a load-time override: assigning
            # ``model.learning_rate`` after ``load()`` returns does not rebuild
            # the LR schedule that SB3 sets up while loading.
            model = PPO.load(
                latest_ckpt,
                env=env,
                tensorboard_log=tb_log_dir,
                learning_rate=3e-4,  # can adjust between runs
            )
        else:
            # Fresh model
            model = PPO(
                "MlpPolicy",
                env,
                verbose=1,
                learning_rate=3e-4,
                n_steps=256,  # collect 256 steps before update
                batch_size=64,
                n_epochs=4,
                gamma=0.99,  # discount factor
                gae_lambda=0.95,
                clip_range=0.2,
                ent_coef=0.01,  # entropy bonus for exploration
                policy_kwargs={
                    "net_arch": [64, 64],  # 2 hidden layers of 64 units
                },
                tensorboard_log=tb_log_dir,
            )

        print(f"Policy network params: {sum(p.numel() for p in model.policy.parameters()):,}")
        print(f"Training for {args.timesteps} timesteps...")
        print()

        try:
            model.learn(
                total_timesteps=args.timesteps,
                callback=checkpoint_cb,
                progress_bar=True,
            )
        except KeyboardInterrupt:
            # Ctrl-C is a normal way to stop; fall through and save progress.
            print("\nTraining interrupted.")

        # Save final model
        final_path = ckpt_dir / "combat_ppo_final.zip"
        model.save(str(final_path))
        print(f"\nModel saved to {final_path}")
    finally:
        env.close()
|
||||
|
||||
|
||||
def evaluate(args):
    """Run a saved PPO policy for several episodes and print the results.

    Loads the model from ``args.model``, plays ``args.eval_episodes`` episodes
    in a ``MinecraftCombatEnv`` using deterministic actions, prints one summary
    line per episode, and finally prints the average reward and kill count.

    Args:
        args: Parsed CLI namespace. Reads ``model``, ``host``, ``port``,
            ``max_steps`` and ``eval_episodes``.
    """
    # Imported lazily so that importing this module does not require SB3/torch.
    from stable_baselines3 import PPO
    from training.rl.minecraft_env import MinecraftCombatEnv

    print(f"=== Evaluating {args.model} ===")

    episodes = args.eval_episodes
    if episodes <= 0:
        # Avoids a ZeroDivisionError when computing the averages below.
        print("Nothing to evaluate: --eval-episodes must be at least 1.")
        return

    # Load the model before creating the environment so that a bad model path
    # fails without leaving an open environment behind.
    model = PPO.load(args.model)

    env = MinecraftCombatEnv(
        host=args.host,
        port=args.port,
        username="RLBot_eval",
        max_steps=args.max_steps,
        render_mode="human",
    )

    try:
        total_reward = 0
        total_kills = 0

        for ep in range(episodes):
            obs, info = env.reset()
            ep_reward = 0
            done = False

            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, info = env.step(action)
                ep_reward += reward
                done = terminated or truncated

            # ``info`` here is the dict returned by the episode's final step.
            ep_kills = info.get("kills", 0)
            total_reward += ep_reward
            total_kills += ep_kills
            print(f"  Episode {ep+1}: reward={ep_reward:.1f} kills={ep_kills} steps={info.get('step', 0)}")

        print(f"\nAverage: reward={total_reward/episodes:.1f} kills={total_kills/episodes:.1f}")
    finally:
        # Close the environment even if an episode raises.
        env.close()
|
||||
|
||||
|
||||
def main():
    """Parse command-line options and run either evaluation or training.

    With ``--eval`` the parsed namespace is passed to ``evaluate``; otherwise
    it is passed to ``train``.
    """
    # Anchor the default model path to the project root, matching where
    # train() writes combat_ppo_final.zip, so --eval also works when the
    # script is launched from another working directory.
    default_model = ROOT / "training" / "rl" / "checkpoints" / "combat_ppo_final.zip"

    parser = argparse.ArgumentParser(
        description="Minecraft RL Combat Training",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--host", default="192.168.0.244",
                        help="server host passed to the environment")
    parser.add_argument("--port", type=int, default=25568,
                        help="server port passed to the environment")
    parser.add_argument("--timesteps", type=int, default=10000,
                        help="total_timesteps passed to model.learn")
    parser.add_argument("--max-steps", type=int, default=300,
                        help="max_steps passed to the environment")
    parser.add_argument("--save-freq", type=int, default=2000,
                        help="save_freq passed to the checkpoint callback")
    parser.add_argument("--verbose", action="store_true",
                        help="use render_mode='human' while training")
    parser.add_argument("--eval", action="store_true",
                        help="evaluate a saved model instead of training")
    parser.add_argument("--eval-episodes", type=int, default=5,
                        help="number of episodes to run with --eval")
    parser.add_argument("--model", default=str(default_model),
                        help="model file to load with --eval")
    args = parser.parse_args()

    if args.eval:
        evaluate(args)
    else:
        train(args)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user