da8f557219
GPU Scheduler (gpu.sethpc.xyz): - Live dashboard with 4 GPUs, training monitor, loss sparklines - Preset-based job scheduler with 3 triggers (time, finish_training, cost) - Model selection per GPU, pipeline configuration - Tool self-play and training pipeline types - Behind Google OAuth, live-refresh without page reload Tool Architecture (14 tools): - 3 new tools: world.nearby_entities, memory.read, memory.write - 7 script.* tools: write, validate, execute, read, list, delete, schedule - ScriptManager: full mcfunction datapack CRUD with RCON validation - Training data: 1,430 tool examples (up from 1,159) Plugin Deployment (paper-ai-25567): - WorldGuard 7.0.12, CoreProtect CE 23.1, EssentialsX 2.21.2, Vault 1.7.3 - Fresh greenfield world reset - 104 RCON-validated plugin training examples Event Dispatcher: - Watches server log for deaths, joins, advancements, PvP kills - Configurable trigger probability and cooldowns per event type - Deployed to dev server, fires god_system prompts on events - 21 event-response training examples Training Infrastructure: - train_lora.py: --save-steps 50, --resume from checkpoint - run_training.sh: stops Ollama, activates conda, restarts after - Passwordless sudo for ollama services on steel141 - Dev server added to MCSManager with autoStart Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1437 lines
58 KiB
Python
1437 lines
58 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Mortdecai GPU Scheduler — preset-based job scheduler with live GPU monitoring.
|
|
|
|
Features:
|
|
- GPU dashboard with live stats across the homelab
|
|
- Configuration presets (GPU assignments, model selection, pipeline type)
|
|
- Job scheduler with 3 trigger types: time, finish_training, cost
|
|
- Model management: load/unload Ollama models per GPU
|
|
- Training progress monitor with loss curves
|
|
|
|
Usage:
|
|
python3 gpu_scheduler.py --port 8098
|
|
|
|
Serve behind Caddy as gpu.sethpc.xyz with google_auth.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
import uuid
|
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
from pathlib import Path
|
|
from urllib.parse import parse_qs, urlparse
|
|
from datetime import datetime, timedelta
|
|
|
|
# HTTP port for the dashboard (the same value the module usage text passes to --port).
PORT = 8098

# Directory holding the persisted presets/jobs/schedule JSON files:
# <two levels above this file>/data/scheduler.
DATA_DIR = Path(__file__).resolve().parents[1] / "data" / "scheduler"
|
|
|
|
# ── GPU Inventory ──────────────────────────────────────────────────────────
|
|
|
|
# Static inventory of the GPUs this scheduler manages.
#
# Keys used by the code in this file:
#   id / name        - stable identifier and display name
#   vram_gb, vram_mb - nominal card memory
#   host             - SSH target passed to _ssh_cmd
#   gpu_index        - value given to `nvidia-smi --id=` on that host
#   ollama_port      - port of the Ollama instance serving the card
#   ollama_service   - systemd unit name of that Ollama instance
#   capabilities     - tags drawn from the same vocabulary as PIPELINE_TYPES[*]["gpu_req"]
#   location         - free-text label shown on the dashboard card
#   pct_id           - optional; when present _ssh_cmd wraps commands in `pct exec <id>`
#   ssh_extra        - optional extra ssh flags (whitespace separated)
#   ssh_pass         - optional password handed to sshpass
#
# NOTE(review): an SSH password lives in this literal; consider loading it from
# the environment or a secrets file instead of source control.
GPUS = [
    {
        "id": "3090ti",
        "name": "RTX 3090 Ti",
        "vram_gb": 24,
        "vram_mb": 24564,
        "host": "seth@192.168.0.141",
        "gpu_index": 1,
        "ollama_port": 11434,
        "ollama_service": "ollama.service",
        "capabilities": ["training", "inference", "self-play", "pipeline"],
        "location": "steel141",
    },
    {
        "id": "2080ti",
        "name": "RTX 2080 Ti",
        "vram_gb": 11,
        "vram_mb": 11264,
        "host": "seth@192.168.0.141",
        "gpu_index": 0,
        "ollama_port": 11435,
        "ollama_service": "ollama-gpu0.service",
        "capabilities": ["inference", "self-play", "pipeline", "generator"],
        "location": "steel141",
    },
    {
        "id": "rtx4000",
        "name": "Quadro RTX 4000",
        "vram_gb": 8,
        "vram_mb": 8192,
        "host": "pve197",
        "gpu_index": 0,
        "pct_id": 105,
        "ollama_port": 11434,
        "ollama_service": "ollama.service",
        "capabilities": ["inference", "self-play", "pipeline", "prod"],
        "location": "pve197 → CT 105",
    },
    {
        "id": "1660s",
        "name": "GTX 1660 Super",
        "vram_gb": 6,
        "vram_mb": 6144,
        "host": "root@192.168.0.235",
        "gpu_index": 0,
        "ollama_port": 11434,
        "ollama_service": "ollama.service",
        "capabilities": ["generator", "inference-small"],
        "location": "bedroom",
        "ssh_extra": "-o StrictHostKeyChecking=no",
        "ssh_pass": "REDACTED_PASSWORD",
    },
]

# Lookup table: GPU id -> the very same dict object stored in GPUS.
GPU_MAP = {entry["id"]: entry for entry in GPUS}
|
|
|
|
# ── Pipeline Definitions ──────────────────────────────────────────────────
|
|
|
|
# Catalogue of job kinds.  For every pipeline id:
#   label / description - human-readable text
#   gpu_req             - capability tags (see GPUS[*]["capabilities"])
#   params              - names of the parameters the pipeline accepts
#   defaults            - default value per parameter
# The ids here are the values _run_job_async dispatches on.
# NOTE(review): RCON passwords are embedded as defaults; consider sourcing them
# from configuration outside the repository.
PIPELINE_TYPES = {
    "training": {
        "label": "Training (QLoRA)",
        "description": "Fine-tune model via Unsloth QLoRA",
        "gpu_req": ["training"],
        "params": [
            "base_model", "dataset", "output_name", "epochs", "lr",
            "batch_size", "grad_accum", "max_seq_len", "save_steps",
        ],
        "defaults": {
            "base_model": "Qwen/Qwen3.5-9B",
            "dataset": "auto",
            "output_name": "mortdecai-0.5.0",
            "epochs": 1,
            "lr": 1e-4,
            "batch_size": 2,
            "grad_accum": 4,
            "max_seq_len": 2048,
            "save_steps": 50,
        },
    },
    "self_play": {
        "label": "Self-Play",
        "description": "Model generates edge cases and learns from failures",
        "gpu_req": ["inference"],
        "params": [
            "model", "tiers", "rounds_per_tier",
            "rcon_host", "rcon_port", "rcon_pass",
        ],
        "defaults": {
            "model": "mortdecai:0.4.0",
            "tiers": "1,2,3",
            "rounds_per_tier": 50,
            "rcon_host": "192.168.0.244",
            "rcon_port": 25578,
            "rcon_pass": "REDACTED_RCON",
        },
    },
    "prompt_pipeline": {
        "label": "Prompt Pipeline",
        "description": "Small model generates prompts, big models process + RCON validate",
        "gpu_req": ["generator", "inference"],
        "params": ["gen_model", "proc_model", "batch_size", "interval"],
        "defaults": {
            "gen_model": "qwen3.5:0.8b",
            "proc_model": "mortdecai:0.4.0",
            "batch_size": 30,
            "interval": 120,
        },
    },
    "bakeoff": {
        "label": "Bake-off",
        "description": "Compare model versions on standard test prompts",
        "gpu_req": ["inference"],
        "params": ["models", "test_set", "rcon_host"],
        "defaults": {
            "models": "mortdecai:0.4.0,mortdecai:0.5.0",
            "test_set": "standard",
            "rcon_host": "192.168.0.244",
        },
    },
    "export_gguf": {
        "label": "Export GGUF",
        "description": "Convert LoRA adapter to GGUF for Ollama",
        "gpu_req": ["training"],
        "params": ["adapter_path", "output_name", "quant"],
        "defaults": {
            "adapter_path": "training/checkpoints/mortdecai-0.5.0",
            "output_name": "mortdecai:0.5.0",
            "quant": "q4_k_m",
        },
    },
    "tool_self_play": {
        "label": "Tool Self-Play",
        "description": "Exercise all 14 tools on the dev server — scripts, memory, entities, wiki",
        "gpu_req": ["inference"],
        "params": [
            "model", "rounds", "categories",
            "rcon_host", "rcon_port", "rcon_pass",
        ],
        "defaults": {
            "model": "mortdecai:0.4.0",
            "rounds": 10,
            "categories": "all",
            "rcon_host": "192.168.0.112",
            "rcon_port": 25578,
            "rcon_pass": "REDACTED_RCON",
        },
    },
    "load_model": {
        "label": "Load Model",
        "description": "Load/switch Ollama model on a GPU",
        "gpu_req": ["inference"],
        "params": ["model"],
        "defaults": {"model": "mortdecai:0.4.0"},
    },
}
|
|
|
|
# ── State ──────────────────────────────────────────────────────────────────
|
|
|
|
# Taken by refresh_state (writer) and _render_page (reader) around _state.
_lock = threading.Lock()

# Latest monitoring snapshot: per-GPU stats keyed by GPU id, the parsed
# training-log status (or None), and the HH:MM:SS of the last refresh.
_state = dict(gpus={}, training=None, last_refresh=None)

_presets = dict()   # id -> preset dict
_jobs = list()      # list of job dicts
_schedule = list()  # list of scheduled trigger dicts

# Running totals compared against "cost" triggers.
# NOTE(review): nothing in the visible part of the file updates these — confirm
# where total_cost is accumulated.
_cost_tracker = dict(total_kwh=0.0, total_cost=0.0, electricity_rate=0.12)

# Glob (evaluated on the training host) matching training run logs, and the
# SSH target of the machine on which they are written.
TRAINING_LOG_PATTERN = "/home/seth/mc-ai-training/Minecraft-AI-model/training/train_run_*.log"
TRAINING_HOST = "seth@192.168.0.141"
|
|
|
|
|
|
# ── Persistence ────────────────────────────────────────────────────────────
|
|
|
|
def _ensure_data_dir():
    """Create DATA_DIR together with any missing parents; no-op if it exists."""
    os.makedirs(DATA_DIR, exist_ok=True)
|
|
|
|
def _write_json_atomic(filename, payload, **dump_kwargs):
    """Write *payload* as indented JSON to DATA_DIR/<filename> atomically.

    The data is first serialized to a scratch file in the same directory and
    then moved over the target with os.replace, so a crash or serialization
    error mid-write can never leave a truncated file behind for
    _load_persisted to choke on.

    Args:
        filename: Bare file name inside DATA_DIR (e.g. "jobs.json").
        payload: JSON-serializable object.
        **dump_kwargs: Extra keyword arguments forwarded to json.dump.

    Raises:
        TypeError / ValueError: if *payload* cannot be serialized.
        OSError: if the scratch file cannot be written or swapped in.
    """
    _ensure_data_dir()
    target = DATA_DIR / filename
    # Job threads call _save_jobs concurrently; a per-process, per-thread
    # scratch name keeps two writers from sharing one scratch file.
    scratch = DATA_DIR / f".{filename}.{os.getpid()}.{threading.get_ident()}.tmp"
    try:
        with open(scratch, "w") as f:
            json.dump(payload, f, indent=2, **dump_kwargs)
        os.replace(scratch, target)
    finally:
        # After a successful replace the scratch file is gone; this only
        # cleans up when serialization or the swap failed.
        try:
            os.unlink(scratch)
        except FileNotFoundError:
            pass


def _save_presets():
    """Persist the in-memory presets to DATA_DIR/presets.json."""
    _write_json_atomic("presets.json", _presets)


def _save_jobs():
    """Persist the job list to DATA_DIR/jobs.json (non-JSON values are stringified)."""
    _write_json_atomic("jobs.json", _jobs, default=str)


def _save_schedule():
    """Persist the scheduled triggers to DATA_DIR/schedule.json (non-JSON values are stringified)."""
    _write_json_atomic("schedule.json", _schedule, default=str)
|
|
|
|
def _load_persisted():
    """Reload presets, jobs and schedule from their JSON files in DATA_DIR.

    Each global is replaced only when its file exists; otherwise the current
    in-memory value is kept.  Files are read in the order presets, jobs,
    schedule, and a JSON or I/O error propagates to the caller.
    """
    global _presets, _jobs, _schedule
    _ensure_data_dir()

    absent = object()  # sentinel: distinguishes "no file" from a file holding null

    def read_json(filename):
        path = DATA_DIR / filename
        if not path.exists():
            return absent
        with open(path) as fh:
            return json.load(fh)

    loaded = read_json("presets.json")
    if loaded is not absent:
        _presets = loaded

    loaded = read_json("jobs.json")
    if loaded is not absent:
        _jobs = loaded

    loaded = read_json("schedule.json")
    if loaded is not absent:
        _schedule = loaded
|
|
|
|
|
|
# ── SSH Helpers ────────────────────────────────────────────────────────────
|
|
|
|
def _ssh_argv(gpu_or_host, cmd):
    """Build the local argv that runs *cmd* on the given GPU dict or host string."""
    import shlex  # function-scope import: the module's import block is left untouched

    if isinstance(gpu_or_host, dict):
        gpu = gpu_or_host
        host = gpu["host"]
        extra = (gpu.get("ssh_extra") or "").split()
        ssh_pass = gpu.get("ssh_pass")
        if "pct_id" in gpu:
            # Route the command through the Proxmox host into the container.
            # shlex.quote keeps commands that themselves contain single quotes
            # (curl -d '...', nohup bash -c '...') intact; naive '...' wrapping
            # would terminate the quoting early.
            cmd = f"pct exec {gpu['pct_id']} -- bash -c {shlex.quote(cmd)}"
    else:
        host = gpu_or_host
        extra = []
        ssh_pass = None

    if ssh_pass:
        # Password login via sshpass; BatchMode is omitted because it would
        # forbid password authentication.
        return ["sshpass", "-p", ssh_pass, "ssh", "-o", "ConnectTimeout=4", *extra, host, cmd]
    return ["ssh", "-o", "ConnectTimeout=4", "-o", "BatchMode=yes", *extra, host, cmd]


def _ssh_cmd(gpu_or_host, cmd, timeout=8):
    """Run a command over SSH. Accepts a GPU dict or host string.

    Args:
        gpu_or_host: Either an entry of GPUS (honouring its optional
            "ssh_extra", "ssh_pass" and "pct_id" keys) or a plain SSH target.
        cmd: Shell command line to execute remotely.
        timeout: Seconds to wait for the local ssh process.

    Returns:
        The command's stripped stdout when it exits with status 0, otherwise
        None (non-zero exit, timeout, ssh/sshpass missing, undecodable output).
    """
    argv = _ssh_argv(gpu_or_host, cmd)
    try:
        r = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best effort by design: callers treat None as "host unreachable".
        return None
    return r.stdout.strip() if r.returncode == 0 else None
|
|
|
|
|
|
def _ollama_api(gpu, endpoint, method="GET", data=None, timeout=None):
    """Call Ollama API on a GPU via SSH curl.

    Args:
        gpu: Entry of GPUS; its "ollama_port" selects the instance.
        endpoint: API path beginning with "/", e.g. "/api/ps".
        method: "GET" issues a plain request; any other value sends a POST.
        data: JSON-serializable POST body ("{}" is sent when falsy).
        timeout: Seconds allowed for the SSH round trip.  Defaults to 8 for
            GET (keeps the refresh loop snappy) and 120 for POST, because the
            POST issued by _exec_load_model blocks until the model has been
            loaded, which can take far longer than 8 seconds.

    Returns:
        The decoded JSON response, or None when the call failed or the reply
        was not valid JSON.
    """
    port = gpu["ollama_port"]
    url = f"http://localhost:{port}{endpoint}"
    if method == "GET":
        cmd = f"curl -s --connect-timeout 3 {url}"
    else:
        # Escape single quotes so the payload survives the '...' wrapping below.
        payload = json.dumps(data).replace("'", "'\\''") if data else "{}"
        cmd = f"curl -s --connect-timeout 3 -X POST {url} -d '{payload}'"

    if timeout is None:
        timeout = 8 if method == "GET" else 120

    raw = _ssh_cmd(gpu, cmd, timeout=timeout)
    if raw:
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            pass
    return None
|
|
|
|
|
|
# ── GPU Monitoring ─────────────────────────────────────────────────────────
|
|
|
|
def _fetch_gpu_stats(gpu):
    """Query one GPU's live metrics with nvidia-smi over SSH.

    Args:
        gpu: Entry of GPUS.

    Returns:
        A dict that always has "online", "id" and "name":
        * unreachable host / failed command -> {"online": False, ...}
        * parsed successfully -> utilization, temperature, power_watts,
          vram_used_mb, vram_total_mb, vram_pct, fan_speed (None when the
          card does not report one) plus vram_gb/location/capabilities
          copied from *gpu*
        * output that could not be parsed -> {"online": True, ..., "error": <raw output>}
    """
    query = f"nvidia-smi --id={gpu['gpu_index']} --query-gpu=utilization.gpu,temperature.gpu,power.draw,memory.used,memory.total,fan.speed --format=csv,noheader,nounits"
    raw = _ssh_cmd(gpu, query)
    if not raw:
        return {"online": False, "id": gpu["id"], "name": gpu["name"]}

    parts = [p.strip() for p in raw.split(",")]
    try:
        vram_used = int(parts[3])
        vram_total = int(parts[4])
        fan = parts[5]
        return {
            "online": True, "id": gpu["id"], "name": gpu["name"],
            "vram_gb": gpu["vram_gb"], "location": gpu["location"],
            "capabilities": gpu["capabilities"],
            "utilization": int(parts[0]), "temperature": int(parts[1]),
            "power_watts": float(parts[2]),
            "vram_used_mb": vram_used, "vram_total_mb": vram_total,
            "fan_speed": int(fan) if fan not in ("[N/A]", "[Not Supported]") else None,
            "vram_pct": round(vram_used / vram_total * 100, 1),
        }
    except (ValueError, IndexError, ZeroDivisionError):
        # ZeroDivisionError: a reported memory.total of 0 must degrade to the
        # "error" card instead of killing the fetch thread.
        return {"online": True, "id": gpu["id"], "name": gpu["name"], "error": raw}
|
|
|
|
|
|
def _fetch_ollama_info(gpu):
    """Get running + available models from Ollama.

    Queries /api/ps and then /api/tags on the GPU's Ollama instance; a failed
    call is treated as an empty response.

    Returns:
        {"running": [{"name", "size_gb", "vram_gb"}, ...], "available": [name, ...]}
        with sizes converted from bytes to decimal gigabytes (one decimal).
    """
    loaded = _ollama_api(gpu, "/api/ps") or {}
    installed = _ollama_api(gpu, "/api/tags") or {}

    def to_gb(byte_count):
        return round(byte_count / 1e9, 1)

    running = [
        {
            "name": entry.get("name", "?"),
            "size_gb": to_gb(entry.get("size", 0)),
            "vram_gb": to_gb(entry.get("size_vram", 0)),
        }
        for entry in loaded.get("models", [])
    ]
    available = [entry.get("name", "?") for entry in installed.get("models", [])]
    return {"running": running, "available": available}
|
|
|
|
|
|
def _fetch_training_status():
    """Parse the tail of the newest training log on TRAINING_HOST.

    Returns:
        None when no log matches TRAINING_LOG_PATTERN or it cannot be read.
        Otherwise a dict with:
          active        - True while a progress bar is present and the run has
                          neither finished nor failed
          loss_history  - list of floats parsed from the log tail
          pct, current_step, total_steps, elapsed, eta
                        - taken from the last progress bar, when one is present
          error         - "OOM" or "crashed", only on failure
          latest_loss   - last entry of loss_history, when non-empty
          learning_rate - last logged value (kept as a string), when present
    """
    # Find the most recently modified training log
    log_path = _ssh_cmd(TRAINING_HOST, f"ls -t {TRAINING_LOG_PATTERN} 2>/dev/null | head -1", timeout=5)
    if not log_path:
        return None
    raw = _ssh_cmd(TRAINING_HOST, f"tail -200 {log_path} 2>/dev/null", timeout=8)
    if not raw:
        return None

    status = {"active": False, "loss_history": []}

    # Progress bars look like " 42%|####   | 75/179 [12:34<17:20, 7.5s/it]".
    progress_matches = re.findall(r'(\d+)%\|[^|]*\|\s*(\d+)/(\d+)\s*\[([^\]]+)\]', raw)
    if progress_matches:
        pct, step, total, timing = progress_matches[-1]
        status["pct"] = int(pct)
        status["current_step"] = int(step)
        status["total_steps"] = int(total)
        eta_match = re.search(r'<([^,]+)', timing)
        elapsed_match = re.match(r'([^<]+)', timing)
        if eta_match:
            status["eta"] = eta_match.group(1).strip()
        if elapsed_match:
            status["elapsed"] = elapsed_match.group(1).strip()
        status["active"] = True

    # Without this, "active" stayed True for ever after a successful run, so
    # neither the monitor loop in _exec_training nor the finish_training
    # trigger (both require `not active`) could observe completion.
    # NOTE(review): assumes train_lora.py uses the Hugging Face Trainer, whose
    # end-of-run summary contains 'train_runtime' — confirm.  If the marker
    # never appears, behaviour is identical to before.
    if "train_runtime" in raw:
        status["active"] = False

    if "OutOfMemoryError" in raw:
        status["active"] = False
        status["error"] = "OOM"
    elif "Error" in raw.split("\n")[-1]:
        status["active"] = False
        status["error"] = "crashed"

    # Only quoted values are matched, e.g. 'loss': '1.2345'.
    for lm in re.findall(r"'loss':\s*'([^']+)'", raw):
        try:
            status["loss_history"].append(float(lm))
        except ValueError:
            pass
    if status["loss_history"]:
        status["latest_loss"] = status["loss_history"][-1]

    lr_matches = re.findall(r"'learning_rate':\s*'([^']+)'", raw)
    if lr_matches:
        status["learning_rate"] = lr_matches[-1]

    return status
|
|
|
|
|
|
def _fetch_processes(gpu):
    """List the compute processes nvidia-smi reports on one GPU.

    Returns:
        A list of {"pid", "name", "vram_mb"} dicts (all values are strings;
        "name" is the basename of the reported executable path).  Empty when
        the query fails or no process is listed.
    """
    query = f"nvidia-smi --id={gpu['gpu_index']} --query-compute-apps=pid,name,used_memory --format=csv,noheader,nounits 2>/dev/null"
    output = _ssh_cmd(gpu, query)
    if not output:
        return []

    rows = (
        [field.strip() for field in line.split(",")]
        for line in output.strip().split("\n")
        if line.strip()
    )
    # Rows with fewer than three CSV fields are ignored.
    return [
        {"pid": row[0], "name": row[1].split("/")[-1], "vram_mb": row[2]}
        for row in rows
        if len(row) >= 3
    ]
|
|
|
|
|
|
def refresh_state():
    """Poll every GPU in parallel and publish a fresh snapshot into _state.

    One thread per GPU gathers stats, Ollama models and compute processes and
    tags the GPU with the id of a running job that lists it.  Threads get up
    to 12 seconds each; a GPU whose thread has not finished by then is simply
    absent from this snapshot.  The training log is parsed as well.  All
    remote calls happen before _lock is taken, so page rendering is never
    blocked behind SSH.
    """
    new_gpus = {}
    threads = []

    def fetch_one(gpu):
        stats = _fetch_gpu_stats(gpu)
        stats["ollama"] = _fetch_ollama_info(gpu)
        stats["processes"] = _fetch_processes(gpu)
        # Check if any job is running on this GPU
        active_jobs = [j for j in _jobs if j.get("status") == "running" and gpu["id"] in j.get("gpus", [])]
        stats["active_job"] = active_jobs[0]["id"] if active_jobs else None
        new_gpus[gpu["id"]] = stats

    for gpu in GPUS:
        t = threading.Thread(target=fetch_one, args=(gpu,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join(timeout=12)

    # SSH round trips: deliberately done outside the lock.
    training = _fetch_training_status()
    # Snapshot, so a straggler thread finishing after the join timeout cannot
    # mutate the dict while _render_page is iterating over it.
    gpu_snapshot = dict(new_gpus)

    with _lock:
        _state["gpus"] = gpu_snapshot
        _state["training"] = training
        _state["last_refresh"] = time.strftime("%H:%M:%S")
|
|
|
|
|
|
def _bg_refresh_loop(interval=10):
    """Run forever: refresh the GPU snapshot, evaluate triggers, sleep, repeat.

    Args:
        interval: Seconds to sleep between cycles.

    Any exception raised during a cycle is printed and the loop carries on.
    """
    def run_cycle():
        refresh_state()
        _check_triggers()

    while True:
        try:
            run_cycle()
        except Exception as e:
            print(f"[scheduler] refresh error: {e}")
        time.sleep(interval)
|
|
|
|
|
|
# ── Job Execution ──────────────────────────────────────────────────────────
|
|
|
|
def _run_job_async(job):
    """Execute a job in a background thread.

    The worker marks the job "running", dispatches to the executor matching
    job["pipeline"], and finally records "completed" unless the executor (or
    an exception) already changed the status.  jobs.json is saved when the
    job starts and again when it finishes.

    Args:
        job: Job dict with at least "id", "pipeline", "params" and "gpus".

    Returns:
        The same job dict (mutated in place by the worker thread).
    """
    # NOTE(review): time.strftime without an explicit struct_time formats
    # local time, yet the pattern ends in "Z" — confirm whether UTC is intended.
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"

    def worker():
        job["status"] = "running"
        job["started_at"] = time.strftime(stamp_format)
        _save_jobs()
        print(f"[scheduler] starting job {job['id']}: {job['pipeline']}")

        try:
            kind, args, gpu_ids = job["pipeline"], job["params"], job["gpus"]

            # (pipeline id, launcher) pairs, compared with == in this order.
            launchers = (
                ("training", lambda: _exec_training(job, args)),
                ("self_play", lambda: _exec_self_play(job, args, gpu_ids)),
                ("prompt_pipeline", lambda: _exec_prompt_pipeline(job, args, gpu_ids)),
                ("load_model", lambda: _exec_load_model(job, args, gpu_ids)),
                ("export_gguf", lambda: _exec_export_gguf(job, args)),
                ("bakeoff", lambda: _exec_bakeoff(job, args, gpu_ids)),
                ("tool_self_play", lambda: _exec_tool_self_play(job, args, gpu_ids)),
            )
            for name, launch in launchers:
                if kind == name:
                    launch()
                    break
            else:
                job["error"] = f"unknown pipeline: {kind}"
                job["status"] = "failed"
        except Exception as e:
            job["error"] = str(e)
            job["status"] = "failed"
            print(f"[scheduler] job {job['id']} failed: {e}")

        # Executors signal failure/cancellation by changing the status; an
        # untouched "running" means success.
        if job["status"] == "running":
            job["status"] = "completed"
        job["finished_at"] = time.strftime(stamp_format)
        _save_jobs()
        print(f"[scheduler] job {job['id']} → {job['status']}")

    threading.Thread(target=worker, daemon=True).start()
    return job
|
|
|
|
|
|
def _exec_training(job, params):
    """Launch training on the 3090 Ti via SSH.

    Steps: cancel other running jobs that list "3090ti", stop both Ollama
    services on TRAINING_HOST, kill leftover compute processes on GPU 1,
    check that enough VRAM is free, start train_lora.py detached with its
    output tee'd to training/train_run_<output_name>.log, then poll
    _fetch_training_status every 30 seconds until the job leaves the
    "running" state.  The Ollama services are started again on every exit
    path, including exceptions.

    Args:
        job: Job dict; "status", "error", "log_path" and "progress" are
            updated in place.
        params: Training parameters (see PIPELINE_TYPES["training"]); a truthy
            "resume" adds --resume.

    Raises:
        ValueError / TypeError: if a numeric parameter cannot be converted.
    """
    import shlex  # function-scope import: the module's import block is left untouched

    min_free_mb = 18000  # need ~18GB free on the 24GB card
    free_vram_query = "nvidia-smi --id=1 --query-gpu=memory.free --format=csv,noheader,nounits"

    output_name = params.get('output_name', 'mortdecai-0.5.0')
    log_name = f"train_run_{output_name}.log"
    log_rel = f"training/{log_name}"
    checkpoint_dir = f"training/checkpoints/{output_name}"
    base_model = str(params.get('base_model', 'Qwen/Qwen3.5-9B'))
    lr = str(params.get('lr', 1e-4))

    # Build the training command with conda environment activation.  Every
    # free-form value goes through shlex.quote: the whole command is later
    # wrapped in `bash -c`, so hand-written '...' quoting would not nest.
    train_cmd = (
        "source /home/seth/miniconda3/etc/profile.d/conda.sh && "
        "conda activate mc-train && "
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        "TORCH_COMPILE_DISABLE=1 TORCHDYNAMO_DISABLE=1 CUDA_VISIBLE_DEVICES=0 "
        "python3 training/scripts/train_lora.py "
        f"--model {shlex.quote(base_model)} "
        f"--output {shlex.quote(checkpoint_dir)} "
        f"--lr {shlex.quote(lr)} "
        f"--epochs {int(params.get('epochs', 1))} "
        f"--batch-size {int(params.get('batch_size', 2))} "
        f"--grad-accum {int(params.get('grad_accum', 4))} "
        f"--max-seq-len {int(params.get('max_seq_len', 2048))} "
        f"--save-steps {int(params.get('save_steps', 50))}"
    )
    if params.get("resume"):
        train_cmd += " --resume"
    train_cmd += f" 2>&1 | tee {shlex.quote(log_rel)}"

    # Cancel any running jobs on the 3090 Ti to free VRAM
    for j in _jobs:
        if j.get("status") == "running" and "3090ti" in j.get("gpus", []) and j["id"] != job["id"]:
            j["status"] = "cancelled"
            print(f"[training] cancelled conflicting job {j['id']} on 3090ti")
    _save_jobs()

    def read_free_mb():
        """Return (answered, free_mb); free_mb is 0 when the reply is not an integer."""
        reply = _ssh_cmd(TRAINING_HOST, free_vram_query)
        if not reply:
            return False, 0
        try:
            return True, int(reply.strip())
        except ValueError:
            return True, 0

    try:
        # Stop both Ollama services so they release their VRAM
        _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama.service 2>/dev/null", timeout=10)
        _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama-gpu0.service 2>/dev/null", timeout=10)
        time.sleep(2)
        # Kill any lingering processes holding GPU 1 VRAM
        _ssh_cmd(TRAINING_HOST,
                 "for pid in $(nvidia-smi --id=1 --query-compute-apps=pid --format=csv,noheader,nounits 2>/dev/null); do kill $pid 2>/dev/null; done",
                 timeout=5)
        time.sleep(3)

        # Verify VRAM is free enough.  When the query itself yields nothing the
        # check is skipped and training is attempted anyway.
        answered, free_mb = read_free_mb()
        if answered:
            print(f"[training] 3090 Ti free VRAM: {free_mb}MB")
            if free_mb < min_free_mb:
                # Last resort: stop the services once more and re-measure
                _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama.service; sudo systemctl stop ollama-gpu0.service", timeout=10)
                time.sleep(5)
                _, free_mb = read_free_mb()
                if free_mb < min_free_mb:
                    job["status"] = "failed"
                    job["error"] = f"Not enough VRAM: {free_mb}MB free, need 18000MB"
                    return  # the finally clause restarts Ollama

        # Launch training detached from the SSH session
        nohup_cmd = f"nohup bash -c {shlex.quote(train_cmd)} > /dev/null 2>&1 &"
        _ssh_cmd(TRAINING_HOST, nohup_cmd, timeout=10)
        job["log_path"] = log_rel
        print(f"[training] launched, logging to {log_name}")

        # Monitor until done (or until another actor changes job["status"])
        while job["status"] == "running":
            time.sleep(30)
            status = _fetch_training_status()
            if status:
                job["progress"] = status
                if status.get("error"):
                    job["status"] = "failed"
                    job["error"] = status["error"]
                    break
                if not status.get("active") and status.get("current_step", 0) == status.get("total_steps", 0) and status.get("total_steps", 0) > 0:
                    job["status"] = "completed"
                    break
    finally:
        # Bring Ollama back whether training finished, failed or raised.
        _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama.service 2>/dev/null", timeout=10)
        _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama-gpu0.service 2>/dev/null", timeout=10)
|
|
|
|
|
|
def _exec_self_play(job, params, gpus):
    """Run self_play.py tier by tier, spreading the tiers over the assigned GPUs.

    Tiers are dealt round-robin to the known GPUs in *gpus*; each GPU works
    through its tiers sequentially in its own thread while the GPUs run in
    parallel.  A tier counts as finished when the tail of its log contains
    "Complete", "Error" or "Traceback", or after rounds*3 polls of 10 seconds.

    Args:
        job: Job dict; receives "gpu_assignments" and, on problems, "error"
            (or "status" = "failed" when no GPU id is known).
        params: See PIPELINE_TYPES["self_play"].
        gpus: GPU ids; unknown ids are ignored.
    """
    workers = [GPU_MAP[gid] for gid in gpus if gid in GPU_MAP]
    if not workers:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return

    model = params.get("model", "mortdecai:0.4.0")
    tiers = [name.strip() for name in params.get("tiers", "1,2,3").split(",")]
    rounds = int(params.get("rounds_per_tier", 50))
    rcon_host = params.get("rcon_host", "192.168.0.244")
    rcon_port = int(params.get("rcon_port", 25578))
    rcon_pass = params.get("rcon_pass", "REDACTED_RCON")
    script_path = "/home/seth/mc-ai-training/Minecraft-AI-model/training/scripts/self_play.py"

    # Deal the tiers out round-robin: gpu id -> list of tiers
    plan = {}
    for position, tier in enumerate(tiers):
        owner = workers[position % len(workers)]
        plan.setdefault(owner["id"], []).append(tier)
    job["gpu_assignments"] = dict(plan)

    errors = []  # appended to from the worker threads
    done_markers = ("Complete", "Error", "Traceback")
    failure_markers = ("Error", "Traceback")

    def run_on_gpu(gpu, assigned_tiers):
        port = gpu["ollama_port"]
        for tier in assigned_tiers:
            if job["status"] != "running":
                return
            log_file = f"/tmp/selfplay_{gpu['id']}_{tier}.log"
            cmd = (
                "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
                f"python3 {script_path} --tier {tier} --rounds {rounds} "
                f"--ollama-url http://localhost:{port} --model {model} "
                f"--rcon-host {rcon_host} --rcon-port {rcon_port} --rcon-pass {rcon_pass}"
            )
            _ssh_cmd(gpu, f"nohup bash -c '{cmd}' > {log_file} 2>&1 &", timeout=10)
            print(f"[self-play] {gpu['name']}: {tier} x{rounds} started")

            # Poll this tier's log until it reports an outcome
            polls_left = rounds * 3
            while polls_left > 0:
                polls_left -= 1
                time.sleep(10)
                tail = _ssh_cmd(gpu, f"tail -5 {log_file} 2>/dev/null")
                if tail and any(marker in tail for marker in done_markers):
                    if any(marker in tail for marker in failure_markers):
                        errors.append(f"{gpu['name']}/{tier}: {tail[-200:]}")
                    break
                if job["status"] != "running":
                    break

    # One worker thread per GPU, all running in parallel
    threads = []
    for gid, assigned_tiers in plan.items():
        worker = threading.Thread(target=run_on_gpu, args=(GPU_MAP[gid], assigned_tiers), daemon=True)
        worker.start()
        threads.append(worker)

    for worker in threads:
        worker.join()

    if errors:
        # Only the first three problems are reported
        job["error"] = "; ".join(errors[:3])
|
|
|
|
|
|
def _exec_prompt_pipeline(job, params, gpus):
    """Start prompt_pipeline.py on TRAINING_HOST (fire and forget).

    The first GPU id selects the generator's Ollama instance and the second
    one the processor's; with a single (or unknown second) id the generator
    GPU serves both roles.  Output goes to /tmp/pipeline.log on TRAINING_HOST.

    Args:
        job: Job dict; marked "failed" when the first GPU id is missing/unknown.
        params: Reads "gen_model", "proc_model" and "interval".
        gpus: GPU ids, generator first.
    """
    gen_gpu = GPU_MAP.get(gpus[0]) if len(gpus) > 0 else None
    proc_gpu = GPU_MAP.get(gpus[1]) if len(gpus) > 1 else gen_gpu
    if not gen_gpu:
        job["error"] = "no GPUs assigned"
        job["status"] = "failed"
        return
    proc_gpu = proc_gpu or gen_gpu

    def ollama_address(gpu):
        """Network address under which this GPU's Ollama instance is reachable."""
        if "pct_id" in gpu:
            # Ollama runs inside the container, not on the Proxmox host named
            # in gpu["host"]; same address _exec_tool_self_play uses.
            return "192.168.0.179"  # CT 105 external IP
        return gpu["host"].split("@")[-1]

    gen_url = f"http://{ollama_address(gen_gpu)}:{gen_gpu['ollama_port']}"
    proc_url = f"http://{ollama_address(proc_gpu)}:{proc_gpu['ollama_port']}"

    # NOTE(review): the "batch_size" preset parameter is not forwarded —
    # confirm whether prompt_pipeline.py accepts it.
    cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && "
           f"python3 training/scripts/prompt_pipeline.py --mode all "
           f"--gen-url {gen_url} "
           f"--gen-model {params.get('gen_model', 'qwen3.5:0.8b')} "
           f"--proc-urls {proc_url} "
           f"--proc-model {params.get('proc_model', 'mortdecai:0.4.0')} "
           f"--interval {params.get('interval', 120)}")

    _ssh_cmd(TRAINING_HOST, f"nohup bash -c '{cmd}' > /tmp/pipeline.log 2>&1 &", timeout=10)
|
|
|
|
|
|
def _exec_load_model(job, params, gpus):
    """Load a model on each listed GPU by sending Ollama a one-token generate request.

    Args:
        job: Job dict; "result" is set on success, "error" and
            "status" = "failed" on failure (later GPUs overwrite earlier
            messages).
        params: Reads "model".
        gpus: GPU ids; unknown ids are skipped.
    """
    for gpu in filter(None, map(GPU_MAP.get, gpus)):
        model = params.get("model", "mortdecai:0.4.0")
        request = {
            "model": model, "prompt": "test", "stream": False,
            "options": {"num_predict": 1},
        }
        reply = _ollama_api(gpu, "/api/generate", method="POST", data=request)
        loaded = bool(reply) and "error" not in reply
        if not loaded:
            job["error"] = f"Failed to load {model} on {gpu['name']}: {reply}"
            job["status"] = "failed"
            continue
        job["result"] = f"Loaded {model} on {gpu['name']}"
|
|
|
|
|
|
def _exec_export_gguf(job, params):
    """Start a GGUF export on TRAINING_HOST and watch its log for an outcome.

    The export is launched detached, logging to /tmp/export_gguf.log.  The log
    tail is polled every 15 seconds, at most 120 times, until it mentions
    "Saved" (success) or "Error"/"error" (failure).

    Args:
        job: Job dict; on failure "status" becomes "failed" and "error" holds
            the log tail.
        params: Reads "adapter_path" and "quant".
    """
    adapter = params.get("adapter_path", "training/checkpoints/mortdecai-0.5.0")
    quant = params.get("quant", "q4_k_m")
    export_cmd = (
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        f"python3 -m unsloth.save --model {adapter} --output_type gguf --quantization {quant}"
    )
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c '{export_cmd}' > /tmp/export_gguf.log 2>&1 &", timeout=10)

    # Monitor
    polls_left = 120
    while polls_left > 0:
        polls_left -= 1
        time.sleep(15)
        tail = _ssh_cmd(TRAINING_HOST, "tail -3 /tmp/export_gguf.log 2>/dev/null")
        if not tail:
            continue
        if "Error" in tail or "error" in tail:
            job["status"] = "failed"
            job["error"] = tail
            return
        if "Saved" in tail:
            return
|
|
|
|
|
|
def _exec_bakeoff(job, params, gpus):
    """Start bakeoff.py on TRAINING_HOST (fire and forget).

    The first GPU id only has to be a known one; the script is always started
    on TRAINING_HOST, with output in /tmp/bakeoff.log.

    Args:
        job: Job dict; marked "failed" when no known GPU is given.
        params: Reads "models" (comma-separated model names).
        gpus: GPU ids.
    """
    have_gpu = bool(gpus) and bool(GPU_MAP.get(gpus[0]))
    if not have_gpu:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return

    models = params.get("models", "mortdecai:0.4.0")
    bakeoff_cmd = (
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        f"python3 training/scripts/bakeoff.py --models {models}"
    )
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c '{bakeoff_cmd}' > /tmp/bakeoff.log 2>&1 &", timeout=10)
|
|
|
|
|
|
def _exec_tool_self_play(job, params, gpus):
    """Run tool-focused self-play on the dev server via the assigned GPU's Ollama.

    tool_self_play.py is started on TRAINING_HOST and pointed at the Ollama
    instance of the first GPU in *gpus*.  Its log is polled every 15 seconds
    until the tail shows "Complete" or "Traceback", the job leaves the
    "running" state, or the poll budget is used up.

    Args:
        job: Job dict; on a traceback "error" gets the last 300 characters of
            the log tail and "status" becomes "failed".
        params: See PIPELINE_TYPES["tool_self_play"].
        gpus: GPU ids; only the first one is used.
    """
    gpu = GPU_MAP.get(gpus[0]) if gpus else None
    if not gpu:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return

    # Strip an optional "user@" prefix from the SSH target
    host_ip = gpu["host"].split("@")[-1]
    if "pct_id" in gpu:
        # For pct-based GPUs, use the CT's external IP
        host_ip = "192.168.0.179"  # CT 105 external IP
    port = gpu["ollama_port"]

    model = params.get("model", "mortdecai:0.4.0")
    rounds = int(params.get("rounds", 10))
    categories = params.get("categories", "all")
    rcon_host = params.get("rcon_host", "192.168.0.112")
    rcon_port = int(params.get("rcon_port", 25578))
    rcon_pass = params.get("rcon_pass", "REDACTED_RCON")

    script_path = "/home/seth/mc-ai-training/Minecraft-AI-model/training/scripts/tool_self_play.py"
    log_file = f"/tmp/tool_selfplay_{gpu['id']}.log"

    launch = (
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        f"python3 {script_path} "
        f"--ollama-url http://{host_ip}:{port} --model {model} "
        f"--rcon-host {rcon_host} --rcon-port {rcon_port} --rcon-pass {rcon_pass} "
        f"--rounds {rounds} --categories {categories}"
    )
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c '{launch}' > {log_file} 2>&1 &", timeout=10)
    print(f"[tool-self-play] launched on {gpu['name']}, logging to {log_file}")

    # Monitor until done.
    # NOTE(review): the poll budget scales with len(PIPELINE_TYPES); presumably
    # a stand-in for the number of tool categories — confirm.
    polls_left = rounds * len(PIPELINE_TYPES) * 3
    while polls_left > 0:
        polls_left -= 1
        time.sleep(15)
        tail = _ssh_cmd(TRAINING_HOST, f"tail -5 {log_file} 2>/dev/null")
        crashed = bool(tail) and "Traceback" in tail
        if crashed:
            job["error"] = tail[-300:]
            job["status"] = "failed"
            return
        if tail and "Complete" in tail:
            return
        if job["status"] != "running":
            return
|
|
|
|
|
|
# ── Trigger Engine ─────────────────────────────────────────────────────────
|
|
|
|
def _moment_reached(moment, now):
    """Return True once *moment* has passed, for naive and tz-aware datetimes alike."""
    if moment.tzinfo is not None:
        # Comparing an aware datetime with the naive *now* raises TypeError,
        # so use an aware "now" in the moment's own zone.
        return datetime.now(moment.tzinfo) >= moment
    return now >= moment


def _trigger_due(sched, now):
    """Decide whether one pending schedule entry should fire at *now*."""
    trigger = sched["trigger"]
    kind = trigger["type"]

    if kind == "time":
        due = False
        # Absolute form: fire at/after the ISO timestamp in "at".
        target_str = trigger.get("at")
        if target_str:
            try:
                due = _moment_reached(datetime.fromisoformat(target_str), now)
            except (ValueError, TypeError):
                pass  # unparsable timestamp: this form never fires
        # Relative form: fire "duration_seconds" after the entry was created.
        duration_s = trigger.get("duration_seconds")
        created_str = sched.get("created_at")
        if duration_s and created_str:
            try:
                deadline = datetime.fromisoformat(created_str) + timedelta(seconds=int(duration_s))
                due = due or _moment_reached(deadline, now)
            except (ValueError, TypeError):
                pass
        return due

    if kind == "finish_training":
        training = _state.get("training")
        if not training:
            return False
        total = training.get("total_steps", 0)
        current = training.get("current_step", 0)
        return total > 0 and current >= total and not training.get("active")

    if kind == "cost":
        threshold = float(trigger.get("threshold_usd", 999))
        return _cost_tracker["total_cost"] >= threshold

    return False  # unknown trigger types never fire


def _check_triggers():
    """Evaluate all scheduled triggers.

    Every entry of _schedule whose status is "pending" is tested; one that is
    due is marked "fired", persisted, and its preset is launched.  Each entry
    is handled in isolation: a malformed entry (missing keys, non-numeric
    threshold, ...) is reported and skipped, so it cannot stop the entries
    after it from being evaluated.
    """
    now = datetime.now()
    for sched in _schedule:
        if sched.get("status") != "pending":
            continue
        try:
            if not _trigger_due(sched, now):
                continue
            # Mark as fired before launching so the preset starts at most once.
            sched["status"] = "fired"
            sched["fired_at"] = now.isoformat()
            _save_schedule()
            print(f"[scheduler] trigger fired: {sched['id']} → launching preset {sched['preset_id']}")
            _launch_preset(sched["preset_id"])
        except Exception as exc:
            print(f"[scheduler] trigger {sched.get('id', '?')} skipped: {exc!r}")
|
|
|
|
|
|
def _launch_preset(preset_id):
    """Create and start a job from a preset.

    Args:
        preset_id: Key into _presets.

    Returns:
        The new job dict (already appended to _jobs, saved and handed to
        _run_job_async), or None when the preset does not exist.
    """
    preset = _presets.get(preset_id)
    if not preset:
        print(f"[scheduler] preset {preset_id} not found")
        return None

    # The job references the preset's params/gpus objects; it does not copy them.
    job = dict(
        id=str(uuid.uuid4())[:8],  # short random identifier
        preset_id=preset_id,
        preset_name=preset.get("name", "?"),
        pipeline=preset["pipeline"],
        params=preset.get("params", {}),
        gpus=preset.get("gpus", []),
        status="queued",
        created_at=time.strftime("%Y-%m-%dT%H:%M:%SZ"),
    )
    _jobs.append(job)
    _save_jobs()
    _run_job_async(job)
    return job
|
|
|
|
|
|
# ── HTML Rendering ─────────────────────────────────────────────────────────
|
|
|
|
def _render_page():
    """Render the complete dashboard page as an HTML string.

    A shallow copy of _state is taken under _lock; the training card, one
    card per entry of GPUS (offline placeholder when no data is available),
    the schedule and jobs panels and the presets side panel are then
    assembled around the CSS and JS module constants.
    """
    with _lock:
        state = dict(_state)

    gpu_cards = "".join(
        _gpu_card_html(
            state["gpus"].get(gpu["id"], {"online": False, "id": gpu["id"], "name": gpu["name"]})
        )
        for gpu in GPUS
    )

    training_html = _training_card_html(state.get("training"))
    presets_list_html, presets_form_html = _presets_panel_html()
    schedule_html = _schedule_panel_html()
    jobs_html = _jobs_panel_html()
    # The key always exists (initialised to None), so a .get() default would
    # never apply; `or` makes the header read "never" before the first refresh.
    last_refresh = state.get("last_refresh") or "never"
    online_count = len([g for g in state["gpus"].values() if g.get("online")])

    return f"""<!DOCTYPE html>
<html><head>
<title>Mortdecai GPU Scheduler</title>
<meta charset="utf-8">
<style>
{CSS}
</style>
</head>
<body>

<header>
<h1>Mortdecai GPU Scheduler</h1>
<div class="subtitle"><span id="refresh-time">{online_count}/{len(GPUS)} GPUs online — refreshed {last_refresh}</span>
<button class="btn btn-sm" onclick="api('refresh')">Refresh</button>
</div>
</header>

<div class="layout">
<div class="main-col">
<div id="train-section">{training_html}</div>
<h2>GPUs</h2>
<div class="grid" id="gpu-grid">{gpu_cards}</div>
<div id="schedule-section">{schedule_html}</div>
<div id="jobs-section">{jobs_html}</div>
</div>
<div class="side-col">
<div class="panel">
<h3>Presets</h3>
<div id="presets-list">{presets_list_html}</div>
{presets_form_html}
</div>
</div>
</div>

<script>
{JS}
</script>
</body></html>"""
|
|
|
|
|
|
def _gpu_card_html(d):
    """Render one GPU's dashboard card.

    Args:
        d: Stats dict as stored in _state["gpus"] (see _fetch_gpu_stats and
            refresh_state).  Missing metrics are shown as 0; a dict without a
            truthy "online" produces the OFFLINE placeholder.

    Returns:
        The card's HTML as a string.
    """
    if not d.get("online"):
        return f"""<div class="card offline"><div class="card-header"><b>{d.get('name','?')}</b><span class="bad">OFFLINE</span></div></div>"""

    util = d.get("utilization", 0)
    temp = d.get("temperature", 0)
    vram_pct = d.get("vram_pct", 0)
    vram_used = d.get("vram_used_mb", 0)
    vram_total = d.get("vram_total_mb", 0)
    power = d.get("power_watts", 0)

    # CSS classes used to colour the readings
    if temp > 80:
        tc = "bad"
    elif temp > 70:
        tc = "warn"
    else:
        tc = "ok"

    if util > 50:
        uc = "ok"
    elif util > 10:
        uc = "warn"
    else:
        uc = "dim"

    if vram_pct > 90:
        vc = "bad"
    elif vram_pct > 70:
        vc = "warn"
    else:
        vc = "ok"

    activity = 'ACTIVE' if util > 10 else 'IDLE'

    ollama = d.get("ollama", {})
    running = ollama.get("running", [])
    avail = ollama.get("available", [])

    if running:
        model_tags = " ".join(f'<span class="tag">{m["name"]}</span>' for m in running)
    else:
        model_tags = '<span class="tag dim">idle</span>'

    # Drop-down with the installed models plus a Load button; omitted entirely
    # when Ollama reported no models.
    model_select = ""
    if avail:
        avail_options = "".join(f'<option value="{m}">{m}</option>' for m in avail if m)
        model_select = f"""<select class="model-select" id="ms-{d['id']}">{avail_options}</select>
<button class="btn btn-xs" onclick="loadModel('{d['id']}')">Load</button>"""

    active_job = d.get("active_job")
    job_badge = ""
    if active_job:
        job_badge = f'<span class="tag accent">job {active_job}</span>'

    caps = " ".join(f'<span class="cap">{c}</span>' for c in d.get("capabilities", []))

    return f"""<div class="card" id="gpu-{d['id']}">
<div class="card-header"><b>{d['name']}</b><span class="{uc}">{activity}</span></div>
<div class="card-sub">{d.get('location','')} {job_badge}</div>
<div class="bar-row"><span class="bar-label">GPU</span><div class="bar"><div class="bar-fill" style="width:{util}%">{util}%</div></div></div>
<div class="bar-row"><span class="bar-label">VRAM</span><div class="bar"><div class="bar-fill {vc}" style="width:{vram_pct}%">{vram_used}/{vram_total}MB</div></div></div>
<div class="stats"><span class="{tc}">{temp}C</span> <span>{power:.0f}W</span></div>
<div class="models">{model_tags}</div>
<div class="model-ctrl">{model_select}</div>
<div class="caps">{caps}</div>
</div>"""
|
|
|
|
|
|
def _training_card_html(t):
|
|
if not t:
|
|
return '<div class="card"><div class="card-header"><b>Training</b><span class="dim">no log</span></div></div>'
|
|
|
|
pct = t.get("pct", 0)
|
|
step = t.get("current_step", 0)
|
|
total = t.get("total_steps", 0)
|
|
error = t.get("error")
|
|
active = t.get("active", False)
|
|
loss = t.get("latest_loss")
|
|
lr = t.get("learning_rate", "?")
|
|
eta = t.get("eta", "?")
|
|
elapsed = t.get("elapsed", "?")
|
|
|
|
if error:
|
|
status = f'<span class="bad">CRASHED ({error})</span>'
|
|
elif active:
|
|
status = '<span class="ok">TRAINING</span>'
|
|
else:
|
|
status = '<span class="warn">STOPPED</span>'
|
|
|
|
# Sparkline
|
|
lh = t.get("loss_history", [])
|
|
spark = ""
|
|
if lh:
|
|
recent = lh[-40:]
|
|
mx, mn = max(recent), min(recent)
|
|
rng = mx - mn if mx != mn else 1
|
|
w, h = 400, 70
|
|
pts = " ".join(f"{i/(max(len(recent)-1,1))*w:.0f},{h-((v-mn)/rng*h):.0f}" for i, v in enumerate(recent))
|
|
spark = f"""<svg width="{w}" height="{h}" class="spark"><polyline points="{pts}" fill="none" stroke="#D35400" stroke-width="1.5"/>
|
|
<text x="0" y="10" fill="#666" font-size="9">{mx:.4f}</text><text x="0" y="{h}" fill="#666" font-size="9">{mn:.4f}</text></svg>"""
|
|
|
|
return f"""<div class="card train-card">
|
|
<div class="card-header"><b>Training</b>{status}</div>
|
|
<div class="progress"><div class="progress-fill" style="width:{pct}%">{step}/{total} ({pct}%)</div></div>
|
|
<div class="stats">
|
|
<span>Elapsed: {elapsed}</span> <span>ETA: {eta}</span>
|
|
<span>Loss: <b class="accent">{f'{loss:.4f}' if loss else '?'}</b></span> <span>LR: {lr}</span>
|
|
</div>
|
|
{spark}
|
|
</div>"""
|
|
|
|
|
|
def _presets_list_html():
    """Render just the preset rows — refreshable without touching the form.

    Reads the module-level ``_presets`` mapping (preset id -> preset dict) and
    emits one row per preset, sorted by preset name, with Run / Schedule / Del
    buttons wired to the page's JavaScript handlers.

    Returns:
        str: HTML for the rows, or a placeholder line when there are none.
    """
    from html import escape

    rows = []
    for pid, p in sorted(_presets.items(), key=lambda x: x[1].get("name", "")):
        # Preset fields arrive via the create_preset API, so they are escaped
        # before being placed into markup. Missing keys fall back to "?"
        # instead of raising, matching the tolerant sort key above.
        name = escape(str(p.get("name", "?")))
        pipeline = escape(str(p.get("pipeline", "?")))
        gpus = escape(", ".join(str(g) for g in p.get("gpus") or []))
        safe_pid = escape(str(pid))
        rows.append(f"""<div class="preset-row">
<div class="preset-name">{name}</div>
<div class="preset-info">{pipeline} — {gpus}</div>
<div class="preset-actions">
<button class="btn btn-xs" onclick="launchPreset('{safe_pid}')">Run</button>
<button class="btn btn-xs" onclick="schedulePreset('{safe_pid}')">Schedule</button>
<button class="btn btn-xs btn-danger" onclick="deletePreset('{safe_pid}')">Del</button>
</div></div>""")

    if not rows:
        return '<div class="dim" style="padding:0.5rem">No presets yet. Create one below.</div>'
    return "".join(rows)
|
|
|
|
|
|
def _presets_panel_html():
    """Build the presets panel as a ``(list_html, form_html)`` pair.

    ``list_html`` is whatever ``_presets_list_html()`` returns, so it can be
    re-rendered on its own. ``form_html`` is the "New Preset" form, with one
    pipeline ``<option>`` per ``PIPELINE_TYPES`` entry and one checkbox per
    entry in ``GPUS``.
    """
    preset_rows = _presets_list_html()

    option_tags = []
    for key, spec in PIPELINE_TYPES.items():
        option_tags.append(f'<option value="{key}">{spec["label"]}</option>')
    pipeline_options = "".join(option_tags)

    checkbox_tags = []
    for gpu in GPUS:
        checkbox_tags.append(
            f'<label class="gpu-check"><input type="checkbox" name="gpus" value="{gpu["id"]}"> {gpu["name"]}</label>'
        )
    gpu_checkboxes = "".join(checkbox_tags)

    new_preset_form = f"""<div class="create-form">
<h3 style="margin-top:0.8rem">New Preset</h3>
<form id="preset-form" onsubmit="return createPreset(event)">
<label>Name<input name="name" required placeholder="overnight-selfplay"></label>
<label>Pipeline<select name="pipeline" onchange="updateParamFields(this.value)">{pipeline_options}</select></label>
<div class="gpu-select"><label>GPUs</label>{gpu_checkboxes}</div>
<div id="param-fields"></div>
<button type="submit" class="btn">Save Preset</button>
</form>
</div>"""

    return preset_rows, new_preset_form
|
|
|
|
|
|
def _schedule_panel_html():
    """Render the "Scheduled Triggers" card.

    Shows the 10 most recently created entries of the module-level
    ``_schedule`` list (ordered by ``created_at``, newest first). Each row
    carries the entry's status, the preset name (looked up in ``_presets``,
    falling back to the raw preset id), a short trigger description and, for
    pending entries only, a Cancel button. The header shows the number of
    pending entries across the whole list.

    Returns:
        str: HTML for the card.
    """
    from html import escape

    rows = []
    for s in sorted(_schedule, key=lambda x: x.get("created_at", ""), reverse=True)[:10]:
        preset_name = _presets.get(s.get("preset_id", ""), {}).get("name", s.get("preset_id", "?"))
        # Triggers come straight from the create_schedule request body, so a
        # null trigger or a non-numeric duration must not break the panel.
        trigger = s.get("trigger") or {}
        ttype = trigger.get("type", "?")

        if ttype == "time":
            if trigger.get("at"):
                trigger_desc = f"at {trigger['at']}"
            else:
                try:
                    secs = int(trigger.get("duration_seconds") or 0)
                except (TypeError, ValueError):
                    secs = 0
                trigger_desc = f"after {secs // 3600}h{(secs % 3600) // 60}m"
        elif ttype == "finish_training":
            trigger_desc = "when training completes"
        elif ttype == "cost":
            trigger_desc = f"at ${trigger.get('threshold_usd', '?')}"
        else:
            trigger_desc = ttype

        st = s.get("status", "?")
        st_class = "ok" if st == "fired" else "warn" if st == "pending" else "dim"

        sid = escape(str(s["id"]))
        cancel_btn = (
            f"""<button class="btn btn-xs btn-danger" onclick="cancelSchedule('{sid}')">Cancel</button>"""
            if st == "pending"
            else ""
        )
        # Request-derived text is escaped before it is placed into markup.
        rows.append(f"""<div class="sched-row">
<span class="{st_class}">{escape(str(st))}</span>
<span>{escape(str(preset_name))}</span>
<span class="dim">{escape(str(trigger_desc))}</span>
{cancel_btn}
</div>""")

    body = "".join(rows) or '<div class="dim" style="padding:0.5rem">No scheduled triggers.</div>'
    pending = sum(1 for s in _schedule if s.get("status") == "pending")

    return f"""<div class="card">
<div class="card-header"><b>Scheduled Triggers</b><span class="dim">{pending} pending</span></div>
{body}
</div>"""
|
|
|
|
|
|
def _jobs_panel_html():
    """Render the "Jobs" card.

    Shows the 15 most recently created entries of the module-level ``_jobs``
    list (ordered by ``created_at``, newest first). Each row carries the
    job's status, its preset name (falling back to the pipeline name), GPUs,
    creation time truncated to 16 characters, an optional error message and,
    for running jobs only, a Stop button. The header shows the number of
    running jobs across the whole list.

    Returns:
        str: HTML for the card.
    """
    from html import escape

    recent = sorted(_jobs, key=lambda j: j.get("created_at", ""), reverse=True)[:15]
    rows = []
    for j in recent:
        st = j.get("status", "?")
        st_class = "ok" if st == "completed" else "bad" if st == "failed" else "warn" if st == "running" else "dim"
        gpus = ", ".join(str(g) for g in j.get("gpus") or [])
        name = j.get("preset_name", j.get("pipeline", "?"))
        # Error text is free-form (it may contain "<", ">" or "&"), so it is
        # escaped rather than injected raw into the row.
        err = f' <span class="bad">({escape(str(j["error"]))})</span>' if j.get("error") else ""

        jid = escape(str(j["id"]))
        stop_btn = (
            f"""<button class="btn btn-xs btn-danger" onclick="cancelJob('{jid}')">Stop</button>"""
            if st == "running"
            else ""
        )
        # A stored null timestamp must not crash the slice.
        created = str(j.get("created_at") or "")[:16]
        rows.append(f"""<div class="job-row">
<span class="{st_class}">{escape(str(st))}</span>
<span>{escape(str(name))}</span>
<span class="dim">{escape(gpus)}</span>
<span class="dim">{escape(created)}</span>
{err}
{stop_btn}
</div>""")

    body = "".join(rows) or '<div class="dim" style="padding:0.5rem">No jobs yet.</div>'
    running = sum(1 for j in _jobs if j.get("status") == "running")

    return f"""<div class="card">
<div class="card-header"><b>Jobs</b><span class="dim">{running} running</span></div>
{body}
</div>"""
|
|
|
|
|
|
# ── CSS ────────────────────────────────────────────────────────────────────
|
|
|
|
CSS = """
|
|
*{box-sizing:border-box;margin:0;padding:0}
|
|
body{font-family:'Courier New',monospace;background:#111;color:#e0e0e0;padding:1rem 1.5rem}
|
|
header{margin-bottom:1.2rem}
|
|
h1{color:#D35400;font-size:1.5rem;margin-bottom:0.2rem}
|
|
h2{color:#D35400;font-size:1.1rem;margin:1rem 0 0.6rem}
|
|
h3{color:#D35400;font-size:1rem;margin-bottom:0.6rem}
|
|
.subtitle{color:#666;font-size:0.8rem}
|
|
.accent{color:#D35400}
|
|
.ok{color:#4caf50} .warn{color:#ff9800} .bad{color:#f44336} .dim{color:#555}
|
|
|
|
.layout{display:grid;grid-template-columns:1fr 340px;gap:1.2rem}
|
|
@media(max-width:900px){.layout{grid-template-columns:1fr}}
|
|
.main-col{min-width:0}
|
|
.side-col{display:flex;flex-direction:column;gap:1rem}
|
|
|
|
.grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:0.8rem}
|
|
|
|
.card{background:#1a1a1a;border:1px solid #2a2a2a;border-radius:6px;padding:0.8rem;margin-bottom:0.8rem;transition:border-color 0.2s}
|
|
.card:hover{border-color:#D35400}
|
|
.card.offline{opacity:0.4;border-color:#f44336}
|
|
.card-header{display:flex;justify-content:space-between;align-items:center;margin-bottom:0.3rem;font-size:0.95rem}
|
|
.card-sub{color:#555;font-size:0.75rem;margin-bottom:0.5rem}
|
|
|
|
.bar-row{display:flex;align-items:center;margin:0.25rem 0;gap:0.4rem}
|
|
.bar-label{width:36px;color:#777;font-size:0.75rem}
|
|
.bar{flex:1;background:#222;border-radius:3px;height:20px;overflow:hidden}
|
|
.bar-fill{height:100%;border-radius:3px;background:#D35400;display:flex;align-items:center;padding-left:5px;font-size:0.7rem;color:#fff;min-width:fit-content;transition:width 0.5s}
|
|
.bar-fill.ok{background:#4caf50} .bar-fill.warn{background:#ff9800} .bar-fill.bad{background:#f44336}
|
|
|
|
.stats{display:flex;gap:0.8rem;font-size:0.8rem;margin:0.4rem 0;flex-wrap:wrap;color:#999}
|
|
.models{margin-top:0.4rem}
|
|
.model-ctrl{margin-top:0.3rem;display:flex;gap:0.3rem;align-items:center}
|
|
.model-select{background:#222;color:#ccc;border:1px solid #333;border-radius:3px;padding:2px 4px;font-size:0.75rem;font-family:monospace}
|
|
.tag{display:inline-block;background:#222;border:1px solid #333;border-radius:3px;padding:1px 6px;font-size:0.7rem;margin:1px}
|
|
.tag.accent{border-color:#D35400;color:#D35400}
|
|
.tag.dim{color:#444;border-color:#222}
|
|
.caps{margin-top:0.3rem;display:flex;gap:3px;flex-wrap:wrap}
|
|
.cap{font-size:0.65rem;color:#555;background:#1e1e1e;border-radius:2px;padding:1px 4px}
|
|
|
|
.train-card .progress{background:#222;border-radius:3px;height:26px;margin:0.4rem 0;overflow:hidden}
|
|
.train-card .progress-fill{height:100%;background:linear-gradient(90deg,#D35400,#e67e22);border-radius:3px;transition:width 1s;display:flex;align-items:center;justify-content:center;font-size:0.8rem;font-weight:bold;color:#fff}
|
|
.spark{display:block;margin-top:0.5rem;background:#1a1a1a;border:1px solid #222;border-radius:3px}
|
|
|
|
.panel{background:#1a1a1a;border:1px solid #2a2a2a;border-radius:6px;padding:0.8rem}
|
|
|
|
.preset-row{display:flex;flex-wrap:wrap;align-items:center;gap:0.4rem;padding:0.4rem 0;border-bottom:1px solid #222;font-size:0.8rem}
|
|
.preset-name{font-weight:bold;color:#e0e0e0;flex:1}
|
|
.preset-info{color:#777;font-size:0.75rem;width:100%}
|
|
.preset-actions{display:flex;gap:0.3rem}
|
|
|
|
.sched-row{display:flex;align-items:center;gap:0.5rem;padding:0.3rem 0;border-bottom:1px solid #222;font-size:0.8rem}
|
|
.job-row{display:flex;align-items:center;gap:0.5rem;padding:0.3rem 0;border-bottom:1px solid #1e1e1e;font-size:0.8rem}
|
|
|
|
.btn{background:#222;border:1px solid #D35400;color:#D35400;padding:5px 12px;border-radius:3px;cursor:pointer;font-family:monospace;font-size:0.8rem;transition:background 0.15s}
|
|
.btn:hover{background:#D35400;color:#fff}
|
|
.btn-sm{padding:3px 8px;font-size:0.75rem}
|
|
.btn-xs{padding:2px 6px;font-size:0.7rem}
|
|
.btn-danger{border-color:#f44336;color:#f44336}
|
|
.btn-danger:hover{background:#f44336;color:#fff}
|
|
|
|
.create-form{margin-top:0.5rem}
|
|
.create-form form{display:flex;flex-direction:column;gap:0.5rem;margin-top:0.6rem}
|
|
.create-form label{display:flex;flex-direction:column;font-size:0.8rem;color:#999;gap:0.2rem}
|
|
.create-form input,.create-form select{background:#222;color:#e0e0e0;border:1px solid #333;border-radius:3px;padding:4px 6px;font-family:monospace;font-size:0.8rem}
|
|
.gpu-select{display:flex;flex-wrap:wrap;gap:0.3rem;font-size:0.8rem;color:#999}
|
|
.gpu-check{display:flex;align-items:center;gap:0.2rem;font-size:0.75rem}
|
|
.gpu-check input{accent-color:#D35400}
|
|
|
|
.modal-overlay{display:none;position:fixed;top:0;left:0;width:100%;height:100%;background:rgba(0,0,0,0.7);z-index:100;justify-content:center;align-items:center}
|
|
.modal-overlay.active{display:flex}
|
|
.modal{background:#1a1a1a;border:1px solid #D35400;border-radius:8px;padding:1.2rem;width:400px;max-width:90vw}
|
|
.modal h3{margin-bottom:0.8rem}
|
|
.modal label{display:flex;flex-direction:column;font-size:0.8rem;color:#999;gap:0.2rem;margin-bottom:0.4rem}
|
|
.modal input,.modal select{background:#222;color:#e0e0e0;border:1px solid #333;border-radius:3px;padding:4px 6px;font-family:monospace;font-size:0.8rem}
|
|
.modal .btn-row{display:flex;gap:0.5rem;margin-top:0.8rem;justify-content:flex-end}
|
|
"""
|
|
|
|
# ── JS ─────────────────────────────────────────────────────────────────────
|
|
|
|
# Pipeline metadata (params, defaults, label) serialised once at import time so
# the browser can build the per-pipeline parameter inputs without a round trip.
PIPELINE_TYPES_JSON = json.dumps({
    k: {"params": v["params"], "defaults": v["defaults"], "label": v["label"]}
    for k, v in PIPELINE_TYPES.items()
})

# Client-side script for the dashboard. This is an f-string: literal JavaScript
# braces are doubled ({{ }}) and the only interpolation is PIPELINE_TYPES_JSON.
#   api()               POSTs {action, ...data} as JSON to /api/action.
#   updateParamFields() rebuilds the preset form's parameter inputs.
#   submitSchedule()    builds a trigger from the modal form, creates the
#                       schedule, then closes the modal and refreshes in place.
#   liveRefresh()       swaps HTML fragments from /api/fragments into the page
#                       every 10 seconds without reloading it.
JS = f"""
const PIPELINES = {PIPELINE_TYPES_JSON};

function api(action, data) {{
  return fetch('/api/action', {{
    method:'POST', headers:{{'Content-Type':'application/json'}},
    body: JSON.stringify({{action, ...data}})
  }}).then(r=>r.json());
}}

function updateParamFields(pipeline) {{
  const p = PIPELINES[pipeline];
  if (!p) return;
  const container = document.getElementById('param-fields');
  container.innerHTML = '';
  for (const key of p.params) {{
    const val = p.defaults[key] ?? '';
    const label = document.createElement('label');
    label.textContent = key;
    const input = document.createElement('input');
    input.name = 'param_' + key;
    input.value = val;
    label.appendChild(input);
    container.appendChild(label);
  }}
}}

function createPreset(e) {{
  e.preventDefault();
  const form = e.target;
  const fd = new FormData(form);
  const gpus = fd.getAll('gpus');
  const params = {{}};
  for (const [k,v] of fd.entries()) {{
    if (k.startsWith('param_')) params[k.slice(6)] = v;
  }}
  api('create_preset', {{
    name: fd.get('name'),
    pipeline: fd.get('pipeline'),
    gpus, params
  }}).then(() => liveRefresh());
  return false;
}}

function launchPreset(id) {{
  if (confirm('Launch this preset now?'))
    api('launch_preset', {{preset_id: id}}).then(() => setTimeout(()=>location.reload(), 1000));
}}

function deletePreset(id) {{
  if (confirm('Delete this preset?'))
    api('delete_preset', {{preset_id: id}}).then(() => location.reload());
}}

function loadModel(gpuId) {{
  const sel = document.getElementById('ms-' + gpuId);
  if (!sel) return;
  api('load_model', {{gpu_id: gpuId, model: sel.value}}).then(() =>
    setTimeout(()=>location.reload(), 3000));
}}

function cancelJob(id) {{
  api('cancel_job', {{job_id: id}}).then(() => location.reload());
}}

function cancelSchedule(id) {{
  api('cancel_schedule', {{schedule_id: id}}).then(() => location.reload());
}}

// Schedule modal
let _schedPresetId = null;
function schedulePreset(id) {{
  _schedPresetId = id;
  document.getElementById('sched-modal').classList.add('active');
}}
function closeModal() {{
  document.getElementById('sched-modal').classList.remove('active');
}}
function submitSchedule(e) {{
  e.preventDefault();
  const fd = new FormData(e.target);
  const ttype = fd.get('trigger_type');
  const trigger = {{type: ttype}};
  if (ttype === 'time') {{
    const mode = fd.get('time_mode');
    if (mode === 'at') trigger.at = fd.get('time_at');
    else {{
      // Explicit radix, and fall back to 0 so NaN never reaches the server as null
      const hours = parseInt(fd.get('duration_hours'), 10) || 0;
      const mins = parseInt(fd.get('duration_mins'), 10) || 0;
      trigger.duration_seconds = hours*3600 + mins*60;
    }}
  }} else if (ttype === 'cost') {{
    trigger.threshold_usd = parseFloat(fd.get('cost_threshold'));
  }}
  api('create_schedule', {{preset_id: _schedPresetId, trigger}}).then(() => {{
    // The page is no longer reloaded here, so the dialog must be dismissed explicitly
    closeModal();
    liveRefresh();
  }});
  return false;
}}

// Init param fields for first pipeline
document.addEventListener('DOMContentLoaded', () => {{
  const sel = document.querySelector('[name=pipeline]');
  if (sel) updateParamFields(sel.value);
}});

// Live refresh — update dynamic sections without reloading the page
function liveRefresh() {{
  fetch('/api/fragments').then(r => r.json()).then(f => {{
    const ids = {{'gpu-grid':'gpus', 'train-section':'training', 'schedule-section':'schedule', 'jobs-section':'jobs', 'presets-list':'presets', 'refresh-time':'refresh_time'}};
    for (const [elId, key] of Object.entries(ids)) {{
      const el = document.getElementById(elId);
      if (el && f[key] != null) el.innerHTML = f[key];
    }}
  }}).catch(() => {{}});
}}
setInterval(liveRefresh, 10000);
"""
|
|
|
|
|
|
# ── HTTP Handler ───────────────────────────────────────────────────────────
|
|
|
|
class SchedulerHandler(BaseHTTPRequestHandler):
    """HTTP handler for the dashboard page and its JSON API.

    GET routes: ``/`` and ``/dashboard`` (full page), ``/api/state``,
    ``/api/training``, ``/api/presets``, ``/api/pipelines`` and
    ``/api/fragments`` (HTML fragments for in-place refresh).
    POST route: ``/api/action`` with a JSON object whose ``action`` key
    selects the operation (see ``_handle_action``).
    """

    # Schedule-trigger dialog, appended to the rendered page just before </body>.
    _SCHEDULE_MODAL = """<div class="modal-overlay" id="sched-modal">
<div class="modal"><h3>Schedule Trigger</h3>
<form onsubmit="return submitSchedule(event)">
<label>Trigger Type<select name="trigger_type" onchange="document.querySelectorAll('.trig-opts').forEach(e=>e.style.display='none');document.getElementById('trig-'+this.value).style.display='block'">
<option value="time">Time</option><option value="finish_training">After Training</option><option value="cost">Cost Threshold</option>
</select></label>
<div id="trig-time" class="trig-opts">
<label>Mode<select name="time_mode"><option value="at">At specific time</option><option value="duration">After duration</option></select></label>
<label>At (ISO)<input name="time_at" placeholder="2026-03-21T08:00:00"></label>
<label>Duration hours<input name="duration_hours" type="number" value="0"></label>
<label>Duration mins<input name="duration_mins" type="number" value="0"></label>
</div>
<div id="trig-finish_training" class="trig-opts" style="display:none">
<div style="font-size:0.8rem;color:#999;padding:0.3rem">Fires when current training run completes all steps.</div>
</div>
<div id="trig-cost" class="trig-opts" style="display:none">
<label>Threshold ($)<input name="cost_threshold" type="number" step="0.01" value="1.00"></label>
</div>
<div class="btn-row">
<button type="button" class="btn btn-sm" onclick="closeModal()">Cancel</button>
<button type="submit" class="btn btn-sm">Create Trigger</button>
</div>
</form></div></div>"""

    def log_message(self, format, *args):
        """Suppress the base class's per-request logging."""
        pass

    def do_GET(self):
        """Serve the dashboard page or one of the read-only JSON endpoints."""
        path = urlparse(self.path).path
        if path in ("/", "/dashboard"):
            html = _render_page()
            # Inject schedule modal at end of body
            html = html.replace("</body>", self._SCHEDULE_MODAL + "</body>")
            self._respond(200, html, "text/html")
        elif path == "/api/state":
            # Serialise while holding the lock so the snapshot is consistent.
            with _lock:
                data = {"gpus": _state["gpus"], "training": _state["training"],
                        "presets": _presets, "jobs": _jobs[-20:], "schedule": _schedule[-20:]}
                payload = json.dumps(data, default=str, indent=2)
            self._respond(200, payload, "application/json")
        elif path == "/api/training":
            self._respond(200, json.dumps(_fetch_training_status(), default=str), "application/json")
        elif path == "/api/presets":
            self._respond(200, json.dumps(_presets, indent=2), "application/json")
        elif path == "/api/pipelines":
            info = {k: {"label": v["label"], "description": v["description"],
                        "params": v["params"], "defaults": v["defaults"]}
                    for k, v in PIPELINE_TYPES.items()}
            self._respond(200, json.dumps(info, indent=2), "application/json")
        elif path == "/api/fragments":
            self._respond(200, json.dumps(self._fragments()), "application/json")
        else:
            self._respond(404, "Not found", "text/plain")

    def _fragments(self):
        """Return the HTML fragments used for live refresh (no full page reload)."""
        with _lock:
            state = dict(_state)
        gpu_cards = "".join(
            _gpu_card_html(state["gpus"].get(
                gpu["id"], {"online": False, "id": gpu["id"], "name": gpu["name"]}))
            for gpu in GPUS
        )
        online_count = sum(1 for g in state["gpus"].values() if g.get("online"))
        last_refresh = state.get("last_refresh", "never")
        return {
            "gpus": gpu_cards,
            "training": _training_card_html(state.get("training")),
            "schedule": _schedule_panel_html(),
            "jobs": _jobs_panel_html(),
            "presets": _presets_list_html(),
            "refresh_time": f"{online_count}/{len(GPUS)} GPUs online — refreshed {last_refresh}",
        }

    def do_POST(self):
        """Dispatch a JSON action posted to ``/api/action``.

        Responds 404 for any other path, 400 when the body is not a valid JSON
        object, 500 with ``{"ok": false, "error": ...}`` when the action
        raises, and 200 with the action's result otherwise.
        """
        path = urlparse(self.path).path
        if path != "/api/action":
            self._respond(404, "Not found", "text/plain")
            return

        # Parse inside a try so a malformed request gets a response instead of
        # an unhandled exception. JSONDecodeError and UnicodeDecodeError are
        # both ValueError subclasses, as is a non-numeric Content-Length.
        try:
            length = int(self.headers.get("Content-Length", 0))
            # length > 0 (not just truthy): read(-1) would block until EOF.
            body = json.loads(self.rfile.read(length)) if length > 0 else {}
        except ValueError as e:
            self._respond(400, json.dumps({"ok": False, "error": f"invalid request body: {e}"}),
                          "application/json")
            return
        if not isinstance(body, dict):
            self._respond(400, json.dumps({"ok": False, "error": "request body must be a JSON object"}),
                          "application/json")
            return

        action = body.get("action", "")

        try:
            result = self._handle_action(action, body)
            self._respond(200, json.dumps(result, default=str), "application/json")
        except Exception as e:
            # Top-level boundary: report the failure to the client as JSON.
            self._respond(500, json.dumps({"ok": False, "error": str(e)}), "application/json")

    def _handle_action(self, action, body):
        """Execute one API action and return its JSON-serialisable result.

        Args:
            action: Action name taken from the request body.
            body: Parsed request body (a dict) carrying the action's arguments.

        Returns:
            dict: Always contains ``ok``; unknown actions return
            ``{"ok": False, "error": "unknown action: ..."}``.
        """
        if action == "refresh":
            threading.Thread(target=refresh_state, daemon=True).start()
            return {"ok": True}

        elif action == "create_preset":
            pid = str(uuid.uuid4())[:8]
            _presets[pid] = {
                "id": pid,
                "name": body.get("name", "unnamed"),
                "pipeline": body.get("pipeline", "self_play"),
                "gpus": body.get("gpus", []),
                "params": body.get("params", {}),
                # The "Z" suffix denotes UTC, so format UTC time explicitly;
                # strftime without a time tuple would use local time.
                "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            }
            _save_presets()
            return {"ok": True, "preset_id": pid}

        elif action == "delete_preset":
            pid = body.get("preset_id")
            if pid in _presets:
                del _presets[pid]
                _save_presets()
            return {"ok": True}

        elif action == "launch_preset":
            job = _launch_preset(body.get("preset_id"))
            return {"ok": True, "job": job}

        elif action == "create_schedule":
            sid = str(uuid.uuid4())[:8]
            sched = {
                "id": sid,
                "preset_id": body.get("preset_id"),
                "trigger": body.get("trigger", {}),
                "status": "pending",
                "created_at": datetime.now().isoformat(),
            }
            _schedule.append(sched)
            _save_schedule()
            return {"ok": True, "schedule_id": sid}

        elif action == "cancel_schedule":
            sid = body.get("schedule_id")
            for s in _schedule:
                if s["id"] == sid:
                    s["status"] = "cancelled"
            _save_schedule()
            return {"ok": True}

        elif action == "cancel_job":
            jid = body.get("job_id")
            for j in _jobs:
                if j["id"] == jid and j["status"] == "running":
                    j["status"] = "cancelled"
            _save_jobs()
            return {"ok": True}

        elif action == "load_model":
            gpu_id = body.get("gpu_id")
            model = body.get("model")
            gpu = GPU_MAP.get(gpu_id)
            if not gpu:
                return {"ok": False, "error": "unknown GPU"}
            # A one-token generate request is sent for the chosen model;
            # presumably this makes Ollama load it — TODO confirm.
            result = _ollama_api(gpu, "/api/generate", method="POST", data={
                "model": model, "prompt": "test", "stream": False,
                "options": {"num_predict": 1},
            })
            return {"ok": True, "result": result}

        elif action == "stop_ollama":
            gpu_id = body.get("gpu_id", "3090ti")
            gpu = GPU_MAP.get(gpu_id)
            if gpu:
                svc = gpu.get("ollama_service", "ollama.service")
                _ssh_cmd(gpu, f"sudo systemctl stop {svc} 2>&1", timeout=10)
            return {"ok": True}

        elif action == "start_ollama":
            gpu_id = body.get("gpu_id", "3090ti")
            gpu = GPU_MAP.get(gpu_id)
            if gpu:
                svc = gpu.get("ollama_service", "ollama.service")
                _ssh_cmd(gpu, f"sudo systemctl start {svc} 2>&1", timeout=10)
            return {"ok": True}

        return {"ok": False, "error": f"unknown action: {action}"}

    def _respond(self, code, body, content_type):
        """Send a complete response.

        Args:
            code: HTTP status code.
            body: Response body; ``str`` is encoded with the default codec,
                anything else is written as-is.
            content_type: Value for the Content-Type header.
        """
        payload = body.encode() if isinstance(body, str) else body
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(payload)))
        # NOTE(review): wildcard CORS on an API with state-changing actions —
        # confirm this is intended given the dashboard sits behind auth.
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(payload)
|
|
|
|
|
|
# ── Main ───────────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Parse CLI options, start background refreshing, and serve until interrupted.

    Options: ``--port`` (default ``PORT``) and ``--refresh-interval`` in
    seconds (default 10). Persisted presets, jobs and schedules are loaded
    first; a daemon thread then runs ``_bg_refresh_loop`` and one synchronous
    ``refresh_state()`` is performed before the server starts listening.
    """
    parser = argparse.ArgumentParser(description="Mortdecai GPU Scheduler")
    parser.add_argument("--port", type=int, default=PORT)
    parser.add_argument("--refresh-interval", type=int, default=10)
    args = parser.parse_args()

    _load_persisted()
    print(f"Loaded {len(_presets)} presets, {len(_jobs)} jobs, {len(_schedule)} schedules")

    t = threading.Thread(target=_bg_refresh_loop, args=(args.refresh_interval,), daemon=True)
    t.start()

    print("Initial GPU scan...")
    refresh_state()

    server = HTTPServer(("0.0.0.0", args.port), SchedulerHandler)
    print(f"GPU Scheduler on http://0.0.0.0:{args.port}")
    print(f"  {len(GPUS)} GPUs, refresh {args.refresh_interval}s")
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nShutting down.")
    finally:
        # Release the listening socket however the serve loop ended.
        server.server_close()


if __name__ == "__main__":
    main()
|