Files
Mortdecai/web/gpu_scheduler.py
Mortdecai da8f557219 GPU scheduler, 14-tool architecture, plugin deployment, event dispatcher
GPU Scheduler (gpu.sethpc.xyz):
- Live dashboard with 4 GPUs, training monitor, loss sparklines
- Preset-based job scheduler with 3 triggers (time, finish_training, cost)
- Model selection per GPU, pipeline configuration
- Tool self-play and training pipeline types
- Behind Google OAuth, live-refresh without page reload

Tool Architecture (14 tools):
- 3 new tools: world.nearby_entities, memory.read, memory.write
- 7 script.* tools: write, validate, execute, read, list, delete, schedule
- ScriptManager: full mcfunction datapack CRUD with RCON validation
- Training data: 1,430 tool examples (up from 1,159)

Plugin Deployment (paper-ai-25567):
- WorldGuard 7.0.12, CoreProtect CE 23.1, EssentialsX 2.21.2, Vault 1.7.3
- Fresh greenfield world reset
- 104 RCON-validated plugin training examples

Event Dispatcher:
- Watches server log for deaths, joins, advancements, PvP kills
- Configurable trigger probability and cooldowns per event type
- Deployed to dev server, fires god_system prompts on events
- 21 event-response training examples

Training Infrastructure:
- train_lora.py: --save-steps 50, --resume from checkpoint
- run_training.sh: stops Ollama, activates conda, restarts after
- Passwordless sudo for ollama services on steel141
- Dev server added to MCSManager with autoStart

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 03:14:45 -04:00

1437 lines
58 KiB
Python

#!/usr/bin/env python3
"""
Mortdecai GPU Scheduler — preset-based job scheduler with live GPU monitoring.
Features:
- GPU dashboard with live stats across the homelab
- Configuration presets (GPU assignments, model selection, pipeline type)
- Job scheduler with 3 trigger types: time, finish_training, cost
- Model management: load/unload Ollama models per GPU
- Training progress monitor with loss curves
Usage:
python3 gpu_scheduler.py --port 8098
Serve behind Caddy as gpu.sethpc.xyz with google_auth.
"""
import argparse
import json
import os
import re
import subprocess
import threading
import time
import uuid
from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path
from urllib.parse import parse_qs, urlparse
from datetime import datetime, timedelta
# TCP port constant; the module usage string runs the server with --port 8098
# (presumably this is the default for that flag - confirm in the CLI setup).
PORT = 8098
# Directory used by the persistence helpers for presets.json, jobs.json and
# schedule.json: <grandparent of this file>/data/scheduler.
DATA_DIR = Path(__file__).resolve().parent.parent / "data" / "scheduler"
# ── GPU Inventory ──────────────────────────────────────────────────────────
# Static inventory of the GPUs this scheduler knows about. Fields as used by
# the code in this file:
#   id             - key in GPU_MAP and the value stored in job["gpus"]
#   name, location - display strings for the dashboard card
#   vram_gb        - copied into the stats dict by _fetch_gpu_stats
#   host           - SSH target handed to _ssh_cmd
#   gpu_index      - passed to `nvidia-smi --id=`
#   pct_id         - when present, _ssh_cmd wraps commands in `pct exec <id>`
#   ollama_port    - port _ollama_api curls on localhost of the target
#   capabilities   - shown on the card (presumably matched against
#                    PIPELINE_TYPES[...]["gpu_req"] elsewhere - TODO confirm)
#   ssh_extra      - extra ssh command-line options, split on whitespace
#   ssh_pass       - password handed to sshpass by _ssh_cmd
# NOTE(review): "ollama_service" and "vram_mb" are not read by the code visible
# in this part of the file - confirm their consumers.
# NOTE(review): "ssh_pass" embeds a credential in source control; consider
# loading it from the environment or switching that host to key-based auth.
GPUS = [
{
"id": "3090ti", "name": "RTX 3090 Ti", "vram_gb": 24, "vram_mb": 24564,
"host": "seth@192.168.0.141", "gpu_index": 1,
"ollama_port": 11434, "ollama_service": "ollama.service",
"capabilities": ["training", "inference", "self-play", "pipeline"],
"location": "steel141",
},
{
"id": "2080ti", "name": "RTX 2080 Ti", "vram_gb": 11, "vram_mb": 11264,
"host": "seth@192.168.0.141", "gpu_index": 0,
"ollama_port": 11435, "ollama_service": "ollama-gpu0.service",
"capabilities": ["inference", "self-play", "pipeline", "generator"],
"location": "steel141",
},
{
"id": "rtx4000", "name": "Quadro RTX 4000", "vram_gb": 8, "vram_mb": 8192,
"host": "pve197", "gpu_index": 0, "pct_id": 105,
"ollama_port": 11434, "ollama_service": "ollama.service",
"capabilities": ["inference", "self-play", "pipeline", "prod"],
"location": "pve197 → CT 105",
},
{
"id": "1660s", "name": "GTX 1660 Super", "vram_gb": 6, "vram_mb": 6144,
"host": "root@192.168.0.235", "gpu_index": 0,
"ollama_port": 11434, "ollama_service": "ollama.service",
"capabilities": ["generator", "inference-small"],
"location": "bedroom",
"ssh_extra": "-o StrictHostKeyChecking=no", "ssh_pass": "REDACTED_PASSWORD",
},
]
# Lookup table: GPU id -> inventory entry (the same dict objects as in GPUS).
GPU_MAP = {g["id"]: g for g in GPUS}
# ── Pipeline Definitions ──────────────────────────────────────────────────
# Catalogue of job types. The top-level keys are the values stored in
# job["pipeline"] and dispatched on by _run_job_async. Per entry:
#   label       - text shown in the "Pipeline" <select> of the preset form
#   description - human-readable summary
#   gpu_req     - capability names (presumably matched against a GPU's
#                 "capabilities" list elsewhere - TODO confirm)
#   params      - parameter names for this pipeline (presumably used to build
#                 the preset form fields - TODO confirm)
#   defaults    - default value for each parameter
# NOTE(review): the "rcon_pass" defaults embed credentials in source control.
PIPELINE_TYPES = {
"training": {
"label": "Training (QLoRA)",
"description": "Fine-tune model via Unsloth QLoRA",
"gpu_req": ["training"],
"params": ["base_model", "dataset", "output_name", "epochs", "lr", "batch_size", "grad_accum", "max_seq_len", "save_steps"],
"defaults": {
"base_model": "Qwen/Qwen3.5-9B", "dataset": "auto", "output_name": "mortdecai-0.5.0",
"epochs": 1, "lr": 1e-4, "batch_size": 2, "grad_accum": 4, "max_seq_len": 2048, "save_steps": 50,
},
},
"self_play": {
"label": "Self-Play",
"description": "Model generates edge cases and learns from failures",
"gpu_req": ["inference"],
"params": ["model", "tiers", "rounds_per_tier", "rcon_host", "rcon_port", "rcon_pass"],
"defaults": {
"model": "mortdecai:0.4.0", "tiers": "1,2,3",
"rounds_per_tier": 50, "rcon_host": "192.168.0.244", "rcon_port": 25578,
"rcon_pass": "REDACTED_RCON",
},
},
"prompt_pipeline": {
"label": "Prompt Pipeline",
"description": "Small model generates prompts, big models process + RCON validate",
"gpu_req": ["generator", "inference"],
"params": ["gen_model", "proc_model", "batch_size", "interval"],
"defaults": {
"gen_model": "qwen3.5:0.8b", "proc_model": "mortdecai:0.4.0",
"batch_size": 30, "interval": 120,
},
},
"bakeoff": {
"label": "Bake-off",
"description": "Compare model versions on standard test prompts",
"gpu_req": ["inference"],
"params": ["models", "test_set", "rcon_host"],
"defaults": {
"models": "mortdecai:0.4.0,mortdecai:0.5.0", "test_set": "standard",
"rcon_host": "192.168.0.244",
},
},
"export_gguf": {
"label": "Export GGUF",
"description": "Convert LoRA adapter to GGUF for Ollama",
"gpu_req": ["training"],
"params": ["adapter_path", "output_name", "quant"],
"defaults": {
"adapter_path": "training/checkpoints/mortdecai-0.5.0",
"output_name": "mortdecai:0.5.0", "quant": "q4_k_m",
},
},
"tool_self_play": {
"label": "Tool Self-Play",
"description": "Exercise all 14 tools on the dev server — scripts, memory, entities, wiki",
"gpu_req": ["inference"],
"params": ["model", "rounds", "categories", "rcon_host", "rcon_port", "rcon_pass"],
"defaults": {
"model": "mortdecai:0.4.0", "rounds": 10,
"categories": "all",
"rcon_host": "192.168.0.112", "rcon_port": 25578,
"rcon_pass": "REDACTED_RCON",
},
},
"load_model": {
"label": "Load Model",
"description": "Load/switch Ollama model on a GPU",
"gpu_req": ["inference"],
"params": ["model"],
"defaults": {"model": "mortdecai:0.4.0"},
},
}
# ── State ──────────────────────────────────────────────────────────────────
# Lock taken by refresh_state() when publishing a snapshot into _state and by
# _render_page() when copying it.
_lock = threading.Lock()
# Latest monitoring snapshot, filled in by refresh_state().
_state = {
# GPU id -> stats dict built by refresh_state()
"gpus": {},
# Result of _fetch_training_status(), or None when no log was found
"training": None,
# "HH:MM:SS" of the last refresh; None until the first one has run
"last_refresh": None,
}
_presets = {} # id -> preset dict
_jobs = [] # list of job dicts
_schedule = [] # list of scheduled trigger dicts
# Running totals read by the "cost" trigger in _check_triggers().
# NOTE(review): electricity_rate is presumably USD per kWh, and nothing in the
# visible part of the file updates these totals - confirm both.
_cost_tracker = {"total_kwh": 0.0, "total_cost": 0.0, "electricity_rate": 0.12}
# Shell glob listed with `ls -t` on TRAINING_HOST to find the newest training log.
TRAINING_LOG_PATTERN = "/home/seth/mc-ai-training/Minecraft-AI-model/training/train_run_*.log"
# SSH target on which training and most pipeline scripts are launched.
TRAINING_HOST = "seth@192.168.0.141"
# ── Persistence ────────────────────────────────────────────────────────────
def _ensure_data_dir():
    """Create DATA_DIR, including any missing parent directories, if absent."""
    os.makedirs(DATA_DIR, exist_ok=True)
def _save_presets():
    """Persist the in-memory presets to DATA_DIR/presets.json.

    The JSON text is rendered in memory first and written to a uniquely named
    temporary file that is then renamed over the target. A serialization error
    or a crash mid-write therefore cannot leave a truncated presets.json
    behind (which would make the next startup's json.load fail).
    """
    _ensure_data_dir()
    target = DATA_DIR / "presets.json"
    payload = json.dumps(_presets, indent=2)
    tmp = target.with_name(f"{target.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp, "w") as f:
            f.write(payload)
        os.replace(tmp, target)
    finally:
        # Still present only when the write or the rename failed.
        if tmp.exists():
            tmp.unlink()
def _save_jobs():
    """Persist the in-memory job list to DATA_DIR/jobs.json.

    Values json cannot encode natively are stringified (default=str). The JSON
    text is rendered in memory first and written to a uniquely named temporary
    file that is then renamed over the target, so a failed or interrupted save
    cannot leave a truncated jobs.json, and two threads saving at the same
    time cannot interleave their output in one file.
    """
    _ensure_data_dir()
    target = DATA_DIR / "jobs.json"
    payload = json.dumps(_jobs, indent=2, default=str)
    tmp = target.with_name(f"{target.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp, "w") as f:
            f.write(payload)
        os.replace(tmp, target)
    finally:
        # Still present only when the write or the rename failed.
        if tmp.exists():
            tmp.unlink()
def _save_schedule():
    """Persist the in-memory trigger list to DATA_DIR/schedule.json.

    Values json cannot encode natively are stringified (default=str). The JSON
    text is rendered in memory first and written to a uniquely named temporary
    file that is then renamed over the target, so a failed or interrupted save
    cannot leave a truncated schedule.json behind.
    """
    _ensure_data_dir()
    target = DATA_DIR / "schedule.json"
    payload = json.dumps(_schedule, indent=2, default=str)
    tmp = target.with_name(f"{target.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp, "w") as f:
            f.write(payload)
        os.replace(tmp, target)
    finally:
        # Still present only when the write or the rename failed.
        if tmp.exists():
            tmp.unlink()
def _load_persisted():
    """Load presets, jobs and schedule from DATA_DIR into the module globals.

    A missing file leaves the corresponding global untouched. A file that
    cannot be read, is not valid JSON, or holds the wrong top-level type is
    reported and skipped instead of aborting startup.
    """
    global _presets, _jobs, _schedule
    _ensure_data_dir()

    def _read(name, expected_type):
        """Return the decoded content of DATA_DIR/name, or None if unusable."""
        path = DATA_DIR / name
        if not path.exists():
            return None
        try:
            with open(path) as f:
                data = json.load(f)
        except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
            print(f"[scheduler] ignoring unreadable {name}: {e}")
            return None
        if not isinstance(data, expected_type):
            print(f"[scheduler] ignoring {name}: expected a JSON {expected_type.__name__}")
            return None
        return data

    presets = _read("presets.json", dict)
    if presets is not None:
        _presets = presets
    jobs = _read("jobs.json", list)
    if jobs is not None:
        # NOTE(review): jobs persisted as "running" by a previous process are
        # loaded as-is although their worker threads no longer exist.
        _jobs = jobs
    schedule = _read("schedule.json", list)
    if schedule is not None:
        _schedule = schedule
# ── SSH Helpers ────────────────────────────────────────────────────────────
def _ssh_cmd(gpu_or_host, cmd, timeout=8):
    """Run a shell command on a remote host over SSH.

    Args:
        gpu_or_host: either an SSH target string, or a GPU inventory dict whose
            "host", optional "ssh_extra" (extra ssh options), optional
            "ssh_pass" (password for sshpass) and optional "pct_id" are used.
        cmd: the command line to run on the remote side.
        timeout: seconds before the local ssh process is abandoned.

    Returns:
        The command's stripped stdout when it exits with status 0 (this may be
        an empty string), otherwise None. Any failure to run ssh at all
        (timeout, missing binary, ...) also yields None.
    """
    import shlex  # local import: shlex is not among the module-level imports

    if isinstance(gpu_or_host, dict):
        gpu = gpu_or_host
        host = gpu["host"]
        extra = gpu["ssh_extra"].split() if gpu.get("ssh_extra") else []
        ssh_pass = gpu.get("ssh_pass")
        if "pct_id" in gpu:
            # Run inside the container via the Proxmox host. shlex.quote keeps
            # commands that themselves contain single quotes (e.g. a curl
            # -d '<json>' payload) intact; plain '...' wrapping did not.
            cmd = f"pct exec {gpu['pct_id']} -- bash -c {shlex.quote(cmd)}"
    else:
        host, extra, ssh_pass = gpu_or_host, [], None

    env = None
    if ssh_pass:
        # Hand the password over via $SSHPASS (sshpass -e) rather than -p so
        # it does not show up in the local process list.
        env = dict(os.environ, SSHPASS=ssh_pass)
        argv = ["sshpass", "-e", "ssh", "-o", "ConnectTimeout=4"]
    else:
        argv = ["ssh", "-o", "ConnectTimeout=4", "-o", "BatchMode=yes"]
    argv += extra + [host, cmd]

    try:
        r = subprocess.run(argv, capture_output=True, text=True, timeout=timeout, env=env)
    except Exception:
        # Deliberate best effort: callers treat None as "host unreachable".
        return None
    return r.stdout.strip() if r.returncode == 0 else None
def _ollama_api(gpu, endpoint, method="GET", data=None):
    """Query the Ollama HTTP API of a GPU by running curl on it through SSH.

    Any method other than "GET" is sent as a POST whose body is the JSON
    encoding of data ("{}" when data is empty). Returns the decoded JSON
    response, or None when the call produced no output or non-JSON output.
    """
    url = f"http://localhost:{gpu['ollama_port']}{endpoint}"
    curl = "curl -s --connect-timeout 3"
    if method != "GET":
        body = "{}"
        if data:
            # Escape single quotes so the payload survives '...' quoting.
            body = json.dumps(data).replace("'", "'\\''")
        command = f"{curl} -X POST {url} -d '{body}'"
    else:
        command = f"{curl} {url}"
    output = _ssh_cmd(gpu, command)
    if not output:
        return None
    try:
        return json.loads(output)
    except json.JSONDecodeError:
        return None
# ── GPU Monitoring ─────────────────────────────────────────────────────────
def _fetch_gpu_stats(gpu):
    """Query nvidia-smi for one GPU and return a stats dict.

    Returns:
        {"online": False, id, name} when the SSH call produced nothing;
        {"online": True, id, name, "error": <raw output>} when the output
        could not be parsed; otherwise the full stats dict (utilization,
        temperature, power_watts, vram_used_mb, vram_total_mb, fan_speed,
        vram_pct plus the static inventory fields).
    """
    fields = "utilization.gpu,temperature.gpu,power.draw,memory.used,memory.total,fan.speed"
    query = f"nvidia-smi --id={gpu['gpu_index']} --query-gpu={fields} --format=csv,noheader,nounits"
    raw = _ssh_cmd(gpu, query)
    if not raw:
        return {"online": False, "id": gpu["id"], "name": gpu["name"]}
    parts = [p.strip() for p in raw.split(",")]
    try:
        util, temp, power, used, total, fan = parts[:6]
        vram_used, vram_total = int(used), int(total)
        return {
            "online": True, "id": gpu["id"], "name": gpu["name"],
            "vram_gb": gpu["vram_gb"], "location": gpu["location"],
            "capabilities": gpu["capabilities"],
            "utilization": int(util), "temperature": int(temp),
            "power_watts": float(power),
            "vram_used_mb": vram_used, "vram_total_mb": vram_total,
            "fan_speed": int(fan) if fan not in ("[N/A]", "[Not Supported]") else None,
            # A reported total of 0 must not raise out of the fetch thread.
            "vram_pct": round(vram_used / vram_total * 100, 1) if vram_total else 0.0,
        }
    except (ValueError, IndexError):
        # Too few columns or a non-numeric value such as "[N/A]".
        return {"online": True, "id": gpu["id"], "name": gpu["name"], "error": raw}
def _fetch_ollama_info(gpu):
    """Return the models Ollama has loaded and the models it has installed.

    The result is {"running": [{name, size_gb, vram_gb}, ...],
    "available": [name, ...]}; both lists are empty when the API is
    unreachable.
    """
    loaded = _ollama_api(gpu, "/api/ps") or {}
    installed = _ollama_api(gpu, "/api/tags") or {}
    running = [
        {
            "name": entry.get("name", "?"),
            "size_gb": round(entry.get("size", 0) / 1e9, 1),
            "vram_gb": round(entry.get("size_vram", 0) / 1e9, 1),
        }
        for entry in loaded.get("models", [])
    ]
    available = []
    for entry in installed.get("models", []):
        available.append(entry.get("name", "?"))
    return {"running": running, "available": available}
def _parse_training_log(raw):
    """Extract progress, loss history and failure state from a log tail.

    Returns a dict that always has "active" and "loss_history" and, when
    available, pct/current_step/total_steps/eta/elapsed (from the last
    progress bar), error, latest_loss and learning_rate.
    """
    status = {"active": False, "loss_history": []}

    # Progress bars look like "42%|####  | 75/179 [01:23<02:10, 1.2s/it]".
    bars = re.findall(r'(\d+)%\|[^|]*\|\s*(\d+)/(\d+)\s*\[([^\]]+)\]', raw)
    if bars:
        pct, current, total, timing = bars[-1]
        status["pct"] = int(pct)
        status["current_step"] = int(current)
        status["total_steps"] = int(total)
        eta_match = re.search(r'<([^,]+)', timing)
        elapsed_match = re.match(r'([^<]+)', timing)
        if eta_match:
            status["eta"] = eta_match.group(1).strip()
        if elapsed_match:
            status["elapsed"] = elapsed_match.group(1).strip()
        # A run whose bar reached N/N and that printed its end-of-run summary
        # is finished, not active. Without this, "active" stayed True forever
        # after a successful run, so consumers that wait for
        # "not active and current == total" could never observe completion.
        # NOTE(review): assumes the trainer prints a summary containing
        # "train_runtime" (as the Hugging Face Trainer does) - confirm for
        # train_lora.py. Other bars that end at N/N (e.g. dataset mapping)
        # are intentionally not treated as completion.
        finished = (
            0 < status["total_steps"] <= status["current_step"]
            and "train_runtime" in raw
        )
        status["active"] = not finished

    if "OutOfMemoryError" in raw:
        status["active"] = False
        status["error"] = "OOM"
    elif "Error" in raw.split("\n")[-1]:
        status["active"] = False
        status["error"] = "crashed"

    # Accept both "'loss': '1.23'" (quoted) and "'loss': 1.23" (bare) forms.
    for quoted, bare in re.findall(r"'loss':\s*(?:'([^']+)'|([^\s,}']+))", raw):
        try:
            status["loss_history"].append(float(quoted or bare))
        except ValueError:
            pass
    if status["loss_history"]:
        status["latest_loss"] = status["loss_history"][-1]

    lr_matches = re.findall(r"'learning_rate':\s*(?:'([^']+)'|([^\s,}']+))", raw)
    if lr_matches:
        quoted, bare = lr_matches[-1]
        status["learning_rate"] = quoted or bare
    return status


def _fetch_training_status():
    """Read the newest training log on TRAINING_HOST and summarize it.

    Returns None when no log matches TRAINING_LOG_PATTERN or its tail cannot
    be read; otherwise the dict produced by _parse_training_log().
    """
    import shlex  # local import: shlex is not among the module-level imports

    # The pattern must stay unquoted so the remote shell expands the glob.
    log_path = _ssh_cmd(TRAINING_HOST, f"ls -t {TRAINING_LOG_PATTERN} 2>/dev/null | head -1", timeout=5)
    if not log_path:
        return None
    raw = _ssh_cmd(TRAINING_HOST, f"tail -200 {shlex.quote(log_path)} 2>/dev/null", timeout=8)
    if not raw:
        return None
    return _parse_training_log(raw)
def _fetch_processes(gpu):
    """List the compute processes nvidia-smi reports for one GPU.

    Returns a list of {"pid", "name", "vram_mb"} dicts (all values are
    strings; "name" is the basename of the reported process path), or an
    empty list when the query returned nothing.
    """
    query = (
        f"nvidia-smi --id={gpu['gpu_index']} "
        "--query-compute-apps=pid,name,used_memory --format=csv,noheader,nounits 2>/dev/null"
    )
    output = _ssh_cmd(gpu, query)
    if not output:
        return []
    rows = (
        [col.strip() for col in line.split(",")]
        for line in output.strip().split("\n")
        if line.strip()
    )
    return [
        {"pid": row[0], "name": row[1].split("/")[-1], "vram_mb": row[2]}
        for row in rows
        if len(row) >= 3
    ]
def refresh_state():
    """Collect stats for every GPU in parallel and publish them into _state.

    Each GPU is polled in its own thread (stats, Ollama models, processes);
    threads are given up to 12 seconds each to finish. The training status is
    fetched afterwards, and everything is stored under _lock in one step.
    """
    new_gpus = {}

    def fetch_one(gpu):
        stats = _fetch_gpu_stats(gpu)
        stats["ollama"] = _fetch_ollama_info(gpu)
        stats["processes"] = _fetch_processes(gpu)
        # Check if any job is running on this GPU
        active_jobs = [
            j for j in list(_jobs)
            if j.get("status") == "running" and gpu["id"] in j.get("gpus", [])
        ]
        stats["active_job"] = active_jobs[0]["id"] if active_jobs else None
        new_gpus[gpu["id"]] = stats

    # daemon=True so a worker stuck past the join timeout cannot keep the
    # process alive at shutdown.
    threads = [threading.Thread(target=fetch_one, args=(gpu,), daemon=True) for gpu in GPUS]
    for t in threads:
        t.start()
    for t in threads:
        t.join(timeout=12)

    # Do the slow SSH round-trips before taking the lock so page renders are
    # never blocked on network I/O.
    training = _fetch_training_status()
    with _lock:
        # Publish a copy: a worker that outlived its join timeout may still
        # write into new_gpus and must not mutate the dict readers iterate.
        _state["gpus"] = dict(new_gpus)
        _state["training"] = training
        _state["last_refresh"] = time.strftime("%H:%M:%S")
def _bg_refresh_loop(interval=10):
    """Run forever: refresh the GPU state, evaluate triggers, then sleep.

    An exception in either step is printed and the loop carries on after the
    usual pause of interval seconds.
    """
    while True:
        try:
            for step in (refresh_state, _check_triggers):
                step()
        except Exception as exc:
            print(f"[scheduler] refresh error: {exc}")
        time.sleep(interval)
# ── Job Execution ──────────────────────────────────────────────────────────
def _run_job_async(job):
    """Execute a job in a background daemon thread and return the job dict.

    The thread marks the job "running", dispatches on job["pipeline"] to the
    matching _exec_* function, and finally records "completed" (unless the
    executor or an exception already set another status) together with
    finished_at. The job list is saved at start and at the end.
    """
    def _dispatch():
        pipeline = job["pipeline"]
        params = job["params"]
        gpus = job["gpus"]
        # Lambdas keep the executor lookup lazy: only the selected pipeline's
        # function is resolved and called.
        handlers = {
            "training": lambda: _exec_training(job, params),
            "self_play": lambda: _exec_self_play(job, params, gpus),
            "prompt_pipeline": lambda: _exec_prompt_pipeline(job, params, gpus),
            "load_model": lambda: _exec_load_model(job, params, gpus),
            "export_gguf": lambda: _exec_export_gguf(job, params),
            "bakeoff": lambda: _exec_bakeoff(job, params, gpus),
            "tool_self_play": lambda: _exec_tool_self_play(job, params, gpus),
        }
        handler = handlers.get(pipeline)
        if handler is None:
            job["error"] = f"unknown pipeline: {pipeline}"
            job["status"] = "failed"
        else:
            handler()

    def _run():
        job["status"] = "running"
        # NOTE(review): the trailing "Z" suggests UTC but time.strftime
        # formats local time - confirm what consumers of these stamps expect.
        job["started_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        _save_jobs()
        print(f"[scheduler] starting job {job['id']}: {job['pipeline']}")
        try:
            _dispatch()
        except Exception as e:
            job["error"] = str(e)
            job["status"] = "failed"
            print(f"[scheduler] job {job['id']} failed: {e}")
        # Executors that fail or get cancelled set their own status.
        if job["status"] == "running":
            job["status"] = "completed"
        job["finished_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        _save_jobs()
        # Fixed: id and status used to be printed with no separator.
        print(f"[scheduler] job {job['id']} {job['status']}")

    t = threading.Thread(target=_run, daemon=True)
    t.start()
    return job
def _exec_training(job, params):
    """Launch training on the 3090 Ti via SSH and monitor it until it ends.

    Steps: build the remote command, cancel other running jobs on the 3090 Ti,
    stop both Ollama services, make sure at least 18000 MB of VRAM is free,
    start training in the background on TRAINING_HOST and poll the training
    log every 30 seconds. Both Ollama services are started again on every
    exit path, including errors.

    Sets job["status"]/job["error"] on failure, job["log_path"] after launch
    and job["progress"] while monitoring.
    """
    import shlex  # local import: shlex is not among the module-level imports

    output_name = params.get('output_name', 'mortdecai-0.5.0')
    log_name = f"train_run_{output_name}.log"
    log_rel = f"training/{log_name}"
    checkpoint_dir = f"training/checkpoints/{output_name}"
    base_model = params.get('base_model', 'Qwen/Qwen3.5-9B')
    # Build the training command with conda environment activation. Free-form
    # values are shell-quoted and numeric ones are coerced, so a preset value
    # cannot break out of (or break) the command line.
    train_cmd = (
        "source /home/seth/miniconda3/etc/profile.d/conda.sh && "
        "conda activate mc-train && "
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        "TORCH_COMPILE_DISABLE=1 TORCHDYNAMO_DISABLE=1 CUDA_VISIBLE_DEVICES=0 "
        "python3 training/scripts/train_lora.py "
        f"--model {shlex.quote(str(base_model))} "
        f"--output {shlex.quote(checkpoint_dir)} "
        f"--lr {float(params.get('lr', 1e-4))} "
        f"--epochs {int(params.get('epochs', 1))} "
        f"--batch-size {int(params.get('batch_size', 2))} "
        f"--grad-accum {int(params.get('grad_accum', 4))} "
        f"--max-seq-len {int(params.get('max_seq_len', 2048))} "
        f"--save-steps {int(params.get('save_steps', 50))}"
    )
    if params.get("resume"):
        train_cmd += " --resume"
    train_cmd += f" 2>&1 | tee {shlex.quote(log_rel)}"

    # Cancel any running jobs on the 3090 Ti to free VRAM
    for j in _jobs:
        if j.get("status") == "running" and "3090ti" in j.get("gpus", []) and j["id"] != job["id"]:
            j["status"] = "cancelled"
            print(f"[training] cancelled conflicting job {j['id']} on 3090ti")
    _save_jobs()

    def _free_vram_raw():
        """Raw nvidia-smi output for the free memory (MB) of GPU 1, or None."""
        return _ssh_cmd(TRAINING_HOST, "nvidia-smi --id=1 --query-gpu=memory.free --format=csv,noheader,nounits")

    def _to_mb(text):
        """Parse nvidia-smi output as an int; anything unusable counts as 0."""
        try:
            return int(text.strip()) if text else 0
        except ValueError:
            return 0

    # Stop both Ollama services
    _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama.service 2>/dev/null", timeout=10)
    _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama-gpu0.service 2>/dev/null", timeout=10)
    try:
        time.sleep(2)
        # Kill any lingering ollama processes holding GPU 1 VRAM
        _ssh_cmd(TRAINING_HOST,
                 "for pid in $(nvidia-smi --id=1 --query-compute-apps=pid --format=csv,noheader,nounits 2>/dev/null); do kill $pid 2>/dev/null; done",
                 timeout=5)
        time.sleep(3)

        # Verify VRAM is free enough (need ~18GB free on 24GB card). When the
        # query itself fails the check is skipped, as before.
        vram_check = _free_vram_raw()
        if vram_check:
            free_mb = _to_mb(vram_check)
            print(f"[training] 3090 Ti free VRAM: {free_mb}MB")
            if free_mb < 18000:
                # Last resort: try harder to free VRAM
                _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama.service; sudo systemctl stop ollama-gpu0.service", timeout=10)
                time.sleep(5)
                free_mb = _to_mb(_free_vram_raw())
                if free_mb < 18000:
                    job["status"] = "failed"
                    job["error"] = f"Not enough VRAM: {free_mb}MB free, need 18000MB"
                    return

        # Launch training in the background via nohup + bash -c
        nohup_cmd = f"nohup bash -c {shlex.quote(train_cmd)} > /dev/null 2>&1 &"
        if _ssh_cmd(TRAINING_HOST, nohup_cmd, timeout=10) is None:
            # Without this the monitor below would watch a stale, older log.
            job["status"] = "failed"
            job["error"] = "could not launch training over SSH"
            return
        job["log_path"] = log_rel
        print(f"[training] launched, logging to {log_name}")

        # Monitor until done
        # NOTE(review): success is only detected once the status reports
        # active == False at N/N steps - verify _fetch_training_status ever
        # does that after a clean run; otherwise this loop only ends on an
        # error or when the job status is changed from outside.
        while job["status"] == "running":
            time.sleep(30)
            status = _fetch_training_status()
            if not status:
                continue
            job["progress"] = status
            if status.get("error"):
                job["status"] = "failed"
                job["error"] = status["error"]
                break
            total = status.get("total_steps", 0)
            if not status.get("active") and total > 0 and status.get("current_step", 0) == total:
                job["status"] = "completed"
                break
    finally:
        # Restart Ollama services whatever happened above
        _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama.service 2>/dev/null", timeout=10)
        _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama-gpu0.service 2>/dev/null", timeout=10)
def _exec_self_play(job, params, gpus):
    """Run self-play tiers on the assigned GPUs, one worker thread per GPU.

    Tiers are distributed round-robin over the resolvable GPUs. Each worker
    launches self_play.py on its GPU's host for one tier at a time and polls
    the tier's log until it shows "Complete", "Error" or "Traceback", the
    poll budget (rounds * 3 polls of 10 s) runs out, or the job leaves the
    "running" state. Errors are collected into job["error"] (first three).
    """
    import shlex  # local import: shlex is not among the module-level imports

    resolved_gpus = [GPU_MAP[gid] for gid in gpus if gid in GPU_MAP]
    if not resolved_gpus:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return
    model = params.get("model", "mortdecai:0.4.0")
    raw_tiers = params.get("tiers", "1,2,3")
    if isinstance(raw_tiers, str):
        raw_tiers = raw_tiers.split(",")
    # Drop blanks such as the one produced by a trailing comma.
    tiers = [str(t).strip() for t in raw_tiers if str(t).strip()]
    if not tiers:
        job["error"] = "no tiers specified"
        job["status"] = "failed"
        return
    rounds = int(params.get("rounds_per_tier", 50))
    rcon_host = params.get("rcon_host", "192.168.0.244")
    rcon_port = int(params.get("rcon_port", 25578))
    rcon_pass = params.get("rcon_pass", "REDACTED_RCON")
    script_path = "/home/seth/mc-ai-training/Minecraft-AI-model/training/scripts/self_play.py"

    # Distribute tiers round-robin across GPUs, launch all in parallel
    gpu_assignments = {}  # gpu_id -> list of tiers
    for i, tier in enumerate(tiers):
        gpu = resolved_gpus[i % len(resolved_gpus)]
        gpu_assignments.setdefault(gpu["id"], []).append(tier)
    job["gpu_assignments"] = dict(gpu_assignments)

    errors = []

    def run_on_gpu(gpu, assigned_tiers):
        port = gpu["ollama_port"]
        for tier in assigned_tiers:
            if job["status"] != "running":
                break
            log_file = f"/tmp/selfplay_{gpu['id']}_{tier}.log"
            # Free-form values are shell-quoted so that e.g. a password with
            # special characters cannot break the remote command line.
            cmd = (
                "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
                f"python3 {script_path} --tier {shlex.quote(tier)} --rounds {rounds} "
                f"--ollama-url http://localhost:{port} --model {shlex.quote(str(model))} "
                f"--rcon-host {shlex.quote(str(rcon_host))} --rcon-port {rcon_port} "
                f"--rcon-pass {shlex.quote(str(rcon_pass))}"
            )
            _ssh_cmd(gpu, f"nohup bash -c {shlex.quote(cmd)} > {shlex.quote(log_file)} 2>&1 &", timeout=10)
            print(f"[self-play] {gpu['name']}: {tier} x{rounds} started")
            # Wait for this tier to finish
            for _ in range(rounds * 3):
                time.sleep(10)
                log = _ssh_cmd(gpu, f"tail -5 {shlex.quote(log_file)} 2>/dev/null")
                if log and ("Complete" in log or "Error" in log or "Traceback" in log):
                    if "Error" in log or "Traceback" in log:
                        errors.append(f"{gpu['name']}/{tier}: {log[-200:]}")
                    break
                if job["status"] != "running":
                    break

    # Launch all GPU workers in parallel threads
    threads = []
    for gid, assigned_tiers in gpu_assignments.items():
        t = threading.Thread(target=run_on_gpu, args=(GPU_MAP[gid], assigned_tiers), daemon=True)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    if errors:
        job["error"] = "; ".join(errors[:3])
def _exec_prompt_pipeline(job, params, gpus):
    """Start prompt_pipeline.py on TRAINING_HOST in the background.

    gpus[0] hosts the generator model and gpus[1] (falling back to the
    generator GPU) the processing model. The script is launched with nohup
    and not monitored; its output goes to /tmp/pipeline.log on the host.
    """
    import shlex  # local import: shlex is not among the module-level imports

    gen_gpu = GPU_MAP.get(gpus[0]) if len(gpus) > 0 else None
    proc_gpu = GPU_MAP.get(gpus[1]) if len(gpus) > 1 else gen_gpu
    if not gen_gpu:
        job["error"] = "no GPUs assigned"
        job["status"] = "failed"
        return
    if not proc_gpu:
        proc_gpu = gen_gpu

    def _endpoint_host(gpu):
        """Address under which the GPU's Ollama is reachable over the network."""
        if "pct_id" in gpu:
            # Ollama runs inside the container, not on the Proxmox host that
            # "host" names; same address _exec_tool_self_play uses.
            return "192.168.0.179"  # CT 105 external IP
        return gpu["host"].split("@")[-1]

    gen_url = f"http://{_endpoint_host(gen_gpu)}:{gen_gpu['ollama_port']}"
    proc_url = f"http://{_endpoint_host(proc_gpu)}:{proc_gpu['ollama_port']}"
    gen_model = params.get('gen_model', 'qwen3.5:0.8b')
    proc_model = params.get('proc_model', 'mortdecai:0.4.0')
    # NOTE(review): the "batch_size" preset parameter is not passed on to the
    # script - confirm whether that is intended.
    cmd = (
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        "python3 training/scripts/prompt_pipeline.py --mode all "
        f"--gen-url {shlex.quote(gen_url)} "
        f"--gen-model {shlex.quote(str(gen_model))} "
        f"--proc-urls {shlex.quote(proc_url)} "
        f"--proc-model {shlex.quote(str(proc_model))} "
        f"--interval {shlex.quote(str(params.get('interval', 120)))}"
    )
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c {shlex.quote(cmd)} > /tmp/pipeline.log 2>&1 &", timeout=10)
def _exec_load_model(job, params, gpus):
    """Load an Ollama model on every assigned GPU.

    A one-token /api/generate request makes Ollama load the model. Successes
    are collected in job["result"]; any failure is collected in job["error"]
    and marks the job "failed". As in the other executors, having no
    resolvable GPU is a failure rather than a silent no-op.
    """
    model = params.get("model", "mortdecai:0.4.0")
    targets = [GPU_MAP[gid] for gid in gpus if gid in GPU_MAP]
    if not targets:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return
    loaded, failed = [], []
    for gpu in targets:
        # NOTE(review): this goes through _ssh_cmd's default 8 s timeout; a
        # model that takes longer to load is reported as failed - confirm.
        result = _ollama_api(gpu, "/api/generate", method="POST", data={
            "model": model, "prompt": "test", "stream": False,
            "options": {"num_predict": 1},
        })
        if result and "error" not in result:
            loaded.append(f"Loaded {model} on {gpu['name']}")
        else:
            failed.append(f"Failed to load {model} on {gpu['name']}: {result}")
    if loaded:
        job["result"] = "; ".join(loaded)
    if failed:
        job["error"] = "; ".join(failed)
        job["status"] = "failed"
def _exec_export_gguf(job, params):
    """Start a GGUF export on TRAINING_HOST and watch its log.

    The export runs in the background with output in /tmp/export_gguf.log.
    The log tail is polled every 15 s, up to 120 times, until it contains
    "Saved" or "Error"/"error"; in the latter case the job is marked failed
    with the log tail as its error.
    """
    import shlex  # local import: shlex is not among the module-level imports

    adapter = params.get("adapter_path", "training/checkpoints/mortdecai-0.5.0")
    quant = params.get("quant", "q4_k_m")
    # Quote the free-form values so they reach the script as single arguments.
    cmd = (
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        f"python3 -m unsloth.save --model {shlex.quote(str(adapter))} "
        f"--output_type gguf --quantization {shlex.quote(str(quant))}"
    )
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c {shlex.quote(cmd)} > /tmp/export_gguf.log 2>&1 &", timeout=10)
    # Monitor
    # NOTE(review): if neither marker shows up within the poll budget the job
    # is reported as completed by the caller - confirm that is intended.
    for _ in range(120):
        time.sleep(15)
        log = _ssh_cmd(TRAINING_HOST, "tail -3 /tmp/export_gguf.log 2>/dev/null")
        if not log:
            continue
        has_error = "Error" in log or "error" in log
        if has_error:
            job["status"] = "failed"
            job["error"] = log
        if has_error or "Saved" in log:
            break
def _exec_bakeoff(job, params, gpus):
    """Start bakeoff.py on TRAINING_HOST in the background.

    The first assigned GPU must resolve, otherwise the job fails with
    "no GPU assigned". The script is not monitored; its output goes to
    /tmp/bakeoff.log on the host.
    """
    import shlex  # local import: shlex is not among the module-level imports

    gpu = GPU_MAP.get(gpus[0]) if gpus else None
    if not gpu:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return
    # NOTE(review): the resolved GPU and the "test_set"/"rcon_host" preset
    # parameters are not passed to the script - confirm that is intended.
    models = params.get("models", "mortdecai:0.4.0")
    # The model list is passed as one shell-quoted argument.
    cmd = (
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        f"python3 training/scripts/bakeoff.py --models {shlex.quote(str(models))}"
    )
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c {shlex.quote(cmd)} > /tmp/bakeoff.log 2>&1 &", timeout=10)
def _exec_tool_self_play(job, params, gpus):
    """Run tool-focused self-play on the dev server via the assigned GPU's Ollama.

    tool_self_play.py is started in the background on TRAINING_HOST and pointed
    at the Ollama endpoint of the first assigned GPU. Its log is polled every
    15 s until it shows "Complete" or "Traceback" (the latter fails the job),
    the poll budget is used up, or the job leaves the "running" state.
    """
    import shlex  # local import: shlex is not among the module-level imports

    gpu = GPU_MAP.get(gpus[0]) if gpus else None
    if not gpu:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return
    host_ip = gpu["host"].split("@")[-1]
    # For pct-based GPUs, use the CT's external IP
    if "pct_id" in gpu:
        host_ip = "192.168.0.179"  # CT 105 external IP
    port = gpu["ollama_port"]
    model = params.get("model", "mortdecai:0.4.0")
    rounds = int(params.get("rounds", 10))
    categories = params.get("categories", "all")
    rcon_host = params.get("rcon_host", "192.168.0.112")
    rcon_port = int(params.get("rcon_port", 25578))
    rcon_pass = params.get("rcon_pass", "REDACTED_RCON")
    script_path = "/home/seth/mc-ai-training/Minecraft-AI-model/training/scripts/tool_self_play.py"
    log_file = f"/tmp/tool_selfplay_{gpu['id']}.log"
    ollama_url = f"http://{host_ip}:{port}"
    # Free-form values are shell-quoted so special characters (e.g. in the
    # RCON password) cannot break the remote command line.
    cmd = (
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        f"python3 {script_path} "
        f"--ollama-url {shlex.quote(ollama_url)} --model {shlex.quote(str(model))} "
        f"--rcon-host {shlex.quote(str(rcon_host))} --rcon-port {rcon_port} "
        f"--rcon-pass {shlex.quote(str(rcon_pass))} "
        f"--rounds {rounds} --categories {shlex.quote(str(categories))}"
    )
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c {shlex.quote(cmd)} > {shlex.quote(log_file)} 2>&1 &", timeout=10)
    print(f"[tool-self-play] launched on {gpu['name']}, logging to {log_file}")
    # Monitor until done
    # NOTE(review): the poll budget scales with len(PIPELINE_TYPES), which is
    # the number of pipeline types, not of tools or categories - confirm.
    for _ in range(rounds * len(PIPELINE_TYPES) * 3):
        time.sleep(15)
        log = _ssh_cmd(TRAINING_HOST, f"tail -5 {shlex.quote(log_file)} 2>/dev/null")
        if log and ("Complete" in log or "Traceback" in log):
            if "Traceback" in log:
                job["error"] = log[-300:]
                job["status"] = "failed"
            break
        if job["status"] != "running":
            break
# ── Trigger Engine ─────────────────────────────────────────────────────────
def _check_triggers():
    """Evaluate all pending scheduled triggers and launch presets that fire.

    Trigger types:
      time            - fires once trigger["at"] has passed, or once
                        trigger["duration_seconds"] have elapsed since the
                        schedule entry's created_at
      finish_training - fires when the last training status shows all steps
                        done and is no longer active
      cost            - fires when the tracked total cost reaches
                        trigger["threshold_usd"]

    A malformed entry is skipped; it no longer raises and thereby prevents
    the remaining triggers from being evaluated.
    """
    now = datetime.now()

    def _passed(stamp, delay_s=0):
        """True once ISO timestamp stamp plus delay_s seconds lies in the past."""
        try:
            moment = datetime.fromisoformat(stamp) + timedelta(seconds=int(delay_s))
        except (TypeError, ValueError, OverflowError):
            return False
        # An offset-aware stamp must be compared with an aware "now"; mixing
        # aware and naive datetimes raises TypeError.
        reference = datetime.now(moment.tzinfo) if moment.tzinfo is not None else now
        return reference >= moment

    # Iterate over a snapshot: other threads may add or remove entries.
    for sched in list(_schedule):
        if sched.get("status") != "pending":
            continue
        trigger = sched.get("trigger") or {}
        kind = trigger.get("type")
        fired = False
        if kind == "time":
            if trigger.get("at") and _passed(trigger["at"]):
                fired = True
            duration_s = trigger.get("duration_seconds")
            created_str = sched.get("created_at")
            if duration_s and created_str and _passed(created_str, duration_s):
                fired = True
        elif kind == "finish_training":
            # NOTE(review): this looks at the latest log only, so a trigger
            # created after a run has already finished fires on the next
            # check - confirm that is intended.
            training = _state.get("training")
            if training:
                total = training.get("total_steps", 0)
                current = training.get("current_step", 0)
                if total > 0 and current >= total and not training.get("active"):
                    fired = True
        elif kind == "cost":
            try:
                threshold = float(trigger.get("threshold_usd", 999))
            except (TypeError, ValueError):
                threshold = None
            if threshold is not None and _cost_tracker["total_cost"] >= threshold:
                fired = True
        if fired:
            # Mark as fired before launching so the trigger cannot fire twice.
            sched["status"] = "fired"
            sched["fired_at"] = now.isoformat()
            _save_schedule()
            preset_id = sched.get("preset_id")
            print(f"[scheduler] trigger fired: {sched.get('id')} → launching preset {preset_id}")
            _launch_preset(preset_id)
def _launch_preset(preset_id):
    """Build a job from the preset with the given id, record it and start it.

    Returns the new job dict, or None (after logging) when the preset is
    unknown or empty.
    """
    preset = _presets.get(preset_id)
    if not preset:
        print(f"[scheduler] preset {preset_id} not found")
        return None

    # Keys are inserted in the order in which they are persisted to jobs.json.
    job = {"id": str(uuid.uuid4())[:8], "preset_id": preset_id}
    job["preset_name"] = preset.get("name", "?")
    job["pipeline"] = preset["pipeline"]
    job["params"] = preset.get("params", {})
    job["gpus"] = preset.get("gpus", [])
    job["status"] = "queued"
    job["created_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ")

    _jobs.append(job)
    _save_jobs()
    _run_job_async(job)
    return job
# ── HTML Rendering ─────────────────────────────────────────────────────────
def _render_page():
    """Render the complete dashboard page as an HTML string.

    Works on a shallow copy of _state taken under _lock and combines the GPU
    cards, training card, schedule, jobs and presets panels with the module's
    CSS and JS.
    """
    with _lock:
        state = dict(_state)
    gpu_state = state["gpus"]
    # GPUs without a snapshot yet are rendered as offline.
    gpu_cards = "".join(
        _gpu_card_html(gpu_state.get(gpu["id"], {"online": False, "id": gpu["id"], "name": gpu["name"]}))
        for gpu in GPUS
    )
    training_html = _training_card_html(state.get("training"))
    presets_list_html, presets_form_html = _presets_panel_html()
    schedule_html = _schedule_panel_html()
    jobs_html = _jobs_panel_html()
    # The key always exists and is None before the first refresh, so a .get()
    # default would never apply and the page would show "refreshed None".
    last_refresh = state.get("last_refresh") or "never"
    online_count = sum(1 for g in gpu_state.values() if g.get("online"))
    return f"""<!DOCTYPE html>
<html><head>
<title>Mortdecai GPU Scheduler</title>
<meta charset="utf-8">
<style>
{CSS}
</style>
</head>
<body>
<header>
<h1>Mortdecai GPU Scheduler</h1>
<div class="subtitle"><span id="refresh-time">{online_count}/{len(GPUS)} GPUs online &mdash; refreshed {last_refresh}</span>
<button class="btn btn-sm" onclick="api('refresh')">Refresh</button>
</div>
</header>
<div class="layout">
<div class="main-col">
<div id="train-section">{training_html}</div>
<h2>GPUs</h2>
<div class="grid" id="gpu-grid">{gpu_cards}</div>
<div id="schedule-section">{schedule_html}</div>
<div id="jobs-section">{jobs_html}</div>
</div>
<div class="side-col">
<div class="panel">
<h3>Presets</h3>
<div id="presets-list">{presets_list_html}</div>
{presets_form_html}
</div>
</div>
</div>
<script>
{JS}
</script>
</body></html>"""
def _gpu_card_html(d):
    """Render the dashboard card for one GPU stats dict.

    d is a dict as produced by refresh_state(): offline GPUs get a short
    "OFFLINE" card, online ones get utilization/VRAM bars, temperature, power,
    running models, a model selector (when models are available), the active
    job badge and the capability tags. All dynamic text is HTML-escaped,
    because model names and similar values come from remote hosts.
    """
    import html  # local import: html is not among the module-level imports

    def esc(value):
        return html.escape(str(value), quote=True)

    if not d.get("online"):
        return f"""<div class="card offline"><div class="card-header"><b>{esc(d.get('name','?'))}</b><span class="bad">OFFLINE</span></div></div>"""
    util = d.get("utilization", 0)
    temp = d.get("temperature", 0)
    vram_pct = d.get("vram_pct", 0)
    vram_used = d.get("vram_used_mb", 0)
    vram_total = d.get("vram_total_mb", 0)
    power = d.get("power_watts", 0)
    # CSS classes: temperature and VRAM turn warn/bad when high, utilization
    # is "ok" when busy.
    tc = "bad" if temp > 80 else "warn" if temp > 70 else "ok"
    uc = "ok" if util > 50 else "warn" if util > 10 else "dim"
    vc = "bad" if vram_pct > 90 else "warn" if vram_pct > 70 else "ok"
    ollama = d.get("ollama", {})
    running = ollama.get("running", [])
    avail = ollama.get("available", [])
    gpu_id = esc(d['id'])
    gpu_name = esc(d['name'])
    location = esc(d.get('location', ''))
    if running:
        model_tags = " ".join(f'<span class="tag">{esc(m["name"])}</span>' for m in running)
    else:
        model_tags = '<span class="tag dim">idle</span>'
    avail_options = "".join(f'<option value="{esc(m)}">{esc(m)}</option>' for m in avail if m)
    model_select = f"""<select class="model-select" id="ms-{gpu_id}">{avail_options}</select>
<button class="btn btn-xs" onclick="loadModel('{gpu_id}')">Load</button>""" if avail else ""
    active_job = d.get("active_job")
    job_badge = f'<span class="tag accent">job {esc(active_job)}</span>' if active_job else ""
    caps = " ".join(f'<span class="cap">{esc(c)}</span>' for c in d.get("capabilities", []))
    activity = 'ACTIVE' if util > 10 else 'IDLE'
    return f"""<div class="card" id="gpu-{gpu_id}">
<div class="card-header"><b>{gpu_name}</b><span class="{uc}">{activity}</span></div>
<div class="card-sub">{location} {job_badge}</div>
<div class="bar-row"><span class="bar-label">GPU</span><div class="bar"><div class="bar-fill" style="width:{util}%">{util}%</div></div></div>
<div class="bar-row"><span class="bar-label">VRAM</span><div class="bar"><div class="bar-fill {vc}" style="width:{vram_pct}%">{vram_used}/{vram_total}MB</div></div></div>
<div class="stats"><span class="{tc}">{temp}C</span> <span>{power:.0f}W</span></div>
<div class="models">{model_tags}</div>
<div class="model-ctrl">{model_select}</div>
<div class="caps">{caps}</div>
</div>"""
def _training_card_html(t):
    """Render the training progress card.

    t is the status dict from the training log parser, or a falsy value when
    there is no log. Shows the state (CRASHED/TRAINING/STOPPED), a progress
    bar, elapsed/ETA, the latest loss, the learning rate and an SVG sparkline
    of up to the last 40 loss values.
    """
    import html  # local import: html is not among the module-level imports

    if not t:
        return '<div class="card"><div class="card-header"><b>Training</b><span class="dim">no log</span></div></div>'

    def esc(value):
        return html.escape(str(value), quote=True)

    pct = t.get("pct", 0)
    step = t.get("current_step", 0)
    total = t.get("total_steps", 0)
    error = t.get("error")
    active = t.get("active", False)
    loss = t.get("latest_loss")
    # These strings are scraped from the training log, hence escaped.
    lr = esc(t.get("learning_rate", "?"))
    eta = esc(t.get("eta", "?"))
    elapsed = esc(t.get("elapsed", "?"))
    if error:
        status = f'<span class="bad">CRASHED ({esc(error)})</span>'
    elif active:
        status = '<span class="ok">TRAINING</span>'
    else:
        status = '<span class="warn">STOPPED</span>'
    # A loss of exactly 0.0 is a real value; only a missing loss shows "?".
    loss_text = f"{loss:.4f}" if loss is not None else "?"
    # Sparkline
    lh = t.get("loss_history", [])
    spark = ""
    if lh:
        recent = lh[-40:]
        mx, mn = max(recent), min(recent)
        rng = mx - mn if mx != mn else 1  # avoid dividing by zero on a flat line
        w, h = 400, 70
        last_index = max(len(recent) - 1, 1)
        pts = " ".join(f"{i/last_index*w:.0f},{h-((v-mn)/rng*h):.0f}" for i, v in enumerate(recent))
        spark = f"""<svg width="{w}" height="{h}" class="spark"><polyline points="{pts}" fill="none" stroke="#D35400" stroke-width="1.5"/>
<text x="0" y="10" fill="#666" font-size="9">{mx:.4f}</text><text x="0" y="{h}" fill="#666" font-size="9">{mn:.4f}</text></svg>"""
    return f"""<div class="card train-card">
<div class="card-header"><b>Training</b>{status}</div>
<div class="progress"><div class="progress-fill" style="width:{pct}%">{step}/{total} ({pct}%)</div></div>
<div class="stats">
<span>Elapsed: {elapsed}</span> <span>ETA: {eta}</span>
<span>Loss: <b class="accent">{loss_text}</b></span> <span>LR: {lr}</span>
</div>
{spark}
</div>"""
def _presets_list_html():
    """Just the preset rows — refreshable without touching the form.

    Rows are sorted by preset name. Preset names, pipeline names and GPU ids
    are HTML-escaped because presets are created from user input. A preset
    without "name" or "pipeline" is shown with "?" instead of raising.
    """
    import html  # local import: html is not among the module-level imports

    def esc(value):
        return html.escape(str(value), quote=True)

    rows = ""
    for pid, p in sorted(_presets.items(), key=lambda x: x[1].get("name", "")):
        gpus = esc(", ".join(str(g) for g in p.get("gpus", [])))
        name = esc(p.get("name", "?"))
        pipeline = esc(p.get("pipeline", "?"))
        # NOTE(review): the id is HTML-escaped only; it is presumably
        # generated server-side - confirm it can never contain quotes, which
        # would still break out of the JavaScript string below.
        safe_pid = esc(pid)
        rows += f"""<div class="preset-row">
<div class="preset-name">{name}</div>
<div class="preset-info">{pipeline} &mdash; {gpus}</div>
<div class="preset-actions">
<button class="btn btn-xs" onclick="launchPreset('{safe_pid}')">Run</button>
<button class="btn btn-xs" onclick="schedulePreset('{safe_pid}')">Schedule</button>
<button class="btn btn-xs btn-danger" onclick="deletePreset('{safe_pid}')">Del</button>
</div></div>"""
    if not rows:
        rows = '<div class="dim" style="padding:0.5rem">No presets yet. Create one below.</div>'
    return rows
def _presets_panel_html():
    """Build both halves of the presets panel.

    Returns:
        tuple[str, str]: ``(list_html, form_html)``. ``list_html`` is whatever
        ``_presets_list_html()`` returns; ``form_html`` is the "New Preset"
        form, with one ``<option>`` per ``PIPELINE_TYPES`` entry and one
        checkbox per ``GPUS`` entry.
    """
    list_html = _presets_list_html()

    option_tags = []
    for key, spec in PIPELINE_TYPES.items():
        option_tags.append(f'<option value="{key}">{spec["label"]}</option>')
    pipeline_options = "".join(option_tags)

    checkbox_tags = []
    for gpu in GPUS:
        checkbox_tags.append(
            f'<label class="gpu-check"><input type="checkbox" name="gpus" value="{gpu["id"]}"> {gpu["name"]}</label>'
        )
    gpu_checkboxes = "".join(checkbox_tags)

    # The #param-fields container is left empty here; the page script fills it
    # when the pipeline <select> changes (see its onchange handler).
    form_lines = [
        '<div class="create-form">',
        '<h3 style="margin-top:0.8rem">New Preset</h3>',
        '<form id="preset-form" onsubmit="return createPreset(event)">',
        '<label>Name<input name="name" required placeholder="overnight-selfplay"></label>',
        '<label>Pipeline<select name="pipeline" onchange="updateParamFields(this.value)">'
        + pipeline_options
        + '</select></label>',
        '<div class="gpu-select"><label>GPUs</label>' + gpu_checkboxes + '</div>',
        '<div id="param-fields"></div>',
        '<button type="submit" class="btn">Save Preset</button>',
        '</form>',
        '</div>',
    ]
    form_html = "\n".join(form_lines)
    return list_html, form_html
def _schedule_panel_html():
    """Render the "Scheduled Triggers" card.

    Shows the ten most recently created entries of ``_schedule`` (newest
    first): status, preset name, a human-readable trigger description and,
    for pending entries, a Cancel button. The header counts pending entries
    across the whole schedule, not only the ten that are displayed.

    Preset names and trigger fields are stored from API request bodies, so
    everything interpolated into the markup is HTML-escaped, and malformed
    entries (missing id, null duration, non-dict trigger) fall back to
    placeholders instead of raising.

    Returns:
        str: HTML for the card.
    """
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    def describe(trigger):
        """Return a short plain-text description of one trigger dict."""
        ttype = trigger.get("type", "?")
        if ttype == "time":
            if trigger.get("at"):
                return f"at {trigger['at']}"
            # The browser sends NaN as JSON null, so tolerate None / junk here.
            try:
                secs = int(trigger.get("duration_seconds") or 0)
            except (TypeError, ValueError, OverflowError):
                secs = 0
            return f"after {secs // 3600}h{(secs % 3600) // 60}m"
        if ttype == "finish_training":
            return "when training completes"
        if ttype == "cost":
            threshold = trigger.get("threshold_usd")
            return f"at ${'?' if threshold is None else threshold}"
        return str(ttype)

    status_classes = {"fired": "ok", "pending": "warn"}
    newest_first = sorted(
        _schedule,
        key=lambda entry: str(entry.get("created_at") or ""),
        reverse=True,
    )

    rows = []
    for entry in newest_first[:10]:
        preset_id = entry.get("preset_id")
        preset = _presets.get(preset_id) if isinstance(preset_id, str) else None
        # Fall back to the raw id when the preset is gone (e.g. deleted).
        preset_name = (preset or {}).get("name") or preset_id or "?"

        trigger = entry.get("trigger")
        if not isinstance(trigger, dict):
            trigger = {}

        status = str(entry.get("status", "?"))
        st_class = status_classes.get(status, "dim")

        cancel_btn = ""
        if status == "pending":
            # Schedule ids are server-generated hex; escaped defensively.
            sid = escape(str(entry.get("id", "")))
            cancel_btn = (
                f"""<button class="btn btn-xs btn-danger" onclick="cancelSchedule('{sid}')">Cancel</button>"""
            )

        rows.append(f"""<div class="sched-row">
<span class="{st_class}">{escape(status)}</span>
<span>{escape(str(preset_name))}</span>
<span class="dim">{escape(describe(trigger))}</span>
{cancel_btn}
</div>""")

    body = "".join(rows) or '<div class="dim" style="padding:0.5rem">No scheduled triggers.</div>'
    pending = sum(1 for entry in _schedule if entry.get("status") == "pending")
    return f"""<div class="card">
<div class="card-header"><b>Scheduled Triggers</b><span class="dim">{pending} pending</span></div>
{body}
</div>"""
def _jobs_panel_html():
    """Render the "Jobs" card: the 15 most recently created jobs, newest first.

    Each row shows status, preset (or pipeline) name, GPUs, creation time
    truncated to minutes, the error text if any, and a Stop button while the
    job is running. The header counts running jobs across all of ``_jobs``.

    Job names and error messages are free text (error output can easily
    contain ``<`` and ``>``), so every interpolated value is HTML-escaped, and
    entries with a missing id or a null ``created_at`` render instead of
    raising.

    Returns:
        str: HTML for the card.
    """
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    status_classes = {"completed": "ok", "failed": "bad", "running": "warn"}
    newest_first = sorted(
        _jobs,
        key=lambda job: str(job.get("created_at") or ""),
        reverse=True,
    )

    rows = []
    for job in newest_first[:15]:
        status = str(job.get("status", "?"))
        st_class = status_classes.get(status, "dim")
        gpus = escape(", ".join(str(g) for g in (job.get("gpus") or [])))
        name = escape(str(job.get("preset_name", job.get("pipeline", "?"))))
        err = ""
        if job.get("error"):
            err = f' <span class="bad">({escape(str(job["error"]))})</span>'
        stop_btn = ""
        if status == "running":
            # Job ids are server-generated; escaped defensively.
            jid = escape(str(job.get("id", "")))
            stop_btn = f"""<button class="btn btn-xs btn-danger" onclick="cancelJob('{jid}')">Stop</button>"""
        # First 16 characters of an ISO timestamp = date plus HH:MM.
        created = escape(str(job.get("created_at") or "")[:16])
        rows.append(f"""<div class="job-row">
<span class="{st_class}">{escape(status)}</span>
<span>{name}</span>
<span class="dim">{gpus}</span>
<span class="dim">{created}</span>
{err}
{stop_btn}
</div>""")

    body = "".join(rows) or '<div class="dim" style="padding:0.5rem">No jobs yet.</div>'
    running = sum(1 for job in _jobs if job.get("status") == "running")
    return f"""<div class="card">
<div class="card-header"><b>Jobs</b><span class="dim">{running} running</span></div>
{body}
</div>"""
# ── CSS ────────────────────────────────────────────────────────────────────
# Stylesheet for the dashboard, kept as a single module-level string
# (presumably inlined into the page by _render_page, which is not visible
# in this part of the file -- confirm there).
# Dark theme with #D35400 as the accent colour. The .ok / .warn / .bad / .dim
# classes are the status colours referenced by the HTML builder functions;
# .modal-overlay is hidden until it also carries the .active class.
CSS = """
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:'Courier New',monospace;background:#111;color:#e0e0e0;padding:1rem 1.5rem}
header{margin-bottom:1.2rem}
h1{color:#D35400;font-size:1.5rem;margin-bottom:0.2rem}
h2{color:#D35400;font-size:1.1rem;margin:1rem 0 0.6rem}
h3{color:#D35400;font-size:1rem;margin-bottom:0.6rem}
.subtitle{color:#666;font-size:0.8rem}
.accent{color:#D35400}
.ok{color:#4caf50} .warn{color:#ff9800} .bad{color:#f44336} .dim{color:#555}
.layout{display:grid;grid-template-columns:1fr 340px;gap:1.2rem}
@media(max-width:900px){.layout{grid-template-columns:1fr}}
.main-col{min-width:0}
.side-col{display:flex;flex-direction:column;gap:1rem}
.grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:0.8rem}
.card{background:#1a1a1a;border:1px solid #2a2a2a;border-radius:6px;padding:0.8rem;margin-bottom:0.8rem;transition:border-color 0.2s}
.card:hover{border-color:#D35400}
.card.offline{opacity:0.4;border-color:#f44336}
.card-header{display:flex;justify-content:space-between;align-items:center;margin-bottom:0.3rem;font-size:0.95rem}
.card-sub{color:#555;font-size:0.75rem;margin-bottom:0.5rem}
.bar-row{display:flex;align-items:center;margin:0.25rem 0;gap:0.4rem}
.bar-label{width:36px;color:#777;font-size:0.75rem}
.bar{flex:1;background:#222;border-radius:3px;height:20px;overflow:hidden}
.bar-fill{height:100%;border-radius:3px;background:#D35400;display:flex;align-items:center;padding-left:5px;font-size:0.7rem;color:#fff;min-width:fit-content;transition:width 0.5s}
.bar-fill.ok{background:#4caf50} .bar-fill.warn{background:#ff9800} .bar-fill.bad{background:#f44336}
.stats{display:flex;gap:0.8rem;font-size:0.8rem;margin:0.4rem 0;flex-wrap:wrap;color:#999}
.models{margin-top:0.4rem}
.model-ctrl{margin-top:0.3rem;display:flex;gap:0.3rem;align-items:center}
.model-select{background:#222;color:#ccc;border:1px solid #333;border-radius:3px;padding:2px 4px;font-size:0.75rem;font-family:monospace}
.tag{display:inline-block;background:#222;border:1px solid #333;border-radius:3px;padding:1px 6px;font-size:0.7rem;margin:1px}
.tag.accent{border-color:#D35400;color:#D35400}
.tag.dim{color:#444;border-color:#222}
.caps{margin-top:0.3rem;display:flex;gap:3px;flex-wrap:wrap}
.cap{font-size:0.65rem;color:#555;background:#1e1e1e;border-radius:2px;padding:1px 4px}
.train-card .progress{background:#222;border-radius:3px;height:26px;margin:0.4rem 0;overflow:hidden}
.train-card .progress-fill{height:100%;background:linear-gradient(90deg,#D35400,#e67e22);border-radius:3px;transition:width 1s;display:flex;align-items:center;justify-content:center;font-size:0.8rem;font-weight:bold;color:#fff}
.spark{display:block;margin-top:0.5rem;background:#1a1a1a;border:1px solid #222;border-radius:3px}
.panel{background:#1a1a1a;border:1px solid #2a2a2a;border-radius:6px;padding:0.8rem}
.preset-row{display:flex;flex-wrap:wrap;align-items:center;gap:0.4rem;padding:0.4rem 0;border-bottom:1px solid #222;font-size:0.8rem}
.preset-name{font-weight:bold;color:#e0e0e0;flex:1}
.preset-info{color:#777;font-size:0.75rem;width:100%}
.preset-actions{display:flex;gap:0.3rem}
.sched-row{display:flex;align-items:center;gap:0.5rem;padding:0.3rem 0;border-bottom:1px solid #222;font-size:0.8rem}
.job-row{display:flex;align-items:center;gap:0.5rem;padding:0.3rem 0;border-bottom:1px solid #1e1e1e;font-size:0.8rem}
.btn{background:#222;border:1px solid #D35400;color:#D35400;padding:5px 12px;border-radius:3px;cursor:pointer;font-family:monospace;font-size:0.8rem;transition:background 0.15s}
.btn:hover{background:#D35400;color:#fff}
.btn-sm{padding:3px 8px;font-size:0.75rem}
.btn-xs{padding:2px 6px;font-size:0.7rem}
.btn-danger{border-color:#f44336;color:#f44336}
.btn-danger:hover{background:#f44336;color:#fff}
.create-form{margin-top:0.5rem}
.create-form form{display:flex;flex-direction:column;gap:0.5rem;margin-top:0.6rem}
.create-form label{display:flex;flex-direction:column;font-size:0.8rem;color:#999;gap:0.2rem}
.create-form input,.create-form select{background:#222;color:#e0e0e0;border:1px solid #333;border-radius:3px;padding:4px 6px;font-family:monospace;font-size:0.8rem}
.gpu-select{display:flex;flex-wrap:wrap;gap:0.3rem;font-size:0.8rem;color:#999}
.gpu-check{display:flex;align-items:center;gap:0.2rem;font-size:0.75rem}
.gpu-check input{accent-color:#D35400}
.modal-overlay{display:none;position:fixed;top:0;left:0;width:100%;height:100%;background:rgba(0,0,0,0.7);z-index:100;justify-content:center;align-items:center}
.modal-overlay.active{display:flex}
.modal{background:#1a1a1a;border:1px solid #D35400;border-radius:8px;padding:1.2rem;width:400px;max-width:90vw}
.modal h3{margin-bottom:0.8rem}
.modal label{display:flex;flex-direction:column;font-size:0.8rem;color:#999;gap:0.2rem;margin-bottom:0.4rem}
.modal input,.modal select{background:#222;color:#e0e0e0;border:1px solid #333;border-radius:3px;padding:4px 6px;font-family:monospace;font-size:0.8rem}
.modal .btn-row{display:flex;gap:0.5rem;margin-top:0.8rem;justify-content:flex-end}
"""
# ── JS ─────────────────────────────────────────────────────────────────────
# Subset of PIPELINE_TYPES the browser needs (param names, defaults, label),
# serialised once at import time and embedded in the page script below.
PIPELINE_TYPES_JSON = json.dumps({
    name: {"params": spec["params"], "defaults": spec["defaults"], "label": spec["label"]}
    for name, spec in PIPELINE_TYPES.items()
})

# Client-side script for the dashboard. This is an f-string only so that
# PIPELINE_TYPES_JSON can be spliced in; every literal JavaScript brace is
# therefore doubled ({{ and }}).
#
# All actions refresh the page through liveRefresh() rather than
# location.reload(), so a half-filled "New Preset" form survives them, and
# api() surfaces {ok: false} replies instead of dropping them silently.
JS = f"""
const PIPELINES = {PIPELINE_TYPES_JSON};

// POST one action to the server; resolves to the decoded JSON reply.
function api(action, data) {{
  return fetch('/api/action', {{
    method: 'POST', headers: {{'Content-Type': 'application/json'}},
    body: JSON.stringify({{action, ...data}})
  }}).then(r => r.json()).then(res => {{
    if (res && res.ok === false) alert('Action failed: ' + (res.error || 'unknown error'));
    return res;
  }});
}}

// Rebuild the per-pipeline parameter inputs of the "New Preset" form.
function updateParamFields(pipeline) {{
  const p = PIPELINES[pipeline];
  if (!p) return;
  const container = document.getElementById('param-fields');
  container.innerHTML = '';
  for (const key of p.params) {{
    const val = p.defaults[key] ?? '';
    const label = document.createElement('label');
    label.textContent = key;
    const input = document.createElement('input');
    input.name = 'param_' + key;
    input.value = val;
    label.appendChild(input);
    container.appendChild(label);
  }}
}}

function createPreset(e) {{
  e.preventDefault();
  const form = e.target;
  const fd = new FormData(form);
  const gpus = fd.getAll('gpus');
  const params = {{}};
  for (const [k, v] of fd.entries()) {{
    if (k.startsWith('param_')) params[k.slice(6)] = v;
  }}
  api('create_preset', {{
    name: fd.get('name'),
    pipeline: fd.get('pipeline'),
    gpus, params
  }}).then(() => liveRefresh());
  return false;
}}

function launchPreset(id) {{
  if (confirm('Launch this preset now?'))
    api('launch_preset', {{preset_id: id}}).then(() => setTimeout(liveRefresh, 1000));
}}

function deletePreset(id) {{
  if (confirm('Delete this preset?'))
    api('delete_preset', {{preset_id: id}}).then(() => liveRefresh());
}}

function loadModel(gpuId) {{
  const sel = document.getElementById('ms-' + gpuId);
  if (!sel) return;
  api('load_model', {{gpu_id: gpuId, model: sel.value}}).then(() =>
    setTimeout(liveRefresh, 3000));
}}

function cancelJob(id) {{
  api('cancel_job', {{job_id: id}}).then(() => liveRefresh());
}}

function cancelSchedule(id) {{
  api('cancel_schedule', {{schedule_id: id}}).then(() => liveRefresh());
}}

// Schedule modal
let _schedPresetId = null;

function schedulePreset(id) {{
  _schedPresetId = id;
  document.getElementById('sched-modal').classList.add('active');
}}

function closeModal() {{
  document.getElementById('sched-modal').classList.remove('active');
}}

function submitSchedule(e) {{
  e.preventDefault();
  const fd = new FormData(e.target);
  const ttype = fd.get('trigger_type');
  const trigger = {{type: ttype}};
  if (ttype === 'time') {{
    if (fd.get('time_mode') === 'at') {{
      trigger.at = fd.get('time_at');
    }} else {{
      const hours = parseInt(fd.get('duration_hours'), 10) || 0;
      const mins = parseInt(fd.get('duration_mins'), 10) || 0;
      trigger.duration_seconds = hours * 3600 + mins * 60;
    }}
  }} else if (ttype === 'cost') {{
    const threshold = parseFloat(fd.get('cost_threshold'));
    // NaN would be serialised as null and stored as a useless trigger.
    if (Number.isNaN(threshold)) {{
      alert('Enter a numeric cost threshold.');
      return false;
    }}
    trigger.threshold_usd = threshold;
  }}
  api('create_schedule', {{preset_id: _schedPresetId, trigger}}).then(() => {{
    // liveRefresh() does not reload the page, so the dialog must be closed here.
    closeModal();
    liveRefresh();
  }});
  return false;
}}

// Init param fields for first pipeline
document.addEventListener('DOMContentLoaded', () => {{
  const sel = document.querySelector('[name=pipeline]');
  if (sel) updateParamFields(sel.value);
}});

// Live refresh — update dynamic sections without reloading the page
function liveRefresh() {{
  fetch('/api/fragments').then(r => r.json()).then(f => {{
    const ids = {{'gpu-grid':'gpus', 'train-section':'training', 'schedule-section':'schedule', 'jobs-section':'jobs', 'presets-list':'presets', 'refresh-time':'refresh_time'}};
    for (const [elId, key] of Object.entries(ids)) {{
      const el = document.getElementById(elId);
      if (el && f[key] != null) el.innerHTML = f[key];
    }}
  }}).catch(() => {{}});
}}
setInterval(liveRefresh, 10000);
"""
# ── HTTP Handler ───────────────────────────────────────────────────────────
class SchedulerHandler(BaseHTTPRequestHandler):
    """HTTP front end of the scheduler: dashboard page, JSON API and actions.

    GET routes:
        /, /dashboard    full HTML page with the schedule dialog appended
        /api/state       GPUs, training, presets, last 20 jobs and schedules
        /api/training    result of ``_fetch_training_status()``
        /api/presets     all presets
        /api/pipelines   label, description, params and defaults per pipeline
        /api/fragments   HTML fragments keyed for the page's ``liveRefresh()``

    POST routes:
        /api/action      JSON object with an ``"action"`` key, dispatched by
                         ``_handle_action``
    """

    # Markup of the "Schedule Trigger" dialog. It is spliced in just before
    # </body> of the rendered page; built once here instead of per request.
    SCHEDULE_MODAL_HTML = """<div class="modal-overlay" id="sched-modal">
<div class="modal"><h3>Schedule Trigger</h3>
<form onsubmit="return submitSchedule(event)">
<label>Trigger Type<select name="trigger_type" onchange="document.querySelectorAll('.trig-opts').forEach(e=>e.style.display='none');document.getElementById('trig-'+this.value).style.display='block'">
<option value="time">Time</option><option value="finish_training">After Training</option><option value="cost">Cost Threshold</option>
</select></label>
<div id="trig-time" class="trig-opts">
<label>Mode<select name="time_mode"><option value="at">At specific time</option><option value="duration">After duration</option></select></label>
<label>At (ISO)<input name="time_at" placeholder="2026-03-21T08:00:00"></label>
<label>Duration hours<input name="duration_hours" type="number" value="0"></label>
<label>Duration mins<input name="duration_mins" type="number" value="0"></label>
</div>
<div id="trig-finish_training" class="trig-opts" style="display:none">
<div style="font-size:0.8rem;color:#999;padding:0.3rem">Fires when current training run completes all steps.</div>
</div>
<div id="trig-cost" class="trig-opts" style="display:none">
<label>Threshold ($)<input name="cost_threshold" type="number" step="0.01" value="1.00"></label>
</div>
<div class="btn-row">
<button type="button" class="btn btn-sm" onclick="closeModal()">Cancel</button>
<button type="submit" class="btn btn-sm">Create Trigger</button>
</div>
</form></div></div>"""

    def log_message(self, format, *args):
        """Silence the base class's per-request logging."""
        pass

    def do_GET(self):
        """Serve the dashboard page and the read-only JSON endpoints."""
        path = urlparse(self.path).path
        if path in ("/", "/dashboard"):
            page = _render_page().replace("</body>", self.SCHEDULE_MODAL_HTML + "</body>")
            self._respond(200, page, "text/html")
        elif path == "/api/state":
            with _lock:
                data = {
                    "gpus": _state["gpus"],
                    "training": _state["training"],
                    "presets": _presets,
                    "jobs": _jobs[-20:],
                    "schedule": _schedule[-20:],
                }
            self._respond(200, json.dumps(data, default=str, indent=2), "application/json")
        elif path == "/api/training":
            self._respond(200, json.dumps(_fetch_training_status(), default=str), "application/json")
        elif path == "/api/presets":
            self._respond(200, json.dumps(_presets, indent=2), "application/json")
        elif path == "/api/pipelines":
            info = {
                key: {
                    "label": spec["label"],
                    "description": spec["description"],
                    "params": spec["params"],
                    "defaults": spec["defaults"],
                }
                for key, spec in PIPELINE_TYPES.items()
            }
            self._respond(200, json.dumps(info, indent=2), "application/json")
        elif path == "/api/fragments":
            self._respond(200, json.dumps(self._fragments()), "application/json")
        else:
            self._respond(404, "Not found", "text/plain")

    def _fragments(self):
        """Return the HTML fragments for live refresh, keyed as the page expects."""
        with _lock:
            state = dict(_state)  # shallow snapshot; nested dicts are shared
        gpu_cards = []
        for gpu in GPUS:
            # GPUs that have never reported are rendered as offline cards.
            fallback = {"online": False, "id": gpu["id"], "name": gpu["name"]}
            gpu_cards.append(_gpu_card_html(state["gpus"].get(gpu["id"], fallback)))
        online_count = sum(1 for g in state["gpus"].values() if g.get("online"))
        last_refresh = state.get("last_refresh", "never")
        return {
            "gpus": "".join(gpu_cards),
            "training": _training_card_html(state.get("training")),
            "schedule": _schedule_panel_html(),
            "jobs": _jobs_panel_html(),
            "presets": _presets_list_html(),
            "refresh_time": f"{online_count}/{len(GPUS)} GPUs online &mdash; refreshed {last_refresh}",
        }

    def do_POST(self):
        """Handle ``POST /api/action``.

        Responds 404 for any other path, 400 when the body is not a JSON
        object (or Content-Length is not a number), 500 with the exception
        text when the action itself raises, and 200 with the action's result
        otherwise.
        """
        path = urlparse(self.path).path
        if path != "/api/action":
            self._respond(404, "Not found", "text/plain")
            return
        try:
            length = int(self.headers.get("Content-Length", 0))
            # length <= 0 means "no body"; read(-1) would block until EOF.
            body = json.loads(self.rfile.read(length)) if length > 0 else {}
        except ValueError as e:
            # Covers a non-numeric Content-Length, json.JSONDecodeError and
            # UnicodeDecodeError (both are ValueError subclasses).
            self._respond(400, json.dumps({"ok": False, "error": f"invalid request body: {e}"}), "application/json")
            return
        if not isinstance(body, dict):
            self._respond(400, json.dumps({"ok": False, "error": "request body must be a JSON object"}), "application/json")
            return
        action = body.get("action", "")
        try:
            result = self._handle_action(action, body)
            self._respond(200, json.dumps(result, default=str), "application/json")
        except Exception as e:
            # Top-level boundary: report the failure to the client as JSON.
            self._respond(500, json.dumps({"ok": False, "error": str(e)}), "application/json")

    def _handle_action(self, action, body):
        """Execute one dashboard action and return its JSON-serialisable result.

        Supported actions: refresh, create_preset, delete_preset,
        launch_preset, create_schedule, cancel_schedule, cancel_job,
        load_model, stop_ollama, start_ollama.

        Args:
            action: action name taken from the request body.
            body: the decoded request body (a dict) carrying the arguments.

        Returns:
            dict: always contains ``"ok"``; ``{"ok": False, "error": ...}``
            for an unknown action or an unknown GPU.
        """
        if action == "refresh":
            # Run the scan in the background so the request returns at once.
            threading.Thread(target=refresh_state, daemon=True).start()
            return {"ok": True}

        if action == "create_preset":
            pid = str(uuid.uuid4())[:8]
            _presets[pid] = {
                "id": pid,
                "name": str(body.get("name") or "unnamed"),
                "pipeline": body.get("pipeline", "self_play"),
                "gpus": body.get("gpus") or [],
                "params": body.get("params") or {},
                # The trailing "Z" denotes UTC, so format gmtime() rather
                # than strftime's default of local time.
                "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            }
            _save_presets()
            return {"ok": True, "preset_id": pid}

        if action == "delete_preset":
            pid = body.get("preset_id")
            if pid in _presets:
                del _presets[pid]
                _save_presets()
            return {"ok": True}

        if action == "launch_preset":
            job = _launch_preset(body.get("preset_id"))
            return {"ok": True, "job": job}

        if action == "create_schedule":
            sid = str(uuid.uuid4())[:8]
            _schedule.append({
                "id": sid,
                "preset_id": body.get("preset_id"),
                "trigger": body.get("trigger", {}),
                "status": "pending",
                "created_at": datetime.now().isoformat(),
            })
            _save_schedule()
            return {"ok": True, "schedule_id": sid}

        if action == "cancel_schedule":
            sid = body.get("schedule_id")
            for entry in _schedule:
                # Only pending entries can be cancelled; this keeps the
                # status of already-fired triggers intact (same rule as
                # cancel_job, which only touches running jobs).
                if entry.get("id") == sid and entry.get("status") == "pending":
                    entry["status"] = "cancelled"
            _save_schedule()
            return {"ok": True}

        if action == "cancel_job":
            jid = body.get("job_id")
            for job in _jobs:
                # NOTE(review): this only flips the stored status; nothing in
                # this handler stops a process -- confirm the job runner
                # reacts to the status change.
                if job.get("id") == jid and job.get("status") == "running":
                    job["status"] = "cancelled"
            _save_jobs()
            return {"ok": True}

        if action == "load_model":
            gpu = GPU_MAP.get(body.get("gpu_id"))
            if not gpu:
                return {"ok": False, "error": "unknown GPU"}
            # A one-token generate request; presumably what makes Ollama load
            # the model -- confirm against the Ollama API.
            result = _ollama_api(gpu, "/api/generate", method="POST", data={
                "model": body.get("model"), "prompt": "test", "stream": False,
                "options": {"num_predict": 1},
            })
            return {"ok": True, "result": result}

        if action in ("stop_ollama", "start_ollama"):
            gpu = GPU_MAP.get(body.get("gpu_id", "3090ti"))
            if not gpu:
                return {"ok": False, "error": "unknown GPU"}
            verb = "stop" if action == "stop_ollama" else "start"
            # The service name comes from the GPU config, not from the request.
            svc = gpu.get("ollama_service", "ollama.service")
            _ssh_cmd(gpu, f"sudo systemctl {verb} {svc} 2>&1", timeout=10)
            return {"ok": True}

        return {"ok": False, "error": f"unknown action: {action}"}

    def _respond(self, code, body, content_type):
        """Send a complete response.

        Args:
            code: HTTP status code.
            body: ``str`` (encoded as UTF-8) or ``bytes`` payload.
            content_type: value for the Content-Type header.
        """
        payload = body.encode() if isinstance(body, str) else body
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(payload)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(payload)
# ── Main ───────────────────────────────────────────────────────────────────
def main():
    """Command-line entry point: restore state, start refreshing, serve HTTP.

    Options:
        --port              TCP port to listen on (default: ``PORT``).
        --refresh-interval  value passed to ``_bg_refresh_loop`` (default: 10).

    Blocks in ``serve_forever()`` until interrupted with Ctrl-C; the
    listening socket is closed on the way out.
    """
    parser = argparse.ArgumentParser(description="Mortdecai GPU Scheduler")
    parser.add_argument("--port", type=int, default=PORT)
    parser.add_argument("--refresh-interval", type=int, default=10)
    args = parser.parse_args()

    _load_persisted()
    print(f"Loaded {len(_presets)} presets, {len(_jobs)} jobs, {len(_schedule)} schedules")

    # Daemon thread, so it never keeps the process alive after shutdown.
    refresher = threading.Thread(target=_bg_refresh_loop, args=(args.refresh_interval,), daemon=True)
    refresher.start()

    print("Initial GPU scan...")
    refresh_state()

    # The context manager calls server_close() on exit, releasing the socket
    # even when serve_forever() is interrupted.
    with HTTPServer(("0.0.0.0", args.port), SchedulerHandler) as server:
        print(f"GPU Scheduler on http://0.0.0.0:{args.port}")
        print(f" {len(GPUS)} GPUs, refresh {args.refresh_interval}s")
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            print("\nShutting down.")


if __name__ == "__main__":
    main()