Mortdecai/web/gpu_scheduler.py

#!/usr/bin/env python3
"""
Mortdecai GPU Scheduler — preset-based job scheduler with live GPU monitoring.

Features:
  - GPU dashboard with live stats across the homelab
  - Configuration presets (GPU assignments, model selection, pipeline type)
  - Job scheduler with 3 trigger types: time, finish_training, cost
  - Model management: load/unload Ollama models per GPU
  - Training progress monitor with loss curves

Usage:
    python3 gpu_scheduler.py --port 8098

Serve behind Caddy as gpu.sethpc.xyz with google_auth.
"""

import argparse
import json
import os
import re
import shlex
import subprocess
import threading
import time
import uuid
from datetime import datetime, timedelta
from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path
from urllib.parse import parse_qs, urlparse

# Default HTTP port; matches the --port value shown in the module docstring's usage line.
PORT = 8098
# Directory holding the persisted presets/jobs/schedule JSON files:
# <parent of this file's directory>/data/scheduler.
DATA_DIR = Path(__file__).resolve().parent.parent / "data" / "scheduler"

# ── GPU Inventory ──────────────────────────────────────────────────────────

# Static inventory of the GPUs this scheduler knows about. Fields read by the code below:
#   host          - SSH target used by _ssh_cmd
#   gpu_index     - value passed to `nvidia-smi --id=`
#   ollama_port   - port curl'ed on the remote's localhost by _ollama_api
#   pct_id        - when present, _ssh_cmd wraps commands in `pct exec <pct_id> -- bash -c ...`
#   ssh_extra     - extra ssh command-line options (whitespace-split)
#   ssh_pass      - when present, _ssh_cmd authenticates through sshpass
#   capabilities  - tags shown on the GPU card
#                   (presumably matched against PIPELINE_TYPES[...]["gpu_req"]; verify against caller)
# NOTE(review): ssh_pass is stored in plain text in this source file; consider key-based
# auth or loading it from the environment.
GPUS = [
    {
        "id": "3090ti", "name": "RTX 3090 Ti", "vram_gb": 24, "vram_mb": 24564,
        "host": "seth@192.168.0.141", "gpu_index": 1,
        "ollama_port": 11434, "ollama_service": "ollama.service",
        "capabilities": ["training", "inference", "self-play", "pipeline"],
        "location": "steel141",
    },
    {
        "id": "2080ti", "name": "RTX 2080 Ti", "vram_gb": 11, "vram_mb": 11264,
        "host": "seth@192.168.0.141", "gpu_index": 0,
        "ollama_port": 11435, "ollama_service": "ollama-gpu0.service",
        "capabilities": ["inference", "self-play", "pipeline", "generator"],
        "location": "steel141",
    },
    {
        "id": "rtx4000", "name": "Quadro RTX 4000", "vram_gb": 8, "vram_mb": 8192,
        "host": "pve197", "gpu_index": 0, "pct_id": 105,
        "ollama_port": 11434, "ollama_service": "ollama.service",
        "capabilities": ["inference", "self-play", "pipeline", "prod"],
        "location": "pve197 → CT 105",
    },
    {
        "id": "1660s", "name": "GTX 1660 Super", "vram_gb": 6, "vram_mb": 6144,
        "host": "root@192.168.0.235", "gpu_index": 0,
        "ollama_port": 11434, "ollama_service": "ollama.service",
        "capabilities": ["generator", "inference-small"],
        "location": "bedroom",
        "ssh_extra": "-o StrictHostKeyChecking=no", "ssh_pass": "REDACTED_PASSWORD",
    },
]
# Lookup table: GPU id -> inventory entry.
GPU_MAP = {g["id"]: g for g in GPUS}

# ── Pipeline Definitions ──────────────────────────────────────────────────

# Catalogue of pipeline kinds. The keys are the `pipeline` names that _run_job_async
# dispatches on. Each entry carries:
#   label / description - human-readable text
#   gpu_req             - capability tags associated with the pipeline
#   params              - names of the parameters the pipeline accepts
#   defaults            - default value for each parameter
# NOTE(review): the rcon_pass defaults are credentials kept in source.
PIPELINE_TYPES = {
    "training": {
        "label": "Training (QLoRA)",
        "description": "Fine-tune model via Unsloth QLoRA",
        "gpu_req": ["training"],
        "params": ["base_model", "dataset", "output_name", "epochs", "lr", "batch_size", "grad_accum", "max_seq_len", "save_steps"],
        "defaults": {
            "base_model": "Qwen/Qwen3.5-9B", "dataset": "auto", "output_name": "mortdecai-0.5.0",
            "epochs": 1, "lr": 1e-4, "batch_size": 2, "grad_accum": 4, "max_seq_len": 2048, "save_steps": 50,
        },
    },
    "self_play": {
        "label": "Self-Play",
        "description": "Model generates edge cases and learns from failures",
        "gpu_req": ["inference"],
        "params": ["model", "tiers", "rounds_per_tier", "rcon_host", "rcon_port", "rcon_pass"],
        "defaults": {
            "model": "mortdecai:0.4.0", "tiers": "1,2,3",
            "rounds_per_tier": 50, "rcon_host": "192.168.0.244", "rcon_port": 25578,
            "rcon_pass": "REDACTED_RCON",
        },
    },
    "prompt_pipeline": {
        "label": "Prompt Pipeline",
        "description": "Small model generates prompts, big models process + RCON validate",
        "gpu_req": ["generator", "inference"],
        "params": ["gen_model", "proc_model", "batch_size", "interval"],
        "defaults": {
            "gen_model": "qwen3.5:0.8b", "proc_model": "mortdecai:0.4.0",
            "batch_size": 30, "interval": 120,
        },
    },
    "bakeoff": {
        "label": "Bake-off",
        "description": "Compare model versions on standard test prompts",
        "gpu_req": ["inference"],
        "params": ["models", "test_set", "rcon_host"],
        "defaults": {
            "models": "mortdecai:0.4.0,mortdecai:0.5.0", "test_set": "standard",
            "rcon_host": "192.168.0.244",
        },
    },
    "export_gguf": {
        "label": "Export GGUF",
        "description": "Convert LoRA adapter to GGUF for Ollama",
        "gpu_req": ["training"],
        "params": ["adapter_path", "output_name", "quant"],
        "defaults": {
            "adapter_path": "training/checkpoints/mortdecai-0.5.0",
            "output_name": "mortdecai:0.5.0", "quant": "q4_k_m",
        },
    },
    "tool_self_play": {
        "label": "Tool Self-Play",
        "description": "Exercise all 14 tools on the dev server — scripts, memory, entities, wiki",
        "gpu_req": ["inference"],
        "params": ["model", "rounds", "categories", "rcon_host", "rcon_port", "rcon_pass"],
        "defaults": {
            "model": "mortdecai:0.4.0", "rounds": 10,
            "categories": "all",
            "rcon_host": "192.168.0.112", "rcon_port": 25578,
            "rcon_pass": "REDACTED_RCON",
        },
    },
    "load_model": {
        "label": "Load Model",
        "description": "Load/switch Ollama model on a GPU",
        "gpu_req": ["inference"],
        "params": ["model"],
        "defaults": {"model": "mortdecai:0.4.0"},
    },
}

# ── State ──────────────────────────────────────────────────────────────────

# Guards reads/writes of _state (taken in refresh_state and _render_page).
_lock = threading.Lock()
# Latest monitoring snapshot, replaced by refresh_state():
#   gpus         - GPU id -> stats dict
#   training     - result of _fetch_training_status(), or None
#   last_refresh - "HH:MM:SS" of the last refresh, None until the first one
_state = {
    "gpus": {},
    "training": None,
    "last_refresh": None,
}
_presets = {}   # id -> preset dict
_jobs = []      # list of job dicts
_schedule = []  # list of scheduled trigger dicts
# total_cost is compared against a trigger's threshold_usd by the "cost" trigger.
# NOTE(review): electricity_rate is presumably USD per kWh — confirm.
_cost_tracker = {"total_kwh": 0.0, "total_cost": 0.0, "electricity_rate": 0.12}

# Shell glob (expanded remotely by `ls -t`) matching training logs on TRAINING_HOST.
TRAINING_LOG_PATTERN = "/home/seth/mc-ai-training/Minecraft-AI-model/training/train_run_*.log"
# SSH target for training, log inspection and most pipeline launches.
TRAINING_HOST = "seth@192.168.0.141"


# ── Persistence ────────────────────────────────────────────────────────────

def _ensure_data_dir():
    """Create DATA_DIR and any missing parents; do nothing if it already exists."""
    os.makedirs(DATA_DIR, exist_ok=True)

def _save_presets():
    """Serialize the in-memory presets mapping to presets.json inside DATA_DIR."""
    _ensure_data_dir()
    destination = DATA_DIR / "presets.json"
    with destination.open("w") as handle:
        json.dump(_presets, handle, indent=2)

def _save_jobs():
    """Serialize the job list to jobs.json inside DATA_DIR.

    Values json cannot encode natively are written via ``str``.
    """
    _ensure_data_dir()
    destination = DATA_DIR / "jobs.json"
    with destination.open("w") as handle:
        json.dump(_jobs, handle, indent=2, default=str)

def _save_schedule():
    """Serialize the scheduled-trigger list to schedule.json inside DATA_DIR.

    Values json cannot encode natively are written via ``str``.
    """
    _ensure_data_dir()
    destination = DATA_DIR / "schedule.json"
    with destination.open("w") as handle:
        json.dump(_schedule, handle, indent=2, default=str)

def _load_persisted():
    """Load presets, jobs and the schedule from DATA_DIR into the module globals.

    A missing file leaves the corresponding global untouched. A file that cannot
    be read or parsed, or whose top-level JSON value has the wrong type, is
    reported on stdout and skipped so that one damaged file does not prevent
    the scheduler from starting.
    """
    global _presets, _jobs, _schedule
    _ensure_data_dir()

    def _read(name, expected_type):
        """Return the parsed content of DATA_DIR/name, or None if unusable."""
        path = DATA_DIR / name
        if not path.exists():
            return None
        try:
            with open(path) as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            print(f"[scheduler] ignoring unreadable {name}: {e}")
            return None
        if not isinstance(data, expected_type):
            print(f"[scheduler] ignoring {name}: expected {expected_type.__name__}, "
                  f"got {type(data).__name__}")
            return None
        return data

    presets = _read("presets.json", dict)
    if presets is not None:
        _presets = presets

    jobs = _read("jobs.json", list)
    if jobs is not None:
        _jobs = jobs

    schedule = _read("schedule.json", list)
    if schedule is not None:
        _schedule = schedule


# ── SSH Helpers ────────────────────────────────────────────────────────────

def _ssh_cmd(gpu_or_host, cmd, timeout=8):
    """Run a shell command on a remote machine over SSH.

    Args:
        gpu_or_host: Either a GPU inventory dict (its ``host`` is the SSH target;
            optional ``ssh_extra``, ``ssh_pass`` and ``pct_id`` are honoured) or a
            plain SSH host string.
        cmd: Shell command line to execute remotely.
        timeout: Seconds to wait for the local ssh process before giving up.

    Returns:
        The command's stripped stdout when it exits with status 0, otherwise
        None (non-zero exit, timeout, or ssh/sshpass could not be started).
    """
    if isinstance(gpu_or_host, dict):
        gpu = gpu_or_host
        host = gpu["host"]
        extra = gpu.get("ssh_extra", "").split() if gpu.get("ssh_extra") else []
        ssh_pass = gpu.get("ssh_pass")
        # If pct_id is set, run the command inside that Proxmox container. The command
        # is shell-quoted as a single argument so quotes inside it survive the wrapper.
        if "pct_id" in gpu:
            cmd = f"pct exec {gpu['pct_id']} -- bash -c {shlex.quote(cmd)}"
    else:
        host = gpu_or_host
        extra = []
        ssh_pass = None

    env = None
    if ssh_pass:
        # `sshpass -e` reads the password from $SSHPASS, keeping it out of the
        # process list (which `-p <password>` would not).
        full_cmd = ["sshpass", "-e", "ssh", "-o", "ConnectTimeout=4"] + extra + [host, cmd]
        env = {**os.environ, "SSHPASS": ssh_pass}
    else:
        # BatchMode makes ssh fail instead of prompting when key auth is unavailable.
        full_cmd = ["ssh", "-o", "ConnectTimeout=4", "-o", "BatchMode=yes"] + extra + [host, cmd]

    try:
        r = subprocess.run(full_cmd, capture_output=True, text=True, timeout=timeout, env=env)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Timeout, missing ssh/sshpass binary, or undecodable output: best-effort, report nothing.
        return None
    return r.stdout.strip() if r.returncode == 0 else None


def _ollama_api(gpu, endpoint, method="GET", data=None, timeout=8):
    """Call the Ollama HTTP API of a GPU by running curl on its host over SSH.

    Args:
        gpu: GPU inventory dict; ``ollama_port`` selects the port on the remote's localhost.
        endpoint: API path beginning with "/", e.g. "/api/ps".
        method: "GET" issues a plain GET; any other value issues a POST.
        data: JSON-serializable request body for POST; an empty object is sent when falsy.
        timeout: Seconds allowed for the whole SSH round trip (passed to ``_ssh_cmd``).

    Returns:
        The decoded JSON response, or None if the call failed or the reply was not JSON.
    """
    port = gpu["ollama_port"]
    url = f"http://localhost:{port}{endpoint}"
    if method == "GET":
        cmd = f"curl -s --connect-timeout 3 {url}"
    else:
        payload = json.dumps(data) if data else "{}"
        # shlex.quote keeps the JSON body a single shell argument whatever it contains.
        cmd = f"curl -s --connect-timeout 3 -X POST {url} -d {shlex.quote(payload)}"
    raw = _ssh_cmd(gpu, cmd, timeout=timeout)
    if not raw:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None


# ── GPU Monitoring ─────────────────────────────────────────────────────────

def _fetch_gpu_stats(gpu):
    """Query one GPU's live metrics with nvidia-smi over SSH.

    Args:
        gpu: GPU inventory dict.

    Returns:
        ``{"online": False, id, name}`` when the host did not answer; a full stats
        dict when the CSV reply parsed; or ``{"online": True, id, name, "error": raw}``
        when the reply could not be parsed (including a reported total memory of 0).
    """
    query = (
        f"nvidia-smi --id={gpu['gpu_index']} "
        "--query-gpu=utilization.gpu,temperature.gpu,power.draw,memory.used,memory.total,fan.speed "
        "--format=csv,noheader,nounits"
    )
    raw = _ssh_cmd(gpu, query)
    if not raw:
        return {"online": False, "id": gpu["id"], "name": gpu["name"]}

    parts = [p.strip() for p in raw.split(",")]
    try:
        vram_used = int(parts[3])
        vram_total = int(parts[4])
        return {
            "online": True, "id": gpu["id"], "name": gpu["name"],
            "vram_gb": gpu["vram_gb"], "location": gpu["location"],
            "capabilities": gpu["capabilities"],
            "utilization": int(parts[0]), "temperature": int(parts[1]),
            "power_watts": float(parts[2]),
            "vram_used_mb": vram_used, "vram_total_mb": vram_total,
            # nvidia-smi prints a bracketed placeholder when the fan speed is unavailable.
            "fan_speed": int(parts[5]) if parts[5] not in ("[N/A]", "[Not Supported]") else None,
            "vram_pct": round(vram_used / vram_total * 100, 1),
        }
    except (ValueError, IndexError, ZeroDivisionError):
        # Unexpected output shape or value: surface the raw text instead of crashing the refresh.
        return {"online": True, "id": gpu["id"], "name": gpu["name"], "error": raw}


def _fetch_ollama_info(gpu):
    """Return the models Ollama currently runs and the models it has installed.

    Queries ``/api/ps`` and ``/api/tags``; a failed query counts as an empty reply.

    Returns:
        ``{"running": [{"name", "size_gb", "vram_gb"}, ...], "available": [name, ...]}``
        with sizes converted from bytes to GB (divided by 1e9, one decimal).
    """
    ps_reply = _ollama_api(gpu, "/api/ps") or {}
    tags_reply = _ollama_api(gpu, "/api/tags") or {}

    running = [
        {
            "name": entry.get("name", "?"),
            "size_gb": round(entry.get("size", 0) / 1e9, 1),
            "vram_gb": round(entry.get("size_vram", 0) / 1e9, 1),
        }
        for entry in ps_reply.get("models", [])
    ]
    available = []
    for entry in tags_reply.get("models", []):
        available.append(entry.get("name", "?"))

    return {"running": running, "available": available}


def _fetch_training_status():
    """Summarize the most recently modified training log on TRAINING_HOST.

    Returns:
        None when no log is found or it cannot be read. Otherwise a dict with:
          active        - True while the last progress bar is below its total and
                          no error was detected
          loss_history  - every ``'loss'`` value found in the log tail, as floats
          pct, current_step, total_steps, eta, elapsed - from the last progress bar
                          (present only when a bar was found)
          error         - "OOM" or "crashed" (present only on failure)
          latest_loss, learning_rate - last reported values, when present
    """
    # Find the most recently modified training log
    log_path = _ssh_cmd(TRAINING_HOST, f"ls -t {TRAINING_LOG_PATTERN} 2>/dev/null | head -1", timeout=5)
    if not log_path:
        return None
    raw = _ssh_cmd(TRAINING_HOST, f"tail -200 {log_path} 2>/dev/null", timeout=8)
    if not raw:
        return None

    status = {"active": False, "loss_history": []}

    # Progress-bar lines look like " 42%|####    | 21/50 [01:10<01:37, ...]".
    progress_matches = re.findall(r'(\d+)%\|[^|]*\|\s*(\d+)/(\d+)\s*\[([^\]]+)\]', raw)
    if progress_matches:
        pct, current, total, timing = progress_matches[-1]
        status["pct"] = int(pct)
        status["current_step"] = int(current)
        status["total_steps"] = int(total)
        eta_match = re.search(r'<([^,]+)', timing)
        elapsed_match = re.match(r'([^<]+)', timing)
        if eta_match:
            status["eta"] = eta_match.group(1).strip()
        if elapsed_match:
            status["elapsed"] = elapsed_match.group(1).strip()
        # A bar that has reached its total means the run is over. Reporting it as
        # still active would make the "finished" checks (not active AND
        # current_step == total_steps) impossible to satisfy.
        status["active"] = status["current_step"] < status["total_steps"]

    if "OutOfMemoryError" in raw:
        status["active"] = False
        status["error"] = "OOM"
    elif "Error" in raw.split("\n")[-1]:
        # Any other error on the very last log line is treated as a crash.
        status["active"] = False
        status["error"] = "crashed"

    for lm in re.findall(r"'loss':\s*'([^']+)'", raw):
        try:
            status["loss_history"].append(float(lm))
        except ValueError:
            # Skip values that are not numeric.
            pass
    if status["loss_history"]:
        status["latest_loss"] = status["loss_history"][-1]

    lr_matches = re.findall(r"'learning_rate':\s*'([^']+)'", raw)
    if lr_matches:
        status["learning_rate"] = lr_matches[-1]

    return status


def _fetch_processes(gpu):
    """List the compute processes nvidia-smi reports for one GPU.

    Returns:
        A list of ``{"pid", "name", "vram_mb"}`` dicts (all values are strings as
        printed by nvidia-smi; ``name`` is reduced to the part after the last "/").
        Empty when the query fails or returns nothing.
    """
    raw = _ssh_cmd(
        gpu,
        f"nvidia-smi --id={gpu['gpu_index']} --query-compute-apps=pid,name,used_memory --format=csv,noheader,nounits 2>/dev/null",
    )
    if not raw:
        return []

    rows = (
        [field.strip() for field in line.split(",")]
        for line in raw.strip().split("\n")
        if line.strip()
    )
    # Rows with fewer than three columns are ignored.
    return [
        {"pid": row[0], "name": row[1].split("/")[-1], "vram_mb": row[2]}
        for row in rows
        if len(row) >= 3
    ]


def refresh_state():
    """Poll every GPU in parallel and publish a fresh snapshot into ``_state``.

    One thread per GPU collects stats, Ollama info, compute processes and the id
    of a running job assigned to that GPU. Threads are given 12 s each to finish;
    GPUs whose thread has not reported by then are absent from the snapshot.
    The training status is fetched before ``_lock`` is taken so that readers of
    ``_state`` are never blocked behind an SSH round trip.
    """
    new_gpus = {}

    def fetch_one(gpu):
        stats = _fetch_gpu_stats(gpu)
        stats["ollama"] = _fetch_ollama_info(gpu)
        stats["processes"] = _fetch_processes(gpu)
        # Check if any job is running on this GPU
        active_jobs = [j for j in _jobs if j.get("status") == "running" and gpu["id"] in j.get("gpus", [])]
        stats["active_job"] = active_jobs[0]["id"] if active_jobs else None
        new_gpus[gpu["id"]] = stats

    threads = [threading.Thread(target=fetch_one, args=(gpu,)) for gpu in GPUS]
    for t in threads:
        t.start()
    for t in threads:
        t.join(timeout=12)

    # Slow network work happens outside the lock.
    training = _fetch_training_status()
    # Publish a copy: a thread that outlived its join timeout may still write to
    # new_gpus, and that must not mutate the dict readers are looking at.
    snapshot = dict(new_gpus)
    stamp = time.strftime("%H:%M:%S")

    with _lock:
        _state["gpus"] = snapshot
        _state["training"] = training
        _state["last_refresh"] = stamp


def _bg_refresh_loop(interval=10):
    """Refresh GPU state and evaluate triggers forever.

    Args:
        interval: Seconds to sleep between passes. Never returns.
    """
    def _one_pass():
        # Any failure is logged and swallowed so the loop keeps running.
        try:
            refresh_state()
            _check_triggers()
        except Exception as exc:
            print(f"[scheduler] refresh error: {exc}")

    while True:
        _one_pass()
        time.sleep(interval)


# ── Job Execution ──────────────────────────────────────────────────────────

def _run_job_async(job):
    """Start a daemon thread that executes ``job`` and return the job dict.

    The worker marks the job "running", calls the executor matching
    ``job["pipeline"]``, records any exception as a failure, and marks the job
    "completed" if the executor left it "running". The job list is persisted at
    start and at finish.
    """
    def _worker():
        job["status"] = "running"
        job["started_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        _save_jobs()
        print(f"[scheduler] starting job {job['id']}: {job['pipeline']}")

        try:
            pipeline = job["pipeline"]
            params = job["params"]
            gpus = job["gpus"]

            # (pipeline name, zero-argument launcher) pairs, checked in order.
            launchers = (
                ("training", lambda: _exec_training(job, params)),
                ("self_play", lambda: _exec_self_play(job, params, gpus)),
                ("prompt_pipeline", lambda: _exec_prompt_pipeline(job, params, gpus)),
                ("load_model", lambda: _exec_load_model(job, params, gpus)),
                ("export_gguf", lambda: _exec_export_gguf(job, params)),
                ("bakeoff", lambda: _exec_bakeoff(job, params, gpus)),
                ("tool_self_play", lambda: _exec_tool_self_play(job, params, gpus)),
            )
            for known_name, launch in launchers:
                if pipeline == known_name:
                    launch()
                    break
            else:
                job["error"] = f"unknown pipeline: {pipeline}"
                job["status"] = "failed"
        except Exception as exc:
            job["error"] = str(exc)
            job["status"] = "failed"
            print(f"[scheduler] job {job['id']} failed: {exc}")

        # An executor that returned without changing the status finished normally.
        if job["status"] == "running":
            job["status"] = "completed"
        job["finished_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        _save_jobs()
        print(f"[scheduler] job {job['id']} → {job['status']}")

    worker = threading.Thread(target=_worker, daemon=True)
    worker.start()
    return job


def _exec_training(job, params):
    """Launch training on the 3090 Ti via SSH and monitor it until it stops.

    Steps: mark other running jobs on the 3090 Ti as cancelled, stop both Ollama
    services on TRAINING_HOST, kill remaining compute processes on GPU 1, require
    18000 MB of free VRAM, start ``train_lora.py`` under nohup, then poll
    ``_fetch_training_status`` every 30 s. The Ollama services are started again
    on every exit path, including an unexpected exception.

    Args:
        job: Job dict; ``status``, ``error``, ``log_path`` and ``progress`` are
            updated in place.
        params: Training parameters (base_model, output_name, lr, epochs,
            batch_size, grad_accum, max_seq_len, save_steps, optional resume).

    Raises:
        ValueError / TypeError: if a numeric parameter cannot be converted.
    """
    output_name = str(params.get('output_name', 'mortdecai-0.5.0'))
    log_name = f"train_run_{output_name}.log"
    base_model = str(params.get('base_model', 'Qwen/Qwen3.5-9B'))
    checkpoint_dir = f"training/checkpoints/{output_name}"

    # Every user-supplied value is either shell-quoted or forced through int()/float()
    # so that preset parameters cannot inject shell syntax into the remote command.
    train_cmd = (
        "source /home/seth/miniconda3/etc/profile.d/conda.sh && "
        "conda activate mc-train && "
        "cd /home/seth/mc-ai-training/Minecraft-AI-model && "
        "TORCH_COMPILE_DISABLE=1 TORCHDYNAMO_DISABLE=1 CUDA_VISIBLE_DEVICES=0 "
        "python3 training/scripts/train_lora.py "
        f"--model {shlex.quote(base_model)} "
        f"--output {shlex.quote(checkpoint_dir)} "
        f"--lr {float(params.get('lr', 1e-4))} "
        f"--epochs {int(params.get('epochs', 1))} "
        f"--batch-size {int(params.get('batch_size', 2))} "
        f"--grad-accum {int(params.get('grad_accum', 4))} "
        f"--max-seq-len {int(params.get('max_seq_len', 2048))} "
        f"--save-steps {int(params.get('save_steps', 50))}"
    )
    if params.get("resume"):
        train_cmd += " --resume"
    train_cmd += f" 2>&1 | tee {shlex.quote(f'training/{log_name}')}"

    def _free_vram_mb():
        """Free VRAM on GPU 1 in MB; None if the query failed, 0 if unparsable."""
        out = _ssh_cmd(TRAINING_HOST, "nvidia-smi --id=1 --query-gpu=memory.free --format=csv,noheader,nounits")
        if not out:
            return None
        try:
            return int(out.strip())
        except ValueError:
            return 0

    def _start_ollama():
        _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama.service 2>/dev/null", timeout=10)
        _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama-gpu0.service 2>/dev/null", timeout=10)

    # Cancel any running jobs on the 3090 Ti to free VRAM
    for j in _jobs:
        if j.get("status") == "running" and "3090ti" in j.get("gpus", []) and j["id"] != job["id"]:
            j["status"] = "cancelled"
            print(f"[training] cancelled conflicting job {j['id']} on 3090ti")
    _save_jobs()

    # Stop both Ollama services
    _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama.service 2>/dev/null", timeout=10)
    _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama-gpu0.service 2>/dev/null", timeout=10)

    try:
        time.sleep(2)
        # Kill any lingering processes holding GPU 1 VRAM
        _ssh_cmd(TRAINING_HOST,
            "for pid in $(nvidia-smi --id=1 --query-compute-apps=pid --format=csv,noheader,nounits 2>/dev/null); do kill $pid 2>/dev/null; done",
            timeout=5)
        time.sleep(3)

        # Verify VRAM is free enough (need ~18GB free on 24GB card).
        # If the query itself fails the check is skipped and training is attempted anyway.
        free_mb = _free_vram_mb()
        if free_mb is not None:
            print(f"[training] 3090 Ti free VRAM: {free_mb}MB")
            if free_mb < 18000:
                # Last resort: stop the services once more and re-measure
                _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama.service; sudo systemctl stop ollama-gpu0.service", timeout=10)
                time.sleep(5)
                free_mb = _free_vram_mb() or 0
                if free_mb < 18000:
                    job["status"] = "failed"
                    job["error"] = f"Not enough VRAM: {free_mb}MB free, need 18000MB"
                    return  # the finally clause restarts Ollama

        # Launch detached; the pipeline itself tees output into the training log.
        nohup_cmd = f"nohup bash -c {shlex.quote(train_cmd)} > /dev/null 2>&1 &"
        _ssh_cmd(TRAINING_HOST, nohup_cmd, timeout=10)
        job["log_path"] = f"training/{log_name}"
        print(f"[training] launched, logging to {log_name}")

        # Monitor until done (or until something else changes the job's status)
        while job["status"] == "running":
            time.sleep(30)
            status = _fetch_training_status()
            if status:
                job["progress"] = status
                if status.get("error"):
                    job["status"] = "failed"
                    job["error"] = status["error"]
                    break
                if not status.get("active") and status.get("current_step", 0) == status.get("total_steps", 0) and status.get("total_steps", 0) > 0:
                    job["status"] = "completed"
                    break
    finally:
        # Restart Ollama services whether training ran, failed, or never started
        _start_ollama()


def _exec_self_play(job, params, gpus):
    """Run self-play tiers across the assigned GPUs and wait for them to finish.

    Tiers are dealt round-robin to the GPUs. Each GPU gets one thread that runs
    its tiers one after another: launch ``self_play.py`` remotely under nohup,
    then poll the tail of its log every 10 s (at most ``rounds * 3`` polls) for
    "Complete", "Error" or "Traceback".

    Args:
        job: Job dict; ``gpu_assignments`` is recorded, ``error``/``status`` are
            set on failure. Workers stop early once ``status`` is no longer "running".
        params: model, tiers (comma-separated string), rounds_per_tier, rcon_host,
            rcon_port, rcon_pass.
        gpus: GPU ids; unknown ids are ignored.
    """
    resolved_gpus = [GPU_MAP[gid] for gid in gpus if gid in GPU_MAP]
    if not resolved_gpus:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return

    model = str(params.get("model", "mortdecai:0.4.0"))
    tiers = [t.strip() for t in params.get("tiers", "1,2,3").split(",")]
    rounds = int(params.get("rounds_per_tier", 50))
    rcon_host = str(params.get("rcon_host", "192.168.0.244"))
    rcon_port = int(params.get("rcon_port", 25578))
    rcon_pass = str(params.get("rcon_pass", "REDACTED_RCON"))
    script_path = "/home/seth/mc-ai-training/Minecraft-AI-model/training/scripts/self_play.py"

    # Distribute tiers round-robin across GPUs, launch all in parallel
    gpu_assignments = {}  # gpu_id -> list of tiers
    for i, tier in enumerate(tiers):
        gpu = resolved_gpus[i % len(resolved_gpus)]
        gpu_assignments.setdefault(gpu["id"], []).append(tier)

    job["gpu_assignments"] = dict(gpu_assignments)

    errors = []  # appended to from worker threads

    def run_on_gpu(gpu, assigned_tiers):
        port = gpu["ollama_port"]
        for tier in assigned_tiers:
            if job["status"] != "running":
                break
            log_file = f"/tmp/selfplay_{gpu['id']}_{tier}.log"
            quoted_log = shlex.quote(log_file)
            # User-supplied strings are shell-quoted; numeric ones were forced through int().
            cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && "
                   f"python3 {script_path} --tier {shlex.quote(tier)} --rounds {rounds} "
                   f"--ollama-url http://localhost:{port} --model {shlex.quote(model)} "
                   f"--rcon-host {shlex.quote(rcon_host)} --rcon-port {rcon_port} "
                   f"--rcon-pass {shlex.quote(rcon_pass)}")
            _ssh_cmd(gpu, f"nohup bash -c {shlex.quote(cmd)} > {quoted_log} 2>&1 &", timeout=10)
            print(f"[self-play] {gpu['name']}: {tier} x{rounds} started")
            # Wait for this tier to finish
            for _ in range(rounds * 3):
                time.sleep(10)
                log = _ssh_cmd(gpu, f"tail -5 {quoted_log} 2>/dev/null")
                if log and ("Complete" in log or "Error" in log or "Traceback" in log):
                    if "Error" in log or "Traceback" in log:
                        errors.append(f"{gpu['name']}/{tier}: {log[-200:]}")
                    break
                if job["status"] != "running":
                    break

    threads = []
    for gid, assigned_tiers in gpu_assignments.items():
        t = threading.Thread(target=run_on_gpu, args=(GPU_MAP[gid], assigned_tiers), daemon=True)
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    # Only the first three failures are reported, to keep the message short.
    if errors:
        job["error"] = "; ".join(errors[:3])


def _exec_prompt_pipeline(job, params, gpus):
    """Start the prompt pipeline script on TRAINING_HOST (fire and forget).

    ``gpus[0]`` serves the generator model and ``gpus[1]`` the processing model;
    when the second GPU is missing or unknown, the generator GPU is used for both.
    Output goes to /tmp/pipeline.log on TRAINING_HOST; the function does not wait
    for the script to finish.

    Args:
        job: Job dict; ``error``/``status`` are set when no usable GPU is given.
        params: gen_model, proc_model, interval.
        gpus: GPU ids.
    """
    gen_gpu = GPU_MAP.get(gpus[0]) if len(gpus) > 0 else None
    proc_gpu = GPU_MAP.get(gpus[1]) if len(gpus) > 1 else gen_gpu
    if not gen_gpu:
        job["error"] = "no GPUs assigned"
        job["status"] = "failed"
        return
    proc_gpu = proc_gpu or gen_gpu

    def _ollama_host(gpu):
        """Address at which this GPU's Ollama is reachable from TRAINING_HOST."""
        # A container-hosted GPU lists the Proxmox host as its SSH target, but Ollama
        # listens inside the container. Use the CT's external IP, the same address
        # _exec_tool_self_play uses for pct-based GPUs.
        if "pct_id" in gpu:
            return "192.168.0.179"  # CT 105 external IP
        return gpu["host"].split("@")[-1]

    gen_url = f"http://{_ollama_host(gen_gpu)}:{gen_gpu['ollama_port']}"
    proc_url = f"http://{_ollama_host(proc_gpu)}:{proc_gpu['ollama_port']}"
    gen_model = str(params.get('gen_model', 'qwen3.5:0.8b'))
    proc_model = str(params.get('proc_model', 'mortdecai:0.4.0'))
    interval = str(params.get('interval', 120))

    # User-supplied values are shell-quoted so they cannot inject shell syntax.
    cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && "
           f"python3 training/scripts/prompt_pipeline.py --mode all "
           f"--gen-url {gen_url} "
           f"--gen-model {shlex.quote(gen_model)} "
           f"--proc-urls {proc_url} "
           f"--proc-model {shlex.quote(proc_model)} "
           f"--interval {shlex.quote(interval)}")

    _ssh_cmd(TRAINING_HOST, f"nohup bash -c {shlex.quote(cmd)} > /tmp/pipeline.log 2>&1 &", timeout=10)


def _exec_load_model(job, params, gpus):
    """Load an Ollama model on each assigned GPU by sending a one-token generate request.

    Args:
        job: Job dict. ``result`` lists the GPUs that loaded the model; ``error``
            lists the ones that did not, in which case ``status`` becomes "failed".
        params: ``model`` - name of the model to load.
        gpus: GPU ids; unknown ids are ignored. The job fails if none is known.
    """
    targets = [GPU_MAP[gid] for gid in gpus if gid in GPU_MAP]
    if not targets:
        # Same outcome as the other executors when nothing usable was assigned.
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return

    model = params.get("model", "mortdecai:0.4.0")
    loaded, failed = [], []
    for gpu in targets:
        # NOTE(review): this call inherits _ssh_cmd's default 8 s timeout; a cold
        # model load may take longer and would then be reported as a failure.
        result = _ollama_api(gpu, "/api/generate", method="POST", data={
            "model": model, "prompt": "test", "stream": False,
            "options": {"num_predict": 1},
        })
        if result and "error" not in result:
            loaded.append(f"Loaded {model} on {gpu['name']}")
        else:
            failed.append(f"Failed to load {model} on {gpu['name']}: {result}")

    # Report every GPU rather than only the last one processed.
    if loaded:
        job["result"] = "; ".join(loaded)
    if failed:
        job["error"] = "; ".join(failed)
        job["status"] = "failed"


def _exec_export_gguf(job, params):
    """Export a LoRA adapter to GGUF on TRAINING_HOST and wait for the outcome.

    Starts ``python3 -m unsloth.save`` under nohup, then polls the last lines of
    /tmp/export_gguf.log every 15 s, up to 120 times. The job is marked failed
    when the log tail contains "Error" or "error"; polling stops when it
    contains "Saved".

    Args:
        job: Job dict; ``status``/``error`` are set on failure.
        params: adapter_path, quant.
    """
    adapter = str(params.get("adapter_path", "training/checkpoints/mortdecai-0.5.0"))
    quant = str(params.get("quant", "q4_k_m"))
    # User-supplied values are shell-quoted so they cannot inject shell syntax.
    cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && "
           f"python3 -m unsloth.save --model {shlex.quote(adapter)} "
           f"--output_type gguf --quantization {shlex.quote(quant)}")
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c {shlex.quote(cmd)} > /tmp/export_gguf.log 2>&1 &", timeout=10)
    # Monitor
    # NOTE(review): if neither marker ever appears, the loop simply ends and the
    # caller marks the job completed.
    for _ in range(120):
        time.sleep(15)
        log = _ssh_cmd(TRAINING_HOST, "tail -3 /tmp/export_gguf.log 2>/dev/null")
        if not log:
            continue
        has_error = "Error" in log or "error" in log
        if has_error:
            job["status"] = "failed"
            job["error"] = log
        if has_error or "Saved" in log:
            break


def _exec_bakeoff(job, params, gpus):
    """Start the bake-off script on TRAINING_HOST (fire and forget).

    The first assigned GPU is only checked for existence. Output goes to
    /tmp/bakeoff.log on TRAINING_HOST; the function does not wait for the
    script to finish.

    Args:
        job: Job dict; ``error``/``status`` are set when no known GPU is assigned.
        params: ``models`` - value passed to the script's ``--models`` option.
        gpus: GPU ids.
    """
    gpu = GPU_MAP.get(gpus[0]) if gpus else None
    if not gpu:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return
    models = str(params.get("models", "mortdecai:0.4.0"))
    # The model list is shell-quoted so it reaches the script as one argument.
    cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && "
           f"python3 training/scripts/bakeoff.py --models {shlex.quote(models)}")
    _ssh_cmd(TRAINING_HOST, f"nohup bash -c {shlex.quote(cmd)} > /tmp/bakeoff.log 2>&1 &", timeout=10)


def _exec_tool_self_play(job, params, gpus):
    """Run tool-focused self-play on the dev server via the assigned GPU's Ollama.

    The script is launched on TRAINING_HOST under nohup and pointed at the first
    assigned GPU's Ollama endpoint. Its log tail is then polled every 15 s for
    "Complete" or "Traceback".

    Args:
        job: Job dict; ``error``/``status`` are set on failure. Polling stops
            early once ``status`` is no longer "running".
        params: model, rounds, categories, rcon_host, rcon_port, rcon_pass.
        gpus: GPU ids; only the first is used.
    """
    gpu = GPU_MAP.get(gpus[0]) if gpus else None
    if not gpu:
        job["error"] = "no GPU assigned"
        job["status"] = "failed"
        return

    # split("@")[-1] returns the whole string when there is no "user@" prefix.
    host_ip = gpu["host"].split("@")[-1]
    # For pct-based GPUs, use the CT's external IP
    if "pct_id" in gpu:
        host_ip = "192.168.0.179"  # CT 105 external IP
    port = gpu["ollama_port"]
    model = str(params.get("model", "mortdecai:0.4.0"))
    rounds = int(params.get("rounds", 10))
    categories = str(params.get("categories", "all"))
    rcon_host = str(params.get("rcon_host", "192.168.0.112"))
    rcon_port = int(params.get("rcon_port", 25578))
    rcon_pass = str(params.get("rcon_pass", "REDACTED_RCON"))

    script_path = "/home/seth/mc-ai-training/Minecraft-AI-model/training/scripts/tool_self_play.py"
    log_file = f"/tmp/tool_selfplay_{gpu['id']}.log"

    # User-supplied strings are shell-quoted; numeric ones were forced through int().
    cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && "
           f"python3 {script_path} "
           f"--ollama-url http://{host_ip}:{port} --model {shlex.quote(model)} "
           f"--rcon-host {shlex.quote(rcon_host)} --rcon-port {rcon_port} "
           f"--rcon-pass {shlex.quote(rcon_pass)} "
           f"--rounds {rounds} --categories {shlex.quote(categories)}")

    _ssh_cmd(TRAINING_HOST, f"nohup bash -c {shlex.quote(cmd)} > {log_file} 2>&1 &", timeout=10)
    print(f"[tool-self-play] launched on {gpu['name']}, logging to {log_file}")

    # Monitor until done
    # NOTE(review): the poll budget scales with len(PIPELINE_TYPES), which looks
    # unrelated to this script's workload — confirm the intended multiplier.
    for _ in range(rounds * len(PIPELINE_TYPES) * 3):
        time.sleep(15)
        log = _ssh_cmd(TRAINING_HOST, f"tail -5 {log_file} 2>/dev/null")
        if log and ("Complete" in log or "Traceback" in log):
            if "Traceback" in log:
                job["error"] = log[-300:]
                job["status"] = "failed"
            break
        if job["status"] != "running":
            break


# ── Trigger Engine ─────────────────────────────────────────────────────────

def _trigger_time_reached(now, iso_str, offset_seconds=0):
    """Return True once ``iso_str`` (+ offset) is in the past; False if it cannot be parsed."""
    try:
        moment = datetime.fromisoformat(iso_str)
    except (TypeError, ValueError):
        return False
    # A timestamp carrying a UTC offset must be compared with an aware "now";
    # comparing it with the naive local ``now`` would raise TypeError.
    reference = datetime.now(moment.tzinfo) if moment.tzinfo is not None else now
    return reference >= moment + timedelta(seconds=offset_seconds)


def _trigger_is_due(sched, now):
    """Decide whether one pending schedule entry should fire now."""
    trigger = sched["trigger"]
    kind = trigger["type"]

    if kind == "time":
        # Absolute time ("at") and/or a delay relative to the entry's creation.
        target_str = trigger.get("at")
        if target_str and _trigger_time_reached(now, target_str):
            return True
        duration_s = trigger.get("duration_seconds")
        created_str = sched.get("created_at")
        if duration_s and created_str:
            return _trigger_time_reached(now, created_str, int(duration_s))
        return False

    if kind == "finish_training":
        training = _state.get("training")
        if not training:
            return False
        total = training.get("total_steps", 0)
        current = training.get("current_step", 0)
        return total > 0 and current >= total and not training.get("active")

    if kind == "cost":
        threshold = float(trigger.get("threshold_usd", 999))
        return _cost_tracker["total_cost"] >= threshold

    return False


def _check_triggers():
    """Evaluate all scheduled triggers and launch the preset of each one that fires.

    Only entries whose status is "pending" are considered. A fired entry is
    marked "fired", stamped with ``fired_at``, persisted, and its preset is
    launched. An entry that is malformed (missing keys, non-numeric values) is
    logged and skipped so it cannot block the entries after it.
    """
    now = datetime.now()
    for sched in _schedule:
        if sched.get("status") != "pending":
            continue

        try:
            fired = _trigger_is_due(sched, now)
        except (KeyError, TypeError, ValueError) as e:
            print(f"[scheduler] skipping malformed trigger {sched.get('id', '?')}: {e!r}")
            continue

        if fired:
            # Mark and persist first so the entry cannot fire twice.
            sched["status"] = "fired"
            sched["fired_at"] = now.isoformat()
            _save_schedule()
            print(f"[scheduler] trigger fired: {sched['id']} → launching preset {sched['preset_id']}")
            _launch_preset(sched["preset_id"])


def _launch_preset(preset_id):
    """Create a job from the preset with the given id and start it.

    Returns:
        The new job dict (already appended to the job list, persisted and
        handed to ``_run_job_async``), or None when the preset does not exist.
    """
    preset = _presets.get(preset_id)
    if not preset:
        print(f"[scheduler] preset {preset_id} not found")
        return None

    # Short random identifier: first 8 characters of a UUID4.
    job_id = str(uuid.uuid4())[:8]
    job = {
        "id": job_id,
        "preset_id": preset_id,
        "preset_name": preset.get("name", "?"),
        "pipeline": preset["pipeline"],
        "params": preset.get("params", {}),
        "gpus": preset.get("gpus", []),
        "status": "queued",
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
    }

    _jobs.append(job)
    _save_jobs()
    _run_job_async(job)
    return job


# ── HTML Rendering ─────────────────────────────────────────────────────────

def _render_page():
    """Render the full dashboard HTML page from the latest state snapshot.

    Returns:
        The page as a string: header with the online GPU count and last refresh
        time, training card, one card per GPU in inventory order, schedule and
        jobs panels, and the presets side panel, with the module's CSS and JS inlined.
    """
    # Copy under the lock so a concurrent refresh cannot swap keys mid-render.
    with _lock:
        state = dict(_state)

    gpu_cards = ""
    for gpu in GPUS:
        # GPUs missing from the snapshot are shown as offline.
        data = state["gpus"].get(gpu["id"], {"online": False, "id": gpu["id"], "name": gpu["name"]})
        gpu_cards += _gpu_card_html(data)

    training_html = _training_card_html(state.get("training"))
    presets_list_html, presets_form_html = _presets_panel_html()
    schedule_html = _schedule_panel_html()
    jobs_html = _jobs_panel_html()
    # The key always exists (None before the first refresh), so a .get() default
    # would never apply; fall back explicitly.
    last_refresh = state.get("last_refresh") or "never"
    online_count = len([g for g in state["gpus"].values() if g.get("online")])

    return f"""<!DOCTYPE html>
<html><head>
<title>Mortdecai GPU Scheduler</title>
<meta charset="utf-8">
<style>
{CSS}
</style>
</head>
<body>

<header>
  <h1>Mortdecai GPU Scheduler</h1>
  <div class="subtitle"><span id="refresh-time">{online_count}/{len(GPUS)} GPUs online &mdash; refreshed {last_refresh}</span>
    <button class="btn btn-sm" onclick="api('refresh')">Refresh</button>
  </div>
</header>

<div class="layout">
  <div class="main-col">
    <div id="train-section">{training_html}</div>
    <h2>GPUs</h2>
    <div class="grid" id="gpu-grid">{gpu_cards}</div>
    <div id="schedule-section">{schedule_html}</div>
    <div id="jobs-section">{jobs_html}</div>
  </div>
  <div class="side-col">
    <div class="panel">
    <h3>Presets</h3>
    <div id="presets-list">{presets_list_html}</div>
    {presets_form_html}
    </div>
  </div>
</div>

<script>
{JS}
</script>
</body></html>"""


def _gpu_card_html(d):
    if not d.get("online"):
        return f"""<div class="card offline"><div class="card-header"><b>{d.get('name','?')}</b><span class="bad">OFFLINE</span></div></div>"""

    util = d.get("utilization", 0)
    temp = d.get("temperature", 0)
    vram_pct = d.get("vram_pct", 0)
    vram_used = d.get("vram_used_mb", 0)
    vram_total = d.get("vram_total_mb", 0)
    power = d.get("power_watts", 0)
    tc = "bad" if temp > 80 else "warn" if temp > 70 else "ok"
    uc = "ok" if util > 50 else "warn" if util > 10 else "dim"
    vc = "bad" if vram_pct > 90 else "warn" if vram_pct > 70 else "ok"

    ollama = d.get("ollama", {})
    running = ollama.get("running", [])
    avail = ollama.get("available", [])
    model_tags = " ".join(f'<span class="tag">{m["name"]}</span>' for m in running) if running else '<span class="tag dim">idle</span>'

    avail_options = "".join(f'<option value="{m}">{m}</option>' for m in avail if m)
    model_select = f"""<select class="model-select" id="ms-{d['id']}">{avail_options}</select>
    <button class="btn btn-xs" onclick="loadModel('{d['id']}')">Load</button>""" if avail else ""

    active_job = d.get("active_job")
    job_badge = f'<span class="tag accent">job {active_job}</span>' if active_job else ""

    caps = " ".join(f'<span class="cap">{c}</span>' for c in d.get("capabilities", []))

    return f"""<div class="card" id="gpu-{d['id']}">
<div class="card-header"><b>{d['name']}</b><span class="{uc}">{'ACTIVE' if util>10 else 'IDLE'}</span></div>
<div class="card-sub">{d.get('location','')} {job_badge}</div>
<div class="bar-row"><span class="bar-label">GPU</span><div class="bar"><div class="bar-fill" style="width:{util}%">{util}%</div></div></div>
<div class="bar-row"><span class="bar-label">VRAM</span><div class="bar"><div class="bar-fill {vc}" style="width:{vram_pct}%">{vram_used}/{vram_total}MB</div></div></div>
<div class="stats"><span class="{tc}">{temp}C</span> <span>{power:.0f}W</span></div>
<div class="models">{model_tags}</div>
<div class="model-ctrl">{model_select}</div>
<div class="caps">{caps}</div>
</div>"""


def _training_card_html(t):
    if not t:
        return '<div class="card"><div class="card-header"><b>Training</b><span class="dim">no log</span></div></div>'

    pct = t.get("pct", 0)
    step = t.get("current_step", 0)
    total = t.get("total_steps", 0)
    error = t.get("error")
    active = t.get("active", False)
    loss = t.get("latest_loss")
    lr = t.get("learning_rate", "?")
    eta = t.get("eta", "?")
    elapsed = t.get("elapsed", "?")

    if error:
        status = f'<span class="bad">CRASHED ({error})</span>'
    elif active:
        status = '<span class="ok">TRAINING</span>'
    else:
        status = '<span class="warn">STOPPED</span>'

    # Sparkline
    lh = t.get("loss_history", [])
    spark = ""
    if lh:
        recent = lh[-40:]
        mx, mn = max(recent), min(recent)
        rng = mx - mn if mx != mn else 1
        w, h = 400, 70
        pts = " ".join(f"{i/(max(len(recent)-1,1))*w:.0f},{h-((v-mn)/rng*h):.0f}" for i, v in enumerate(recent))
        spark = f"""<svg width="{w}" height="{h}" class="spark"><polyline points="{pts}" fill="none" stroke="#D35400" stroke-width="1.5"/>
<text x="0" y="10" fill="#666" font-size="9">{mx:.4f}</text><text x="0" y="{h}" fill="#666" font-size="9">{mn:.4f}</text></svg>"""

    return f"""<div class="card train-card">
<div class="card-header"><b>Training</b>{status}</div>
<div class="progress"><div class="progress-fill" style="width:{pct}%">{step}/{total} ({pct}%)</div></div>
<div class="stats">
  <span>Elapsed: {elapsed}</span> <span>ETA: {eta}</span>
  <span>Loss: <b class="accent">{f'{loss:.4f}' if loss else '?'}</b></span> <span>LR: {lr}</span>
</div>
{spark}
</div>"""


def _presets_list_html():
    """Render the preset rows only, so they can refresh without touching the form.

    Presets are read from the module-level ``_presets`` mapping and listed in
    name order. Name, pipeline and GPU ids originate from the
    ``create_preset`` request body, so they are HTML-escaped before being
    embedded.

    Returns:
        HTML string with one row per preset, or a placeholder message when
        there are no presets.
    """
    from html import escape  # local import: keeps this block self-contained

    rows = []
    for pid, p in sorted(_presets.items(), key=lambda x: x[1].get("name", "")):
        name = escape(str(p.get("name", "?")))
        pipeline = escape(str(p.get("pipeline", "?")))
        gpus = escape(", ".join(str(g) for g in p.get("gpus", [])))
        # Preset ids are generated server-side in create_preset; escaped for
        # the attribute context regardless.
        pid_attr = escape(str(pid), quote=True)
        rows.append(f"""<div class="preset-row">
<div class="preset-name">{name}</div>
<div class="preset-info">{pipeline} &mdash; {gpus}</div>
<div class="preset-actions">
  <button class="btn btn-xs" onclick="launchPreset('{pid_attr}')">Run</button>
  <button class="btn btn-xs" onclick="schedulePreset('{pid_attr}')">Schedule</button>
  <button class="btn btn-xs btn-danger" onclick="deletePreset('{pid_attr}')">Del</button>
</div></div>""")

    if not rows:
        return '<div class="dim" style="padding:0.5rem">No presets yet. Create one below.</div>'
    return "".join(rows)


def _presets_panel_html():
    """Build the presets sidebar as a ``(list_html, form_html)`` pair.

    The list part comes from ``_presets_list_html`` and is the piece that is
    refreshed live; the form part is the static "New Preset" form whose
    pipeline options come from ``PIPELINE_TYPES`` and whose GPU checkboxes
    come from ``GPUS``.
    """
    option_tags = []
    for key, spec in PIPELINE_TYPES.items():
        option_tags.append(f'<option value="{key}">{spec["label"]}</option>')
    pipeline_options = "".join(option_tags)

    checkbox_tags = [
        f'<label class="gpu-check"><input type="checkbox" name="gpus" value="{gpu["id"]}"> {gpu["name"]}</label>'
        for gpu in GPUS
    ]
    gpu_checkboxes = "".join(checkbox_tags)

    new_preset_form = f"""<div class="create-form">
<h3 style="margin-top:0.8rem">New Preset</h3>
<form id="preset-form" onsubmit="return createPreset(event)">
  <label>Name<input name="name" required placeholder="overnight-selfplay"></label>
  <label>Pipeline<select name="pipeline" onchange="updateParamFields(this.value)">{pipeline_options}</select></label>
  <div class="gpu-select"><label>GPUs</label>{gpu_checkboxes}</div>
  <div id="param-fields"></div>
  <button type="submit" class="btn">Save Preset</button>
</form>
</div>"""

    return _presets_list_html(), new_preset_form


def _schedule_panel_html():
    """Render the "Scheduled Triggers" card as an HTML fragment.

    Shows the ten most recently created entries of the module-level
    ``_schedule`` list with their status, preset name and a human-readable
    trigger description; pending entries get a Cancel button. The header
    shows the number of pending entries across the whole list.

    Preset names and trigger fields originate from request bodies, so every
    interpolated value is HTML-escaped.
    """
    from html import escape  # local import: keeps this block self-contained

    def esc(value):
        return escape(str(value), quote=True)

    rows = ""
    for s in sorted(_schedule, key=lambda x: x.get("created_at", ""), reverse=True)[:10]:
        # Fall back to the raw preset id when the preset no longer exists.
        preset_name = _presets.get(s.get("preset_id", ""), {}).get("name", s.get("preset_id", "?"))
        trigger = s.get("trigger", {})
        ttype = trigger.get("type", "?")

        if ttype == "time":
            if trigger.get("at"):
                trigger_desc = f"at {trigger['at']}"
            else:
                # A null duration (e.g. NaN serialized by the client) is
                # treated as 0 instead of crashing the page render.
                secs = int(trigger.get("duration_seconds") or 0)
                trigger_desc = f"after {secs//3600}h{(secs%3600)//60}m"
        elif ttype == "finish_training":
            trigger_desc = "when training completes"
        elif ttype == "cost":
            trigger_desc = f"at ${trigger.get('threshold_usd', '?')}"
        else:
            trigger_desc = ttype

        st = s.get("status", "?")
        st_class = "ok" if st == "fired" else "warn" if st == "pending" else "dim"

        sid = esc(s["id"])
        cancel_btn = f"""<button class="btn btn-xs btn-danger" onclick="cancelSchedule('{sid}')">Cancel</button>""" if st == 'pending' else ''
        rows += f"""<div class="sched-row">
<span class="{st_class}">{esc(st)}</span>
<span>{esc(preset_name)}</span>
<span class="dim">{esc(trigger_desc)}</span>
{cancel_btn}
</div>"""

    if not rows:
        rows = '<div class="dim" style="padding:0.5rem">No scheduled triggers.</div>'

    pending = sum(1 for s in _schedule if s.get("status") == "pending")
    return f"""<div class="card">
<div class="card-header"><b>Scheduled Triggers</b><span class="dim">{pending} pending</span></div>
{rows}
</div>"""


def _jobs_panel_html():
    """Render the "Jobs" card as an HTML fragment.

    Lists the fifteen most recently created entries of the module-level
    ``_jobs`` list with status, preset (or pipeline) name, GPUs, creation
    time truncated to minutes, and any error; running jobs get a Stop
    button. The header shows the number of running jobs across the whole
    list.

    Names and error text are HTML-escaped before being embedded.
    """
    from html import escape  # local import: keeps this block self-contained

    def esc(value):
        return escape(str(value), quote=True)

    recent = sorted(_jobs, key=lambda j: j.get("created_at", ""), reverse=True)[:15]
    rows = ""
    for j in recent:
        st = j.get("status", "?")
        st_class = "ok" if st == "completed" else "bad" if st == "failed" else "warn" if st == "running" else "dim"
        gpus = ", ".join(str(g) for g in j.get("gpus", []))
        name = j.get("preset_name", j.get("pipeline", "?"))
        err = f' <span class="bad">({esc(j["error"])})</span>' if j.get("error") else ""

        jid = esc(j["id"])
        stop_btn = f"""<button class="btn btn-xs btn-danger" onclick="cancelJob('{jid}')">Stop</button>""" if st == 'running' else ''
        # First 16 characters of the timestamp: date plus hours and minutes.
        created = j.get('created_at', '')[:16]
        rows += f"""<div class="job-row">
<span class="{st_class}">{esc(st)}</span>
<span>{esc(name)}</span>
<span class="dim">{esc(gpus)}</span>
<span class="dim">{esc(created)}</span>
{err}
{stop_btn}
</div>"""

    if not rows:
        rows = '<div class="dim" style="padding:0.5rem">No jobs yet.</div>'

    running = sum(1 for j in _jobs if j.get("status") == "running")
    return f"""<div class="card">
<div class="card-header"><b>Jobs</b><span class="dim">{running} running</span></div>
{rows}
</div>"""


# ── CSS ────────────────────────────────────────────────────────────────────

# Stylesheet for the dashboard. _render_page inlines this string verbatim
# inside the page's <style> element. It is a plain (non-f) string, so the
# braces below are literal CSS. The .modal* rules style the schedule modal
# that the handler's do_GET appends to the page.
CSS = """
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:'Courier New',monospace;background:#111;color:#e0e0e0;padding:1rem 1.5rem}
header{margin-bottom:1.2rem}
h1{color:#D35400;font-size:1.5rem;margin-bottom:0.2rem}
h2{color:#D35400;font-size:1.1rem;margin:1rem 0 0.6rem}
h3{color:#D35400;font-size:1rem;margin-bottom:0.6rem}
.subtitle{color:#666;font-size:0.8rem}
.accent{color:#D35400}
.ok{color:#4caf50} .warn{color:#ff9800} .bad{color:#f44336} .dim{color:#555}

.layout{display:grid;grid-template-columns:1fr 340px;gap:1.2rem}
@media(max-width:900px){.layout{grid-template-columns:1fr}}
.main-col{min-width:0}
.side-col{display:flex;flex-direction:column;gap:1rem}

.grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:0.8rem}

.card{background:#1a1a1a;border:1px solid #2a2a2a;border-radius:6px;padding:0.8rem;margin-bottom:0.8rem;transition:border-color 0.2s}
.card:hover{border-color:#D35400}
.card.offline{opacity:0.4;border-color:#f44336}
.card-header{display:flex;justify-content:space-between;align-items:center;margin-bottom:0.3rem;font-size:0.95rem}
.card-sub{color:#555;font-size:0.75rem;margin-bottom:0.5rem}

.bar-row{display:flex;align-items:center;margin:0.25rem 0;gap:0.4rem}
.bar-label{width:36px;color:#777;font-size:0.75rem}
.bar{flex:1;background:#222;border-radius:3px;height:20px;overflow:hidden}
.bar-fill{height:100%;border-radius:3px;background:#D35400;display:flex;align-items:center;padding-left:5px;font-size:0.7rem;color:#fff;min-width:fit-content;transition:width 0.5s}
.bar-fill.ok{background:#4caf50} .bar-fill.warn{background:#ff9800} .bar-fill.bad{background:#f44336}

.stats{display:flex;gap:0.8rem;font-size:0.8rem;margin:0.4rem 0;flex-wrap:wrap;color:#999}
.models{margin-top:0.4rem}
.model-ctrl{margin-top:0.3rem;display:flex;gap:0.3rem;align-items:center}
.model-select{background:#222;color:#ccc;border:1px solid #333;border-radius:3px;padding:2px 4px;font-size:0.75rem;font-family:monospace}
.tag{display:inline-block;background:#222;border:1px solid #333;border-radius:3px;padding:1px 6px;font-size:0.7rem;margin:1px}
.tag.accent{border-color:#D35400;color:#D35400}
.tag.dim{color:#444;border-color:#222}
.caps{margin-top:0.3rem;display:flex;gap:3px;flex-wrap:wrap}
.cap{font-size:0.65rem;color:#555;background:#1e1e1e;border-radius:2px;padding:1px 4px}

.train-card .progress{background:#222;border-radius:3px;height:26px;margin:0.4rem 0;overflow:hidden}
.train-card .progress-fill{height:100%;background:linear-gradient(90deg,#D35400,#e67e22);border-radius:3px;transition:width 1s;display:flex;align-items:center;justify-content:center;font-size:0.8rem;font-weight:bold;color:#fff}
.spark{display:block;margin-top:0.5rem;background:#1a1a1a;border:1px solid #222;border-radius:3px}

.panel{background:#1a1a1a;border:1px solid #2a2a2a;border-radius:6px;padding:0.8rem}

.preset-row{display:flex;flex-wrap:wrap;align-items:center;gap:0.4rem;padding:0.4rem 0;border-bottom:1px solid #222;font-size:0.8rem}
.preset-name{font-weight:bold;color:#e0e0e0;flex:1}
.preset-info{color:#777;font-size:0.75rem;width:100%}
.preset-actions{display:flex;gap:0.3rem}

.sched-row{display:flex;align-items:center;gap:0.5rem;padding:0.3rem 0;border-bottom:1px solid #222;font-size:0.8rem}
.job-row{display:flex;align-items:center;gap:0.5rem;padding:0.3rem 0;border-bottom:1px solid #1e1e1e;font-size:0.8rem}

.btn{background:#222;border:1px solid #D35400;color:#D35400;padding:5px 12px;border-radius:3px;cursor:pointer;font-family:monospace;font-size:0.8rem;transition:background 0.15s}
.btn:hover{background:#D35400;color:#fff}
.btn-sm{padding:3px 8px;font-size:0.75rem}
.btn-xs{padding:2px 6px;font-size:0.7rem}
.btn-danger{border-color:#f44336;color:#f44336}
.btn-danger:hover{background:#f44336;color:#fff}

.create-form{margin-top:0.5rem}
.create-form form{display:flex;flex-direction:column;gap:0.5rem;margin-top:0.6rem}
.create-form label{display:flex;flex-direction:column;font-size:0.8rem;color:#999;gap:0.2rem}
.create-form input,.create-form select{background:#222;color:#e0e0e0;border:1px solid #333;border-radius:3px;padding:4px 6px;font-family:monospace;font-size:0.8rem}
.gpu-select{display:flex;flex-wrap:wrap;gap:0.3rem;font-size:0.8rem;color:#999}
.gpu-check{display:flex;align-items:center;gap:0.2rem;font-size:0.75rem}
.gpu-check input{accent-color:#D35400}

.modal-overlay{display:none;position:fixed;top:0;left:0;width:100%;height:100%;background:rgba(0,0,0,0.7);z-index:100;justify-content:center;align-items:center}
.modal-overlay.active{display:flex}
.modal{background:#1a1a1a;border:1px solid #D35400;border-radius:8px;padding:1.2rem;width:400px;max-width:90vw}
.modal h3{margin-bottom:0.8rem}
.modal label{display:flex;flex-direction:column;font-size:0.8rem;color:#999;gap:0.2rem;margin-bottom:0.4rem}
.modal input,.modal select{background:#222;color:#e0e0e0;border:1px solid #333;border-radius:3px;padding:4px 6px;font-family:monospace;font-size:0.8rem}
.modal .btn-row{display:flex;gap:0.5rem;margin-top:0.8rem;justify-content:flex-end}
"""

# ── JS ─────────────────────────────────────────────────────────────────────

# Client-side view of PIPELINE_TYPES: only the label, parameter names and
# default values of each pipeline, serialized once at import time so the
# browser can build the per-pipeline parameter inputs.
PIPELINE_TYPES_JSON = json.dumps({k: {"params": v["params"], "defaults": v["defaults"], "label": v["label"]} for k, v in PIPELINE_TYPES.items()})

# Browser script, inlined into the page's <script> element by _render_page.
# This is an f-string: the only interpolation is {PIPELINE_TYPES_JSON}; every
# literal JavaScript brace is therefore doubled ({{ / }}). All actions go
# through api(), which POSTs JSON to /api/action; liveRefresh() polls
# /api/fragments every 10 s and swaps the returned HTML into the page.
JS = f"""
const PIPELINES = {PIPELINE_TYPES_JSON};

function api(action, data) {{
  return fetch('/api/action', {{
    method:'POST', headers:{{'Content-Type':'application/json'}},
    body: JSON.stringify({{action, ...data}})
  }}).then(r=>r.json());
}}

function updateParamFields(pipeline) {{
  const p = PIPELINES[pipeline];
  if (!p) return;
  const container = document.getElementById('param-fields');
  container.innerHTML = '';
  for (const key of p.params) {{
    const val = p.defaults[key] ?? '';
    const label = document.createElement('label');
    label.textContent = key;
    const input = document.createElement('input');
    input.name = 'param_' + key;
    input.value = val;
    label.appendChild(input);
    container.appendChild(label);
  }}
}}

function createPreset(e) {{
  e.preventDefault();
  const form = e.target;
  const fd = new FormData(form);
  const gpus = fd.getAll('gpus');
  const params = {{}};
  for (const [k,v] of fd.entries()) {{
    if (k.startsWith('param_')) params[k.slice(6)] = v;
  }}
  api('create_preset', {{
    name: fd.get('name'),
    pipeline: fd.get('pipeline'),
    gpus, params
  }}).then(() => liveRefresh());
  return false;
}}

function launchPreset(id) {{
  if (confirm('Launch this preset now?'))
    api('launch_preset', {{preset_id: id}}).then(() => setTimeout(()=>location.reload(), 1000));
}}

function deletePreset(id) {{
  if (confirm('Delete this preset?'))
    api('delete_preset', {{preset_id: id}}).then(() => location.reload());
}}

function loadModel(gpuId) {{
  const sel = document.getElementById('ms-' + gpuId);
  if (!sel) return;
  api('load_model', {{gpu_id: gpuId, model: sel.value}}).then(() =>
    setTimeout(()=>location.reload(), 3000));
}}

function cancelJob(id) {{
  api('cancel_job', {{job_id: id}}).then(() => location.reload());
}}

function cancelSchedule(id) {{
  api('cancel_schedule', {{schedule_id: id}}).then(() => location.reload());
}}

// Schedule modal
let _schedPresetId = null;
function schedulePreset(id) {{
  _schedPresetId = id;
  document.getElementById('sched-modal').classList.add('active');
}}
function closeModal() {{
  document.getElementById('sched-modal').classList.remove('active');
}}
function submitSchedule(e) {{
  e.preventDefault();
  const fd = new FormData(e.target);
  const ttype = fd.get('trigger_type');
  const trigger = {{type: ttype}};
  if (ttype === 'time') {{
    const mode = fd.get('time_mode');
    if (mode === 'at') trigger.at = fd.get('time_at');
    else trigger.duration_seconds = parseInt(fd.get('duration_hours')||0)*3600 + parseInt(fd.get('duration_mins')||0)*60;
  }} else if (ttype === 'cost') {{
    trigger.threshold_usd = parseFloat(fd.get('cost_threshold'));
  }}
  api('create_schedule', {{preset_id: _schedPresetId, trigger}}).then(() => liveRefresh());
  return false;
}}

// Init param fields for first pipeline
document.addEventListener('DOMContentLoaded', () => {{
  const sel = document.querySelector('[name=pipeline]');
  if (sel) updateParamFields(sel.value);
}});

// Live refresh — update dynamic sections without reloading the page
function liveRefresh() {{
  fetch('/api/fragments').then(r => r.json()).then(f => {{
    const ids = {{'gpu-grid':'gpus', 'train-section':'training', 'schedule-section':'schedule', 'jobs-section':'jobs', 'presets-list':'presets', 'refresh-time':'refresh_time'}};
    for (const [elId, key] of Object.entries(ids)) {{
      const el = document.getElementById(elId);
      if (el && f[key] != null) el.innerHTML = f[key];
    }}
  }}).catch(() => {{}});
}}
setInterval(liveRefresh, 10000);
"""


# ── HTTP Handler ───────────────────────────────────────────────────────────

class SchedulerHandler(BaseHTTPRequestHandler):
    """HTTP handler for the dashboard page, JSON read endpoints and the action API.

    GET serves the HTML dashboard (``/`` and ``/dashboard``) and several JSON
    views under ``/api/``. POST is accepted only on ``/api/action``, whose
    JSON object body is dispatched on its ``"action"`` field.
    """

    def log_message(self, format, *args):
        # Suppress the base class's per-request logging.
        pass

    def do_GET(self):
        """Serve the dashboard page or one of the read-only JSON endpoints."""
        path = urlparse(self.path).path
        if path in ("/", "/dashboard"):
            html = _render_page()
            # Inject schedule modal at end of body
            modal = """<div class="modal-overlay" id="sched-modal">
<div class="modal"><h3>Schedule Trigger</h3>
<form onsubmit="return submitSchedule(event)">
  <label>Trigger Type<select name="trigger_type" onchange="document.querySelectorAll('.trig-opts').forEach(e=>e.style.display='none');document.getElementById('trig-'+this.value).style.display='block'">
    <option value="time">Time</option><option value="finish_training">After Training</option><option value="cost">Cost Threshold</option>
  </select></label>
  <div id="trig-time" class="trig-opts">
    <label>Mode<select name="time_mode"><option value="at">At specific time</option><option value="duration">After duration</option></select></label>
    <label>At (ISO)<input name="time_at" placeholder="2026-03-21T08:00:00"></label>
    <label>Duration hours<input name="duration_hours" type="number" value="0"></label>
    <label>Duration mins<input name="duration_mins" type="number" value="0"></label>
  </div>
  <div id="trig-finish_training" class="trig-opts" style="display:none">
    <div style="font-size:0.8rem;color:#999;padding:0.3rem">Fires when current training run completes all steps.</div>
  </div>
  <div id="trig-cost" class="trig-opts" style="display:none">
    <label>Threshold ($)<input name="cost_threshold" type="number" step="0.01" value="1.00"></label>
  </div>
  <div class="btn-row">
    <button type="button" class="btn btn-sm" onclick="closeModal()">Cancel</button>
    <button type="submit" class="btn btn-sm">Create Trigger</button>
  </div>
</form></div></div>"""
            html = html.replace("</body>", modal + "</body>")
            self._respond(200, html, "text/html")
        elif path == "/api/state":
            with _lock:
                # Only the last 20 jobs and schedule entries are exposed.
                data = {"gpus": _state["gpus"], "training": _state["training"],
                        "presets": _presets, "jobs": _jobs[-20:], "schedule": _schedule[-20:]}
            self._respond(200, json.dumps(data, default=str, indent=2), "application/json")
        elif path == "/api/training":
            self._respond(200, json.dumps(_fetch_training_status(), default=str), "application/json")
        elif path == "/api/presets":
            self._respond(200, json.dumps(_presets, indent=2), "application/json")
        elif path == "/api/pipelines":
            info = {k: {"label": v["label"], "description": v["description"], "params": v["params"], "defaults": v["defaults"]}
                    for k, v in PIPELINE_TYPES.items()}
            self._respond(200, json.dumps(info, indent=2), "application/json")
        elif path == "/api/fragments":
            # Return HTML fragments for live refresh (no full page reload)
            with _lock:
                state = dict(_state)
            gpu_cards = ""
            for gpu in GPUS:
                data = state["gpus"].get(gpu["id"], {"online": False, "id": gpu["id"], "name": gpu["name"]})
                gpu_cards += _gpu_card_html(data)
            online_count = len([g for g in state["gpus"].values() if g.get("online")])
            last_refresh = state.get("last_refresh", "never")
            fragments = {
                "gpus": gpu_cards,
                "training": _training_card_html(state.get("training")),
                "schedule": _schedule_panel_html(),
                "jobs": _jobs_panel_html(),
                "presets": _presets_list_html(),
                "refresh_time": f"{online_count}/{len(GPUS)} GPUs online &mdash; refreshed {last_refresh}",
            }
            self._respond(200, json.dumps(fragments), "application/json")
        else:
            self._respond(404, "Not found", "text/plain")

    def do_POST(self):
        """Handle ``POST /api/action``.

        Responds 404 for any other path, 400 when the body is not a valid
        JSON object (or Content-Length is not an integer), 500 with the
        error message when the action raises, and 200 with the action's
        result otherwise.
        """
        path = urlparse(self.path).path
        if path != "/api/action":
            self._respond(404, "Not found", "text/plain")
            return

        # Parse inside a try so a malformed request gets a 400 response
        # instead of an unhandled exception with no reply to the client.
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length)) if length > 0 else {}
        except ValueError as e:  # includes JSONDecodeError / UnicodeDecodeError
            self._respond(400, json.dumps({"ok": False, "error": f"invalid request body: {e}"}), "application/json")
            return
        if not isinstance(body, dict):
            self._respond(400, json.dumps({"ok": False, "error": "request body must be a JSON object"}), "application/json")
            return
        action = body.get("action", "")

        try:
            result = self._handle_action(action, body)
            self._respond(200, json.dumps(result, default=str), "application/json")
        except Exception as e:
            # Top-level boundary: report any action failure to the client.
            self._respond(500, json.dumps({"ok": False, "error": str(e)}), "application/json")

    def _handle_action(self, action, body):
        """Execute one named action and return its JSON-serializable result.

        Args:
            action: Action name taken from the request body.
            body: Parsed request body (a dict) holding the action's arguments.

        Returns:
            A dict with at least an ``"ok"`` flag; failures that are detected
            here also carry an ``"error"`` message.
        """
        if action == "refresh":
            # Refresh in the background so the request returns immediately.
            threading.Thread(target=refresh_state, daemon=True).start()
            return {"ok": True}

        elif action == "create_preset":
            pid = str(uuid.uuid4())[:8]
            _presets[pid] = {
                "id": pid,
                "name": body.get("name", "unnamed"),
                "pipeline": body.get("pipeline", "self_play"),
                "gpus": body.get("gpus", []),
                "params": body.get("params", {}),
                "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
            }
            _save_presets()
            return {"ok": True, "preset_id": pid}

        elif action == "delete_preset":
            pid = body.get("preset_id")
            if pid in _presets:
                del _presets[pid]
                _save_presets()
            return {"ok": True}

        elif action == "launch_preset":
            job = _launch_preset(body.get("preset_id"))
            if job is None:
                # _launch_preset returns None when it does not start a job;
                # report that instead of claiming success.
                return {"ok": False, "error": "preset could not be launched", "job": None}
            return {"ok": True, "job": job}

        elif action == "create_schedule":
            sid = str(uuid.uuid4())[:8]
            sched = {
                "id": sid,
                "preset_id": body.get("preset_id"),
                "trigger": body.get("trigger", {}),
                "status": "pending",
                "created_at": datetime.now().isoformat(),
            }
            _schedule.append(sched)
            _save_schedule()
            return {"ok": True, "schedule_id": sid}

        elif action == "cancel_schedule":
            sid = body.get("schedule_id")
            for s in _schedule:
                if s["id"] == sid:
                    s["status"] = "cancelled"
            _save_schedule()
            return {"ok": True}

        elif action == "cancel_job":
            jid = body.get("job_id")
            for j in _jobs:
                # Only jobs currently marked running are flipped to cancelled.
                if j["id"] == jid and j["status"] == "running":
                    j["status"] = "cancelled"
            _save_jobs()
            return {"ok": True}

        elif action == "load_model":
            gpu_id = body.get("gpu_id")
            model = body.get("model")
            gpu = GPU_MAP.get(gpu_id)
            if not gpu:
                return {"ok": False, "error": "unknown GPU"}
            # A one-token generate request is used to make Ollama load the model.
            result = _ollama_api(gpu, "/api/generate", method="POST", data={
                "model": model, "prompt": "test", "stream": False,
                "options": {"num_predict": 1},
            })
            return {"ok": True, "result": result}

        elif action in ("stop_ollama", "start_ollama"):
            gpu_id = body.get("gpu_id", "3090ti")
            gpu = GPU_MAP.get(gpu_id)
            if not gpu:
                # Consistent with load_model: an unknown GPU is an error.
                return {"ok": False, "error": "unknown GPU"}
            verb = "stop" if action == "stop_ollama" else "start"
            # The service name comes from the static GPU inventory, not the request.
            svc = gpu.get("ollama_service", "ollama.service")
            _ssh_cmd(gpu, f"sudo systemctl {verb} {svc} 2>&1", timeout=10)
            return {"ok": True}

        return {"ok": False, "error": f"unknown action: {action}"}

    def _respond(self, code, body, content_type):
        """Send a complete response.

        Args:
            code: HTTP status code.
            body: Response payload; ``str`` is encoded with the default
                  encoding, ``bytes`` is sent as-is.
            content_type: Value for the Content-Type header.
        """
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        # NOTE(review): a wildcard CORS origin on a state-changing API lets any
        # site read responses; confirm this is intended behind the auth proxy.
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(body.encode() if isinstance(body, str) else body)


# ── Main ───────────────────────────────────────────────────────────────────

def main():
    """Command-line entry point: load saved state, start refreshing, serve HTTP.

    Parses ``--port`` and ``--refresh-interval``, loads persisted presets,
    jobs and schedules, starts ``_bg_refresh_loop`` in a daemon thread, runs
    one synchronous ``refresh_state()`` scan, then serves requests on all
    interfaces until interrupted with Ctrl-C.
    """
    parser = argparse.ArgumentParser(description="Mortdecai GPU Scheduler")
    parser.add_argument("--port", type=int, default=PORT)
    parser.add_argument("--refresh-interval", type=int, default=10)
    args = parser.parse_args()

    _load_persisted()
    print(f"Loaded {len(_presets)} presets, {len(_jobs)} jobs, {len(_schedule)} schedules")

    # Daemon thread: it must not keep the process alive after shutdown.
    t = threading.Thread(target=_bg_refresh_loop, args=(args.refresh_interval,), daemon=True)
    t.start()

    print("Initial GPU scan...")
    refresh_state()

    # The context manager closes the listening socket on every exit path.
    with HTTPServer(("0.0.0.0", args.port), SchedulerHandler) as server:
        print(f"GPU Scheduler on http://0.0.0.0:{args.port}")
        print(f"  {len(GPUS)} GPUs, refresh {args.refresh_interval}s")
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            print("\nShutting down.")


# Start the scheduler only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()