# syntax=docker/dockerfile:1
# Mortdecai Gateway image: runs the single-file Python proxy (gateway.py)
# that sits in front of Ollama.
FROM python:3.11-slim

# gateway.py reports its startup banner with print(). Without a TTY Python
# block-buffers stdout, so the banner would not show up in `docker logs`.
ENV PYTHONUNBUFFERED=1

# `requests` is the only third-party import in gateway.py. Pinned so that a
# rebuild produces the same image.
RUN pip install --no-cache-dir requests==2.32.3

# Run unprivileged. docker-compose.yml points STATS_FILE at /data, so the
# directory must already exist and be owned by the gateway user: a freshly
# created named volume inherits the ownership of the image directory it is
# mounted over.
RUN groupadd --gid 10001 gateway \
    && useradd --uid 10001 --gid gateway --no-create-home --home-dir /app \
        --shell /usr/sbin/nologin gateway \
    && mkdir -p /data \
    && chown gateway:gateway /data

WORKDIR /app
COPY gateway.py .

USER gateway

# Documentation only: the default GATEWAY_PORT, published by docker-compose.yml.
EXPOSE 8434

CMD ["python3", "gateway.py"]
# `version` is ignored by Compose v2; kept only for older docker-compose binaries.
version: "3.8"

# Mortdecai Inference Gateway
# Deploy on any machine with an Ollama-compatible GPU.
#
# Usage:
#   cp .env.example .env   # then set a real API_KEY (or just run ./setup.sh)
#   docker compose up -d
#   # Dashboard at http://localhost:8434/dashboard
#
# For AMD ROCm (Strix Halo, RX 7000, etc):
#   Uses the ollama/ollama:rocm image. Ensure ROCm drivers are installed on the host.
#
# For NVIDIA:
#   Requires nvidia-container-toolkit installed on the host.

services:
  ollama:
    image: ollama/ollama:rocm
    container_name: mortdecai-ollama
    restart: unless-stopped
    ports:
      # Published on the host's loopback interface only, so setup.sh can poll
      # readiness. The gateway does not use this mapping: it reaches Ollama
      # over the compose network at http://ollama:11434.
      - "127.0.0.1:11434:11434"
    volumes:
      - ollama-data:/root/.ollama
      - ./models:/models
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    environment:
      - OLLAMA_HOST=0.0.0.0:11434
    # For NVIDIA, replace 'devices' above with the block below.
    # NOTE(review): the :rocm tag is the AMD build; Ollama's Docker instructions
    # use the plain `ollama/ollama` image for NVIDIA - confirm and switch `image` too.
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]

  gateway:
    build: .
    container_name: mortdecai-gateway
    restart: unless-stopped
    ports:
      - "8434:8434"  # This is the only port exposed beyond the host
    environment:
      - OLLAMA_URL=http://ollama:11434
      # No fallback on purpose: this port is published on all interfaces, so
      # starting with a well-known default key would leave the GPU open to anyone.
      - API_KEY=${API_KEY:?API_KEY is not set - copy .env.example to .env or run ./setup.sh}
      - GATEWAY_PORT=8434
      - GPU_TDP_WATTS=${GPU_TDP_WATTS:-54}
      - SYSTEM_OVERHEAD_WATTS=${SYSTEM_OVERHEAD_WATTS:-30}
      - ELECTRICITY_RATE=${ELECTRICITY_RATE:-0.15}
      - SPENDING_CAP=${SPENDING_CAP:-10.00}
      - STATS_FILE=/data/stats.json
    volumes:
      - gateway-data:/data
    depends_on:
      - ollama

volumes:
  ollama-data:
  gateway-data:
threading.Lock() +_stats = { + "total_requests": 0, + "total_tokens_in": 0, + "total_tokens_out": 0, + "total_inference_seconds": 0, + "total_energy_wh": 0.0, + "total_cost": 0.0, + "started_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"), + "last_request_at": None, + "requests_rejected": 0, +} + + +def _load_stats(): + global _stats + try: + with open(STATS_FILE) as f: + saved = json.load(f) + _stats.update(saved) + except: + pass + + +def _save_stats(): + try: + os.makedirs(os.path.dirname(STATS_FILE), exist_ok=True) + with open(STATS_FILE, "w") as f: + json.dump(_stats, f, indent=2) + except: + pass + + +def _track_request(tokens_in, tokens_out, duration_seconds): + """Track a completed inference request.""" + with _stats_lock: + _stats["total_requests"] += 1 + _stats["total_tokens_in"] += tokens_in + _stats["total_tokens_out"] += tokens_out + _stats["total_inference_seconds"] += duration_seconds + _stats["last_request_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ") + + # Power calculation + # GPU draws TDP watts during inference, plus system overhead + total_watts = GPU_TDP_WATTS + SYSTEM_OVERHEAD_WATTS + energy_wh = (total_watts * duration_seconds) / 3600 + cost = (energy_wh / 1000) * ELECTRICITY_RATE + + _stats["total_energy_wh"] += energy_wh + _stats["total_cost"] += cost + + # Save every 10 requests + if _stats["total_requests"] % 10 == 0: + _save_stats() + + +def _check_budget(): + """Returns True if under spending cap.""" + with _stats_lock: + return _stats["total_cost"] < SPENDING_CAP + + +def _get_gpu_utilization(): + """Get current GPU utilization via nvidia-smi or rocm-smi.""" + try: + # Try nvidia-smi first + result = subprocess.run( + ["nvidia-smi", "--query-gpu=utilization.gpu,temperature.gpu,power.draw", + "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + parts = [p.strip() for p in result.stdout.strip().split(",")] + return { + "utilization": float(parts[0]), + "temperature": float(parts[1]), + 
class GatewayHandler(BaseHTTPRequestHandler):
    """Request handler: authenticates callers, proxies to Ollama, meters usage."""

    def log_message(self, fmt, *args):
        pass  # Quiet

    def _check_auth(self):
        """Return True if the Authorization header carries the API key.

        Accepts either ``Bearer <key>`` or the bare key. On failure a 401
        response has already been sent and False is returned.
        """
        import hmac  # local import: only needed here

        auth = self.headers.get("Authorization", "")
        supplied = auth.encode("utf-8", "replace")
        expected = API_KEY.encode("utf-8")
        # compare_digest keeps the comparison time independent of how many
        # leading characters of the guess are correct.
        bearer_ok = hmac.compare_digest(supplied, b"Bearer " + expected)
        bare_ok = hmac.compare_digest(supplied, expected)
        if bearer_ok or bare_ok:
            return True
        self._send_json(401, {"error": "Invalid API key"})
        return False

    def _send_json(self, status, data):
        """Send ``data`` as a JSON response with the given HTTP status."""
        body = json.dumps(data).encode()
        self._send_raw(status, "application/json", body)

    def _send_raw(self, status, content_type, payload):
        """Send ``payload`` (bytes) verbatim with the given status and type."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    @staticmethod
    def _final_stream_object(text):
        """Return the last line of ``text`` parsed as a JSON object, else {}.

        Used for newline-delimited JSON replies, where the token counters are
        read from the last object of the stream.
        """
        for line in reversed(text.splitlines()):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except ValueError:
                return {}
            return obj if isinstance(obj, dict) else {}
        return {}

    def _proxy_to_ollama(self, path, body=None):
        """Proxy request to Ollama and track usage.

        Args:
            path: Request path (including any query string) to forward.
            body: Decoded JSON body for POST requests, or None.

        Sends 402 once the spending cap is reached, 502/504 when Ollama is
        unreachable or times out, and 500 for any other failure. A reply
        that is a single JSON object gets a ``_gateway`` metadata field;
        anything else (a streamed NDJSON reply, a JSON array, plain text) is
        passed through unmodified.
        """
        if not _check_budget():
            with _stats_lock:
                _stats["requests_rejected"] += 1
                total_cost = _stats["total_cost"]
            self._send_json(402, {
                "error": "Spending cap reached",
                "total_cost": total_cost,
                "cap": SPENDING_CAP,
            })
            return

        # Forward with the client's own method. Deciding on the truthiness of
        # ``body`` would turn a POST with an empty or ``{}`` body into a GET.
        method = getattr(self, "command", None) or ("POST" if body is not None else "GET")

        t0 = time.time()
        try:
            if method == "POST":
                r = requests.post(f"{OLLAMA_URL}{path}", json=body, timeout=120)
            else:
                r = requests.get(f"{OLLAMA_URL}{path}", timeout=10)

            duration = time.time() - t0

            try:
                data = r.json()
            except ValueError:
                data = None  # not a single JSON document (e.g. NDJSON stream)

            is_object = isinstance(data, dict)
            usage = data if is_object else self._final_stream_object(r.text)

            # Track token usage from response
            tokens_in = usage.get("prompt_eval_count", 0)
            tokens_out = usage.get("eval_count", 0)
            if tokens_in or tokens_out:
                _track_request(tokens_in, tokens_out, duration)

            if not is_object:
                self._send_raw(
                    r.status_code,
                    r.headers.get("Content-Type", "application/octet-stream"),
                    r.content,
                )
                return

            # Add gateway metadata to response
            with _stats_lock:
                total_cost = _stats["total_cost"]
            energy_wh = (GPU_TDP_WATTS + SYSTEM_OVERHEAD_WATTS) * duration / 3600
            data["_gateway"] = {
                "duration_seconds": round(duration, 2),
                "energy_wh": round(energy_wh, 4),
                "estimated_cost": round((energy_wh / 1000) * ELECTRICITY_RATE, 6),
                "total_cost": round(total_cost, 4),
                "budget_remaining": round(SPENDING_CAP - total_cost, 4),
            }

            self._send_json(r.status_code, data)

        except requests.exceptions.ConnectionError:
            self._send_json(502, {"error": "Ollama is not running"})
        except requests.exceptions.Timeout:
            self._send_json(504, {"error": "Ollama timeout"})
        except Exception as e:
            self._send_json(500, {"error": str(e)})

    def do_GET(self):
        """Serve /health, /stats and /dashboard; proxy every other path."""
        parsed = urlparse(self.path)

        # Public endpoints (no auth)
        if parsed.path == "/health":
            try:
                r = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
                models = [m["name"] for m in r.json().get("models", [])]
                self._send_json(200, {"status": "ok", "ollama": "connected", "models": models})
            except Exception:
                self._send_json(503, {"status": "error", "ollama": "disconnected"})
            return

        if parsed.path == "/stats":
            if not self._check_auth():
                return
            gpu = _get_gpu_utilization()
            with _stats_lock:
                stats_copy = dict(_stats)
            stats_copy["gpu"] = gpu
            stats_copy["config"] = {
                "gpu_tdp_watts": GPU_TDP_WATTS,
                "system_overhead_watts": SYSTEM_OVERHEAD_WATTS,
                "electricity_rate": ELECTRICITY_RATE,
                "spending_cap": SPENDING_CAP,
            }
            self._send_json(200, stats_copy)
            return

        if parsed.path == "/dashboard":
            self._serve_dashboard()
            return

        # Proxy everything else to Ollama
        if not self._check_auth():
            return
        self._proxy_to_ollama(self.path)

    def do_POST(self):
        """Authenticate, decode the JSON body, and proxy the request.

        Responds 400 when Content-Length is not a number or the body is not
        valid JSON.
        """
        if not self._check_auth():
            return

        try:
            length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            self._send_json(400, {"error": "Invalid Content-Length header"})
            return

        body = None
        if length > 0:
            try:
                body = json.loads(self.rfile.read(length))
            except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
                self._send_json(400, {"error": "Request body is not valid JSON"})
                return

        self._proxy_to_ollama(self.path, body)

Mortdecai Gateway

+
Status: {"ACTIVE" if _check_budget() else "PAUSED (cap reached)"}
+
Total Requests: {s['total_requests']}
+
Tokens (in/out): {s['total_tokens_in']:,} / {s['total_tokens_out']:,}
+
Inference Time: {s['total_inference_seconds']:.0f}s
+
Energy Used: {s['total_energy_wh']:.1f} Wh
+
Estimated Cost: ${s['total_cost']:.4f} / ${SPENDING_CAP:.2f}
+
Rejected (over cap): {s['requests_rejected']}
+
GPU Utilization: {gpu['utilization']}% ({gpu['source']})
+
GPU Temperature: {gpu['temperature']}°C
+
Last Request: {s['last_request_at'] or 'never'}
+
Config: TDP={GPU_TDP_WATTS}W + {SYSTEM_OVERHEAD_WATTS}W overhead @ ${ELECTRICITY_RATE}/kWh
def main():
    """Load saved stats, start the periodic saver, and serve until stopped.

    Counters are flushed to disk on the way out (Ctrl-C or SIGTERM, which is
    what `docker stop` sends). Otherwise everything recorded since the last
    periodic save would be lost and the spending cap would be under-counted
    after a restart.
    """
    import signal  # local import: only needed for shutdown handling

    _load_stats()

    print(f"Mortdecai Gateway starting")
    print(f"  Ollama: {OLLAMA_URL}")
    print(f"  Listen: 0.0.0.0:{LISTEN_PORT}")
    print(f"  TDP: {GPU_TDP_WATTS}W + {SYSTEM_OVERHEAD_WATTS}W overhead")
    print(f"  Rate: ${ELECTRICITY_RATE}/kWh")
    print(f"  Cap: ${SPENDING_CAP}")
    print(f"  Dashboard: http://localhost:{LISTEN_PORT}/dashboard")

    # Save stats periodically
    def _periodic_save():
        while True:
            time.sleep(60)
            with _stats_lock:
                _save_stats()

    t = threading.Thread(target=_periodic_save, daemon=True)
    t.start()

    def _on_sigterm(signum, frame):
        # Unwind serve_forever() through the finally block below.
        raise SystemExit(0)

    # Signal handlers can only be installed from the main thread.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGTERM, _on_sigterm)

    server = HTTPServer(("0.0.0.0", LISTEN_PORT), GatewayHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        server.server_close()
        with _stats_lock:
            _save_stats()
#!/bin/bash
# Quick setup for Mortdecai Gateway
# Run this after cloning the repo, from the repository root.
#
# Steps: create .env with a random API key (if missing), start the compose
# stack, wait for Ollama, then register the first models/*.gguf as the
# Ollama model "mortdecai-v4".

set -e

echo "=== Mortdecai Gateway Setup ==="

# Generate API key if not set
if [ ! -f .env ]; then
    KEY="mk_$(openssl rand -hex 16)"
    cat > .env << EOF
API_KEY=$KEY
GPU_TDP_WATTS=54
SYSTEM_OVERHEAD_WATTS=30
ELECTRICITY_RATE=0.15
SPENDING_CAP=10.00
EOF
    echo "Generated API key: $KEY"
    echo "Saved to .env"
else
    echo ".env already exists"
fi

# Start containers
echo "Starting containers..."
docker compose up -d

# Wait for Ollama to be ready (30 tries x 2s)
echo "Waiting for Ollama..."
ollama_ready=0
for _ in $(seq 1 30); do
    if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
        echo "Ollama is ready"
        ollama_ready=1
        break
    fi
    sleep 2
done
if [ "$ollama_ready" -ne 1 ]; then
    echo "Ollama did not become ready within 60s; check: docker logs mortdecai-ollama" >&2
    exit 1
fi

# Load the model if a GGUF exists
shopt -s nullglob
gguf_files=(models/*.gguf)
shopt -u nullglob

if [ "${#gguf_files[@]}" -gt 0 ]; then
    GGUF="${gguf_files[0]}"
    GGUF_FILE="$(basename "$GGUF")"
    echo "Loading model from $GGUF..."

    # The Modelfile is read by `ollama create` *inside* the container, so it
    # must live somewhere the container can see. ./models is bind-mounted at
    # /models by docker-compose.yml; the host's /tmp is not.
    cat > models/Modelfile << MEOF
FROM /models/$GGUF_FILE
TEMPLATE """{{- if .Messages }}
{{- if or .System .Tools }}<|im_start|>system
{{- if .System }}
{{ .System }}
{{- end }}
<|im_end|>
{{ end }}
{{- range \$m := .Messages }}
{{- if eq \$m.Role "user" }}<|im_start|>user
{{ \$m.Content }}<|im_end|>
{{- else if eq \$m.Role "assistant" }}<|im_start|>assistant
{{ \$m.Content }}<|im_end|>
{{- end }}
{{- end }}<|im_start|>assistant
{{ end }}"""
PARAMETER stop <|im_end|>
PARAMETER stop <|im_start|>
PARAMETER temperature 0.7
MEOF

    docker exec mortdecai-ollama ollama create mortdecai-v4 -f /models/Modelfile
    echo "Model loaded as mortdecai-v4"
else
    echo "No GGUF found in models/ — place your GGUF file there and re-run ./setup.sh"
fi

echo ""
echo "=== Setup Complete ==="
echo "Dashboard: http://localhost:8434/dashboard"
echo "API Key: $(grep API_KEY .env | cut -d= -f2)"
echo ""
echo "Test: curl -s http://localhost:8434/health"
echo ""
echo "To use from remote:"
echo "  curl -X POST http://YOUR_IP:8434/api/chat \\"
echo "    -H 'Authorization: Bearer YOUR_API_KEY' \\"
echo "    -H 'Content-Type: application/json' \\"
echo "    -d '{\"model\": \"mortdecai-v4\", \"messages\": [{\"role\": \"user\", \"content\": \"test\"}]}'"