Full cost model: marginal power, labor, profit, live config

Cost model:
- Marginal billing: only charge for watts above idle
- Dedicated billing: charge for all uptime (optional)
- Labor rate: $/hr for operator time, manually logged
- Profit margin: percentage markup on electricity cost
- All parameters adjustable live via POST /config

Dashboard shows:
- Cost breakdown with progress bar
- Power model (idle→load for GPU and system)
- Marginal watts per inference call
- Labor hours + labor cost
- Total owed (electricity + labor + margin)
- GPU utilization, temperature, power draw
- Avg cost per request, estimated remaining requests

Endpoints:
- GET /config — view current cost config (auth required)
- POST /config — update any parameter live
- GET /stats — full usage stats + cost config (auth required)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-20 19:49:14 -04:00
parent 648b123f14
commit 6d3df9ae58
2 changed files with 183 additions and 43 deletions
+22 -5
View File
@@ -1,7 +1,24 @@
# Mortdecai Gateway Configuration # Mortdecai Gateway Configuration
# Cost-model values (power, billing, labor & profit) can also be adjusted live via POST /config
# Auth
API_KEY=mk_change_this_to_a_real_key API_KEY=mk_change_this_to_a_real_key
GPU_TDP_WATTS=54
SYSTEM_OVERHEAD_WATTS=30 # Power model
ELECTRICITY_RATE=0.15 GPU_IDLE_WATTS=15 # GPU at idle (watts)
SPENDING_CAP=10.00 GPU_LOAD_WATTS=54 # GPU during inference (watts)
ALLOW_MODEL_UPDATES=false SYSTEM_IDLE_WATTS=45 # Whole system idle (watts)
SYSTEM_INFERENCE_WATTS=65 # Whole system during inference (watts)
# Billing
ELECTRICITY_RATE=0.15 # $/kWh
BILLING_MODE=marginal # "marginal" (only extra watts) or "dedicated" (all uptime)
BASE_RATE_PER_HOUR=0.00 # $/hr base (dedicated mode only)
SPENDING_CAP=10.00 # $ before gateway stops accepting requests
# Labor & profit
LABOR_RATE_PER_HOUR=0.00 # $/hr for setup/maintenance time
PROFIT_MARGIN=0.00 # Markup multiplier (0.10 = 10%)
# Features
ALLOW_MODEL_UPDATES=false # Allow remote model push via /admin/update-model
+161 -38
View File
@@ -27,11 +27,52 @@ import requests
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
LISTEN_PORT = int(os.environ.get("GATEWAY_PORT", "8434")) LISTEN_PORT = int(os.environ.get("GATEWAY_PORT", "8434"))
API_KEY = os.environ.get("API_KEY", "mk_mortdecai_default") API_KEY = os.environ.get("API_KEY", "mk_mortdecai_default")
ELECTRICITY_RATE = float(os.environ.get("ELECTRICITY_RATE", "0.15")) # $/kWh
GPU_TDP_WATTS = float(os.environ.get("GPU_TDP_WATTS", "54")) # Strix Halo iGPU
SYSTEM_OVERHEAD_WATTS = float(os.environ.get("SYSTEM_OVERHEAD_WATTS", "30")) # CPU/RAM/etc idle draw during inference
SPENDING_CAP = float(os.environ.get("SPENDING_CAP", "10.00")) # $ before refusing requests
STATS_FILE = os.environ.get("STATS_FILE", "/var/lib/mortdecai-gateway/stats.json") STATS_FILE = os.environ.get("STATS_FILE", "/var/lib/mortdecai-gateway/stats.json")
CONFIG_FILE = os.environ.get("CONFIG_FILE", "/var/lib/mortdecai-gateway/cost_config.json")

# Default cost config (overridden by config file or env vars).
#
# Every numeric default is written as a float on purpose: the env-var loader
# and the POST /config handler both coerce incoming values with
# type(default)(value), so an int default would truncate 15.7 to 15 and
# reject the string "15.7" outright.
_DEFAULT_COST_CONFIG = {
    "electricity_rate": 0.15,        # $/kWh
    "gpu_idle_watts": 15.0,          # GPU at idle
    "gpu_load_watts": 54.0,          # GPU during inference
    "system_idle_watts": 45.0,       # Whole system idle (CPU/RAM/fans/PSU)
    "system_inference_watts": 65.0,  # Whole system during inference
    "billing_mode": "marginal",      # "marginal" = only extra watts; "dedicated" = all uptime
    "base_rate_per_hour": 0.00,      # $/hr for keeping machine on (dedicated mode only)
    "spending_cap": 10.00,           # $ before refusing requests
    "labor_rate_per_hour": 0.00,     # $/hr for operator's time (setup, maintenance)
    "profit_margin": 0.00,           # multiplier (0.10 = 10% markup)
    "labor_hours_logged": 0.0,       # total hours spent on setup/maintenance
}
def _load_cost_config():
    """Build the effective cost config from defaults, config file and env vars.

    Precedence, lowest to highest: ``_DEFAULT_COST_CONFIG``, the JSON object
    stored in ``CONFIG_FILE``, then environment variables named after the
    upper-cased default keys (e.g. ``ELECTRICITY_RATE``).

    Loading is best-effort: a missing, unreadable or malformed config file is
    ignored, and an env value that cannot be converted leaves the previous
    value in place.

    Returns:
        dict: a new config dict; ``_DEFAULT_COST_CONFIG`` itself is not mutated.
    """
    config = dict(_DEFAULT_COST_CONFIG)

    # Override from file
    try:
        with open(CONFIG_FILE, encoding="utf-8") as f:
            stored = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file, or invalid JSON (JSONDecodeError is a ValueError).
        stored = None
    # Only a JSON object can be merged; a list or scalar is treated as "no file".
    if isinstance(stored, dict):
        config.update(stored)

    # Override from env vars
    for key, default in _DEFAULT_COST_CONFIG.items():
        raw = os.environ.get(key.upper())
        if raw is None:
            continue
        # Coerce to the default's type first; for int-typed defaults also accept
        # a fractional value such as "15.5" instead of silently dropping it.
        converters = [type(default)]
        if isinstance(default, int) and not isinstance(default, bool):
            converters.append(float)
        for convert in converters:
            try:
                config[key] = convert(raw)
            except (TypeError, ValueError):
                continue
            break
    return config
def _save_cost_config(config):
    """Persist ``config`` as indented JSON to ``CONFIG_FILE`` (best-effort).

    The data is written to a sibling ``.tmp`` file and then moved into place
    with ``os.replace`` so an interrupted write cannot leave a truncated
    config file behind.

    Args:
        config: JSON-serialisable mapping of cost parameters.

    Returns:
        None. Failures are reported on stderr and never raised, so a
        read-only or missing state directory cannot take the gateway down.
    """
    import sys  # local import: only needed on the failure path

    directory = os.path.dirname(CONFIG_FILE)
    tmp_path = CONFIG_FILE + ".tmp"
    try:
        # dirname is "" for a bare filename, and os.makedirs("") raises.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        os.replace(tmp_path, CONFIG_FILE)
    except (OSError, TypeError, ValueError) as exc:
        print(f"warning: could not save cost config to {CONFIG_FILE}: {exc}", file=sys.stderr)
        # Do not leave a half-written temp file lying around.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
COST_CONFIG = _load_cost_config()
# --- Stats tracking --- # --- Stats tracking ---
_stats_lock = threading.Lock() _stats_lock = threading.Lock()
@@ -67,25 +108,49 @@ def _save_stats():
pass pass
def _calc_marginal_cost(duration_seconds):
    """Calculate the billed power, energy and cost of one inference call.

    Reads the live ``COST_CONFIG``. In ``"marginal"`` billing mode only the
    watts above idle are billed (GPU delta plus system delta); in any other
    mode the full ``gpu_load_watts + system_inference_watts`` is billed.

    Args:
        duration_seconds: wall-clock length of the inference call in seconds.

    Returns:
        tuple: ``(billed_watts, energy_wh, cost)`` where ``cost`` is the
        electricity cost in dollars including the ``profit_margin`` markup.
    """
    c = COST_CONFIG
    if c["billing_mode"] == "marginal":
        # Only charge for extra watts above idle
        gpu_delta = c["gpu_load_watts"] - c["gpu_idle_watts"]
        system_delta = c["system_inference_watts"] - c["system_idle_watts"]
        billed_watts = gpu_delta + system_delta
    else:
        # Dedicated: charge for full system draw during inference
        billed_watts = c["gpu_load_watts"] + c["system_inference_watts"]

    # A misconfigured power model (load below idle) or a negative duration
    # must never turn into negative energy, i.e. a credit against the cap.
    billed_watts = max(billed_watts, 0)
    duration_seconds = max(duration_seconds, 0)

    energy_wh = (billed_watts * duration_seconds) / 3600
    electricity_cost = (energy_wh / 1000) * c["electricity_rate"]
    # Apply profit margin
    cost = electricity_cost * (1 + c["profit_margin"])
    return billed_watts, energy_wh, cost
def _track_request(tokens_in, tokens_out, duration_seconds):
    """Track a completed inference request.

    Adds the request to the running totals in ``_stats`` (request count,
    tokens, inference seconds, energy, cost), updates the smoothed
    marginal-watts figure, applies the dedicated-mode base rate, and
    persists the stats every 10th request.

    Args:
        tokens_in: prompt tokens consumed by the request.
        tokens_out: tokens generated by the request.
        duration_seconds: wall-clock duration of the inference call.
    """
    marginal_watts, energy_wh, cost = _calc_marginal_cost(duration_seconds)

    with _stats_lock:
        _stats["total_requests"] += 1
        _stats["total_tokens_in"] += tokens_in
        _stats["total_tokens_out"] += tokens_out
        _stats["total_inference_seconds"] += duration_seconds
        # The trailing "Z" declares UTC, so format UTC rather than local time.
        _stats["last_request_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

        _stats["total_energy_wh"] += energy_wh
        _stats["total_cost"] += cost
        # Exponential moving average (5% weight on the newest sample); the
        # first sample seeds the average.
        _stats["total_marginal_watts_avg"] = (
            _stats.get("total_marginal_watts_avg", marginal_watts) * 0.95 + marginal_watts * 0.05
        )

        # Base rate for dedicated mode: bill the time elapsed since the
        # previous request at base_rate_per_hour.
        now = time.time()
        last = _stats.get("_last_base_calc", now)
        if COST_CONFIG["billing_mode"] == "dedicated" and COST_CONFIG["base_rate_per_hour"] > 0:
            elapsed_hours = max(now - last, 0) / 3600
            _stats["total_cost"] += COST_CONFIG["base_rate_per_hour"] * elapsed_hours
        # Advance the marker on every request, not only while the base rate is
        # active; otherwise enabling dedicated billing later would back-bill
        # the whole period since the marker was last written.
        _stats["_last_base_calc"] = now

        # Save every 10 requests
        # NOTE(review): called with _stats_lock held — assumes _save_stats()
        # does not acquire _stats_lock itself; confirm against its definition.
        if _stats["total_requests"] % 10 == 0:
            _save_stats()
@@ -93,7 +158,7 @@ def _track_request(tokens_in, tokens_out, duration_seconds):
def _check_budget():
    """Returns True if under spending cap.

    Compares the accumulated ``_stats["total_cost"]`` with the live
    ``COST_CONFIG["spending_cap"]`` while holding ``_stats_lock``; the
    comparison is strict, so reaching the cap exactly counts as over budget.
    """
    with _stats_lock:
        spent = _stats["total_cost"]
        cap = COST_CONFIG["spending_cap"]
        return spent < cap
def _get_gpu_utilization(): def _get_gpu_utilization():
@@ -190,12 +255,15 @@ class GatewayHandler(BaseHTTPRequestHandler):
# Add gateway metadata to response # Add gateway metadata to response
if isinstance(data, dict): if isinstance(data, dict):
mw, ewh, ecost = _calc_marginal_cost(duration)
data["_gateway"] = { data["_gateway"] = {
"duration_seconds": round(duration, 2), "duration_seconds": round(duration, 2),
"energy_wh": round((GPU_TDP_WATTS + SYSTEM_OVERHEAD_WATTS) * duration / 3600, 4), "marginal_watts": round(mw, 1),
"estimated_cost": round(((GPU_TDP_WATTS + SYSTEM_OVERHEAD_WATTS) * duration / 3600 / 1000) * ELECTRICITY_RATE, 6), "energy_wh": round(ewh, 4),
"estimated_cost": round(ecost, 6),
"total_cost": round(_stats["total_cost"], 4), "total_cost": round(_stats["total_cost"], 4),
"budget_remaining": round(SPENDING_CAP - _stats["total_cost"], 4), "budget_remaining": round(COST_CONFIG["spending_cap"] - _stats["total_cost"], 4),
"billing_mode": COST_CONFIG["billing_mode"],
} }
self._send_json(r.status_code, data) self._send_json(r.status_code, data)
@@ -225,17 +293,18 @@ class GatewayHandler(BaseHTTPRequestHandler):
return return
gpu = _get_gpu_utilization() gpu = _get_gpu_utilization()
with _stats_lock: with _stats_lock:
stats_copy = dict(_stats) stats_copy = {k: v for k, v in _stats.items() if not k.startswith("_")}
stats_copy["gpu"] = gpu stats_copy["gpu"] = gpu
stats_copy["config"] = { stats_copy["cost_config"] = COST_CONFIG
"gpu_tdp_watts": GPU_TDP_WATTS,
"system_overhead_watts": SYSTEM_OVERHEAD_WATTS,
"electricity_rate": ELECTRICITY_RATE,
"spending_cap": SPENDING_CAP,
}
self._send_json(200, stats_copy) self._send_json(200, stats_copy)
return return
if parsed.path == "/config":
if not self._check_auth():
return
self._send_json(200, COST_CONFIG)
return
if parsed.path == "/dashboard": if parsed.path == "/dashboard":
self._serve_dashboard() self._serve_dashboard()
return return
@@ -252,6 +321,16 @@ class GatewayHandler(BaseHTTPRequestHandler):
length = int(self.headers.get("Content-Length", 0)) length = int(self.headers.get("Content-Length", 0))
body = json.loads(self.rfile.read(length)) if length > 0 else None body = json.loads(self.rfile.read(length)) if length > 0 else None
# Config update endpoint — adjust cost parameters live
if self.path == "/config" and body:
global COST_CONFIG
for key in body:
if key in COST_CONFIG:
COST_CONFIG[key] = type(_DEFAULT_COST_CONFIG.get(key, ""))(body[key])
_save_cost_config(COST_CONFIG)
self._send_json(200, {"status": "updated", "config": COST_CONFIG})
return
# Model update endpoint — downloads new GGUF and reloads # Model update endpoint — downloads new GGUF and reloads
if self.path == "/admin/update-model" and body: if self.path == "/admin/update-model" and body:
self._handle_model_update(body) self._handle_model_update(body)
@@ -299,31 +378,75 @@ class GatewayHandler(BaseHTTPRequestHandler):
def _serve_dashboard(self): def _serve_dashboard(self):
"""Simple HTML dashboard showing usage stats.""" """Simple HTML dashboard showing usage stats."""
with _stats_lock: with _stats_lock:
s = dict(_stats) s = {k: v for k, v in _stats.items() if not k.startswith("_")}
gpu = _get_gpu_utilization() gpu = _get_gpu_utilization()
c = COST_CONFIG
marginal_w = (c["gpu_load_watts"] - c["gpu_idle_watts"]) + (c["system_inference_watts"] - c["system_idle_watts"])
active = _check_budget()
avg_cost_per_req = s["total_cost"] / max(s["total_requests"], 1)
reqs_remaining = int((c["spending_cap"] - s["total_cost"]) / max(avg_cost_per_req, 0.000001)) if avg_cost_per_req > 0 else ""
html = f"""<!DOCTYPE html> html = f"""<!DOCTYPE html>
<html><head><title>Mortdecai Gateway</title> <html><head><title>Mortdecai Gateway</title>
<meta http-equiv="refresh" content="10"> <meta http-equiv="refresh" content="10">
<style> <style>
body {{ font-family: monospace; background: #1a1a1a; color: #e0e0e0; padding: 2rem; }} body {{ font-family: monospace; background: #1a1a1a; color: #e0e0e0; padding: 2rem; max-width: 700px; margin: 0 auto; }}
h1 {{ color: #D35400; }} h1 {{ color: #D35400; }}
.stat {{ background: #252525; border: 1px solid #333; padding: 1rem; margin: 0.5rem 0; border-radius: 6px; }} h2 {{ color: #D35400; font-size: 1rem; margin-top: 1.5rem; border-bottom: 1px solid #333; padding-bottom: 0.3rem; }}
.stat {{ background: #252525; border: 1px solid #333; padding: 0.8rem 1rem; margin: 0.3rem 0; border-radius: 4px; display: flex; justify-content: space-between; }}
.label {{ color: #999; }} .label {{ color: #999; }}
.value {{ color: #D35400; font-size: 1.2rem; font-weight: bold; }} .value {{ color: #D35400; font-weight: bold; }}
.ok {{ color: #4caf50; }}
.warn {{ color: #ff9800; }}
.bad {{ color: #f44336; }}
.bar {{ background: #333; border-radius: 3px; height: 20px; margin: 0.5rem 0; }}
.bar-fill {{ background: #D35400; height: 100%; border-radius: 3px; transition: width 0.5s; }}
</style></head><body> </style></head><body>
<h1>Mortdecai Gateway</h1> <h1>Mortdecai Gateway</h1>
<div class="stat"><span class="label">Status:</span> <span class="value">{"ACTIVE" if _check_budget() else "PAUSED (cap reached)"}</span></div>
<div class="stat"><span class="label">Total Requests:</span> <span class="value">{s['total_requests']}</span></div> <div class="stat"><span class="label">Status</span>
<div class="stat"><span class="label">Tokens (in/out):</span> <span class="value">{s['total_tokens_in']:,} / {s['total_tokens_out']:,}</span></div> <span class="value {'ok' if active else 'bad'}">{'● ACTIVE' if active else '● PAUSED (cap reached)'}</span></div>
<div class="stat"><span class="label">Inference Time:</span> <span class="value">{s['total_inference_seconds']:.0f}s</span></div>
<div class="stat"><span class="label">Energy Used:</span> <span class="value">{s['total_energy_wh']:.1f} Wh</span></div> <h2>Usage</h2>
<div class="stat"><span class="label">Estimated Cost:</span> <span class="value">${s['total_cost']:.4f} / ${SPENDING_CAP:.2f}</span></div> <div class="stat"><span class="label">Requests</span><span class="value">{s['total_requests']:,}</span></div>
<div class="stat"><span class="label">Rejected (over cap):</span> <span class="value">{s['requests_rejected']}</span></div> <div class="stat"><span class="label">Tokens (in / out)</span><span class="value">{s['total_tokens_in']:,} / {s['total_tokens_out']:,}</span></div>
<div class="stat"><span class="label">GPU Utilization:</span> <span class="value">{gpu['utilization']}% ({gpu['source']})</span></div> <div class="stat"><span class="label">Inference Time</span><span class="value">{s['total_inference_seconds']:.0f}s ({s['total_inference_seconds']/3600:.1f}h)</span></div>
<div class="stat"><span class="label">GPU Temperature:</span> <span class="value">{gpu['temperature']}°C</span></div> <div class="stat"><span class="label">Avg per Request</span><span class="value">{s['total_inference_seconds']/max(s['total_requests'],1):.1f}s, {s['total_tokens_out']//max(s['total_requests'],1)} tokens</span></div>
<div class="stat"><span class="label">Last Request:</span> <span class="value">{s['last_request_at'] or 'never'}</span></div> <div class="stat"><span class="label">Rejected (cap)</span><span class="value">{s['requests_rejected']}</span></div>
<div class="stat"><span class="label">Config:</span> <span class="value">TDP={GPU_TDP_WATTS}W + {SYSTEM_OVERHEAD_WATTS}W overhead @ ${ELECTRICITY_RATE}/kWh</span></div> <div class="stat"><span class="label">Last Request</span><span class="value">{s['last_request_at'] or 'never'}</span></div>
<h2>Cost</h2>
<div class="bar"><div class="bar-fill" style="width: {min(s['total_cost']/max(c['spending_cap'],0.01)*100, 100):.0f}%"></div></div>
<div class="stat"><span class="label">Spent</span><span class="value">${s['total_cost']:.4f}</span></div>
<div class="stat"><span class="label">Budget</span><span class="value">${c['spending_cap']:.2f}</span></div>
<div class="stat"><span class="label">Remaining</span><span class="value">${c['spending_cap'] - s['total_cost']:.4f} (~{reqs_remaining} requests)</span></div>
<div class="stat"><span class="label">Avg Cost/Request</span><span class="value">${avg_cost_per_req:.6f}</span></div>
<div class="stat"><span class="label">Energy Used</span><span class="value">{s['total_energy_wh']:.1f} Wh ({s['total_energy_wh']/1000:.4f} kWh)</span></div>
<h2>Labor & Profit</h2>
<div class="stat"><span class="label">Labor Rate</span><span class="value">${c['labor_rate_per_hour']:.2f}/hr</span></div>
<div class="stat"><span class="label">Hours Logged</span><span class="value">{c['labor_hours_logged']:.1f}h</span></div>
<div class="stat"><span class="label">Labor Cost</span><span class="value">${c['labor_rate_per_hour'] * c['labor_hours_logged']:.2f}</span></div>
<div class="stat"><span class="label">Profit Margin</span><span class="value">{c['profit_margin']*100:.0f}%</span></div>
<div class="stat"><span class="label">Total Owed (electricity + labor + margin)</span><span class="value">${s['total_cost'] + c['labor_rate_per_hour'] * c['labor_hours_logged']:.4f}</span></div>
<h2>Power Model</h2>
<div class="stat"><span class="label">Billing Mode</span><span class="value">{c['billing_mode']}</span></div>
<div class="stat"><span class="label">GPU (idle → load)</span><span class="value">{c['gpu_idle_watts']}W → {c['gpu_load_watts']}W</span></div>
<div class="stat"><span class="label">System (idle → load)</span><span class="value">{c['system_idle_watts']}W → {c['system_inference_watts']}W</span></div>
<div class="stat"><span class="label">Marginal Draw</span><span class="value">{marginal_w}W per inference call</span></div>
<div class="stat"><span class="label">Electricity Rate</span><span class="value">${c['electricity_rate']}/kWh</span></div>
{'<div class="stat"><span class="label">Base Rate</span><span class="value">$' + f"{c['base_rate_per_hour']:.3f}" + '/hr</span></div>' if c['billing_mode'] == 'dedicated' else ''}
<h2>GPU</h2>
<div class="stat"><span class="label">Utilization</span><span class="value">{gpu['utilization']}%</span></div>
<div class="stat"><span class="label">Temperature</span><span class="value {'warn' if gpu['temperature'] > 75 else 'ok'}">{gpu['temperature']}°C</span></div>
<div class="stat"><span class="label">Power Draw</span><span class="value">{gpu['power_watts']}W</span></div>
<div class="stat"><span class="label">Source</span><span class="value">{gpu['source']}</span></div>
<p style="color:#555; font-size:0.8rem; margin-top:2rem;">
Config: GET /config | Update: POST /config | Stats: GET /stats (auth required)
</p>
</body></html>""" </body></html>"""
self.send_response(200) self.send_response(200)