From 6d3df9ae58652c731b6626a63416cf61343db421 Mon Sep 17 00:00:00 2001 From: Seth Freiberg Date: Fri, 20 Mar 2026 19:49:14 -0400 Subject: [PATCH] Full cost model: marginal power, labor, profit, live config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cost model: - Marginal billing: only charge for watts above idle - Dedicated billing: charge for all uptime (optional) - Labor rate: $/hr for operator time, manually logged - Profit margin: percentage markup on electricity cost - All parameters adjustable live via POST /config Dashboard shows: - Cost breakdown with progress bar - Power model (idle→load for GPU and system) - Marginal watts per inference call - Labor hours + labor cost - Total owed (electricity + labor + margin) - GPU utilization, temperature, power draw - Avg cost per request, estimated remaining requests Endpoints: - GET /config — view current cost config (auth required) - POST /config — update any parameter live - GET /stats — full usage stats + cost config (auth required) Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.example | 27 +++++-- gateway.py | 199 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 183 insertions(+), 43 deletions(-) diff --git a/.env.example b/.env.example index 2c3096c..bdca4a4 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,24 @@ # Mortdecai Gateway Configuration +# Cost values (power model, billing, labor & profit) can also be adjusted live via POST /config + +# Auth API_KEY=mk_change_this_to_a_real_key -GPU_TDP_WATTS=54 -SYSTEM_OVERHEAD_WATTS=30 -ELECTRICITY_RATE=0.15 -SPENDING_CAP=10.00 -ALLOW_MODEL_UPDATES=false + +# Power model +GPU_IDLE_WATTS=15 # GPU at idle (watts) +GPU_LOAD_WATTS=54 # GPU during inference (watts) +SYSTEM_IDLE_WATTS=45 # Whole system idle (watts) +SYSTEM_INFERENCE_WATTS=65 # Whole system during inference (watts) + +# Billing +ELECTRICITY_RATE=0.15 # $/kWh +BILLING_MODE=marginal # "marginal" (only extra watts) or "dedicated" (all uptime) +BASE_RATE_PER_HOUR=0.00 # $/hr base
(dedicated mode only) +SPENDING_CAP=10.00 # $ before gateway stops accepting + +# Labor & profit +LABOR_RATE_PER_HOUR=0.00 # $/hr for setup/maintenance time +PROFIT_MARGIN=0.00 # Markup multiplier (0.10 = 10%) + +# Features +ALLOW_MODEL_UPDATES=false # Allow remote model push via /admin/update-model diff --git a/gateway.py b/gateway.py index 45375d9..f60d5e0 100644 --- a/gateway.py +++ b/gateway.py @@ -27,11 +27,52 @@ import requests OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") LISTEN_PORT = int(os.environ.get("GATEWAY_PORT", "8434")) API_KEY = os.environ.get("API_KEY", "mk_mortdecai_default") -ELECTRICITY_RATE = float(os.environ.get("ELECTRICITY_RATE", "0.15")) # $/kWh -GPU_TDP_WATTS = float(os.environ.get("GPU_TDP_WATTS", "54")) # Strix Halo iGPU -SYSTEM_OVERHEAD_WATTS = float(os.environ.get("SYSTEM_OVERHEAD_WATTS", "30")) # CPU/RAM/etc idle draw during inference -SPENDING_CAP = float(os.environ.get("SPENDING_CAP", "10.00")) # $ before refusing requests STATS_FILE = os.environ.get("STATS_FILE", "/var/lib/mortdecai-gateway/stats.json") +CONFIG_FILE = os.environ.get("CONFIG_FILE", "/var/lib/mortdecai-gateway/cost_config.json") + +# Default cost config (overridden by config file or env vars) +_DEFAULT_COST_CONFIG = { + "electricity_rate": 0.15, # $/kWh + "gpu_idle_watts": 15, # GPU at idle + "gpu_load_watts": 54, # GPU during inference + "system_idle_watts": 45, # Whole system idle (CPU/RAM/fans/PSU) + "system_inference_watts": 65, # Whole system during inference + "billing_mode": "marginal", # "marginal" = only extra watts; "dedicated" = all uptime + "base_rate_per_hour": 0.00, # $/hr for keeping machine on (dedicated mode only) + "spending_cap": 10.00, # $ before refusing requests + "labor_rate_per_hour": 0.00, # $/hr for operator's time (setup, maintenance) + "profit_margin": 0.00, # multiplier (0.10 = 10% markup) + "labor_hours_logged": 0.0, # total hours spent on setup/maintenance +} + +def _load_cost_config(): + config = 
dict(_DEFAULT_COST_CONFIG) + # Override from file + try: + with open(CONFIG_FILE) as f: + config.update(json.load(f)) + except: + pass + # Override from env vars + for key in _DEFAULT_COST_CONFIG: + env_key = key.upper() + val = os.environ.get(env_key) + if val is not None: + try: + config[key] = type(_DEFAULT_COST_CONFIG[key])(val) + except: + pass + return config + +def _save_cost_config(config): + try: + os.makedirs(os.path.dirname(CONFIG_FILE), exist_ok=True) + with open(CONFIG_FILE, "w") as f: + json.dump(config, f, indent=2) + except: + pass + +COST_CONFIG = _load_cost_config() # --- Stats tracking --- _stats_lock = threading.Lock() @@ -67,25 +108,49 @@ def _save_stats(): pass +def _calc_marginal_cost(duration_seconds): + """Calculate marginal electricity cost for an inference call.""" + c = COST_CONFIG + if c["billing_mode"] == "marginal": + # Only charge for extra watts above idle + marginal_gpu = c["gpu_load_watts"] - c["gpu_idle_watts"] + marginal_system = c["system_inference_watts"] - c["system_idle_watts"] + marginal_watts = marginal_gpu + marginal_system + else: + # Dedicated: charge for full system draw during inference + marginal_watts = c["gpu_load_watts"] + c["system_inference_watts"] + + energy_wh = (marginal_watts * duration_seconds) / 3600 + electricity_cost = (energy_wh / 1000) * c["electricity_rate"] + # Apply profit margin + cost = electricity_cost * (1 + c["profit_margin"]) + return marginal_watts, energy_wh, cost + + def _track_request(tokens_in, tokens_out, duration_seconds): """Track a completed inference request.""" + marginal_watts, energy_wh, cost = _calc_marginal_cost(duration_seconds) + with _stats_lock: _stats["total_requests"] += 1 _stats["total_tokens_in"] += tokens_in _stats["total_tokens_out"] += tokens_out _stats["total_inference_seconds"] += duration_seconds _stats["last_request_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ") - - # Power calculation - # GPU draws TDP watts during inference, plus system overhead - total_watts = 
GPU_TDP_WATTS + SYSTEM_OVERHEAD_WATTS - energy_wh = (total_watts * duration_seconds) / 3600 - cost = (energy_wh / 1000) * ELECTRICITY_RATE - _stats["total_energy_wh"] += energy_wh _stats["total_cost"] += cost + _stats["total_marginal_watts_avg"] = ( + _stats.get("total_marginal_watts_avg", marginal_watts) * 0.95 + marginal_watts * 0.05 + ) + + # Base rate for dedicated mode + if COST_CONFIG["billing_mode"] == "dedicated" and COST_CONFIG["base_rate_per_hour"] > 0: + # Add base rate proportional to time since last request + last = _stats.get("_last_base_calc", time.time()) + elapsed_hours = (time.time() - last) / 3600 + _stats["total_cost"] += COST_CONFIG["base_rate_per_hour"] * elapsed_hours + _stats["_last_base_calc"] = time.time() - # Save every 10 requests if _stats["total_requests"] % 10 == 0: _save_stats() @@ -93,7 +158,7 @@ def _track_request(tokens_in, tokens_out, duration_seconds): def _check_budget(): """Returns True if under spending cap.""" with _stats_lock: - return _stats["total_cost"] < SPENDING_CAP + return _stats["total_cost"] < COST_CONFIG["spending_cap"] def _get_gpu_utilization(): @@ -190,12 +255,15 @@ class GatewayHandler(BaseHTTPRequestHandler): # Add gateway metadata to response if isinstance(data, dict): + mw, ewh, ecost = _calc_marginal_cost(duration) data["_gateway"] = { "duration_seconds": round(duration, 2), - "energy_wh": round((GPU_TDP_WATTS + SYSTEM_OVERHEAD_WATTS) * duration / 3600, 4), - "estimated_cost": round(((GPU_TDP_WATTS + SYSTEM_OVERHEAD_WATTS) * duration / 3600 / 1000) * ELECTRICITY_RATE, 6), + "marginal_watts": round(mw, 1), + "energy_wh": round(ewh, 4), + "estimated_cost": round(ecost, 6), "total_cost": round(_stats["total_cost"], 4), - "budget_remaining": round(SPENDING_CAP - _stats["total_cost"], 4), + "budget_remaining": round(COST_CONFIG["spending_cap"] - _stats["total_cost"], 4), + "billing_mode": COST_CONFIG["billing_mode"], } self._send_json(r.status_code, data) @@ -225,17 +293,18 @@ class 
GatewayHandler(BaseHTTPRequestHandler): return gpu = _get_gpu_utilization() with _stats_lock: - stats_copy = dict(_stats) + stats_copy = {k: v for k, v in _stats.items() if not k.startswith("_")} stats_copy["gpu"] = gpu - stats_copy["config"] = { - "gpu_tdp_watts": GPU_TDP_WATTS, - "system_overhead_watts": SYSTEM_OVERHEAD_WATTS, - "electricity_rate": ELECTRICITY_RATE, - "spending_cap": SPENDING_CAP, - } + stats_copy["cost_config"] = COST_CONFIG self._send_json(200, stats_copy) return + if parsed.path == "/config": + if not self._check_auth(): + return + self._send_json(200, COST_CONFIG) + return + if parsed.path == "/dashboard": self._serve_dashboard() return @@ -252,6 +321,16 @@ class GatewayHandler(BaseHTTPRequestHandler): length = int(self.headers.get("Content-Length", 0)) body = json.loads(self.rfile.read(length)) if length > 0 else None + # Config update endpoint — adjust cost parameters live + if self.path == "/config" and body: + global COST_CONFIG + for key in body: + if key in COST_CONFIG: + COST_CONFIG[key] = type(_DEFAULT_COST_CONFIG.get(key, ""))(body[key]) + _save_cost_config(COST_CONFIG) + self._send_json(200, {"status": "updated", "config": COST_CONFIG}) + return + # Model update endpoint — downloads new GGUF and reloads if self.path == "/admin/update-model" and body: self._handle_model_update(body) @@ -299,31 +378,75 @@ class GatewayHandler(BaseHTTPRequestHandler): def _serve_dashboard(self): """Simple HTML dashboard showing usage stats.""" with _stats_lock: - s = dict(_stats) + s = {k: v for k, v in _stats.items() if not k.startswith("_")} gpu = _get_gpu_utilization() + c = COST_CONFIG + marginal_w = (c["gpu_load_watts"] - c["gpu_idle_watts"]) + (c["system_inference_watts"] - c["system_idle_watts"]) + active = _check_budget() + avg_cost_per_req = s["total_cost"] / max(s["total_requests"], 1) + reqs_remaining = int((c["spending_cap"] - s["total_cost"]) / max(avg_cost_per_req, 0.000001)) if avg_cost_per_req > 0 else "∞" html = f""" Mortdecai Gateway

Mortdecai Gateway

-
Status: {"ACTIVE" if _check_budget() else "PAUSED (cap reached)"}
-
Total Requests: {s['total_requests']}
-
Tokens (in/out): {s['total_tokens_in']:,} / {s['total_tokens_out']:,}
-
Inference Time: {s['total_inference_seconds']:.0f}s
-
Energy Used: {s['total_energy_wh']:.1f} Wh
-
Estimated Cost: ${s['total_cost']:.4f} / ${SPENDING_CAP:.2f}
-
Rejected (over cap): {s['requests_rejected']}
-
GPU Utilization: {gpu['utilization']}% ({gpu['source']})
-
GPU Temperature: {gpu['temperature']}°C
-
Last Request: {s['last_request_at'] or 'never'}
-
Config: TDP={GPU_TDP_WATTS}W + {SYSTEM_OVERHEAD_WATTS}W overhead @ ${ELECTRICITY_RATE}/kWh
+ +
Status +{'● ACTIVE' if active else '● PAUSED (cap reached)'}
+ +

Usage

+
Requests{s['total_requests']:,}
+
Tokens (in / out){s['total_tokens_in']:,} / {s['total_tokens_out']:,}
+
Inference Time{s['total_inference_seconds']:.0f}s ({s['total_inference_seconds']/3600:.1f}h)
+
Avg per Request{s['total_inference_seconds']/max(s['total_requests'],1):.1f}s, {s['total_tokens_out']//max(s['total_requests'],1)} tokens
+
Rejected (cap){s['requests_rejected']}
+
Last Request{s['last_request_at'] or 'never'}
+ +

Cost

+
+
Spent${s['total_cost']:.4f}
+
Budget${c['spending_cap']:.2f}
+
Remaining${c['spending_cap'] - s['total_cost']:.4f} (~{reqs_remaining} requests)
+
Avg Cost/Request${avg_cost_per_req:.6f}
+
Energy Used{s['total_energy_wh']:.1f} Wh ({s['total_energy_wh']/1000:.4f} kWh)
+ +

Labor & Profit

+
Labor Rate${c['labor_rate_per_hour']:.2f}/hr
+
Hours Logged{c['labor_hours_logged']:.1f}h
+
Labor Cost${c['labor_rate_per_hour'] * c['labor_hours_logged']:.2f}
+
Profit Margin{c['profit_margin']*100:.0f}%
+
Total Owed (electricity + labor + margin)${s['total_cost'] + c['labor_rate_per_hour'] * c['labor_hours_logged']:.4f}
+ +

Power Model

+
Billing Mode{c['billing_mode']}
+
GPU (idle → load){c['gpu_idle_watts']}W → {c['gpu_load_watts']}W
+
System (idle → load){c['system_idle_watts']}W → {c['system_inference_watts']}W
+
Marginal Draw{marginal_w}W per inference call
+
Electricity Rate${c['electricity_rate']}/kWh
+{'
Base Rate$' + f"{c['base_rate_per_hour']:.3f}" + '/hr
' if c['billing_mode'] == 'dedicated' else ''} + +

GPU

+
Utilization{gpu['utilization']}%
+
Temperature{gpu['temperature']}°C
+
Power Draw{gpu['power_watts']}W
+
Source{gpu['source']}
+ +

+Config: GET /config | Update: POST /config | Stats: GET /stats (auth required) +

""" self.send_response(200)