API cascade: Haiku → Gemini → local model, full POS status reports

- _llm_call routes through cascade: Haiku ($20) → Gemini Flash Lite ($20) → Ollama fallback
- Gemini API call function with persistent cost tracking
- Full status report on POS printer at each $1 milestone (cost, audit counts, services)
- Prayer title flash: "Your prayers have been answered!"

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 04:55:59 -04:00
parent 616aab7cf4
commit 6f585e0021
1 changed file with 153 additions and 23 deletions
@@ -1988,13 +1988,36 @@ def build_message_system_prompt(config) -> str:
 def _llm_call(model: str, system: str, user: str, config: dict,
              fmt = None, temperature: float = 0.85,
              max_tokens: int = 400, timeout: int = 60) -> str:
-    """LLM call — routes to Anthropic API or Ollama based on config."""
+    """LLM call — routes based on config. Dev server cascades: Haiku → Gemini → Ollama fallback."""
    provider = config.get("llm_provider", "ollama")

    if provider == "anthropic":
-        return _anthropic_call(model, system, user, config, temperature, max_tokens, timeout)
+        haiku_budget = config.get("anthropic_budget", 20.00)
+        gemini_budget = config.get("gemini_budget", 20.00)

-    # Default: Ollama
+        # Stage 1: Haiku until budget exhausted
+        if _get_anthropic_cost() < haiku_budget:
+            return _anthropic_call(model, system, user, config, temperature, max_tokens, timeout)
+
+        # Stage 2: Gemini until its budget exhausted
+        if _get_gemini_cost() < gemini_budget:
+            log.info("Haiku budget exhausted, using Gemini Flash Lite")
+            return _gemini_call(system, user, config, temperature, max_tokens, timeout)
+
+        # Stage 3: Fall back to local Ollama model
+        log.info("All API budgets exhausted, falling back to Ollama")
+        fallback = config.get("fallback_model", "qwen3-8b-mc-lora-v3")
+        payload = {
+            "model": fallback,
+            "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
+            "stream": False,
+            "options": {"temperature": temperature, "num_predict": max_tokens},
+        }
+        r = requests.post(f"{config['ollama_url']}/api/chat", json=payload, timeout=timeout)
+        r.raise_for_status()
+        return r.json()["message"]["content"]
+
+    # Default: Ollama (prod servers use this path)
    payload = {
        "model": model,
        "messages": [
@@ -2014,6 +2037,74 @@ def _llm_call(model: str, system: str, user: str, config: dict,
    return r.json()["message"]["content"]


+# --- Gemini API cost tracking and call ---
+
+_gemini_cost_lock = threading.Lock()
+_gemini_cost_file = "/var/log/mc_gemini_cost.json"
+
+def _load_gemini_cost():
+    try:
+        with open(_gemini_cost_file) as f:
+            return json.load(f).get("total_cost", 0.0)
+    except:
+        return 0.0
+
+_gemini_total_cost = _load_gemini_cost()
+
+def _save_gemini_cost():
+    try:
+        with open(_gemini_cost_file, "w") as f:
+            json.dump({"total_cost": _gemini_total_cost, "updated": time.strftime("%Y-%m-%dT%H:%M:%SZ")}, f)
+    except:
+        pass
+
+def _get_gemini_cost():
+    with _gemini_cost_lock:
+        return _gemini_total_cost
+
+def _gemini_call(system: str, user: str, config: dict,
+                 temperature: float = 0.85, max_tokens: int = 400,
+                 timeout: int = 60) -> str:
+    """Call Gemini Flash Lite API. Tracks cost."""
+    global _gemini_total_cost
+
+    api_key = config.get("gemini_api_key", "REDACTED_GEMINI_KEY_2")
+    model = config.get("gemini_model", "gemini-2.5-flash-lite")
+    budget = config.get("gemini_budget", 20.00)
+
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"
+
+    body = {
+        "contents": [{"parts": [{"text": f"SYSTEM: {system}\n\nUSER: {user}"}]}],
+        "generationConfig": {
+            "temperature": temperature,
+            "maxOutputTokens": max_tokens,
+        },
+    }
+
+    r = requests.post(url, json=body, timeout=timeout)
+    r.raise_for_status()
+    data = r.json()
+
+    text = data.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "")
+
+    # Estimate cost (Gemini Flash Lite: $0.075/M input, $0.30/M output)
+    usage = data.get("usageMetadata", {})
+    input_tokens = usage.get("promptTokenCount", 500)
+    output_tokens = usage.get("candidatesTokenCount", 150)
+    cost = (input_tokens / 1_000_000) * 0.075 + (output_tokens / 1_000_000) * 0.30
+
+    with _gemini_cost_lock:
+        prev_dollar = int(_gemini_total_cost)
+        _gemini_total_cost += cost
+        _save_gemini_cost()
+        curr_dollar = int(_gemini_total_cost)
+        if curr_dollar > prev_dollar:
+            log.info(f"Gemini cost milestone: ${_gemini_total_cost:.4f} / ${budget:.2f}")
+
+    return text
+
+
 # --- Anthropic API cost tracking ---

 _anthropic_cost_lock = threading.Lock()
@@ -2048,20 +2139,6 @@ def _anthropic_call(model: str, system: str, user: str, config: dict,
    api_key = config.get("anthropic_api_key", "")
    budget = config.get("anthropic_budget", 5.00)

-    with _anthropic_cost_lock:
-        if _anthropic_total_cost >= budget:
-            log.warning(f"Anthropic budget exhausted (${_anthropic_total_cost:.4f} >= ${budget:.2f}). Falling back to Ollama.")
-            # Fall back to Ollama
-            payload = {
-                "model": config.get("fallback_model", config.get("model", "gemma3n:e4b")),
-                "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
-                "stream": False,
-                "options": {"temperature": temperature, "num_predict": max_tokens},
-            }
-            r = requests.post(f"{config['ollama_url']}/api/chat", json=payload, timeout=timeout)
-            r.raise_for_status()
-            return r.json()["message"]["content"]
-
    headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
@@ -2092,18 +2169,71 @@ def _anthropic_call(model: str, system: str, user: str, config: dict,
        curr_dollar = int(_anthropic_total_cost)
        if curr_dollar > prev_dollar:
            log.info(f"Anthropic cost milestone: ${_anthropic_total_cost:.4f} / ${budget:.2f}")
-            # Print to POS printer
+            # Print full status report to POS printer
            try:
                import socket as _sock
+                import subprocess as _sp
                from escpos.printer import Dummy as _Dummy
                _p = _Dummy(profile="default")
-                _p.set(font='b', align='center', bold=True)
-                _p.text("MC AI TRAINING COST\n")
+                _cols = 57
+
+                _p.set(font='b', align='center', bold=True, height=2)
+                _p.text("MC AI TRAINING\n")
+                _p.set(font='b', align='center', bold=True, height=1)
+                _p.text("STATUS REPORT\n")
                _p.set(font='b', align='center', bold=False)
-                _p.text(f"${_anthropic_total_cost:.4f} / ${budget:.2f}\n")
-                _p.text(f"{time.strftime('%Y-%m-%d %H:%M')}\n")
-                _p.text("=" * 57 + "\n")
+                _p.text(time.strftime("%Y-%m-%d %H:%M") + "\n")
+                _p.text("=" * _cols + "\n")
+
+                # Cost
+                _p.set(font='b', align='left', bold=True)
+                _p.text("CLAUDE HAIKU API\n")
+                _p.set(font='b', align='left', bold=True)
+                _p.text(f"  Spent:          ${_anthropic_total_cost:.4f}\n")
+                _p.set(font='b', align='left', bold=False)
+                _p.text(f"  Budget:         ${budget:.2f}\n")
+                _p.text(f"  Remaining:      ${budget - _anthropic_total_cost:.4f}\n")
+                _p.text("-" * _cols + "\n")
+
+                # Audit log counts
+                try:
+                    def _wc(path):
+                        try:
+                            with open(path) as _f:
+                                return sum(1 for _ in _f)
+                        except: return 0
+                    _dev = _wc("/var/log/mc_training_audit_dev.jsonl")
+                    _prod = _wc("/var/log/mc_training_audit.jsonl")
+                    _shrink = _wc("/var/log/mc_training_audit_shrink.jsonl")
+                    _p.set(font='b', align='left', bold=True)
+                    _p.text("TRAINING DATA\n")
+                    _p.set(font='b', align='left', bold=False)
+                    _p.text(f"  Dev audit:      {_dev}\n")
+                    _p.text(f"  Prod audit:     {_prod}\n")
+                    _p.text(f"  Shrink audit:   {_shrink}\n")
+                    _p.text(f"  Total pending:  {_dev + _prod + _shrink}\n")
+                    _p.text("-" * _cols + "\n")
+                except: pass
+
+                # Services
+                try:
+                    _p.set(font='b', align='left', bold=True)
+                    _p.text("SERVICES\n")
+                    _p.set(font='b', align='left', bold=False)
+                    for _svc in ["mc-aigod-paper", "mc-aigod-dev", "mc-aigod"]:
+                        try:
+                            _r = _sp.run(["systemctl", "is-active", f"{_svc}.service"], capture_output=True, text=True, timeout=3)
+                            _st = "OK" if _r.stdout.strip() == "active" else "DOWN"
+                        except: _st = "?"
+                        _p.text(f"  {_svc:24} [{_st}]\n")
+                    _p.text("-" * _cols + "\n")
+                except: pass
+
+                _p.set(font='b', align='center', bold=False)
+                _p.text(f"${curr_dollar} milestone\n")
+                _p.text("=" * _cols + "\n")
                _p.cut()
+
                with _sock.create_connection(("192.168.0.137", 9100), timeout=5) as _s:
                    _s.sendall(_p.output)
            except Exception as _pe: