API cascade: Haiku → Gemini → local model, full POS status reports
- _llm_call routes through cascade: Haiku ($20) → Gemini Flash Lite ($20) → Ollama fallback
- Gemini API call function with persistent cost tracking
- Full status report on POS printer at each $1 milestone (cost, audit counts, services)
- Prayer title flash: "Your prayers have been answered!"

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+153
-23
@@ -1988,13 +1988,36 @@ def build_message_system_prompt(config) -> str:
|
||||
def _llm_call(model: str, system: str, user: str, config: dict,
|
||||
fmt = None, temperature: float = 0.85,
|
||||
max_tokens: int = 400, timeout: int = 60) -> str:
|
||||
"""LLM call — routes to Anthropic API or Ollama based on config."""
|
||||
"""LLM call — routes based on config. Dev server cascades: Haiku → Gemini → Ollama fallback."""
|
||||
provider = config.get("llm_provider", "ollama")
|
||||
|
||||
if provider == "anthropic":
|
||||
return _anthropic_call(model, system, user, config, temperature, max_tokens, timeout)
|
||||
haiku_budget = config.get("anthropic_budget", 20.00)
|
||||
gemini_budget = config.get("gemini_budget", 20.00)
|
||||
|
||||
# Default: Ollama
|
||||
# Stage 1: Haiku until budget exhausted
|
||||
if _get_anthropic_cost() < haiku_budget:
|
||||
return _anthropic_call(model, system, user, config, temperature, max_tokens, timeout)
|
||||
|
||||
# Stage 2: Gemini until its budget exhausted
|
||||
if _get_gemini_cost() < gemini_budget:
|
||||
log.info("Haiku budget exhausted, using Gemini Flash Lite")
|
||||
return _gemini_call(system, user, config, temperature, max_tokens, timeout)
|
||||
|
||||
# Stage 3: Fall back to local Ollama model
|
||||
log.info("All API budgets exhausted, falling back to Ollama")
|
||||
fallback = config.get("fallback_model", "qwen3-8b-mc-lora-v3")
|
||||
payload = {
|
||||
"model": fallback,
|
||||
"messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
|
||||
"stream": False,
|
||||
"options": {"temperature": temperature, "num_predict": max_tokens},
|
||||
}
|
||||
r = requests.post(f"{config['ollama_url']}/api/chat", json=payload, timeout=timeout)
|
||||
r.raise_for_status()
|
||||
return r.json()["message"]["content"]
|
||||
|
||||
# Default: Ollama (prod servers use this path)
|
||||
payload = {
|
||||
"model": model,
|
||||
"messages": [
|
||||
@@ -2014,6 +2037,74 @@ def _llm_call(model: str, system: str, user: str, config: dict,
|
||||
return r.json()["message"]["content"]
|
||||
|
||||
|
||||
# --- Gemini API cost tracking and call ---
|
||||
|
||||
_gemini_cost_lock = threading.Lock()
|
||||
_gemini_cost_file = "/var/log/mc_gemini_cost.json"
|
||||
|
||||
def _load_gemini_cost() -> float:
    """Return the persisted cumulative Gemini spend in dollars.

    Reads the ``total_cost`` field of the JSON document stored at
    ``_gemini_cost_file``. This runs at import time to seed
    ``_gemini_total_cost``, so it must never raise: a missing, unreadable,
    or malformed file simply yields ``0.0``.

    Returns:
        The stored total as a float, or ``0.0`` when nothing usable is stored.
    """
    try:
        with open(_gemini_cost_file, encoding="utf-8") as f:
            stored = json.load(f)
        # Coerce so that a stray string/int in the file cannot break the
        # later ``_gemini_total_cost += cost`` arithmetic.
        return float(stored.get("total_cost", 0.0))
    except (OSError, ValueError, TypeError, AttributeError):
        # OSError: file missing or unreadable.
        # ValueError: invalid JSON / bad encoding / non-numeric string.
        # TypeError: total_cost is null or a container.
        # AttributeError: top-level JSON value is not an object.
        return 0.0
|
||||
|
||||
_gemini_total_cost = _load_gemini_cost()
|
||||
|
||||
def _save_gemini_cost() -> None:
    """Write the running Gemini spend to ``_gemini_cost_file`` as JSON.

    The document has two keys: ``total_cost`` (the current value of
    ``_gemini_total_cost``) and ``updated`` (a UTC timestamp).

    Persistence is best-effort: a failure to write must not abort the LLM
    call that triggered it, so errors are logged and swallowed rather than
    propagated.
    """
    record = {
        "total_cost": _gemini_total_cost,
        # The trailing "Z" means UTC, so the timestamp has to be built from
        # gmtime(); strftime() without a time tuple formats *local* time.
        "updated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    try:
        with open(_gemini_cost_file, "w", encoding="utf-8") as f:
            json.dump(record, f)
    except (OSError, TypeError, ValueError) as exc:
        # Surface the failure: if the total is never persisted, the budget
        # counter restarts from the last saved value on the next launch.
        log.warning("Could not persist Gemini cost to %s: %s",
                    _gemini_cost_file, exc)
|
||||
|
||||
def _get_gemini_cost():
    """Return the current value of ``_gemini_total_cost``.

    The read happens while holding ``_gemini_cost_lock``, the same lock
    ``_gemini_call`` holds while updating the total.
    """
    _gemini_cost_lock.acquire()
    try:
        return _gemini_total_cost
    finally:
        _gemini_cost_lock.release()
|
||||
|
||||
def _gemini_response_text(data: dict) -> str:
    """Extract the generated text from a ``generateContent`` response dict.

    Returns ``""`` when the response carries no candidates or no text parts
    (for example when ``candidates`` is an empty list), instead of raising
    ``IndexError``.
    """
    candidates = data.get("candidates") or []
    if not candidates:
        return ""
    content = candidates[0].get("content") or {}
    parts = content.get("parts") or []
    # Concatenate every text part of the first candidate; parts flagged as
    # model "thought" output are not part of the answer and are skipped.
    return "".join(
        part.get("text") or ""
        for part in parts
        if not part.get("thought")
    )


def _gemini_call(system: str, user: str, config: dict,
                 temperature: float = 0.85, max_tokens: int = 400,
                 timeout: int = 60) -> str:
    """Call the Gemini ``generateContent`` REST endpoint and track its cost.

    Args:
        system: System prompt; sent in the same text part as ``user``,
            prefixed with ``SYSTEM:``.
        user: User prompt, prefixed with ``USER:``.
        config: Settings dict. Keys read here: ``gemini_api_key``,
            ``gemini_model``, ``gemini_budget`` (used only in the milestone
            log line), and the optional price overrides
            ``gemini_input_cost_per_mtok`` / ``gemini_output_cost_per_mtok``
            (dollars per million tokens).
        temperature: Sampling temperature forwarded as ``temperature``.
        max_tokens: Forwarded as ``maxOutputTokens``.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The generated text, or ``""`` if the response contains none.

    Raises:
        requests.HTTPError: when the API answers with an error status
            (via ``raise_for_status``). Other ``requests`` exceptions from
            the POST propagate unchanged.
    """
    global _gemini_total_cost

    # NOTE(review): a credential literal as the default means a key lives in
    # source control; prefer always supplying ``gemini_api_key`` via config.
    api_key = config.get("gemini_api_key", "REDACTED_GEMINI_KEY_2")
    model = config.get("gemini_model", "gemini-2.5-flash-lite")
    budget = config.get("gemini_budget", 20.00)

    # Dollars per million tokens. The defaults are the rates this code has
    # always assumed; NOTE(review): confirm them against the published
    # pricing for the configured model.
    input_rate = config.get("gemini_input_cost_per_mtok", 0.075)
    output_rate = config.get("gemini_output_cost_per_mtok", 0.30)

    # The key travels in a header rather than the query string so that it
    # does not end up in HTTPError messages or request logs, both of which
    # include the full URL.
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
    headers = {"x-goog-api-key": api_key}

    body = {
        "contents": [{"parts": [{"text": f"SYSTEM: {system}\n\nUSER: {user}"}]}],
        "generationConfig": {
            "temperature": temperature,
            "maxOutputTokens": max_tokens,
        },
    }

    r = requests.post(url, headers=headers, json=body, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    text = _gemini_response_text(data)

    # Estimate the cost of this call; when the response omits token counts,
    # fall back to fixed guesses of 500 input / 150 output tokens.
    usage = data.get("usageMetadata") or {}
    input_tokens = usage.get("promptTokenCount", 500)
    output_tokens = usage.get("candidatesTokenCount", 150)
    cost = (input_tokens / 1_000_000) * input_rate + (output_tokens / 1_000_000) * output_rate

    with _gemini_cost_lock:
        prev_dollar = int(_gemini_total_cost)
        _gemini_total_cost += cost
        _save_gemini_cost()
        curr_dollar = int(_gemini_total_cost)
        # Log once each time the running total crosses a whole-dollar mark.
        if curr_dollar > prev_dollar:
            log.info("Gemini cost milestone: $%.4f / $%.2f",
                     _gemini_total_cost, budget)

    return text
|
||||
|
||||
|
||||
# --- Anthropic API cost tracking ---
|
||||
|
||||
_anthropic_cost_lock = threading.Lock()
|
||||
@@ -2048,20 +2139,6 @@ def _anthropic_call(model: str, system: str, user: str, config: dict,
|
||||
api_key = config.get("anthropic_api_key", "")
|
||||
budget = config.get("anthropic_budget", 5.00)
|
||||
|
||||
with _anthropic_cost_lock:
|
||||
if _anthropic_total_cost >= budget:
|
||||
log.warning(f"Anthropic budget exhausted (${_anthropic_total_cost:.4f} >= ${budget:.2f}). Falling back to Ollama.")
|
||||
# Fall back to Ollama
|
||||
payload = {
|
||||
"model": config.get("fallback_model", config.get("model", "gemma3n:e4b")),
|
||||
"messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
|
||||
"stream": False,
|
||||
"options": {"temperature": temperature, "num_predict": max_tokens},
|
||||
}
|
||||
r = requests.post(f"{config['ollama_url']}/api/chat", json=payload, timeout=timeout)
|
||||
r.raise_for_status()
|
||||
return r.json()["message"]["content"]
|
||||
|
||||
headers = {
|
||||
"x-api-key": api_key,
|
||||
"anthropic-version": "2023-06-01",
|
||||
@@ -2092,18 +2169,71 @@ def _anthropic_call(model: str, system: str, user: str, config: dict,
|
||||
curr_dollar = int(_anthropic_total_cost)
|
||||
if curr_dollar > prev_dollar:
|
||||
log.info(f"Anthropic cost milestone: ${_anthropic_total_cost:.4f} / ${budget:.2f}")
|
||||
# Print to POS printer
|
||||
# Print full status report to POS printer
|
||||
try:
|
||||
import socket as _sock
|
||||
import subprocess as _sp
|
||||
from escpos.printer import Dummy as _Dummy
|
||||
_p = _Dummy(profile="default")
|
||||
_p.set(font='b', align='center', bold=True)
|
||||
_p.text("MC AI TRAINING COST\n")
|
||||
_cols = 57
|
||||
|
||||
_p.set(font='b', align='center', bold=True, height=2)
|
||||
_p.text("MC AI TRAINING\n")
|
||||
_p.set(font='b', align='center', bold=True, height=1)
|
||||
_p.text("STATUS REPORT\n")
|
||||
_p.set(font='b', align='center', bold=False)
|
||||
_p.text(f"${_anthropic_total_cost:.4f} / ${budget:.2f}\n")
|
||||
_p.text(f"{time.strftime('%Y-%m-%d %H:%M')}\n")
|
||||
_p.text("=" * 57 + "\n")
|
||||
_p.text(time.strftime("%Y-%m-%d %H:%M") + "\n")
|
||||
_p.text("=" * _cols + "\n")
|
||||
|
||||
# Cost
|
||||
_p.set(font='b', align='left', bold=True)
|
||||
_p.text("CLAUDE HAIKU API\n")
|
||||
_p.set(font='b', align='left', bold=True)
|
||||
_p.text(f" Spent: ${_anthropic_total_cost:.4f}\n")
|
||||
_p.set(font='b', align='left', bold=False)
|
||||
_p.text(f" Budget: ${budget:.2f}\n")
|
||||
_p.text(f" Remaining: ${budget - _anthropic_total_cost:.4f}\n")
|
||||
_p.text("-" * _cols + "\n")
|
||||
|
||||
# Audit log counts
|
||||
try:
|
||||
def _wc(path):
|
||||
try:
|
||||
with open(path) as _f:
|
||||
return sum(1 for _ in _f)
|
||||
except: return 0
|
||||
_dev = _wc("/var/log/mc_training_audit_dev.jsonl")
|
||||
_prod = _wc("/var/log/mc_training_audit.jsonl")
|
||||
_shrink = _wc("/var/log/mc_training_audit_shrink.jsonl")
|
||||
_p.set(font='b', align='left', bold=True)
|
||||
_p.text("TRAINING DATA\n")
|
||||
_p.set(font='b', align='left', bold=False)
|
||||
_p.text(f" Dev audit: {_dev}\n")
|
||||
_p.text(f" Prod audit: {_prod}\n")
|
||||
_p.text(f" Shrink audit: {_shrink}\n")
|
||||
_p.text(f" Total pending: {_dev + _prod + _shrink}\n")
|
||||
_p.text("-" * _cols + "\n")
|
||||
except: pass
|
||||
|
||||
# Services
|
||||
try:
|
||||
_p.set(font='b', align='left', bold=True)
|
||||
_p.text("SERVICES\n")
|
||||
_p.set(font='b', align='left', bold=False)
|
||||
for _svc in ["mc-aigod-paper", "mc-aigod-dev", "mc-aigod"]:
|
||||
try:
|
||||
_r = _sp.run(["systemctl", "is-active", f"{_svc}.service"], capture_output=True, text=True, timeout=3)
|
||||
_st = "OK" if _r.stdout.strip() == "active" else "DOWN"
|
||||
except: _st = "?"
|
||||
_p.text(f" {_svc:24} [{_st}]\n")
|
||||
_p.text("-" * _cols + "\n")
|
||||
except: pass
|
||||
|
||||
_p.set(font='b', align='center', bold=False)
|
||||
_p.text(f"${curr_dollar} milestone\n")
|
||||
_p.text("=" * _cols + "\n")
|
||||
_p.cut()
|
||||
|
||||
with _sock.create_connection(("192.168.0.137", 9100), timeout=5) as _s:
|
||||
_s.sendall(_p.output)
|
||||
except Exception as _pe:
|
||||
|
||||
Reference in New Issue
Block a user