diff --git a/mc_aigod_paper.py b/mc_aigod_paper.py
index 196105d..ce092ee 100644
--- a/mc_aigod_paper.py
+++ b/mc_aigod_paper.py
@@ -1988,13 +1988,36 @@ def build_message_system_prompt(config) -> str:
 def _llm_call(model: str, system: str, user: str, config: dict,
               fmt = None, temperature: float = 0.85, max_tokens: int = 400,
               timeout: int = 60) -> str:
-    """LLM call — routes to Anthropic API or Ollama based on config."""
+    """LLM call — routes based on config. Dev server cascades: Haiku → Gemini → Ollama fallback."""
     provider = config.get("llm_provider", "ollama")
 
     if provider == "anthropic":
-        return _anthropic_call(model, system, user, config, temperature, max_tokens, timeout)
+        haiku_budget = config.get("anthropic_budget", 20.00)
+        gemini_budget = config.get("gemini_budget", 20.00)
 
-    # Default: Ollama
+        # Stage 1: Haiku until budget exhausted
+        if _get_anthropic_cost() < haiku_budget:
+            return _anthropic_call(model, system, user, config, temperature, max_tokens, timeout)
+
+        # Stage 2: Gemini (only when a key is configured) until its budget exhausted
+        if config.get("gemini_api_key") and _get_gemini_cost() < gemini_budget:
+            log.info("Haiku budget exhausted, using Gemini Flash Lite")
+            return _gemini_call(system, user, config, temperature, max_tokens, timeout)
+
+        # Stage 3: Fall back to local Ollama model
+        log.info("All API budgets exhausted, falling back to Ollama")
+        fallback = config.get("fallback_model", "qwen3-8b-mc-lora-v3")
+        payload = {
+            "model": fallback,
+            "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
+            "stream": False,
+            "options": {"temperature": temperature, "num_predict": max_tokens},
+        }
+        r = requests.post(f"{config['ollama_url']}/api/chat", json=payload, timeout=timeout)
+        r.raise_for_status()
+        return r.json()["message"]["content"]
+
+    # Default: Ollama (prod servers use this path)
     payload = {
         "model": model,
         "messages": [
@@ -2014,6 +2037,95 @@ def _llm_call(model: str, system: str, user: str, config: dict,
     return r.json()["message"]["content"]
 
 
+# --- Gemini API cost tracking and call ---
+
+_gemini_cost_lock = threading.Lock()
+_gemini_cost_file = "/var/log/mc_gemini_cost.json"
+
+def _load_gemini_cost():
+    """Return the persisted cumulative Gemini spend (USD); 0.0 if it cannot be read."""
+    try:
+        with open(_gemini_cost_file) as f:
+            return float(json.load(f).get("total_cost", 0.0))
+    except (OSError, ValueError, TypeError, AttributeError):
+        # Missing, unreadable or malformed file: start counting from zero.
+        return 0.0
+
+_gemini_total_cost = _load_gemini_cost()
+
+def _save_gemini_cost():
+    """Write the running total to disk (best effort); called with _gemini_cost_lock held."""
+    # gmtime() so that the trailing "Z" (UTC) in the timestamp is truthful.
+    updated = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+    try:
+        with open(_gemini_cost_file, "w") as f:
+            json.dump({"total_cost": _gemini_total_cost, "updated": updated}, f)
+    except OSError as e:
+        # Bookkeeping must not fail an LLM call, but the failure should be visible.
+        log.warning(f"Could not write {_gemini_cost_file}: {e}")
+
+def _get_gemini_cost():
+    """Return the cumulative Gemini spend (USD), read under the cost lock."""
+    with _gemini_cost_lock:
+        return _gemini_total_cost
+
+def _gemini_call(system: str, user: str, config: dict,
+                 temperature: float = 0.85, max_tokens: int = 400,
+                 timeout: int = 60) -> str:
+    """Call Gemini Flash Lite API. Tracks cost.
+
+    Returns the first candidate's text, or "" if the response has none.
+    Raises ValueError when config has no "gemini_api_key"; HTTP errors
+    propagate from raise_for_status().
+    """
+    global _gemini_total_cost
+
+    # No built-in fallback key: a credential must not live in the source tree.
+    api_key = config.get("gemini_api_key")
+    if not api_key:
+        raise ValueError("gemini_api_key is not configured")
+    model = config.get("gemini_model", "gemini-2.5-flash-lite")
+    budget = config.get("gemini_budget", 20.00)
+
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
+    # Key goes in a header rather than "?key=...": request URLs show up in
+    # HTTPError messages and access logs.
+    headers = {"x-goog-api-key": api_key}
+
+    body = {
+        "contents": [{"parts": [{"text": f"SYSTEM: {system}\n\nUSER: {user}"}]}],
+        "generationConfig": {
+            "temperature": temperature,
+            "maxOutputTokens": max_tokens,
+        },
+    }
+
+    r = requests.post(url, headers=headers, json=body, timeout=timeout)
+    r.raise_for_status()
+    data = r.json()
+
+    # "candidates"/"parts" may be missing or empty; [0] on an empty list would raise.
+    candidates = data.get("candidates") or [{}]
+    parts = (candidates[0].get("content") or {}).get("parts") or [{}]
+    text = parts[0].get("text", "")
+
+    # Estimate cost (Gemini Flash Lite: $0.075/M input, $0.30/M output)
+    usage = data.get("usageMetadata", {})
+    input_tokens = usage.get("promptTokenCount", 500)
+    output_tokens = usage.get("candidatesTokenCount", 150)
+    cost = (input_tokens / 1_000_000) * 0.075 + (output_tokens / 1_000_000) * 0.30
+
+    with _gemini_cost_lock:
+        prev_dollar = int(_gemini_total_cost)
+        _gemini_total_cost += cost
+        _save_gemini_cost()
+        curr_dollar = int(_gemini_total_cost)
+        if curr_dollar > prev_dollar:
+            log.info(f"Gemini cost milestone: ${_gemini_total_cost:.4f} / ${budget:.2f}")
+
+    return text
+
+
 # --- Anthropic API cost tracking ---
 
 _anthropic_cost_lock = threading.Lock()
@@ -2048,20 +2160,6 @@ def _anthropic_call(model: str, system: str, user: str, config: dict,
     api_key = config.get("anthropic_api_key", "")
     budget = config.get("anthropic_budget", 5.00)
 
-    with _anthropic_cost_lock:
-        if _anthropic_total_cost >= budget:
-            log.warning(f"Anthropic budget exhausted (${_anthropic_total_cost:.4f} >= ${budget:.2f}). Falling back to Ollama.")
-            # Fall back to Ollama
-            payload = {
-                "model": config.get("fallback_model", config.get("model", "gemma3n:e4b")),
-                "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
-                "stream": False,
-                "options": {"temperature": temperature, "num_predict": max_tokens},
-            }
-            r = requests.post(f"{config['ollama_url']}/api/chat", json=payload, timeout=timeout)
-            r.raise_for_status()
-            return r.json()["message"]["content"]
-
     headers = {
         "x-api-key": api_key,
         "anthropic-version": "2023-06-01",
@@ -2092,18 +2190,71 @@ def _anthropic_call(model: str, system: str, user: str, config: dict,
         curr_dollar = int(_anthropic_total_cost)
         if curr_dollar > prev_dollar:
             log.info(f"Anthropic cost milestone: ${_anthropic_total_cost:.4f} / ${budget:.2f}")
-            # Print to POS printer
+            # Print full status report to POS printer
             try:
                 import socket as _sock
+                import subprocess as _sp
                 from escpos.printer import Dummy as _Dummy
                 _p = _Dummy(profile="default")
-                _p.set(font='b', align='center', bold=True)
-                _p.text("MC AI TRAINING COST\n")
+                _cols = 57
+
+                _p.set(font='b', align='center', bold=True, height=2)
+                _p.text("MC AI TRAINING\n")
+                _p.set(font='b', align='center', bold=True, height=1)
+                _p.text("STATUS REPORT\n")
                 _p.set(font='b', align='center', bold=False)
-                _p.text(f"${_anthropic_total_cost:.4f} / ${budget:.2f}\n")
-                _p.text(f"{time.strftime('%Y-%m-%d %H:%M')}\n")
-                _p.text("=" * 57 + "\n")
+                _p.text(time.strftime("%Y-%m-%d %H:%M") + "\n")
+                _p.text("=" * _cols + "\n")
+
+                # Cost
+                _p.set(font='b', align='left', bold=True)
+                _p.text("CLAUDE HAIKU API\n")
+                _p.set(font='b', align='left', bold=True)
+                _p.text(f"  Spent:     ${_anthropic_total_cost:.4f}\n")
+                _p.set(font='b', align='left', bold=False)
+                _p.text(f"  Budget:    ${budget:.2f}\n")
+                _p.text(f"  Remaining: ${budget - _anthropic_total_cost:.4f}\n")
+                _p.text("-" * _cols + "\n")
+
+                # Audit log counts
+                try:
+                    def _wc(path):
+                        try:
+                            with open(path) as _f:
+                                return sum(1 for _ in _f)
+                        except (OSError, ValueError): return 0
+                    _dev = _wc("/var/log/mc_training_audit_dev.jsonl")
+                    _prod = _wc("/var/log/mc_training_audit.jsonl")
+                    _shrink = _wc("/var/log/mc_training_audit_shrink.jsonl")
+                    _p.set(font='b', align='left', bold=True)
+                    _p.text("TRAINING DATA\n")
+                    _p.set(font='b', align='left', bold=False)
+                    _p.text(f"  Dev audit:     {_dev}\n")
+                    _p.text(f"  Prod audit:    {_prod}\n")
+                    _p.text(f"  Shrink audit:  {_shrink}\n")
+                    _p.text(f"  Total pending: {_dev + _prod + _shrink}\n")
+                    _p.text("-" * _cols + "\n")
+                except Exception: pass
+
+                # Services
+                try:
+                    _p.set(font='b', align='left', bold=True)
+                    _p.text("SERVICES\n")
+                    _p.set(font='b', align='left', bold=False)
+                    for _svc in ["mc-aigod-paper", "mc-aigod-dev", "mc-aigod"]:
+                        try:
+                            _r = _sp.run(["systemctl", "is-active", f"{_svc}.service"], capture_output=True, text=True, timeout=3)
+                            _st = "OK" if _r.stdout.strip() == "active" else "DOWN"
+                        except (OSError, _sp.SubprocessError): _st = "?"
+                        _p.text(f"  {_svc:24} [{_st}]\n")
+                    _p.text("-" * _cols + "\n")
+                except Exception: pass
+
+                _p.set(font='b', align='center', bold=False)
+                _p.text(f"${curr_dollar} milestone\n")
+                _p.text("=" * _cols + "\n")
                 _p.cut()
+
                 with _sock.create_connection(("192.168.0.137", 9100), timeout=5) as _s:
                     _s.sendall(_p.output)
             except Exception as _pe: