"""Arm A: Ollama /api/chat with JSON tools. This is the baseline — what mort-bot, OpenWebUI, and every other Ollama client does. Ollama's server translates the OpenAI-style JSON tools array into Gemma's native <|tool>declaration:... tokens and parses the model's <|tool_call>call:... output back into structured tool_calls. This arm measures what we already live with. Think setting: fixed to `false` per round-3 bakeoff finding (26B silently stops on think:true in multi-turn tool loops). For E4B the finding was less load-bearing but we hold think:false constant across arms so only the inference path varies. """ from __future__ import annotations import asyncio import json import time from typing import Any import aiohttp from tasks import SYSTEM_PROMPT, TOOLS, FAKE_HISTORY, execute_tool_stub async def run( *, ollama_url: str, model: str, task_prompt: str, num_ctx: int, num_predict: int, step_budget: int, ) -> dict[str, Any]: messages = [{"role": "system", "content": SYSTEM_PROMPT}] + list(FAKE_HISTORY) messages.append({"role": "user", "content": f"[2026-04-18 14:20] @seth:sethpc.xyz: {task_prompt}"}) trace: dict[str, Any] = { "arm": "ollama-json", "model": model, "num_ctx": num_ctx, "num_predict": num_predict, "started_at": time.time(), "turns": [], "final": None, } tool_call_total = 0 halt: str | None = None async with aiohttp.ClientSession() as session: for step in range(1, step_budget + 1): t0 = time.time() payload = { "model": model, "messages": messages, "tools": TOOLS, "stream": False, "think": False, "options": {"num_ctx": num_ctx, "num_predict": num_predict, "temperature": 0.7, "top_p": 0.95, "top_k": 64}, "keep_alive": "2h", } try: async with session.post( f"{ollama_url}/api/chat", json=payload, timeout=aiohttp.ClientTimeout(total=300), ) as resp: r = await resp.json() except Exception as e: halt = f"error: {e}" trace["turns"].append({"step": step, "error": str(e)}) break msg = r.get("message", {}) or {} content = msg.get("content", "") or "" tool_calls = msg.get("tool_calls") or [] history_chars = sum(len(m.get("content", "") or "") for m in messages) trace["turns"].append({ "step": step, "elapsed_s": round(time.time() - t0, 2), "prompt_eval_count": r.get("prompt_eval_count"), "eval_count": r.get("eval_count"), "content_len": len(content), "tool_call_count": len(tool_calls), "history_chars_before_append": history_chars, }) messages.append(msg) if not tool_calls: halt = "no_tool_calls" break tool_call_total += len(tool_calls) for tc in tool_calls: fn = tc.get("function", {}) name = fn.get("name") args = fn.get("arguments") or {} if isinstance(args, str): try: args = json.loads(args) except Exception: args = {} result = execute_tool_stub(name, args) messages.append({"role": "tool", "content": result}) if step == step_budget: halt = "step_budget" break trace["final"] = { "halt_reason": halt, "steps_used": len(trace["turns"]), "tool_calls_total": tool_call_total, "wall_clock_s": round(time.time() - trace["started_at"], 2), "final_message_count": len(messages), "final_history_chars": sum(len(m.get("content", "") or "") for m in messages), } return trace