Files
Mortdecai a945207aab feat: CLI coding agent bakeoff — 26b reproducibly silent-stops at write_file
Ran minimal agent loop (Ollama /api/chat + read_file/write_file/run_bash) on
steel141 3090 Ti against 3 models on a broken-median-function task:

- gemma4:31b-it-q4_K_M: PASS (8 iters, 1 write, 44s) — textbook trace
- qwen3-coder:30b: PASS (15 iters, 1 write, 22s) — correct but chatty
- gemma4:26b: FAIL (6 iters, 0 writes) — silently stops with eval=4
  after reading source. Reproduced on second run. One-shot probe
  confirms 26b CAN produce the correct fix — failure is specifically
  at the write_file tool-call argument boundary.

Updates GOTCHAS with a new HIGH-severity entry, SYNTHESIS model-selection
table, CORPUS_cli_coding_agent.md empirical-follow-up pointer, and adds
docs/reference/bakeoff-2026-04-18.md with the full writeup.
2026-04-18 13:27:50 -04:00

302 lines
8.8 KiB
Python

"""CLI coding agent bakeoff harness.
Minimal agent loop: Ollama /api/chat with tools [read_file, write_file, run_bash].
Non-streaming, think=false. Workdir sandboxed; path traversal blocked; bash
timeout 30s; iteration cap 15.
Invocation:
python3 harness.py <model-tag> <workdir> <log-path>
Example:
python3 harness.py gemma4:26b runs/gemma4-26b/work runs/gemma4-26b/log.json
"""
from __future__ import annotations
import json
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
from urllib import request as urlreq
# Base URL of the Ollama server; the OLLAMA_HOST environment variable overrides
# the local default.
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434")
# Upper bound on chat round-trips in one run (enforced by run_bakeoff).
MAX_ITERATIONS = 15
# Seconds a single run_bash command may take before it is reported as timed out.
BASH_TIMEOUT_S = 30
# Seconds allowed for one /api/chat HTTP request.
REQUEST_TIMEOUT_S = 540 # 9 min — keep inside the 10-min Bash ceiling
# System message that opens every conversation: the agent's role, its
# prohibitions, the tool list, and the working rules.
SYSTEM_PROMPT = """You are a terminal coding agent.
## What you do
- Read source and test files to understand the code
- Make targeted edits to fix bugs so the tests pass
- Run pytest to verify your fix
- Stop once all tests pass and reply with a one-sentence summary
## What you do NOT do
- Never modify files under tests/
- Never disable, skip, or delete tests
- Never write outside the working directory
- Never call tools after all tests pass — just reply with the summary and stop
## Available tools
- read_file(path): read a file relative to the working directory
- write_file(path, content): overwrite a file relative to the working directory
- run_bash(command): run a shell command in the working directory
## Rules
- Start by reading README.md
- Prefer minimal edits. Do not refactor unrelated code.
- Run the full test suite after each edit to verify.
"""
# The single user message that follows the system prompt and starts the task.
USER_PROMPT = "Make the failing tests pass. Begin."
def _string_params_tool(name: str, description: str, *params: str) -> dict:
    """Build one function-tool schema whose parameters are all required strings.

    Args:
        name: Tool name the model uses in its tool calls.
        description: Human-readable description shown to the model.
        *params: Parameter names, in order; each is typed as a JSON string
            and listed as required.

    Returns:
        A dict in the ``{"type": "function", "function": {...}}`` shape.
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {param: {"type": "string"} for param in params},
                "required": list(params),
            },
        },
    }


# Tool schemas sent with every chat request (see ollama_chat).
TOOLS = [
    _string_params_tool(
        "read_file",
        "Read a file. Path is relative to the working directory.",
        "path",
    ),
    _string_params_tool(
        "write_file",
        "Overwrite a file with new content. Path is relative to the working directory.",
        "path",
        "content",
    ),
    _string_params_tool(
        "run_bash",
        "Run a shell command in the working directory. Returns stdout, stderr, and exit code.",
        "command",
    ),
]
def safe_path(workdir: Path, rel: str) -> Path:
    """Resolve ``rel`` against ``workdir`` and refuse anything outside it.

    Containment is checked component-wise with ``Path.relative_to`` rather
    than by comparing string prefixes: a prefix test would accept a sibling
    directory such as ``<workdir>2/file`` because its text starts with the
    workdir's text.

    Args:
        workdir: Sandbox root directory.
        rel: Path supplied by the model, interpreted relative to ``workdir``.

    Returns:
        The fully resolved path, which is ``workdir`` itself or lies beneath it.

    Raises:
        ValueError: If the resolved path is not inside ``workdir``.
    """
    root = workdir.resolve()
    # resolve() collapses ".." segments and follows symlinks before the check.
    candidate = (workdir / rel).resolve()
    try:
        candidate.relative_to(root)
    except ValueError:
        raise ValueError(f"path escapes workdir: {rel}") from None
    return candidate
def tool_read_file(workdir: Path, args: dict) -> str:
    """Tool handler: return the text of ``args["path"]`` inside ``workdir``.

    A missing file is reported as an ``ERROR: ...`` string instead of an
    exception. Exceptions from ``safe_path`` or from a missing ``"path"``
    key are left to the caller.
    """
    target = safe_path(workdir, args["path"])
    if target.exists():
        return target.read_text()
    return f"ERROR: {args['path']} does not exist"
def tool_write_file(workdir: Path, args: dict) -> str:
    """Tool handler: overwrite ``args["path"]`` with ``args["content"]``.

    Missing parent directories are created first. Returns a short
    confirmation string that includes ``len()`` of the content.
    """
    destination = safe_path(workdir, args["path"])
    # Create intermediate directories so new nested files can be written.
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(args["content"])
    written = len(args["content"])
    return f"wrote {args['path']} ({written} bytes)"
def tool_run_bash(workdir: Path, args: dict) -> str:
    """Tool handler: run ``args["command"]`` through ``bash -c`` in ``workdir``.

    Returns a report with the exit code and the tails of stdout (last 4000
    characters) and stderr (last 2000 characters). A command that exceeds
    ``BASH_TIMEOUT_S`` yields an ``ERROR: ...`` string instead.
    """
    argv = ["bash", "-c", args["command"]]
    try:
        proc = subprocess.run(
            argv,
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=BASH_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired:
        return f"ERROR: command timed out after {BASH_TIMEOUT_S}s"

    # Keep only the tail of each stream.
    stdout_tail = proc.stdout[-4000:]
    stderr_tail = proc.stderr[-2000:]
    return (
        f"exit={proc.returncode}\n"
        f"--- stdout ---\n{stdout_tail}\n"
        f"--- stderr ---\n{stderr_tail}"
    )
# Maps the tool name in a model tool call to its handler; every handler takes
# (workdir, args) and returns the result string sent back to the model.
TOOL_DISPATCH = {
"read_file": tool_read_file,
"write_file": tool_write_file,
"run_bash": tool_run_bash,
}
def ollama_chat(model: str, messages: list) -> dict:
    """POST one non-streaming chat request to Ollama and return the parsed reply.

    Args:
        model: Ollama model tag.
        messages: Conversation so far, as a list of message dicts.

    Returns:
        The decoded JSON body of the ``/api/chat`` response.
    """
    generation_options = {
        "num_ctx": 32768,
        "num_predict": 4096,
        "temperature": 0.3,
    }
    body = json.dumps(
        {
            "model": model,
            "messages": messages,
            "tools": TOOLS,
            "stream": False,
            "think": False,
            "keep_alive": "10m",
            "options": generation_options,
        }
    ).encode()
    # Supplying a request body makes urllib send this as a POST.
    http_request = urlreq.Request(
        f"{OLLAMA_HOST}/api/chat",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urlreq.urlopen(http_request, timeout=REQUEST_TIMEOUT_S) as resp:
        raw = resp.read()
    return json.loads(raw)
def pytest_passes(workdir: Path, timeout_s: float = 60) -> bool:
    """Run ``python3 -m pytest tests/ -q`` in ``workdir`` and report success.

    A suite that does not finish within ``timeout_s`` counts as failing.
    Without this, ``subprocess.TimeoutExpired`` would propagate to the
    caller, so a hanging test suite would abort the run instead of being
    scored as a failure.

    Args:
        workdir: Directory containing the ``tests/`` folder to run.
        timeout_s: Seconds to wait for pytest before giving up.

    Returns:
        True only if pytest exits with status 0; False on any other exit
        status or on timeout.
    """
    try:
        completed = subprocess.run(
            ["python3", "-m", "pytest", "tests/", "-q"],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired:
        return False
    return completed.returncode == 0
def _decode_tool_arguments(raw):
    """Return tool-call arguments as parsed data.

    Falsy input becomes ``{}``. A string is parsed as JSON; if parsing
    fails, the text is preserved as ``{"_raw": <text>}``. Anything else is
    returned unchanged.
    """
    decoded = raw or {}
    if not isinstance(decoded, str):
        return decoded
    try:
        return json.loads(decoded)
    except Exception:
        return {"_raw": decoded}


def _execute_tool_call(call: dict, workdir: Path, counts: dict) -> tuple:
    """Run one model tool call and return ``(name, args, result)``.

    Unknown tools and handler exceptions are turned into ``ERROR: ...``
    result strings. ``counts[name]`` is incremented only when the handler
    returns without raising.
    """
    spec = call.get("function", {})
    name = spec.get("name")
    args = _decode_tool_arguments(spec.get("arguments"))
    if name not in TOOL_DISPATCH:
        return name, args, f"ERROR: unknown tool {name}"
    try:
        result = TOOL_DISPATCH[name](workdir, args)
        counts[name] = counts.get(name, 0) + 1
    except Exception as exc:
        result = f"ERROR: {exc}"
    return name, args, result


def run_bakeoff(model: str, workdir: Path, log_path: Path) -> dict:
    """Drive the agent loop for one model and write the trace as JSON.

    Each iteration sends the conversation to the model, executes any tool
    calls it returns, and feeds the results back as ``tool`` messages. The
    loop halts on a chat error, on a reply without tool calls, or once
    ``MAX_ITERATIONS`` iterations have run. Afterwards the test suite is
    run once to score the outcome.

    Args:
        model: Ollama model tag.
        workdir: Working directory the tools operate in.
        log_path: Destination of the JSON trace; parent dirs are created.

    Returns:
        The trace dict, with per-turn records under ``"turns"`` and the
        summary under ``"final"``.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]
    trace = {
        "model": model,
        "workdir": str(workdir),
        "started_at": time.time(),
        "turns": [],
        "final": None,
    }
    turns = trace["turns"]
    counts = {"read_file": 0, "write_file": 0, "run_bash": 0}
    halt_reason = None

    for iteration in range(1, MAX_ITERATIONS + 1):
        began = time.time()
        try:
            reply = ollama_chat(model, conversation)
        except Exception as exc:
            halt_reason = f"chat_error: {exc}"
            turns.append(
                {"iteration": iteration, "error": str(exc), "elapsed_s": time.time() - began}
            )
            break

        reply_msg = reply.get("message", {})
        text = reply_msg.get("content", "") or ""
        calls = reply_msg.get("tool_calls", []) or []
        turn = {
            "iteration": iteration,
            "elapsed_s": round(time.time() - began, 2),
            "content": text,
            "tool_calls": [],
            "prompt_eval_count": reply.get("prompt_eval_count"),
            "eval_count": reply.get("eval_count"),
        }
        conversation.append({"role": "assistant", "content": text, "tool_calls": calls})

        if not calls:
            # A reply without tool calls is treated as the model's final answer.
            turns.append(turn)
            halt_reason = "no_tool_calls"
            break

        for call in calls:
            name, args, result = _execute_tool_call(call, workdir, counts)
            # The trace keeps a truncated result; the model receives all of it.
            turn["tool_calls"].append({"name": name, "arguments": args, "result": result[:800]})
            conversation.append({"role": "tool", "content": result})
        turns.append(turn)

        if iteration == MAX_ITERATIONS:
            halt_reason = "iteration_cap"
            break

    tests_ok = pytest_passes(workdir)
    trace["final"] = {
        "halt_reason": halt_reason,
        "tests_pass": tests_ok,
        "iterations_used": len(turns),
        "tool_call_counts": counts,
        "wall_clock_s": round(time.time() - trace["started_at"], 2),
    }
    log_path.write_text(json.dumps(trace, indent=2, default=str))
    return trace
def main():
    """CLI entry point: reset the workdir from the seed, run, print a summary.

    Expects exactly three arguments (model tag, workdir, log path);
    otherwise prints the module docstring to stderr and exits with status 2.
    Any existing workdir is deleted and replaced by a copy of the
    ``task_seed`` directory that sits next to this script.
    """
    argv = sys.argv
    if len(argv) != 4:
        print(__doc__, file=sys.stderr)
        sys.exit(2)

    model, workdir_arg, log_arg = argv[1], argv[2], argv[3]
    workdir = Path(workdir_arg).resolve()
    log_path = Path(log_arg).resolve()
    seed_dir = Path(__file__).parent / "task_seed"

    # Start every run from a pristine copy of the seed task.
    if workdir.exists():
        shutil.rmtree(workdir)
    shutil.copytree(seed_dir, workdir)

    final = run_bakeoff(model, workdir, log_path)["final"]
    counts = final["tool_call_counts"]
    print(
        f"model={model} pass={final['tests_pass']} "
        f"iters={final['iterations_used']} "
        f"read={counts.get('read_file', 0)} "
        f"write={counts.get('write_file', 0)} "
        f"bash={counts.get('run_bash', 0)} "
        f"halt={final['halt_reason']} "
        f"wall={final['wall_clock_s']}s"
    )


if __name__ == "__main__":
    main()