"""Closes the loop on Round 1: original write_file harness, but think flag omitted.

If 26B PASSES here, the original Round 1 failure (and Round 2 patch-mode failure)
were both caused by `think: false`, not by the edit tool surface or response size.
"""
from __future__ import annotations

import json
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
from urllib import request as urlreq

# Base URL of the Ollama server; overridable through the environment.
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434")
# Maximum number of model turns before the agent loop is cut off.
MAX_ITERATIONS = 15
# Timeout, in seconds, applied to each run_bash tool invocation.
BASH_TIMEOUT_S = 30
# Timeout, in seconds, applied to each HTTP request to the chat endpoint.
REQUEST_TIMEOUT_S = 540

# System message sent as the first entry of every conversation.
SYSTEM_PROMPT = """You are a terminal coding agent.

## What you do
- Read source and test files to understand the code
- Make targeted edits to fix bugs so the tests pass
- Run pytest to verify your fix
- Stop once all tests pass and reply with a one-sentence summary

## What you do NOT do
- Never modify files under tests/
- Never disable, skip, or delete tests
- Never write outside the working directory
- Never call tools after all tests pass — just reply with the summary and stop

## Available tools
- read_file(path): read a file relative to the working directory
- write_file(path, content): overwrite a file relative to the working directory
- run_bash(command): run a shell command in the working directory

## Rules
- Start by reading README.md
- Prefer minimal edits. Do not refactor unrelated code.
- Run the full test suite after each edit to verify.
"""

# Initial user message that starts the agent.
USER_PROMPT = "Make the failing tests pass. Begin."
# Function-calling declarations sent to the model with every chat request.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "read_file",
            "description": "Read a file relative to workdir.",
            "parameters": {
                "type": "object",
                "properties": {"path": {"type": "string"}},
                "required": ["path"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "write_file",
            "description": "Overwrite a file relative to workdir.",
            "parameters": {
                "type": "object",
                "properties": {
                    "path": {"type": "string"},
                    "content": {"type": "string"},
                },
                "required": ["path", "content"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "run_bash",
            "description": "Run a bash command in workdir.",
            "parameters": {
                "type": "object",
                "properties": {"command": {"type": "string"}},
                "required": ["command"],
            },
        },
    },
]


def safe_path(wd, rel):
    """Resolve *rel* against *wd* and reject anything outside the workdir.

    Args:
        wd: Working directory as a ``Path``.
        rel: Path supplied by the model, expected to be relative to *wd*.

    Returns:
        The fully resolved ``Path``.

    Raises:
        ValueError: If the resolved path is not located under *wd*.
    """
    root = wd.resolve()
    p = (wd / rel).resolve()
    # Compare path components, not string prefixes: with a plain startswith()
    # check "/tmp/work-evil/x" would be accepted as living inside "/tmp/work".
    try:
        p.relative_to(root)
    except ValueError:
        raise ValueError(f"path escapes workdir: {rel}") from None
    return p


def t_read(wd, a):
    """Tool ``read_file``: return the text of ``a["path"]`` or an ERROR string."""
    p = safe_path(wd, a["path"])
    if not p.exists():
        return f"ERROR: {a['path']} does not exist"
    if p.is_dir():
        # read_text() on a directory would raise; give the model a clear message.
        return f"ERROR: {a['path']} is a directory"
    return p.read_text(encoding="utf-8")


def t_write(wd, a):
    """Tool ``write_file``: overwrite ``a["path"]`` with ``a["content"]``.

    Missing parent directories are created. Returns a short confirmation
    that includes the number of bytes written.
    """
    p = safe_path(wd, a["path"])
    content = a["content"]
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_text(content, encoding="utf-8")
    # Report the encoded size so the "bytes" label is accurate for non-ASCII text.
    size = len(content.encode("utf-8"))
    return f"wrote {a['path']} ({size} bytes)"


def t_bash(wd, a):
    """Tool ``run_bash``: run ``a["command"]`` through ``bash -c`` inside *wd*.

    Returns the exit code followed by the last 4000 characters of stdout and
    the last 2000 characters of stderr, or an ERROR string when the command
    exceeds ``BASH_TIMEOUT_S`` seconds.
    """
    try:
        r = subprocess.run(
            ["bash", "-c", a["command"]],
            cwd=wd,
            capture_output=True,
            text=True,
            timeout=BASH_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired:
        return f"ERROR: timeout {BASH_TIMEOUT_S}s"
    return (
        f"exit={r.returncode}\n"
        f"--- stdout ---\n{r.stdout[-4000:]}\n"
        f"--- stderr ---\n{r.stderr[-2000:]}"
    )


# Maps the tool name used by the model to its implementation.
DISP = {"read_file": t_read, "write_file": t_write, "run_bash": t_bash}


def chat(model, msgs):
    """POST one non-streaming request to ``{OLLAMA_HOST}/api/chat``.

    No ``think`` key is placed in the payload (see the module docstring).

    Args:
        model: Model name passed through to the server.
        msgs: Conversation so far, as a list of message dicts.

    Returns:
        The decoded JSON response body.
    """
    payload = {
        "model": model,
        "messages": msgs,
        "tools": TOOLS,
        "stream": False,
        "keep_alive": "10m",
        "options": {"num_ctx": 32768, "num_predict": 4096, "temperature": 0.3},
    }
    req = urlreq.Request(
        f"{OLLAMA_HOST}/api/chat",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urlreq.urlopen(req, timeout=REQUEST_TIMEOUT_S) as r:
        return json.loads(r.read())


def _parse_tool_args(raw):
    """Normalise a tool call's ``arguments`` field to a dict."""
    args = raw or {}
    if isinstance(args, str):
        try:
            args = json.loads(args)
        except json.JSONDecodeError:
            return {"_raw": args}
    # Valid JSON that is not an object (list, number, ...) cannot carry named
    # arguments; wrap it so the tool reports a missing-argument error.
    return args if isinstance(args, dict) else {"_raw": args}


def _call_tool(wd, name, args):
    """Dispatch one tool call; failures become an ``ERROR: ...`` string."""
    try:
        if name not in DISP:
            return f"ERROR: unknown {name}"
        return DISP[name](wd, args)
    except KeyError as e:
        # The tools index their arguments directly, so a KeyError here means
        # the model left out a required argument.
        return f"ERROR: missing argument {e}"
    except Exception as e:
        return f"ERROR: {e}"


def _verify(wd):
    """Run pytest on ``tests/`` in *wd*; return ``(passed, error_or_None)``."""
    try:
        p = subprocess.run(
            ["python3", "-m", "pytest", "tests/", "-q"],
            cwd=wd,
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        # A hung or unlaunchable test run must not discard the recorded trace.
        return False, str(e)
    return p.returncode == 0, None


def run(model, wd, logp):
    """Drive the agent loop for *model* in *wd* and write a JSON trace to *logp*.

    The loop ends when the chat request fails (``"error: ..."``), when the
    model replies without tool calls (``"no_tool_calls"``), or after
    ``MAX_ITERATIONS`` turns (``"cap"``). pytest is then run once to decide
    ``tests_pass``.

    Args:
        model: Model name passed to ``chat``.
        wd: Working directory (``Path``) the tools operate in.
        logp: ``Path`` of the trace file; parent directories are created.

    Returns:
        The trace dict that was written to *logp*.
    """
    logp.parent.mkdir(parents=True, exist_ok=True)
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]
    trace = {
        "model": model,
        "edit_tool": "write_file",
        "think_setting": "unset (default)",
        "turns": [],
        "started_at": time.time(),
    }
    counts = {"read_file": 0, "write_file": 0, "run_bash": 0}
    halt = None

    for i in range(1, MAX_ITERATIONS + 1):
        t0 = time.time()
        try:
            r = chat(model, msgs)
        except Exception as e:
            halt = f"error: {e}"
            break

        m = r.get("message") or {}
        content = m.get("content") or ""
        tcs = m.get("tool_calls") or []
        trace["turns"].append({
            "iteration": i,
            "elapsed_s": round(time.time() - t0, 2),
            "content": content,
            "prompt_eval_count": r.get("prompt_eval_count"),
            "eval_count": r.get("eval_count"),
            "tool_calls_count": len(tcs),
        })
        msgs.append({"role": "assistant", "content": content, "tool_calls": tcs})

        if not tcs:
            halt = "no_tool_calls"
            break

        for tc in tcs:
            fn = tc.get("function", {})
            n = fn.get("name")
            res = _call_tool(wd, n, _parse_tool_args(fn.get("arguments")))
            # Guard the membership test: a malformed (unhashable) name must
            # not crash the harness.
            if isinstance(n, str) and n in counts:
                counts[n] += 1
            msgs.append({"role": "tool", "content": res})

        if i == MAX_ITERATIONS:
            halt = "cap"
            break

    tests_ok, verify_error = _verify(wd)
    trace["final"] = {
        "halt_reason": halt,
        "tests_pass": tests_ok,
        "iterations_used": len(trace["turns"]),
        "tool_call_counts": counts,
        "wall_clock_s": round(time.time() - trace["started_at"], 2),
    }
    if verify_error is not None:
        trace["final"]["verify_error"] = verify_error
    logp.write_text(json.dumps(trace, indent=2, default=str), encoding="utf-8")
    return trace


def main(argv=None):
    """CLI entry point: ``MODEL WORKDIR LOGFILE``.

    Recreates WORKDIR from the ``task_seed`` directory next to this script,
    runs the agent and prints a one-line summary. Returns the process exit
    status (0 on completion, 2 on a usage or setup error).
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        print(f"usage: {Path(__file__).name} MODEL WORKDIR LOGFILE", file=sys.stderr)
        return 2
    model, wd_s, log_s = argv[1], argv[2], argv[3]
    wd, logp = Path(wd_s).resolve(), Path(log_s).resolve()
    seed = (Path(__file__).parent / "task_seed").resolve()

    # Validate before deleting anything: WORKDIR is wiped below.
    if not seed.is_dir():
        print(f"error: seed directory not found: {seed}", file=sys.stderr)
        return 2
    if wd == seed or wd in seed.parents:
        print(f"error: workdir {wd} would overwrite the seed directory", file=sys.stderr)
        return 2

    if wd.exists():
        shutil.rmtree(wd)
    shutil.copytree(seed, wd)

    r = run(model, wd, logp)
    f = r["final"]
    print(
        f"model={model} pass={f['tests_pass']} "
        f"iters={f['iterations_used']} "
        f"r={f['tool_call_counts']['read_file']} "
        f"w={f['tool_call_counts']['write_file']} "
        f"b={f['tool_call_counts']['run_bash']} "
        f"halt={f['halt_reason']} wall={f['wall_clock_s']}s"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())