c61394923c
Seth asked "was this with think=false?" Yes — and that was the only question that mattered. Everything I concluded in round 1 and round 2 was wrong.

Actual cause, isolated in round 3:
- At identical message state, gemma4:26b with think=false returns eval=4 (silent stop); with think unset or think=true, it returns eval=165 and emits the correct tool call.
- Original round-1 write_file harness + think unset: 26B passes in 8 iters, 20s. No mitigations needed.
- 31B dense and qwen3-coder:30b tolerate think=false; 26B MoE does not.

Red herrings (kept on-record in the bakeoff doc, not silently erased):
- Round 1: "write_file tool-call argument size" — wrong.
- Round 2a: refuted the arg-size theory but for the wrong reason (still failed because think=false was still set).
- Round 2b: "cumulative tool-response context size" — truncating did make 26B pass, but by coincidence. Shorter context at the decision turn dodged the think=false side effect.

Why the existing "always think:false" guidance was misleading: it was derived from AI_Visualizer (single-turn JSON pipelines), where thinking tokens do eat num_predict invisibly. In multi-turn tool-calling agents the channels are separate and the flag has a different effect — catastrophic on 26B specifically.

Doc updates:
- GOTCHAS: replaced the 26B entry with the actual cause; scoped the original "Thinking Mode Eats Context" entry to single-turn pipelines.
- SYNTHESIS: split the "Mandatory Ollama Settings" block into single-turn vs multi-turn variants; updated anti-patterns and quick-start checklist.
- CORPUS_cli_coding_agent.md: revised pointer and config template.
- docs/reference/bakeoff-2026-04-18.md: added Round 3 section with the correction notice at the top of the file and full diagnostic methodology.

New artifacts: harness_no_think_flag.py, harness_write_no_think.py, and 4 new log files demonstrating all three models pass when think is left at default.
174 lines
7.6 KiB
Python
174 lines
7.6 KiB
Python
"""Diagnostic: patch-mode harness with think flag OMITTED (Ollama default).

Exact copy of harness_patch.py except the payload does NOT set "think".
Testing whether Gemma 4 26B's silent-stop at iter 6 is caused by
`think: false` specifically, rather than by tool-response context.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from urllib import request as urlreq
|
|
|
|
# Base URL of the Ollama server; the OLLAMA_HOST environment variable overrides it.
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://127.0.0.1:11434")

# Upper bound on chat round-trips in a single run.
MAX_ITERATIONS = 15

# Timeout, in seconds, for each command executed by the run_bash tool.
BASH_TIMEOUT_S = 30

# Timeout, in seconds, for one HTTP request to the chat endpoint.
REQUEST_TIMEOUT_S = 540
|
|
|
|
# System prompt sent as the first message of every run. It is passed to the
# model verbatim, so the literal must contain only the intended prompt text
# (no viewer/separator residue such as stray "|" lines).
SYSTEM_PROMPT = """You are a terminal coding agent.

## What you do
- Read source and test files to understand the code
- Make targeted edits to fix bugs so the tests pass
- Run pytest to verify your fix
- Stop once all tests pass and reply with a one-sentence summary

## What you do NOT do
- Never modify files under tests/
- Never disable, skip, or delete tests
- Never write outside the working directory
- Never call tools after all tests pass — just reply with the summary and stop

## Available tools
- read_file(path): read a file relative to the working directory
- apply_patch(path, old_text, new_text): replace an exact unique text span in a file
- run_bash(command): run a shell command in the working directory

## Rules
- Start by reading README.md
- Prefer minimal edits. Do not refactor unrelated code.
- Run the full test suite after each edit to verify.
- apply_patch requires old_text to appear EXACTLY ONCE in the file; include enough surrounding context to make it unique.
"""

# Single user turn that kicks off the agent loop.
USER_PROMPT = "Make the failing tests pass. Begin."
|
|
|
|
def _string_tool(name, description, *param_names):
    """Build one function-tool schema whose parameters are all required strings."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {param: {"type": "string"} for param in param_names},
                "required": list(param_names),
            },
        },
    }


# Tool schemas advertised to the model in every chat request.
TOOLS = [
    _string_tool(
        "read_file",
        "Read a file. Path is relative to the working directory.",
        "path",
    ),
    _string_tool(
        "apply_patch",
        "Replace a unique span of text in a file. old_text must appear exactly once. Include surrounding context if needed to make the match unique.",
        "path",
        "old_text",
        "new_text",
    ),
    _string_tool(
        "run_bash",
        "Run a shell command in the working directory. Returns stdout, stderr, and exit code.",
        "command",
    ),
]
|
|
|
|
|
|
def safe_path(workdir, rel):
    """Resolve *rel* against *workdir* and refuse paths that leave it.

    Args:
        workdir: ``pathlib.Path`` of the sandbox directory.
        rel: path supplied by the model, interpreted relative to *workdir*.

    Returns:
        The resolved absolute path.

    Raises:
        ValueError: if the resolved path is not inside *workdir*.
    """
    root = workdir.resolve()
    p = (workdir / rel).resolve()
    # Compare path components, not string prefixes: a plain startswith()
    # would accept a sibling such as "/tmp/work_evil" for root "/tmp/work".
    try:
        p.relative_to(root)
    except ValueError:
        raise ValueError(f"path escapes workdir: {rel}") from None
    return p
|
|
|
|
|
|
def tool_read_file(workdir, args):
    """Tool handler: return the text of ``args["path"]`` inside *workdir*.

    Returns an ``ERROR: ...`` string instead of raising when the path does
    not exist. Path-escape errors from ``safe_path`` propagate to the caller.
    """
    target = safe_path(workdir, args["path"])
    if target.exists():
        return target.read_text()
    return f"ERROR: {args['path']} does not exist"
|
|
|
|
|
|
def tool_apply_patch(workdir, args):
    """Tool handler: replace one exact, unique span of text in a file.

    Expects ``args`` to carry ``path``, ``old_text`` and ``new_text``.
    Returns a human-readable status string; a missing file, a missing
    match, or an ambiguous match is reported as an ``ERROR: ...`` string
    and leaves the file untouched.
    """
    target = safe_path(workdir, args["path"])
    if not target.exists():
        return f"ERROR: {args['path']} does not exist"

    old = args["old_text"]
    new = args["new_text"]
    text = target.read_text()
    n = text.count(old)

    # Only an exactly-once match is patched; anything else is reported back.
    if n != 1:
        if n == 0:
            return f"ERROR: old_text not found in {args['path']}."
        return f"ERROR: old_text appears {n} times in {args['path']}."

    patched = text.replace(old, new, 1)
    target.write_text(patched)
    return f"patched {args['path']} (replaced {len(old)} chars with {len(new)} chars)"
|
|
|
|
|
|
def tool_run_bash(workdir, args):
    """Tool handler: run ``args["command"]`` through ``bash -c`` in *workdir*.

    Returns a report containing the exit code plus the last 4000 characters
    of stdout and the last 2000 characters of stderr, or an ``ERROR: ...``
    string if the command exceeds ``BASH_TIMEOUT_S`` seconds.
    """
    argv = ["bash", "-c", args["command"]]
    try:
        proc = subprocess.run(
            argv,
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=BASH_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired:
        return f"ERROR: command timed out after {BASH_TIMEOUT_S}s"

    # Keep only the tail of each stream so the tool response stays bounded.
    stdout_tail = proc.stdout[-4000:]
    stderr_tail = proc.stderr[-2000:]
    return "\n".join(
        [
            f"exit={proc.returncode}",
            "--- stdout ---",
            stdout_tail,
            "--- stderr ---",
            stderr_tail,
        ]
    )
|
|
|
|
|
|
# Maps the tool name sent by the model to the handler that implements it.
TOOL_DISPATCH = {
    "read_file": tool_read_file,
    "apply_patch": tool_apply_patch,
    "run_bash": tool_run_bash,
}
|
|
|
|
|
|
def ollama_chat(model, messages):
    """Send one non-streaming chat request to Ollama and return the decoded JSON reply.

    The request goes to ``{OLLAMA_HOST}/api/chat`` with the ``TOOLS`` schemas
    attached and is bounded by ``REQUEST_TIMEOUT_S`` seconds.
    """
    # NOTE: no "think" key — Ollama default behavior
    options = {"num_ctx": 32768, "num_predict": 4096, "temperature": 0.3}
    payload = {
        "model": model,
        "messages": messages,
        "tools": TOOLS,
        "stream": False,
        "keep_alive": "10m",
        "options": options,
    }
    body = json.dumps(payload).encode()
    req = urlreq.Request(
        f"{OLLAMA_HOST}/api/chat",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urlreq.urlopen(req, timeout=REQUEST_TIMEOUT_S) as resp:
        raw = resp.read()
    return json.loads(raw)
|
|
|
|
|
|
def pytest_passes(workdir, timeout_s=60):
    """Return True if ``python3 -m pytest tests/ -q`` exits 0 in *workdir*.

    Args:
        workdir: directory to run pytest in.
        timeout_s: seconds to wait for the test run (default 60).

    Returns:
        True when pytest exits with status 0; False on any other exit status
        or when the run exceeds *timeout_s*.
    """
    try:
        r = subprocess.run(
            ["python3", "-m", "pytest", "tests/", "-q"],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired:
        # A hung test suite counts as a failure; raising here would abort the
        # caller before it has written its trace log.
        return False
    return r.returncode == 0
|
|
|
|
|
|
def _parse_tool_arguments(raw):
    """Normalise a tool call's ``arguments`` field.

    Falsy values become an empty dict. A string is parsed as JSON; if it is
    not parseable it is kept under the ``"_raw"`` key so the trace still
    records what the model sent.
    """
    args = raw or {}
    if isinstance(args, str):
        try:
            args = json.loads(args)
        except (ValueError, RecursionError):
            # Only parse failures are tolerated here; a bare ``except`` would
            # also swallow KeyboardInterrupt / SystemExit.
            args = {"_raw": args}
    return args


def _dispatch_tool(name, workdir, args):
    """Run the named tool and return its result string; failures become ``ERROR: ...``."""
    try:
        if name in TOOL_DISPATCH:
            return TOOL_DISPATCH[name](workdir, args)
        return f"ERROR: unknown {name}"
    except Exception as e:
        # Best effort by design: tool failures are reported back to the model
        # rather than aborting the run.
        return f"ERROR: {e}"


def run_bakeoff(model, workdir, log_path):
    """Drive the agent loop for *model* in *workdir* and write a JSON trace.

    Each iteration sends the conversation to ``ollama_chat``, executes every
    tool call in the reply, and appends the tool results to the conversation.
    The loop ends when the chat call fails (``chat_error: ...``), the model
    replies without tool calls (``no_tool_calls``), or ``MAX_ITERATIONS`` is
    reached (``iteration_cap``).

    Args:
        model: Ollama model name.
        workdir: ``pathlib.Path`` of the task working directory.
        log_path: ``pathlib.Path`` of the JSON trace file; parent directories
            are created if missing.

    Returns:
        The trace dict, including a ``"final"`` summary with the halt reason,
        whether pytest passes, iteration and tool-call counts, and wall time.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]
    trace = {
        "model": model,
        "edit_tool": "apply_patch",
        "think_setting": "unset (default)",
        "workdir": str(workdir),
        "started_at": time.time(),
        "turns": [],
        "final": None,
    }
    counts = {"read_file": 0, "apply_patch": 0, "run_bash": 0}
    halt = None

    for i in range(1, MAX_ITERATIONS + 1):
        t0 = time.time()
        try:
            r = ollama_chat(model, messages)
        except Exception as e:
            halt = f"chat_error: {e}"
            trace["turns"].append({"iteration": i, "error": str(e)})
            break

        msg = r.get("message", {})
        content = msg.get("content", "") or ""
        tcs = msg.get("tool_calls") or []
        thinking = msg.get("thinking")
        turn = {
            "iteration": i,
            "elapsed_s": round(time.time() - t0, 2),
            "content": content,
            "tool_calls": [],
            "prompt_eval_count": r.get("prompt_eval_count"),
            "eval_count": r.get("eval_count"),
            # Only the length of the thinking text is recorded, not the text.
            "thinking_field_len": len(thinking) if thinking else 0,
        }
        messages.append({"role": "assistant", "content": content, "tool_calls": tcs})

        if not tcs:
            # A reply with no tool calls ends the run.
            trace["turns"].append(turn)
            halt = "no_tool_calls"
            break

        for tc in tcs:
            fn = tc.get("function", {})
            name = fn.get("name")
            args = _parse_tool_arguments(fn.get("arguments"))
            result = _dispatch_tool(name, workdir, args)
            if name in counts:
                counts[name] += 1
            # The trace keeps a truncated result; the model gets the full one.
            turn["tool_calls"].append({"name": name, "arguments": args, "result": result[:800]})
            messages.append({"role": "tool", "content": result})

        trace["turns"].append(turn)
        if i == MAX_ITERATIONS:
            halt = "iteration_cap"
            break

    trace["final"] = {
        "halt_reason": halt,
        "tests_pass": pytest_passes(workdir),
        "iterations_used": len(trace["turns"]),
        "tool_call_counts": counts,
        "wall_clock_s": round(time.time() - trace["started_at"], 2),
    }
    log_path.write_text(json.dumps(trace, indent=2, default=str))
    return trace
|
|
|
|
|
|
def main():
    """Command-line entry point: ``<script> MODEL WORKDIR LOG_PATH``.

    Recreates WORKDIR from the ``task_seed`` directory next to this script
    (any existing WORKDIR is deleted first), runs the bakeoff, and prints a
    one-line summary. Exits with a usage message if arguments are missing.
    """
    if len(sys.argv) < 4:
        prog = Path(sys.argv[0]).name if sys.argv else "harness"
        # sys.exit with a string prints it to stderr and exits non-zero.
        sys.exit(f"usage: {prog} MODEL WORKDIR LOG_PATH")

    model, workdir_s, log_s = sys.argv[1], sys.argv[2], sys.argv[3]
    workdir, log_path = Path(workdir_s).resolve(), Path(log_s).resolve()
    seed = Path(__file__).parent / "task_seed"

    # Always start from a pristine copy of the seed task.
    if workdir.exists():
        shutil.rmtree(workdir)
    shutil.copytree(seed, workdir)

    r = run_bakeoff(model, workdir, log_path)
    f = r["final"]
    print(f"model={model} pass={f['tests_pass']} iters={f['iterations_used']} reads={f['tool_call_counts']['read_file']} patches={f['tool_call_counts']['apply_patch']} bashes={f['tool_call_counts']['run_bash']} halt={f['halt_reason']} wall={f['wall_clock_s']}s")


if __name__ == "__main__":
    main()
|