#!/usr/bin/env python3
"""Bakeoff: can Gemma 4 generate Connections-style structured puzzles?

Stress-tests unaided one-shot generation on gemma4:26b and gemma4:31b on a
local Ollama (point OLLAMA_HOST at your instance; default localhost:11434).

Output is graded by hand afterward against a rubric in the README:
overlap-traps, tight category labels, purple wordplay, blind anchor vs a
real human-curated puzzle.
"""
import json
import os
import sys
import time
import urllib.request
from datetime import datetime
from pathlib import Path

# Full URL of the Ollama generate endpoint; OLLAMA_HOST overrides the base.
OLLAMA = f"{os.environ.get('OLLAMA_HOST', 'http://localhost:11434').rstrip('/')}/api/generate"
MODELS = ["gemma4:26b", "gemma4:31b-it-q4_K_M"]
N_PER_MODEL = 5
TEMPERATURE = 0.8
# Two directory levels above this file; reports go under docs/reference there.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Every free-text placeholder in the schema uses the same "<...>" marker so
# the model is never shown an empty string as the expected value.
PROMPT = """You are designing a single puzzle in the style of NYT Connections.

A Connections puzzle has:
- Exactly 16 distinct words or short phrases
- Sorted into 4 hidden groups of 4
- Each group has a tight, specific category label
- Difficulty bands: yellow (easiest, most direct), green (medium), blue (harder, often more abstract), purple (trickiest -- wordplay, double meanings, hidden patterns; e.g. "___ HOUSE": GREEN, ICE, COURT, FIRE)
- The CRITICAL feature: at least 2-3 words must plausibly fit a different group than where they actually go. These red herrings are what make the puzzle hard. Without them, the puzzle is trivial.

Generate ONE puzzle on a theme of your choice. Output strict JSON in this shape:

{
  "theme_seed": "<...>",
  "groups": [
    {"difficulty": "yellow", "category": "<...>", "words": ["W1","W2","W3","W4"]},
    {"difficulty": "green", "category": "<...>", "words": [...]},
    {"difficulty": "blue", "category": "<...>", "words": [...]},
    {"difficulty": "purple", "category": "<...>", "words": [...]}
  ],
  "intended_traps": [
    {"word": "<...>", "actual_group": "yellow|green|blue|purple", "trap_group": "yellow|green|blue|purple", "reason": "<...>"}
  ]
}

Rules:
- All 16 words must be distinct
- Categories must be tight enough that the right answer feels obviously right after the reveal
- intended_traps must list at least 2 genuine red-herring words
- Output ONLY the JSON object. No preamble, no markdown fences, no commentary.
"""


def call(model: str, prompt: str, temperature: float, timeout: int = 600):
    """POST one non-streaming generate request to Ollama.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature passed in ``options``.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        ``(elapsed_seconds, data)`` where ``data`` is the decoded JSON body.

    Raises:
        Whatever ``urlopen`` or ``json.loads`` raise (network errors,
        timeouts, HTTP errors, malformed JSON); callers decide how to retry.
    """
    # NOTE on Gemma 4 settings (see ~/bin/gemma4-research/GOTCHAS.md):
    # - No format=json (infinite loop on gemma4:26b Q4)
    # - think=false for single-turn JSON pipelines (else thinking tokens eat budget)
    # - num_ctx >> 2048 default, num_predict >> 128 default
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {
            "temperature": temperature,
            "num_ctx": 8192,
            "num_predict": 4096,
        },
    }
    req = urllib.request.Request(
        OLLAMA,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments during a long generation.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        data = json.loads(r.read())
    return time.perf_counter() - t0, data


def extract_json(body: str):
    """Pull the JSON object out of a Gemma response.

    Takes the span from the first ``{`` to the last ``}`` and parses it, so
    any preamble or trailing text around the object is ignored.

    Returns the parsed object. Raises ``ValueError`` when either brace is
    missing, or ``json.JSONDecodeError`` (a ``ValueError`` subclass) when
    the span is not valid JSON.
    """
    if not body or "{" not in body or "}" not in body:
        raise ValueError("no JSON object delimiters in response")
    chunk = body[body.find("{"): body.rfind("}") + 1]
    return json.loads(chunk)


def warm(model: str) -> None:
    """Send a trivial prompt to ``model`` before timed runs; errors propagate."""
    print(f"[warm] {model}", file=sys.stderr, flush=True)
    call(model, "Reply with just the word OK.", temperature=0.1, timeout=300)


def run_model(model: str, n: int):
    """Generate ``n`` puzzles with ``model``, retrying each up to 3 times.

    Each retry raises the temperature by 0.1 over ``TEMPERATURE``. An attempt
    fails when the request raises, the reply is not a JSON object, or no
    JSON object can be extracted from the response text.

    Returns:
        A list of ``n`` records with keys ``model``, ``i``, ``dt``, ``ok``,
        ``attempts``, ``puzzle``, ``eval_count`` and ``prompt_eval_count``.
        On failure ``puzzle`` holds ``_parse_error`` (repr of the last
        error) and ``_raw`` (first 3000 chars of the last response text
        that was actually received).
    """
    out = []
    for i in range(1, n + 1):
        # Retry with temp-bump pattern from AI_Visualizer
        last_raw = ""
        last_dt = 0.0
        last_data = {}
        last_err = None
        puzzle = None
        ok = False
        attempts = 0
        for attempt in range(3):
            attempts = attempt + 1
            temp = TEMPERATURE + attempt * 0.1
            tag = f"[{model}] puzzle {i}/{n} attempt {attempts}"
            print(f"{tag} (temp={temp:.1f})", file=sys.stderr, flush=True)
            try:
                dt, data = call(model, PROMPT, temperature=temp)
                if not isinstance(data, dict):
                    # A non-object reply would break the .get() calls below.
                    raise ValueError(
                        f"unexpected response type {type(data).__name__}"
                    )
            except Exception as e:  # best-effort: any request failure => retry
                last_err = repr(e)
                print(f"{tag} request failed: {last_err}",
                      file=sys.stderr, flush=True)
                continue
            last_dt, last_data = dt, data
            last_raw = data.get("response", "") or ""
            try:
                puzzle = extract_json(last_raw)
            except Exception as e:  # unparseable output => retry hotter
                last_err = repr(e)
                print(f"{tag} parse failed: {last_err}",
                      file=sys.stderr, flush=True)
                continue
            ok = True
            break

        if ok:
            payload = puzzle
        else:
            payload = {"_parse_error": last_err, "_raw": last_raw[:3000]}
        # last_data stays {} when no request ever succeeded, so the counts
        # fall back to 0 in that case.
        out.append({
            "model": model,
            "i": i,
            "dt": last_dt,
            "ok": ok,
            "attempts": attempts,
            "puzzle": payload,
            "eval_count": last_data.get("eval_count", 0),
            "prompt_eval_count": last_data.get("prompt_eval_count", 0),
        })
    return out


def _cell(value) -> str:
    """Return ``value`` as text that is safe inside one Markdown table cell."""
    return str(value).replace("|", "\\|").replace("\n", " ")


def _render_puzzle(p) -> list:
    """Return the Markdown lines for one successfully parsed puzzle dict."""
    lines = [f"**Theme seed:** {p.get('theme_seed', '--')}", ""]
    lines += ["| Diff | Category | Words |", "|---|---|---|"]
    for g in p.get("groups", []) or []:
        if not isinstance(g, dict):
            # Model emitted something other than an object for this group;
            # show it rather than crash the whole report.
            lines.append(f"| ? | ? | {_cell(g)} |")
            continue
        words = g.get("words", []) or []
        if not isinstance(words, (list, tuple)):
            words = [words]
        joined = ", ".join(_cell(w) for w in words)
        cat = _cell(g.get("category") or "?")
        lines.append(f"| {_cell(g.get('difficulty', '?'))} | {cat} | {joined} |")

    traps = p.get("intended_traps", []) or []
    if not isinstance(traps, (list, tuple)):
        traps = [traps]
    lines += ["", f"**Claimed traps ({len(traps)}):**"]
    if not traps:
        lines.append("- _none claimed_")
    for t in traps:
        if not isinstance(t, dict):
            lines.append(f"- `{t}`")
            continue
        lines.append(
            f"- `{t.get('word')}` (actually {t.get('actual_group')}, "
            f"traps {t.get('trap_group')}): "
            f"{t.get('reason')}"
        )
    return lines


def render(results, stamp: str) -> str:
    """Build the Markdown report for a list of ``run_model`` records.

    Args:
        results: Records as produced by ``run_model`` (all models mixed).
        stamp: Timestamp string shown in the report title.

    Returns:
        The full report as one newline-joined string: setup notes, a timing
        table, the parse rate per model, then one section per puzzle.
    """
    lines = [
        f"# Gemma 4 Generation Bakeoff -- {stamp}",
        "",
        "## Setup",
        f"- Ollama endpoint: `{OLLAMA}` (RTX 3090 Ti on the test host)",
        "- Other GPU workloads paused for the duration of the run",
        f"- Models: {', '.join(f'`{m}`' for m in MODELS)}",
        f"- {N_PER_MODEL} puzzles per model, base temperature {TEMPERATURE}",
        "- Gemma 4 settings (per `~/bin/gemma4-research/GOTCHAS.md`): `think=false`, "
        "`num_ctx=8192`, `num_predict=4096`. No `format=json` (infinite-loop bug). "
        "JSON extracted client-side via `body[body.find('{'):body.rfind('}')+1]`.",
        "- Up to 3 attempts per puzzle with temperature bumped +0.1 each retry "
        "(AI_Visualizer pattern). Reported metrics use the *successful* attempt.",
        "- One-shot, unaided generation. No critique pass, no example puzzle in prompt.",
        "",
        "## Timing",
        "",
        "| Model | n | avg s | avg tokens | tok/s |",
        "|---|---|---|---|---|",
    ]
    for m in MODELS:
        # Only successful puzzles count toward timing: failed records may
        # carry dt=0.0 / eval_count=0 and would distort the averages.
        rs = [r for r in results if r["model"] == m and r.get("ok")]
        if not rs:
            lines.append(f"| `{m}` | 0 | -- | -- | -- |")
            continue
        avg_s = sum(r["dt"] for r in rs) / len(rs)
        avg_tok = sum(r["eval_count"] for r in rs) / len(rs)
        toks = avg_tok / avg_s if avg_s else 0
        lines.append(f"| `{m}` | {len(rs)} | {avg_s:.1f} | {avg_tok:.0f} | {toks:.1f} |")

    lines += ["", "## JSON parse rate", ""]
    for m in MODELS:
        rs = [r for r in results if r["model"] == m]
        ok = sum(1 for r in rs if r.get("ok"))
        lines.append(f"- `{m}`: {ok}/{len(rs)} parsed cleanly")
    lines += [""]

    for r in results:
        head = f"## {r['model']} -- puzzle {r['i']}"
        lines += [head, ""]
        if "error" in r:
            # run_model does not emit this key; kept so records carrying an
            # "error" field still render.
            lines += [f"_API error:_ `{r['error']}`", ""]
            continue
        if not r.get("ok"):
            err = r["puzzle"].get("_parse_error", "")
            raw = r["puzzle"].get("_raw", "")[:1500]
            lines += [f"_JSON parse failed:_ `{err}`", "```", raw, "```", ""]
            continue
        lines += _render_puzzle(r["puzzle"])
        lines += ["", "_Grade:_ TODO", "",
                  f"_dt={r['dt']:.1f}s, tokens={r['eval_count']}_", ""]
    return "\n".join(lines)


def main() -> None:
    """Run every model, then write the raw JSON and the Markdown report.

    Both files go to ``PROJECT_ROOT/docs/reference`` with a shared timestamp.
    Progress goes to stderr; stdout carries only the Markdown path.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-generation-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-generation-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, N_PER_MODEL))

    # Raw results are written first so they survive a rendering failure.
    # Explicit UTF-8: model output may contain characters the locale's
    # default encoding cannot represent.
    raw_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)
    md_path.write_text(render(all_results, stamp), encoding="utf-8")
    print(f"md -> {md_path}", file=sys.stderr)
    # Final stdout: just the markdown path so callers can pipe.
    print(md_path)


if __name__ == "__main__":
    main()