seth_semantic_game/scripts/gemma-generation-bakeoff.py

#!/usr/bin/env python3
"""Bakeoff: can Gemma 4 generate Connections-style structured puzzles?

Stress-tests unaided one-shot generation on gemma4:26b and gemma4:31b on a
local Ollama (point OLLAMA_HOST at your instance; default localhost:11434).
Output is graded by hand afterward against a rubric in the README:
overlap-traps, tight category labels, purple wordplay, blind anchor vs a
real human-curated puzzle.
"""
import json
import os
import sys
import time
import urllib.request
from datetime import datetime
from pathlib import Path

def ollama_generate_url(host: str) -> str:
    """Build the Ollama ``/api/generate`` URL from an ``OLLAMA_HOST``-style value.

    Accepts both full URLs (``http://box:11434/``) and bare ``host:port``
    values (``127.0.0.1:11434``), which is how ``OLLAMA_HOST`` is commonly
    set for the Ollama server itself. A bare value gets an ``http://`` scheme
    so urllib can open it; an empty value falls back to the local default.
    """
    base = (host or "").strip().rstrip("/") or "http://localhost:11434"
    if "://" not in base:
        base = f"http://{base}"
    return f"{base}/api/generate"


# Full URL of the generate endpoint, derived from $OLLAMA_HOST.
OLLAMA = ollama_generate_url(os.environ.get("OLLAMA_HOST", "http://localhost:11434"))
# Ollama model tags compared by the bakeoff.
MODELS = ["gemma4:26b", "gemma4:31b-it-q4_K_M"]
# Number of puzzles requested from each model.
N_PER_MODEL = 5
# Base sampling temperature; retries add +0.1 per attempt.
TEMPERATURE = 0.8
# Parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Generation prompt sent verbatim for every puzzle attempt. It describes the
# puzzle format, the exact JSON shape expected back, and the output rules.
# The text is part of the experiment: editing it changes what is being measured.
PROMPT = """You are designing a single puzzle in the style of NYT Connections.

A Connections puzzle has:
- Exactly 16 distinct words or short phrases
- Sorted into 4 hidden groups of 4
- Each group has a tight, specific category label
- Difficulty bands: yellow (easiest, most direct), green (medium), blue (harder, often more abstract), purple (trickiest -- wordplay, double meanings, hidden patterns; e.g. "___ HOUSE": GREEN, ICE, COURT, FIRE)
- The CRITICAL feature: at least 2-3 words must plausibly fit a different group than where they actually go. These red herrings are what make the puzzle hard. Without them, the puzzle is trivial.

Generate ONE puzzle on a theme of your choice. Output strict JSON in this shape:

{
  "theme_seed": "<one-line description of what inspired the puzzle>",
  "groups": [
    {"difficulty": "yellow", "category": "<tight category label>", "words": ["W1","W2","W3","W4"]},
    {"difficulty": "green",  "category": "<...>",                  "words": [...]},
    {"difficulty": "blue",   "category": "<...>",                  "words": [...]},
    {"difficulty": "purple", "category": "<...>",                  "words": [...]}
  ],
  "intended_traps": [
    {"word": "<a word from the puzzle>", "actual_group": "yellow|green|blue|purple", "trap_group": "yellow|green|blue|purple", "reason": "<why it plausibly fits the trap group>"}
  ]
}

Rules:
- All 16 words must be distinct
- Categories must be tight enough that the right answer feels obviously right after the reveal
- intended_traps must list at least 2 genuine red-herring words
- Output ONLY the JSON object. No preamble, no markdown fences, no commentary.
"""


def call(model: str, prompt: str, temperature: float, timeout: int = 600):
    """Send one non-streaming generation request to Ollama and time it.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature placed in the request options.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        ``(elapsed_seconds, data)`` where ``data`` is the decoded JSON
        response body. The elapsed time covers the request, reading the
        body, and JSON decoding.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``,
        timeouts), and ``json.JSONDecodeError`` if the body is not JSON.
    """
    # NOTE on Gemma 4 settings (see ~/bin/gemma4-research/GOTCHAS.md):
    # - No format=json (infinite loop on gemma4:26b Q4)
    # - think=false for single-turn JSON pipelines (else thinking tokens eat budget)
    # - num_ctx >> 2048 default, num_predict >> 128 default
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {
            "temperature": temperature,
            "num_ctx": 8192,
            "num_predict": 4096,
        },
    }
    req = urllib.request.Request(
        OLLAMA,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by wall-clock adjustments during a multi-minute generation.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        data = json.loads(r.read())
    return time.perf_counter() - t0, data


def extract_json(body: str):
    """Pull the JSON object out of a Gemma response.

    The primary strategy is to parse everything from the first ``{`` to the
    last ``}``, which strips preambles and markdown fences. If that slice is
    not valid JSON (typically because the model appended commentary that
    itself contains braces), fall back to decoding the single JSON value that
    starts at the first ``{`` and ignore whatever follows it.

    Returns:
        The parsed dict.

    Raises:
        ValueError: if the response has no ``{``/``}`` at all.
        json.JSONDecodeError: (a ``ValueError`` subclass) if neither strategy
            yields valid JSON; the error from the primary slice is raised.
    """
    if not body or "{" not in body or "}" not in body:
        raise ValueError("no JSON object delimiters in response")
    start = body.find("{")
    try:
        return json.loads(body[start: body.rfind("}") + 1])
    except json.JSONDecodeError as slice_err:
        try:
            # raw_decode stops at the end of the first complete value, so
            # trailing text after the object no longer breaks parsing.
            obj, _end = json.JSONDecoder().raw_decode(body, start)
        except json.JSONDecodeError:
            raise slice_err from None
        return obj


def warm(model: str) -> None:
    """Issue one throwaway request to *model* and discard the reply.

    Announces the warm-up on stderr first. Presumably this gets the model
    loaded before the timed runs start -- confirm against Ollama behaviour.
    """
    sys.stderr.write(f"[warm] {model}\n")
    sys.stderr.flush()
    warmup_prompt = "Reply with just the word OK."
    call(model, warmup_prompt, temperature=0.1, timeout=300)


def run_model(model: str, n: int, max_attempts: int = 3):
    """Generate *n* puzzles with *model*, retrying each up to *max_attempts* times.

    Each retry bumps the temperature by 0.1 over ``TEMPERATURE``. A puzzle
    counts as successful as soon as one response yields parseable JSON.

    Args:
        model: Ollama model tag.
        n: Number of puzzles to generate.
        max_attempts: Attempts per puzzle; must be at least 1.

    Returns:
        One record dict per puzzle with keys ``model``, ``i``, ``dt``, ``ok``,
        ``attempts``, ``puzzle``, ``eval_count`` and ``prompt_eval_count``.
        On failure ``puzzle`` holds ``_parse_error`` and ``_raw`` (truncated).
        If no attempt ever produced a response, the record additionally
        carries an ``error`` key with the last transport error, so such rows
        can be told apart from genuine JSON parse failures.

    Raises:
        ValueError: if *max_attempts* is less than 1.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    out = []
    for i in range(1, n + 1):
        # Retry with temp-bump pattern from AI_Visualizer
        last_raw = ""
        last_dt = 0.0
        last_data = {}
        # Transport and parse errors are tracked separately so a late API
        # failure cannot be mislabelled as the parse error of earlier raw text.
        api_err = None
        parse_err = None
        got_response = False
        puzzle = None
        ok = False
        attempts = 0
        for attempt in range(max_attempts):
            attempts = attempt + 1
            temp = TEMPERATURE + attempt * 0.1
            print(f"[{model}] puzzle {i}/{n} attempt {attempts} (temp={temp:.1f})",
                  file=sys.stderr, flush=True)
            try:
                dt, data = call(model, PROMPT, temperature=temp)
            except Exception as e:  # best-effort boundary: record and retry
                api_err = repr(e)
                continue
            got_response = True
            last_dt, last_data = dt, data
            last_raw = data.get("response", "") or ""
            try:
                puzzle = extract_json(last_raw)
            except Exception as e:  # unparseable output is an expected outcome
                parse_err = repr(e)
                continue
            ok = True
            break

        record = {
            "model": model, "i": i, "dt": last_dt, "ok": ok,
            "attempts": attempts,
            "puzzle": puzzle if ok else {
                "_parse_error": parse_err if got_response else api_err,
                "_raw": last_raw[:3000],
            },
            "eval_count": last_data.get("eval_count", 0),
            "prompt_eval_count": last_data.get("prompt_eval_count", 0),
        }
        if not ok and not got_response:
            # Nothing came back at all: this is an API failure, not a parse
            # failure, and its dt of 0.0 should not feed timing statistics.
            record["error"] = api_err
        out.append(record)
    return out


def _md_cell(value) -> str:
    """Return *value* as text that is safe inside one Markdown table cell."""
    return str(value).replace("|", "\\|").replace("\n", " ")


def _as_list(value) -> list:
    """Return *value* as a list: falsy -> empty, non-list -> single-item list."""
    if not value:
        return []
    return value if isinstance(value, list) else [value]


def render(results, stamp: str) -> str:
    """Render the bakeoff results as a Markdown report.

    The report has a setup section, a per-model timing table (rows carrying an
    ``error`` key are excluded), the JSON parse rate, and one section per
    puzzle record with its groups, claimed traps and a grading placeholder.

    Puzzle content comes straight from model output, so it may violate the
    requested schema (non-string words, non-dict groups, and so on). Such
    entries are rendered visibly as malformed instead of raising, so one bad
    puzzle cannot abort the report.

    Args:
        results: Record dicts as produced by ``run_model``.
        stamp: Timestamp string shown in the title.

    Returns:
        The full Markdown document as a single string.
    """
    lines = [
        f"# Gemma 4 Generation Bakeoff -- {stamp}",
        "",
        "## Setup",
        f"- Ollama endpoint: `{OLLAMA}` (RTX 3090 Ti on the test host)",
        "- Other GPU workloads paused for the duration of the run",
        f"- Models: {', '.join(f'`{m}`' for m in MODELS)}",
        f"- {N_PER_MODEL} puzzles per model, base temperature {TEMPERATURE}",
        "- Gemma 4 settings (per `~/bin/gemma4-research/GOTCHAS.md`): `think=false`, "
        "`num_ctx=8192`, `num_predict=4096`. No `format=json` (infinite-loop bug). "
        "JSON extracted client-side via `body[body.find('{'):body.rfind('}')+1]`.",
        "- Up to 3 attempts per puzzle with temperature bumped +0.1 each retry "
        "(AI_Visualizer pattern). Reported metrics use the *successful* attempt.",
        "- One-shot, unaided generation. No critique pass, no example puzzle in prompt.",
        "",
        "## Timing",
        "",
        "| Model | n | avg s | avg tokens | tok/s |",
        "|---|---|---|---|---|",
    ]
    for m in MODELS:
        rs = [r for r in results if r["model"] == m and "error" not in r]
        if not rs:
            lines.append(f"| `{m}` | 0 | -- | -- | -- |")
            continue
        avg_s = sum(r["dt"] for r in rs) / len(rs)
        avg_tok = sum(r["eval_count"] for r in rs) / len(rs)
        toks = avg_tok / avg_s if avg_s else 0
        lines.append(f"| `{m}` | {len(rs)} | {avg_s:.1f} | {avg_tok:.0f} | {toks:.1f} |")

    lines += ["", "## JSON parse rate", ""]
    for m in MODELS:
        rs = [r for r in results if r["model"] == m]
        ok = sum(1 for r in rs if r.get("ok"))
        lines.append(f"- `{m}`: {ok}/{len(rs)} parsed cleanly")
    lines += [""]

    for r in results:
        head = f"## {r['model']} -- puzzle {r['i']}"
        lines += [head, ""]
        if "error" in r:
            lines += [f"_API error:_ `{r['error']}`", ""]
            continue
        if not r.get("ok"):
            err = r["puzzle"].get("_parse_error", "")
            raw = r["puzzle"].get("_raw", "")[:1500]
            lines += [f"_JSON parse failed:_ `{err}`", "```", raw, "```", ""]
            continue
        p = r["puzzle"]
        lines += [f"**Theme seed:** {p.get('theme_seed', '--')}", ""]
        lines += ["| Diff | Category | Words |", "|---|---|---|"]
        for g in _as_list(p.get("groups")):
            if not isinstance(g, dict):
                # Schema violation: keep it visible for the hand grader.
                lines.append(f"| ? | _malformed group_ | {_md_cell(g)} |")
                continue
            words = ", ".join(_md_cell(w) for w in _as_list(g.get("words")))
            cat = _md_cell(g.get("category") or "?")
            diff = _md_cell(g.get("difficulty", "?"))
            lines.append(f"| {diff} | {cat} | {words} |")
        traps = _as_list(p.get("intended_traps"))
        lines += ["", f"**Claimed traps ({len(traps)}):**"]
        if not traps:
            lines.append("- _none claimed_")
        for t in traps:
            if not isinstance(t, dict):
                lines.append(f"- _malformed trap entry:_ `{t!r}`")
                continue
            lines.append(
                f"- `{t.get('word')}` (actually {t.get('actual_group')}, traps {t.get('trap_group')}): "
                f"{t.get('reason')}"
            )
        lines += ["", "_Grade:_ TODO", "", f"_dt={r['dt']:.1f}s, tokens={r['eval_count']}_", ""]
    return "\n".join(lines)


def main() -> None:
    """Run the bakeoff for every model and write the raw JSON and Markdown report.

    Outputs go to ``docs/reference`` under the project root, named with the
    start timestamp. The raw JSON is rewritten after each model finishes so
    that a failure on a later model does not discard results already
    collected. Progress goes to stderr; the only stdout line is the Markdown
    path.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-generation-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-generation-bakeoff-{stamp}.md"

    all_results = []

    def dump_raw() -> None:
        # Explicit UTF-8 so the output does not depend on the platform locale.
        raw_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")

    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, N_PER_MODEL))
        dump_raw()  # checkpoint: keep finished models if a later one fails

    dump_raw()  # final write; also covers an empty MODELS list
    print(f"raw  -> {raw_path}", file=sys.stderr)
    # Model text can contain non-ASCII characters; a locale-default encoding
    # could raise UnicodeEncodeError here after the whole run has completed.
    md_path.write_text(render(all_results, stamp), encoding="utf-8")
    print(f"md   -> {md_path}", file=sys.stderr)
    # Final stdout: just the markdown path so callers can pipe.
    print(md_path)


# Run the bakeoff only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()