Files
seth_semantic_game/scripts/gemma-generation-bakeoff.py
T
Mortdecai 5a2a02e483 docs: bootstrap repo with bakeoff results and game-mechanics idea bank
This repo opens with the design-discovery work completed before any product
code is written. Two model bakeoffs against gemma4:8b/26b/31b on a local
Ollama established that:

- Whole-puzzle generation in the Connections shape is unreliable on Gemma 4
  (gemma4:31b ~50% structural-pass, gemma4:26b ~20-30%); 31b is intentionally
  out of project scope, so the generation route is harder still.
- Atomic semantic-judging skills are reliable: 87.5%/93.75%/100% (8b/26b/31b)
  on JUDGE; *all three models* scored 10/10 on CREATIVE_ACCEPT — fair judging
  of player-INVENTED categories. That is the structural unlock vs static
  hand-curated word games.

The README contains the full writeup, the test bench, and a brainstormed
bank of 10 distinct game-mechanics ideas across the fast/medium/slow tempo
range, plus a primitives table for recombination.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 23:09:46 -04:00

237 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""Bakeoff: can Gemma 4 generate Connections-style structured puzzles?
Stress-tests unaided one-shot generation on gemma4:26b and gemma4:31b on a
local Ollama (point OLLAMA_HOST at your instance; default localhost:11434).
Output is graded by hand afterward against a rubric in the README:
overlap-traps, tight category labels, purple wordplay, blind anchor vs a
real human-curated puzzle.
"""
import json
import os
import sys
import time
import urllib.request
from datetime import datetime
from pathlib import Path
def ollama_endpoint(host):
    """Build the Ollama ``/api/generate`` URL from an ``OLLAMA_HOST``-style value.

    Accepts a full base URL (``http://box:11434``), a bare ``host:port`` (a
    scheme of ``http://`` is assumed, since urllib rejects scheme-less URLs),
    or an empty/``None`` value (falls back to ``http://localhost:11434``).
    Trailing slashes on the base are dropped before the path is appended.
    """
    base = (host or "").strip() or "http://localhost:11434"
    if "://" not in base:
        base = f"http://{base}"
    return f"{base.rstrip('/')}/api/generate"


# Generate endpoint of the Ollama instance under test; override via OLLAMA_HOST.
OLLAMA = ollama_endpoint(os.environ.get("OLLAMA_HOST"))
# Model tags exercised by the bakeoff, in run order.
MODELS = ["gemma4:26b", "gemma4:31b-it-q4_K_M"]
# Number of puzzles requested from each model.
N_PER_MODEL = 5
# Base sampling temperature; retries bump it upward.
TEMPERATURE = 0.8
# Two directory levels above this script; reports are written beneath it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Single-shot generation prompt sent verbatim to every model on every attempt.
# It asks for one Connections-style puzzle as a bare JSON object with the keys
# "theme_seed", "groups" and "intended_traps" (the keys render() reads back).
PROMPT = """You are designing a single puzzle in the style of NYT Connections.
A Connections puzzle has:
- Exactly 16 distinct words or short phrases
- Sorted into 4 hidden groups of 4
- Each group has a tight, specific category label
- Difficulty bands: yellow (easiest, most direct), green (medium), blue (harder, often more abstract), purple (trickiest -- wordplay, double meanings, hidden patterns; e.g. "___ HOUSE": GREEN, ICE, COURT, FIRE)
- The CRITICAL feature: at least 2-3 words must plausibly fit a different group than where they actually go. These red herrings are what make the puzzle hard. Without them, the puzzle is trivial.
Generate ONE puzzle on a theme of your choice. Output strict JSON in this shape:
{
"theme_seed": "<one-line description of what inspired the puzzle>",
"groups": [
{"difficulty": "yellow", "category": "<tight category label>", "words": ["W1","W2","W3","W4"]},
{"difficulty": "green", "category": "<...>", "words": [...]},
{"difficulty": "blue", "category": "<...>", "words": [...]},
{"difficulty": "purple", "category": "<...>", "words": [...]}
],
"intended_traps": [
{"word": "<a word from the puzzle>", "actual_group": "yellow|green|blue|purple", "trap_group": "yellow|green|blue|purple", "reason": "<why it plausibly fits the trap group>"}
]
}
Rules:
- All 16 words must be distinct
- Categories must be tight enough that the right answer feels obviously right after the reveal
- intended_traps must list at least 2 genuine red-herring words
- Output ONLY the JSON object. No preamble, no markdown fences, no commentary.
"""
def call(model: str, prompt: str, temperature: float, timeout: int = 600):
    """Send one non-streaming generate request to Ollama and time it.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature placed in the request options.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        ``(elapsed_seconds, data)`` where ``data`` is the JSON-decoded
        response body.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on connection, HTTP or
        timeout failures, and ``ValueError`` (``json.JSONDecodeError``) when
        the body is not valid JSON. Callers are expected to handle these.
    """
    # NOTE on Gemma 4 settings (see ~/bin/gemma4-research/GOTCHAS.md):
    # - No format=json (infinite loop on gemma4:26b Q4)
    # - think=false for single-turn JSON pipelines (else thinking tokens eat budget)
    # - num_ctx >> 2048 default, num_predict >> 128 default
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {
            "temperature": temperature,
            "num_ctx": 8192,
            "num_predict": 4096,
        },
    }
    req = urllib.request.Request(
        OLLAMA,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments (NTP, DST) during a multi-minute generation.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        data = json.loads(r.read())
    return time.perf_counter() - t0, data
def extract_json(body: str):
    """Parse the JSON object embedded in a model response.

    Everything from the first ``{`` through the last ``}`` is handed to
    ``json.loads``; surrounding chatter or markdown fences are ignored.

    Raises:
        ValueError: if the response is empty, lacks either brace, or the
            sliced text is not valid JSON (``json.JSONDecodeError``).
    """
    if not body:
        raise ValueError("no JSON object delimiters in response")
    start = body.find("{")
    end = body.rfind("}")
    if start == -1 or end == -1:
        raise ValueError("no JSON object delimiters in response")
    # When the last "}" precedes the first "{", the slice is empty and
    # json.loads raises, which is the desired failure.
    return json.loads(body[start:end + 1])
def warm(model: str) -> None:
    """Issue one throwaway request so the model is loaded before timing starts.

    Logs the model tag to stderr first. The response is discarded; any
    exception from ``call`` propagates to the caller.
    """
    sys.stderr.write(f"[warm] {model}\n")
    sys.stderr.flush()
    call(model, "Reply with just the word OK.", temperature=0.1, timeout=300)
def run_model(model: str, n: int):
    """Generate ``n`` puzzles from ``model``, retrying each up to 3 times.

    Each retry raises the temperature by 0.1 over ``TEMPERATURE``. An attempt
    can fail either because the request itself failed (API error) or because
    the response contained no parseable JSON object (parse error).

    Returns:
        A list with one record per puzzle. Every record carries ``model``,
        ``i`` (1-based), ``dt``, ``ok``, ``attempts``, ``puzzle``,
        ``eval_count`` and ``prompt_eval_count``. On success ``puzzle`` is the
        parsed object. On failure ``puzzle`` is
        ``{"_parse_error": ..., "_raw": ...}`` describing the most recent
        response received. If no attempt ever received a response, the record
        additionally carries ``"error"`` with the last API exception, which
        is the key ``render`` uses to report API failures and to keep them
        out of the timing table.
    """
    out = []
    for i in range(1, n + 1):
        # Retry with temp-bump pattern from AI_Visualizer
        last_raw = ""
        last_dt = 0.0
        last_data = {}
        # Track the two failure kinds separately so an API error on a later
        # attempt cannot be paired with the raw text of an earlier response.
        parse_err = None
        api_err = None
        puzzle = None
        ok = False
        attempts = 0
        for attempt in range(3):
            attempts = attempt + 1
            temp = TEMPERATURE + attempt * 0.1
            print(f"[{model}] puzzle {i}/{n} attempt {attempts} (temp={temp:.1f})",
                  file=sys.stderr, flush=True)
            try:
                dt, data = call(model, PROMPT, temperature=temp)
            except Exception as e:  # retry boundary: any request failure is retried
                api_err = repr(e)
                print(f"[{model}] puzzle {i}/{n} API error: {api_err}",
                      file=sys.stderr, flush=True)
                continue
            last_dt, last_data = dt, data
            last_raw = data.get("response", "") or ""
            try:
                puzzle = extract_json(last_raw)
            except Exception as e:  # retry boundary: unparseable output is retried
                parse_err = repr(e)
                print(f"[{model}] puzzle {i}/{n} parse error: {parse_err}",
                      file=sys.stderr, flush=True)
                continue
            ok = True
            break

        record = {
            "model": model, "i": i, "dt": last_dt, "ok": ok,
            "attempts": attempts,
            "puzzle": puzzle,
            "eval_count": last_data.get("eval_count", 0),
            "prompt_eval_count": last_data.get("prompt_eval_count", 0),
        }
        if not ok:
            record["puzzle"] = {
                "_parse_error": parse_err if parse_err is not None else api_err,
                "_raw": last_raw[:3000],
            }
            if parse_err is None:
                # No response ever reached the parser: this is an API failure,
                # not a model-quality failure.
                record["error"] = api_err
        out.append(record)
    return out
def render(results, stamp: str) -> str:
    """Render bakeoff records as a Markdown report for hand grading.

    Args:
        results: Records as produced by ``run_model`` (dicts with ``model``,
            ``i``, ``dt``, ``ok``, ``puzzle``, ``eval_count`` and optionally
            ``error``).
        stamp: Timestamp string placed in the report title.

    Returns:
        The report text: setup notes, a per-model timing table (records that
        carry ``error`` are excluded), a per-model parse rate, then one
        section per record.

    The puzzle content is model output and may not match the requested shape,
    so groups, words and traps are coerced defensively: one malformed puzzle
    must not abort the whole report.
    """

    def cell(value) -> str:
        # Table cell text: stringify, escape pipes, and keep it on one row.
        return str(value).replace("|", "\\|").replace("\n", " ")

    def listify(value) -> list:
        # Falsy -> empty list; list/tuple -> list; any other scalar -> [scalar].
        if not value:
            return []
        return list(value) if isinstance(value, (list, tuple)) else [value]

    lines = [
        f"# Gemma 4 Generation Bakeoff -- {stamp}",
        "",
        "## Setup",
        f"- Ollama endpoint: `{OLLAMA}` (RTX 3090 Ti on the test host)",
        "- Other GPU workloads paused for the duration of the run",
        f"- Models: {', '.join(f'`{m}`' for m in MODELS)}",
        f"- {N_PER_MODEL} puzzles per model, base temperature {TEMPERATURE}",
        "- Gemma 4 settings (per `~/bin/gemma4-research/GOTCHAS.md`): `think=false`, "
        "`num_ctx=8192`, `num_predict=4096`. No `format=json` (infinite-loop bug). "
        "JSON extracted client-side via `body[body.find('{'):body.rfind('}')+1]`.",
        "- Up to 3 attempts per puzzle with temperature bumped +0.1 each retry "
        "(AI_Visualizer pattern). Reported metrics use the *successful* attempt.",
        "- One-shot, unaided generation. No critique pass, no example puzzle in prompt.",
        "",
        "## Timing",
        "",
        "| Model | n | avg s | avg tokens | tok/s |",
        "|---|---|---|---|---|",
    ]
    for m in MODELS:
        rs = [r for r in results if r["model"] == m and "error" not in r]
        if not rs:
            lines.append(f"| `{m}` | 0 | -- | -- | -- |")
            continue
        avg_s = sum(r["dt"] for r in rs) / len(rs)
        avg_tok = sum(r["eval_count"] for r in rs) / len(rs)
        toks = avg_tok / avg_s if avg_s else 0
        lines.append(f"| `{m}` | {len(rs)} | {avg_s:.1f} | {avg_tok:.0f} | {toks:.1f} |")

    lines += ["", "## JSON parse rate", ""]
    for m in MODELS:
        rs = [r for r in results if r["model"] == m]
        ok = sum(1 for r in rs if r.get("ok"))
        lines.append(f"- `{m}`: {ok}/{len(rs)} parsed cleanly")
    lines += [""]

    for r in results:
        head = f"## {r['model']} -- puzzle {r['i']}"
        lines += [head, ""]
        if "error" in r:
            lines += [f"_API error:_ `{r['error']}`", ""]
            continue
        if not r.get("ok"):
            err = r["puzzle"].get("_parse_error", "")
            raw = r["puzzle"].get("_raw", "")[:1500]
            # Use a fence longer than any backtick run in the raw text so a
            # response that itself contains ``` cannot close the block early.
            fence = "```"
            while fence in raw:
                fence += "`"
            lines += [f"_JSON parse failed:_ `{err}`", fence, raw, fence, ""]
            continue

        p = r["puzzle"] if isinstance(r["puzzle"], dict) else {}
        lines += [f"**Theme seed:** {p.get('theme_seed', '--')}", ""]
        lines += ["| Diff | Category | Words |", "|---|---|---|"]
        for g in listify(p.get("groups")):
            if not isinstance(g, dict):
                # Show the malformed entry rather than dropping or crashing.
                lines.append(f"| ? | ? | {cell(g)} |")
                continue
            words = ", ".join(cell(w) for w in listify(g.get("words")))
            cat = cell(g.get("category") or "?")
            lines.append(f"| {cell(g.get('difficulty', '?'))} | {cat} | {words} |")

        traps = listify(p.get("intended_traps"))
        lines += ["", f"**Claimed traps ({len(traps)}):**"]
        if not traps:
            lines.append("- _none claimed_")
        for t in traps:
            if not isinstance(t, dict):
                lines.append(f"- _malformed trap entry:_ `{t}`")
                continue
            lines.append(
                f"- `{t.get('word')}` (actually {t.get('actual_group')}, traps {t.get('trap_group')}): "
                f"{t.get('reason')}"
            )
        lines += ["", "_Grade:_ TODO", "", f"_dt={r['dt']:.1f}s, tokens={r['eval_count']}_", ""]
    return "\n".join(lines)
def main() -> None:
    """Run the bakeoff for every model and write the raw JSON and Markdown report.

    Outputs go to ``<PROJECT_ROOT>/docs/reference`` under timestamped names.
    Progress and output paths are logged to stderr; the Markdown path is the
    only thing printed to stdout.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-generation-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-generation-bakeoff-{stamp}.md"
    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, N_PER_MODEL))
        # Checkpoint after each model: if a later model fails to warm up or
        # the run is interrupted, the results gathered so far are on disk.
        raw_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
    # Final write also covers the case where MODELS is empty.
    raw_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)
    # Explicit UTF-8: the report embeds model-generated text, which may not be
    # encodable in the platform's default locale encoding.
    md_path.write_text(render(all_results, stamp), encoding="utf-8")
    print(f"md -> {md_path}", file=sys.stderr)
    # Final stdout: just the markdown path so callers can pipe.
    print(md_path)


if __name__ == "__main__":
    main()