docs: bootstrap repo with bakeoff results and game-mechanics idea bank

This repo opens with the design-discovery work completed before any product
code is written. Two model bakeoffs against gemma4:8b/26b/31b on a local
Ollama established that:

- Whole-puzzle generation in the Connections shape is unreliable on Gemma 4
  (gemma4:31b ~50% structural-pass, gemma4:26b ~20-30%). gemma4:31b is
  intentionally out of project scope, which leaves only the weaker 26b result
  and makes the generation route harder still.
- Atomic semantic-judging skills are reliable: 87.5%/93.75%/100% (8b/26b/31b)
  on JUDGE; *all three models* scored 10/10 on CREATIVE_ACCEPT — fair judging
  of player-INVENTED categories. That capability is the structural unlock over
  static, hand-curated word games, which cannot accept a grouping they did not
  anticipate.

The README contains the full writeup, the test bench, and a brainstormed
bank of 10 distinct game-mechanics ideas across the fast/medium/slow tempo
range, plus a primitives table for recombination.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mortdecai
2026-04-27 23:09:46 -04:00
commit 5a2a02e483
10 changed files with 4659 additions and 0 deletions
+236
View File
@@ -0,0 +1,236 @@
#!/usr/bin/env python3
"""Bakeoff: can Gemma 4 generate Connections-style structured puzzles?
Stress-tests unaided one-shot generation on gemma4:26b and gemma4:31b on a
local Ollama (point OLLAMA_HOST at your instance; default localhost:11434).
Output is graded by hand afterward against a rubric in the README:
overlap-traps, tight category labels, purple wordplay, blind anchor vs a
real human-curated puzzle.
"""
import json
import os
import sys
import time
import urllib.request
from datetime import datetime
from pathlib import Path
def _ollama_generate_url(host):
    """Build the Ollama ``/api/generate`` URL from an ``OLLAMA_HOST``-style value.

    Accepts a full base URL (``http://host:11434``) as well as the bare
    ``host:port`` form. A missing or empty value falls back to the local
    default, and a value without a scheme gets ``http://`` prepended, because
    urllib rejects scheme-less URLs.
    """
    base = (host or "").strip().rstrip("/") or "http://localhost:11434"
    if "://" not in base:
        base = f"http://{base}"
    return f"{base}/api/generate"


# Endpoint every generation request in this script is sent to.
OLLAMA = _ollama_generate_url(os.environ.get("OLLAMA_HOST"))
# Models exercised by main(), in run order.
MODELS = ["gemma4:26b", "gemma4:31b-it-q4_K_M"]
# Number of puzzles requested from each model.
N_PER_MODEL = 5
# Base sampling temperature; run_model() adds 0.1 per retry.
TEMPERATURE = 0.8
# Two directories above this file; main() writes under docs/reference here.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# The one generation prompt, sent unchanged for every puzzle request. It
# describes the puzzle shape, the four difficulty bands and the red-herring
# requirement, and pins the JSON layout (theme_seed, groups, intended_traps)
# that render() reads back out of the parsed response.
PROMPT = """You are designing a single puzzle in the style of NYT Connections.
A Connections puzzle has:
- Exactly 16 distinct words or short phrases
- Sorted into 4 hidden groups of 4
- Each group has a tight, specific category label
- Difficulty bands: yellow (easiest, most direct), green (medium), blue (harder, often more abstract), purple (trickiest -- wordplay, double meanings, hidden patterns; e.g. "___ HOUSE": GREEN, ICE, COURT, FIRE)
- The CRITICAL feature: at least 2-3 words must plausibly fit a different group than where they actually go. These red herrings are what make the puzzle hard. Without them, the puzzle is trivial.
Generate ONE puzzle on a theme of your choice. Output strict JSON in this shape:
{
"theme_seed": "<one-line description of what inspired the puzzle>",
"groups": [
{"difficulty": "yellow", "category": "<tight category label>", "words": ["W1","W2","W3","W4"]},
{"difficulty": "green", "category": "<...>", "words": [...]},
{"difficulty": "blue", "category": "<...>", "words": [...]},
{"difficulty": "purple", "category": "<...>", "words": [...]}
],
"intended_traps": [
{"word": "<a word from the puzzle>", "actual_group": "yellow|green|blue|purple", "trap_group": "yellow|green|blue|purple", "reason": "<why it plausibly fits the trap group>"}
]
}
Rules:
- All 16 words must be distinct
- Categories must be tight enough that the right answer feels obviously right after the reveal
- intended_traps must list at least 2 genuine red-herring words
- Output ONLY the JSON object. No preamble, no markdown fences, no commentary.
"""
def call(model: str, prompt: str, temperature: float, timeout: int = 600):
    """Send one non-streaming generation request to Ollama and time it.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature placed in the request options.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(elapsed_seconds, data)`` tuple, where ``data`` is the decoded
        JSON body of the response.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or ``json.loads``
        propagates; callers decide whether to retry.
    """
    # NOTE on Gemma 4 settings (see ~/bin/gemma4-research/GOTCHAS.md):
    # - No format=json (infinite loop on gemma4:26b Q4)
    # - think=false for single-turn JSON pipelines (else thinking tokens eat budget)
    # - num_ctx >> 2048 default, num_predict >> 128 default
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {
            "temperature": temperature,
            "num_ctx": 8192,
            "num_predict": 4096,
        },
    }
    # Supplying `data` makes urllib issue a POST.
    req = urllib.request.Request(
        OLLAMA,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted during a long request.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        data = json.loads(r.read())
    return time.perf_counter() - t0, data
def extract_json(body: str):
    """Parse the outermost ``{...}`` span found in a model response.

    Everything before the first ``{`` and after the last ``}`` is discarded,
    and the remainder is handed to ``json.loads``.

    Raises:
        ValueError: if the response is empty or lacks either brace. A
            malformed span surfaces as ``json.JSONDecodeError``.
    """
    has_both_braces = bool(body) and "{" in body and "}" in body
    if not has_both_braces:
        raise ValueError("no JSON object delimiters in response")

    first = body.find("{")
    last = body.rfind("}")
    return json.loads(body[first:last + 1])
def warm(model: str) -> None:
    """Send one throwaway prompt to *model* before the measured runs.

    Logs a ``[warm]`` line to stderr and discards the response; presumably
    this keeps model load time out of the first timed call.
    """
    notice = f"[warm] {model}"
    print(notice, file=sys.stderr, flush=True)
    warmup_prompt = "Reply with just the word OK."
    call(model, warmup_prompt, temperature=0.1, timeout=300)
def run_model(model: str, n: int):
    """Request *n* puzzles from *model*, retrying each up to three times.

    Every retry raises the temperature by 0.1 over ``TEMPERATURE``. A retry is
    triggered either by a failed request or by a response that does not
    contain parseable JSON.

    Args:
        model: Ollama model tag.
        n: Number of puzzles to generate.

    Returns:
        One dict per puzzle with keys ``model``, ``i``, ``dt``, ``ok``,
        ``attempts``, ``puzzle``, ``eval_count`` and ``prompt_eval_count``.
        On failure ``puzzle`` holds ``_parse_error`` and ``_raw`` instead of
        the parsed object. When no attempt produced a response at all, the
        record additionally carries ``error``, which is the key the report
        renderer checks to label the row as an API error and to keep it out
        of the timing table.
    """
    out = []
    for i in range(1, n + 1):
        last_raw = ""
        last_dt = 0.0
        last_data = {}
        last_err = None
        puzzle = None
        ok = False
        got_response = False
        attempts = 0
        # Retry with temp-bump pattern from AI_Visualizer
        for attempt in range(3):
            attempts = attempt + 1
            temp = TEMPERATURE + attempt * 0.1
            print(f"[{model}] puzzle {i}/{n} attempt {attempts} (temp={temp:.1f})",
                  file=sys.stderr, flush=True)
            try:
                dt, data = call(model, PROMPT, temperature=temp)
            except Exception as e:
                # Deliberately broad: any request failure just means "retry".
                last_err = repr(e)
                continue
            got_response = True
            last_dt, last_data = dt, data
            last_raw = data.get("response", "") or ""
            try:
                puzzle = extract_json(last_raw)
            except Exception as e:
                last_err = repr(e)
                continue
            ok = True
            break

        record = {
            "model": model, "i": i, "dt": last_dt, "ok": ok,
            "attempts": attempts,
            "puzzle": puzzle if ok else {"_parse_error": last_err, "_raw": last_raw[:3000]},
            "eval_count": last_data.get("eval_count", 0),
            "prompt_eval_count": last_data.get("prompt_eval_count", 0),
        }
        if not got_response:
            # Nothing came back on any attempt: this is a transport failure,
            # not a parse failure, and there is no timing to report.
            record["error"] = last_err
        out.append(record)
    return out
def _md_cell(value, default="?"):
    """Return a model-supplied value as single-line text safe for a table cell."""
    if value is None or value == "":
        return default
    flat = str(value).replace("\r", " ").replace("\n", " ")
    return flat.replace("|", "\\|")


def _md_items(value):
    """Comma-join model-supplied list items, tolerating non-list or non-string data."""
    if value is None or value == "":
        return ""
    if not isinstance(value, (list, tuple)):
        value = [value]
    return ", ".join(_md_cell(item) for item in value)


def render(results, stamp: str) -> str:
    """Build the Markdown report for one bakeoff run.

    The puzzle content comes straight from the model, so every field is
    treated as untrusted: wrong types are rendered as text instead of raising,
    which would otherwise lose the whole report.

    Args:
        results: Result dicts as produced by ``run_model``.
        stamp: Timestamp string shown in the report title.

    Returns:
        The full report as one newline-joined string.
    """
    lines = [
        f"# Gemma 4 Generation Bakeoff -- {stamp}",
        "",
        "## Setup",
        f"- Ollama endpoint: `{OLLAMA}` (RTX 3090 Ti on the test host)",
        "- Other GPU workloads paused for the duration of the run",
        f"- Models: {', '.join(f'`{m}`' for m in MODELS)}",
        f"- {N_PER_MODEL} puzzles per model, base temperature {TEMPERATURE}",
        "- Gemma 4 settings (per `~/bin/gemma4-research/GOTCHAS.md`): `think=false`, "
        "`num_ctx=8192`, `num_predict=4096`. No `format=json` (infinite-loop bug). "
        "JSON extracted client-side via `body[body.find('{'):body.rfind('}')+1]`.",
        "- Up to 3 attempts per puzzle with temperature bumped +0.1 each retry "
        "(AI_Visualizer pattern). Reported metrics use the *successful* attempt.",
        "- One-shot, unaided generation. No critique pass, no example puzzle in prompt.",
        "",
        "## Timing",
        "",
        "| Model | n | avg s | avg tokens | tok/s |",
        "|---|---|---|---|---|",
    ]
    for m in MODELS:
        # Only rows that carry a real measurement: a record whose every
        # request failed has dt == 0 and would drag the averages down.
        rs = [
            r for r in results
            if r["model"] == m and "error" not in r and r.get("dt")
        ]
        if not rs:
            lines.append(f"| `{m}` | 0 | -- | -- | -- |")
            continue
        avg_s = sum(r["dt"] for r in rs) / len(rs)
        avg_tok = sum(r["eval_count"] for r in rs) / len(rs)
        toks = avg_tok / avg_s if avg_s else 0
        lines.append(f"| `{m}` | {len(rs)} | {avg_s:.1f} | {avg_tok:.0f} | {toks:.1f} |")

    lines += ["", "## JSON parse rate", ""]
    for m in MODELS:
        rs = [r for r in results if r["model"] == m]
        ok = sum(1 for r in rs if r.get("ok"))
        lines.append(f"- `{m}`: {ok}/{len(rs)} parsed cleanly")
    lines += [""]

    for r in results:
        head = f"## {r['model']} -- puzzle {r['i']}"
        lines += [head, ""]
        if "error" in r:
            lines += [f"_API error:_ `{r['error']}`", ""]
            continue
        if not r.get("ok"):
            err = r["puzzle"].get("_parse_error", "")
            raw = r["puzzle"].get("_raw", "")[:1500]
            lines += [f"_JSON parse failed:_ `{err}`", "```", raw, "```", ""]
            continue

        p = r["puzzle"]
        lines += [f"**Theme seed:** {p.get('theme_seed', '--')}", ""]
        lines += ["| Diff | Category | Words |", "|---|---|---|"]
        groups = p.get("groups")
        if not isinstance(groups, list):
            groups = []
        for g in groups:
            if not isinstance(g, dict):
                # Malformed entry: show it verbatim rather than crash.
                lines.append(f"| ? | ? | {_md_cell(g)} |")
                continue
            words = _md_items(g.get("words"))
            cat = _md_cell(g.get("category"))
            lines.append(f"| {_md_cell(g.get('difficulty'))} | {cat} | {words} |")

        traps = p.get("intended_traps")
        if not isinstance(traps, list):
            traps = []
        lines += ["", f"**Claimed traps ({len(traps)}):**"]
        if not traps:
            lines.append("- _none claimed_")
        for t in traps:
            if not isinstance(t, dict):
                lines.append(f"- `{t}`")
                continue
            lines.append(
                f"- `{t.get('word')}` (actually {t.get('actual_group')}, traps {t.get('trap_group')}): "
                f"{t.get('reason')}"
            )
        lines += ["", "_Grade:_ TODO", "", f"_dt={r['dt']:.1f}s, tokens={r['eval_count']}_", ""]
    return "\n".join(lines)
def main() -> None:
    """Run the bakeoff for every model and write the raw and Markdown outputs.

    Both files go to ``<PROJECT_ROOT>/docs/reference`` and share a timestamp
    in their names. Progress messages go to stderr; the only stdout line is
    the Markdown path.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-generation-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-generation-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, N_PER_MODEL))

    # Explicit UTF-8: the report embeds model text verbatim, and the locale
    # default encoding may be unable to represent it.
    raw_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)
    md_path.write_text(render(all_results, stamp), encoding="utf-8")
    print(f"md -> {md_path}", file=sys.stderr)
    # Final stdout: just the markdown path so callers can pipe.
    print(md_path)


if __name__ == "__main__":
    main()
+431
View File
@@ -0,0 +1,431 @@
#!/usr/bin/env python3
"""Bakeoff: Gemma 4's atomic semantic-matching abilities.
Three test types, all with hand-labeled ground truth:
- JUDGE: given (category, 4 words), does Gemma correctly say yes/no on whether
the words tightly fit?
- CREATE: given a category, does Gemma produce 4 words that tightly fit it?
- CREATIVE_ACCEPT: given 4 words and a player-proposed category that may or
may not be the puzzle's intended one, does Gemma fairly judge validity?
This is the test of whether "fuzzy / creative-grouping acceptance" -- the
twist from IDEA.md that a static NYT format structurally can't do -- is
feasible.
Models tested: gemma4:26b, gemma4:31b-it-q4_K_M, gemma4:latest (8B). 8B is
included because judging runs per player guess in any live design; if 8B is
reliable enough for JUDGE, the per-guess economics get a lot better.
Settings (well-known Gemma-4-on-Ollama gotchas): think=false, num_ctx=4096,
num_predict=512, no format=json (server-side JSON enforcer hangs on 26b Q4),
JSON extracted client-side. Point OLLAMA_HOST at your instance; default
localhost:11434.
"""
import json
import os
import sys
import time
import urllib.request
from datetime import datetime
from pathlib import Path
def _ollama_generate_url(host):
    """Build the Ollama ``/api/generate`` URL from an ``OLLAMA_HOST``-style value.

    Accepts a full base URL (``http://host:11434``) as well as the bare
    ``host:port`` form. A missing or empty value falls back to the local
    default, and a value without a scheme gets ``http://`` prepended, because
    urllib rejects scheme-less URLs.
    """
    base = (host or "").strip().rstrip("/") or "http://localhost:11434"
    if "://" not in base:
        base = f"http://{base}"
    return f"{base}/api/generate"


# Endpoint every request in this script is sent to.
OLLAMA = _ollama_generate_url(os.environ.get("OLLAMA_HOST"))
# Models exercised by main(), in run order.
MODELS = ["gemma4:latest", "gemma4:26b", "gemma4:31b-it-q4_K_M"]
# Two directories above this file; main() writes under docs/reference here.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
TEMPERATURE = 0.2  # judging is a low-creativity task; we want consistency
# ---------- prompts ----------
# One template per test type, filled in by render_prompt() via str.format().
# Single-brace fields ({category}, {w1}..{w4}, {player_category}) are the
# substitution points; the doubled braces around the JSON examples are
# str.format() escapes that come out as literal { and }.

# JUDGE: strict yes/no on whether all four words fit the stated category.
# score() reads the "verdict" field of the reply.
JUDGE_PROMPT = """You are evaluating whether four words tightly fit a given semantic category, in the style of NYT Connections.
Category: {category}
Words: {w1}, {w2}, {w3}, {w4}
Do ALL FOUR words clearly fit this category? Be strict -- if even one word doesn't fit, the answer is "no". Generic loose connections do not count.
Output strict JSON, no preamble or fences:
{{"verdict": "yes" or "no", "reason": "<one short sentence>", "misfit_words": ["<any words that don't fit>"]}}
"""
# CREATE: the model proposes four words for a category; graded by hand.
CREATE_PROMPT = """You are creating a tight 4-word group in the style of NYT Connections.
Category: {category}
Produce EXACTLY four words or short phrases that tightly fit this category. Each word must clearly belong; vague or loosely-related words are not acceptable.
Output strict JSON, no preamble or fences:
{{"words": ["W1", "W2", "W3", "W4"], "reason": "<one short sentence on how all four fit>"}}
"""
# CREATIVE_ACCEPT: fair yes/no on a category the player invented.
# score() reads the "valid" field of the reply.
CREATIVE_ACCEPT_PROMPT = """You are judging a Connections-style puzzle where the player has proposed their OWN category for four words. Their category may differ from the puzzle's intended one, but it might still be a valid alternative -- if all four words plausibly fit the player's category, accept it.
Words: {w1}, {w2}, {w3}, {w4}
Player's proposed category: "{player_category}"
Do all four words plausibly fit the player's category? Be fair: a player-creative-but-valid grouping should be accepted. But if even one word genuinely doesn't fit, reject it.
Output strict JSON, no preamble or fences:
{{"valid": "yes" or "no", "reason": "<one short sentence>", "weak_words": ["<any words that don't really fit>"]}}
"""
# ---------- test bank ----------
# Each case has hand-labeled ground truth. The "gt" field is what a thoughtful
# human grader would say (yes/no for JUDGE and CREATIVE_ACCEPT). For CREATE,
# `gt_check` describes what a passing answer should look like.
#
# Field notes:
# - "gt" may also be "borderline" on CREATIVE_ACCEPT cases; score() marks
#   those BORDERLINE instead of PASS/FAIL and leaves them to a human.
# - "gt_misfit" and "gt_note" are shown in the report next to the ground
#   truth; they do not affect auto-scoring.
# - "gt_weak" is annotation only: neither score() nor render() reads it.
CASES = [
    # ---- JUDGE: clear yes (tight fit) ----
    {"id": "judge-y1", "type": "JUDGE", "category": "Types of trees",
     "words": ["OAK", "MAPLE", "BIRCH", "PINE"], "gt": "yes"},
    {"id": "judge-y2", "type": "JUDGE", "category": "Greek letters",
     "words": ["ALPHA", "BETA", "GAMMA", "DELTA"], "gt": "yes"},
    {"id": "judge-y3", "type": "JUDGE", "category": "Days of the week",
     "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"], "gt": "yes"},
    {"id": "judge-y4", "type": "JUDGE", "category": "Synonyms for 'happy'",
     "words": ["JOYFUL", "GLAD", "CHEERFUL", "ELATED"], "gt": "yes"},
    {"id": "judge-y5", "type": "JUDGE", "category": "___ HOUSE (compound words)",
     "words": ["GREEN", "ICE", "FIRE", "COURT"], "gt": "yes"},
    {"id": "judge-y6", "type": "JUDGE", "category": "Words that follow COLD",
     "words": ["SHOULDER", "FRONT", "SNAP", "TURKEY"], "gt": "yes"},
    {"id": "judge-y7", "type": "JUDGE", "category": "Verbs meaning 'to move quickly'",
     "words": ["DART", "BOLT", "RUSH", "FLY"], "gt": "yes"},
    {"id": "judge-y8", "type": "JUDGE", "category": "Synonyms for 'idea'",
     "words": ["NOTION", "CONCEPT", "THOUGHT", "INKLING"], "gt": "yes"},
    # ---- JUDGE: clear no (one or more words don't fit) ----
    {"id": "judge-n1", "type": "JUDGE", "category": "Types of trees",
     "words": ["OAK", "MAPLE", "BIRCH", "CARROT"], "gt": "no",
     "gt_misfit": ["CARROT"]},
    {"id": "judge-n2", "type": "JUDGE", "category": "Greek letters",
     "words": ["ALPHA", "BETA", "GAMMA", "CYRILLIC"], "gt": "no",
     "gt_misfit": ["CYRILLIC"]},
    {"id": "judge-n3", "type": "JUDGE", "category": "Synonyms for 'happy'",
     "words": ["JOYFUL", "GLAD", "SAD", "ELATED"], "gt": "no",
     "gt_misfit": ["SAD"]},
    {"id": "judge-n4", "type": "JUDGE", "category": "Days of the week",
     "words": ["MONDAY", "JANUARY", "SUNDAY", "WEDNESDAY"], "gt": "no",
     "gt_misfit": ["JANUARY"]},
    {"id": "judge-n5", "type": "JUDGE", "category": "Body parts",
     "words": ["ARM", "LEG", "EYE", "NIGHT"], "gt": "no",
     "gt_misfit": ["NIGHT"]},
    {"id": "judge-n6", "type": "JUDGE", "category": "Types of birds",
     "words": ["CRANE", "SWALLOW", "BAT", "MOSQUITO"], "gt": "no",
     "gt_misfit": ["BAT", "MOSQUITO"]},
    {"id": "judge-n7", "type": "JUDGE", "category": "Things that are red",
     "words": ["APPLE", "BLUE", "ROSE", "GRASS"], "gt": "no",
     "gt_misfit": ["BLUE", "GRASS"]},
    {"id": "judge-n8", "type": "JUDGE", "category": "Words that follow COLD",
     "words": ["SHOULDER", "FRONT", "PIZZA", "MOUNTAIN"], "gt": "no",
     "gt_misfit": ["PIZZA", "MOUNTAIN"]},
    # ---- CREATE: easy categories ----
    {"id": "create-e1", "type": "CREATE", "category": "Types of trees",
     "gt_check": "Four valid tree species; e.g. OAK, MAPLE, BIRCH, PINE."},
    {"id": "create-e2", "type": "CREATE", "category": "Greek letters",
     "gt_check": "Four genuine Greek letters."},
    {"id": "create-e3", "type": "CREATE", "category": "Synonyms for 'angry'",
     "gt_check": "Four words that all genuinely mean angry/furious."},
    {"id": "create-e4", "type": "CREATE", "category": "Days of the week",
     "gt_check": "Four of the seven weekday names, no months or other words."},
    # ---- CREATE: medium (compound / polysemy) ----
    {"id": "create-m1", "type": "CREATE", "category": "___ STORM (compound words ending in STORM)",
     "gt_check": "Four words that each form a real compound or fixed phrase with STORM (e.g. SAND, BRAIN, THUNDER, SNOW)."},
    {"id": "create-m2", "type": "CREATE", "category": "Synonyms for 'small'",
     "gt_check": "Four words that all genuinely mean small."},
    {"id": "create-m3", "type": "CREATE", "category": "Words that follow BLUE",
     "gt_check": "Four words that each form a real compound with BLUE (e.g. BERRY, BIRD, PRINT, BELL, GRASS)."},
    {"id": "create-m4", "type": "CREATE", "category": "Things that can be 'broken'",
     "gt_check": "Four words that each form a real fixed phrase with 'broken' (heart, record, law, promise, etc.)."},
    # ---- CREATE: hard (wordplay / tight constraint) ----
    {"id": "create-h1", "type": "CREATE",
     "category": "Words that are homophones of body parts but spelled differently (e.g. HARE = hair, MUSSEL = muscle)",
     "gt_check": "Four words that each sound like a body part but are spelled differently. Valid examples: HARE (hair), MUSSEL (muscle), HEAL (heel), SOUL (sole), AYE/EYE-spelled-otherwise. EYE and HEEL alone do NOT count -- those are the body parts themselves, not homophones of them."},
    {"id": "create-h2", "type": "CREATE",
     "category": "Words that contain a body part as a substring (e.g. HEARTBEAT contains HEART)",
     "gt_check": "Four words that each contain a body part anywhere inside them. Valid examples: HEARTH (HEART), CHESTNUT (CHEST), EARTH (EAR), HEADACHE (HEAD)."},
    # ---- CREATIVE_ACCEPT: player's grouping is genuinely valid ----
    {"id": "ca-y1", "type": "CREATIVE_ACCEPT",
     "words": ["SCALE", "MOUNT", "ASCEND", "CLIMB"],
     "player_category": "Verbs for going up", "gt": "yes"},
    {"id": "ca-y2", "type": "CREATIVE_ACCEPT",
     "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
     "player_category": "Fruits", "gt": "yes"},
    {"id": "ca-y3", "type": "CREATIVE_ACCEPT",
     "words": ["WHIP", "NUT", "CODE", "SMILE"],
     "player_category": "Things you can crack", "gt": "yes"},
    {"id": "ca-y4", "type": "CREATIVE_ACCEPT",
     "words": ["BAT", "BALL", "GLOVE", "MITT"],
     "player_category": "Baseball equipment", "gt": "yes"},
    {"id": "ca-y5", "type": "CREATIVE_ACCEPT",
     "words": ["MARS", "VENUS", "MERCURY", "JUPITER"],
     "player_category": "Roman gods", "gt": "yes"},
    # ---- CREATIVE_ACCEPT: player's grouping is wrong ----
    {"id": "ca-n1", "type": "CREATIVE_ACCEPT",
     "words": ["OAK", "MAPLE", "BIRCH", "PINE"],
     "player_category": "Furniture brands", "gt": "no"},
    {"id": "ca-n2", "type": "CREATIVE_ACCEPT",
     "words": ["ALPHA", "BETA", "GAMMA", "DELTA"],
     "player_category": "Words meaning 'small'", "gt": "no"},
    {"id": "ca-n3", "type": "CREATIVE_ACCEPT",
     "words": ["BAT", "BALL", "GLOVE", "MITT"],
     "player_category": "Things worn on your hand", "gt": "no",
     "gt_weak": ["BAT", "BALL"]},
    {"id": "ca-n4", "type": "CREATIVE_ACCEPT",
     "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"],
     "player_category": "Months of the year", "gt": "no"},
    {"id": "ca-n5", "type": "CREATIVE_ACCEPT",
     "words": ["WHIP", "NUT", "CODE", "SMILE"],
     "player_category": "Things found in a kitchen", "gt": "no",
     "gt_weak": ["CODE", "SMILE"]},
    # ---- CREATIVE_ACCEPT: borderline (deliberately ambiguous) ----
    {"id": "ca-b1", "type": "CREATIVE_ACCEPT",
     "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
     "player_category": "Tech/phone brands", "gt": "borderline",
     "gt_note": "APPLE and BLACKBERRY clearly are tech brands; ORANGE is a EU/UK telecom carrier (defensible); KIWI is not a tech brand. Strict grader says no; lenient grader might accept ORANGE."},
    {"id": "ca-b2", "type": "CREATIVE_ACCEPT",
     "words": ["SHARP", "FLAT", "NATURAL", "KEY"],
     "player_category": "Real estate listing terms", "gt": "borderline",
     "gt_note": "FLAT (UK apartment) and KEY (keys) connect; SHARP (a sharp property) and NATURAL (natural light?) are weak. Strict no; creative yes."},
]
# ---------- runner ----------
def call(model, prompt, temperature=TEMPERATURE, timeout=300):
    """Send one non-streaming generation request to Ollama and time it.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature placed in the request options.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(elapsed_seconds, data)`` tuple, where ``data`` is the decoded
        JSON body of the response.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or ``json.loads``
        propagates; callers decide whether to retry.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {"temperature": temperature, "num_ctx": 4096, "num_predict": 512},
    }
    # Supplying `data` makes urllib issue a POST.
    req = urllib.request.Request(
        OLLAMA, data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted during the request.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        data = json.loads(r.read())
    return time.perf_counter() - t0, data
def extract_json(body):
    """Parse the outermost ``{...}`` span found in a model response.

    Text before the first ``{`` and after the last ``}`` is ignored. Raises
    ``ValueError`` when the response is empty or lacks either brace; a
    malformed span surfaces as ``json.JSONDecodeError``.
    """
    has_both_braces = bool(body) and "{" in body and "}" in body
    if not has_both_braces:
        raise ValueError("no JSON braces in response")

    first = body.find("{")
    last = body.rfind("}")
    return json.loads(body[first:last + 1])
def render_prompt(case):
    """Fill in the prompt template that matches ``case["type"]``.

    JUDGE and CREATIVE_ACCEPT cases supply their first four words; CREATE
    cases supply only the category.

    Raises:
        ValueError: for any other case type, carrying that type as its
            argument.
    """
    kind = case["type"]

    if kind == "CREATE":
        return CREATE_PROMPT.format(category=case["category"])

    if kind == "JUDGE":
        category = case["category"]
        words = case["words"]
        return JUDGE_PROMPT.format(
            category=category,
            w1=words[0], w2=words[1], w3=words[2], w4=words[3],
        )

    if kind != "CREATIVE_ACCEPT":
        raise ValueError(case["type"])

    words = case["words"]
    return CREATIVE_ACCEPT_PROMPT.format(
        w1=words[0], w2=words[1], w3=words[2], w4=words[3],
        player_category=case["player_category"],
    )
def warm(model):
    """Send one throwaway prompt to *model* before the measured runs.

    Logs a ``[warm]`` line to stderr and discards the response; presumably
    this keeps model load time out of the first timed call.
    """
    notice = f"[warm] {model}"
    print(notice, file=sys.stderr, flush=True)
    warmup_prompt = "Reply with the word OK only."
    call(model, warmup_prompt, temperature=0.1, timeout=300)
def run_model(model, cases):
    """Run every case against *model*, allowing three attempts per case.

    Each attempt after the first raises the temperature by 0.1 over
    ``TEMPERATURE``. An attempt counts as failed when the request raises or
    the response contains no parseable JSON.

    Returns:
        One dict per case with keys ``case_id``, ``type``, ``model``, ``dt``,
        ``eval_count``, ``ok``, ``parsed``, ``raw``, ``error`` and ``case``.
        ``raw`` is kept (truncated to 1500 characters) only when nothing
        parsed.
    """
    results = []
    for case in cases:
        prompt = render_prompt(case)
        error = None
        answer = None
        elapsed = 0.0
        n_tokens = 0
        text = ""

        for attempt_no in (1, 2, 3):
            temp = TEMPERATURE + (attempt_no - 1) * 0.1
            print(f"[{model}] {case['id']} attempt {attempt_no} (temp={temp:.1f})",
                  file=sys.stderr, flush=True)
            try:
                dt, data = call(model, prompt, temperature=temp)
            except Exception as exc:
                error = repr(exc)
                continue

            # Remember the latest response, even if it turns out unparseable.
            elapsed = dt
            n_tokens = data.get("eval_count", 0)
            text = data.get("response", "") or ""
            try:
                answer = extract_json(text)
            except Exception as exc:
                error = repr(exc)
            else:
                error = None
                break

        results.append({
            "case_id": case["id"],
            "type": case["type"],
            "model": model,
            "dt": elapsed,
            "eval_count": n_tokens,
            "ok": answer is not None,
            "parsed": answer,
            "raw": None if answer is not None else text[:1500],
            "error": error,
            "case": case,
        })
    return results
def _verdict_text(value):
    """Normalise a model-supplied yes/no field to lowercase text ('' if null)."""
    if value is None:
        return ""
    # The model may emit a non-string (e.g. JSON true); str() keeps that from
    # raising, and the result simply fails the comparison with "yes"/"no".
    return str(value).strip().lower()


def score(results):
    """Auto-score against ground truth where possible.

    Sets ``r["score"]`` in place on every result and returns the same list:

    - ``PARSE_FAIL``: no JSON could be parsed from the response.
    - ``PASS`` / ``FAIL``: JUDGE ("verdict") and CREATIVE_ACCEPT ("valid")
      answers compared, case- and whitespace-insensitively, with the case's
      ``gt``. Anything other than the expected text is a FAIL.
    - ``BORDERLINE``: CREATIVE_ACCEPT cases whose ``gt`` is "borderline".
    - ``MANUAL``: CREATE cases, graded by a human against ``gt_check``.

    Results of any other case type are left without a score.
    """
    for r in results:
        c = r["case"]
        if not r["ok"]:
            r["score"] = "PARSE_FAIL"
            continue
        p = r["parsed"]
        if c["type"] == "JUDGE":
            v = _verdict_text(p.get("verdict"))
            r["score"] = "PASS" if v == c["gt"] else "FAIL"
        elif c["type"] == "CREATIVE_ACCEPT":
            if c["gt"] == "borderline":
                r["score"] = "BORDERLINE"  # human grades these
            else:
                v = _verdict_text(p.get("valid"))
                r["score"] = "PASS" if v == c["gt"] else "FAIL"
        elif c["type"] == "CREATE":
            r["score"] = "MANUAL"  # human grades these against gt_check
    return results
def _text(value):
    """Coerce a model-supplied JSON value to display text ('' for null)."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def _listing(value):
    """Comma-join a model-supplied list, tolerating a bare value or non-string items."""
    if not value:
        return ""
    if isinstance(value, (list, tuple)):
        return ", ".join(_text(item) for item in value)
    return _text(value)


def _cell(text, limit):
    """Make *text* safe for one Markdown table cell, capped at *limit* characters."""
    flat = text.replace("\r", " ").replace("\n", " ")
    return flat.replace("|", "\\|")[:limit]


def render(results):
    """Build the Markdown report for one scored bakeoff run.

    The report holds a setup section, a per-model summary table, and one
    subsection per test case showing each model's answer. Fields taken from
    model output are treated as untrusted: null, non-string or multi-line
    values are rendered as text instead of raising or breaking a table row.

    Args:
        results: Result dicts from ``run_model`` after ``score`` has run.

    Returns:
        The full report as one newline-joined string.
    """
    by_model = {}
    first_result = {}  # (model, case_id) -> first matching result
    for r in results:
        by_model.setdefault(r["model"], []).append(r)
        first_result.setdefault((r["model"], r["case_id"]), r)

    lines = [f"# Gemma 4 Semantic Bakeoff -- {datetime.now().strftime('%Y-%m-%d %H:%M')}", ""]
    lines += [
        "## Setup",
        f"- Host: steel141 (RTX 3090 Ti) `{OLLAMA}`",
        f"- Models: {', '.join('`'+m+'`' for m in MODELS)}",
        f"- Temperature {TEMPERATURE} (raised +0.1 per retry on JSON parse fail, max 3 attempts)",
        "- think=false, num_ctx=4096, num_predict=512, no format=json (per gemma4-research/GOTCHAS.md)",
        f"- {len(CASES)} test cases: "
        f"{sum(1 for c in CASES if c['type']=='JUDGE')} JUDGE, "
        f"{sum(1 for c in CASES if c['type']=='CREATE')} CREATE, "
        f"{sum(1 for c in CASES if c['type']=='CREATIVE_ACCEPT')} CREATIVE_ACCEPT",
        "- Ground truth hand-labeled inline in `scripts/gemma-semantic-bakeoff.py`",
        "",
    ]

    # ---- per-model summaries ----
    lines += ["## Auto-scored summary", ""]
    lines += ["| Model | JUDGE pass | CREATIVE_ACCEPT pass | parse fails | avg s |", "|---|---|---|---|---|"]
    for m in MODELS:
        rs = by_model.get(m, [])
        if not rs:
            lines.append(f"| `{m}` | - | - | - | - |")
            continue
        j_pass = sum(1 for r in rs if r["case"]["type"] == "JUDGE" and r.get("score") == "PASS")
        j_n = sum(1 for r in rs if r["case"]["type"] == "JUDGE")
        c_pass = sum(1 for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT" and r.get("score") == "PASS")
        # Borderline cases are human-graded, so they are not in the denominator.
        c_n = sum(1 for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT" and r["case"].get("gt") != "borderline")
        parse_fail = sum(1 for r in rs if not r["ok"])
        # Average only over cases that were actually timed: a case whose
        # every request failed keeps dt == 0 and would skew the mean.
        timed = [r["dt"] for r in rs if r["dt"]]
        avg_dt = sum(timed) / len(timed) if timed else 0.0
        lines.append(f"| `{m}` | {j_pass}/{j_n} | {c_pass}/{c_n} | {parse_fail} | {avg_dt:.1f} |")
    lines += [""]

    # ---- by case-type, full breakdown ----
    for tname in ["JUDGE", "CREATE", "CREATIVE_ACCEPT"]:
        lines += [f"## {tname}", ""]
        cases_of_type = [c for c in CASES if c["type"] == tname]
        for case in cases_of_type:
            lines += [f"### {case['id']}", ""]
            if tname == "JUDGE":
                lines += [
                    f"- Category: `{case['category']}`",
                    f"- Words: {', '.join('`'+w+'`' for w in case['words'])}",
                    f"- Ground truth: **{case['gt']}**" + (
                        f" (misfit: {', '.join(case.get('gt_misfit', []))})" if case.get("gt_misfit") else ""),
                    "",
                ]
            elif tname == "CREATE":
                lines += [
                    f"- Category: `{case['category']}`",
                    f"- Quality bar: {case['gt_check']}",
                    "",
                ]
            else:  # CREATIVE_ACCEPT
                lines += [
                    f"- Words: {', '.join('`'+w+'`' for w in case['words'])}",
                    f"- Player's category: `\"{case['player_category']}\"`",
                    f"- Ground truth: **{case['gt']}**" + (
                        f" -- {case.get('gt_note', '')}" if case.get("gt_note") else ""),
                    "",
                ]
            lines += ["| Model | Verdict | Reason | Score | dt |", "|---|---|---|---|---|"]
            for m in MODELS:
                r = first_result.get((m, case["id"]))
                if r is None:
                    lines.append(f"| `{m}` | - | - | - | - |")
                    continue
                if not r["ok"]:
                    lines.append(f"| `{m}` | _parse fail_ | `{(r.get('error') or '')[:60]}` | PARSE_FAIL | {r['dt']:.1f}s |")
                    continue
                p = r["parsed"]
                reason = _text(p.get("reason"))
                extra = ""
                if tname == "JUDGE":
                    v = p.get("verdict", "?")
                    if p.get("misfit_words"):
                        extra = f" (misfit: {_listing(p['misfit_words'])})"
                elif tname == "CREATE":
                    v = _listing(p.get("words"))[:80]
                else:
                    v = p.get("valid", "?")
                    if p.get("weak_words"):
                        extra = f" (weak: {_listing(p['weak_words'])})"
                reason_short = _cell(reason + extra, 120)
                v_clean = _cell(str(v), 80)
                lines.append(f"| `{m}` | {v_clean} | {reason_short} | {r.get('score', '?')} | {r['dt']:.1f}s |")
            lines += [""]
    return "\n".join(lines)
def main():
    """Run every case on every model, score the results, and write both outputs.

    The raw JSON and the Markdown report go to
    ``<PROJECT_ROOT>/docs/reference`` and share a timestamp in their names.
    Progress messages go to stderr; the only stdout line is the Markdown
    path.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-semantic-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-semantic-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, CASES))
    score(all_results)

    # save raw without the case dict redundantly ("case_id" is already a
    # top-level key of every result, so the case can still be looked up)
    raw = [{k: v for k, v in r.items() if k != "case"} for r in all_results]
    # Explicit UTF-8: the report embeds model text verbatim, and the locale
    # default encoding may be unable to represent it.
    raw_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)
    md_path.write_text(render(all_results), encoding="utf-8")
    print(f"md -> {md_path}", file=sys.stderr)
    print(md_path)


if __name__ == "__main__":
    main()