docs: bootstrap repo with bakeoff results and game-mechanics idea bank

This repo opens with the design-discovery work completed before any product code is written. Two model bakeoffs against gemma4:8b/26b/31b on a local Ollama instance established that:

- Whole-puzzle generation in the Connections shape is unreliable on Gemma 4 (gemma4:31b ~50% structural-pass, gemma4:26b ~20-30%); 31b is intentionally out of project scope, so the generation route is harder still.
- Atomic semantic-judging skills are reliable: 87.5%/93.75%/100% (8b/26b/31b) on JUDGE; *all three models* scored 10/10 on CREATIVE_ACCEPT — fair judging of player-INVENTED categories. That is the structural unlock vs. static hand-curated word games.

The README contains the full writeup, the test bench, and a brainstormed bank of 10 distinct game-mechanics ideas across the fast/medium/slow tempo range, plus a primitives table for recombination.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 23:09:46 -04:00
commit 5a2a02e483
10 changed files with 4659 additions and 0 deletions
@@ -0,0 +1,431 @@
+#!/usr/bin/env python3
+"""Bakeoff: Gemma 4's atomic semantic-matching abilities.
+
+Three test types, all with hand-labeled ground truth:
+
+- JUDGE: given (category, 4 words), does Gemma correctly say yes/no on whether
+  the words tightly fit?
+- CREATE: given a category, does Gemma produce 4 words that tightly fit it?
+- CREATIVE_ACCEPT: given 4 words and a player-proposed category that may or
+  may not be the puzzle's intended one, does Gemma fairly judge validity?
+  This is the test of whether "fuzzy / creative-grouping acceptance" -- the
+  twist from IDEA.md that a static NYT format structurally can't do -- is
+  feasible.
+
+Models tested: gemma4:26b, gemma4:31b-it-q4_K_M, gemma4:latest (8B). 8B is
+included because judging runs per player guess in any live design; if 8B is
+reliable enough for JUDGE, the per-guess economics get a lot better.
+
+Settings (well-known Gemma-4-on-Ollama gotchas): think=false, num_ctx=4096,
+num_predict=512, no format=json (server-side JSON enforcer hangs on 26b Q4),
+JSON extracted client-side. Point OLLAMA_HOST at your instance; default
+localhost:11434.
+"""
+import json
+import os
+import sys
+import time
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+def _ollama_generate_url():
+    """Build the ``/api/generate`` URL from the OLLAMA_HOST environment variable.
+
+    urllib rejects URLs without a scheme, so a bare ``host:port`` value gets
+    ``http://`` prepended. An unset or blank variable falls back to the local
+    default, and any trailing slash is dropped before the path is appended.
+    """
+    host = os.environ.get("OLLAMA_HOST", "").strip() or "http://localhost:11434"
+    if "://" not in host:
+        host = f"http://{host}"
+    return f"{host.rstrip('/')}/api/generate"
+
+
+OLLAMA = _ollama_generate_url()
+MODELS = ["gemma4:latest", "gemma4:26b", "gemma4:31b-it-q4_K_M"]
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+TEMPERATURE = 0.2  # judging is a low-creativity task; we want consistency
+
+# ---------- prompts ----------
+
+# All three templates are filled with str.format() in render_prompt(), so the
+# literal braces of the JSON examples are doubled ({{ }}) to survive formatting.
+
+# JUDGE: strict yes/no on whether all four words fit a category.
+# Placeholders: {category}, {w1}, {w2}, {w3}, {w4}.
+# Requested reply keys: "verdict", "reason", "misfit_words".
+JUDGE_PROMPT = """You are evaluating whether four words tightly fit a given semantic category, in the style of NYT Connections.
+
+Category: {category}
+Words: {w1}, {w2}, {w3}, {w4}
+
+Do ALL FOUR words clearly fit this category? Be strict -- if even one word doesn't fit, the answer is "no". Generic loose connections do not count.
+
+Output strict JSON, no preamble or fences:
+{{"verdict": "yes" or "no", "reason": "<one short sentence>", "misfit_words": ["<any words that don't fit>"]}}
+"""
+
+# CREATE: ask the model to produce four words for a category.
+# Placeholder: {category}. Requested reply keys: "words", "reason".
+CREATE_PROMPT = """You are creating a tight 4-word group in the style of NYT Connections.
+
+Category: {category}
+
+Produce EXACTLY four words or short phrases that tightly fit this category. Each word must clearly belong; vague or loosely-related words are not acceptable.
+
+Output strict JSON, no preamble or fences:
+{{"words": ["W1", "W2", "W3", "W4"], "reason": "<one short sentence on how all four fit>"}}
+"""
+
+# CREATIVE_ACCEPT: judge a player-proposed category for four given words.
+# Placeholders: {w1}, {w2}, {w3}, {w4}, {player_category}.
+# Requested reply keys: "valid", "reason", "weak_words".
+CREATIVE_ACCEPT_PROMPT = """You are judging a Connections-style puzzle where the player has proposed their OWN category for four words. Their category may differ from the puzzle's intended one, but it might still be a valid alternative -- if all four words plausibly fit the player's category, accept it.
+
+Words: {w1}, {w2}, {w3}, {w4}
+Player's proposed category: "{player_category}"
+
+Do all four words plausibly fit the player's category? Be fair: a player-creative-but-valid grouping should be accepted. But if even one word genuinely doesn't fit, reject it.
+
+Output strict JSON, no preamble or fences:
+{{"valid": "yes" or "no", "reason": "<one short sentence>", "weak_words": ["<any words that don't really fit>"]}}
+"""
+
+# ---------- test bank ----------
+# Each case is a dict with "id" and "type" plus type-specific fields:
+#   JUDGE           -> "category", "words", "gt" ("yes"/"no"), optional "gt_misfit"
+#   CREATE          -> "category", "gt_check" (what a passing answer looks like)
+#   CREATIVE_ACCEPT -> "words", "player_category",
+#                      "gt" ("yes"/"no"/"borderline"), optional "gt_weak"/"gt_note"
+# "gt" is the hand-labeled answer a thoughtful human grader would give. Only
+# "yes"/"no" cases are auto-scored by score(); "borderline" and CREATE cases are
+# left for a human. "gt_misfit" and "gt_note" are echoed in the rendered report;
+# "gt_weak" is not read by any code visible in this file.
+
+CASES = [
+    # ---- JUDGE: clear yes (tight fit) ----
+    {"id": "judge-y1", "type": "JUDGE", "category": "Types of trees",
+     "words": ["OAK", "MAPLE", "BIRCH", "PINE"], "gt": "yes"},
+    {"id": "judge-y2", "type": "JUDGE", "category": "Greek letters",
+     "words": ["ALPHA", "BETA", "GAMMA", "DELTA"], "gt": "yes"},
+    {"id": "judge-y3", "type": "JUDGE", "category": "Days of the week",
+     "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"], "gt": "yes"},
+    {"id": "judge-y4", "type": "JUDGE", "category": "Synonyms for 'happy'",
+     "words": ["JOYFUL", "GLAD", "CHEERFUL", "ELATED"], "gt": "yes"},
+    {"id": "judge-y5", "type": "JUDGE", "category": "___ HOUSE (compound words)",
+     "words": ["GREEN", "ICE", "FIRE", "COURT"], "gt": "yes"},
+    {"id": "judge-y6", "type": "JUDGE", "category": "Words that follow COLD",
+     "words": ["SHOULDER", "FRONT", "SNAP", "TURKEY"], "gt": "yes"},
+    {"id": "judge-y7", "type": "JUDGE", "category": "Verbs meaning 'to move quickly'",
+     "words": ["DART", "BOLT", "RUSH", "FLY"], "gt": "yes"},
+    {"id": "judge-y8", "type": "JUDGE", "category": "Synonyms for 'idea'",
+     "words": ["NOTION", "CONCEPT", "THOUGHT", "INKLING"], "gt": "yes"},
+
+    # ---- JUDGE: clear no (one or more words don't fit) ----
+    {"id": "judge-n1", "type": "JUDGE", "category": "Types of trees",
+     "words": ["OAK", "MAPLE", "BIRCH", "CARROT"], "gt": "no",
+     "gt_misfit": ["CARROT"]},
+    {"id": "judge-n2", "type": "JUDGE", "category": "Greek letters",
+     "words": ["ALPHA", "BETA", "GAMMA", "CYRILLIC"], "gt": "no",
+     "gt_misfit": ["CYRILLIC"]},
+    {"id": "judge-n3", "type": "JUDGE", "category": "Synonyms for 'happy'",
+     "words": ["JOYFUL", "GLAD", "SAD", "ELATED"], "gt": "no",
+     "gt_misfit": ["SAD"]},
+    {"id": "judge-n4", "type": "JUDGE", "category": "Days of the week",
+     "words": ["MONDAY", "JANUARY", "SUNDAY", "WEDNESDAY"], "gt": "no",
+     "gt_misfit": ["JANUARY"]},
+    {"id": "judge-n5", "type": "JUDGE", "category": "Body parts",
+     "words": ["ARM", "LEG", "EYE", "NIGHT"], "gt": "no",
+     "gt_misfit": ["NIGHT"]},
+    {"id": "judge-n6", "type": "JUDGE", "category": "Types of birds",
+     "words": ["CRANE", "SWALLOW", "BAT", "MOSQUITO"], "gt": "no",
+     "gt_misfit": ["BAT", "MOSQUITO"]},
+    {"id": "judge-n7", "type": "JUDGE", "category": "Things that are red",
+     "words": ["APPLE", "BLUE", "ROSE", "GRASS"], "gt": "no",
+     "gt_misfit": ["BLUE", "GRASS"]},
+    {"id": "judge-n8", "type": "JUDGE", "category": "Words that follow COLD",
+     "words": ["SHOULDER", "FRONT", "PIZZA", "MOUNTAIN"], "gt": "no",
+     "gt_misfit": ["PIZZA", "MOUNTAIN"]},
+
+    # ---- CREATE: easy categories ----
+    {"id": "create-e1", "type": "CREATE", "category": "Types of trees",
+     "gt_check": "Four valid tree species; e.g. OAK, MAPLE, BIRCH, PINE."},
+    {"id": "create-e2", "type": "CREATE", "category": "Greek letters",
+     "gt_check": "Four genuine Greek letters."},
+    {"id": "create-e3", "type": "CREATE", "category": "Synonyms for 'angry'",
+     "gt_check": "Four words that all genuinely mean angry/furious."},
+    {"id": "create-e4", "type": "CREATE", "category": "Days of the week",
+     "gt_check": "Four of the seven weekday names, no months or other words."},
+
+    # ---- CREATE: medium (compound / polysemy) ----
+    {"id": "create-m1", "type": "CREATE", "category": "___ STORM (compound words ending in STORM)",
+     "gt_check": "Four words that each form a real compound or fixed phrase with STORM (e.g. SAND, BRAIN, THUNDER, SNOW)."},
+    {"id": "create-m2", "type": "CREATE", "category": "Synonyms for 'small'",
+     "gt_check": "Four words that all genuinely mean small."},
+    {"id": "create-m3", "type": "CREATE", "category": "Words that follow BLUE",
+     "gt_check": "Four words that each form a real compound with BLUE (e.g. BERRY, BIRD, PRINT, BELL, GRASS)."},
+    {"id": "create-m4", "type": "CREATE", "category": "Things that can be 'broken'",
+     "gt_check": "Four words that each form a real fixed phrase with 'broken' (heart, record, law, promise, etc.)."},
+
+    # ---- CREATE: hard (wordplay / tight constraint) ----
+    {"id": "create-h1", "type": "CREATE",
+     "category": "Words that are homophones of body parts but spelled differently (e.g. HARE = hair, MUSSEL = muscle)",
+     "gt_check": "Four words that each sound like a body part but are spelled differently. Valid examples: HARE (hair), MUSSEL (muscle), HEAL (heel), SOUL (sole), AYE/EYE-spelled-otherwise. EYE and HEEL alone do NOT count -- those are the body parts themselves, not homophones of them."},
+    {"id": "create-h2", "type": "CREATE",
+     "category": "Words that contain a body part as a substring (e.g. HEARTBEAT contains HEART)",
+     "gt_check": "Four words that each contain a body part anywhere inside them. Valid examples: HEARTH (HEART), CHESTNUT (CHEST), EARTH (EAR), HEADACHE (HEAD)."},
+
+    # ---- CREATIVE_ACCEPT: player's grouping is genuinely valid ----
+    {"id": "ca-y1", "type": "CREATIVE_ACCEPT",
+     "words": ["SCALE", "MOUNT", "ASCEND", "CLIMB"],
+     "player_category": "Verbs for going up", "gt": "yes"},
+    {"id": "ca-y2", "type": "CREATIVE_ACCEPT",
+     "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
+     "player_category": "Fruits", "gt": "yes"},
+    {"id": "ca-y3", "type": "CREATIVE_ACCEPT",
+     "words": ["WHIP", "NUT", "CODE", "SMILE"],
+     "player_category": "Things you can crack", "gt": "yes"},
+    {"id": "ca-y4", "type": "CREATIVE_ACCEPT",
+     "words": ["BAT", "BALL", "GLOVE", "MITT"],
+     "player_category": "Baseball equipment", "gt": "yes"},
+    {"id": "ca-y5", "type": "CREATIVE_ACCEPT",
+     "words": ["MARS", "VENUS", "MERCURY", "JUPITER"],
+     "player_category": "Roman gods", "gt": "yes"},
+
+    # ---- CREATIVE_ACCEPT: player's grouping is wrong ----
+    {"id": "ca-n1", "type": "CREATIVE_ACCEPT",
+     "words": ["OAK", "MAPLE", "BIRCH", "PINE"],
+     "player_category": "Furniture brands", "gt": "no"},
+    {"id": "ca-n2", "type": "CREATIVE_ACCEPT",
+     "words": ["ALPHA", "BETA", "GAMMA", "DELTA"],
+     "player_category": "Words meaning 'small'", "gt": "no"},
+    {"id": "ca-n3", "type": "CREATIVE_ACCEPT",
+     "words": ["BAT", "BALL", "GLOVE", "MITT"],
+     "player_category": "Things worn on your hand", "gt": "no",
+     "gt_weak": ["BAT", "BALL"]},
+    {"id": "ca-n4", "type": "CREATIVE_ACCEPT",
+     "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"],
+     "player_category": "Months of the year", "gt": "no"},
+    {"id": "ca-n5", "type": "CREATIVE_ACCEPT",
+     "words": ["WHIP", "NUT", "CODE", "SMILE"],
+     "player_category": "Things found in a kitchen", "gt": "no",
+     "gt_weak": ["CODE", "SMILE"]},
+
+    # ---- CREATIVE_ACCEPT: borderline (deliberately ambiguous) ----
+    {"id": "ca-b1", "type": "CREATIVE_ACCEPT",
+     "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
+     "player_category": "Tech/phone brands", "gt": "borderline",
+     "gt_note": "APPLE and BLACKBERRY clearly are tech brands; ORANGE is a EU/UK telecom carrier (defensible); KIWI is not a tech brand. Strict grader says no; lenient grader might accept ORANGE."},
+    {"id": "ca-b2", "type": "CREATIVE_ACCEPT",
+     "words": ["SHARP", "FLAT", "NATURAL", "KEY"],
+     "player_category": "Real estate listing terms", "gt": "borderline",
+     "gt_note": "FLAT (UK apartment) and KEY (keys) connect; SHARP (a sharp property) and NATURAL (natural light?) are weak. Strict no; creative yes."},
+]
+
+# ---------- runner ----------
+
+def call(model, prompt, temperature=TEMPERATURE, timeout=300):
+    """Send one non-streaming generation request to the Ollama endpoint.
+
+    Args:
+        model: Ollama model tag to run.
+        prompt: Full prompt text.
+        temperature: Sampling temperature placed in the request options.
+        timeout: Timeout in seconds passed to ``urlopen``.
+
+    Returns:
+        ``(elapsed_seconds, response_dict)`` where the elapsed time covers the
+        HTTP round trip plus decoding of the JSON body.
+
+    Raises:
+        Whatever ``urllib`` or ``json`` raise (connection or HTTP errors,
+        timeouts, malformed bodies); nothing is caught here, retries are the
+        caller's job.
+    """
+    payload = {
+        "model": model,
+        "prompt": prompt,
+        "stream": False,
+        "think": False,
+        "options": {"temperature": temperature, "num_ctx": 4096, "num_predict": 512},
+    }
+    req = urllib.request.Request(
+        OLLAMA, data=json.dumps(payload).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    # perf_counter is monotonic, so a wall-clock adjustment during a long
+    # generation cannot distort (or make negative) the reported latency.
+    t0 = time.perf_counter()
+    with urllib.request.urlopen(req, timeout=timeout) as r:
+        data = json.loads(r.read())
+    return time.perf_counter() - t0, data
+
+
+def extract_json(body):
+    """Return the first JSON object embedded in a model response.
+
+    The response may wrap the object in prose or code fences. Each ``{`` is
+    tried in turn as the start of an object and the first one that decodes
+    wins, so braces in surrounding chatter (before or after the object) no
+    longer break parsing the way a first-``{``-to-last-``}`` slice does.
+
+    Caveat: if the outer object is truncated, a complete nested object inside
+    it would be returned instead. The prompts in this file request flat
+    objects, so that case is not expected.
+
+    Raises:
+        ValueError: if the text has no braces or no position decodes to a
+            JSON object (the first decode error is chained as the cause).
+    """
+    if not body or "{" not in body or "}" not in body:
+        raise ValueError("no JSON braces in response")
+    decoder = json.JSONDecoder()
+    first_err = None
+    start = body.find("{")
+    while start != -1:
+        try:
+            obj, _end = decoder.raw_decode(body, start)
+        except json.JSONDecodeError as err:
+            if first_err is None:
+                first_err = err
+            start = body.find("{", start + 1)
+        else:
+            return obj
+    raise ValueError("no parseable JSON object in response") from first_err
+
+
+def render_prompt(case):
+    """Fill in the prompt template that matches ``case["type"]``.
+
+    JUDGE and CREATIVE_ACCEPT cases use the first four entries of
+    ``case["words"]``; CREATE cases only need ``case["category"]``.
+
+    Raises:
+        ValueError: for an unrecognised case type (the type is the message).
+    """
+    kind = case["type"]
+    if kind == "JUDGE":
+        fields = {"category": case["category"]}
+        fields.update({f"w{i + 1}": case["words"][i] for i in range(4)})
+        return JUDGE_PROMPT.format(**fields)
+    if kind == "CREATE":
+        return CREATE_PROMPT.format(category=case["category"])
+    if kind != "CREATIVE_ACCEPT":
+        raise ValueError(kind)
+    fields = {f"w{i + 1}": case["words"][i] for i in range(4)}
+    fields["player_category"] = case["player_category"]
+    return CREATIVE_ACCEPT_PROMPT.format(**fields)
+
+
+def warm(model):
+    """Log a ``[warm]`` line to stderr and send one trivial request to *model*.
+
+    The reply is discarded. Presumably this keeps model start-up cost out of
+    the first timed case -- confirm against the Ollama deployment.
+    """
+    sys.stderr.write(f"[warm] {model}\n")
+    sys.stderr.flush()
+    ping = "Reply with the word OK only."
+    call(model, ping, temperature=0.1, timeout=300)
+
+
+def run_model(model, cases):
+    """Run every case against *model*, retrying up to three times per case.
+
+    Each retry raises the temperature by 0.1. A retry happens when the request
+    itself fails or when no JSON can be extracted from the reply. The returned
+    list holds one record per case with timing, the parsed reply (or the
+    truncated raw text when parsing never succeeded), the last error, and the
+    case itself.
+    """
+    records = []
+    for case in cases:
+        prompt = render_prompt(case)
+        answer = None
+        failure = None
+        elapsed, n_eval, raw_text = 0.0, 0, ""
+        for attempt_no in (1, 2, 3):
+            temp = TEMPERATURE + (attempt_no - 1) * 0.1
+            print(f"[{model}] {case['id']} attempt {attempt_no} (temp={temp:.1f})",
+                  file=sys.stderr, flush=True)
+            try:
+                dt, data = call(model, prompt, temperature=temp)
+            except Exception as exc:
+                failure = repr(exc)
+                continue
+            # Timing and raw text always describe the latest completed request.
+            elapsed = dt
+            n_eval = data.get("eval_count", 0)
+            raw_text = data.get("response", "") or ""
+            try:
+                answer = extract_json(raw_text)
+            except Exception as exc:
+                failure = repr(exc)
+            else:
+                failure = None
+                break
+        parsed_ok = answer is not None
+        records.append({
+            "case_id": case["id"],
+            "type": case["type"],
+            "model": model,
+            "dt": elapsed,
+            "eval_count": n_eval,
+            "ok": parsed_ok,
+            "parsed": answer,
+            "raw": None if parsed_ok else raw_text[:1500],
+            "error": failure,
+            "case": case,
+        })
+    return records
+
+
+def _normalize_verdict(value):
+    """Reduce a model-supplied verdict to a lowercase token such as "yes"/"no".
+
+    JSON booleans map to "yes"/"no", other non-strings are stringified, and
+    surrounding whitespace, quotes and sentence punctuation are dropped so
+    that replies like ``"Yes."`` still compare equal to the ground truth.
+    """
+    if isinstance(value, bool):
+        return "yes" if value else "no"
+    if value is None:
+        return ""
+    return str(value).strip().strip("\"'.!").strip().lower()
+
+
+def score(results):
+    """Auto-score results in place against ground truth where possible.
+
+    Sets ``r["score"]`` on each result:
+      - "PARSE_FAIL" when no JSON was parsed,
+      - "PASS"/"FAIL" for JUDGE and for yes/no CREATIVE_ACCEPT cases,
+      - "BORDERLINE" for CREATIVE_ACCEPT cases labeled borderline (human-graded),
+      - "MANUAL" for CREATE cases (human-graded against ``gt_check``).
+    Results of any other case type are left without a "score" key.
+
+    Returns the same list that was passed in.
+    """
+    for r in results:
+        c = r["case"]
+        if not r["ok"]:
+            r["score"] = "PARSE_FAIL"
+            continue
+        p = r["parsed"]
+        if c["type"] == "JUDGE":
+            v = _normalize_verdict(p.get("verdict"))
+            r["score"] = "PASS" if v == c["gt"] else "FAIL"
+        elif c["type"] == "CREATIVE_ACCEPT":
+            if c["gt"] == "borderline":
+                r["score"] = "BORDERLINE"  # human grades these
+            else:
+                v = _normalize_verdict(p.get("valid"))
+                r["score"] = "PASS" if v == c["gt"] else "FAIL"
+        elif c["type"] == "CREATE":
+            r["score"] = "MANUAL"  # human grades these against gt_check
+    return results
+
+
+def _md_cell(value, limit):
+    """Return *value* as a one-line, pipe-escaped table cell of at most *limit* chars."""
+    one_line = " ".join(str(value).splitlines())
+    return one_line.replace("|", "\\|")[:limit]
+
+
+def _str_items(value):
+    """Coerce a model-supplied list field to a list of strings (empty if falsy)."""
+    if not value:
+        return []
+    if isinstance(value, (list, tuple)):
+        return [str(item) for item in value]
+    return [str(value)]
+
+
+def _setup_lines():
+    """Build the "## Setup" section describing endpoint, models and case counts."""
+    n_by_type = {
+        t: sum(1 for c in CASES if c["type"] == t)
+        for t in ("JUDGE", "CREATE", "CREATIVE_ACCEPT")
+    }
+    return [
+        "## Setup",
+        # NOTE(review): the host label below is hard-coded, not derived from OLLAMA_HOST.
+        f"- Host: steel141 (RTX 3090 Ti) `{OLLAMA}`",
+        f"- Models: {', '.join('`' + m + '`' for m in MODELS)}",
+        f"- Temperature {TEMPERATURE} (raised +0.1 per retry on JSON parse fail, max 3 attempts)",
+        "- think=false, num_ctx=4096, num_predict=512, no format=json (per gemma4-research/GOTCHAS.md)",
+        f"- {len(CASES)} test cases: "
+        f"{n_by_type['JUDGE']} JUDGE, "
+        f"{n_by_type['CREATE']} CREATE, "
+        f"{n_by_type['CREATIVE_ACCEPT']} CREATIVE_ACCEPT",
+        "- Ground truth hand-labeled inline in `scripts/gemma-semantic-bakeoff.py`",
+        "",
+    ]
+
+
+def _summary_lines(by_model):
+    """Build the per-model pass-rate table; borderline cases are excluded from the denominator."""
+    lines = [
+        "## Auto-scored summary", "",
+        "| Model | JUDGE pass | CREATIVE_ACCEPT pass | parse fails | avg s |",
+        "|---|---|---|---|---|",
+    ]
+    for m in MODELS:
+        rs = by_model.get(m, [])
+        if not rs:
+            lines.append(f"| `{m}` | - | - | - | - |")
+            continue
+        judge = [r for r in rs if r["case"]["type"] == "JUDGE"]
+        creative = [r for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT"]
+        j_pass = sum(1 for r in judge if r.get("score") == "PASS")
+        c_pass = sum(1 for r in creative if r.get("score") == "PASS")
+        c_n = sum(1 for r in creative if r["case"].get("gt") != "borderline")
+        parse_fail = sum(1 for r in rs if not r["ok"])
+        avg_dt = sum(r["dt"] for r in rs) / len(rs)
+        lines.append(f"| `{m}` | {j_pass}/{len(judge)} | {c_pass}/{c_n} | {parse_fail} | {avg_dt:.1f} |")
+    lines.append("")
+    return lines
+
+
+def _case_header_lines(case):
+    """Build the heading and ground-truth bullets for one test case."""
+    lines = [f"### {case['id']}", ""]
+    if case["type"] == "JUDGE":
+        truth = f"- Ground truth: **{case['gt']}**"
+        if case.get("gt_misfit"):
+            truth += f" (misfit: {', '.join(case.get('gt_misfit', []))})"
+        lines += [
+            f"- Category: `{case['category']}`",
+            f"- Words: {', '.join('`' + w + '`' for w in case['words'])}",
+            truth,
+        ]
+    elif case["type"] == "CREATE":
+        lines += [
+            f"- Category: `{case['category']}`",
+            f"- Quality bar: {case['gt_check']}",
+        ]
+    else:  # CREATIVE_ACCEPT
+        truth = f"- Ground truth: **{case['gt']}**"
+        if case.get("gt_note"):
+            truth += f" -- {case.get('gt_note', '')}"
+        lines += [
+            f"- Words: {', '.join('`' + w + '`' for w in case['words'])}",
+            f"- Player's category: `\"{case['player_category']}\"`",
+            truth,
+        ]
+    lines.append("")
+    return lines
+
+
+def _result_row(model, tname, r):
+    """Build one table row for *model* on a case of type *tname* (``r`` may be None)."""
+    if r is None:
+        return f"| `{model}` | - | - | - | - |"
+    if not r["ok"]:
+        return f"| `{model}` | _parse fail_ | `{(r.get('error') or '')[:60]}` | PARSE_FAIL | {r['dt']:.1f}s |"
+    p = r["parsed"]
+    extra = ""
+    if tname == "JUDGE":
+        verdict = p.get("verdict", "?")
+        flagged = _str_items(p.get("misfit_words"))
+        if flagged:
+            extra = f" (misfit: {', '.join(flagged)})"
+    elif tname == "CREATE":
+        verdict = ", ".join(_str_items(p.get("words")))[:80]
+    else:
+        verdict = p.get("valid", "?")
+        flagged = _str_items(p.get("weak_words"))
+        if flagged:
+            extra = f" (weak: {', '.join(flagged)})"
+    # The model controls these fields, so coerce before concatenating: a null
+    # or non-string "reason" must not abort the report after a long run.
+    reason = str(p.get("reason") or "")
+    return (
+        f"| `{model}` | {_md_cell(verdict, 80)} | {_md_cell(reason + extra, 120)} "
+        f"| {r.get('score', '?')} | {r['dt']:.1f}s |"
+    )
+
+
+def render(results):
+    """Render scored results as a Markdown report and return it as one string.
+
+    The report has a setup section, a per-model summary table, and then one
+    subsection per case (grouped by case type) with a row per model. Models
+    and cases are taken from the module-level MODELS and CASES; a model with
+    no result for a case gets a row of dashes.
+
+    Values taken from model replies are coerced to strings, flattened to one
+    line and pipe-escaped, so malformed replies cannot raise or break a table.
+    """
+    by_model = {}
+    first_hit = {}  # (model, case_id) -> first matching result
+    for r in results:
+        by_model.setdefault(r["model"], []).append(r)
+        first_hit.setdefault((r["model"], r["case_id"]), r)
+
+    lines = [f"# Gemma 4 Semantic Bakeoff -- {datetime.now().strftime('%Y-%m-%d %H:%M')}", ""]
+    lines += _setup_lines()
+    lines += _summary_lines(by_model)
+
+    for tname in ("JUDGE", "CREATE", "CREATIVE_ACCEPT"):
+        lines += [f"## {tname}", ""]
+        for case in (c for c in CASES if c["type"] == tname):
+            lines += _case_header_lines(case)
+            lines += ["| Model | Verdict | Reason | Score | dt |", "|---|---|---|---|---|"]
+            lines += [_result_row(m, tname, first_hit.get((m, case["id"]))) for m in MODELS]
+            lines.append("")
+
+    return "\n".join(lines)
+
+
+def main():
+    """Run the bakeoff for every model, score it, and write both reports.
+
+    Writes a timestamped raw JSON dump and a Markdown report under
+    ``PROJECT_ROOT/docs/reference``. Progress and output paths go to stderr;
+    only the Markdown path is printed to stdout.
+    """
+    out_dir = PROJECT_ROOT / "docs" / "reference"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
+    raw_path = out_dir / f"gemma-semantic-bakeoff-{stamp}-raw.json"
+    md_path = out_dir / f"gemma-semantic-bakeoff-{stamp}.md"
+
+    all_results = []
+    for m in MODELS:
+        warm(m)
+        all_results.extend(run_model(m, CASES))
+
+    score(all_results)
+
+    # Drop the embedded case dict from the raw dump; "case_id" already
+    # identifies it and the full definition lives in CASES.
+    raw = [{k: v for k, v in r.items() if k != "case"} for r in all_results]
+    # Explicit UTF-8: model text in the report may be non-ASCII, and the
+    # platform default encoding is not guaranteed to handle it.
+    raw_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
+    print(f"raw  -> {raw_path}", file=sys.stderr)
+
+    md_path.write_text(render(all_results), encoding="utf-8")
+    print(f"md   -> {md_path}", file=sys.stderr)
+    print(md_path)
+
+
+if __name__ == "__main__":
+    main()