From 5a2a02e48375dcacb41d1a10385929d626195868 Mon Sep 17 00:00:00 2001 From: Mortdecai Date: Mon, 27 Apr 2026 23:09:46 -0400 Subject: [PATCH] docs: bootstrap repo with bakeoff results and game-mechanics idea bank MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This repo opens with the design-discovery work completed before any product code is written. Two model bakeoffs against gemma4:8b/26b/31b on a local Ollama established that: - Whole-puzzle generation in the Connections shape is unreliable on Gemma 4 (gemma4:31b ~50% structural-pass, gemma4:26b ~20-30%); 31b is intentionally out of project scope, so the generation route is harder still. - Atomic semantic-judging skills are reliable: 87.5%/93.75%/100% (8B/26b/31b) on JUDGE; *all three models* scored 10/10 on CREATIVE_ACCEPT — fair judging of player-INVENTED categories. That is the structural unlock vs static hand-curated word games. The README contains the full writeup, the test bench, and a brainstormed bank of 10 distinct game-mechanics ideas across the fast/medium/slow tempo range, plus a primitives table for recombination. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 19 + DECISIONS.md | 30 + IDEA.md | 49 + README.md | 235 ++ ...eration-bakeoff-2026-04-27-221751-raw.json | 790 +++++++ ...ma-generation-bakeoff-2026-04-27-221751.md | 278 +++ ...emantic-bakeoff-2026-04-27-224800-raw.json | 2077 +++++++++++++++++ ...emma-semantic-bakeoff-2026-04-27-224800.md | 514 ++++ scripts/gemma-generation-bakeoff.py | 236 ++ scripts/gemma-semantic-bakeoff.py | 431 ++++ 10 files changed, 4659 insertions(+) create mode 100644 .gitignore create mode 100644 DECISIONS.md create mode 100644 IDEA.md create mode 100644 README.md create mode 100644 docs/reference/gemma-generation-bakeoff-2026-04-27-221751-raw.json create mode 100644 docs/reference/gemma-generation-bakeoff-2026-04-27-221751.md create mode 100644 docs/reference/gemma-semantic-bakeoff-2026-04-27-224800-raw.json create mode 100644 docs/reference/gemma-semantic-bakeoff-2026-04-27-224800.md create mode 100644 scripts/gemma-generation-bakeoff.py create mode 100644 scripts/gemma-semantic-bakeoff.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3d79370 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +# Local backups (created by editing pipeline; not for sharing) +.backup/ + +# Session handoff documents — heavily homelab-internal, replaced by README +.claude/ + +# Python +__pycache__/ +*.pyc +*.pyo + +# Editor / OS +.DS_Store +*.swp +*~ + +# Local environment +.env +.env.* diff --git a/DECISIONS.md b/DECISIONS.md new file mode 100644 index 0000000..10f5731 --- /dev/null +++ b/DECISIONS.md @@ -0,0 +1,30 @@ +# DECISIONS.md — seth_semantic_game Decision Log + +Project-specific decisions. For global/cross-cutting decisions, see `~/bin/DECISIONS.md`. 
+ +Format: `YYYY-MM-DD: ` + +## Architecture + +- **2026-04-27: The Gemma-enabled twist is real-time CREATIVE_ACCEPT — fair judging of player-invented categories** — Semantic bakeoff (`docs/reference/gemma-semantic-bakeoff-2026-04-27-224800.md`) showed all three Gemma 4 variants (8B, 26b, 31b) achieve 10/10 on player-creative-but-valid grouping judgments. This is the IDEA.md unlock: a derivative game that *accepts the player's own valid groupings* in real time, which the static NYT format structurally cannot do. Likely product framing: "Connections, but you can group however you can defend." +- **2026-04-27: Live judging on gemma4:latest (8B) at 0.7s/call** — 8B JUDGE accuracy is 87.5% strict, CREATIVE_ACCEPT 100%, output sub-second. Per-guess economics are effectively free. (Originally this entry called for 31b on once-per-day generation; that was superseded when 31b was removed from scope — see below.) +- **2026-04-27: 26b is NOT the live judge despite being only marginally slower than 8B** — 26b showed an "agree with the user" bias on the borderline tech-brand case (accepted KIWI as a tech brand). For CREATIVE_ACCEPT specifically, false-positives are worse than false-negatives — accepting bad groupings degrades game integrity, while rejecting valid ones is just frustrating. 8B's stricter calibration is the right tradeoff. +- **2026-04-27: Generation must go through a guarded pipeline, not a single Gemma call** — Prior bakeoff (`docs/reference/gemma-generation-bakeoff-2026-04-27-221751.md`) showed gemma4:31b passes ~40-50% structurally clean and gemma4:26b ~20-30%; both produce duplicate-tile and broken-category failures unaided. Acceptable design shape: 31b generate → deterministic filter (16 distinct tiles, no dup words, all claimed-trap words present) → category-similarity check → critique pass (8B or 26b — much cheaper than 31b critique) → cache the day's accepted puzzle. 
+- **2026-04-27: gemma4:31b is OUT OF SCOPE — only 8B and 26b are in the model lineup** — User constraint: 31b's quality edge does not justify keeping it as a project dependency; 8B and 26b are good enough. **Implication for generation**: 26b's ~20-30% structural-pass rate becomes the working number. Generation pipeline must do more work to compensate — either stricter automated filters, more retry attempts, OR shift the design center toward player-driven generation (game ideas where the *player* supplies words/categories and Gemma judges) rather than AI-driven generation. The latter is favored because Gemma's per-call JUDGE / CREATIVE_ACCEPT is reliable on both 8B and 26b (87.5% / 93.75%) — that's the strong axis to lean on. +- **2026-04-27: Live judging on gemma4:latest (8B), generation candidate gemma4:26b** — 8B JUDGE 14/16, CREATIVE_ACCEPT 10/10, 0.7s. 26b is the heavier model when accuracy matters more (e.g. offline puzzle gen + critique). Model use by role: live JUDGE → 8B; live CREATIVE_ACCEPT → 8B; offline generation → 26b with retries; offline critique → 26b grading 8B's output (or vice-versa) so the same model isn't rubber-stamping itself. + +## Implementation + +- **2026-04-27: Single-turn JSON pipeline payload settings (canonical for this project)** — `think: false`, `num_ctx: 8192`, `num_predict: 4096`, NO `format: "json"`, parse JSON client-side via `body[body.find('{'):body.rfind('}')+1]`, retry up to 3× with temperature bumped +0.1 each attempt. All four are mandatory per `~/bin/gemma4-research/GOTCHAS.md` for gemma4:26b/31b on Ollama 0.20.x; format=json hangs the model, default num_predict=128 truncates output, default num_ctx=2048 truncates the prompt, and unset `think` consumes the response budget on thinking tokens. +- **2026-04-27: Inference host = local 3090 Ti (24 GB)** — delivers ~94 tok/s on gemma4:26b and ~24 tok/s on gemma4:31b; sub-second per-call latency on the short JUDGE / CREATIVE_ACCEPT prompts. 
+ +## Deferred / Rejected + + +- **2026-04-27 — REJECTED: Gemma self-grading puzzles** — In the bakeoff, Gemma's own "intended_traps" claims didn't always hold up (e.g., #3-26b claimed `PRESS` traps the "Words after BLOOD" group, but blood-press isn't a phrase). If we route the critique pass back through the same model, it will rubber-stamp the same kinds of errors it generates. Use a different judge: a non-Gemma model on the same host (any reasonably-capable open-weights model), or two different Gemma sizes against each other. +- **2026-04-27 — DEFERRED: Connections-vs-Gemma blind anchor** — The plan called for mixing one real NYT puzzle into the grading set. Skipped because Gemma's structural failures (duplicate tiles, broken categories) are obvious curator-rejections — the within-Gemma evidence was decisive on its own. Revisit before locking the design: eyeball one filter-passed Gemma puzzle next to a real NYT puzzle and confirm equivalence. +- **2026-04-27 — DEFERRED: Diversity-over-time test** — All 10 bakeoff puzzles were unseeded. With 31b alone, two of five were scale-themed; risk of long-term repetition. Need a seeded run (e.g., 30 puzzles with date-rotated theme prompts) before committing to a year-round daily-puzzle product. +- **2026-04-27 — DEFERRED: Critique-pass effectiveness test** — The architecture above assumes a second-model critique pass catches the broken categories. Not yet verified. Next experiment: feed the failed bakeoff puzzles into a critique prompt and check whether the model flags the actual structural issues. +- **2026-04-27 — DEFERRED: Adversarial-player robustness on CREATIVE_ACCEPT** — Test cases were honest player categories. Real players will try to game the judge with categories like "Words containing a vowel" (trivially-true for most English words) or "Words that are 4-7 letters long" (true by construction in many cases). Need a category-tightness pre-check on player input — e.g. 
require category to fail for at least one word on the board, or require category specificity above a threshold — before submitting it to Gemma for word-fit judging. +- **2026-04-27 — DEFERRED: Cultural/contextual category robustness** — Tested categories were lexical/factual ("Roman gods", "fruits", "things you can crack"). Cultural references ("Words in Beatles songs", "Common Texan slang") and time-bound categories may break the judge. Test before designing any feature that depends on them. +- **2026-04-27 — KNOWN LIMIT: Hard wordplay categories ("homophones of body parts") fail on all three Gemma 4 variants** — This is a structural model limit, not a configuration issue. If this category class is desired in puzzles, scaffold with worked examples in the prompt or human-curate the seed list; do not rely on unaided generation for it. diff --git a/IDEA.md b/IDEA.md new file mode 100644 index 0000000..5518323 --- /dev/null +++ b/IDEA.md @@ -0,0 +1,49 @@ +# IDEA.md — seth_semantic_game + +## What is this? + +A daily word game **based on NYT Connections**, powered by a locally-hosted Gemma 4 +model. Connections gives the player 16 words that have to be sorted into 4 hidden +groups of 4 by shared semantic category. The twist for this project — what makes it +worth building rather than just playing the original — is whatever Gemma 4 enables +that NYT's hand-curated static format cannot. + +That twist is **not yet decided**. That's what brainstorming is for. + +The base mechanic is fixed: +- Connections-style grouping puzzle (semantic categories, not letters) +- Gemma 4 in the loop somewhere (puzzle generation, judging, hint system, or all of + the above) +- Daily-puzzle structure with social-shareable result (the Connections / Wordle + ritual — borrowed *only* for its sharing pattern, not its gameplay) + +This is **not** Wordle-derived. The original draft of this file framed it as +"Wordle-style"; that was wrong. The mechanic is grouping, not letter-guessing. 
+ +## Problem it solves + +Mostly fun and a real use of the local Gemma 4 stack. NYT Connections is hand-curated +and ships one puzzle per day; a generative version could ship infinite puzzles, accept +fuzzy or creative groupings, generate themed/seeded puzzles, or do other things the +hand-built version structurally cannot. Secondary: a daily-puzzle hook for sethpc.xyz +alongside other homelab games. + +## Constraints / preferences + +- Self-hosted: Ollama with Gemma 4 on commodity GPU (a single 24 GB card is enough) +- Web frontend, dark theme with orange accents +- If a puzzle is generative, output must be **deterministic per day** (every player + on a given date gets the same puzzle). Likely a date-seeded prompt with cached + output rather than a fresh generation per request. +- Per-guess judging cost should be cheap — at most one Gemma call per submission, and + ideally answers are precomputed when the daily puzzle is generated, so judging + becomes a cheap lookup. +- No login required for casual play (cookies/localStorage for streak) + +> NOTE on history: this brief was originally a "Wordle-style" framing. That was +> wrong — the seed game is NYT Connections (16 words → 4 hidden groups of 4). +> But after the model bakeoffs (see README), the *direction* shifted again: +> rather than cloning Connections, the project pivots toward gameplay that +> uses Gemma's per-call CREATIVE_ACCEPT ability to fairly judge +> player-INVENTED categories — a thing static curated games structurally can't +> do. The brainstormed game ideas in the README are what came out of that. diff --git a/README.md b/README.md new file mode 100644 index 0000000..0fa6084 --- /dev/null +++ b/README.md @@ -0,0 +1,235 @@ +# seth_semantic_game + +**Working title.** A self-hosted word game built around an LLM's ability to fairly judge *player-invented* semantic categories in real time — something static, hand-curated word games structurally cannot do. 
+ +This repo documents the design discovery process, including two model bakeoffs that picked the architecture and a brainstormed bank of game-mechanics ideas that the actual product will draw from. + +--- + +## TL;DR + +- **Seed idea:** clone NYT Connections (16 words → 4 hidden groups of 4) with a local LLM doing the curation. +- **Seed idea died fast:** unaided whole-puzzle generation on Gemma 4 ships broken puzzles ~50% of the time (duplicate tiles, mislabeled categories, fake wordplay) — see [docs/reference/gemma-generation-bakeoff-2026-04-27-221751.md](docs/reference/gemma-generation-bakeoff-2026-04-27-221751.md). +- **The actual unlock:** Gemma 4 reliably judges whether a player-supplied category fits a player-supplied set of words. Across 35 hand-labeled cases on three model sizes, **CREATIVE_ACCEPT scored 10/10 on every model** including the 8B variant at 0.7s per call. JUDGE landed at 87.5% / 93.75% / 100% (8B / 26b / 31b). See [docs/reference/gemma-semantic-bakeoff-2026-04-27-224800.md](docs/reference/gemma-semantic-bakeoff-2026-04-27-224800.md). +- **The pivot:** stop trying to generate Connections. Build games where the *player* invents the groupings and the LLM is the live, fair judge. That's what the static format can't do. +- **Models in scope:** `gemma4:latest` (8B) for live judging, `gemma4:26b` for offline puzzle prep / critique. `gemma4:31b` was tested and is more accurate, but is intentionally out of scope for this project. + +--- + +## What we did + +Two experiments, both reproducible from `scripts/` against a local Ollama (point `OLLAMA_HOST` at your instance; defaults to `http://localhost:11434`). + +### Experiment 1 — Generation bakeoff + +**Question:** can Gemma 4 generate a Connections-quality 16-word / 4-group puzzle in one shot? + +**Setup:** 5 puzzles per model on gemma4:26b and gemma4:31b. Strict JSON schema requesting groups + difficulty bands + claimed overlap-trap words. 
No format=json (that's a known Gemma 4 + Ollama hang); JSON parsed client-side; up to 3 retries with temperature bumped +0.1 each attempt. + +**Results:** + +| Model | Pass | Borderline | Fail | Avg s/puzzle | +|---|---|---|---|---| +| `gemma4:26b` | 1 | 1 + 1 partial | 2 | 5.2 | +| `gemma4:31b-it-q4_K_M` | 2 | 2 | 1 | 18.2 | + +Failure modes ranked by severity: + +1. **Structural violations** — duplicate or near-duplicate words on the 16-tile board. *Trivially detectable.* +2. **Broken category logic** — words listed in a category they don't actually fit (`DELUXE` doesn't start with the full Greek letter "DELTA"; `LIBRA` isn't a "type of scale"). *Hard to detect deterministically — needs a critique pass.* +3. **Redundant categories** — two groups themed on the same concept. Detectable. +4. **Self-graded traps don't always hold up** — Gemma's claimed `intended_traps` were sometimes nonsense (`PRESS` claimed to fit "Words after BLOOD," but the compound is *blood pressure*, not *blood press*). **Important consequence: the same model cannot be trusted to grade its own output.** + +This was decisive for the project direction: unaided generation isn't viable; AND we're explicitly capping at 26b, which is the *less* reliable generator. So we need a different game shape — one that doesn't depend on the LLM generating finished puzzles unaided. + +### Experiment 2 — Semantic-skill bakeoff + +**Question:** instead of whole-puzzle generation, can Gemma reliably perform the atomic skills a live game would need? Specifically: + +- **JUDGE** — given a category and 4 words, does Gemma correctly say yes/no on whether they all fit? +- **CREATE** — given a category, does Gemma produce 4 tightly-fitting words? +- **CREATIVE_ACCEPT** — given 4 words and a *player-proposed* category, does Gemma fairly judge whether the category validates the grouping (even if it differs from any "intended" category)? + +The third one is the design-relevant one. 
If it works, the game can let players invent their own groupings — which is structurally impossible for a hand-curated static format. + +**Setup:** 35 hand-labeled cases (16 JUDGE / 10 CREATE / 9 CREATIVE_ACCEPT + 2 deliberately ambiguous) tested across `gemma4:latest` (8B), `gemma4:26b`, and `gemma4:31b`. Each case has explicit ground truth in the test bank. + +**Results:** + +| Model | JUDGE | CREATE | CREATIVE_ACCEPT | Avg s/call | +|---|---|---|---|---| +| `gemma4:latest` (8B) | 14/16 (87.5%) | 8/10 | **10/10** | 0.7 | +| `gemma4:26b` | 15/16 (93.75%) | 9/10 | **10/10** | 0.8 | +| `gemma4:31b-it-q4_K_M` | 16/16 | 9/10 | **10/10** | 2.3 | + +**Key findings:** + +- **CREATIVE_ACCEPT is decisive across all three models.** Every model accepted all five player-creative-but-valid groupings (e.g. `WHIP / NUT / CODE / SMILE → "Things you can crack"`) AND rejected all five invalid ones (e.g. `OAK / MAPLE / BIRCH / PINE → "Furniture brands"`), for 10/10 overall. The model gets the distinction. +- **8B is fast enough to use as a live judge.** Sub-second on a 24 GB consumer GPU; per-guess economics are effectively free. +- **26b is mildly over-permissive on borderline cases.** It accepted KIWI as a tech brand (`APPLE / ORANGE / KIWI / BLACKBERRY → "Tech/phone brands"`). 8B and 31b were stricter. For a live game, false-positives degrade integrity more than false-negatives — so 8B's calibration is the right tradeoff for live judging. +- **One failure mode is shared by all three models:** "homophones-of-body-parts" (8B gave SEA/SEE/HEAR/HERE — none of which sound like body parts; 26b gave EYE which IS a body part rather than a homophone of one; 31b parse-failed three times running). Avoid this category class or scaffold prompts with worked examples. 
+ +--- + +## What we picked + +**Model assignments:** + +| Role | Model | Why | +|---|---|---| +| Live JUDGE (per player guess) | `gemma4:latest` (8B) | Sub-second, strict-enough calibration, 87.5% accuracy on tight cases | +| Live CREATIVE_ACCEPT | `gemma4:latest` (8B) | 10/10 in test, sub-second | +| Offline puzzle generation (if used at all) | `gemma4:26b` with strict filters + retries | 31b is out of scope by user constraint; 26b plus a deterministic post-filter and a critique pass is the workable path | +| Offline critique pass | `gemma4:26b` grading 8B's work, OR a non-Gemma open-weights judge | A model cannot be trusted to grade itself — the bakeoff confirmed Gemma rubber-stamps its own structural mistakes | + +**Operational gotchas baked into the scripts** (all from upstream Gemma 4 + Ollama issue tracker; documented in the bakeoff scripts): + +- No `format: "json"` — server-side JSON enforcer hangs gemma4:26b Q4 indefinitely; ask for JSON in the prompt and parse client-side. +- `think: false` for single-turn JSON pipelines — otherwise thinking tokens consume the response budget and `response` comes back empty. +- Override Ollama defaults: `num_ctx` (default 2048 truncates the prompt), `num_predict` (default 128 truncates the output). +- For multi-turn tool-calling agents the rule is the opposite: leave `think` unset on 26b. Not relevant here, but worth knowing. + +--- + +## Game-mechanics idea bank + +The two bakeoffs together say: **don't build a game where the LLM is the curator. Build a game where the LLM is the live, fair judge of player creativity.** Below are 10 distinct game ideas that take that as the design constraint. None of them is Connections; each one leans on something a static game structurally can't replicate (live category validation, multi-solution puzzles, generative answer pools, semantic chains, etc.). 
+ +Each idea lists its **tempo** (how fast the game feels), the **AI calls per turn** (so cost can be reasoned about), and the **structural novelty** (the thing this idea can do that a hand-curated static format cannot). + +### Fast-paced (≤60-second rounds) + +#### 1. **Pile** — speedrun categorize +- **Tempo:** real-time, 60-second rounds. +- **Mechanic:** A pool of ~16 random words. You drag any 3–5 of them into a box and type a category. The LLM (8B) judges in ~0.7s. Accepted → those words clear, refilled from a deck. Rejected → they stay. Score = words categorized per minute. +- **AI calls:** 1 per submission (CREATIVE_ACCEPT shape: player-supplied category + player-supplied words). +- **Structural novelty:** the player invents groupings under time pressure; categories aren't pre-known. A static game has a single fixed answer per puzzle; this one has open-ended valid answers as long as the LLM can confirm tightness. + +#### 2. **Bridge** — single-word polysemy speedrun +- **Tempo:** real-time, ~10 sec per move. +- **Mechanic:** Two category cards on screen ("Words for sharp pain" and "Things that bite"). Type one word the LLM accepts as fitting BOTH (e.g. `STING`). Move on. Faster = more points. +- **AI calls:** 2 JUDGE calls per submission (one per category, on the player's word). +- **Structural novelty:** the polysemy/multi-meaning skill — a known Connections difficulty axis — turned into the *primary* gameplay loop. Static games can plant such words but can't let the player invent them on demand. + +#### 3. **Threaded** — semantic word chains +- **Tempo:** real-time / continuous. +- **Mechanic:** Words drift across a conveyor belt. You build a chain by linking consecutive words with a category the LLM accepts ("APPLE → ORANGE: both fruits" → "ORANGE → RED: both colors" → "RED → ANGRY: red with anger"). Chain length = score. One chain per game. +- **AI calls:** 1 JUDGE per link, on the player's pair-and-category. 
+- **Structural novelty:** emergent semantic graphs from arbitrary word streams. The category set isn't pre-built — it's whatever the player can find. A static game can't be open-ended on the connection vocabulary. + +### Medium-paced (5–15 minute sessions) + +#### 4. **Stretch** — push a category to its limit +- **Tempo:** medium, 5-min sessions. +- **Mechanic:** The game opens with a tight seed category and 4 starting words ("Types of trees: OAK, MAPLE, BIRCH, PINE"). Add a 5th word — does it still fit? LLM judges. Yes → add a 6th. Each accepted word = +1 point. First rejection ends the round. Some categories support more stretch than others (broader = more elastic). +- **AI calls:** 1 JUDGE per word added. +- **Structural novelty:** category *elasticity* as a gameplay dimension. There's no pre-set answer length. The player learns intuitions about which categories admit how much stretching — a meta-skill no static game develops. + +#### 5. **Inverse** — multi-solution sort +- **Tempo:** medium, ~10 min per puzzle. +- **Mechanic:** 16 words on a board with NO predetermined grouping. The player sorts them into ANY 4 groups of 4 with ANY categories of their choice. The LLM judges all 4 categories. All 4 valid → win. Bonus for tightness (LLM rates each category 1–5). +- **AI calls:** 4 CREATIVE_ACCEPT per submission, plus optional 4 tightness-score calls. +- **Structural novelty:** Connections has *one* valid answer; this version has thousands. Players compete on creativity and tightness, not on guessing the curator's mind. + +#### 6. **Misfit** — odd-one-out, then redeem +- **Tempo:** medium, ~3 min per puzzle. +- **Mechanic:** The game shows a category and 4–5 words; one of them doesn't quite fit. Stage 1: identify the misfit. Stage 2 (bonus): propose a category the *misfit* word DOES fit. Both stages judged by the LLM. +- **AI calls:** 1 JUDGE on stage 1 (verifies the misfit), 1 CREATIVE_ACCEPT on stage 2 (validates the player's redemption category). 
+- **Structural novelty:** the second stage — "what category does the wrong word actually fit?" — is essentially impossible without live judging. Static games can plant misfits; they can't accept arbitrary creative redemptions. + +### Slow / daily + +#### 7. **Coalition** — daily creativity leaderboard +- **Tempo:** daily, 24-hour cycle, async. +- **Mechanic:** Once per day, the system publishes 16 words (offline-generated by 26b with the guarded pipeline + filter + critique pass). All players worldwide get the same 16. Each player submits their own 4×4 sort with 4 self-supplied categories. Server collects all submissions. Daily leaderboard ranks by: + - **Validity:** all 4 categories accepted by the LLM (binary gate). + - **Tightness score:** LLM rates each category 1–5; submission score is the average. + - **Uniqueness:** how few other players used the same exact grouping (rewards creativity over the obvious solution). +- **AI calls:** 4 CREATIVE_ACCEPT + 4 tightness ratings per submission. +- **Structural novelty:** the social/share ritual of Wordle and Connections, but with creativity as the leaderboard axis instead of speed-to-known-answer. "I split the daily 16 with the only 'Greek myths' grouping anyone found" is a different brag than "I solved it in 2 mistakes." + +#### 8. **Bench** — collaborative single-category foraging +- **Tempo:** daily, 24-hour async. +- **Mechanic:** Each day a single category is published ("Words that follow GREEN" or "Things you can break"). Players have 24 hours to submit as many words as they can; LLM judges each. Each accepted word is "claimed" by the first submitter (publicly visible). Per-player score = unique claims. +- **AI calls:** 1 JUDGE per submitted word. +- **Structural novelty:** the *answer set is generative*, not hand-curated. 
NYT can't ship an open-ended "submit anything that fits" puzzle because they don't know all the answers; the LLM does (well enough for 87.5% of cases, with the bench growing publicly to fill in the rest). + +### Hybrid / structurally distinctive + +#### 9. **Heist** — competitive bluff-and-claim +- **Tempo:** medium-fast, 2-team multiplayer. +- **Mechanic:** Two teams share a pool of words. Each turn, the active team **announces a category** ("Words that follow BLUE") and has 30 seconds to claim words from the pool that fit. The opposing team can **challenge** any claim — if the LLM agrees the word doesn't fit, the claiming team loses points; if it does, the challenger loses points. Bluffing dynamics emerge naturally: claim a borderline word and dare them to challenge. +- **AI calls:** 1 JUDGE per claim (at challenge-time only — no need to judge unchallenged claims unless you want a "true scoring" cleanup pass at end-of-game). +- **Structural novelty:** competitive *risk-taking* on category boundaries. The challenge mechanic literally requires a live, fair judge — there's no static-game equivalent because static games can't adjudicate disputes mid-play. + +#### 10. **Hidden** — find the broadest tight category +- **Tempo:** medium, ~5 min per puzzle. +- **Mechanic:** 12 (or more) words on a board. Find ONE category that fits ALL of them — and the *narrower / more specific* the category, the higher the score. ("Things that exist" gets you 1 point; "Things you'd find in a 1980s bedroom" gets you 8.) LLM judges on both validity (does it actually fit all 12?) and tightness (1–5). +- **AI calls:** 1 batched JUDGE (on category × 12 words) per submission, plus 1 tightness rating. +- **Structural novelty:** the inversion. Every other word game asks the player to find narrow groups inside a board; this one asks the player to find the *broadest* category that *still* feels tight. A different cognitive skill, and impossible without live category judging. 
+ +--- + +## Recombinable building blocks + +The 10 ideas above mix six primitives. Use these to remix or design new variants: + +| Primitive | Variants | +|---|---| +| **Time pressure** | Real-time / per-move timer / per-day async / untimed | +| **Goal direction** | Find a valid grouping · validate a player-proposed grouping · find a misfit · find a "bridge" word · find the broadest tight category · build a chain | +| **Player count** | Solo · async-multi (Wordle-shape) · sync-co-op · sync-versus | +| **Word source** | Daily-curated 16 · player-supplied · conveyor-fed stream · category-seeded generation | +| **Scoring axis** | Speed · count · uniqueness vs other players · LLM-rated tightness · chain length | +| **AI call shape** | JUDGE single · JUDGE batched (one category × N words) · CREATIVE_ACCEPT · CREATE (rare — from the bakeoff this is the least reliable axis) · tightness-rating | + +Easy recombinations to consider: + +- **Pile + Coalition** = daily 60-second speedrun on the day's curated word pool, leaderboard by score. +- **Stretch + Hidden** = find the longest broadest category that still passes the tightness bar. +- **Heist + Threaded** = chain-builder versus mode where teams steal links from each other's chains. +- **Bench + Misfit** = daily foraging where some submissions are deliberate adversarial misfits the community has to flag. + +--- + +## Open questions / things still untested + +1. **Adversarial player input on CREATIVE_ACCEPT.** Tests used honest categories. Real players will try to game the judge with categories like "Words containing a vowel" (trivially-true on most English words) or "Words that are 4–7 letters long" (true by construction in many cases). Need a category-tightness pre-check on player input — at minimum, require the category to *fail* for at least one word from the wider deck, or apply a specificity bar. +2. 
**Cultural / contextual category robustness.** Tested categories were lexical/factual ("Roman gods", "fruits", "things you can crack"). Cultural references and time-bound categories ("Words in Beatles songs", "Common Texan slang") may break the judge. +3. **Critique-pass effectiveness.** The generation pipeline assumes a second-model critique pass catches structural mistakes. Not yet verified — feed Experiment 1's failed puzzles into a critique prompt and check. +4. **8B's "no" bias on hard YES cases.** It missed `judge-y3` (days of the week — said all four were misfits, which was incoherent) and `judge-y6` (cold turkey). 8B might be slightly more conservative in production than its test numbers suggest. +5. **Diversity over time.** All 10 puzzles in Experiment 1 were unseeded; 31b reached for "scales" twice in 5 puzzles. With 26b alone for generation, the diversity question is sharper. A 30-day seeded run is the next experiment if any of the daily-puzzle ideas (Coalition, Bench) goes forward. + +--- + +## Repo structure + +``` +. +├── README.md # this file +├── IDEA.md # original brief, with note about the pivot +├── DECISIONS.md # decision log, kept as project moves forward +├── scripts/ +│ ├── gemma-generation-bakeoff.py # Experiment 1 — whole-puzzle generation +│ └── gemma-semantic-bakeoff.py # Experiment 2 — atomic skills +└── docs/reference/ + ├── gemma-generation-bakeoff-2026-04-27-221751.md # Experiment 1 report (graded) + ├── gemma-generation-bakeoff-2026-04-27-221751-raw.json + ├── gemma-semantic-bakeoff-2026-04-27-224800.md # Experiment 2 report (graded) + └── gemma-semantic-bakeoff-2026-04-27-224800-raw.json +``` + +## Reproduce + +```bash +# point at any local Ollama with gemma4:latest and gemma4:26b loaded +export OLLAMA_HOST=http://localhost:11434 +python3 scripts/gemma-semantic-bakeoff.py # ~5 min on a 24 GB GPU +python3 scripts/gemma-generation-bakeoff.py # ~10 min +``` + +Reports land in `docs/reference/` with timestamps. 
Hand-grade the CREATE outputs and any TODO grades inline in the markdown — both bakeoff scripts emit grading-friendly reports. + +## License + +Not yet specified. If you're considering using this code or the test bank in your own work, open an issue and ask. diff --git a/docs/reference/gemma-generation-bakeoff-2026-04-27-221751-raw.json b/docs/reference/gemma-generation-bakeoff-2026-04-27-221751-raw.json new file mode 100644 index 0000000..7d2a4ff --- /dev/null +++ b/docs/reference/gemma-generation-bakeoff-2026-04-27-221751-raw.json @@ -0,0 +1,790 @@ +[ + { + "model": "gemma4:26b", + "i": 1, + "dt": 5.670714855194092, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "An exploration of common words that can be paired with 'BIRD' or 'DRAGON' or relate to types of 'FLY'.", + "groups": [ + { + "difficulty": "yellow", + "category": "Types of Flies", + "words": [ + "FRUIT", + "HOUSE", + "FIRE", + "BLOW" + ] + }, + { + "difficulty": "green", + "category": "Words before 'FLY'", + "words": [ + "DRAGON", + "MAY", + "BUTTER", + "BOTTLE" + ] + }, + { + "difficulty": "blue", + "category": "Types of Birds", + "words": [ + "LARK", + "SWALLOW", + "CRANE", + "SWIFT" + ] + }, + { + "difficulty": "purple", + "category": "Words that follow 'DRAGON'", + "words": [ + "FRUIT", + "FLY", + "SLAYER", + "FLAME" + ] + } + ], + "intended_traps": [ + { + "word": "FRUIT", + "actual_group": "purple", + "trap_group": "yellow", + "reason": "Fruit fly is a very common type of fly, making it a primary red herring for the yellow group." + }, + { + "word": "FIRE", + "actual_group": "yellow", + "trap_group": "purple", + "reason": "Fire dragon or dragon flame connection could lead someone to look for a dragon-themed group." + }, + { + "word": "DRAGON", + "actual_group": "green", + "trap_group": "blue", + "reason": "Dragon is a mythical creature, and users might look for a 'mythical creatures' group including Crane or Swallow (which are real birds)." 
+ }, + { + "word": "FLY", + "actual_group": "purple", + "trap_group": "green", + "reason": "Dragonfly is a common word, tempting the player to put 'FLY' in the 'Words before FLY' group." + } + ] + }, + "eval_count": 500, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:26b", + "i": 2, + "dt": 5.300434589385986, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "Homophones and linguistic overlaps between nature and everyday objects.", + "groups": [ + { + "difficulty": "yellow", + "category": "Types of Trees", + "words": [ + "BIRCH", + "CEDAR", + "MAPLE", + "PINE" + ] + }, + { + "difficulty": "green", + "category": "Things that can be 'Dropped'", + "words": [ + "BEAT", + "HINT", + "LINE", + "SUBJECT" + ] + }, + { + "difficulty": "blue", + "category": "Units of Measurement", + "words": [ + "FOOT", + "GRAM", + "SECOND", + "STONE" + ] + }, + { + "difficulty": "purple", + "category": "Homophones of Body Parts", + "words": [ + "EYE", + "HARE", + "MUSSEL", + "QUEUE" + ] + } + ], + "intended_traps": [ + { + "word": "PINE", + "actual_group": "yellow", + "trap_group": "green", + "reason": "To 'pine' for someone is a feeling, but 'pine' could be mistaken for something you can 'drop' (like a pine needle)." + }, + { + "word": "FOOT", + "actual_group": "blue", + "trap_group": "yellow", + "reason": "A 'foot' is a unit of measurement, but it is also a body part, potentially leading to the 'Body Part Homophones' group." + }, + { + "word": "BEAT", + "actual_group": "green", + "trap_group": "blue", + "reason": "A 'beat' is a rhythm, but it could be mistaken for a unit of time/measurement in a musical context." + }, + { + "word": "EYE", + "actual_group": "purple", + "trap_group": "blue", + "reason": "The word 'eye' is a body part, which might tempt players looking for a body part category, but it is actually a homophone (I) for the purple group." 
+ } + ] + }, + "eval_count": 510, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:26b", + "i": 3, + "dt": 5.103548049926758, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "The concept of 'Pressure' and things that can be measured or applied.", + "groups": [ + { + "difficulty": "yellow", + "category": "Units of Pressure", + "words": [ + "BAR", + "PASCAL", + "PSI", + "TORR" + ] + }, + { + "difficulty": "green", + "category": "Words after 'BLOOD'", + "words": [ + "PRESSURE", + "TYPE", + "ORANGE", + "DONOR" + ] + }, + { + "difficulty": "blue", + "category": "Things that can be 'Broken'", + "words": [ + "HEART", + "RECORD", + "LAW", + "PROMISE" + ] + }, + { + "difficulty": "purple", + "category": "Words that are also Verbs for 'Applying Force'", + "words": [ + "PRESS", + "STAMP", + "PUNCH", + "DRIVE" + ] + } + ], + "intended_traps": [ + { + "word": "PRESSURE", + "actual_group": "green", + "trap_group": "yellow", + "reason": "Pressure is the concept being measured by the units in the yellow group." + }, + { + "word": "BAR", + "actual_group": "yellow", + "trap_group": "blue", + "reason": "A 'bar' is something that can be 'broken' (like a chocolate bar) or refers to a legal setting." + }, + { + "word": "PRESS", + "actual_group": "purple", + "trap_group": "green", + "reason": "The 'Press' is a common term in news/media, and 'Press' can be related to 'Blood' in a medical context (blood pressure)." + }, + { + "word": "RECORD", + "actual_group": "blue", + "trap_group": "purple", + "reason": "A 'record' is something you can 'stamp' or 'drive' (as in a driving record)." 
+ } + ] + }, + "eval_count": 492, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:26b", + "i": 4, + "dt": 5.257405757904053, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "A collection of words related to things that can be 'broken' or 'cracked', mixed with other categories like cooking and music.", + "groups": [ + { + "difficulty": "yellow", + "category": "Kinds of Eggs", + "words": [ + "CHICKEN", + "QUAIL", + "GOOSE", + "DUCK" + ] + }, + { + "difficulty": "green", + "category": "Things You Can Crack", + "words": [ + "NUT", + "CODE", + "WHIP", + "SMILE" + ] + }, + { + "difficulty": "blue", + "category": "Musical Terms", + "words": [ + "NOTE", + "CHORD", + "CLEF", + "BEAT" + ] + }, + { + "difficulty": "purple", + "category": "Words Before 'Egg'", + "words": [ + "NEST", + "EASTER", + "BAD", + "NESTING" + ] + } + ], + "intended_traps": [ + { + "word": "DUCK", + "actual_group": "yellow", + "trap_group": "green", + "reason": "DUCK is an action related to avoiding a whip or a blow, and can be paired with 'crack' in some linguistic contexts." + }, + { + "word": "CHICKEN", + "actual_group": "yellow", + "trap_group": "purple", + "reason": "CHICKEN is an egg-related word that could tempt someone looking for '___ Egg' phrases (like Chicken Egg)." + }, + { + "word": "BEAT", + "actual_group": "blue", + "trap_group": "green", + "reason": "BEAT is something you can do to a whip or a person, fitting the 'Things you can crack/strike' theme." + }, + { + "word": "NEST", + "actual_group": "purple", + "trap_group": "yellow", + "reason": "NEST is intrinsically linked to the birds in the yellow group (CHICKEN, GOOSE, etc.)." 
+ } + ] + }, + "eval_count": 506, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:26b", + "i": 5, + "dt": 4.578032493591309, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "A collection of words centered around various types of 'records' and 'tracks'.", + "groups": [ + { + "difficulty": "yellow", + "category": "Types of Records", + "words": [ + "LP", + "EP", + "Single", + "Album" + ] + }, + { + "difficulty": "green", + "category": "Things that have Tracks", + "words": [ + "Train", + "Athlete", + "Song", + "Mud" + ] + }, + { + "difficulty": "blue", + "category": "Words for 'Evidence'", + "words": [ + "Clue", + "Trace", + "Mark", + "Proof" + ] + }, + { + "difficulty": "purple", + "category": "Words that follow 'Side'", + "words": [ + "Effect", + "Kick", + "Split", + "Walk" + ] + } + ], + "intended_traps": [ + { + "word": "Single", + "actual_group": "yellow", + "trap_group": "blue", + "reason": "A 'single' can be a piece of evidence or a solitary item." + }, + { + "word": "Song", + "actual_group": "green", + "trap_group": "yellow", + "reason": "A song is often released as an LP, EP, or Single." + }, + { + "word": "Trace", + "actual_group": "blue", + "trap_group": "green", + "reason": "A trace can be a physical track left behind in mud." + }, + { + "word": "Split", + "actual_group": "purple", + "trap_group": "green", + "reason": "A 'split' can refer to a track or division in a race." 
+ } + ] + }, + "eval_count": 436, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:31b-it-q4_K_M", + "i": 1, + "dt": 18.89196801185608, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "Words related to circles, timing, and subtle wordplay with 'Hands'", + "groups": [ + { + "difficulty": "yellow", + "category": "Circular Objects", + "words": [ + "RING", + "COIN", + "WHEEL", + "DISC" + ] + }, + { + "difficulty": "green", + "category": "Units of Time", + "words": [ + "SECOND", + "MINUTE", + "HOUR", + "DAY" + ] + }, + { + "difficulty": "blue", + "category": "Ways to Assist", + "words": [ + "HELP", + "AID", + "BACK", + "SUPPORT" + ] + }, + { + "difficulty": "purple", + "category": "Things with Hands", + "words": [ + "CLOCK", + "WATCH", + "DECK", + "PLAYER" + ] + } + ], + "intended_traps": [ + { + "word": "SECOND", + "actual_group": "green", + "trap_group": "blue", + "reason": "To 'second' a motion is to support it." + }, + { + "word": "WATCH", + "actual_group": "purple", + "trap_group": "green", + "reason": "A watch is a device used to measure the units of time (second, minute, hour)." + }, + { + "word": "CLOCK", + "actual_group": "purple", + "trap_group": "green", + "reason": "Similar to watch, it is intrinsically linked to the time units group." + }, + { + "word": "BACK", + "actual_group": "blue", + "trap_group": "yellow", + "reason": "Could be mistaken as a part of a circular object or a physical position." 
+ } + ] + }, + "eval_count": 439, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:31b-it-q4_K_M", + "i": 2, + "dt": 16.603100776672363, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "A mix of architectural elements, types of scales, and linguistic wordplay", + "groups": [ + { + "difficulty": "yellow", + "category": "Parts of a Building", + "words": [ + "BEAM", + "COLUMN", + "EAVE", + "FOUNDATION" + ] + }, + { + "difficulty": "green", + "category": "Types of Scales", + "words": [ + "LIBRA", + "MUSICAL", + "WEIGHING", + "TIMELINE" + ] + }, + { + "difficulty": "blue", + "category": "Synonyms for Balance", + "words": [ + "EQUILIBRIUM", + "POISE", + "STEADY", + "STABILITY" + ] + }, + { + "difficulty": "purple", + "category": "Words that start with a Greek Letter", + "words": [ + "ALPHABET", + "BETRAYAL", + "GAMUT", + "DELUXE" + ] + } + ], + "intended_traps": [ + { + "word": "LIBRA", + "actual_group": "green", + "trap_group": "blue", + "reason": "Libra is the sign of the scales, which represents balance/equilibrium." + }, + { + "word": "COLUMN", + "actual_group": "yellow", + "trap_group": "green", + "reason": "A column can be a way to organize data on a scale or chart." + }, + { + "word": "GAMUT", + "actual_group": "purple", + "trap_group": "green", + "reason": "The 'gamut' refers to the full range or scale of something." 
+ } + ] + }, + "eval_count": 410, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:31b-it-q4_K_M", + "i": 3, + "dt": 17.279732942581177, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "Words related to vision, optics, and ocular puns", + "groups": [ + { + "difficulty": "yellow", + "category": "Parts of the Eye", + "words": [ + "CORNEA", + "PUPIL", + "IRIS", + "RETINA" + ] + }, + { + "difficulty": "green", + "category": "Types of Lenses", + "words": [ + "CONCAVE", + "CONVEX", + "CONTACT", + "BIFOCAL" + ] + }, + { + "difficulty": "blue", + "category": "Words meaning 'To Witness'", + "words": [ + "OBSERVE", + "SPOT", + "NOTE", + "VIEW" + ] + }, + { + "difficulty": "purple", + "category": "___ EYE", + "words": [ + "BULLS", + "BIRD", + "FISH", + "PRIVATE" + ] + } + ], + "intended_traps": [ + { + "word": "PUPIL", + "actual_group": "yellow", + "trap_group": "blue", + "reason": "A pupil is a student, and one might 'note' or 'observe' a pupil in a classroom setting." + }, + { + "word": "PRIVATE", + "actual_group": "purple", + "trap_group": "blue", + "reason": "Private can be a synonym for 'secret' or 'hidden', contrasting with the act of observing or spotting." + }, + { + "word": "CONTACT", + "actual_group": "green", + "trap_group": "blue", + "reason": "To 'contact' someone can be confused with 'spotting' or 'viewing' them in a social context." 
+ } + ] + }, + "eval_count": 422, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:31b-it-q4_K_M", + "i": 4, + "dt": 19.867947340011597, + "ok": true, + "attempts": 1, + "puzzle": { + "theme_seed": "Common words that can also be types of birds or musical terms", + "groups": [ + { + "difficulty": "yellow", + "category": "Things that can be 'Cast'", + "words": [ + "SPELL", + "VOTE", + "SHADOW", + "NET" + ] + }, + { + "difficulty": "green", + "category": "Musical Dynamics", + "words": [ + "PIANO", + "FORTE", + "SHARP", + "FLAT" + ] + }, + { + "difficulty": "blue", + "category": "Types of Birds", + "words": [ + "CRANE", + "SWALLOW", + "SWIFT", + "LOON" + ] + }, + { + "difficulty": "purple", + "category": "Words that mean 'to move quickly'", + "words": [ + "DART", + "BOLT", + "RUSH", + "FLY" + ] + } + ], + "intended_traps": [ + { + "word": "SWIFT", + "actual_group": "blue", + "trap_group": "purple", + "reason": "Swift is a synonym for moving quickly, fitting the 'move quickly' group." + }, + { + "word": "FLY", + "actual_group": "purple", + "trap_group": "blue", + "reason": "Fly is a common bird-related term (or a type of insect), making it look like it belongs with Crane and Swallow." + }, + { + "word": "SWALLOW", + "actual_group": "blue", + "trap_group": "purple", + "reason": "While a bird, if the user interprets 'move quickly' as 'gulp down', it might confuse them, though primarily it serves as a bird lure." + }, + { + "word": "SHARP", + "actual_group": "green", + "trap_group": "purple", + "reason": "A 'sharp' turn or movement can be associated with quickness/darting." 
+ } + ] + }, + "eval_count": 487, + "prompt_eval_count": 440 + }, + { + "model": "gemma4:31b-it-q4_K_M", + "i": 5, + "dt": 18.18777823448181, + "ok": true, + "attempts": 2, + "puzzle": { + "theme_seed": "Words related to types of 'scales' and musical terminology", + "groups": [ + { + "difficulty": "yellow", + "category": "Musical Scales", + "words": [ + "MAJOR", + "MINOR", + "CHROMATIC", + "PENTATONIC" + ] + }, + { + "difficulty": "green", + "category": "Things that Weigh", + "words": [ + "BALANCE", + "SPRING", + "DIGITAL", + "TRIPLE-BEAM" + ] + }, + { + "difficulty": "blue", + "category": "Ways to Climb", + "words": [ + "ASCEND", + "MOUNT", + "SCALE", + "SURMOUNT" + ] + }, + { + "difficulty": "purple", + "category": "___ NOTE", + "words": [ + "FOOT", + "BANK", + "KEY", + "HIGH" + ] + } + ], + "intended_traps": [ + { + "word": "SCALE", + "actual_group": "blue", + "trap_group": "yellow", + "reason": "It is the root word for musical scales, leading the player to look for other scale-related terms." + }, + { + "word": "KEY", + "actual_group": "purple", + "trap_group": "yellow", + "reason": "A 'key' is fundamentally linked to musical scales (e.g., the Key of C Major)." + }, + { + "word": "HIGH", + "actual_group": "purple", + "trap_group": "blue", + "reason": "High is an adjective often associated with climbing or ascending." + }, + { + "word": "BALANCE", + "actual_group": "green", + "trap_group": "blue", + "reason": "Balance can be seen as a state of being when climbing or mountaineering." 
+ } + ] + }, + "eval_count": 453, + "prompt_eval_count": 440 + } +] \ No newline at end of file diff --git a/docs/reference/gemma-generation-bakeoff-2026-04-27-221751.md b/docs/reference/gemma-generation-bakeoff-2026-04-27-221751.md new file mode 100644 index 0000000..e8e5e40 --- /dev/null +++ b/docs/reference/gemma-generation-bakeoff-2026-04-27-221751.md @@ -0,0 +1,278 @@ +# Gemma 4 Generation Bakeoff -- 2026-04-27-221751 + +## Setup +- Local Ollama on the test host (RTX 3090 Ti, 24 GB VRAM) +- Other GPU workloads paused for the duration of the run +- Models: `gemma4:26b`, `gemma4:31b-it-q4_K_M` +- 5 puzzles per model, base temperature 0.8 +- Gemma 4 settings (per `~/bin/gemma4-research/GOTCHAS.md`): `think=false`, `num_ctx=8192`, `num_predict=4096`. No `format=json` (infinite-loop bug). JSON extracted client-side via `body[body.find('{'):body.rfind('}')+1]`. +- Up to 3 attempts per puzzle with temperature bumped +0.1 each retry (AI_Visualizer pattern). Reported metrics use the *successful* attempt. +- One-shot, unaided generation. No critique pass, no example puzzle in prompt. + +## Timing + +| Model | n | avg s | avg tokens | tok/s | +|---|---|---|---|---| +| `gemma4:26b` | 5 | 5.2 | 489 | 94.3 | +| `gemma4:31b-it-q4_K_M` | 5 | 18.2 | 442 | 24.3 | + +## JSON parse rate + +- `gemma4:26b`: 5/5 parsed cleanly +- `gemma4:31b-it-q4_K_M`: 5/5 parsed cleanly + +## gemma4:26b -- puzzle 1 + +**Theme seed:** An exploration of common words that can be paired with 'BIRD' or 'DRAGON' or relate to types of 'FLY'. + +| Diff | Category | Words | +|---|---|---| +| yellow | Types of Flies | FRUIT, HOUSE, FIRE, BLOW | +| green | Words before 'FLY' | DRAGON, MAY, BUTTER, BOTTLE | +| blue | Types of Birds | LARK, SWALLOW, CRANE, SWIFT | +| purple | Words that follow 'DRAGON' | FRUIT, FLY, SLAYER, FLAME | + +**Claimed traps (4):** +- `FRUIT` (actually purple, traps yellow): Fruit fly is a very common type of fly, making it a primary red herring for the yellow group. 
+- `FIRE` (actually yellow, traps purple): Fire dragon or dragon flame connection could lead someone to look for a dragon-themed group. +- `DRAGON` (actually green, traps blue): Dragon is a mythical creature, and users might look for a 'mythical creatures' group including Crane or Swallow (which are real birds). +- `FLY` (actually purple, traps green): Dragonfly is a common word, tempting the player to put 'FLY' in the 'Words before FLY' group. + +**Grade: FAIL.** `FRUIT` appears in **two groups** (yellow Types of Flies + purple Words that follow DRAGON) — that's only 15 distinct tiles, not the required 16. Categories are also redundant: yellow ("Types of Flies") and green ("Words before FLY") are both fly-themed. A human curator would never ship this. + +_dt=5.7s, tokens=500_ + +## gemma4:26b -- puzzle 2 + +**Theme seed:** Homophones and linguistic overlaps between nature and everyday objects. + +| Diff | Category | Words | +|---|---|---| +| yellow | Types of Trees | BIRCH, CEDAR, MAPLE, PINE | +| green | Things that can be 'Dropped' | BEAT, HINT, LINE, SUBJECT | +| blue | Units of Measurement | FOOT, GRAM, SECOND, STONE | +| purple | Homophones of Body Parts | EYE, HARE, MUSSEL, QUEUE | + +**Claimed traps (4):** +- `PINE` (actually yellow, traps green): To 'pine' for someone is a feeling, but 'pine' could be mistaken for something you can 'drop' (like a pine needle). +- `FOOT` (actually blue, traps yellow): A 'foot' is a unit of measurement, but it is also a body part, potentially leading to the 'Body Part Homophones' group. +- `BEAT` (actually green, traps blue): A 'beat' is a rhythm, but it could be mistaken for a unit of time/measurement in a musical context. +- `EYE` (actually purple, traps blue): The word 'eye' is a body part, which might tempt players looking for a body part category, but it is actually a homophone (I) for the purple group. + +**Grade: PARTIAL FAIL.** Yellow/green/blue all tight. 
Purple "Homophones of Body Parts" is half-broken: `HARE`→hair ✓, `MUSSEL`→muscle ✓, but `EYE` *is* a body part (not a homophone of one) and `QUEUE`→cue is not a body-part homophone. 2/4 wrong on the trickiest group is a curator-rejection. Trap reasoning is weak (claims `EYE` traps blue Units — body parts aren't units). + +_dt=5.3s, tokens=510_ + +## gemma4:26b -- puzzle 3 + +**Theme seed:** The concept of 'Pressure' and things that can be measured or applied. + +| Diff | Category | Words | +|---|---|---| +| yellow | Units of Pressure | BAR, PASCAL, PSI, TORR | +| green | Words after 'BLOOD' | PRESSURE, TYPE, ORANGE, DONOR | +| blue | Things that can be 'Broken' | HEART, RECORD, LAW, PROMISE | +| purple | Words that are also Verbs for 'Applying Force' | PRESS, STAMP, PUNCH, DRIVE | + +**Claimed traps (4):** +- `PRESSURE` (actually green, traps yellow): Pressure is the concept being measured by the units in the yellow group. +- `BAR` (actually yellow, traps blue): A 'bar' is something that can be 'broken' (like a chocolate bar) or refers to a legal setting. +- `PRESS` (actually purple, traps green): The 'Press' is a common term in news/media, and 'Press' can be related to 'Blood' in a medical context (blood pressure). +- `RECORD` (actually blue, traps purple): A 'record' is something you can 'stamp' or 'drive' (as in a driving record). + +**Grade: BORDERLINE PASS.** Categories tight, 16 distinct words, ✓. `BAR` (yellow → blue trap) is a real overlap (chocolate bar, broken bar). `PRESS` claimed to trap green ("Words after BLOOD") — but blood **pressure** is the compound, not blood **press**, so the claimed trap doesn't actually hold. `RECORD`→stamp/drive is a stretch. 1/4 of Gemma's self-graded traps is broken; remainder OK. + +_dt=5.1s, tokens=492_ + +## gemma4:26b -- puzzle 4 + +**Theme seed:** A collection of words related to things that can be 'broken' or 'cracked', mixed with other categories like cooking and music. 
+ +| Diff | Category | Words | +|---|---|---| +| yellow | Kinds of Eggs | CHICKEN, QUAIL, GOOSE, DUCK | +| green | Things You Can Crack | NUT, CODE, WHIP, SMILE | +| blue | Musical Terms | NOTE, CHORD, CLEF, BEAT | +| purple | Words Before 'Egg' | NEST, EASTER, BAD, NESTING | + +**Claimed traps (4):** +- `DUCK` (actually yellow, traps green): DUCK is an action related to avoiding a whip or a blow, and can be paired with 'crack' in some linguistic contexts. +- `CHICKEN` (actually yellow, traps purple): CHICKEN is an egg-related word that could tempt someone looking for '___ Egg' phrases (like Chicken Egg). +- `BEAT` (actually blue, traps green): BEAT is something you can do to a whip or a person, fitting the 'Things you can crack/strike' theme. +- `NEST` (actually purple, traps yellow): NEST is intrinsically linked to the birds in the yellow group (CHICKEN, GOOSE, etc.). + +**Grade: FAIL.** Purple has `NESTING` as "Words Before EGG" — *nesting egg* isn't a phrase (*nest egg* is). Worse, `NEST` and `NESTING` are near-duplicate tiles. Yellow "Kinds of Eggs" and purple "Words Before EGG" are also redundant (chicken egg vs nest egg — both compound forms). Curator reject. + +_dt=5.3s, tokens=506_ + +## gemma4:26b -- puzzle 5 + +**Theme seed:** A collection of words centered around various types of 'records' and 'tracks'. + +| Diff | Category | Words | +|---|---|---| +| yellow | Types of Records | LP, EP, Single, Album | +| green | Things that have Tracks | Train, Athlete, Song, Mud | +| blue | Words for 'Evidence' | Clue, Trace, Mark, Proof | +| purple | Words that follow 'Side' | Effect, Kick, Split, Walk | + +**Claimed traps (4):** +- `Single` (actually yellow, traps blue): A 'single' can be a piece of evidence or a solitary item. +- `Song` (actually green, traps yellow): A song is often released as an LP, EP, or Single. +- `Trace` (actually blue, traps green): A trace can be a physical track left behind in mud. 
+- `Split` (actually purple, traps green): A 'split' can refer to a track or division in a race. + +**Grade: PASS.** Cleanest 26b output. 16 distinct words; categories tight; the word "track" carries **three different meanings** across green (railroad/running/album), with `SONG`→Records and `TRACE`→Tracks as real overlaps. That's the multi-meaning anchor pattern Connections actually uses. `SINGLE` and `SPLIT` traps are weak, but the puzzle structure is publication-shaped. + +_dt=4.6s, tokens=436_ + +## gemma4:31b-it-q4_K_M -- puzzle 1 + +**Theme seed:** Words related to circles, timing, and subtle wordplay with 'Hands' + +| Diff | Category | Words | +|---|---|---| +| yellow | Circular Objects | RING, COIN, WHEEL, DISC | +| green | Units of Time | SECOND, MINUTE, HOUR, DAY | +| blue | Ways to Assist | HELP, AID, BACK, SUPPORT | +| purple | Things with Hands | CLOCK, WATCH, DECK, PLAYER | + +**Claimed traps (4):** +- `SECOND` (actually green, traps blue): To 'second' a motion is to support it. +- `WATCH` (actually purple, traps green): A watch is a device used to measure the units of time (second, minute, hour). +- `CLOCK` (actually purple, traps green): Similar to watch, it is intrinsically linked to the time units group. +- `BACK` (actually blue, traps yellow): Could be mistaken as a part of a circular object or a physical position. + +**Grade: BORDERLINE PASS.** `DECK` in "Things with Hands" is shaky — decks have cards, players have hands; the phrase is "all hands on deck" (sailors), not "deck has hands." Other 3 purple members fit. Strong saving grace: `SECOND`→assist (second a motion), `WATCH`→time, `CLOCK`→time are 3 genuinely strong traps. Categories otherwise tight. 
+ +_dt=18.9s, tokens=439_ + +## gemma4:31b-it-q4_K_M -- puzzle 2 + +**Theme seed:** A mix of architectural elements, types of scales, and linguistic wordplay + +| Diff | Category | Words | +|---|---|---| +| yellow | Parts of a Building | BEAM, COLUMN, EAVE, FOUNDATION | +| green | Types of Scales | LIBRA, MUSICAL, WEIGHING, TIMELINE | +| blue | Synonyms for Balance | EQUILIBRIUM, POISE, STEADY, STABILITY | +| purple | Words that start with a Greek Letter | ALPHABET, BETRAYAL, GAMUT, DELUXE | + +**Claimed traps (3):** +- `LIBRA` (actually green, traps blue): Libra is the sign of the scales, which represents balance/equilibrium. +- `COLUMN` (actually yellow, traps green): A column can be a way to organize data on a scale or chart. +- `GAMUT` (actually purple, traps green): The 'gamut' refers to the full range or scale of something. + +**Grade: FAIL.** Two broken categories. Green "Types of Scales" includes `LIBRA` (zodiac sign, not a scale type) and `TIMELINE` (a line, not a scale). Purple "Words that start with a Greek Letter" only holds for `ALPHABET` (ALPHA + BET): `DELUXE` starts with "DEL", not the full letter name "DELTA"; `GAMUT` starts with "GAM", missing the full "GAMMA"; and `BETRAYAL` starts with "BETR", not "BETA". 3 of 4 purple words don't actually start with the complete Greek-letter name. + +_dt=16.6s, tokens=410_ + +## gemma4:31b-it-q4_K_M -- puzzle 3 + +**Theme seed:** Words related to vision, optics, and ocular puns + +| Diff | Category | Words | +|---|---|---| +| yellow | Parts of the Eye | CORNEA, PUPIL, IRIS, RETINA | +| green | Types of Lenses | CONCAVE, CONVEX, CONTACT, BIFOCAL | +| blue | Words meaning 'To Witness' | OBSERVE, SPOT, NOTE, VIEW | +| purple | ___ EYE | BULLS, BIRD, FISH, PRIVATE | + +**Claimed traps (3):** +- `PUPIL` (actually yellow, traps blue): A pupil is a student, and one might 'note' or 'observe' a pupil in a classroom setting. +- `PRIVATE` (actually purple, traps blue): Private can be a synonym for 'secret' or 'hidden', contrasting with the act of observing or spotting. 
+- `CONTACT` (actually green, traps blue): To 'contact' someone can be confused with 'spotting' or 'viewing' them in a social context. + +**Grade: PASS.** All 4 categories tight: parts of eye / lens types / witness verbs / `___EYE` compounds (bull's, bird's, fish, private). `PUPIL`→student is a real, NYT-style multi-meaning trap. Only 3 traps claimed (rubric asked for 2+, ✓). Closest puzzle to publication quality across both models. + +_dt=17.3s, tokens=422_ + +## gemma4:31b-it-q4_K_M -- puzzle 4 + +**Theme seed:** Common words that can also be types of birds or musical terms + +| Diff | Category | Words | +|---|---|---| +| yellow | Things that can be 'Cast' | SPELL, VOTE, SHADOW, NET | +| green | Musical Dynamics | PIANO, FORTE, SHARP, FLAT | +| blue | Types of Birds | CRANE, SWALLOW, SWIFT, LOON | +| purple | Words that mean 'to move quickly' | DART, BOLT, RUSH, FLY | + +**Claimed traps (4):** +- `SWIFT` (actually blue, traps purple): Swift is a synonym for moving quickly, fitting the 'move quickly' group. +- `FLY` (actually purple, traps blue): Fly is a common bird-related term (or a type of insect), making it look like it belongs with Crane and Swallow. +- `SWALLOW` (actually blue, traps purple): While a bird, if the user interprets 'move quickly' as 'gulp down', it might confuse them, though primarily it serves as a bird lure. +- `SHARP` (actually green, traps purple): A 'sharp' turn or movement can be associated with quickness/darting. + +**Grade: BORDERLINE PASS.** Green should be "Musical Terms" not "Dynamics" — `SHARP` and `FLAT` are accidentals (pitch modifiers), not dynamics (loudness). Pedantic but a real category-label miss. Saving graces: `SWIFT` (a bird whose name also means fast) and `FLY` (a move-quickly verb that also reads as bird-related) are exactly the kind of multi-meaning anchors a real Connections puzzle would deploy. 
+ +_dt=19.9s, tokens=487_ + +## gemma4:31b-it-q4_K_M -- puzzle 5 + +**Theme seed:** Words related to types of 'scales' and musical terminology + +| Diff | Category | Words | +|---|---|---| +| yellow | Musical Scales | MAJOR, MINOR, CHROMATIC, PENTATONIC | +| green | Things that Weigh | BALANCE, SPRING, DIGITAL, TRIPLE-BEAM | +| blue | Ways to Climb | ASCEND, MOUNT, SCALE, SURMOUNT | +| purple | ___ NOTE | FOOT, BANK, KEY, HIGH | + +**Claimed traps (4):** +- `SCALE` (actually blue, traps yellow): It is the root word for musical scales, leading the player to look for other scale-related terms. +- `KEY` (actually purple, traps yellow): A 'key' is fundamentally linked to musical scales (e.g., the Key of C Major). +- `HIGH` (actually purple, traps blue): High is an adjective often associated with climbing or ascending. +- `BALANCE` (actually green, traps blue): Balance can be seen as a state of being when climbing or mountaineering. + +**Grade: PASS.** The whole puzzle is built around `SCALE` carrying three meanings: musical scale (yellow), weighing scale (green's theme — though Gemma mislabels it "Things that Weigh" instead of "Types of Scales"), and "to climb" (blue, where SCALE-the-word lives). That is exactly the central-anchor pattern a real NYT Connections puzzle uses. `KEY`, `HIGH`, `BALANCE` traps all genuinely overlap. Categories slightly mislabeled but structure is publication-quality. + +_dt=18.2s, tokens=453_ + +--- + +## Aggregate + +| Model | Pass | Borderline | Fail | Avg s | Avg tok/s | +|---|---|---|---|---|---| +| `gemma4:26b` | 1 (#5) | 1 (#3) + 1 partial (#2) | 2 (#1, #4) | 5.2 | 94.3 | +| `gemma4:31b-it-q4_K_M` | 2 (#3, #5) | 2 (#1, #4) | 1 (#2) | 18.2 | 24.3 | + +**31b is materially more reliable** — 2 clean passes vs 26b's 1, and only 1 hard fail vs 26b's 2 hard fails plus a partial-fail. 31b is 3.5× slower per generation but at 18s for a once-per-day puzzle, that's irrelevant. 
26b is fast enough for interactive use but produces broken puzzles half the time. + +### Failure modes (in order of how often they recur) + +1. **Structural violations** — duplicate or near-duplicate words on the 16-tile board, or a word listed in two groups. (#1-26b: `FRUIT` × 2; #4-26b: `NEST`/`NESTING`.) Catastrophic — a real Connections board has 16 *distinct* tiles. **Trivially detectable** with a deterministic post-filter. +2. **Broken category logic** — words placed in a category that don't actually fit. (#2-26b: `EYE`/`QUEUE` aren't body-part homophones; #4-26b: `NESTING` isn't a "Word before EGG"; #2-31b: `LIBRA`/`TIMELINE` aren't scales, `DELUXE` doesn't start with the full Greek letter "DELTA"; #1-31b: `DECK` doesn't have hands.) **Hard to detect deterministically** — needs a critique/judging pass. +3. **Redundant categories** — two groups themed on the same concept (#1-26b: yellow + green both fly-themed; #4-26b: yellow + purple both egg-themed). Detectable with a category-similarity check. +4. **Weak/circular trap reasoning** — Gemma's claimed "intended_traps" sometimes don't actually hold. (#3-26b: `PRESS` doesn't fit "Words after BLOOD" — the compound is *blood pressure*, not *blood press*.) Means **Gemma cannot reliably grade its own puzzles** — independent judging required. + +### Successes (when Gemma gets it right, what it does right) + +- **Multi-meaning anchor words** — `SCALE` (3 meanings, #5-31b), `SWIFT`/`FLY` (bird + fast, #4-31b), `PUPIL` (eye + student, #3-31b), `TRACK` (3 meanings, #5-26b). When Gemma builds a puzzle around an anchor, it produces real Connections-grade overlap. +- **Compound-word categories** — "`___ EYE`" (#3-31b), "Side `___`" (#5-26b), "Words before EGG" (#4-26b — when Gemma doesn't poison it). These are the easiest pattern to get right. +- **Tight short labels** when Gemma sticks to well-known domains (parts of eye, types of lenses, types of records). 
+ +### Implication for design + +**Generation is viable, but not unaided.** The shape of the data engine: + +``` +generate (gemma4:31b) + → deterministic filter [check 16 distinct tiles, no dup words, all words appear in categories] + → category-similarity check [reject puzzles with redundant themes] + → critique pass [either gemma4:31b second pass, or qwen3-coder:30b as judge] + → reject + regenerate if any fail; accept once filtered + → cache as the day's puzzle +``` + +At ~18s/generation and a roughly 50% structural-pass rate, a daily puzzle costs an expected ~2 generations + 1 critique = maybe 1 minute of GPU time per day. Effectively free. + +**26b is unsuitable as the primary generator** — too many hard fails. It could plausibly be the *judging* model (cheaper, runs on every player guess) since judging is much easier than generating. But that decision is for the brainstorm. + +### Risks not yet checked + +- **Diversity over time.** All 10 puzzles produced here are within a single seed-less batch. If Gemma keeps reaching for the same themes (we saw "scales" twice on 31b alone), a year-round daily stream (365 puzzles a year) might get repetitive. Test this with seeded prompts before committing. +- **Connections-vs-Gemma blind anchor not run.** I deferred this — the structural failures in Gemma's output (duplicate words, broken categories) are so obviously curator-rejection-tier that no human-curated puzzle would have them, so the within-Gemma comparison was decisive on its own. Still, before final design, eyeball one Gemma-pass puzzle next to a real NYT puzzle and check whether it actually feels equivalent. +- **Two-pass critique not validated.** The proposal above assumes a critique pass would catch Gemma's category mistakes. That assumption has not been tested. The next experiment is "feed Gemma's broken puzzles back to Gemma (or to a different model) and see if it flags the broken categories and structural issues." 
+ diff --git a/docs/reference/gemma-semantic-bakeoff-2026-04-27-224800-raw.json b/docs/reference/gemma-semantic-bakeoff-2026-04-27-224800-raw.json new file mode 100644 index 0000000..d8ea8d1 --- /dev/null +++ b/docs/reference/gemma-semantic-bakeoff-2026-04-27-224800-raw.json @@ -0,0 +1,2077 @@ +[ + { + "case_id": "judge-y1", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6691536903381348, + "eval_count": 31, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are common names of tree species.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y2", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6203711032867432, + "eval_count": 30, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are names of Greek letters.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y3", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.7630858421325684, + "eval_count": 48, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "The words are not all representative of the days of the week.", + "misfit_words": [ + "MONDAY", + "FRIDAY", + "SUNDAY", + "WEDNESDAY" + ] + }, + "raw": null, + "error": null, + "score": "FAIL" + }, + { + "case_id": "judge-y4", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6369147300720215, + "eval_count": 34, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are direct and common synonyms for 'happy'.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y5", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6616277694702148, + "eval_count": 35, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words can precede 'HOUSE' to form common compound words.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y6", + 
"type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.7502412796020508, + "eval_count": 45, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Only SHOULDER, FRONT, and SNAP fit common phrases with COLD, while TURKEY does not.", + "misfit_words": [ + "TURKEY" + ] + }, + "raw": null, + "error": null, + "score": "FAIL" + }, + { + "case_id": "judge-y7", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.619704008102417, + "eval_count": 31, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are strong synonyms for moving rapidly.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y8", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6541116237640381, + "eval_count": 34, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are common synonyms for a mental concept or idea.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n1", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6654887199401855, + "eval_count": 36, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Carrot is a root vegetable, not a type of tree.", + "misfit_words": [ + "CARROT" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n2", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6903350353240967, + "eval_count": 38, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Cyrillic is a script, not a Greek letter.", + "misfit_words": [ + "CYRILLIC" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n3", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.7092163562774658, + "eval_count": 42, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "The word 'SAD' is an antonym, not a synonym for 'happy'.", + "misfit_words": [ + "SAD" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": 
"judge-n4", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6665847301483154, + "eval_count": 35, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "January is a month, not a day of the week.", + "misfit_words": [ + "JANUARY" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n5", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6246583461761475, + "eval_count": 31, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "NIGHT is not a body part.", + "misfit_words": [ + "NIGHT" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n6", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6524357795715332, + "eval_count": 36, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Bats and mosquitoes are not types of birds.", + "misfit_words": [ + "BAT", + "MOSQUITO" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n7", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.6560320854187012, + "eval_count": 35, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Blue is a color, not necessarily a thing that is red.", + "misfit_words": [ + "BLUE" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n8", + "type": "JUDGE", + "model": "gemma4:latest", + "dt": 0.7655882835388184, + "eval_count": 48, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Only SHOULDER, FRONT, and PIZZA fit common phrases with COLD, but MOUNTAIN does not.", + "misfit_words": [ + "MOUNTAIN" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "create-e1", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.5947470664978027, + "eval_count": 30, + "ok": true, + "parsed": { + "words": [ + "Oak", + "Maple", + "Pine", + "Birch" + ], + "reason": "These are all common types of trees." 
+ }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e2", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.6220695972442627, + "eval_count": 32, + "ok": true, + "parsed": { + "words": [ + "Alpha", + "Beta", + "Gamma", + "Delta" + ], + "reason": "These are the first four letters of the Greek alphabet." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e3", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.6435914039611816, + "eval_count": 35, + "ok": true, + "parsed": { + "words": [ + "furious", + "irate", + "enraged", + "mad" + ], + "reason": "All four words are direct synonyms for feeling intense anger." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e4", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.6068227291107178, + "eval_count": 30, + "ok": true, + "parsed": { + "words": [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday" + ], + "reason": "These are four consecutive days of the week." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m1", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.7005162239074707, + "eval_count": 41, + "ok": true, + "parsed": { + "words": [ + "BLIZZARD", + "HEAT", + "STORM", + "SNOW" + ], + "reason": "These words can all precede 'STORM' to form a type of storm." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m2", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.6505529880523682, + "eval_count": 37, + "ok": true, + "parsed": { + "words": [ + "tiny", + "petite", + "minuscule", + "diminutive" + ], + "reason": "All four words are direct synonyms for 'small' in size." 
+ }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m3", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.6751258373260498, + "eval_count": 37, + "ok": true, + "parsed": { + "words": [ + "JEANS", + "BIRD", + "MOON", + "BLOOD" + ], + "reason": "All words can follow 'BLUE' to form common phrases." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m4", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.6219143867492676, + "eval_count": 32, + "ok": true, + "parsed": { + "words": [ + "heart", + "promise", + "record", + "bone" + ], + "reason": "All four items can be metaphorically or literally broken." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-h1", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.7682175636291504, + "eval_count": 49, + "ok": true, + "parsed": { + "words": [ + "SEA", + "SEE", + "HEAR", + "HERE" + ], + "reason": "All four words are homophones of common body parts (sea/see for 'C' sound, hear/here for 'hear')." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-h2", + "type": "CREATE", + "model": "gemma4:latest", + "dt": 0.7266902923583984, + "eval_count": 42, + "ok": true, + "parsed": { + "words": [ + "KNEEHIGH", + "EARDRUM", + "STOMACHACHE", + "EYELASH" + ], + "reason": "Each word contains a body part name as a substring." 
+ }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "ca-y1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.6249268054962158, + "eval_count": 31, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words can be used as verbs meaning to go up.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.5928308963775635, + "eval_count": 27, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four listed items are types of fruit.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y3", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.6987485885620117, + "eval_count": 40, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "You can crack a nut, a code, a smile, or a whip (like a whip crack).", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y4", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.6028926372528076, + "eval_count": 27, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four items are commonly associated with baseball.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y5", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.6008837223052979, + "eval_count": 29, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four are names of Roman deities associated with planets.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.728452205657959, + "eval_count": 43, + "ok": true, + "parsed": { + "valid": "no", + "reason": "These words are types of trees, not furniture brands.", + "weak_words": [ + "OAK", + "MAPLE", + "BIRCH", + "PINE" 
+ ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.621466875076294, + "eval_count": 32, + "ok": true, + "parsed": { + "valid": "no", + "reason": "These words are Greek letters, not inherently meaning 'small'.", + "weak_words": [ + "ALPHA" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n3", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.6523487567901611, + "eval_count": 33, + "ok": true, + "parsed": { + "valid": "no", + "reason": "BAT and BALL are not things worn on the hand.", + "weak_words": [ + "BAT", + "BALL" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n4", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.7682509422302246, + "eval_count": 48, + "ok": true, + "parsed": { + "valid": "no", + "reason": "All the words listed are days of the week, not months of the year.", + "weak_words": [ + "MONDAY", + "FRIDAY", + "SUNDAY", + "WEDNESDAY" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n5", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.6511921882629395, + "eval_count": 34, + "ok": true, + "parsed": { + "valid": "no", + "reason": "Nut and smile are not typically considered kitchen items.", + "weak_words": [ + "NUT", + "SMILE" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-b1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.6162059307098389, + "eval_count": 30, + "ok": true, + "parsed": { + "valid": "no", + "reason": "None of these words are recognized tech or phone brands.", + "weak_words": [ + "APPLE" + ] + }, + "raw": null, + "error": null, + "score": "BORDERLINE" + }, + { + "case_id": "ca-b2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:latest", + "dt": 0.8338274955749512, + "eval_count": 56, + "ok": true, + "parsed": { + 
"valid": "no", + "reason": "Only SHARP and FLAT are common musical terms, while KEY and NATURAL are also musical terms, making 'Real estate listing terms' a poor fit for all four.", + "weak_words": [ + "KEY", + "NATURAL" + ] + }, + "raw": null, + "error": null, + "score": "BORDERLINE" + }, + { + "case_id": "judge-y1", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7991511821746826, + "eval_count": 38, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are specific, well-known types of trees.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y2", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7396526336669922, + "eval_count": 38, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are standard names of letters in the Greek alphabet.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y3", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7360649108886719, + "eval_count": 37, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are specific names of days in the week.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y4", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7296037673950195, + "eval_count": 37, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are direct synonyms representing different degrees of happiness.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y5", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.9206283092498779, + "eval_count": 60, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "While Green, Ice, and Fire House are common, 'Court House' is typically written as two words (Courthouse) or refers to a different semantic structure.", + "misfit_words": [ + "COURT" + ] + }, + "raw": null, + 
"error": null, + "score": "FAIL" + }, + { + "case_id": "judge-y6", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.9253664016723633, + "eval_count": 50, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words form common phrases when preceded by 'cold': cold shoulder, cold front, cold snap, and cold turkey.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y7", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7268784046173096, + "eval_count": 36, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are synonyms for moving at high speed.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y8", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7770085334777832, + "eval_count": 42, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are direct synonyms for a mental concept, notion, or vague understanding.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n1", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7453572750091553, + "eval_count": 40, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Carrot is a root vegetable, not a type of tree.", + "misfit_words": [ + "CARROT" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n2", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7958042621612549, + "eval_count": 44, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "CYRILLIC refers to a script, not a specific Greek letter.", + "misfit_words": [ + "CYRILLIC" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n3", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7643091678619385, + "eval_count": 42, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "The word 'SAD' is an antonym rather than a synonym.", + 
"misfit_words": [ + "SAD" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n4", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7373373508453369, + "eval_count": 40, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "JANUARY is a month, not a day of the week.", + "misfit_words": [ + "JANUARY" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n5", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7540380954742432, + "eval_count": 39, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "The word 'NIGHT' is not a body part.", + "misfit_words": [ + "NIGHT" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n6", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7899594306945801, + "eval_count": 46, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "BAT and MOSQUITO are mammals and insects, respectively, not birds.", + "misfit_words": [ + "BAT", + "MOSQUITO" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n7", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.7485904693603516, + "eval_count": 39, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "BLUE and GRASS are not typically red.", + "misfit_words": [ + "BLUE", + "GRASS" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n8", + "type": "JUDGE", + "model": "gemma4:26b", + "dt": 0.9349522590637207, + "eval_count": 63, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "While shoulder, front, and mountain can follow 'cold', 'cold pizza' is a description of temperature rather than a common idiomatic phrase or compound noun like the others.", + "misfit_words": [ + "PIZZA" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "create-e1", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.6970863342285156, + "eval_count": 34, + "ok": true, + 
"parsed": { + "words": [ + "Oak", + "Maple", + "Birch", + "Cedar" + ], + "reason": "These are all common species of trees." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e2", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.7034692764282227, + "eval_count": 37, + "ok": true, + "parsed": { + "words": [ + "ALPHA", + "BETA", + "GAMMA", + "DELTA" + ], + "reason": "These are the first four letters of the Greek alphabet." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e3", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.723792314529419, + "eval_count": 40, + "ok": true, + "parsed": { + "words": [ + "Irate", + "Livid", + "Furious", + "Incensed" + ], + "reason": "All four words are direct synonyms for being extremely angry." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e4", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.6912398338317871, + "eval_count": 34, + "ok": true, + "parsed": { + "words": [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday" + ], + "reason": "These are all consecutive days of the week." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m1", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.7368042469024658, + "eval_count": 42, + "ok": true, + "parsed": { + "words": [ + "BRAIN", + "DUST", + "FIRE", + "SNOW" + ], + "reason": "Each word forms a common compound word when followed by 'STORM'." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m2", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.7377326488494873, + "eval_count": 42, + "ok": true, + "parsed": { + "words": [ + "tiny", + "microscopic", + "diminutive", + "minuscule" + ], + "reason": "All four words are direct synonyms for something of very small size." 
+ }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m3", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.7627155780792236, + "eval_count": 43, + "ok": true, + "parsed": { + "words": [ + "BIRD", + "JAY", + "PRINT", + "WHALE" + ], + "reason": "Each word can be preceded by 'blue' to form a common term." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m4", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.7079043388366699, + "eval_count": 37, + "ok": true, + "parsed": { + "words": [ + "Promise", + "Record", + "Heart", + "Law" + ], + "reason": "All four are common objects or concepts that can be broken." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-h1", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.8588354587554932, + "eval_count": 53, + "ok": true, + "parsed": { + "words": [ + "MUSSEL", + "HARE", + "EYE", + "KNEAD" + ], + "reason": "Each word is a homophone of a body part (muscle, hair, eye, knee) but spelled differently." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-h2", + "type": "CREATE", + "model": "gemma4:26b", + "dt": 0.8054819107055664, + "eval_count": 48, + "ok": true, + "parsed": { + "words": [ + "HANDSOME", + "FOOTPRINT", + "EARNEST", + "ARMORY" + ], + "reason": "Each word contains a body part (hand, foot, ear, arm) as a substring." 
+ }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "ca-y1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.7489900588989258, + "eval_count": 39, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are synonyms or closely related verbs describing the act of rising or ascending.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.7018086910247803, + "eval_count": 31, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are common types of fruit.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y3", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.7803692817687988, + "eval_count": 41, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "You can crack a whip, crack a nut, crack a code, and crack a smile.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y4", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.7275149822235107, + "eval_count": 37, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are standard pieces of equipment used in the game of baseball.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y5", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.7079963684082031, + "eval_count": 34, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are names of prominent deities in Roman mythology.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.8165183067321777, + "eval_count": 47, + "ok": true, + "parsed": { + "valid": "no", + "reason": "These words represent types of trees rather than furniture 
brands.", + "weak_words": [ + "OAK", + "MAPLE", + "BIRCH", + "PINE" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.9430453777313232, + "eval_count": 60, + "ok": true, + "parsed": { + "valid": "no", + "reason": "Alpha, beta, gamma, and delta are Greek letters used to denote various things, but none of them inherently mean 'small'.", + "weak_words": [ + "ALPHA", + "BETA", + "GAMMA", + "DELTA" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n3", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.7568156719207764, + "eval_count": 37, + "ok": true, + "parsed": { + "valid": "no", + "reason": "Bat and ball are not items worn on the hand.", + "weak_words": [ + "BAT", + "BALL" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n4", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.882706880569458, + "eval_count": 54, + "ok": true, + "parsed": { + "valid": "no", + "reason": "None of the words are months of the year; they are all days of the week.", + "weak_words": [ + "MONDAY", + "FRIDAY", + "SUNDAY", + "WEDNESDAY" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n5", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.8334591388702393, + "eval_count": 48, + "ok": true, + "parsed": { + "valid": "no", + "reason": "While whip and nut can relate to food, code and smile have no plausible connection to kitchen items.", + "weak_words": [ + "CODE", + "SMILE" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-b1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.7428381443023682, + "eval_count": 38, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are names of well-known technology companies or mobile device brands.", + "weak_words": [] + }, + "raw": null, + 
"error": null, + "score": "BORDERLINE" + }, + { + "case_id": "ca-b2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:26b", + "dt": 0.9244155883789062, + "eval_count": 58, + "ok": true, + "parsed": { + "valid": "no", + "reason": "While 'flat' and 'key' can relate to real estate, 'sharp' and 'natural' have no plausible connection to real estate listings.", + "weak_words": [ + "SHARP", + "NATURAL" + ] + }, + "raw": null, + "error": null, + "score": "BORDERLINE" + }, + { + "case_id": "judge-y1", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.0965914726257324, + "eval_count": 34, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are common types of trees.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y2", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.0503978729248047, + "eval_count": 38, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are the first four letters of the Greek alphabet.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y3", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 1.9456124305725098, + "eval_count": 35, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are standard days of the week.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y4", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.055443048477173, + "eval_count": 38, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are direct synonyms for the state of being happy.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y5", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.074954032897949, + "eval_count": 39, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words form 
common compound words or phrases with 'house'.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y6", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.1484782695770264, + "eval_count": 41, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words form common compound words or phrases when preceded by 'cold'.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y7", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 1.980708122253418, + "eval_count": 35, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are common synonyms for moving rapidly.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-y8", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.1886539459228516, + "eval_count": 42, + "ok": true, + "parsed": { + "verdict": "yes", + "reason": "All four words are common synonyms for a mental representation or a suggestion of an idea.", + "misfit_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n1", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.060978412628174, + "eval_count": 38, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Carrot is a root vegetable, not a tree.", + "misfit_words": [ + "CARROT" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n2", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.2179007530212402, + "eval_count": 43, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Cyrillic is a writing system, not a Greek letter.", + "misfit_words": [ + "CYRILLIC" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n3", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.1611642837524414, + "eval_count": 40, + "ok": true, + 
"parsed": { + "verdict": "no", + "reason": "SAD is an antonym of happy, not a synonym.", + "misfit_words": [ + "SAD" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n4", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.117088794708252, + "eval_count": 40, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "JANUARY is a month, not a day of the week.", + "misfit_words": [ + "JANUARY" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n5", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.1064138412475586, + "eval_count": 40, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "NIGHT is a time of day, not a body part.", + "misfit_words": [ + "NIGHT" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n6", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.0780858993530273, + "eval_count": 38, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Bat and mosquito are not birds.", + "misfit_words": [ + "BAT", + "MOSQUITO" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n7", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.004055976867676, + "eval_count": 37, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Blue and grass are not red.", + "misfit_words": [ + "BLUE", + "GRASS" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "judge-n8", + "type": "JUDGE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.7377030849456787, + "eval_count": 57, + "ok": true, + "parsed": { + "verdict": "no", + "reason": "Cold pizza and cold mountain are not established compound words or common idioms like cold shoulder and cold front.", + "misfit_words": [ + "PIZZA", + "MOUNTAIN" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "create-e1", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + 
"dt": 1.9321458339691162, + "eval_count": 34, + "ok": true, + "parsed": { + "words": [ + "Oak", + "Maple", + "Pine", + "Birch" + ], + "reason": "All four are common types of trees." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e2", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 1.9048969745635986, + "eval_count": 36, + "ok": true, + "parsed": { + "words": [ + "Alpha", + "Beta", + "Gamma", + "Delta" + ], + "reason": "These are the first four letters of the Greek alphabet." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e3", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.0525240898132324, + "eval_count": 40, + "ok": true, + "parsed": { + "words": [ + "Furious", + "Irate", + "Livid", + "Enraged" + ], + "reason": "All four words are strong synonyms for being extremely angry." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-e4", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 1.7998781204223633, + "eval_count": 33, + "ok": true, + "parsed": { + "words": [ + "Monday", + "Wednesday", + "Friday", + "Sunday" + ], + "reason": "They are all days of the week." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m1", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.073843479156494, + "eval_count": 40, + "ok": true, + "parsed": { + "words": [ + "BRAIN", + "FIRE", + "THUNDER", + "SNOW" + ], + "reason": "All four words form common compound words when followed by STORM." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m2", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.000436782836914, + "eval_count": 38, + "ok": true, + "parsed": { + "words": [ + "tiny", + "petite", + "slight", + "miniature" + ], + "reason": "All four words are direct synonyms for small in size." 
+ }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m3", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.1925113201141357, + "eval_count": 43, + "ok": true, + "parsed": { + "words": [ + "BERRY", + "JAY", + "MOON", + "PRINT" + ], + "reason": "All four words follow 'BLUE' to form common compound words or phrases." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-m4", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.063403606414795, + "eval_count": 38, + "ok": true, + "parsed": { + "words": [ + "Record", + "Promise", + "Silence", + "Heart" + ], + "reason": "All four are common abstract or physical things that can be broken." + }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "create-h1", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 6.69955849647522, + "eval_count": 170, + "ok": false, + "parsed": null, + "raw": "{\"words\": [\"HARE\", \"MUSSEL\", \"CEREAL\", \"STEEL\"], \"reason\": \"They are homophones of hair, muscle, serial (not a body part) - wait.\"}\n\n*(Self-correction: Cereal/Serial is not a body part. Let me refine the list: Hare/Hair, Mussel/Muscle, Nose/Knows is not it, Eye/I is not it. Let's use: Hare, Mussel, Hue/Hew is not it. Let's try: Hare, Mussel, Waist/Waste, Sole/Soul.)*\n\n{\"words\": [\"HARE\", \"MUSSEL\", \"WASTE\", \"SOUL\"], \"reason\": \"They are homophones of hair, muscle, waist, and sole.\"}", + "error": "JSONDecodeError('Extra data: line 3 column 1 (char 133)')", + "score": "PARSE_FAIL" + }, + { + "case_id": "create-h2", + "type": "CREATE", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.2020654678344727, + "eval_count": 42, + "ok": true, + "parsed": { + "words": [ + "HEARTBEAT", + "HANDSOME", + "FOOTPRINT", + "ARMCHAIR" + ], + "reason": "Each word contains a body part as a substring." 
+ }, + "raw": null, + "error": null, + "score": "MANUAL" + }, + { + "case_id": "ca-y1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.137995719909668, + "eval_count": 34, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are synonyms for the action of moving upward.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 1.8336284160614014, + "eval_count": 31, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are common types of fruit.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y3", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.1915175914764404, + "eval_count": 40, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are common objects or concepts that can be 'cracked'.", + "weak_words": [ + "WHIP" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y4", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 1.9880671501159668, + "eval_count": 34, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four items are standard pieces of equipment used in baseball.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-y5", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 1.996727705001831, + "eval_count": 35, + "ok": true, + "parsed": { + "valid": "yes", + "reason": "All four words are names of Roman deities as well as planets.", + "weak_words": [] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.716524839401245, + "eval_count": 54, + "ok": true, + "parsed": { + "valid": "no", + "reason": "These are types of trees/woods, not brands 
of furniture.", + "weak_words": [ + "OAK", + "MAPLE", + "BIRCH", + "PINE" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.5698249340057373, + "eval_count": 48, + "ok": true, + "parsed": { + "valid": "no", + "reason": "These are Greek letters used as designations, not synonyms for 'small'.", + "weak_words": [ + "ALPHA", + "BETA", + "GAMMA", + "DELTA" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n3", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.312160015106201, + "eval_count": 43, + "ok": true, + "parsed": { + "valid": "no", + "reason": "A bat and a ball are not worn on the hand.", + "weak_words": [ + "BAT", + "BALL" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n4", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.6066768169403076, + "eval_count": 50, + "ok": true, + "parsed": { + "valid": "no", + "reason": "The words are days of the week, not months of the year.", + "weak_words": [ + "MONDAY", + "FRIDAY", + "SUNDAY", + "WEDNESDAY" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-n5", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.3842716217041016, + "eval_count": 45, + "ok": true, + "parsed": { + "valid": "no", + "reason": "Code and smile are not items typically found in a kitchen.", + "weak_words": [ + "CODE", + "SMILE" + ] + }, + "raw": null, + "error": null, + "score": "PASS" + }, + { + "case_id": "ca-b1", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.347870349884033, + "eval_count": 44, + "ok": true, + "parsed": { + "valid": "no", + "reason": "Orange and Kiwi are not recognized tech or phone brands.", + "weak_words": [ + "ORANGE", + "KIWI" + ] + }, + "raw": null, + "error": null, + "score": "BORDERLINE" + }, + { + "case_id": 
"ca-b2", + "type": "CREATIVE_ACCEPT", + "model": "gemma4:31b-it-q4_K_M", + "dt": 2.40256667137146, + "eval_count": 46, + "ok": true, + "parsed": { + "valid": "no", + "reason": "Natural and Sharp are not standard terms used in real estate listings.", + "weak_words": [ + "NATURAL", + "SHARP" + ] + }, + "raw": null, + "error": null, + "score": "BORDERLINE" + } +] \ No newline at end of file diff --git a/docs/reference/gemma-semantic-bakeoff-2026-04-27-224800.md b/docs/reference/gemma-semantic-bakeoff-2026-04-27-224800.md new file mode 100644 index 0000000..2ae7a79 --- /dev/null +++ b/docs/reference/gemma-semantic-bakeoff-2026-04-27-224800.md @@ -0,0 +1,514 @@ +# Gemma 4 Semantic Bakeoff -- 2026-04-27 22:51 + +## Setup +- Local Ollama on the test host (RTX 3090 Ti, 24 GB VRAM) +- Models: `gemma4:latest`, `gemma4:26b`, `gemma4:31b-it-q4_K_M` +- Temperature 0.2 (raised +0.1 per retry on JSON parse fail, max 3 attempts) +- think=false, num_ctx=4096, num_predict=512, no format=json (per gemma4-research/GOTCHAS.md) +- 38 test cases: 16 JUDGE, 10 CREATE, 12 CREATIVE_ACCEPT +- Ground truth hand-labeled inline in `scripts/gemma-semantic-bakeoff.py` + +## Auto-scored summary + +| Model | JUDGE pass | CREATIVE_ACCEPT pass | parse fails | avg s | +|---|---|---|---|---| +| `gemma4:latest` | 14/16 | 10/10 | 0 | 0.7 | +| `gemma4:26b` | 15/16 | 10/10 | 0 | 0.8 | +| `gemma4:31b-it-q4_K_M` | 16/16 | 10/10 | 1 | 2.3 | + +## JUDGE + +### judge-y1 + +- Category: `Types of trees` +- Words: `OAK`, `MAPLE`, `BIRCH`, `PINE` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four words are common names of tree species. | PASS | 0.7s | +| `gemma4:26b` | yes | All four words are specific, well-known types of trees. | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are common types of trees. 
| PASS | 2.1s | + +### judge-y2 + +- Category: `Greek letters` +- Words: `ALPHA`, `BETA`, `GAMMA`, `DELTA` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four words are names of Greek letters. | PASS | 0.6s | +| `gemma4:26b` | yes | All four words are standard names of letters in the Greek alphabet. | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are the first four letters of the Greek alphabet. | PASS | 2.1s | + +### judge-y3 + +- Category: `Days of the week` +- Words: `MONDAY`, `FRIDAY`, `SUNDAY`, `WEDNESDAY` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | The words are not all representative of the days of the week. (misfit: MONDAY, FRIDAY, SUNDAY, WEDNESDAY) | FAIL | 0.8s | +| `gemma4:26b` | yes | All four words are specific names of days in the week. | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are standard days of the week. | PASS | 1.9s | + +### judge-y4 + +- Category: `Synonyms for 'happy'` +- Words: `JOYFUL`, `GLAD`, `CHEERFUL`, `ELATED` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four words are direct and common synonyms for 'happy'. | PASS | 0.6s | +| `gemma4:26b` | yes | All four words are direct synonyms representing different degrees of happiness. | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are direct synonyms for the state of being happy. | PASS | 2.1s | + +### judge-y5 + +- Category: `___ HOUSE (compound words)` +- Words: `GREEN`, `ICE`, `FIRE`, `COURT` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four words can precede 'HOUSE' to form common compound words. 
| PASS | 0.7s | +| `gemma4:26b` | no | While Green, Ice, and Fire House are common, 'Court House' is typically written as two words (Courthouse) or refers to a | FAIL | 0.9s | +| `gemma4:31b-it-q4_K_M` | yes | All four words form common compound words or phrases with 'house'. | PASS | 2.1s | + +### judge-y6 + +- Category: `Words that follow COLD` +- Words: `SHOULDER`, `FRONT`, `SNAP`, `TURKEY` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | Only SHOULDER, FRONT, and SNAP fit common phrases with COLD, while TURKEY does not. (misfit: TURKEY) | FAIL | 0.8s | +| `gemma4:26b` | yes | All four words form common phrases when preceded by 'cold': cold shoulder, cold front, cold snap, and cold turkey. | PASS | 0.9s | +| `gemma4:31b-it-q4_K_M` | yes | All four words form common compound words or phrases when preceded by 'cold'. | PASS | 2.1s | + +### judge-y7 + +- Category: `Verbs meaning 'to move quickly'` +- Words: `DART`, `BOLT`, `RUSH`, `FLY` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four words are strong synonyms for moving rapidly. | PASS | 0.6s | +| `gemma4:26b` | yes | All four words are synonyms for moving at high speed. | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are common synonyms for moving rapidly. | PASS | 2.0s | + +### judge-y8 + +- Category: `Synonyms for 'idea'` +- Words: `NOTION`, `CONCEPT`, `THOUGHT`, `INKLING` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four words are common synonyms for a mental concept or idea. | PASS | 0.7s | +| `gemma4:26b` | yes | All four words are direct synonyms for a mental concept, notion, or vague understanding. | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are common synonyms for a mental representation or a suggestion of an idea. 
| PASS | 2.2s | + +### judge-n1 + +- Category: `Types of trees` +- Words: `OAK`, `MAPLE`, `BIRCH`, `CARROT` +- Ground truth: **no** (misfit: CARROT) + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | Carrot is a root vegetable, not a type of tree. (misfit: CARROT) | PASS | 0.7s | +| `gemma4:26b` | no | Carrot is a root vegetable, not a type of tree. (misfit: CARROT) | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | no | Carrot is a root vegetable, not a tree. (misfit: CARROT) | PASS | 2.1s | + +### judge-n2 + +- Category: `Greek letters` +- Words: `ALPHA`, `BETA`, `GAMMA`, `CYRILLIC` +- Ground truth: **no** (misfit: CYRILLIC) + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | Cyrillic is a script, not a Greek letter. (misfit: CYRILLIC) | PASS | 0.7s | +| `gemma4:26b` | no | CYRILLIC refers to a script, not a specific Greek letter. (misfit: CYRILLIC) | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | no | Cyrillic is a writing system, not a Greek letter. (misfit: CYRILLIC) | PASS | 2.2s | + +### judge-n3 + +- Category: `Synonyms for 'happy'` +- Words: `JOYFUL`, `GLAD`, `SAD`, `ELATED` +- Ground truth: **no** (misfit: SAD) + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | The word 'SAD' is an antonym, not a synonym for 'happy'. (misfit: SAD) | PASS | 0.7s | +| `gemma4:26b` | no | The word 'SAD' is an antonym rather than a synonym. (misfit: SAD) | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | no | SAD is an antonym of happy, not a synonym. (misfit: SAD) | PASS | 2.2s | + +### judge-n4 + +- Category: `Days of the week` +- Words: `MONDAY`, `JANUARY`, `SUNDAY`, `WEDNESDAY` +- Ground truth: **no** (misfit: JANUARY) + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | January is a month, not a day of the week. (misfit: JANUARY) | PASS | 0.7s | +| `gemma4:26b` | no | JANUARY is a month, not a day of the week. 
(misfit: JANUARY) | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | no | JANUARY is a month, not a day of the week. (misfit: JANUARY) | PASS | 2.1s | + +### judge-n5 + +- Category: `Body parts` +- Words: `ARM`, `LEG`, `EYE`, `NIGHT` +- Ground truth: **no** (misfit: NIGHT) + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | NIGHT is not a body part. (misfit: NIGHT) | PASS | 0.6s | +| `gemma4:26b` | no | The word 'NIGHT' is not a body part. (misfit: NIGHT) | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | no | NIGHT is a time of day, not a body part. (misfit: NIGHT) | PASS | 2.1s | + +### judge-n6 + +- Category: `Types of birds` +- Words: `CRANE`, `SWALLOW`, `BAT`, `MOSQUITO` +- Ground truth: **no** (misfit: BAT, MOSQUITO) + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | Bats and mosquitoes are not types of birds. (misfit: BAT, MOSQUITO) | PASS | 0.7s | +| `gemma4:26b` | no | BAT and MOSQUITO are mammals and insects, respectively, not birds. (misfit: BAT, MOSQUITO) | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | no | Bat and mosquito are not birds. (misfit: BAT, MOSQUITO) | PASS | 2.1s | + +### judge-n7 + +- Category: `Things that are red` +- Words: `APPLE`, `BLUE`, `ROSE`, `GRASS` +- Ground truth: **no** (misfit: BLUE, GRASS) + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | Blue is a color, not necessarily a thing that is red. (misfit: BLUE) | PASS | 0.7s | +| `gemma4:26b` | no | BLUE and GRASS are not typically red. (misfit: BLUE, GRASS) | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | no | Blue and grass are not red. 
(misfit: BLUE, GRASS) | PASS | 2.0s | + +### judge-n8 + +- Category: `Words that follow COLD` +- Words: `SHOULDER`, `FRONT`, `PIZZA`, `MOUNTAIN` +- Ground truth: **no** (misfit: PIZZA, MOUNTAIN) + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | Only SHOULDER, FRONT, and PIZZA fit common phrases with COLD, but MOUNTAIN does not. (misfit: MOUNTAIN) | PASS | 0.8s | +| `gemma4:26b` | no | While shoulder, front, and mountain can follow 'cold', 'cold pizza' is a description of temperature rather than a common | PASS | 0.9s | +| `gemma4:31b-it-q4_K_M` | no | Cold pizza and cold mountain are not established compound words or common idioms like cold shoulder and cold front. (mis | PASS | 2.7s | + +## CREATE + +### create-e1 + +- Category: `Types of trees` +- Quality bar: Four valid tree species; e.g. OAK, MAPLE, BIRCH, PINE. + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | Oak, Maple, Pine, Birch | These are all common types of trees. | MANUAL | 0.6s | +| `gemma4:26b` | Oak, Maple, Birch, Cedar | These are all common species of trees. | MANUAL | 0.7s | +| `gemma4:31b-it-q4_K_M` | Oak, Maple, Pine, Birch | All four are common types of trees. | MANUAL | 1.9s | + +### create-e2 + +- Category: `Greek letters` +- Quality bar: Four genuine Greek letters. + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | Alpha, Beta, Gamma, Delta | These are the first four letters of the Greek alphabet. | MANUAL | 0.6s | +| `gemma4:26b` | ALPHA, BETA, GAMMA, DELTA | These are the first four letters of the Greek alphabet. | MANUAL | 0.7s | +| `gemma4:31b-it-q4_K_M` | Alpha, Beta, Gamma, Delta | These are the first four letters of the Greek alphabet. | MANUAL | 1.9s | + +### create-e3 + +- Category: `Synonyms for 'angry'` +- Quality bar: Four words that all genuinely mean angry/furious. 
+ +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | furious, irate, enraged, mad | All four words are direct synonyms for feeling intense anger. | MANUAL | 0.6s | +| `gemma4:26b` | Irate, Livid, Furious, Incensed | All four words are direct synonyms for being extremely angry. | MANUAL | 0.7s | +| `gemma4:31b-it-q4_K_M` | Furious, Irate, Livid, Enraged | All four words are strong synonyms for being extremely angry. | MANUAL | 2.1s | + +### create-e4 + +- Category: `Days of the week` +- Quality bar: Four of the seven weekday names, no months or other words. + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | Monday, Tuesday, Wednesday, Thursday | These are four consecutive days of the week. | MANUAL | 0.6s | +| `gemma4:26b` | Monday, Tuesday, Wednesday, Thursday | These are all consecutive days of the week. | MANUAL | 0.7s | +| `gemma4:31b-it-q4_K_M` | Monday, Wednesday, Friday, Sunday | They are all days of the week. | MANUAL | 1.8s | + +### create-m1 + +- Category: `___ STORM (compound words ending in STORM)` +- Quality bar: Four words that each form a real compound or fixed phrase with STORM (e.g. SAND, BRAIN, THUNDER, SNOW). + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | BLIZZARD, HEAT, STORM, SNOW | These words can all precede 'STORM' to form a type of storm. | MANUAL | 0.7s | +| `gemma4:26b` | BRAIN, DUST, FIRE, SNOW | Each word forms a common compound word when followed by 'STORM'. | MANUAL | 0.7s | +| `gemma4:31b-it-q4_K_M` | BRAIN, FIRE, THUNDER, SNOW | All four words form common compound words when followed by STORM. | MANUAL | 2.1s | + +### create-m2 + +- Category: `Synonyms for 'small'` +- Quality bar: Four words that all genuinely mean small. + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | tiny, petite, minuscule, diminutive | All four words are direct synonyms for 'small' in size. 
| MANUAL | 0.7s | +| `gemma4:26b` | tiny, microscopic, diminutive, minuscule | All four words are direct synonyms for something of very small size. | MANUAL | 0.7s | +| `gemma4:31b-it-q4_K_M` | tiny, petite, slight, miniature | All four words are direct synonyms for small in size. | MANUAL | 2.0s | + +### create-m3 + +- Category: `Words that follow BLUE` +- Quality bar: Four words that each form a real compound with BLUE (e.g. BERRY, BIRD, PRINT, BELL, GRASS). + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | JEANS, BIRD, MOON, BLOOD | All words can follow 'BLUE' to form common phrases. | MANUAL | 0.7s | +| `gemma4:26b` | BIRD, JAY, PRINT, WHALE | Each word can be preceded by 'blue' to form a common term. | MANUAL | 0.8s | +| `gemma4:31b-it-q4_K_M` | BERRY, JAY, MOON, PRINT | All four words follow 'BLUE' to form common compound words or phrases. | MANUAL | 2.2s | + +### create-m4 + +- Category: `Things that can be 'broken'` +- Quality bar: Four words that each form a real fixed phrase with 'broken' (heart, record, law, promise, etc.). + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | heart, promise, record, bone | All four items can be metaphorically or literally broken. | MANUAL | 0.6s | +| `gemma4:26b` | Promise, Record, Heart, Law | All four are common objects or concepts that can be broken. | MANUAL | 0.7s | +| `gemma4:31b-it-q4_K_M` | Record, Promise, Silence, Heart | All four are common abstract or physical things that can be broken. | MANUAL | 2.1s | + +### create-h1 + +- Category: `Words that are homophones of body parts but spelled differently (e.g. HARE = hair, MUSSEL = muscle)` +- Quality bar: Four words that each sound like a body part but are spelled differently. Valid examples: HARE (hair), MUSSEL (muscle), HEAL (heel), SOUL (sole), AYE/EYE-spelled-otherwise. EYE and HEEL alone do NOT count -- those are the body parts themselves, not homophones of them. 
+ +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | SEA, SEE, HEAR, HERE | All four words are homophones of common body parts (sea/see for 'C' sound, hear/here for 'hear'). | MANUAL | 0.8s | +| `gemma4:26b` | MUSSEL, HARE, EYE, KNEAD | Each word is a homophone of a body part (muscle, hair, eye, knee) but spelled differently. | MANUAL | 0.9s | +| `gemma4:31b-it-q4_K_M` | _parse fail_ | `JSONDecodeError('Extra data: line 3 column 1 (char 133)')` | PARSE_FAIL | 6.7s | + +### create-h2 + +- Category: `Words that contain a body part as a substring (e.g. HEARTBEAT contains HEART)` +- Quality bar: Four words that each contain a body part anywhere inside them. Valid examples: HEARTH (HEART), CHESTNUT (CHEST), EARTH (EAR), HEADACHE (HEAD). + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | KNEEHIGH, EARDRUM, STOMACHACHE, EYELASH | Each word contains a body part name as a substring. | MANUAL | 0.7s | +| `gemma4:26b` | HANDSOME, FOOTPRINT, EARNEST, ARMORY | Each word contains a body part (hand, foot, ear, arm) as a substring. | MANUAL | 0.8s | +| `gemma4:31b-it-q4_K_M` | HEARTBEAT, HANDSOME, FOOTPRINT, ARMCHAIR | Each word contains a body part as a substring. | MANUAL | 2.2s | + +## CREATIVE_ACCEPT + +### ca-y1 + +- Words: `SCALE`, `MOUNT`, `ASCEND`, `CLIMB` +- Player's category: `"Verbs for going up"` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four words can be used as verbs meaning to go up. | PASS | 0.6s | +| `gemma4:26b` | yes | All four words are synonyms or closely related verbs describing the act of rising or ascending. | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are synonyms for the action of moving upward. 
| PASS | 2.1s | + +### ca-y2 + +- Words: `APPLE`, `ORANGE`, `KIWI`, `BLACKBERRY` +- Player's category: `"Fruits"` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four listed items are types of fruit. | PASS | 0.6s | +| `gemma4:26b` | yes | All four words are common types of fruit. | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are common types of fruit. | PASS | 1.8s | + +### ca-y3 + +- Words: `WHIP`, `NUT`, `CODE`, `SMILE` +- Player's category: `"Things you can crack"` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | You can crack a nut, a code, a smile, or a whip (like a whip crack). | PASS | 0.7s | +| `gemma4:26b` | yes | You can crack a whip, crack a nut, crack a code, and crack a smile. | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are common objects or concepts that can be 'cracked'. (weak: WHIP) | PASS | 2.2s | + +### ca-y4 + +- Words: `BAT`, `BALL`, `GLOVE`, `MITT` +- Player's category: `"Baseball equipment"` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four items are commonly associated with baseball. | PASS | 0.6s | +| `gemma4:26b` | yes | All four words are standard pieces of equipment used in the game of baseball. | PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | yes | All four items are standard pieces of equipment used in baseball. | PASS | 2.0s | + +### ca-y5 + +- Words: `MARS`, `VENUS`, `MERCURY`, `JUPITER` +- Player's category: `"Roman gods"` +- Ground truth: **yes** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | yes | All four are names of Roman deities associated with planets. | PASS | 0.6s | +| `gemma4:26b` | yes | All four words are names of prominent deities in Roman mythology. 
| PASS | 0.7s | +| `gemma4:31b-it-q4_K_M` | yes | All four words are names of Roman deities as well as planets. | PASS | 2.0s | + +### ca-n1 + +- Words: `OAK`, `MAPLE`, `BIRCH`, `PINE` +- Player's category: `"Furniture brands"` +- Ground truth: **no** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | These words are types of trees, not furniture brands. (weak: OAK, MAPLE, BIRCH, PINE) | PASS | 0.7s | +| `gemma4:26b` | no | These words represent types of trees rather than furniture brands. (weak: OAK, MAPLE, BIRCH, PINE) | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | no | These are types of trees/woods, not brands of furniture. (weak: OAK, MAPLE, BIRCH, PINE) | PASS | 2.7s | + +### ca-n2 + +- Words: `ALPHA`, `BETA`, `GAMMA`, `DELTA` +- Player's category: `"Words meaning 'small'"` +- Ground truth: **no** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | These words are Greek letters, not inherently meaning 'small'. (weak: ALPHA) | PASS | 0.6s | +| `gemma4:26b` | no | Alpha, beta, gamma, and delta are Greek letters used to denote various things, but none of them inherently mean 'small'. | PASS | 0.9s | +| `gemma4:31b-it-q4_K_M` | no | These are Greek letters used as designations, not synonyms for 'small'. (weak: ALPHA, BETA, GAMMA, DELTA) | PASS | 2.6s | + +### ca-n3 + +- Words: `BAT`, `BALL`, `GLOVE`, `MITT` +- Player's category: `"Things worn on your hand"` +- Ground truth: **no** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | BAT and BALL are not things worn on the hand. (weak: BAT, BALL) | PASS | 0.7s | +| `gemma4:26b` | no | Bat and ball are not items worn on the hand. (weak: BAT, BALL) | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | no | A bat and a ball are not worn on the hand. 
(weak: BAT, BALL) | PASS | 2.3s | + +### ca-n4 + +- Words: `MONDAY`, `FRIDAY`, `SUNDAY`, `WEDNESDAY` +- Player's category: `"Months of the year"` +- Ground truth: **no** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | All the words listed are days of the week, not months of the year. (weak: MONDAY, FRIDAY, SUNDAY, WEDNESDAY) | PASS | 0.8s | +| `gemma4:26b` | no | None of the words are months of the year; they are all days of the week. (weak: MONDAY, FRIDAY, SUNDAY, WEDNESDAY) | PASS | 0.9s | +| `gemma4:31b-it-q4_K_M` | no | The words are days of the week, not months of the year. (weak: MONDAY, FRIDAY, SUNDAY, WEDNESDAY) | PASS | 2.6s | + +### ca-n5 + +- Words: `WHIP`, `NUT`, `CODE`, `SMILE` +- Player's category: `"Things found in a kitchen"` +- Ground truth: **no** + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | Nut and smile are not typically considered kitchen items. (weak: NUT, SMILE) | PASS | 0.7s | +| `gemma4:26b` | no | While whip and nut can relate to food, code and smile have no plausible connection to kitchen items. (weak: CODE, SMILE) | PASS | 0.8s | +| `gemma4:31b-it-q4_K_M` | no | Code and smile are not items typically found in a kitchen. (weak: CODE, SMILE) | PASS | 2.4s | + +### ca-b1 + +- Words: `APPLE`, `ORANGE`, `KIWI`, `BLACKBERRY` +- Player's category: `"Tech/phone brands"` +- Ground truth: **borderline** -- APPLE and BLACKBERRY clearly are tech brands; ORANGE is a EU/UK telecom carrier (defensible); KIWI is not a tech brand. Strict grader says no; lenient grader might accept ORANGE. + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | None of these words are recognized tech or phone brands. (weak: APPLE) | BORDERLINE | 0.6s | +| `gemma4:26b` | yes | All four words are names of well-known technology companies or mobile device brands. 
| BORDERLINE | 0.7s | +| `gemma4:31b-it-q4_K_M` | no | Orange and Kiwi are not recognized tech or phone brands. (weak: ORANGE, KIWI) | BORDERLINE | 2.3s | + +### ca-b2 + +- Words: `SHARP`, `FLAT`, `NATURAL`, `KEY` +- Player's category: `"Real estate listing terms"` +- Ground truth: **borderline** -- FLAT (UK apartment) and KEY (keys) connect; SHARP (a sharp property) and NATURAL (natural light?) are weak. Strict no; creative yes. + +| Model | Verdict | Reason | Score | dt | +|---|---|---|---|---| +| `gemma4:latest` | no | Only SHARP and FLAT are common musical terms, while KEY and NATURAL are also musical terms, making 'Real estate listing | BORDERLINE | 0.8s | +| `gemma4:26b` | no | While 'flat' and 'key' can relate to real estate, 'sharp' and 'natural' have no plausible connection to real estate list | BORDERLINE | 0.9s | +| `gemma4:31b-it-q4_K_M` | no | Natural and Sharp are not standard terms used in real estate listings. (weak: NATURAL, SHARP) | BORDERLINE | 2.4s | + +--- + +## Manual grading of CREATE outputs + +Auto-grading isn't possible for CREATE — graded by hand here. 
+ +| Case | gemma4:latest (8B) | gemma4:26b | gemma4:31b | +|---|---|---|---| +| **e1** Trees | OAK MAPLE PINE BIRCH ✓ | OAK MAPLE BIRCH CEDAR ✓ | OAK MAPLE PINE BIRCH ✓ | +| **e2** Greek letters | ALPHA BETA GAMMA DELTA ✓ | ALPHA BETA GAMMA DELTA ✓ | ALPHA BETA GAMMA DELTA ✓ | +| **e3** "Angry" synonyms | furious irate enraged mad ✓ | irate livid furious incensed ✓ | furious irate livid enraged ✓ | +| **e4** Days of week | Mon Tue Wed Thu ✓ | Mon Tue Wed Thu ✓ | Mon Wed Fri Sun ✓ | +| **m1** \_\_\_ STORM | **FAIL** — listed STORM itself; HEAT-storm not standard; BLIZZARD-storm redundant | brain dust fire snow ✓ | brain fire thunder snow ✓ | +| **m2** "Small" synonyms | tiny petite minuscule diminutive ✓ | tiny microscopic diminutive minuscule ✓ | tiny petite slight miniature ✓ | +| **m3** Words after BLUE | jeans bird moon blood ✓ | bird jay print whale ✓ | berry jay moon print ✓ | +| **m4** "Broken" things | heart promise record bone ✓ | promise record heart law ✓ | record promise silence heart ✓ | +| **h1** Body-part homophones | **FAIL** — gave SEA/SEE/HEAR/HERE, missed the body-part requirement entirely | **PARTIAL** — MUSSEL/HARE/KNEAD ✓ but EYE is the body part, not a homophone of one | **PARSE_FAIL** (after 3 attempts) | +| **h2** Containing body parts | KNEEHIGH EARDRUM STOMACHACHE EYELASH ✓ | HANDSOME FOOTPRINT EARNEST ARMORY ✓ | HEARTBEAT HANDSOME FOOTPRINT ARMCHAIR ✓ | + +**CREATE pass rate**: 8B = 8/10 (fails on m1 and h1; all other cases, including h2, pass) · 26b = 9/10 (1 partial on h1) · 31b = 9/10 (1 parse-fail on h1) + +The h1 failure is consistent with the prior puzzle bakeoff (#2-26b had the same "homophones of body parts" failure: words that ARE body parts vs words that SOUND LIKE them). **All three models share this confusion.** Designs depending on deep wordplay categories like this need either prompt scaffolding (give a worked example) or human curation of category seeds. 
+ +## Aggregate + +| Model | JUDGE | CREATE | CREATIVE_ACCEPT | Borderline | Avg s | Notes | +|---|---|---|---|---|---|---| +| `gemma4:latest` (8B) | 14/16 | 8/10 | 10/10 | 0/2 strict-aligned | 0.7 | Fastest. Slight bias toward "no" on hard YES cases (judge-y3 days-of-week miss, judge-y6 cold-turkey miss) | +| `gemma4:26b` | 15/16 | 9/10 | 10/10 | 1/2 over-permissive (said KIWI is a tech brand) | 0.8 | Best speed/quality balance for live judging. Shows mild "be helpful, agree" bias | +| `gemma4:31b-it-q4_K_M` | 16/16 | 9/10 | 10/10 | 2/2 strict | 2.3 | Most accurate. Only candidate for once-per-day generation. 1 parse-fail on h1 (3 retries didn't recover) | + +### What this proves + +1. **The CREATIVE_ACCEPT axis works on every model tested.** This is the structural unlock that makes a Gemma-powered Connections derivative meaningfully different from the static NYT format: live, fair judging of player-invented groupings. 10/10 across 3 models on 5 valid + 5 invalid player categories — accept WHIP/NUT/CODE/SMILE for "things you can crack", reject OAK/MAPLE/BIRCH/PINE for "furniture brands", reject MONDAY/FRIDAY/SUNDAY/WEDNESDAY for "months." The model gets the distinction Connections cares about. +2. **Per-guess JUDGE economics are cheap.** 0.7-0.8s on the 3090 Ti for 8B/26b. Even 1000 player-guesses/day costs <15 GPU-minutes — effectively free. +3. **31b is the right generator** (validated earlier) and **the right offline critique judge.** 26b/8B are the right live judges. +4. **Hard wordplay categories (homophones-of-body-parts class) consistently fail** across all three models. Either avoid them or scaffold with examples. + +### What this doesn't prove (limits / next steps) + +- **Borderline-case behavior is model-specific.** 26b said yes to KIWI as tech brand — that's a real false-positive risk for the CREATIVE_ACCEPT design. If the live game uses 26b, it will sometimes accept groupings a strict grader would reject. 
8B's stricter bias makes it safer here despite the 87.5% JUDGE rate; 31b is consistent and would be the gold standard but is too slow for live use. +- **Cultural/contextual categories untested.** "Words in a Beatles song", "Things only true after 2020" — these may break the judge in ways simple semantics don't. +- **No adversarial player.** What if the player invents a category to *deliberately game* the system into accepting a near-wrong grouping? E.g. "Words that contain a vowel" trivially fits any 4 English words. Need a category-tightness check on player input, not just word-fit. +- **Ground truth is mine and arguable.** judge-y5 (\_\_\_ HOUSE — 26b said "courthouse is one word, not 'court' + 'house'") is a defensible call I marked as a fail. Real human-grader agreement might bump 26b to 16/16 too. diff --git a/scripts/gemma-generation-bakeoff.py b/scripts/gemma-generation-bakeoff.py new file mode 100644 index 0000000..10381ea --- /dev/null +++ b/scripts/gemma-generation-bakeoff.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +"""Bakeoff: can Gemma 4 generate Connections-style structured puzzles? + +Stress-tests unaided one-shot generation on gemma4:26b and gemma4:31b on a +local Ollama (point OLLAMA_HOST at your instance; default localhost:11434). +Output is graded by hand afterward against a rubric in the README: +overlap-traps, tight category labels, purple wordplay, blind anchor vs a +real human-curated puzzle. +""" +import json +import os +import sys +import time +import urllib.request +from datetime import datetime +from pathlib import Path + +OLLAMA = f"{os.environ.get('OLLAMA_HOST', 'http://localhost:11434').rstrip('/')}/api/generate" +MODELS = ["gemma4:26b", "gemma4:31b-it-q4_K_M"] +N_PER_MODEL = 5 +TEMPERATURE = 0.8 +PROJECT_ROOT = Path(__file__).resolve().parent.parent + +PROMPT = """You are designing a single puzzle in the style of NYT Connections. 
+ +A Connections puzzle has: +- Exactly 16 distinct words or short phrases +- Sorted into 4 hidden groups of 4 +- Each group has a tight, specific category label +- Difficulty bands: yellow (easiest, most direct), green (medium), blue (harder, often more abstract), purple (trickiest -- wordplay, double meanings, hidden patterns; e.g. "___ HOUSE": GREEN, ICE, COURT, FIRE) +- The CRITICAL feature: at least 2-3 words must plausibly fit a different group than where they actually go. These red herrings are what make the puzzle hard. Without them, the puzzle is trivial. + +Generate ONE puzzle on a theme of your choice. Output strict JSON in this shape: + +{ + "theme_seed": "", + "groups": [ + {"difficulty": "yellow", "category": "", "words": ["W1","W2","W3","W4"]}, + {"difficulty": "green", "category": "<...>", "words": [...]}, + {"difficulty": "blue", "category": "<...>", "words": [...]}, + {"difficulty": "purple", "category": "<...>", "words": [...]} + ], + "intended_traps": [ + {"word": "", "actual_group": "yellow|green|blue|purple", "trap_group": "yellow|green|blue|purple", "reason": ""} + ] +} + +Rules: +- All 16 words must be distinct +- Categories must be tight enough that the right answer feels obviously right after the reveal +- intended_traps must list at least 2 genuine red-herring words +- Output ONLY the JSON object. No preamble, no markdown fences, no commentary. 
def call(model: str, prompt: str, temperature: float, timeout: int = 600):
    """Send one non-streaming generation request to the Ollama endpoint.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature placed in the request options.
        timeout: Seconds handed to ``urlopen`` as its timeout.

    Returns:
        ``(elapsed_seconds, data)`` where ``data`` is the decoded JSON body of
        the HTTP response.

    Raises:
        Nothing is caught here: errors from ``urlopen`` or from decoding the
        response body propagate to the caller.
    """
    # NOTE on Gemma 4 settings (see ~/bin/gemma4-research/GOTCHAS.md):
    # - No format=json (infinite loop on gemma4:26b Q4)
    # - think=false for single-turn JSON pipelines (else thinking tokens eat budget)
    # - num_ctx >> 2048 default, num_predict >> 128 default
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {
            "temperature": temperature,
            "num_ctx": 8192,
            "num_predict": 4096,
        },
    }
    req = urllib.request.Request(
        OLLAMA,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # a wall-clock adjustment in the middle of a long generation.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        data = json.loads(r.read())
    return time.perf_counter() - t0, data


def extract_json(body: str):
    """Pull the first JSON object out of a Gemma response.

    Decoding starts at the first ``{`` and stops at the end of that object,
    so anything the model appends afterwards (commentary, a closing code
    fence, text that itself contains braces) is ignored instead of breaking
    the parse.

    Args:
        body: Raw response text.

    Returns:
        The decoded object (a ``dict``).

    Raises:
        ValueError: if the text holds no ``{``/``}`` pair, or if the text
            starting at the first ``{`` is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if not body or "{" not in body or "}" not in body:
        raise ValueError("no JSON object delimiters in response")
    obj, _end = json.JSONDecoder().raw_decode(body[body.find("{"):])
    return obj


def warm(model: str) -> None:
    """Issue a trivial request so the model is loaded before timed runs."""
    print(f"[warm] {model}", file=sys.stderr, flush=True)
    call(model, "Reply with just the word OK.", temperature=0.1, timeout=300)


def run_model(model: str, n: int):
    """Generate ``n`` puzzles with ``model``, retrying each up to 3 times.

    Every retry raises the temperature by 0.1 over ``TEMPERATURE``. A retry
    happens when the request itself fails or when no JSON object can be
    extracted from the response.

    Args:
        model: Ollama model tag.
        n: Number of puzzles to request.

    Returns:
        One record per puzzle with keys ``model``, ``i``, ``dt``, ``ok``,
        ``attempts``, ``puzzle``, ``eval_count`` and ``prompt_eval_count``.
        On failure ``puzzle`` holds ``_parse_error`` (repr of the last
        exception) and ``_raw`` (up to 3000 chars of the last response
        received).
    """
    out = []
    for i in range(1, n + 1):
        # Retry with temp-bump pattern from AI_Visualizer
        last_raw = ""
        last_dt = 0.0
        last_data = {}
        last_err = None
        puzzle = None
        ok = False
        attempts = 0
        for attempt in range(3):
            attempts = attempt + 1
            temp = TEMPERATURE + attempt * 0.1
            print(f"[{model}] puzzle {i}/{n} attempt {attempts} (temp={temp:.1f})",
                  file=sys.stderr, flush=True)
            try:
                dt, data = call(model, PROMPT, temperature=temp)
            except Exception as e:  # best-effort: any request failure just costs an attempt
                last_err = repr(e)
                # Surface the failure; otherwise a dead endpoint only shows up
                # as three quiet attempts and a parse-fail record.
                print(f"[{model}] puzzle {i}/{n} attempt {attempts} request failed: {last_err}",
                      file=sys.stderr, flush=True)
                continue
            last_dt, last_data = dt, data
            last_raw = data.get("response", "") or ""
            try:
                puzzle = extract_json(last_raw)
            except Exception as e:
                last_err = repr(e)
                continue
            ok = True
            break

        out.append({
            "model": model, "i": i, "dt": last_dt, "ok": ok,
            "attempts": attempts,
            "puzzle": puzzle if ok else {"_parse_error": last_err, "_raw": last_raw[:3000]},
            # last_data stays {} when no request ever completed, so these fall back to 0.
            "eval_count": last_data.get("eval_count", 0),
            "prompt_eval_count": last_data.get("prompt_eval_count", 0),
        })
    return out
def _cell(value) -> str:
    """Return ``value`` as text that is safe inside a one-line Markdown table cell."""
    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def render(results, stamp: str) -> str:
    """Render the bakeoff results as a Markdown report.

    Args:
        results: Records shaped like the ones ``run_model`` returns (keys
            ``model``, ``i``, ``dt``, ``ok``, ``puzzle``, ``eval_count``).
        stamp: Timestamp string placed in the report title.

    Returns:
        The full report as one newline-joined string.
    """
    lines = [
        f"# Gemma 4 Generation Bakeoff -- {stamp}",
        "",
        "## Setup",
        f"- Ollama endpoint: `{OLLAMA}` (RTX 3090 Ti on the test host)",
        "- Other GPU workloads paused for the duration of the run",
        f"- Models: {', '.join(f'`{m}`' for m in MODELS)}",
        f"- {N_PER_MODEL} puzzles per model, base temperature {TEMPERATURE}",
        "- Gemma 4 settings (per `~/bin/gemma4-research/GOTCHAS.md`): `think=false`, "
        "`num_ctx=8192`, `num_predict=4096`. No `format=json` (infinite-loop bug). "
        "JSON extracted client-side via `body[body.find('{'):body.rfind('}')+1]`.",
        "- Up to 3 attempts per puzzle with temperature bumped +0.1 each retry "
        "(AI_Visualizer pattern). Reported metrics use the *successful* attempt.",
        "- One-shot, unaided generation. No critique pass, no example puzzle in prompt.",
        "",
        "## Timing",
        "",
        "| Model | n | avg s | avg tokens | tok/s |",
        "|---|---|---|---|---|",
    ]
    for m in MODELS:
        # Only successful puzzles count toward timing. Failed records can carry
        # dt=0.0 / eval_count=0 and would drag every average down, which
        # contradicts the "successful attempt" bullet above.
        rs = [r for r in results if r["model"] == m and r.get("ok")]
        if not rs:
            lines.append(f"| `{m}` | 0 | -- | -- | -- |")
            continue
        avg_s = sum(r["dt"] for r in rs) / len(rs)
        avg_tok = sum(r["eval_count"] for r in rs) / len(rs)
        toks = avg_tok / avg_s if avg_s else 0
        lines.append(f"| `{m}` | {len(rs)} | {avg_s:.1f} | {avg_tok:.0f} | {toks:.1f} |")

    lines += ["", "## JSON parse rate", ""]
    for m in MODELS:
        rs = [r for r in results if r["model"] == m]
        ok = sum(1 for r in rs if r.get("ok"))
        lines.append(f"- `{m}`: {ok}/{len(rs)} parsed cleanly")
    lines += [""]

    for r in results:
        head = f"## {r['model']} -- puzzle {r['i']}"
        lines += [head, ""]
        # NOTE(review): run_model in this file never sets an "error" key; the
        # branch is kept so records that do carry one still render.
        if "error" in r:
            lines += [f"_API error:_ `{r['error']}`", ""]
            continue
        if not r.get("ok"):
            err = r["puzzle"].get("_parse_error", "")
            raw = r["puzzle"].get("_raw", "")[:1500]
            lines += [f"_JSON parse failed:_ `{err}`", "```", raw, "```", ""]
            continue
        p = r["puzzle"]
        lines += [f"**Theme seed:** {p.get('theme_seed', '--')}", ""]
        lines += ["| Diff | Category | Words |", "|---|---|---|"]
        # The puzzle is model output that only passed a JSON parse, so its
        # shape is not guaranteed; render what is there instead of crashing.
        groups = p.get("groups")
        if not isinstance(groups, list):
            groups = []
        for g in groups:
            if not isinstance(g, dict):
                lines.append(f"| ? | ? | {_cell(g)} |")
                continue
            words = g.get("words") or []
            if isinstance(words, (list, tuple)):
                words = ", ".join(str(w) for w in words)
            cat = g.get("category") or "?"
            lines.append(f"| {_cell(g.get('difficulty', '?'))} | {_cell(cat)} | {_cell(words)} |")
        traps = p.get("intended_traps")
        if not isinstance(traps, list):
            traps = []
        lines += ["", f"**Claimed traps ({len(traps)}):**"]
        if not traps:
            lines.append("- _none claimed_")
        for t in traps:
            if not isinstance(t, dict):
                lines.append(f"- `{t}`")
                continue
            lines.append(
                f"- `{t.get('word')}` (actually {t.get('actual_group')}, traps {t.get('trap_group')}): "
                f"{t.get('reason')}"
            )
        lines += ["", "_Grade:_ TODO", "", f"_dt={r['dt']:.1f}s, tokens={r['eval_count']}_", ""]
    return "\n".join(lines)


def main() -> None:
    """Run every model, then write the raw JSON and the Markdown report.

    Both files go to ``docs/reference`` under ``PROJECT_ROOT``. Progress is
    printed to stderr; the Markdown path is the only thing printed to stdout.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-generation-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-generation-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, N_PER_MODEL))

    # Explicit UTF-8: the report embeds model text, which may not be encodable
    # in the platform's default encoding.
    raw_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)
    md_path.write_text(render(all_results, stamp), encoding="utf-8")
    print(f"md  -> {md_path}", file=sys.stderr)
    # Final stdout: just the markdown path so callers can pipe.
    print(md_path)


if __name__ == "__main__":
    main()
+""" +import json +import os +import sys +import time +import urllib.request +from datetime import datetime +from pathlib import Path + +OLLAMA = f"{os.environ.get('OLLAMA_HOST', 'http://localhost:11434').rstrip('/')}/api/generate" +MODELS = ["gemma4:latest", "gemma4:26b", "gemma4:31b-it-q4_K_M"] +PROJECT_ROOT = Path(__file__).resolve().parent.parent +TEMPERATURE = 0.2 # judging is a low-creativity task; we want consistency + +# ---------- prompts ---------- + +JUDGE_PROMPT = """You are evaluating whether four words tightly fit a given semantic category, in the style of NYT Connections. + +Category: {category} +Words: {w1}, {w2}, {w3}, {w4} + +Do ALL FOUR words clearly fit this category? Be strict -- if even one word doesn't fit, the answer is "no". Generic loose connections do not count. + +Output strict JSON, no preamble or fences: +{{"verdict": "yes" or "no", "reason": "", "misfit_words": [""]}} +""" + +CREATE_PROMPT = """You are creating a tight 4-word group in the style of NYT Connections. + +Category: {category} + +Produce EXACTLY four words or short phrases that tightly fit this category. Each word must clearly belong; vague or loosely-related words are not acceptable. + +Output strict JSON, no preamble or fences: +{{"words": ["W1", "W2", "W3", "W4"], "reason": ""}} +""" + +CREATIVE_ACCEPT_PROMPT = """You are judging a Connections-style puzzle where the player has proposed their OWN category for four words. Their category may differ from the puzzle's intended one, but it might still be a valid alternative -- if all four words plausibly fit the player's category, accept it. + +Words: {w1}, {w2}, {w3}, {w4} +Player's proposed category: "{player_category}" + +Do all four words plausibly fit the player's category? Be fair: a player-creative-but-valid grouping should be accepted. But if even one word genuinely doesn't fit, reject it. 
+ +Output strict JSON, no preamble or fences: +{{"valid": "yes" or "no", "reason": "", "weak_words": [""]}} +""" + +# ---------- test bank ---------- +# Each case has hand-labeled ground truth. The "gt" field is what a thoughtful +# human grader would say (yes/no for JUDGE and CREATIVE_ACCEPT). For CREATE, +# `gt_check` describes what a passing answer should look like. + +CASES = [ + # ---- JUDGE: clear yes (tight fit) ---- + {"id": "judge-y1", "type": "JUDGE", "category": "Types of trees", + "words": ["OAK", "MAPLE", "BIRCH", "PINE"], "gt": "yes"}, + {"id": "judge-y2", "type": "JUDGE", "category": "Greek letters", + "words": ["ALPHA", "BETA", "GAMMA", "DELTA"], "gt": "yes"}, + {"id": "judge-y3", "type": "JUDGE", "category": "Days of the week", + "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"], "gt": "yes"}, + {"id": "judge-y4", "type": "JUDGE", "category": "Synonyms for 'happy'", + "words": ["JOYFUL", "GLAD", "CHEERFUL", "ELATED"], "gt": "yes"}, + {"id": "judge-y5", "type": "JUDGE", "category": "___ HOUSE (compound words)", + "words": ["GREEN", "ICE", "FIRE", "COURT"], "gt": "yes"}, + {"id": "judge-y6", "type": "JUDGE", "category": "Words that follow COLD", + "words": ["SHOULDER", "FRONT", "SNAP", "TURKEY"], "gt": "yes"}, + {"id": "judge-y7", "type": "JUDGE", "category": "Verbs meaning 'to move quickly'", + "words": ["DART", "BOLT", "RUSH", "FLY"], "gt": "yes"}, + {"id": "judge-y8", "type": "JUDGE", "category": "Synonyms for 'idea'", + "words": ["NOTION", "CONCEPT", "THOUGHT", "INKLING"], "gt": "yes"}, + + # ---- JUDGE: clear no (one or more words don't fit) ---- + {"id": "judge-n1", "type": "JUDGE", "category": "Types of trees", + "words": ["OAK", "MAPLE", "BIRCH", "CARROT"], "gt": "no", + "gt_misfit": ["CARROT"]}, + {"id": "judge-n2", "type": "JUDGE", "category": "Greek letters", + "words": ["ALPHA", "BETA", "GAMMA", "CYRILLIC"], "gt": "no", + "gt_misfit": ["CYRILLIC"]}, + {"id": "judge-n3", "type": "JUDGE", "category": "Synonyms for 'happy'", + "words": 
["JOYFUL", "GLAD", "SAD", "ELATED"], "gt": "no", + "gt_misfit": ["SAD"]}, + {"id": "judge-n4", "type": "JUDGE", "category": "Days of the week", + "words": ["MONDAY", "JANUARY", "SUNDAY", "WEDNESDAY"], "gt": "no", + "gt_misfit": ["JANUARY"]}, + {"id": "judge-n5", "type": "JUDGE", "category": "Body parts", + "words": ["ARM", "LEG", "EYE", "NIGHT"], "gt": "no", + "gt_misfit": ["NIGHT"]}, + {"id": "judge-n6", "type": "JUDGE", "category": "Types of birds", + "words": ["CRANE", "SWALLOW", "BAT", "MOSQUITO"], "gt": "no", + "gt_misfit": ["BAT", "MOSQUITO"]}, + {"id": "judge-n7", "type": "JUDGE", "category": "Things that are red", + "words": ["APPLE", "BLUE", "ROSE", "GRASS"], "gt": "no", + "gt_misfit": ["BLUE", "GRASS"]}, + {"id": "judge-n8", "type": "JUDGE", "category": "Words that follow COLD", + "words": ["SHOULDER", "FRONT", "PIZZA", "MOUNTAIN"], "gt": "no", + "gt_misfit": ["PIZZA", "MOUNTAIN"]}, + + # ---- CREATE: easy categories ---- + {"id": "create-e1", "type": "CREATE", "category": "Types of trees", + "gt_check": "Four valid tree species; e.g. OAK, MAPLE, BIRCH, PINE."}, + {"id": "create-e2", "type": "CREATE", "category": "Greek letters", + "gt_check": "Four genuine Greek letters."}, + {"id": "create-e3", "type": "CREATE", "category": "Synonyms for 'angry'", + "gt_check": "Four words that all genuinely mean angry/furious."}, + {"id": "create-e4", "type": "CREATE", "category": "Days of the week", + "gt_check": "Four of the seven weekday names, no months or other words."}, + + # ---- CREATE: medium (compound / polysemy) ---- + {"id": "create-m1", "type": "CREATE", "category": "___ STORM (compound words ending in STORM)", + "gt_check": "Four words that each form a real compound or fixed phrase with STORM (e.g. 
SAND, BRAIN, THUNDER, SNOW)."}, + {"id": "create-m2", "type": "CREATE", "category": "Synonyms for 'small'", + "gt_check": "Four words that all genuinely mean small."}, + {"id": "create-m3", "type": "CREATE", "category": "Words that follow BLUE", + "gt_check": "Four words that each form a real compound with BLUE (e.g. BERRY, BIRD, PRINT, BELL, GRASS)."}, + {"id": "create-m4", "type": "CREATE", "category": "Things that can be 'broken'", + "gt_check": "Four words that each form a real fixed phrase with 'broken' (heart, record, law, promise, etc.)."}, + + # ---- CREATE: hard (wordplay / tight constraint) ---- + {"id": "create-h1", "type": "CREATE", + "category": "Words that are homophones of body parts but spelled differently (e.g. HARE = hair, MUSSEL = muscle)", + "gt_check": "Four words that each sound like a body part but are spelled differently. Valid examples: HARE (hair), MUSSEL (muscle), HEAL (heel), SOUL (sole), AYE/EYE-spelled-otherwise. EYE and HEEL alone do NOT count -- those are the body parts themselves, not homophones of them."}, + {"id": "create-h2", "type": "CREATE", + "category": "Words that contain a body part as a substring (e.g. HEARTBEAT contains HEART)", + "gt_check": "Four words that each contain a body part anywhere inside them. 
Valid examples: HEARTH (HEART), CHESTNUT (CHEST), EARTH (EAR), HEADACHE (HEAD)."}, + + # ---- CREATIVE_ACCEPT: player's grouping is genuinely valid ---- + {"id": "ca-y1", "type": "CREATIVE_ACCEPT", + "words": ["SCALE", "MOUNT", "ASCEND", "CLIMB"], + "player_category": "Verbs for going up", "gt": "yes"}, + {"id": "ca-y2", "type": "CREATIVE_ACCEPT", + "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"], + "player_category": "Fruits", "gt": "yes"}, + {"id": "ca-y3", "type": "CREATIVE_ACCEPT", + "words": ["WHIP", "NUT", "CODE", "SMILE"], + "player_category": "Things you can crack", "gt": "yes"}, + {"id": "ca-y4", "type": "CREATIVE_ACCEPT", + "words": ["BAT", "BALL", "GLOVE", "MITT"], + "player_category": "Baseball equipment", "gt": "yes"}, + {"id": "ca-y5", "type": "CREATIVE_ACCEPT", + "words": ["MARS", "VENUS", "MERCURY", "JUPITER"], + "player_category": "Roman gods", "gt": "yes"}, + + # ---- CREATIVE_ACCEPT: player's grouping is wrong ---- + {"id": "ca-n1", "type": "CREATIVE_ACCEPT", + "words": ["OAK", "MAPLE", "BIRCH", "PINE"], + "player_category": "Furniture brands", "gt": "no"}, + {"id": "ca-n2", "type": "CREATIVE_ACCEPT", + "words": ["ALPHA", "BETA", "GAMMA", "DELTA"], + "player_category": "Words meaning 'small'", "gt": "no"}, + {"id": "ca-n3", "type": "CREATIVE_ACCEPT", + "words": ["BAT", "BALL", "GLOVE", "MITT"], + "player_category": "Things worn on your hand", "gt": "no", + "gt_weak": ["BAT", "BALL"]}, + {"id": "ca-n4", "type": "CREATIVE_ACCEPT", + "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"], + "player_category": "Months of the year", "gt": "no"}, + {"id": "ca-n5", "type": "CREATIVE_ACCEPT", + "words": ["WHIP", "NUT", "CODE", "SMILE"], + "player_category": "Things found in a kitchen", "gt": "no", + "gt_weak": ["CODE", "SMILE"]}, + + # ---- CREATIVE_ACCEPT: borderline (deliberately ambiguous) ---- + {"id": "ca-b1", "type": "CREATIVE_ACCEPT", + "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"], + "player_category": "Tech/phone brands", "gt": 
"borderline", + "gt_note": "APPLE and BLACKBERRY clearly are tech brands; ORANGE is a EU/UK telecom carrier (defensible); KIWI is not a tech brand. Strict grader says no; lenient grader might accept ORANGE."}, + {"id": "ca-b2", "type": "CREATIVE_ACCEPT", + "words": ["SHARP", "FLAT", "NATURAL", "KEY"], + "player_category": "Real estate listing terms", "gt": "borderline", + "gt_note": "FLAT (UK apartment) and KEY (keys) connect; SHARP (a sharp property) and NATURAL (natural light?) are weak. Strict no; creative yes."}, +] + +# ---------- runner ---------- + +def call(model, prompt, temperature=TEMPERATURE, timeout=300): + payload = { + "model": model, + "prompt": prompt, + "stream": False, + "think": False, + "options": {"temperature": temperature, "num_ctx": 4096, "num_predict": 512}, + } + req = urllib.request.Request( + OLLAMA, data=json.dumps(payload).encode(), + headers={"Content-Type": "application/json"}, + ) + t0 = time.time() + with urllib.request.urlopen(req, timeout=timeout) as r: + data = json.loads(r.read()) + return time.time() - t0, data + + +def extract_json(body): + if not body or "{" not in body or "}" not in body: + raise ValueError("no JSON braces in response") + return json.loads(body[body.find("{"): body.rfind("}") + 1]) + + +def render_prompt(case): + if case["type"] == "JUDGE": + return JUDGE_PROMPT.format( + category=case["category"], + w1=case["words"][0], w2=case["words"][1], + w3=case["words"][2], w4=case["words"][3], + ) + if case["type"] == "CREATE": + return CREATE_PROMPT.format(category=case["category"]) + if case["type"] == "CREATIVE_ACCEPT": + return CREATIVE_ACCEPT_PROMPT.format( + w1=case["words"][0], w2=case["words"][1], + w3=case["words"][2], w4=case["words"][3], + player_category=case["player_category"], + ) + raise ValueError(case["type"]) + + +def warm(model): + print(f"[warm] {model}", file=sys.stderr, flush=True) + call(model, "Reply with the word OK only.", temperature=0.1, timeout=300) + + +def run_model(model, cases): + 
out = [] + for case in cases: + prompt = render_prompt(case) + last_err = None + parsed = None + last_dt = 0.0 + last_eval = 0 + last_raw = "" + for attempt in range(3): + temp = TEMPERATURE + attempt * 0.1 + print(f"[{model}] {case['id']} attempt {attempt+1} (temp={temp:.1f})", + file=sys.stderr, flush=True) + try: + dt, data = call(model, prompt, temperature=temp) + except Exception as e: + last_err = repr(e) + continue + last_dt = dt + last_eval = data.get("eval_count", 0) + last_raw = data.get("response", "") or "" + try: + parsed = extract_json(last_raw) + last_err = None + break + except Exception as e: + last_err = repr(e) + continue + out.append({ + "case_id": case["id"], "type": case["type"], "model": model, + "dt": last_dt, "eval_count": last_eval, + "ok": parsed is not None, + "parsed": parsed, + "raw": last_raw[:1500] if parsed is None else None, + "error": last_err, + "case": case, + }) + return out + + +def score(results): + """Auto-score against ground truth where possible.""" + for r in results: + c = r["case"] + if not r["ok"]: + r["score"] = "PARSE_FAIL" + continue + p = r["parsed"] + if c["type"] == "JUDGE": + v = (p.get("verdict") or "").strip().lower() + r["score"] = "PASS" if v == c["gt"] else "FAIL" + elif c["type"] == "CREATIVE_ACCEPT": + v = (p.get("valid") or "").strip().lower() + if c["gt"] == "borderline": + r["score"] = "BORDERLINE" # human grades these + else: + r["score"] = "PASS" if v == c["gt"] else "FAIL" + elif c["type"] == "CREATE": + r["score"] = "MANUAL" # human grades these against gt_check + return results + + +def render(results): + by_model = {} + for r in results: + by_model.setdefault(r["model"], []).append(r) + + lines = [f"# Gemma 4 Semantic Bakeoff -- {datetime.now().strftime('%Y-%m-%d %H:%M')}", ""] + lines += [ + "## Setup", + f"- Host: steel141 (RTX 3090 Ti) `{OLLAMA}`", + f"- Models: {', '.join('`'+m+'`' for m in MODELS)}", + f"- Temperature {TEMPERATURE} (raised +0.1 per retry on JSON parse fail, max 3 attempts)", 
def _cell(text, limit):
    """Escape ``text`` for a one-line Markdown table cell, capped at ``limit`` chars."""
    flat = str(text).replace("\r", " ").replace("\n", " ").replace("|", "\\|")
    return flat[:limit]


def _joined(items):
    """Comma-join a model-supplied list, tolerating non-list or non-string values."""
    if isinstance(items, (list, tuple)):
        return ", ".join(str(x) for x in items)
    return str(items)


def render(results):
    """Render scored results as a Markdown report.

    Args:
        results: Records shaped like the ones ``run_model`` returns, with the
            ``score`` key that ``score`` adds.

    Returns:
        The report as one newline-joined string: a setup section, a per-model
        summary table, then one table per case grouped by case type.
    """
    by_model = {}
    by_key = {}  # (model, case_id) -> first matching record
    for r in results:
        by_model.setdefault(r["model"], []).append(r)
        by_key.setdefault((r["model"], r["case_id"]), r)

    lines = [f"# Gemma 4 Semantic Bakeoff -- {datetime.now().strftime('%Y-%m-%d %H:%M')}", ""]
    lines += [
        "## Setup",
        # The endpoint comes from OLLAMA_HOST, so report it rather than a fixed
        # host name (wording matches the generation bakeoff report).
        f"- Ollama endpoint: `{OLLAMA}` (RTX 3090 Ti on the test host)",
        f"- Models: {', '.join('`'+m+'`' for m in MODELS)}",
        f"- Temperature {TEMPERATURE} (raised +0.1 per retry on JSON parse fail, max 3 attempts)",
        "- think=false, num_ctx=4096, num_predict=512, no format=json (per gemma4-research/GOTCHAS.md)",
        f"- {len(CASES)} test cases: "
        f"{sum(1 for c in CASES if c['type']=='JUDGE')} JUDGE, "
        f"{sum(1 for c in CASES if c['type']=='CREATE')} CREATE, "
        f"{sum(1 for c in CASES if c['type']=='CREATIVE_ACCEPT')} CREATIVE_ACCEPT",
        "- Ground truth hand-labeled inline in `scripts/gemma-semantic-bakeoff.py`",
        "",
    ]

    # ---- per-model summaries ----
    lines += ["## Auto-scored summary", ""]
    lines += ["| Model | JUDGE pass | CREATIVE_ACCEPT pass | parse fails | avg s |", "|---|---|---|---|---|"]
    for m in MODELS:
        rs = by_model.get(m, [])
        if not rs:
            lines.append(f"| `{m}` | - | - | - | - |")
            continue
        j_pass = sum(1 for r in rs if r["case"]["type"] == "JUDGE" and r.get("score") == "PASS")
        j_n = sum(1 for r in rs if r["case"]["type"] == "JUDGE")
        c_pass = sum(1 for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT" and r.get("score") == "PASS")
        c_n = sum(1 for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT" and r["case"].get("gt") != "borderline")
        parse_fail = sum(1 for r in rs if not r["ok"])
        # Average over answered cases only: a case whose requests all failed
        # records dt=0.0 and would pull the figure toward zero.
        timed = [r["dt"] for r in rs if r["ok"]]
        avg_txt = f"{sum(timed) / len(timed):.1f}" if timed else "-"
        lines.append(f"| `{m}` | {j_pass}/{j_n} | {c_pass}/{c_n} | {parse_fail} | {avg_txt} |")
    lines += [""]

    # ---- by case-type, full breakdown ----
    for tname in ["JUDGE", "CREATE", "CREATIVE_ACCEPT"]:
        lines += [f"## {tname}", ""]
        cases_of_type = [c for c in CASES if c["type"] == tname]
        for case in cases_of_type:
            lines += [f"### {case['id']}", ""]
            if tname == "JUDGE":
                lines += [
                    f"- Category: `{case['category']}`",
                    f"- Words: {', '.join('`'+w+'`' for w in case['words'])}",
                    f"- Ground truth: **{case['gt']}**" + (
                        f" (misfit: {', '.join(case.get('gt_misfit', []))})" if case.get("gt_misfit") else ""),
                    "",
                ]
            elif tname == "CREATE":
                lines += [
                    f"- Category: `{case['category']}`",
                    f"- Quality bar: {case['gt_check']}",
                    "",
                ]
            else:  # CREATIVE_ACCEPT
                lines += [
                    f"- Words: {', '.join('`'+w+'`' for w in case['words'])}",
                    f"- Player's category: `\"{case['player_category']}\"`",
                    f"- Ground truth: **{case['gt']}**" + (
                        f" -- {case.get('gt_note', '')}" if case.get("gt_note") else ""),
                    "",
                ]
            lines += ["| Model | Verdict | Reason | Score | dt |", "|---|---|---|---|---|"]
            for m in MODELS:
                r = by_key.get((m, case["id"]))
                if r is None:
                    lines.append(f"| `{m}` | - | - | - | - |")
                    continue
                if not r["ok"]:
                    err = _cell(r.get("error") or "", 60)
                    lines.append(f"| `{m}` | _parse fail_ | `{err}` | PARSE_FAIL | {r['dt']:.1f}s |")
                    continue
                # Everything read from `p` is unvalidated model output: fields
                # may be null, or lists holding non-strings.
                p = r["parsed"]
                extra = ""
                if tname == "JUDGE":
                    v = p.get("verdict", "?")
                    if p.get("misfit_words"):
                        extra = f" (misfit: {_joined(p['misfit_words'])})"
                elif tname == "CREATE":
                    v = _joined(p.get("words") or [])[:80]
                else:
                    v = p.get("valid", "?")
                    if p.get("weak_words"):
                        extra = f" (weak: {_joined(p['weak_words'])})"
                reason = p.get("reason")
                reason = "" if reason is None else str(reason)
                reason_short = _cell(reason + extra, 120)
                v_clean = _cell(v, 80)
                lines.append(f"| `{m}` | {v_clean} | {reason_short} | {r.get('score', '?')} | {r['dt']:.1f}s |")
            lines += [""]

    return "\n".join(lines)


def main():
    """Run every model over ``CASES``, score, and write raw JSON plus Markdown.

    Both files go to ``docs/reference`` under ``PROJECT_ROOT``. Progress is
    printed to stderr; the Markdown path is the only thing printed to stdout.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-semantic-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-semantic-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, CASES))

    score(all_results)

    # Save raw without the redundant case dict; case_id is already a key of
    # every record, so nothing needs to be merged back in.
    raw = [{k: v for k, v in r.items() if k != "case"} for r in all_results]
    # Explicit UTF-8: the report embeds model text, which may not be encodable
    # in the platform's default encoding.
    raw_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)

    md_path.write_text(render(all_results), encoding="utf-8")
    print(f"md  -> {md_path}", file=sys.stderr)
    print(md_path)


if __name__ == "__main__":
    main()