#!/usr/bin/env python3
"""Bakeoff: Gemma 4's atomic semantic-matching abilities.

Three test types, all with hand-labeled ground truth:
  - JUDGE: given (category, 4 words), does Gemma correctly say yes/no on
    whether the words tightly fit?
  - CREATE: given a category, does Gemma produce 4 words that tightly fit it?
  - CREATIVE_ACCEPT: given 4 words and a player-proposed category that may or
    may not be the puzzle's intended one, does Gemma fairly judge validity?
    This is the test of whether "fuzzy / creative-grouping acceptance" -- the
    twist from IDEA.md that a static NYT format structurally can't do -- is
    feasible.

Models tested: gemma4:26b, gemma4:31b-it-q4_K_M, gemma4:latest (8B).
8B is included because judging runs per player guess in any live design; if
8B is reliable enough for JUDGE, the per-guess economics get a lot better.

Settings (well-known Gemma-4-on-Ollama gotchas): think=false, num_ctx=4096,
num_predict=512, no format=json (server-side JSON enforcer hangs on 26b Q4),
JSON extracted client-side.

Point OLLAMA_HOST at your instance; default localhost:11434. A value without
a scheme (e.g. ``127.0.0.1:11434``) is treated as ``http://``.
"""

import json
import os
import sys
import time
import urllib.request
from datetime import datetime
from pathlib import Path

DEFAULT_OLLAMA_HOST = "http://localhost:11434"


def ollama_endpoint(host):
    """Return the ``/api/generate`` URL for an ``OLLAMA_HOST``-style value.

    Args:
        host: Base address of the Ollama server, or ``None``/empty to use
            ``DEFAULT_OLLAMA_HOST``. Trailing slashes are ignored, and a value
            without a ``scheme://`` prefix gets ``http://`` prepended, because
            urllib rejects scheme-less URLs.

    Returns:
        The full generate-endpoint URL as a string.
    """
    base = (host or "").strip().rstrip("/")
    if not base:
        base = DEFAULT_OLLAMA_HOST
    if "://" not in base:
        base = f"http://{base}"
    return f"{base}/api/generate"


OLLAMA = ollama_endpoint(os.environ.get("OLLAMA_HOST"))
MODELS = ["gemma4:latest", "gemma4:26b", "gemma4:31b-it-q4_K_M"]
PROJECT_ROOT = Path(__file__).resolve().parent.parent
TEMPERATURE = 0.2  # judging is a low-creativity task; we want consistency
CASE_TYPES = ("JUDGE", "CREATE", "CREATIVE_ACCEPT")  # report section order

# ---------- prompts ----------

JUDGE_PROMPT = """You are evaluating whether four words tightly fit a given semantic category, in the style of NYT Connections.

Category: {category}
Words: {w1}, {w2}, {w3}, {w4}

Do ALL FOUR words clearly fit this category? Be strict -- if even one word doesn't fit, the answer is "no". Generic loose connections do not count.

Output strict JSON, no preamble or fences:
{{"verdict": "yes" or "no", "reason": "", "misfit_words": [""]}}
"""

CREATE_PROMPT = """You are creating a tight 4-word group in the style of NYT Connections.

Category: {category}

Produce EXACTLY four words or short phrases that tightly fit this category. Each word must clearly belong; vague or loosely-related words are not acceptable.

Output strict JSON, no preamble or fences:
{{"words": ["W1", "W2", "W3", "W4"], "reason": ""}}
"""

CREATIVE_ACCEPT_PROMPT = """You are judging a Connections-style puzzle where the player has proposed their OWN category for four words. Their category may differ from the puzzle's intended one, but it might still be a valid alternative -- if all four words plausibly fit the player's category, accept it.

Words: {w1}, {w2}, {w3}, {w4}
Player's proposed category: "{player_category}"

Do all four words plausibly fit the player's category? Be fair: a player-creative-but-valid grouping should be accepted. But if even one word genuinely doesn't fit, reject it.

Output strict JSON, no preamble or fences:
{{"valid": "yes" or "no", "reason": "", "weak_words": [""]}}
"""

# ---------- test bank ----------
# Each case has hand-labeled ground truth. The "gt" field is what a thoughtful
# human grader would say (yes/no for JUDGE and CREATIVE_ACCEPT). For CREATE,
# `gt_check` describes what a passing answer should look like.

CASES = [
    # ---- JUDGE: clear yes (tight fit) ----
    {"id": "judge-y1", "type": "JUDGE", "category": "Types of trees",
     "words": ["OAK", "MAPLE", "BIRCH", "PINE"], "gt": "yes"},
    {"id": "judge-y2", "type": "JUDGE", "category": "Greek letters",
     "words": ["ALPHA", "BETA", "GAMMA", "DELTA"], "gt": "yes"},
    {"id": "judge-y3", "type": "JUDGE", "category": "Days of the week",
     "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"], "gt": "yes"},
    {"id": "judge-y4", "type": "JUDGE", "category": "Synonyms for 'happy'",
     "words": ["JOYFUL", "GLAD", "CHEERFUL", "ELATED"], "gt": "yes"},
    {"id": "judge-y5", "type": "JUDGE", "category": "___ HOUSE (compound words)",
     "words": ["GREEN", "ICE", "FIRE", "COURT"], "gt": "yes"},
    {"id": "judge-y6", "type": "JUDGE", "category": "Words that follow COLD",
     "words": ["SHOULDER", "FRONT", "SNAP", "TURKEY"], "gt": "yes"},
    {"id": "judge-y7", "type": "JUDGE", "category": "Verbs meaning 'to move quickly'",
     "words": ["DART", "BOLT", "RUSH", "FLY"], "gt": "yes"},
    {"id": "judge-y8", "type": "JUDGE", "category": "Synonyms for 'idea'",
     "words": ["NOTION", "CONCEPT", "THOUGHT", "INKLING"], "gt": "yes"},

    # ---- JUDGE: clear no (one or more words don't fit) ----
    {"id": "judge-n1", "type": "JUDGE", "category": "Types of trees",
     "words": ["OAK", "MAPLE", "BIRCH", "CARROT"], "gt": "no",
     "gt_misfit": ["CARROT"]},
    {"id": "judge-n2", "type": "JUDGE", "category": "Greek letters",
     "words": ["ALPHA", "BETA", "GAMMA", "CYRILLIC"], "gt": "no",
     "gt_misfit": ["CYRILLIC"]},
    {"id": "judge-n3", "type": "JUDGE", "category": "Synonyms for 'happy'",
     "words": ["JOYFUL", "GLAD", "SAD", "ELATED"], "gt": "no",
     "gt_misfit": ["SAD"]},
    {"id": "judge-n4", "type": "JUDGE", "category": "Days of the week",
     "words": ["MONDAY", "JANUARY", "SUNDAY", "WEDNESDAY"], "gt": "no",
     "gt_misfit": ["JANUARY"]},
    {"id": "judge-n5", "type": "JUDGE", "category": "Body parts",
     "words": ["ARM", "LEG", "EYE", "NIGHT"], "gt": "no",
     "gt_misfit": ["NIGHT"]},
    {"id": "judge-n6", "type": "JUDGE", "category": "Types of birds",
     "words": ["CRANE", "SWALLOW", "BAT", "MOSQUITO"], "gt": "no",
     "gt_misfit": ["BAT", "MOSQUITO"]},
    {"id": "judge-n7", "type": "JUDGE", "category": "Things that are red",
     "words": ["APPLE", "BLUE", "ROSE", "GRASS"], "gt": "no",
     "gt_misfit": ["BLUE", "GRASS"]},
    {"id": "judge-n8", "type": "JUDGE", "category": "Words that follow COLD",
     "words": ["SHOULDER", "FRONT", "PIZZA", "MOUNTAIN"], "gt": "no",
     "gt_misfit": ["PIZZA", "MOUNTAIN"]},

    # ---- CREATE: easy categories ----
    {"id": "create-e1", "type": "CREATE", "category": "Types of trees",
     "gt_check": "Four valid tree species; e.g. OAK, MAPLE, BIRCH, PINE."},
    {"id": "create-e2", "type": "CREATE", "category": "Greek letters",
     "gt_check": "Four genuine Greek letters."},
    {"id": "create-e3", "type": "CREATE", "category": "Synonyms for 'angry'",
     "gt_check": "Four words that all genuinely mean angry/furious."},
    {"id": "create-e4", "type": "CREATE", "category": "Days of the week",
     "gt_check": "Four of the seven weekday names, no months or other words."},

    # ---- CREATE: medium (compound / polysemy) ----
    {"id": "create-m1", "type": "CREATE",
     "category": "___ STORM (compound words ending in STORM)",
     "gt_check": "Four words that each form a real compound or fixed phrase "
                 "with STORM (e.g. SAND, BRAIN, THUNDER, SNOW)."},
    {"id": "create-m2", "type": "CREATE", "category": "Synonyms for 'small'",
     "gt_check": "Four words that all genuinely mean small."},
    {"id": "create-m3", "type": "CREATE", "category": "Words that follow BLUE",
     "gt_check": "Four words that each form a real compound with BLUE "
                 "(e.g. BERRY, BIRD, PRINT, BELL, GRASS)."},
    {"id": "create-m4", "type": "CREATE", "category": "Things that can be 'broken'",
     "gt_check": "Four words that each form a real fixed phrase with 'broken' "
                 "(heart, record, law, promise, etc.)."},

    # ---- CREATE: hard (wordplay / tight constraint) ----
    {"id": "create-h1", "type": "CREATE",
     "category": "Words that are homophones of body parts but spelled "
                 "differently (e.g. HARE = hair, MUSSEL = muscle)",
     "gt_check": "Four words that each sound like a body part but are spelled "
                 "differently. Valid examples: HARE (hair), MUSSEL (muscle), "
                 "HEAL (heel), SOUL (sole), AYE/EYE-spelled-otherwise. EYE and "
                 "HEEL alone do NOT count -- those are the body parts "
                 "themselves, not homophones of them."},
    {"id": "create-h2", "type": "CREATE",
     "category": "Words that contain a body part as a substring "
                 "(e.g. HEARTBEAT contains HEART)",
     "gt_check": "Four words that each contain a body part anywhere inside "
                 "them. Valid examples: HEARTH (HEART), CHESTNUT (CHEST), "
                 "EARTH (EAR), HEADACHE (HEAD)."},

    # ---- CREATIVE_ACCEPT: player's grouping is genuinely valid ----
    {"id": "ca-y1", "type": "CREATIVE_ACCEPT",
     "words": ["SCALE", "MOUNT", "ASCEND", "CLIMB"],
     "player_category": "Verbs for going up", "gt": "yes"},
    {"id": "ca-y2", "type": "CREATIVE_ACCEPT",
     "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
     "player_category": "Fruits", "gt": "yes"},
    {"id": "ca-y3", "type": "CREATIVE_ACCEPT",
     "words": ["WHIP", "NUT", "CODE", "SMILE"],
     "player_category": "Things you can crack", "gt": "yes"},
    {"id": "ca-y4", "type": "CREATIVE_ACCEPT",
     "words": ["BAT", "BALL", "GLOVE", "MITT"],
     "player_category": "Baseball equipment", "gt": "yes"},
    {"id": "ca-y5", "type": "CREATIVE_ACCEPT",
     "words": ["MARS", "VENUS", "MERCURY", "JUPITER"],
     "player_category": "Roman gods", "gt": "yes"},

    # ---- CREATIVE_ACCEPT: player's grouping is wrong ----
    {"id": "ca-n1", "type": "CREATIVE_ACCEPT",
     "words": ["OAK", "MAPLE", "BIRCH", "PINE"],
     "player_category": "Furniture brands", "gt": "no"},
    {"id": "ca-n2", "type": "CREATIVE_ACCEPT",
     "words": ["ALPHA", "BETA", "GAMMA", "DELTA"],
     "player_category": "Words meaning 'small'", "gt": "no"},
    {"id": "ca-n3", "type": "CREATIVE_ACCEPT",
     "words": ["BAT", "BALL", "GLOVE", "MITT"],
     "player_category": "Things worn on your hand", "gt": "no",
     "gt_weak": ["BAT", "BALL"]},
    {"id": "ca-n4", "type": "CREATIVE_ACCEPT",
     "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"],
     "player_category": "Months of the year", "gt": "no"},
    {"id": "ca-n5", "type": "CREATIVE_ACCEPT",
     "words": ["WHIP", "NUT", "CODE", "SMILE"],
     "player_category": "Things found in a kitchen", "gt": "no",
     "gt_weak": ["CODE", "SMILE"]},

    # ---- CREATIVE_ACCEPT: borderline (deliberately ambiguous) ----
    {"id": "ca-b1", "type": "CREATIVE_ACCEPT",
     "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
     "player_category": "Tech/phone brands", "gt": "borderline",
     "gt_note": "APPLE and BLACKBERRY clearly are tech brands; ORANGE is a "
                "EU/UK telecom carrier (defensible); KIWI is not a tech brand. "
                "Strict grader says no; lenient grader might accept ORANGE."},
    {"id": "ca-b2", "type": "CREATIVE_ACCEPT",
     "words": ["SHARP", "FLAT", "NATURAL", "KEY"],
     "player_category": "Real estate listing terms", "gt": "borderline",
     "gt_note": "FLAT (UK apartment) and KEY (keys) connect; SHARP (a sharp "
                "property) and NATURAL (natural light?) are weak. Strict no; "
                "creative yes."},
]


# ---------- runner ----------

def call(model, prompt, temperature=TEMPERATURE, timeout=300):
    """POST one non-streaming generate request to the Ollama endpoint.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature placed in ``options``.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        ``(elapsed_seconds, response_dict)`` where ``response_dict`` is the
        decoded JSON body of the HTTP response.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on
        transport or decoding failure.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {"temperature": temperature, "num_ctx": 4096, "num_predict": 512},
    }
    req = urllib.request.Request(
        OLLAMA,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic, so a wall-clock adjustment mid-request cannot
    # produce a negative or inflated duration.
    started = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        data = json.loads(resp.read())
    return time.perf_counter() - started, data


def extract_json(body):
    """Parse the JSON object embedded in a model response.

    The text from the first ``{`` through the last ``}`` is handed to
    ``json.loads``, which tolerates a preamble or code fences around the
    object.

    Raises:
        ValueError: if the response has no braces, or the span between them is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    if not body or "{" not in body or "}" not in body:
        raise ValueError("no JSON braces in response")
    return json.loads(body[body.find("{"): body.rfind("}") + 1])


def render_prompt(case):
    """Fill in the prompt template that matches ``case["type"]``.

    Raises:
        ValueError: for a case type other than JUDGE, CREATE or
            CREATIVE_ACCEPT.
    """
    kind = case["type"]
    if kind == "CREATE":
        return CREATE_PROMPT.format(category=case["category"])
    if kind not in ("JUDGE", "CREATIVE_ACCEPT"):
        raise ValueError(kind)

    first, second, third, fourth = case["words"][:4]
    if kind == "JUDGE":
        return JUDGE_PROMPT.format(
            category=case["category"],
            w1=first, w2=second, w3=third, w4=fourth,
        )
    return CREATIVE_ACCEPT_PROMPT.format(
        w1=first, w2=second, w3=third, w4=fourth,
        player_category=case["player_category"],
    )


def warm(model):
    """Send a trivial prompt so the model is loaded before timed calls."""
    print(f"[warm] {model}", file=sys.stderr, flush=True)
    call(model, "Reply with the word OK only.", temperature=0.1, timeout=300)


def run_model(model, cases):
    """Run every case against one model and collect one record per case.

    Each case gets up to three attempts; the temperature is raised by 0.1 per
    attempt. A failed request or an unparseable response triggers the next
    attempt, and the last error is kept in the record.

    Returns:
        A list of dicts with keys ``case_id``, ``type``, ``model``, ``dt``,
        ``eval_count``, ``ok``, ``parsed``, ``raw``, ``error`` and ``case``.
        ``raw`` (truncated to 1500 chars) is only kept when parsing failed.
    """
    out = []
    for case in cases:
        prompt = render_prompt(case)
        last_err = None
        parsed = None
        last_dt = 0.0
        last_eval = 0
        last_raw = ""
        for attempt in range(3):
            temp = TEMPERATURE + attempt * 0.1
            print(f"[{model}] {case['id']} attempt {attempt+1} (temp={temp:.1f})",
                  file=sys.stderr, flush=True)
            try:
                dt, data = call(model, prompt, temperature=temp)
            except Exception as e:  # best-effort: record and retry
                last_err = repr(e)
                continue
            last_dt = dt
            last_eval = data.get("eval_count", 0)
            last_raw = data.get("response", "") or ""
            try:
                parsed = extract_json(last_raw)
                last_err = None
                break
            except Exception as e:  # unparseable output counts as a failed attempt
                last_err = repr(e)
                continue
        out.append({
            "case_id": case["id"],
            "type": case["type"],
            "model": model,
            "dt": last_dt,
            "eval_count": last_eval,
            "ok": parsed is not None,
            "parsed": parsed,
            "raw": last_raw[:1500] if parsed is None else None,
            "error": last_err,
            "case": case,
        })
    return out


def _normalized_answer(value):
    """Lower-cased, stripped text form of a model's yes/no field.

    The model controls this value, so it may be a bool, number or null rather
    than a string; anything that is not ``None`` is stringified first.
    """
    if value is None:
        return ""
    return str(value).strip().lower()


def score(results):
    """Auto-score against ground truth where possible.

    Adds a ``score`` key to each record in place and returns ``results``:
    ``PARSE_FAIL`` when no JSON was parsed, ``PASS``/``FAIL`` for JUDGE and
    non-borderline CREATIVE_ACCEPT cases, ``BORDERLINE`` and ``MANUAL`` for
    the cases a human grades.
    """
    for r in results:
        c = r["case"]
        if not r["ok"]:
            r["score"] = "PARSE_FAIL"
            continue
        p = r["parsed"]
        if c["type"] == "JUDGE":
            verdict = _normalized_answer(p.get("verdict"))
            r["score"] = "PASS" if verdict == c["gt"] else "FAIL"
        elif c["type"] == "CREATIVE_ACCEPT":
            if c["gt"] == "borderline":
                r["score"] = "BORDERLINE"  # human grades these
            else:
                verdict = _normalized_answer(p.get("valid"))
                r["score"] = "PASS" if verdict == c["gt"] else "FAIL"
        elif c["type"] == "CREATE":
            r["score"] = "MANUAL"  # human grades these against gt_check
    return results


def _md_cell(value, limit):
    """Make arbitrary model output safe for one Markdown table cell.

    Whitespace (including newlines, which would end the table row) is
    collapsed to single spaces, the text is cut to ``limit`` characters, and
    pipes are escaped afterwards so truncation cannot split an escape.
    """
    text = "" if value is None else str(value)
    text = " ".join(text.split())
    return text[:limit].replace("|", "\\|")


def _word_list(value):
    """Return the non-empty entries of a model-supplied word list as strings.

    Accepts a list/tuple or a single scalar, since the model does not always
    honour the requested schema.
    """
    if value is None:
        return []
    items = value if isinstance(value, (list, tuple)) else [value]
    words = (str(item).strip() for item in items if item is not None)
    return [word for word in words if word]


def render(results):
    """Build the Markdown report for scored results and return it as a string.

    The report has a setup section, a per-model summary table, and one section
    per case listing every model's answer. Models or cases without a result
    are shown as ``-`` rows.
    """
    by_model = {}
    first_result = {}  # (model, case_id) -> first matching record
    for r in results:
        by_model.setdefault(r["model"], []).append(r)
        first_result.setdefault((r["model"], r["case_id"]), r)

    type_counts = {t: sum(1 for c in CASES if c["type"] == t) for t in CASE_TYPES}
    model_list = ", ".join("`" + m + "`" for m in MODELS)

    lines = [
        f"# Gemma 4 Semantic Bakeoff -- {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        "",
        "## Setup",
        f"- Host: steel141 (RTX 3090 Ti) `{OLLAMA}`",
        f"- Models: {model_list}",
        f"- Temperature {TEMPERATURE} (raised +0.1 per retry on JSON parse fail, max 3 attempts)",
        "- think=false, num_ctx=4096, num_predict=512, no format=json (per gemma4-research/GOTCHAS.md)",
        f"- {len(CASES)} test cases: "
        f"{type_counts['JUDGE']} JUDGE, "
        f"{type_counts['CREATE']} CREATE, "
        f"{type_counts['CREATIVE_ACCEPT']} CREATIVE_ACCEPT",
        "- Ground truth hand-labeled inline in `scripts/gemma-semantic-bakeoff.py`",
        "",
    ]

    # ---- per-model summaries ----
    lines += ["## Auto-scored summary", ""]
    lines += ["| Model | JUDGE pass | CREATIVE_ACCEPT pass | parse fails | avg s |",
              "|---|---|---|---|---|"]
    for m in MODELS:
        rs = by_model.get(m, [])
        if not rs:
            lines.append(f"| `{m}` | - | - | - | - |")
            continue
        judge = [r for r in rs if r["case"]["type"] == "JUDGE"]
        creative = [r for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT"]
        j_pass = sum(1 for r in judge if r.get("score") == "PASS")
        j_n = len(judge)
        c_pass = sum(1 for r in creative if r.get("score") == "PASS")
        # Borderline cases are human-graded, so they are not in the denominator.
        c_n = sum(1 for r in creative if r["case"].get("gt") != "borderline")
        parse_fail = sum(1 for r in rs if not r["ok"])
        avg_dt = sum(r["dt"] for r in rs) / max(len(rs), 1)
        lines.append(f"| `{m}` | {j_pass}/{j_n} | {c_pass}/{c_n} | {parse_fail} | {avg_dt:.1f} |")
    lines += [""]

    # ---- by case-type, full breakdown ----
    for tname in CASE_TYPES:
        lines += [f"## {tname}", ""]
        for case in (c for c in CASES if c["type"] == tname):
            lines += [f"### {case['id']}", ""]
            if tname == "CREATE":
                lines += [
                    f"- Category: `{case['category']}`",
                    f"- Quality bar: {case['gt_check']}",
                    "",
                ]
            else:
                word_list = ", ".join("`" + w + "`" for w in case["words"])
                truth = f"- Ground truth: **{case['gt']}**"
                if tname == "JUDGE":
                    if case.get("gt_misfit"):
                        truth += f" (misfit: {', '.join(case['gt_misfit'])})"
                    lines += [
                        f"- Category: `{case['category']}`",
                        f"- Words: {word_list}",
                        truth,
                        "",
                    ]
                else:  # CREATIVE_ACCEPT
                    if case.get("gt_note"):
                        truth += f" -- {case['gt_note']}"
                    lines += [
                        f"- Words: {word_list}",
                        f"- Player's category: `\"{case['player_category']}\"`",
                        truth,
                        "",
                    ]

            lines += ["| Model | Verdict | Reason | Score | dt |", "|---|---|---|---|---|"]
            for m in MODELS:
                r = first_result.get((m, case["id"]))
                if r is None:
                    lines.append(f"| `{m}` | - | - | - | - |")
                    continue
                if not r["ok"]:
                    err = _md_cell(r.get("error"), 60)
                    lines.append(f"| `{m}` | _parse fail_ | `{err}` | PARSE_FAIL | {r['dt']:.1f}s |")
                    continue
                p = r["parsed"]
                extra = ""
                if tname == "JUDGE":
                    v = p.get("verdict", "?")
                    misfits = _word_list(p.get("misfit_words"))
                    if misfits:
                        extra = f" (misfit: {', '.join(misfits)})"
                elif tname == "CREATE":
                    v = ", ".join(_word_list(p.get("words")))
                else:
                    v = p.get("valid", "?")
                    weak = _word_list(p.get("weak_words"))
                    if weak:
                        extra = f" (weak: {', '.join(weak)})"
                reason = p.get("reason")
                reason_text = "" if reason is None else str(reason)
                reason_short = _md_cell(reason_text + extra, 120)
                v_clean = _md_cell(v, 80)
                lines.append(
                    f"| `{m}` | {v_clean} | {reason_short} | {r.get('score', '?')} | {r['dt']:.1f}s |"
                )
            lines += [""]

    return "\n".join(lines)


def main():
    """Run all models over all cases, then write raw JSON and the report.

    Both files go to ``<PROJECT_ROOT>/docs/reference`` with a timestamped
    name. A model whose warm-up request fails is skipped so that results
    already gathered from other models are still written. The report path is
    printed to stdout; progress goes to stderr.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-semantic-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-semantic-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        try:
            warm(m)
        except Exception as exc:  # top-level boundary: log and move on
            print(f"[warm] {m} failed, skipping model: {exc!r}",
                  file=sys.stderr, flush=True)
            continue
        all_results.extend(run_model(m, CASES))

    if not all_results:
        sys.exit(f"no model could be warmed up via {OLLAMA}; nothing to report")

    score(all_results)

    # save raw without the case dict redundantly (case_id is already a key)
    raw = [{k: v for k, v in r.items() if k != "case"} for r in all_results]
    raw_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)

    # Model text may contain non-ASCII characters; do not depend on the locale.
    md_path.write_text(render(all_results), encoding="utf-8")
    print(f"md  -> {md_path}", file=sys.stderr)
    print(md_path)


if __name__ == "__main__":
    main()