docs: bootstrap repo with bakeoff results and game-mechanics idea bank
This repo opens with the design-discovery work completed before any product code is written. Two model bakeoffs against gemma4:8b/26b/31b on a local Ollama established that:

- Whole-puzzle generation in the Connections shape is unreliable on Gemma 4 (gemma4:31b ~50% structural-pass, gemma4:26b ~20-30%); 31b is intentionally out of project scope, so the generation route is harder still.
- Atomic semantic-judging skills are reliable: 87.5%/93.75%/100% (8B/26b/31b) on JUDGE; *all three models* scored 10/10 on CREATIVE_ACCEPT — fair judging of player-INVENTED categories. That is the structural unlock vs static hand-curated word games.

The README contains the full writeup, the test bench, and a brainstormed bank of 10 distinct game-mechanics ideas across the fast/medium/slow tempo range, plus a primitives table for recombination.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,431 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Bakeoff: Gemma 4's atomic semantic-matching abilities.
|
||||
|
||||
Three test types, all with hand-labeled ground truth:
|
||||
|
||||
- JUDGE: given (category, 4 words), does Gemma correctly say yes/no on whether
|
||||
the words tightly fit?
|
||||
- CREATE: given a category, does Gemma produce 4 words that tightly fit it?
|
||||
- CREATIVE_ACCEPT: given 4 words and a player-proposed category that may or
|
||||
may not be the puzzle's intended one, does Gemma fairly judge validity?
|
||||
This is the test of whether "fuzzy / creative-grouping acceptance" -- the
|
||||
twist from IDEA.md that a static NYT format structurally can't do -- is
|
||||
feasible.
|
||||
|
||||
Models tested: gemma4:26b, gemma4:31b-it-q4_K_M, gemma4:latest (8B). 8B is
|
||||
included because judging runs per player guess in any live design; if 8B is
|
||||
reliable enough for JUDGE, the per-guess economics get a lot better.
|
||||
|
||||
Settings (well-known Gemma-4-on-Ollama gotchas): think=false, num_ctx=4096,
|
||||
num_predict=512, no format=json (server-side JSON enforcer hangs on 26b Q4),
|
||||
JSON extracted client-side. Point OLLAMA_HOST at your instance; default
|
||||
localhost:11434.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
def ollama_endpoint(host):
    """Return the ``/api/generate`` URL for an ``OLLAMA_HOST``-style value.

    Accepts a full base URL (``http://host:11434``) or a bare ``host:port``.
    A bare value gets an ``http://`` scheme, because ``urllib`` refuses URLs
    without one. Surrounding whitespace and trailing slashes are dropped, and
    an empty value falls back to the local default.
    """
    base = (host or "").strip().rstrip("/")
    if not base:
        base = "http://localhost:11434"
    elif "://" not in base:
        base = f"http://{base}"
    return f"{base}/api/generate"


# Full URL of the generate endpoint every request is POSTed to.
OLLAMA = ollama_endpoint(os.environ.get("OLLAMA_HOST", "http://localhost:11434"))
# Model tags exercised by the bakeoff, in report column order.
MODELS = ["gemma4:latest", "gemma4:26b", "gemma4:31b-it-q4_K_M"]
# Two directory levels above this script; reports are written beneath it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
TEMPERATURE = 0.2  # judging is a low-creativity task; we want consistency
|
||||
|
||||
# ---------- prompts ----------
|
||||
|
||||
# Template for JUDGE cases. Filled with str.format using the fields
# {category} and {w1}..{w4}; the doubled braces in the last line are escaped
# literal braces, so the model sees a single-brace JSON skeleton.
JUDGE_PROMPT = """You are evaluating whether four words tightly fit a given semantic category, in the style of NYT Connections.

Category: {category}
Words: {w1}, {w2}, {w3}, {w4}

Do ALL FOUR words clearly fit this category? Be strict -- if even one word doesn't fit, the answer is "no". Generic loose connections do not count.

Output strict JSON, no preamble or fences:
{{"verdict": "yes" or "no", "reason": "<one short sentence>", "misfit_words": ["<any words that don't fit>"]}}
"""

# Template for CREATE cases. Only {category} is substituted; the requested
# reply carries a "words" list and a "reason".
CREATE_PROMPT = """You are creating a tight 4-word group in the style of NYT Connections.

Category: {category}

Produce EXACTLY four words or short phrases that tightly fit this category. Each word must clearly belong; vague or loosely-related words are not acceptable.

Output strict JSON, no preamble or fences:
{{"words": ["W1", "W2", "W3", "W4"], "reason": "<one short sentence on how all four fit>"}}
"""

# Template for CREATIVE_ACCEPT cases. Filled with {w1}..{w4} and
# {player_category}. Note the reply key is "valid" (not "verdict") and the
# list key is "weak_words" (not "misfit_words").
CREATIVE_ACCEPT_PROMPT = """You are judging a Connections-style puzzle where the player has proposed their OWN category for four words. Their category may differ from the puzzle's intended one, but it might still be a valid alternative -- if all four words plausibly fit the player's category, accept it.

Words: {w1}, {w2}, {w3}, {w4}
Player's proposed category: "{player_category}"

Do all four words plausibly fit the player's category? Be fair: a player-creative-but-valid grouping should be accepted. But if even one word genuinely doesn't fit, reject it.

Output strict JSON, no preamble or fences:
{{"valid": "yes" or "no", "reason": "<one short sentence>", "weak_words": ["<any words that don't really fit>"]}}
"""
|
||||
|
||||
# ---------- test bank ----------
|
||||
# Every case carries hand-labeled ground truth. For JUDGE and CREATIVE_ACCEPT
# the "gt" field is the answer a thoughtful human grader would give: "yes" or
# "no", or "borderline" for the deliberately ambiguous CREATIVE_ACCEPT cases.
# For CREATE, "gt_check" describes what a passing answer should look like.


def _judge_case(case_id, category, words, gt, misfit=None):
    """Build a JUDGE case; `misfit` lists the words a grader would flag."""
    case = {"id": case_id, "type": "JUDGE", "category": category,
            "words": words, "gt": gt}
    if misfit:
        case["gt_misfit"] = misfit
    return case


def _create_case(case_id, category, gt_check):
    """Build a CREATE case, graded by hand against `gt_check`."""
    return {"id": case_id, "type": "CREATE", "category": category,
            "gt_check": gt_check}


def _accept_case(case_id, words, player_category, gt, **extra):
    """Build a CREATIVE_ACCEPT case; `extra` holds gt_weak or gt_note."""
    return {"id": case_id, "type": "CREATIVE_ACCEPT", "words": words,
            "player_category": player_category, "gt": gt, **extra}


CASES = [
    # ---- JUDGE: clear yes (tight fit) ----
    _judge_case("judge-y1", "Types of trees",
                ["OAK", "MAPLE", "BIRCH", "PINE"], "yes"),
    _judge_case("judge-y2", "Greek letters",
                ["ALPHA", "BETA", "GAMMA", "DELTA"], "yes"),
    _judge_case("judge-y3", "Days of the week",
                ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"], "yes"),
    _judge_case("judge-y4", "Synonyms for 'happy'",
                ["JOYFUL", "GLAD", "CHEERFUL", "ELATED"], "yes"),
    _judge_case("judge-y5", "___ HOUSE (compound words)",
                ["GREEN", "ICE", "FIRE", "COURT"], "yes"),
    _judge_case("judge-y6", "Words that follow COLD",
                ["SHOULDER", "FRONT", "SNAP", "TURKEY"], "yes"),
    _judge_case("judge-y7", "Verbs meaning 'to move quickly'",
                ["DART", "BOLT", "RUSH", "FLY"], "yes"),
    _judge_case("judge-y8", "Synonyms for 'idea'",
                ["NOTION", "CONCEPT", "THOUGHT", "INKLING"], "yes"),

    # ---- JUDGE: clear no (one or more words don't fit) ----
    _judge_case("judge-n1", "Types of trees",
                ["OAK", "MAPLE", "BIRCH", "CARROT"], "no",
                misfit=["CARROT"]),
    _judge_case("judge-n2", "Greek letters",
                ["ALPHA", "BETA", "GAMMA", "CYRILLIC"], "no",
                misfit=["CYRILLIC"]),
    _judge_case("judge-n3", "Synonyms for 'happy'",
                ["JOYFUL", "GLAD", "SAD", "ELATED"], "no",
                misfit=["SAD"]),
    _judge_case("judge-n4", "Days of the week",
                ["MONDAY", "JANUARY", "SUNDAY", "WEDNESDAY"], "no",
                misfit=["JANUARY"]),
    _judge_case("judge-n5", "Body parts",
                ["ARM", "LEG", "EYE", "NIGHT"], "no",
                misfit=["NIGHT"]),
    _judge_case("judge-n6", "Types of birds",
                ["CRANE", "SWALLOW", "BAT", "MOSQUITO"], "no",
                misfit=["BAT", "MOSQUITO"]),
    _judge_case("judge-n7", "Things that are red",
                ["APPLE", "BLUE", "ROSE", "GRASS"], "no",
                misfit=["BLUE", "GRASS"]),
    _judge_case("judge-n8", "Words that follow COLD",
                ["SHOULDER", "FRONT", "PIZZA", "MOUNTAIN"], "no",
                misfit=["PIZZA", "MOUNTAIN"]),

    # ---- CREATE: easy categories ----
    _create_case("create-e1", "Types of trees",
                 "Four valid tree species; e.g. OAK, MAPLE, BIRCH, PINE."),
    _create_case("create-e2", "Greek letters",
                 "Four genuine Greek letters."),
    _create_case("create-e3", "Synonyms for 'angry'",
                 "Four words that all genuinely mean angry/furious."),
    _create_case("create-e4", "Days of the week",
                 "Four of the seven weekday names, no months or other words."),

    # ---- CREATE: medium (compound / polysemy) ----
    _create_case("create-m1", "___ STORM (compound words ending in STORM)",
                 "Four words that each form a real compound or fixed phrase with STORM (e.g. SAND, BRAIN, THUNDER, SNOW)."),
    _create_case("create-m2", "Synonyms for 'small'",
                 "Four words that all genuinely mean small."),
    _create_case("create-m3", "Words that follow BLUE",
                 "Four words that each form a real compound with BLUE (e.g. BERRY, BIRD, PRINT, BELL, GRASS)."),
    _create_case("create-m4", "Things that can be 'broken'",
                 "Four words that each form a real fixed phrase with 'broken' (heart, record, law, promise, etc.)."),

    # ---- CREATE: hard (wordplay / tight constraint) ----
    _create_case("create-h1",
                 "Words that are homophones of body parts but spelled differently (e.g. HARE = hair, MUSSEL = muscle)",
                 "Four words that each sound like a body part but are spelled differently. Valid examples: HARE (hair), MUSSEL (muscle), HEAL (heel), SOUL (sole), AYE/EYE-spelled-otherwise. EYE and HEEL alone do NOT count -- those are the body parts themselves, not homophones of them."),
    _create_case("create-h2",
                 "Words that contain a body part as a substring (e.g. HEARTBEAT contains HEART)",
                 "Four words that each contain a body part anywhere inside them. Valid examples: HEARTH (HEART), CHESTNUT (CHEST), EARTH (EAR), HEADACHE (HEAD)."),

    # ---- CREATIVE_ACCEPT: player's grouping is genuinely valid ----
    _accept_case("ca-y1", ["SCALE", "MOUNT", "ASCEND", "CLIMB"],
                 "Verbs for going up", "yes"),
    _accept_case("ca-y2", ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
                 "Fruits", "yes"),
    _accept_case("ca-y3", ["WHIP", "NUT", "CODE", "SMILE"],
                 "Things you can crack", "yes"),
    _accept_case("ca-y4", ["BAT", "BALL", "GLOVE", "MITT"],
                 "Baseball equipment", "yes"),
    _accept_case("ca-y5", ["MARS", "VENUS", "MERCURY", "JUPITER"],
                 "Roman gods", "yes"),

    # ---- CREATIVE_ACCEPT: player's grouping is wrong ----
    _accept_case("ca-n1", ["OAK", "MAPLE", "BIRCH", "PINE"],
                 "Furniture brands", "no"),
    _accept_case("ca-n2", ["ALPHA", "BETA", "GAMMA", "DELTA"],
                 "Words meaning 'small'", "no"),
    _accept_case("ca-n3", ["BAT", "BALL", "GLOVE", "MITT"],
                 "Things worn on your hand", "no",
                 gt_weak=["BAT", "BALL"]),
    _accept_case("ca-n4", ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"],
                 "Months of the year", "no"),
    _accept_case("ca-n5", ["WHIP", "NUT", "CODE", "SMILE"],
                 "Things found in a kitchen", "no",
                 gt_weak=["CODE", "SMILE"]),

    # ---- CREATIVE_ACCEPT: borderline (deliberately ambiguous) ----
    _accept_case("ca-b1", ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
                 "Tech/phone brands", "borderline",
                 gt_note="APPLE and BLACKBERRY clearly are tech brands; ORANGE is a EU/UK telecom carrier (defensible); KIWI is not a tech brand. Strict grader says no; lenient grader might accept ORANGE."),
    _accept_case("ca-b2", ["SHARP", "FLAT", "NATURAL", "KEY"],
                 "Real estate listing terms", "borderline",
                 gt_note="FLAT (UK apartment) and KEY (keys) connect; SHARP (a sharp property) and NATURAL (natural light?) are weak. Strict no; creative yes."),
]
|
||||
|
||||
# ---------- runner ----------
|
||||
|
||||
def call(model, prompt, temperature=TEMPERATURE, timeout=300):
    """POST one non-streaming generate request and return ``(seconds, reply)``.

    The request goes to the module-level ``OLLAMA`` URL with ``think`` off,
    ``num_ctx`` 4096 and ``num_predict`` 512. ``reply`` is the decoded JSON
    body. The elapsed time covers the HTTP round trip plus decoding the
    reply, measured with wall-clock ``time.time()``.

    Nothing is caught here: errors from ``urlopen`` or from decoding the
    reply propagate to the caller.
    """
    options = {"temperature": temperature, "num_ctx": 4096, "num_predict": 512}
    body = json.dumps({
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": options,
    }).encode()
    request = urllib.request.Request(
        OLLAMA,
        data=body,
        headers={"Content-Type": "application/json"},
    )
    started = time.time()
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read())
    elapsed = time.time() - started
    return elapsed, reply
|
||||
|
||||
|
||||
def extract_json(body):
    """Return the first JSON object embedded in a model reply.

    Models often wrap the requested JSON in prose or code fences, so the
    text is scanned for each ``{`` and the first position that decodes as a
    complete JSON object wins. Text after that object is ignored, including
    text that itself contains braces, which previously made parsing fail.

    Raises:
        ValueError: if ``body`` is empty or has no braces, or (as its
            subclass ``json.JSONDecodeError``) if no ``{`` starts a
            decodable object.
    """
    if not body or "{" not in body or "}" not in body:
        raise ValueError("no JSON braces in response")
    decoder = json.JSONDecoder()
    first_error = None
    start = body.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(body, start)
        except json.JSONDecodeError as err:
            # Keep the earliest failure: it points at the most likely
            # intended object and makes the most useful error message.
            if first_error is None:
                first_error = err
        else:
            return obj
        start = body.find("{", start + 1)
    if first_error is None:
        raise ValueError("no JSON object in response")
    raise first_error
|
||||
|
||||
|
||||
def render_prompt(case):
    """Fill the prompt template matching ``case["type"]`` from the case's fields.

    JUDGE uses ``category`` plus the first four ``words``; CREATE uses only
    ``category``; CREATIVE_ACCEPT uses the first four ``words`` plus
    ``player_category``.

    Raises:
        ValueError: for any other case type (the message is the type).
    """
    kind = case["type"]
    if kind == "CREATE":
        return CREATE_PROMPT.format(category=case["category"])
    if kind == "JUDGE":
        category = case["category"]
        words = case["words"]
        slots = {f"w{i + 1}": words[i] for i in range(4)}
        return JUDGE_PROMPT.format(category=category, **slots)
    if kind == "CREATIVE_ACCEPT":
        words = case["words"]
        slots = {f"w{i + 1}": words[i] for i in range(4)}
        return CREATIVE_ACCEPT_PROMPT.format(
            player_category=case["player_category"], **slots
        )
    raise ValueError(kind)
|
||||
|
||||
|
||||
def warm(model):
    """Send one trivial prompt to `model` and discard the reply.

    Presumably this gets the model loaded before the measured cases run, so
    the first case's timing is not skewed by load time; confirm against the
    server's behaviour. Errors from ``call`` propagate.
    """
    sys.stderr.write(f"[warm] {model}\n")
    sys.stderr.flush()
    call(model, "Reply with the word OK only.", temperature=0.1, timeout=300)
|
||||
|
||||
|
||||
def run_model(model, cases):
    """Run every case against `model` and return one result dict per case.

    Each case gets up to three attempts, with the temperature raised by 0.1
    on each retry. An attempt fails if the request raises or if no JSON can
    be extracted from the reply; the first attempt that parses ends the
    retries. Timing, token count and raw text come from the last attempt
    whose request completed. Raw text is kept (first 1500 characters) only
    when no attempt parsed.
    """
    records = []
    for case in cases:
        prompt = render_prompt(case)
        parsed = None
        error = None
        elapsed, eval_count, raw_text = 0.0, 0, ""

        for attempt in range(3):
            temp = TEMPERATURE + attempt * 0.1
            print(f"[{model}] {case['id']} attempt {attempt+1} (temp={temp:.1f})",
                  file=sys.stderr, flush=True)
            try:
                dt, data = call(model, prompt, temperature=temp)
            except Exception as exc:
                # Transport/server failure: keep the error and retry.
                error = repr(exc)
                continue

            elapsed = dt
            eval_count = data.get("eval_count", 0)
            raw_text = data.get("response", "") or ""

            try:
                parsed = extract_json(raw_text)
            except Exception as exc:
                # Reply arrived but held no usable JSON: retry, a bit warmer.
                error = repr(exc)
            else:
                error = None
                break

        succeeded = parsed is not None
        records.append({
            "case_id": case["id"],
            "type": case["type"],
            "model": model,
            "dt": elapsed,
            "eval_count": eval_count,
            "ok": succeeded,
            "parsed": parsed,
            "raw": None if succeeded else raw_text[:1500],
            "error": error,
            "case": case,
        })
    return records
|
||||
|
||||
|
||||
def _normalize_verdict(value):
    """Lower-case, stripped text form of a model verdict; '' when it is None."""
    if value is None:
        return ""
    # Models sometimes answer with JSON true/false or a list instead of a
    # string; str() keeps scoring alive and such answers simply fail to match.
    return str(value).strip().lower()


def score(results):
    """Auto-score results in place against ground truth where possible.

    Sets ``r["score"]`` on every result and returns the same list:

    - ``PARSE_FAIL``: no JSON was extracted for the case.
    - ``PASS`` / ``FAIL``: JUDGE (key ``verdict``) and non-borderline
      CREATIVE_ACCEPT (key ``valid``), compared case-insensitively to ``gt``.
    - ``BORDERLINE``: CREATIVE_ACCEPT cases labeled borderline (human-graded).
    - ``MANUAL``: CREATE cases (human-graded against ``gt_check``).
    - ``UNSCORED``: any unrecognized case type.
    """
    for r in results:
        case = r["case"]
        if not r["ok"]:
            r["score"] = "PARSE_FAIL"
            continue
        # Guard against a parsed value that is not a JSON object.
        parsed = r["parsed"] if isinstance(r["parsed"], dict) else {}
        kind = case["type"]
        if kind == "JUDGE":
            verdict = _normalize_verdict(parsed.get("verdict"))
            r["score"] = "PASS" if verdict == case["gt"] else "FAIL"
        elif kind == "CREATIVE_ACCEPT":
            if case["gt"] == "borderline":
                r["score"] = "BORDERLINE"  # human grades these
            else:
                verdict = _normalize_verdict(parsed.get("valid"))
                r["score"] = "PASS" if verdict == case["gt"] else "FAIL"
        elif kind == "CREATE":
            r["score"] = "MANUAL"  # human grades these against gt_check
        else:
            r["score"] = "UNSCORED"
    return results
|
||||
|
||||
|
||||
def _md_cell(text, limit):
    """Make `text` safe for one Markdown table cell, capped at `limit` chars."""
    # A newline would end the table row, so flatten before escaping pipes.
    flat = str(text).replace("\r", " ").replace("\n", " ")
    return flat.replace("|", "\\|")[:limit]


def _join_items(items):
    """Comma-join a model-supplied list, tolerating non-list or non-string values."""
    if not items:
        return ""
    if isinstance(items, (list, tuple)):
        return ", ".join(str(item) for item in items)
    # A bare string (or other scalar) is shown as one item, not split up.
    return str(items)


def _case_header(case):
    """Return the bullet lines describing one test case and its ground truth."""
    kind = case["type"]
    if kind == "JUDGE":
        misfit = case.get("gt_misfit")
        suffix = f" (misfit: {', '.join(misfit)})" if misfit else ""
        return [
            f"- Category: `{case['category']}`",
            f"- Words: {', '.join('`'+w+'`' for w in case['words'])}",
            f"- Ground truth: **{case['gt']}**" + suffix,
            "",
        ]
    if kind == "CREATE":
        return [
            f"- Category: `{case['category']}`",
            f"- Quality bar: {case['gt_check']}",
            "",
        ]
    # CREATIVE_ACCEPT
    note = case.get("gt_note")
    suffix = f" -- {note}" if note else ""
    return [
        f"- Words: {', '.join('`'+w+'`' for w in case['words'])}",
        f"- Player's category: `\"{case['player_category']}\"`",
        f"- Ground truth: **{case['gt']}**" + suffix,
        "",
    ]


def _result_row(model, tname, r):
    """Return the table row for one model's result on one case (`r` may be None)."""
    if r is None:
        return f"| `{model}` | - | - | - | - |"
    if not r["ok"]:
        return f"| `{model}` | _parse fail_ | `{(r.get('error') or '')[:60]}` | PARSE_FAIL | {r['dt']:.1f}s |"

    p = r["parsed"] if isinstance(r["parsed"], dict) else {}
    extra = ""
    if tname == "JUDGE":
        v = p.get("verdict", "?")
        if p.get("misfit_words"):
            extra = f" (misfit: {_join_items(p['misfit_words'])})"
    elif tname == "CREATE":
        v = _join_items(p.get("words"))[:80]
    else:  # CREATIVE_ACCEPT
        v = p.get("valid", "?")
        if p.get("weak_words"):
            extra = f" (weak: {_join_items(p['weak_words'])})"

    # The model may send null or a non-string here; never let that crash
    # report generation after a long run.
    reason = p.get("reason")
    reason = "" if reason is None else str(reason)

    reason_short = _md_cell(reason + extra, 120)
    v_clean = _md_cell(v, 80)
    return f"| `{model}` | {v_clean} | {reason_short} | {r.get('score', '?')} | {r['dt']:.1f}s |"


def render(results):
    """Render scored results as a Markdown report and return it as one string.

    The report has a setup section, a per-model summary table (JUDGE and
    non-borderline CREATIVE_ACCEPT pass counts, parse failures, average
    seconds), then one section per case type with a per-model table for
    every case in ``CASES``. Models or cases with no result get dash rows.

    Model-supplied fields are coerced to text, flattened to one line, have
    ``|`` escaped and are truncated, so malformed replies cannot break the
    tables or raise.

    NOTE(review): the "Host: steel141 (RTX 3090 Ti)" label in the setup
    section is hard-coded and will be wrong on any other machine.
    """
    by_model = {}
    for r in results:
        by_model.setdefault(r["model"], []).append(r)

    lines = [f"# Gemma 4 Semantic Bakeoff -- {datetime.now().strftime('%Y-%m-%d %H:%M')}", ""]
    lines += [
        "## Setup",
        f"- Host: steel141 (RTX 3090 Ti) `{OLLAMA}`",
        f"- Models: {', '.join('`'+m+'`' for m in MODELS)}",
        f"- Temperature {TEMPERATURE} (raised +0.1 per retry on JSON parse fail, max 3 attempts)",
        "- think=false, num_ctx=4096, num_predict=512, no format=json (per gemma4-research/GOTCHAS.md)",
        f"- {len(CASES)} test cases: "
        f"{sum(1 for c in CASES if c['type']=='JUDGE')} JUDGE, "
        f"{sum(1 for c in CASES if c['type']=='CREATE')} CREATE, "
        f"{sum(1 for c in CASES if c['type']=='CREATIVE_ACCEPT')} CREATIVE_ACCEPT",
        "- Ground truth hand-labeled inline in `scripts/gemma-semantic-bakeoff.py`",
        "",
    ]

    # ---- per-model summaries ----
    lines += ["## Auto-scored summary", ""]
    lines += ["| Model | JUDGE pass | CREATIVE_ACCEPT pass | parse fails | avg s |", "|---|---|---|---|---|"]
    for m in MODELS:
        rs = by_model.get(m, [])
        if not rs:
            lines.append(f"| `{m}` | - | - | - | - |")
            continue
        j_pass = sum(1 for r in rs if r["case"]["type"] == "JUDGE" and r.get("score") == "PASS")
        j_n = sum(1 for r in rs if r["case"]["type"] == "JUDGE")
        c_pass = sum(1 for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT" and r.get("score") == "PASS")
        # Borderline cases are human-graded, so they are left out of the denominator.
        c_n = sum(1 for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT" and r["case"].get("gt") != "borderline")
        parse_fail = sum(1 for r in rs if not r["ok"])
        avg_dt = sum(r["dt"] for r in rs) / max(len(rs), 1)
        lines.append(f"| `{m}` | {j_pass}/{j_n} | {c_pass}/{c_n} | {parse_fail} | {avg_dt:.1f} |")
    lines += [""]

    # ---- by case-type, full breakdown ----
    for tname in ["JUDGE", "CREATE", "CREATIVE_ACCEPT"]:
        lines += [f"## {tname}", ""]
        for case in (c for c in CASES if c["type"] == tname):
            lines += [f"### {case['id']}", ""]
            lines += _case_header(case)
            lines += ["| Model | Verdict | Reason | Score | dt |", "|---|---|---|---|---|"]
            for m in MODELS:
                match = next((r for r in by_model.get(m, []) if r["case_id"] == case["id"]), None)
                lines.append(_result_row(m, tname, match))
            lines += [""]

    return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """Run the whole bakeoff and write the raw JSON and Markdown reports.

    For each model in ``MODELS``: warm it, then run every case. Results are
    scored, then written under ``PROJECT_ROOT/docs/reference`` as two
    timestamped files. Progress and the output paths go to stderr; the
    Markdown path alone is printed to stdout.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-semantic-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-semantic-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, CASES))

    score(all_results)

    # Save raw results without the embedded case dict; "case_id" stays in
    # each record, so the cases can still be looked up.
    raw = [{k: v for k, v in r.items() if k != "case"} for r in all_results]
    # Explicit UTF-8: model text in the report can be non-ASCII, and the
    # platform default encoding may not be able to represent it.
    raw_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)

    md_path.write_text(render(all_results), encoding="utf-8")
    print(f"md -> {md_path}", file=sys.stderr)
    print(md_path)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user