#!/usr/bin/env python3
"""Bakeoff: Gemma 4's atomic semantic-matching abilities.

Three test types, all with hand-labeled ground truth:

- JUDGE: given (category, 4 words), does Gemma correctly say yes/no on whether
  the words tightly fit?
- CREATE: given a category, does Gemma produce 4 words that tightly fit it?
- CREATIVE_ACCEPT: given 4 words and a player-proposed category that may or
  may not be the puzzle's intended one, does Gemma fairly judge validity?
  This is the test of whether "fuzzy / creative-grouping acceptance" -- the
  twist from IDEA.md that a static NYT format structurally can't do -- is
  feasible.

Models tested: gemma4:26b, gemma4:31b-it-q4_K_M, gemma4:latest (8B). 8B is
included because judging runs per player guess in any live design; if 8B is
reliable enough for JUDGE, the per-guess economics get a lot better.

Settings (well-known Gemma-4-on-Ollama gotchas): think=false, num_ctx=4096,
num_predict=512, no format=json (server-side JSON enforcer hangs on 26b Q4),
JSON extracted client-side. Point OLLAMA_HOST at your instance; default
localhost:11434.
"""
import json
import os
import sys
import time
import urllib.request
from datetime import datetime
from pathlib import Path

def _ollama_generate_url(host):
    """Build the Ollama ``/api/generate`` URL from an ``OLLAMA_HOST``-style value.

    Accepts a full base URL ("http://box:11434/") as well as a bare
    "host:port" value. A bare value gets an ``http://`` scheme, because
    urllib cannot open a URL that has no scheme.
    """
    base = host.strip().rstrip("/")
    if "://" not in base:
        base = f"http://{base}"
    return f"{base}/api/generate"


# Endpoint every request is POSTed to. An unset or empty OLLAMA_HOST falls
# back to the local default.
OLLAMA = _ollama_generate_url(os.environ.get("OLLAMA_HOST") or "http://localhost:11434")
# Model tags to benchmark, in the order they are warmed, run and reported.
MODELS = ["gemma4:latest", "gemma4:26b", "gemma4:31b-it-q4_K_M"]
# Two levels above this file's directory; main() writes its reports beneath it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
TEMPERATURE = 0.2  # judging is a low-creativity task; we want consistency

# ---------- prompts ----------

# The three templates below are filled in with str.format() by render_prompt().
# Single-brace names ({category}, {w1}..{w4}, {player_category}) are the
# placeholders; the doubled braces around the JSON examples come out as
# literal { and } in the rendered prompt.

# JUDGE: strict yes/no on whether all four words fit {category}. The reply is
# expected to carry "verdict", "reason" and "misfit_words".
JUDGE_PROMPT = """You are evaluating whether four words tightly fit a given semantic category, in the style of NYT Connections.

Category: {category}
Words: {w1}, {w2}, {w3}, {w4}

Do ALL FOUR words clearly fit this category? Be strict -- if even one word doesn't fit, the answer is "no". Generic loose connections do not count.

Output strict JSON, no preamble or fences:
{{"verdict": "yes" or "no", "reason": "<one short sentence>", "misfit_words": ["<any words that don't fit>"]}}
"""

# CREATE: ask the model to produce four words for {category}. The reply is
# expected to carry "words" and "reason".
CREATE_PROMPT = """You are creating a tight 4-word group in the style of NYT Connections.

Category: {category}

Produce EXACTLY four words or short phrases that tightly fit this category. Each word must clearly belong; vague or loosely-related words are not acceptable.

Output strict JSON, no preamble or fences:
{{"words": ["W1", "W2", "W3", "W4"], "reason": "<one short sentence on how all four fit>"}}
"""

# CREATIVE_ACCEPT: judge a player-proposed {player_category} for four words.
# The reply is expected to carry "valid", "reason" and "weak_words".
CREATIVE_ACCEPT_PROMPT = """You are judging a Connections-style puzzle where the player has proposed their OWN category for four words. Their category may differ from the puzzle's intended one, but it might still be a valid alternative -- if all four words plausibly fit the player's category, accept it.

Words: {w1}, {w2}, {w3}, {w4}
Player's proposed category: "{player_category}"

Do all four words plausibly fit the player's category? Be fair: a player-creative-but-valid grouping should be accepted. But if even one word genuinely doesn't fit, reject it.

Output strict JSON, no preamble or fences:
{{"valid": "yes" or "no", "reason": "<one short sentence>", "weak_words": ["<any words that don't really fit>"]}}
"""

# ---------- test bank ----------
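# Each case has hand-labeled ground truth. The "gt" field is what a thoughtful
# human grader would say: "yes" or "no" for JUDGE and CREATIVE_ACCEPT, or
# "borderline" for the deliberately ambiguous CREATIVE_ACCEPT cases, which are
# left for a human to grade. For CREATE, `gt_check` describes what a passing
# answer should look like.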

# Case fields, as used by the code in this file:
#   id, type         -- on every case; type is "JUDGE", "CREATE" or
#                       "CREATIVE_ACCEPT"
#   category         -- prompt input for JUDGE and CREATE
#   words            -- prompt input for JUDGE and CREATIVE_ACCEPT (four words)
#   player_category  -- prompt input for CREATIVE_ACCEPT
#   gt               -- compared with the model's verdict by score()
#   gt_check, gt_misfit, gt_note
#                    -- only printed in the rendered report
#   gt_weak          -- annotation only; no code visible in this file reads it
CASES = [
    # ---- JUDGE: clear yes (tight fit) ----
    {"id": "judge-y1", "type": "JUDGE", "category": "Types of trees",
     "words": ["OAK", "MAPLE", "BIRCH", "PINE"], "gt": "yes"},
    {"id": "judge-y2", "type": "JUDGE", "category": "Greek letters",
     "words": ["ALPHA", "BETA", "GAMMA", "DELTA"], "gt": "yes"},
    {"id": "judge-y3", "type": "JUDGE", "category": "Days of the week",
     "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"], "gt": "yes"},
    {"id": "judge-y4", "type": "JUDGE", "category": "Synonyms for 'happy'",
     "words": ["JOYFUL", "GLAD", "CHEERFUL", "ELATED"], "gt": "yes"},
    {"id": "judge-y5", "type": "JUDGE", "category": "___ HOUSE (compound words)",
     "words": ["GREEN", "ICE", "FIRE", "COURT"], "gt": "yes"},
    {"id": "judge-y6", "type": "JUDGE", "category": "Words that follow COLD",
     "words": ["SHOULDER", "FRONT", "SNAP", "TURKEY"], "gt": "yes"},
    {"id": "judge-y7", "type": "JUDGE", "category": "Verbs meaning 'to move quickly'",
     "words": ["DART", "BOLT", "RUSH", "FLY"], "gt": "yes"},
    {"id": "judge-y8", "type": "JUDGE", "category": "Synonyms for 'idea'",
     "words": ["NOTION", "CONCEPT", "THOUGHT", "INKLING"], "gt": "yes"},

    # ---- JUDGE: clear no (one or more words don't fit) ----
    {"id": "judge-n1", "type": "JUDGE", "category": "Types of trees",
     "words": ["OAK", "MAPLE", "BIRCH", "CARROT"], "gt": "no",
     "gt_misfit": ["CARROT"]},
    {"id": "judge-n2", "type": "JUDGE", "category": "Greek letters",
     "words": ["ALPHA", "BETA", "GAMMA", "CYRILLIC"], "gt": "no",
     "gt_misfit": ["CYRILLIC"]},
    {"id": "judge-n3", "type": "JUDGE", "category": "Synonyms for 'happy'",
     "words": ["JOYFUL", "GLAD", "SAD", "ELATED"], "gt": "no",
     "gt_misfit": ["SAD"]},
    {"id": "judge-n4", "type": "JUDGE", "category": "Days of the week",
     "words": ["MONDAY", "JANUARY", "SUNDAY", "WEDNESDAY"], "gt": "no",
     "gt_misfit": ["JANUARY"]},
    {"id": "judge-n5", "type": "JUDGE", "category": "Body parts",
     "words": ["ARM", "LEG", "EYE", "NIGHT"], "gt": "no",
     "gt_misfit": ["NIGHT"]},
    {"id": "judge-n6", "type": "JUDGE", "category": "Types of birds",
     "words": ["CRANE", "SWALLOW", "BAT", "MOSQUITO"], "gt": "no",
     "gt_misfit": ["BAT", "MOSQUITO"]},
    {"id": "judge-n7", "type": "JUDGE", "category": "Things that are red",
     "words": ["APPLE", "BLUE", "ROSE", "GRASS"], "gt": "no",
     "gt_misfit": ["BLUE", "GRASS"]},
    {"id": "judge-n8", "type": "JUDGE", "category": "Words that follow COLD",
     "words": ["SHOULDER", "FRONT", "PIZZA", "MOUNTAIN"], "gt": "no",
     "gt_misfit": ["PIZZA", "MOUNTAIN"]},

    # ---- CREATE: easy categories ----
    {"id": "create-e1", "type": "CREATE", "category": "Types of trees",
     "gt_check": "Four valid tree species; e.g. OAK, MAPLE, BIRCH, PINE."},
    {"id": "create-e2", "type": "CREATE", "category": "Greek letters",
     "gt_check": "Four genuine Greek letters."},
    {"id": "create-e3", "type": "CREATE", "category": "Synonyms for 'angry'",
     "gt_check": "Four words that all genuinely mean angry/furious."},
    {"id": "create-e4", "type": "CREATE", "category": "Days of the week",
     "gt_check": "Four of the seven weekday names, no months or other words."},

    # ---- CREATE: medium (compound / polysemy) ----
    {"id": "create-m1", "type": "CREATE", "category": "___ STORM (compound words ending in STORM)",
     "gt_check": "Four words that each form a real compound or fixed phrase with STORM (e.g. SAND, BRAIN, THUNDER, SNOW)."},
    {"id": "create-m2", "type": "CREATE", "category": "Synonyms for 'small'",
     "gt_check": "Four words that all genuinely mean small."},
    {"id": "create-m3", "type": "CREATE", "category": "Words that follow BLUE",
     "gt_check": "Four words that each form a real compound with BLUE (e.g. BERRY, BIRD, PRINT, BELL, GRASS)."},
    {"id": "create-m4", "type": "CREATE", "category": "Things that can be 'broken'",
     "gt_check": "Four words that each form a real fixed phrase with 'broken' (heart, record, law, promise, etc.)."},

    # ---- CREATE: hard (wordplay / tight constraint) ----
    {"id": "create-h1", "type": "CREATE",
     "category": "Words that are homophones of body parts but spelled differently (e.g. HARE = hair, MUSSEL = muscle)",
     "gt_check": "Four words that each sound like a body part but are spelled differently. Valid examples: HARE (hair), MUSSEL (muscle), HEAL (heel), SOUL (sole), AYE/EYE-spelled-otherwise. EYE and HEEL alone do NOT count -- those are the body parts themselves, not homophones of them."},
    {"id": "create-h2", "type": "CREATE",
     "category": "Words that contain a body part as a substring (e.g. HEARTBEAT contains HEART)",
     "gt_check": "Four words that each contain a body part anywhere inside them. Valid examples: HEARTH (HEART), CHESTNUT (CHEST), EARTH (EAR), HEADACHE (HEAD)."},

    # ---- CREATIVE_ACCEPT: player's grouping is genuinely valid ----
    {"id": "ca-y1", "type": "CREATIVE_ACCEPT",
     "words": ["SCALE", "MOUNT", "ASCEND", "CLIMB"],
     "player_category": "Verbs for going up", "gt": "yes"},
    {"id": "ca-y2", "type": "CREATIVE_ACCEPT",
     "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
     "player_category": "Fruits", "gt": "yes"},
    {"id": "ca-y3", "type": "CREATIVE_ACCEPT",
     "words": ["WHIP", "NUT", "CODE", "SMILE"],
     "player_category": "Things you can crack", "gt": "yes"},
    {"id": "ca-y4", "type": "CREATIVE_ACCEPT",
     "words": ["BAT", "BALL", "GLOVE", "MITT"],
     "player_category": "Baseball equipment", "gt": "yes"},
    {"id": "ca-y5", "type": "CREATIVE_ACCEPT",
     "words": ["MARS", "VENUS", "MERCURY", "JUPITER"],
     "player_category": "Roman gods", "gt": "yes"},

    # ---- CREATIVE_ACCEPT: player's grouping is wrong ----
    {"id": "ca-n1", "type": "CREATIVE_ACCEPT",
     "words": ["OAK", "MAPLE", "BIRCH", "PINE"],
     "player_category": "Furniture brands", "gt": "no"},
    {"id": "ca-n2", "type": "CREATIVE_ACCEPT",
     "words": ["ALPHA", "BETA", "GAMMA", "DELTA"],
     "player_category": "Words meaning 'small'", "gt": "no"},
    {"id": "ca-n3", "type": "CREATIVE_ACCEPT",
     "words": ["BAT", "BALL", "GLOVE", "MITT"],
     "player_category": "Things worn on your hand", "gt": "no",
     "gt_weak": ["BAT", "BALL"]},
    {"id": "ca-n4", "type": "CREATIVE_ACCEPT",
     "words": ["MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY"],
     "player_category": "Months of the year", "gt": "no"},
    {"id": "ca-n5", "type": "CREATIVE_ACCEPT",
     "words": ["WHIP", "NUT", "CODE", "SMILE"],
     "player_category": "Things found in a kitchen", "gt": "no",
     "gt_weak": ["CODE", "SMILE"]},

    # ---- CREATIVE_ACCEPT: borderline (deliberately ambiguous) ----
    {"id": "ca-b1", "type": "CREATIVE_ACCEPT",
     "words": ["APPLE", "ORANGE", "KIWI", "BLACKBERRY"],
     "player_category": "Tech/phone brands", "gt": "borderline",
     "gt_note": "APPLE and BLACKBERRY clearly are tech brands; ORANGE is a EU/UK telecom carrier (defensible); KIWI is not a tech brand. Strict grader says no; lenient grader might accept ORANGE."},
    {"id": "ca-b2", "type": "CREATIVE_ACCEPT",
     "words": ["SHARP", "FLAT", "NATURAL", "KEY"],
     "player_category": "Real estate listing terms", "gt": "borderline",
     "gt_note": "FLAT (UK apartment) and KEY (keys) connect; SHARP (a sharp property) and NATURAL (natural light?) are weak. Strict no; creative yes."},
]

# ---------- runner ----------

def call(model, prompt, temperature=TEMPERATURE, timeout=300):
    """Send one non-streaming generate request to Ollama and time it.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature placed in the request options.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(elapsed_seconds, data)``, where ``data`` is the decoded JSON
        response body.

    Raises:
        Whatever ``urlopen`` raises on transport failure (for example
        ``urllib.error.URLError``), and ``ValueError`` if the body is not
        valid JSON.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {"temperature": temperature, "num_ctx": 4096, "num_predict": 512},
    }
    # Supplying `data` makes urllib issue a POST.
    req = urllib.request.Request(
        OLLAMA, data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter() is monotonic, so a system clock adjustment in the middle
    # of a long generation cannot skew (or make negative) the reported time.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        data = json.loads(r.read())
    return time.perf_counter() - t0, data


def extract_json(body):
    """Pull the first parseable JSON object out of a model response.

    The model is told to emit bare JSON but may still wrap it in fences or
    add commentary. Each ``{`` in `body` is tried in turn as the start of an
    object, and the first one that decodes is returned. Text after the
    object, including stray braces, is ignored.

    Args:
        body: Raw response text (may be empty or ``None``).

    Returns:
        The decoded JSON object as a dict.

    Raises:
        ValueError: if `body` has no braces at all, or no ``{`` starts a
            valid JSON object (``json.JSONDecodeError`` is a ``ValueError``).
    """
    if not body or "{" not in body or "}" not in body:
        raise ValueError("no JSON braces in response")
    decoder = json.JSONDecoder()
    first_error = None
    start = body.find("{")
    while start != -1:
        try:
            # raw_decode parses exactly one value starting at `start` and
            # tolerates trailing text.
            obj, _end = decoder.raw_decode(body, start)
        except json.JSONDecodeError as exc:
            # Report the failure at the earliest candidate; it is the most
            # likely to be the intended object.
            if first_error is None:
                first_error = exc
        else:
            return obj
        start = body.find("{", start + 1)
    raise first_error


def render_prompt(case):
    """Fill in the prompt template that matches ``case["type"]``.

    JUDGE uses the case's category and its first four words, CREATE uses only
    the category, and CREATIVE_ACCEPT uses the first four words plus the
    player's category. Any other type raises ``ValueError`` carrying the
    type value.
    """
    kind = case["type"]
    fields = {}
    if kind == "JUDGE":
        template = JUDGE_PROMPT
        fields["category"] = case["category"]
    elif kind == "CREATE":
        return CREATE_PROMPT.format(category=case["category"])
    elif kind == "CREATIVE_ACCEPT":
        template = CREATIVE_ACCEPT_PROMPT
    else:
        raise ValueError(kind)

    # Both remaining templates take the four words as w1..w4.
    words = case["words"]
    for slot in range(4):
        fields[f"w{slot + 1}"] = words[slot]
    if kind == "CREATIVE_ACCEPT":
        fields["player_category"] = case["player_category"]
    return template.format(**fields)


def warm(model):
    """Log a ``[warm]`` line to stderr, then send `model` one throwaway prompt.

    The reply is discarded. Presumably this gets the model loaded before the
    timed cases run.
    """
    sys.stderr.write(f"[warm] {model}\n")
    sys.stderr.flush()
    call(model, "Reply with the word OK only.", temperature=0.1, timeout=300)


def run_model(model, cases):
    """Run every case against `model` and return one result dict per case.

    Each case gets up to three attempts, with the temperature raised by 0.1
    per attempt, and stops at the first response that yields JSON. Timing,
    token count and raw text come from the most recent attempt whose request
    returned. The raw text (capped at 1500 characters) is kept only when no
    attempt parsed.
    """
    collected = []
    for case in cases:
        prompt = render_prompt(case)
        answer = None
        error = None
        elapsed, tokens, text = 0.0, 0, ""

        tries = 0
        while answer is None and tries < 3:
            temp = TEMPERATURE + tries * 0.1
            tries += 1
            print(f"[{model}] {case['id']} attempt {tries} (temp={temp:.1f})",
                  file=sys.stderr, flush=True)
            try:
                took, data = call(model, prompt, temperature=temp)
            except Exception as exc:
                # Request failed; remember why and move on to the next attempt.
                error = repr(exc)
                continue
            elapsed = took
            tokens = data.get("eval_count", 0)
            text = data.get("response", "") or ""
            try:
                answer = extract_json(text)
            except Exception as exc:
                error = repr(exc)
            else:
                error = None

        parsed_ok = answer is not None
        collected.append({
            "case_id": case["id"],
            "type": case["type"],
            "model": model,
            "dt": elapsed,
            "eval_count": tokens,
            "ok": parsed_ok,
            "parsed": answer,
            "raw": None if parsed_ok else text[:1500],
            "error": error,
            "case": case,
        })
    return collected


def _verdict_text(parsed, key):
    """Return ``parsed[key]`` as a trimmed, lower-cased string ("" if missing or falsy)."""
    # str() keeps a non-string answer (JSON true, a number, a list) from
    # crashing on .strip(); such a value simply fails to match "yes"/"no".
    return str(parsed.get(key) or "").strip().lower()


def score(results):
    """Auto-score against ground truth where possible.

    Adds a ``"score"`` key to each result dict in place:

    - ``"PARSE_FAIL"``: no attempt produced JSON.
    - ``"PASS"`` / ``"FAIL"``: JUDGE and non-borderline CREATIVE_ACCEPT
      cases, comparing the model's ``verdict`` / ``valid`` field with the
      case's ``gt``.
    - ``"BORDERLINE"``: CREATIVE_ACCEPT cases whose ``gt`` is "borderline".
    - ``"MANUAL"``: CREATE cases.

    Results whose case type is none of the above are left without a score.

    Args:
        results: Result dicts as produced by ``run_model``.

    Returns:
        The same `results` list.
    """
    for r in results:
        c = r["case"]
        if not r["ok"]:
            r["score"] = "PARSE_FAIL"
            continue
        p = r["parsed"]
        if c["type"] == "JUDGE":
            r["score"] = "PASS" if _verdict_text(p, "verdict") == c["gt"] else "FAIL"
        elif c["type"] == "CREATIVE_ACCEPT":
            if c["gt"] == "borderline":
                r["score"] = "BORDERLINE"  # human grades these
            else:
                r["score"] = "PASS" if _verdict_text(p, "valid") == c["gt"] else "FAIL"
        elif c["type"] == "CREATE":
            r["score"] = "MANUAL"  # human grades these against gt_check
    return results


def _cell(value, limit):
    """Make `value` safe for one Markdown table cell.

    Line breaks become spaces so the row stays on one line. The text is cut
    to `limit` characters first and the pipes escaped afterwards, so the cut
    can never strand a lone backslash in front of the cell delimiter.
    """
    text = str(value).replace("\r", " ").replace("\n", " ")[:limit]
    return text.replace("|", "\\|")


def _word_list(value):
    """Format a model-supplied word field for display, whatever JSON type it arrived as."""
    if not value:
        return ""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(w) for w in value)
    # A bare string or number: show it as-is rather than iterating over it.
    return str(value)


def render(results):
    """Render scored results as a Markdown report and return it as a string.

    The report has a setup section, a per-model summary table (JUDGE and
    non-borderline CREATIVE_ACCEPT pass counts, parse failures, mean request
    time), then one section per case in ``CASES`` with a row per model in
    ``MODELS``. Models or cases with no result get a row of dashes.

    Model output is treated as untrusted text: missing, null or wrongly
    typed fields are coerced to strings rather than raising.

    Args:
        results: Result dicts from ``run_model``, after ``score`` has run.
    """
    by_model = {}
    first_result = {}
    for r in results:
        by_model.setdefault(r["model"], []).append(r)
        # Keep the first result seen for each (model, case) pair.
        first_result.setdefault((r["model"], r["case_id"]), r)

    lines = [f"# Gemma 4 Semantic Bakeoff -- {datetime.now().strftime('%Y-%m-%d %H:%M')}", ""]
    # NOTE(review): the host label and script path below are hard-coded text,
    # not derived from the environment -- confirm they still match your setup.
    lines += [
        "## Setup",
        f"- Host: steel141 (RTX 3090 Ti) `{OLLAMA}`",
        f"- Models: {', '.join('`'+m+'`' for m in MODELS)}",
        f"- Temperature {TEMPERATURE} (raised +0.1 per retry on JSON parse fail, max 3 attempts)",
        "- think=false, num_ctx=4096, num_predict=512, no format=json (per gemma4-research/GOTCHAS.md)",
        f"- {len(CASES)} test cases: "
        f"{sum(1 for c in CASES if c['type']=='JUDGE')} JUDGE, "
        f"{sum(1 for c in CASES if c['type']=='CREATE')} CREATE, "
        f"{sum(1 for c in CASES if c['type']=='CREATIVE_ACCEPT')} CREATIVE_ACCEPT",
        "- Ground truth hand-labeled inline in `scripts/gemma-semantic-bakeoff.py`",
        "",
    ]

    # ---- per-model summaries ----
    lines += ["## Auto-scored summary", ""]
    lines += ["| Model | JUDGE pass | CREATIVE_ACCEPT pass | parse fails | avg s |", "|---|---|---|---|---|"]
    for m in MODELS:
        rs = by_model.get(m, [])
        if not rs:
            lines.append(f"| `{m}` | - | - | - | - |")
            continue
        judge = [r for r in rs if r["case"]["type"] == "JUDGE"]
        accept = [r for r in rs if r["case"]["type"] == "CREATIVE_ACCEPT"]
        j_pass = sum(1 for r in judge if r.get("score") == "PASS")
        j_n = len(judge)
        c_pass = sum(1 for r in accept if r.get("score") == "PASS")
        # Borderline cases are never auto-scored, so leave them out of the denominator.
        c_n = sum(1 for r in accept if r["case"].get("gt") != "borderline")
        parse_fail = sum(1 for r in rs if not r["ok"])
        avg_dt = sum(r["dt"] for r in rs) / len(rs)
        lines.append(f"| `{m}` | {j_pass}/{j_n} | {c_pass}/{c_n} | {parse_fail} | {avg_dt:.1f} |")
    lines += [""]

    # ---- by case-type, full breakdown ----
    for tname in ["JUDGE", "CREATE", "CREATIVE_ACCEPT"]:
        lines += [f"## {tname}", ""]
        cases_of_type = [c for c in CASES if c["type"] == tname]
        for case in cases_of_type:
            lines += [f"### {case['id']}", ""]
            if tname == "JUDGE":
                lines += [
                    f"- Category: `{case['category']}`",
                    f"- Words: {', '.join('`'+w+'`' for w in case['words'])}",
                    f"- Ground truth: **{case['gt']}**" + (
                        f" (misfit: {', '.join(case.get('gt_misfit', []))})" if case.get("gt_misfit") else ""),
                    "",
                ]
            elif tname == "CREATE":
                lines += [
                    f"- Category: `{case['category']}`",
                    f"- Quality bar: {case['gt_check']}",
                    "",
                ]
            else:  # CREATIVE_ACCEPT
                lines += [
                    f"- Words: {', '.join('`'+w+'`' for w in case['words'])}",
                    f"- Player's category: `\"{case['player_category']}\"`",
                    f"- Ground truth: **{case['gt']}**" + (
                        f" -- {case.get('gt_note', '')}" if case.get("gt_note") else ""),
                    "",
                ]
            lines += ["| Model | Verdict | Reason | Score | dt |", "|---|---|---|---|---|"]
            for m in MODELS:
                r = first_result.get((m, case["id"]))
                if r is None:
                    lines.append(f"| `{m}` | - | - | - | - |")
                    continue
                if not r["ok"]:
                    err = _cell(r.get("error") or "", 60)
                    lines.append(f"| `{m}` | _parse fail_ | `{err}` | PARSE_FAIL | {r['dt']:.1f}s |")
                    continue
                p = r["parsed"]
                extra = ""
                if tname == "JUDGE":
                    v = p.get("verdict", "?")
                    if p.get("misfit_words"):
                        extra = f" (misfit: {_word_list(p['misfit_words'])})"
                elif tname == "CREATE":
                    v = _word_list(p.get("words"))
                else:
                    v = p.get("valid", "?")
                    if p.get("weak_words"):
                        extra = f" (weak: {_word_list(p['weak_words'])})"
                # The model may send "reason": null or a non-string; never let
                # that abort the report.
                reason = p.get("reason")
                reason_text = "" if reason is None else str(reason)
                reason_short = _cell(reason_text + extra, 120)
                v_clean = _cell(v, 80)
                lines.append(f"| `{m}` | {v_clean} | {reason_short} | {r.get('score', '?')} | {r['dt']:.1f}s |")
            lines += [""]

    return "\n".join(lines)


def main():
    """Run every model over every case, score the results and write both reports.

    Writes ``gemma-semantic-bakeoff-<stamp>-raw.json`` and
    ``gemma-semantic-bakeoff-<stamp>.md`` under ``PROJECT_ROOT/docs/reference``
    (created if missing). Progress goes to stderr; the Markdown path is the
    only thing printed to stdout.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-semantic-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-semantic-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, CASES))

    score(all_results)

    # Save raw without the embedded case dict; "case_id" already identifies it.
    raw = [{k: v for k, v in r.items() if k != "case"} for r in all_results]
    # Explicit UTF-8: model text can contain characters the platform's
    # default encoding cannot represent.
    raw_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    print(f"raw  -> {raw_path}", file=sys.stderr)

    md_path.write_text(render(all_results), encoding="utf-8")
    print(f"md   -> {md_path}", file=sys.stderr)
    print(md_path)


# Run the bakeoff only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()