# VIBECODE-THEORY/tools/concept-index/build_index.py

#!/usr/bin/env python3
"""Build a concept index and glossary for the VIBECODE-THEORY corpus."""

from __future__ import annotations

import json
import re
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path


# Corpus root: two directory levels above the directory that holds this script.
# Paper files and the "allegorical" sub-directory are looked up relative to it.
ROOT = Path(__file__).resolve().parents[2]
# Directory containing this script; every generated artifact is written here.
OUT_DIR = Path(__file__).resolve().parent


@dataclass
class Document:
    """One Markdown source file loaded from the corpus."""

    # Lookup key: the three-digit paper number for papers, the file stem for
    # allegorical files.
    doc_id: str
    # Heading text taken from the file (allegorical files fall back to the stem).
    title: str
    # Location of the file the text was read from.
    path: Path
    # Full file contents after line-ending clean-up.
    text: str
    # Three-digit number of the paper named on a "**Supersedes:** Paper NNN"
    # line, or None when no such line was found.
    supersedes: str | None


# Hand-maintained concept catalogue that drives the index. Each entry has:
#   name             - canonical display name; it is also searched for, alongside
#                      the aliases, when scanning documents
#   aliases          - alternative phrasings, matched case-insensitively
#   introduced_in    - doc_id of the document credited with introducing the
#                      concept (a three-digit paper number or an allegorical
#                      file stem); must exist among the loaded documents
#   status           - free-form label copied verbatim into the outputs
#   related_concepts - names of other concepts; listed in the glossary and
#                      drawn as edges in the concept map
CONCEPTS = [
    {
        "name": "Vibe Coding",
        "aliases": ["vibe coding"],
        "introduced_in": "001",
        "status": "active",
        "related_concepts": ["Social-Cognitive Framework", "Mental Model Accuracy", "Meta-Skill Argument"],
    },
    {
        "name": "Social-Cognitive Framework",
        "aliases": ["social-cognitive framework", "vibe coding as social skill", "social-cognitive processes"],
        "introduced_in": "004",
        "status": "active",
        "related_concepts": ["Vibe Coding", "Mental Model Accuracy", "Adaptive Communication", "Collaboration Management"],
    },
    {
        "name": "Mental Model Accuracy",
        "aliases": ["mental model accuracy", "mental model"],
        "introduced_in": "001",
        "status": "active",
        "related_concepts": ["Social-Cognitive Framework", "Adaptive Communication", "Collaboration Management"],
    },
    {
        "name": "Adaptive Communication",
        "aliases": ["adaptive communication", "constraint calibration", "register matching"],
        "introduced_in": "001",
        "status": "active",
        "related_concepts": ["Social-Cognitive Framework", "Mental Model Accuracy", "Collaboration Management"],
    },
    {
        "name": "Collaboration Management",
        "aliases": ["collaboration management", "task decomposition", "trust calibration", "recovery"],
        "introduced_in": "001",
        "status": "active",
        "related_concepts": ["Social-Cognitive Framework", "Adaptive Communication", "Technical Foundation"],
    },
    {
        "name": "Technical Foundation",
        "aliases": ["technical foundation", "technical expertise"],
        "introduced_in": "001",
        "status": "active",
        "related_concepts": ["Vibe Coding", "Collaboration Management", "Meta-Skill Argument"],
    },
    {
        "name": "Neurodivergence Note",
        "aliases": ["neurodivergence note", "neurodivergence hypothesis"],
        "introduced_in": "001",
        "status": "open question",
        "related_concepts": ["Social-Cognitive Framework"],
    },
    {
        "name": "Shelf-Life Problem",
        "aliases": ["shelf-life problem", "shelf life problem"],
        "introduced_in": "003",
        "status": "active",
        "related_concepts": ["Meta-Skill Argument", "Infrastructure Threshold"],
    },
    {
        "name": "Meta-Skill Argument",
        "aliases": ["meta-skill argument", "meta-skill"],
        "introduced_in": "004",
        "status": "active",
        "related_concepts": ["Shelf-Life Problem", "Vibe Coding", "Social-Cognitive Framework"],
    },
    {
        "name": "Cognitive Surplus",
        "aliases": ["cognitive surplus", "surplus of cognition", "the cognitive surplus"],
        "introduced_in": "002",
        "status": "active",
        "related_concepts": ["Agricultural Parallel", "Cognition as a Commodity", "Automation Spiral"],
    },
    {
        "name": "Agricultural Parallel",
        "aliases": ["agricultural parallel", "agricultural analogy"],
        "introduced_in": "002",
        "status": "active",
        "related_concepts": ["Cognitive Surplus", "Green Revolution", "Feudal Internet", "Dependency Trap"],
    },
    {
        "name": "Dual Cognition Problem",
        "aliases": ["dual cognition problem", "the dual cognition problem"],
        "introduced_in": "002",
        "status": "active",
        "related_concepts": ["Cognitive Preference Shift", "Cognitive Atrophy", "Cognitive Surplus"],
    },
    {
        "name": "Cognitive Atrophy",
        "aliases": ["cognitive atrophy", "capability loss"],
        "introduced_in": "002",
        "status": "open question",
        "related_concepts": ["Dual Cognition Problem", "Cognitive Preference Shift", "Biological Ratchet"],
    },
    {
        "name": "Green Revolution",
        "aliases": ["green revolution"],
        "introduced_in": "002",
        "status": "active",
        "related_concepts": ["Agricultural Parallel", "Feudal Internet"],
    },
    {
        "name": "Feudal Internet",
        "aliases": ["feudal internet"],
        "introduced_in": "002",
        "status": "active",
        "related_concepts": ["Agricultural Parallel", "Dependency Trap", "Cognition as a Commodity"],
    },
    {
        "name": "Dependency Trap",
        "aliases": ["dependency trap", "future 3: the dependency trap"],
        "introduced_in": "002",
        "status": "active",
        "related_concepts": ["Feudal Internet", "Cognitive Atrophy", "Y2K Parallel"],
    },
    {
        "name": "Automation Spiral",
        "aliases": ["automation spiral"],
        "introduced_in": "003",
        "status": "active",
        "related_concepts": ["Cognitive Surplus", "Feedback Loop", "Master-Apprentice Parallel"],
    },
    {
        "name": "Cognitive Preference Shift",
        "aliases": ["cognitive preference shift", "preference shift"],
        "introduced_in": "003",
        "status": "active",
        "related_concepts": ["Dual Cognition Problem", "Cognitive Atrophy", "Biological Ratchet"],
    },
    {
        "name": "Cognition as a Commodity",
        "aliases": ["cognition as a commodity", "cognition-as-commodity framing"],
        "introduced_in": "005",
        "status": "active",
        "related_concepts": ["Cognitive Surplus", "Feudal Internet", "Information/Cognition Resource Hierarchy"],
    },
    {
        "name": "Y2K Parallel",
        "aliases": ["y2k parallel", "ai y2k moment", "y2k moment"],
        "introduced_in": "005",
        "status": "active",
        "related_concepts": ["Dependency Trap", "Infrastructure Threshold", "Cognitive Surplus"],
    },
    {
        "name": "Information/Cognition Resource Hierarchy",
        "aliases": ["information and cognition as resources", "resource hierarchy"],
        "introduced_in": "005",
        "status": "active",
        "related_concepts": ["Cognition as a Commodity", "Knowledge Unification"],
    },
    {
        "name": "Feedback Loop",
        "aliases": ["feedback loop"],
        "introduced_in": "006",
        "status": "active",
        "related_concepts": ["Automation Spiral", "Master-Apprentice Parallel", "Niche Construction"],
    },
    {
        "name": "Master-Apprentice Parallel",
        "aliases": ["master-apprentice parallel", "master-apprentice relationship"],
        "introduced_in": "006",
        "status": "active",
        "related_concepts": ["Feedback Loop", "Automation Spiral", "The Golem"],
    },
    {
        "name": "Niche Construction",
        "aliases": ["niche construction"],
        "introduced_in": "006",
        "status": "active",
        "related_concepts": ["Feedback Loop", "Recursion Observation"],
    },
    {
        "name": "Theological Thread",
        "aliases": ["theological thread"],
        "introduced_in": "006",
        "status": "active",
        "related_concepts": ["Prometheus", "Knowledge Unification", "Recursion Observation"],
    },
    {
        "name": "Recursion Observation",
        "aliases": ["recursion observation", "cosmological → biological → linguistic → computational"],
        "introduced_in": "006",
        "status": "open question",
        "related_concepts": ["Theological Thread", "Niche Construction", "Knowledge Unification"],
    },
    {
        "name": "Infrastructure Threshold",
        "aliases": ["infrastructure threshold"],
        "introduced_in": "007",
        "status": "active",
        "related_concepts": ["Biological Ratchet", "Premature Dependencies", "Y2K Parallel"],
    },
    {
        "name": "Premature Dependencies",
        "aliases": ["premature dependencies", "dependency waiting for its enabling technology"],
        "introduced_in": "007",
        "status": "active",
        "related_concepts": ["Infrastructure Threshold", "Biological Ratchet"],
    },
    {
        "name": "Biological Ratchet",
        "aliases": ["biological ratchet", "dependency ratchet", "ratchet thesis"],
        "introduced_in": "007",
        "status": "active",
        "related_concepts": ["Infrastructure Threshold", "Cognitive Preference Shift", "Knowledge Unification"],
    },
    {
        "name": "Dependency Chain",
        "aliases": ["dependency chain"],
        "introduced_in": "007",
        "status": "active",
        "related_concepts": ["Biological Ratchet", "Knowledge Unification", "Cheating Frame"],
    },
    {
        "name": "Knowledge Unification",
        "aliases": ["knowledge unification", "unification thesis", "unification of human knowledge", "the dependency chain as knowledge unification"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Dependency Chain", "Singularity as Compilation", "Integration Layer"],
    },
    {
        "name": "Singularity as Compilation",
        "aliases": ["singularity as compilation", "compilation not transcendence", "compilation, not transcendence"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Knowledge Unification", "Integration Layer", "Cheating Frame"],
    },
    {
        "name": "Integration Layer",
        "aliases": ["integration layer"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Knowledge Unification", "Singularity as Compilation", "Existential Purpose of the Chain"],
    },
    {
        "name": "Ship of Theseus Problem",
        "aliases": ["ship of theseus problem", "identity problem", "species identity problem", "the identity problem"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Continuity Argument", "Identity Argument", "Pragmatic Argument"],
    },
    {
        "name": "Continuity Argument",
        "aliases": ["continuity argument", "the continuity argument"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Ship of Theseus Problem", "Identity Argument", "Pragmatic Argument"],
    },
    {
        "name": "Identity Argument",
        "aliases": ["identity argument", "essentialist", "the identity argument"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Ship of Theseus Problem", "Continuity Argument", "Pragmatic Argument"],
    },
    {
        "name": "Pragmatic Argument",
        "aliases": ["pragmatic argument", "the pragmatic argument"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Ship of Theseus Problem", "Continuity Argument", "Identity Argument"],
    },
    {
        "name": "Cheating Frame",
        "aliases": ["did we cheat", "cheating frame"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Dependency Chain", "Singularity as Compilation", "Existential Purpose of the Chain"],
    },
    {
        "name": "Existential Purpose of the Chain",
        "aliases": ["existential purpose of the dependency chain", "existential purpose", "the existential purpose of the dependency chain"],
        "introduced_in": "008",
        "status": "active",
        "related_concepts": ["Integration Layer", "Cheating Frame", "Knowledge Unification"],
    },
    {
        "name": "Eve's Apple",
        "aliases": ["eve's apple"],
        "introduced_in": "eves-apple",
        "status": "reference allegory",
        "related_concepts": ["Cognitive Preference Shift", "Dependency Chain"],
    },
    {
        "name": "Pandora's Box",
        "aliases": ["pandora's box"],
        "introduced_in": "pandoras-box",
        "status": "reference allegory",
        "related_concepts": ["Dependency Chain", "Automation Spiral"],
    },
    {
        "name": "Prometheus",
        "aliases": ["prometheus"],
        "introduced_in": "prometheus",
        "status": "reference allegory",
        "related_concepts": ["Theological Thread", "Dependency Chain", "Cheating Frame"],
    },
    {
        "name": "Sorcerer's Apprentice",
        "aliases": ["sorcerer's apprentice"],
        "introduced_in": "sorcerers-apprentice",
        "status": "reference allegory",
        "related_concepts": ["Automation Spiral", "Feedback Loop", "Dependency Chain"],
    },
    {
        "name": "The Golem",
        "aliases": ["the golem", "golem"],
        "introduced_in": "the-golem",
        "status": "reference allegory",
        "related_concepts": ["Master-Apprentice Parallel", "Dependency Chain"],
    },
    {
        "name": "Faustian Bargain",
        "aliases": ["faustian bargain", "faust"],
        "introduced_in": "faust",
        "status": "reference allegory",
        "related_concepts": ["Feedback Loop", "Cognitive Preference Shift"],
    },
    {
        "name": "Icarus",
        "aliases": ["icarus"],
        "introduced_in": "icarus",
        "status": "reference allegory",
        "related_concepts": ["Shelf-Life Problem", "Infrastructure Threshold"],
    },
    {
        "name": "Tower of Babel",
        "aliases": ["tower of babel", "babel"],
        "introduced_in": "tower-of-babel",
        "status": "reference allegory",
        "related_concepts": ["Dependency Chain", "Knowledge Unification"],
    },
]


def clean_text(text: str) -> str:
    """Return *text* with every CRLF pair collapsed to a single LF.

    Only the two-character ``\\r\\n`` sequence is rewritten; a lone ``\\r`` is
    left untouched.
    """
    return text.replace("\r\n", "\n")


def load_documents() -> dict[str, Document]:
    """Read every corpus file and return the documents keyed by ``doc_id``.

    Two groups of files are scanned, each in sorted path order:

    * Papers: ``ROOT/00*.md``. Only file names starting with ``00`` are
      considered. A file is kept only if it contains a ``# Paper NNN: Title``
      heading; ``NNN`` becomes the ``doc_id``. An optional
      ``**Supersedes:** Paper NNN`` line fills ``supersedes``.
    * Allegories: ``ROOT/allegorical/*.md``. The file stem is the ``doc_id``
      and the first ``# `` heading is the title (falling back to the stem).

    A later file silently replaces an earlier one that produced the same key.
    """
    paper_heading = re.compile(r"^#\s+Paper\s+(\d{3}):\s*(.+)$", re.MULTILINE)
    supersedes_line = re.compile(r"^\*\*Supersedes:\*\*\s*Paper\s+(\d{3})", re.MULTILINE)
    any_heading = re.compile(r"^#\s+(.+)$", re.MULTILINE)

    loaded: dict[str, Document] = {}

    for paper_path in sorted(ROOT.glob("00*.md")):
        body = clean_text(paper_path.read_text(encoding="utf-8"))
        heading = paper_heading.search(body)
        if heading is None:
            # Not a numbered paper; ignore the file.
            continue
        predecessor = supersedes_line.search(body)
        number = heading.group(1)
        loaded[number] = Document(
            doc_id=number,
            title=heading.group(2).strip(),
            path=paper_path,
            text=body,
            supersedes=predecessor.group(1) if predecessor else None,
        )

    for allegory_path in sorted((ROOT / "allegorical").glob("*.md")):
        body = clean_text(allegory_path.read_text(encoding="utf-8"))
        heading = any_heading.search(body)
        stem = allegory_path.stem
        loaded[stem] = Document(
            doc_id=stem,
            title=heading.group(1).strip() if heading else allegory_path.stem,
            path=allegory_path,
            text=body,
            supersedes=None,
        )

    return loaded


def paragraphs(text: str) -> list[str]:
    """Split *text* on blank (or whitespace-only) lines.

    Each chunk is stripped of surrounding whitespace and empty chunks are
    dropped.
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r"\n\s*\n", text))
    return [chunk for chunk in stripped_chunks if chunk]


def sentences(text: str) -> list[str]:
    """Break *text* into sentences with a simple punctuation heuristic.

    Whitespace runs are first collapsed to single spaces; the text is then
    split wherever ``.``, ``!`` or ``?`` is followed by whitespace. The
    terminator stays attached to its sentence. Abbreviations such as "e.g."
    are not special-cased and therefore also cause a split.
    """
    collapsed = re.sub(r"\s+", " ", text.strip())
    pieces = re.split(r"(?<=[.!?])\s+", collapsed)
    # filter(None, ...) discards the empty strings left by empty input.
    return list(filter(None, map(str.strip, pieces)))


def alias_present(text: str, alias: str) -> bool:
    """Return True if *alias* occurs in *text* as a whole phrase.

    Matching is case-insensitive (both sides are lower-cased).

    * Whitespace inside the alias matches any run of whitespace in the text,
      so a multi-word alias is still found when the source wraps it across a
      line break or pads it with extra spaces.
    * A word boundary is required only on a side where the alias itself
      begins or ends with a word character. An alias that starts or ends with
      punctuation (for example ``"did we cheat?"``) is therefore matchable
      even when followed by a space.
    * An empty or whitespace-only alias never matches.

    Args:
        text: The text to search.
        alias: The phrase to look for.

    Returns:
        True when the phrase is present, False otherwise.
    """
    words = alias.lower().split()
    if not words:
        # A bare r"\b\b" pattern would match almost any text; treat as absent.
        return False

    phrase = r"\s+".join(re.escape(word) for word in words)
    # r"\b" next to a non-word character demands a word character on the other
    # side, which is the opposite of what is wanted, so anchor only word edges.
    leading = r"\b" if re.match(r"\w", words[0][0]) else ""
    trailing = r"\b" if re.match(r"\w", words[-1][-1]) else ""

    return re.search(leading + phrase + trailing, text.lower()) is not None


def extract_section(text: str, heading: str) -> str:
    """Return the body that follows a Markdown heading named *heading*.

    The heading must be level 2 or deeper (``##``, ``###``, ...) and its text
    must equal *heading*, ignoring case. The body runs until the next heading
    of level 2 or deeper, whatever its level, or to the end of the text.
    Surrounding whitespace is stripped. An empty string is returned when no
    such heading exists.
    """
    heading_re = re.compile(
        rf"^##+\s+{re.escape(heading)}\s*$",
        re.MULTILINE | re.IGNORECASE,
    )
    found = heading_re.search(text)
    if found is None:
        return ""

    remainder = text[found.end():]
    following = re.search(r"^##+\s+", remainder, re.MULTILINE)
    body = remainder if following is None else remainder[: following.start()]
    return body.strip()


def first_matching_sentence(doc: Document, aliases: list[str]) -> str:
    """Pick a sentence from *doc* to serve as the definition of a concept.

    Two passes are made, and the first sentence of at least 40 characters that
    qualifies is returned:

    1. For each alias in order, look for a section whose heading equals the
       alias and take the first long-enough sentence in its body.
    2. Otherwise scan the whole document, skipping heading paragraphs and the
       ``**Authors:**`` / ``**Date:**`` lines, for a sentence that mentions
       any alias.

    If neither pass succeeds a fixed placeholder string is returned.
    """
    min_length = 40
    skipped_prefixes = ("#", "**Authors:**", "**Date:**")

    # Pass 1: a section named after the concept.
    for heading in aliases:
        body = extract_section(doc.text, heading)
        if not body:
            continue
        for block in paragraphs(body):
            if block.startswith("#"):
                continue
            for candidate in sentences(block):
                if len(candidate) >= min_length:
                    return candidate

    # Pass 2: any sentence in the document that mentions the concept.
    for block in paragraphs(doc.text):
        if block.startswith(skipped_prefixes):
            continue
        for candidate in sentences(block):
            if len(candidate) < min_length:
                continue
            if any(alias_present(candidate, alias) for alias in aliases):
                return candidate

    return "Definition sentence not found in source text."


def find_mentions(docs: dict[str, Document], aliases: list[str]) -> list[str]:
    """List the ids of all documents whose text contains any of *aliases*.

    The ids are returned in the iteration order of *docs*.
    """
    return [
        doc_id
        for doc_id, doc in docs.items()
        if any(alias_present(doc.text, alias) for alias in aliases)
    ]


def find_revisions(docs: dict[str, Document], concept: dict, mentions: list[str]) -> list[str]:
    """Return, sorted, the mentioning documents that supersede the origin paper.

    A document counts as a revision when its ``supersedes`` field equals the
    concept's ``introduced_in`` id. Only ids listed in *mentions* are checked.
    """
    origin = concept["introduced_in"]
    return sorted(
        doc_id for doc_id in mentions if docs[doc_id].supersedes == origin
    )


def find_challenges(docs: dict[str, Document], concept: dict, mentions: list[str]) -> list[str]:
    """Return, sorted, the documents that appear to challenge *concept*.

    For each mentioning document other than the one that introduced the
    concept, a search corpus is assembled from its "Relationship to Prior
    Papers" section, three fixed "Open Questions" section headings, and the
    first 3000 characters of the document. The document is flagged when that
    corpus contains both one of the concept's names and one of the critique
    keywords. The two hits are not required to be near each other, and the
    keywords are matched as substrings, so this is a coarse heuristic.
    """
    terms = [concept["name"]] + concept["aliases"]
    critique_signal = re.compile(
        r"challenge|critic|rebuttal|unfalsif|weak|bounded|downgrade|unknown",
        re.IGNORECASE,
    )
    question_headings = (
        "Open Questions",
        "Open Questions for Paper 007",
        "Open Questions for Paper 009",
    )

    flagged: set[str] = set()
    for doc_id in mentions:
        if doc_id == concept["introduced_in"]:
            continue
        body = docs[doc_id].text
        parts = [extract_section(body, "Relationship to Prior Papers")]
        parts.extend(extract_section(body, heading) for heading in question_headings)
        parts.append(body[:3000])
        haystack = "\n".join(parts)

        if critique_signal.search(haystack) is None:
            continue
        if any(alias_present(haystack, term) for term in terms):
            flagged.add(doc_id)

    return sorted(flagged)


def mermaid_id(name: str) -> str:
    """Derive a Mermaid node identifier from a concept name.

    The name is lower-cased, every run of characters outside ``a-z0-9`` is
    replaced by a single underscore, leading and trailing underscores are
    removed, and the result is prefixed with ``c_``.
    """
    slug = re.sub(r"[^a-z0-9]+", "_", name.lower())
    return f"c_{slug.strip('_')}"


def build_index() -> dict[str, list[dict]]:
    """Assemble the concept index from the corpus and the ``CONCEPTS`` table.

    Returns:
        ``{"concepts": [...]}`` with one entry per catalogue concept, in
        catalogue order. Each entry carries the concept metadata, a definition
        sentence taken from the introducing document, and the ids of the
        documents that revise, challenge, or merely reference it.

    Raises:
        KeyError: if a concept's ``introduced_in`` id was not loaded.
    """
    docs = load_documents()
    entries: list[dict] = []

    for concept in CONCEPTS:
        origin = concept["introduced_in"]
        # The canonical name is searched for alongside the listed aliases.
        search_terms = [concept["name"]] + concept["aliases"]
        source_doc = docs[origin]

        mentioned = find_mentions(docs, search_terms)
        revised = find_revisions(docs, concept, mentioned)
        challenged = find_challenges(docs, concept, mentioned)
        # Plain references exclude the origin document and any revisions.
        referenced = [
            doc_id
            for doc_id in mentioned
            if doc_id != origin and doc_id not in revised
        ]
        definition = first_matching_sentence(source_doc, search_terms)

        entry = {
            "name": concept["name"],
            "aliases": sorted(set(concept["aliases"])),
            "introduced_in": origin,
            "definition": definition,
            "revised_in": revised,
            "challenged_in": challenged,
            "referenced_in": referenced,
            "status": concept["status"],
            "related_concepts": concept["related_concepts"],
        }
        entries.append(entry)

    return {"concepts": entries}


def write_glossary(index: dict[str, list[dict]]) -> None:
    """Write ``glossary.md`` into ``OUT_DIR``.

    Concepts appear alphabetically by name (case-insensitive). Each gets a
    level-2 heading followed by one line per field and a trailing blank line.
    List-valued fields are comma-separated, or ``None`` when empty.
    """

    def listing(values: list[str]) -> str:
        # Render a list as "a, b, c", or the literal word None when empty.
        return ", ".join(values) if values else "None"

    out = ["# VIBECODE-THEORY Glossary", ""]
    for entry in sorted(index["concepts"], key=lambda e: e["name"].lower()):
        out.append(f"## {entry['name']}")
        out.append(f"Origin: {entry['introduced_in']}")
        out.append(f"Status: {entry['status']}")
        out.append(f"Aliases: {listing(entry['aliases'])}")
        out.append(entry["definition"])
        out.append(f"Revised in: {listing(entry['revised_in'])}")
        out.append(f"Challenged in: {listing(entry['challenged_in'])}")
        out.append(f"Referenced in: {listing(entry['referenced_in'])}")
        out.append(f"Related concepts: {listing(entry['related_concepts'])}")
        out.append("")

    glossary_path = OUT_DIR / "glossary.md"
    glossary_path.write_text("\n".join(out), encoding="utf-8")


def write_mermaid(index: dict[str, list[dict]]) -> None:
    """Write ``concept_map.mermaid`` into ``OUT_DIR``.

    The file is a ``graph TD`` diagram: one labelled node per concept, then
    one "relates to" edge per related pair. A pair is drawn only once, in the
    direction in which it is first encountered, even if both concepts list
    each other.
    """
    concepts = index["concepts"]
    out = ["graph TD"]

    # Node declarations.
    for concept in concepts:
        label = concept["name"]
        out.append(f'    {mermaid_id(label)}["{label}"]')

    # Edges, de-duplicated on the unordered pair of names.
    drawn: set[tuple[str, str]] = set()
    for concept in concepts:
        source = concept["name"]
        for target in concept["related_concepts"]:
            pair = tuple(sorted((source, target)))
            if pair not in drawn:
                drawn.add(pair)
                out.append(
                    f"    {mermaid_id(source)} -->|relates to| {mermaid_id(target)}"
                )

    map_path = OUT_DIR / "concept_map.mermaid"
    map_path.write_text("\n".join(out) + "\n", encoding="utf-8")


def main() -> None:
    """Build the index, write all three artifacts, and report what was written.

    ``index.json``, ``glossary.md`` and ``concept_map.mermaid`` are all
    written into ``OUT_DIR``.
    """
    index = build_index()

    index_path = OUT_DIR / "index.json"
    index_path.write_text(json.dumps(index, indent=2) + "\n", encoding="utf-8")
    write_glossary(index)
    write_mermaid(index)

    print(f"Indexed {len(index['concepts'])} concepts.")
    for artifact in ("index.json", "glossary.md", "concept_map.mermaid"):
        print(f"Wrote {OUT_DIR / artifact}")


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()