# VIBECODE-THEORY/tools/cross-references/build_cross_references.py

#!/usr/bin/env python3
"""Build cross-reference artifacts for the VIBECODE-THEORY paper series.

Outputs:
- graph.json
- graph.mermaid
- dangling_threads.md
- concept_flow.md
"""

from __future__ import annotations

import json
import re
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable


# Directory searched for source markdown: two levels above the directory that
# contains this script.
ROOT = Path(__file__).resolve().parents[2]
# Directory that receives the generated artifacts: the script's own directory.
OUT_DIR = Path(__file__).resolve().parent

# Glob patterns, evaluated relative to ROOT, selecting paper and allegory files.
PAPER_GLOB = "00*-*.md"
ALLEGORY_GLOB = "allegorical/*.md"

# Lower-case words that question_keywords() discards when reducing an open
# question to its keywords. Every entry has at least four letters because the
# keyword regex never yields shorter tokens.
STOPWORDS = {
    "about",
    "after",
    "again",
    "also",
    "because",
    "between",
    "could",
    "does",
    "from",
    "have",
    "into",
    "just",
    "like",
    "might",
    "more",
    "most",
    "over",
    "paper",
    "question",
    "series",
    "should",
    "than",
    "that",
    "their",
    "them",
    "then",
    "this",
    "those",
    "through",
    "what",
    "when",
    "which",
    "with",
    "would",
}

# Rank of each edge type. add_edge() keeps a single edge per (source, target)
# pair and only replaces it when the new edge's type ranks strictly higher.
RELATION_PRIORITY = {
    "supersedes": 8,
    "refutes": 7,
    "challenges": 6,
    "revises": 5,
    "extends": 4,
    "addresses": 3,
    "introduces concept used by": 2,
    "references": 1,
}

# Concepts tracked across the series, keyed by canonical concept name.
#   "aliases": phrases whose case-insensitive substring presence in a document
#              counts as a use of the concept.
#   "intro":   id of the paper recorded as introducing the concept, provided a
#              paper with that id was loaded (see collect_concept_presence).
CONCEPT_CATALOG = {
    "vibe coding as social skill": {
        "aliases": ["vibe coding", "social skill", "meta-skill"],
        "intro": "001",
    },
    "cognitive surplus": {
        "aliases": ["cognitive surplus", "surplus"],
        "intro": "002",
    },
    "dependency trap": {
        "aliases": ["dependency trap", "systemic dependency"],
        "intro": "002",
    },
    "cognitive preference shift": {
        "aliases": ["cognitive preference shift", "preference shift"],
        "intro": "005",
    },
    "automation spiral": {
        "aliases": ["automation spiral"],
        "intro": "003",
    },
    "feedback loop": {
        "aliases": ["feedback loop", "uncomfortable middle"],
        "intro": "006",
    },
    "biological ratchet": {
        "aliases": ["biological ratchet", "ratchet"],
        "intro": "007",
    },
    "infrastructure threshold": {
        "aliases": ["infrastructure threshold", "application phase"],
        "intro": "007",
    },
    "premature dependency hibernation": {
        "aliases": ["premature dependencies", "hibernation"],
        "intro": "007",
    },
    "knowledge unification": {
        "aliases": ["knowledge unification", "defragmentation"],
        "intro": "008",
    },
    "ship of theseus identity problem": {
        "aliases": ["ship of theseus", "species identity"],
        "intro": "008",
    },
    "cheating frame": {
        "aliases": ['"cheating"', "cheating frame"],
        "intro": "008",
    },
    "dependency chain": {
        "aliases": ["dependency chain"],
        "intro": "007",
    },
}


@dataclass
class Document:
    """One markdown source file (paper or allegory) together with its contents."""

    doc_id: str  # paper id such as "001", or "A:<file stem>" for an allegory
    title: str  # heading text when one was found, otherwise derived from the filename
    kind: str  # "paper" or "allegory"
    path: Path  # file the document was read from
    text: str  # full file contents


def read_documents() -> list[Document]:
    """Load all paper and allegory markdown files found under ROOT.

    Papers (files matching PAPER_GLOB) come first, followed by allegories
    (files matching ALLEGORY_GLOB); each group is in sorted path order.

    A paper takes its id and title from a ``# Paper NNN: Title`` heading when
    one exists; otherwise the id is the filename part before the first hyphen
    and the title is the file stem. An allegory gets the id ``A:<stem>`` and
    takes its title from the first ``# `` heading, falling back to the
    title-cased stem with hyphens replaced by spaces.

    Returns:
        The loaded documents as a list of Document records.
    """
    paper_heading = re.compile(r"^#\s+Paper\s+(\d{3}):\s*(.+)$", flags=re.M)
    any_heading = re.compile(r"^#\s+(.+)$", flags=re.M)

    loaded: list[Document] = []

    for md_path in sorted(ROOT.glob(PAPER_GLOB)):
        body = md_path.read_text(encoding="utf-8")
        heading = paper_heading.search(body)
        if heading is None:
            # No recognisable heading: fall back to what the filename offers.
            paper_id, paper_title = md_path.name.split("-", 1)[0], md_path.stem
        else:
            paper_id, paper_title = heading.group(1), heading.group(2).strip()
        loaded.append(
            Document(doc_id=paper_id, title=paper_title, kind="paper", path=md_path, text=body)
        )

    for md_path in sorted(ROOT.glob(ALLEGORY_GLOB)):
        body = md_path.read_text(encoding="utf-8")
        heading = any_heading.search(body)
        if heading is None:
            allegory_title = md_path.stem.replace("-", " ").title()
        else:
            allegory_title = heading.group(1).strip()
        loaded.append(
            Document(
                doc_id=f"A:{md_path.stem}",
                title=allegory_title,
                kind="allegory",
                path=md_path,
                text=body,
            )
        )

    return loaded


def sentence_chunks(text: str) -> Iterable[str]:
    """Yield sentence-like pieces of *text* with their whitespace collapsed.

    The text is cut after every ``.``, ``!`` or ``?`` that is followed by
    whitespace, and at every run of two or more newlines. Inside each piece,
    runs of whitespace become a single space; pieces that end up empty are
    not yielded.
    """
    pieces = re.split(r"(?<=[.!?])\s+|\n{2,}", text)
    # str.split() with no argument already ignores leading/trailing whitespace.
    collapsed = (" ".join(piece.split()) for piece in pieces)
    yield from (piece for piece in collapsed if piece)


def classify_relationship(text: str) -> str:
    """Return the relation type suggested by cue words in *text*.

    Matching is a case-insensitive substring test. The rules are tried from
    top to bottom and the first rule with any matching cue wins; when no cue
    matches the relation is ``"references"``.
    """
    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("supersedes", ("supersed",)),
        ("refutes", ("refute", "rebuttal", "against")),
        ("challenges", ("challenge", "critic", "unfalsifiable")),
        ("revises", ("revis",)),
        ("extends", ("extend",)),
        ("addresses", ("respond", "address", "engage")),
    )
    haystack = text.lower()
    for relation, cues in rules:
        for cue in cues:
            if cue in haystack:
                return relation
    return "references"


def find_paper_targets(text: str) -> list[str]:
    """Return the sorted, de-duplicated paper ids (001-008) cited in *text*.

    Ids are only looked for when the text contains the word "paper" in any
    letter case; otherwise the result is an empty list. An id must stand
    alone as a whole token to count.
    """
    mentions_paper = "paper" in text.lower()
    if mentions_paper:
        unique_ids = {hit.group(0) for hit in re.finditer(r"\b00[1-8]\b", text)}
        return sorted(unique_ids)
    return []


def add_edge(edges: dict[tuple[str, str], dict], source: str, target: str, edge_type: str, context: str) -> None:
    """Record an edge from *source* to *target* in *edges*, in place.

    Self-loops are ignored. At most one edge is kept per (source, target)
    pair: an existing edge is replaced only when *edge_type* ranks strictly
    higher in RELATION_PRIORITY than the stored type, so on a tie the edge
    recorded first (and its context) is kept.
    """
    if source != target:
        slot = (source, target)
        current = edges.get(slot)
        # The priority table is consulted only when an edge is already stored.
        if not current or RELATION_PRIORITY[edge_type] > RELATION_PRIORITY[current["type"]]:
            edges[slot] = {
                "source": source,
                "target": target,
                "type": edge_type,
                "context": context,
            }


def extract_explicit_edges(docs: list[Document]) -> dict[tuple[str, str], dict]:
    """Build edges from the paper ids that documents cite in their own text.

    For each document, three steps run in this order:

    1. Every sentence citing paper ids yields one edge per cited id, typed by
       classify_relationship() on that sentence.
    2. For paper "007" only, an "extends" edge is added to each allegory whose
       name (file stem with hyphens as spaces) occurs in the lower-cased text.
    3. For allegories only, every cited id is offered again as "addresses".

    Args:
        docs: Documents to scan.

    Returns:
        Mapping from (source id, target id) to the edge dict kept by
        add_edge(); contexts are the citing sentence cut to 220 characters.
    """
    found: dict[tuple[str, str], dict] = {}

    allegory_ids_by_name: dict[str, str] = {}
    for candidate in docs:
        if candidate.kind == "allegory":
            allegory_ids_by_name[candidate.path.stem.replace("-", " ")] = candidate.doc_id

    for doc in docs:
        origin = doc.doc_id

        for sentence in sentence_chunks(doc.text):
            cited = find_paper_targets(sentence)
            if not cited:
                continue
            relation = classify_relationship(sentence)
            snippet = sentence[:220]
            for paper_id in cited:
                add_edge(found, origin, paper_id, relation, snippet)

        if doc.kind == "paper" and origin == "007":
            haystack = doc.text.lower()
            for allegory_name, allegory_id in allegory_ids_by_name.items():
                if allegory_name not in haystack:
                    continue
                add_edge(
                    found,
                    origin,
                    allegory_id,
                    "extends",
                    f"Paper 007 explicitly maps the {allegory_name.title()} allegory into the ratchet framework.",
                )

        if doc.kind == "allegory":
            # Deliberately a second full pass rather than part of the first
            # loop: add_edge() keeps the earlier edge on a priority tie, so
            # interleaving the two passes could change which sentence ends up
            # as the stored context.
            for sentence in sentence_chunks(doc.text):
                for paper_id in find_paper_targets(sentence):
                    add_edge(found, origin, paper_id, "addresses", sentence[:220])

    return found


def collect_concept_presence(docs: list[Document]) -> tuple[dict[str, str], dict[str, set[str]]]:
    """Work out where each catalogued concept is introduced and where it is used.

    Documents are scanned with papers first (sorted by id) and allegories
    after them. A document uses a concept when any of the concept's aliases
    occurs in its lower-cased text.

    Args:
        docs: All loaded documents; kinds other than "paper" and "allegory"
            are ignored.

    Returns:
        ``(intro, usage)``. ``intro`` maps a concept to the catalog's "intro"
        paper id when a paper with that id was loaded, otherwise to the first
        scanned document that uses the concept. ``usage`` maps a concept to
        the ids of every document that uses it. Concepts that are never
        matched appear in neither mapping.
    """
    intro: dict[str, str] = {}
    usage: dict[str, set[str]] = defaultdict(set)

    papers = sorted((d for d in docs if d.kind == "paper"), key=lambda d: d.doc_id)
    allegories = [d for d in docs if d.kind == "allegory"]

    # Loop-invariant, so build it once rather than once per concept hit.
    known_paper_ids = {d.doc_id for d in papers}

    for doc in papers + allegories:
        lower = doc.text.lower()
        for concept, info in CONCEPT_CATALOG.items():
            if not any(alias.lower() in lower for alias in info["aliases"]):
                continue
            usage[concept].add(doc.doc_id)
            expected_intro = info["intro"]
            # setdefault: only the first scanned document using the concept
            # decides the fallback introduction.
            if expected_intro in known_paper_ids:
                intro.setdefault(concept, expected_intro)
            else:
                intro.setdefault(concept, doc.doc_id)
    return intro, usage


def extract_implicit_edges(
    docs: list[Document], intro: dict[str, str], usage: dict[str, set[str]], edges: dict[tuple[str, str], dict]
) -> None:
    """Add "introduces concept used by" edges to *edges*, in place.

    For every concept introduced by a paper with an id in 001-008, an edge is
    offered from that paper to each later paper (id in 001-008 and greater
    than the introducing id) that uses the concept.

    Args:
        docs: Not read by this function.
        intro: Concept -> id of the introducing document.
        usage: Concept -> ids of documents using the concept.
        edges: Edge mapping updated through add_edge().
    """
    paper_id = re.compile(r"^00[1-8]$")

    for concept, source in intro.items():
        if paper_id.match(source) is None:
            continue
        later_papers = [
            target
            for target in sorted(usage[concept])
            if paper_id.match(target) and target > source
        ]
        for target in later_papers:
            add_edge(
                edges,
                source,
                target,
                "introduces concept used by",
                f"{concept} appears first in {source} and recurs in {target}.",
            )


def build_nodes(docs: list[Document], intro: dict[str, str]) -> list[dict]:
    """Return one graph node dict per document.

    Nodes are ordered papers first, then everything else, each group sorted
    by document id. Every node has the keys "id", "title", "kind" and
    "concepts_introduced", the last being the sorted concepts that *intro*
    attributes to that document.
    """
    introduced_by: dict[str, list[str]] = defaultdict(list)
    for concept, doc_id in intro.items():
        introduced_by[doc_id].append(concept)

    def papers_first(doc):
        # False sorts before True, which puts papers ahead of other kinds.
        return (doc.kind != "paper", doc.doc_id)

    return [
        {
            "id": doc.doc_id,
            "title": doc.title,
            "kind": doc.kind,
            "concepts_introduced": sorted(introduced_by.get(doc.doc_id, [])),
        }
        for doc in sorted(docs, key=papers_first)
    ]


def write_mermaid(nodes: list[dict], edges: list[dict]) -> None:
    """Write the graph as a Mermaid flowchart to ``OUT_DIR / "graph.mermaid"``.

    Args:
        nodes: Node dicts; the "id" and "title" keys are read.
        edges: Edge dicts; the "source", "target" and "type" keys are read.

    Node ids are reduced to letters, digits and underscores so they are valid
    Mermaid identifiers. Labels are emitted inside double quotes, so any
    double quote in a title is written as the ``#quot;`` entity; a raw quote
    would end the label early and leave the diagram unparseable.
    """

    def mm_id(node_id: str) -> str:
        return re.sub(r"[^A-Za-z0-9_]", "_", node_id)

    def mm_label(text: str) -> str:
        return text.replace('"', "#quot;")

    lines = ["graph TD"]
    for node in nodes:
        nid = mm_id(node["id"])
        label = mm_label(f'{node["id"]}: {node["title"]}')
        lines.append(f'    {nid}["{label}"]')
    for edge in edges:
        src = mm_id(edge["source"])
        dst = mm_id(edge["target"])
        # Edge labels sit between pipes, unquoted; quotes are simply dropped.
        rel = edge["type"].replace('"', "")
        lines.append(f"    {src} -->|{rel}| {dst}")
    (OUT_DIR / "graph.mermaid").write_text("\n".join(lines) + "\n", encoding="utf-8")


def extract_open_questions(paper: Document) -> list[str]:
    """Return the list items found under the paper's "## Open Questions" heading.

    The section starts after the first line that, once stripped and
    lower-cased, begins with "## open questions", and ends at the next line
    starting with "## ". Within it, lines that (after stripping) begin with
    "<digits>." or "-" followed by whitespace are collected with that marker
    removed. An empty list is returned when the heading is absent.
    """
    bullet = re.compile(r"^(\d+\.|-)\s+")
    rows = paper.text.splitlines()

    section_start = next(
        (
            idx + 1
            for idx, row in enumerate(rows)
            if row.strip().lower().startswith("## open questions")
        ),
        None,
    )
    if section_start is None:
        return []

    collected: list[str] = []
    for row in rows[section_start:]:
        if row.startswith("## "):
            break
        candidate = row.strip()
        marker = bullet.match(candidate)
        if marker is None:
            continue
        remainder = candidate[marker.end():].strip()
        if remainder:
            collected.append(remainder)
    return collected


def question_keywords(text: str) -> set[str]:
    """Return the distinct lower-cased keywords of *text*.

    A keyword is a token of at least four characters that starts with a
    letter and continues with letters or hyphens, and that is not listed in
    STOPWORDS.
    """
    keywords: set[str] = set()
    for token in re.finditer(r"[A-Za-z][A-Za-z\-]{3,}", text.lower()):
        word = token.group(0)
        if word in STOPWORDS:
            continue
        keywords.add(word)
    return keywords


def build_dangling_threads(papers: list[Document]) -> str:
    """Render the "Dangling Threads" markdown report.

    Each open question of each paper (papers taken in id order) becomes one
    bullet. A later paper counts as partially addressing a question when at
    least two of the question's keywords occur as substrings of that paper's
    lower-cased text. If no open question is found in any paper, a single
    bullet saying so is written instead.

    Returns:
        The report as one string ending in a newline.
    """
    by_id = {paper.doc_id: paper for paper in papers}
    sorted_ids = sorted(by_id)
    report = ["# Dangling Threads", ""]
    question_count = 0

    for position, doc_id in enumerate(sorted_ids):
        # Ids are unique and sorted, so everything after this position is later.
        successors = [by_id[later_id] for later_id in sorted_ids[position + 1:]]

        for question in extract_open_questions(by_id[doc_id]):
            keywords = question_keywords(question)
            matched: list[str] = []
            for other in successors if keywords else ():
                haystack = other.text.lower()
                if sum(1 for kw in keywords if kw in haystack) >= 2:
                    matched.append(other.doc_id)
            question_count += 1

            first_line = f"- Raised in **Paper {doc_id}**: {question}  \n"
            if matched:
                second_line = (
                    f"  Partially addressed in later papers: {', '.join(f'Paper {h}' for h in matched)}."
                )
            else:
                second_line = "  Partially addressed in later papers: none detected."
            report.append(first_line + second_line)

    if question_count == 0:
        report.append("- No open-question sections were detected in the source files.")
    report.append("")
    return "\n".join(report)


def build_concept_flow(
    papers: list[Document], intro: dict[str, str], usage: dict[str, set[str]], explicit_edges: list[dict]
) -> str:
    """Render the "Concept Flow" markdown report, one section per catalogued concept.

    For each concept in CONCEPT_CATALOG (sorted by name) the section lists
    where it is introduced, which papers challenge or revise it, which papers
    reference it, and the latest paper mentioning it. A paper challenges or
    revises a concept when one of its sentences contains both an alias of the
    concept and one of the respective cue words.

    Args:
        papers: Paper documents; only these are considered as users.
        intro: Concept -> id of the introducing document.
        usage: Concept -> ids of documents using the concept.
        explicit_edges: Kept for interface compatibility; not read.

    Returns:
        The report as a single markdown string.
    """
    challenge_cues = ("challenge", "critic", "rebuttal", "against", "unfalsifiable")
    revision_cues = ("revision", "revised", "supersedes", "responds", "extends")

    def paper_list(ids: list[str]) -> str:
        return ", ".join(f"Paper {p}" for p in ids)

    lines = ["# Concept Flow", ""]
    paper_map = {p.doc_id: p for p in papers}

    for concept in sorted(CONCEPT_CATALOG):
        introduced = intro.get(concept, "unknown")
        # Dict membership replaces a scan of a list of paper ids.
        used_in = sorted(d for d in usage.get(concept, set()) if d in paper_map)
        # Lower-case the aliases once per concept instead of once per sentence.
        aliases = [alias.lower() for alias in CONCEPT_CATALOG[concept]["aliases"]]

        challenged: set[str] = set()
        revised: set[str] = set()
        for doc_id in used_in:
            for sent in sentence_chunks(paper_map[doc_id].text):
                lower_sent = sent.lower()
                if not any(alias in lower_sent for alias in aliases):
                    continue
                if any(cue in lower_sent for cue in challenge_cues):
                    challenged.add(doc_id)
                if any(cue in lower_sent for cue in revision_cues):
                    revised.add(doc_id)

        challenged_list = sorted(challenged)
        revised_list = sorted(revised)
        current = used_in[-1] if used_in else "unknown"

        lines.append(f"## {concept.title()}")
        lines.append(f"- Introduced in: Paper {introduced}" if introduced != "unknown" else "- Introduced in: unknown")
        lines.append(
            f"- Challenged in: {paper_list(challenged_list)}"
            if challenged_list
            else "- Challenged in: none detected"
        )
        lines.append(
            f"- Revised in: {paper_list(revised_list)}"
            if revised_list
            else "- Revised in: none detected"
        )
        lines.append(
            f"- Referenced in: {paper_list(used_in)}" if used_in else "- Referenced in: none detected"
        )
        lines.append(
            f"- Current standing: active in latest mention (Paper {current})."
            if current != "unknown"
            else "- Current standing: unclear."
        )
        lines.append("")

    return "\n".join(lines)


def main() -> None:
    """Build every artifact and write it into OUT_DIR.

    Files are written in this order: graph.json, graph.mermaid,
    dangling_threads.md, concept_flow.md. Once all four are written, one
    "Wrote <path>" line per file is printed in the same order.
    """
    corpus = read_documents()

    intro, usage = collect_concept_presence(corpus)
    edge_index = extract_explicit_edges(corpus)
    extract_implicit_edges(corpus, intro, usage, edge_index)

    nodes = build_nodes(corpus, intro)
    edges = list(edge_index.values())
    edges.sort(key=lambda e: (e["source"], e["target"], e["type"]))

    graph_path = OUT_DIR / "graph.json"
    graph_path.write_text(
        json.dumps({"nodes": nodes, "edges": edges}, indent=2) + "\n", encoding="utf-8"
    )
    write_mermaid(nodes, edges)

    papers = [doc for doc in corpus if doc.kind == "paper"]

    threads_path = OUT_DIR / "dangling_threads.md"
    threads_path.write_text(build_dangling_threads(papers), encoding="utf-8")

    flow_path = OUT_DIR / "concept_flow.md"
    flow_path.write_text(build_concept_flow(papers, intro, usage, edges), encoding="utf-8")

    for written in (graph_path, OUT_DIR / "graph.mermaid", threads_path, flow_path):
        print(f"Wrote {written}")


if __name__ == "__main__":
    main()