#!/usr/bin/env python3 """Build cross-reference artifacts for the VIBECODE-THEORY paper series. Outputs: - graph.json - graph.mermaid - dangling_threads.md - concept_flow.md """ from __future__ import annotations import json import re from collections import defaultdict from dataclasses import dataclass from pathlib import Path from typing import Iterable ROOT = Path(__file__).resolve().parents[2] OUT_DIR = Path(__file__).resolve().parent PAPER_GLOB = "00*-*.md" ALLEGORY_GLOB = "allegorical/*.md" STOPWORDS = { "about", "after", "again", "also", "because", "between", "could", "does", "from", "have", "into", "just", "like", "might", "more", "most", "over", "paper", "question", "series", "should", "than", "that", "their", "them", "then", "this", "those", "through", "what", "when", "which", "with", "would", } RELATION_PRIORITY = { "supersedes": 8, "refutes": 7, "challenges": 6, "revises": 5, "extends": 4, "addresses": 3, "introduces concept used by": 2, "references": 1, } CONCEPT_CATALOG = { "vibe coding as social skill": { "aliases": ["vibe coding", "social skill", "meta-skill"], "intro": "001", }, "cognitive surplus": { "aliases": ["cognitive surplus", "surplus"], "intro": "002", }, "dependency trap": { "aliases": ["dependency trap", "systemic dependency"], "intro": "002", }, "cognitive preference shift": { "aliases": ["cognitive preference shift", "preference shift"], "intro": "005", }, "automation spiral": { "aliases": ["automation spiral"], "intro": "003", }, "feedback loop": { "aliases": ["feedback loop", "uncomfortable middle"], "intro": "006", }, "biological ratchet": { "aliases": ["biological ratchet", "ratchet"], "intro": "007", }, "infrastructure threshold": { "aliases": ["infrastructure threshold", "application phase"], "intro": "007", }, "premature dependency hibernation": { "aliases": ["premature dependencies", "hibernation"], "intro": "007", }, "knowledge unification": { "aliases": ["knowledge unification", "defragmentation"], "intro": "008", }, "ship of theseus identity problem": { "aliases": ["ship of theseus", "species identity"], "intro": "008", }, "cheating frame": { "aliases": ['"cheating"', "cheating frame"], "intro": "008", }, "dependency chain": { "aliases": ["dependency chain"], "intro": "007", }, } @dataclass class Document: doc_id: str title: str kind: str path: Path text: str def read_documents() -> list[Document]: docs: list[Document] = [] for path in sorted(ROOT.glob(PAPER_GLOB)): text = path.read_text(encoding="utf-8") m = re.search(r"^#\s+Paper\s+(\d{3}):\s*(.+)$", text, flags=re.M) if m: doc_id, title = m.group(1), m.group(2).strip() else: doc_id = path.name.split("-", 1)[0] title = path.stem docs.append(Document(doc_id=doc_id, title=title, kind="paper", path=path, text=text)) for path in sorted(ROOT.glob(ALLEGORY_GLOB)): text = path.read_text(encoding="utf-8") m = re.search(r"^#\s+(.+)$", text, flags=re.M) title = m.group(1).strip() if m else path.stem.replace("-", " ").title() docs.append( Document( doc_id=f"A:{path.stem}", title=title, kind="allegory", path=path, text=text, ) ) return docs def sentence_chunks(text: str) -> Iterable[str]: for chunk in re.split(r"(?<=[.!?])\s+|\n{2,}", text): cleaned = " ".join(chunk.strip().split()) if cleaned: yield cleaned def classify_relationship(text: str) -> str: lower = text.lower() if "supersed" in lower: return "supersedes" if any(k in lower for k in ("refute", "rebuttal", "against")): return "refutes" if any(k in lower for k in ("challenge", "critic", "unfalsifiable")): return "challenges" if "revis" in lower: return "revises" if "extend" in lower: return "extends" if any(k in lower for k in ("respond", "address", "engage")): return "addresses" return "references" def find_paper_targets(text: str) -> list[str]: if "paper" not in text.lower(): return [] return sorted(set(re.findall(r"\b00[1-8]\b", text))) def add_edge(edges: dict[tuple[str, str], dict], source: str, target: str, edge_type: str, context: str) -> None: if source == target: return key = (source, target) candidate = {"source": source, "target": target, "type": edge_type, "context": context} existing = edges.get(key) if not existing: edges[key] = candidate return if RELATION_PRIORITY[edge_type] > RELATION_PRIORITY[existing["type"]]: edges[key] = candidate def extract_explicit_edges(docs: list[Document]) -> dict[tuple[str, str], dict]: edges: dict[tuple[str, str], dict] = {} allegory_name_to_id = {doc.path.stem.replace("-", " "): doc.doc_id for doc in docs if doc.kind == "allegory"} for doc in docs: for sent in sentence_chunks(doc.text): targets = find_paper_targets(sent) if targets: rel = classify_relationship(sent) for target in targets: add_edge(edges, doc.doc_id, target, rel, sent[:220]) if doc.kind == "paper" and doc.doc_id == "007": lower = doc.text.lower() for name, target_id in allegory_name_to_id.items(): if name in lower: add_edge( edges, doc.doc_id, target_id, "extends", f"Paper 007 explicitly maps the {name.title()} allegory into the ratchet framework.", ) if doc.kind == "allegory": for sent in sentence_chunks(doc.text): targets = find_paper_targets(sent) for target in targets: add_edge(edges, doc.doc_id, target, "addresses", sent[:220]) return edges def collect_concept_presence(docs: list[Document]) -> tuple[dict[str, str], dict[str, set[str]]]: intro: dict[str, str] = {} usage: dict[str, set[str]] = defaultdict(set) ordered = sorted([d for d in docs if d.kind == "paper"], key=lambda d: d.doc_id) + [ d for d in docs if d.kind == "allegory" ] for doc in ordered: lower = doc.text.lower() for concept, info in CONCEPT_CATALOG.items(): aliases = info["aliases"] if any(alias.lower() in lower for alias in aliases): usage[concept].add(doc.doc_id) expected_intro = info["intro"] if expected_intro in {d.doc_id for d in docs if d.kind == "paper"}: intro.setdefault(concept, expected_intro) else: intro.setdefault(concept, doc.doc_id) return intro, usage def extract_implicit_edges( docs: list[Document], intro: dict[str, str], usage: dict[str, set[str]], edges: dict[tuple[str, str], dict] ) -> None: for concept, source in intro.items(): if not re.match(r"^00[1-8]$", source): continue for target in sorted(usage[concept]): if target == source or not re.match(r"^00[1-8]$", target): continue if target <= source: continue add_edge( edges, source, target, "introduces concept used by", f"{concept} appears first in {source} and recurs in {target}.", ) def build_nodes(docs: list[Document], intro: dict[str, str]) -> list[dict]: concept_by_doc: dict[str, list[str]] = defaultdict(list) for concept, doc_id in intro.items(): concept_by_doc[doc_id].append(concept) nodes: list[dict] = [] for doc in sorted(docs, key=lambda d: (d.kind != "paper", d.doc_id)): nodes.append( { "id": doc.doc_id, "title": doc.title, "kind": doc.kind, "concepts_introduced": sorted(concept_by_doc.get(doc.doc_id, [])), } ) return nodes def write_mermaid(nodes: list[dict], edges: list[dict]) -> None: def mm_id(node_id: str) -> str: return re.sub(r"[^A-Za-z0-9_]", "_", node_id) lines = ["graph TD"] for node in nodes: nid = mm_id(node["id"]) label = f'{node["id"]}: {node["title"]}' lines.append(f' {nid}["{label}"]') for edge in edges: src = mm_id(edge["source"]) dst = mm_id(edge["target"]) rel = edge["type"].replace('"', "") lines.append(f" {src} -->|{rel}| {dst}") (OUT_DIR / "graph.mermaid").write_text("\n".join(lines) + "\n", encoding="utf-8") def extract_open_questions(paper: Document) -> list[str]: lines = paper.text.splitlines() start = None for i, line in enumerate(lines): if line.strip().lower().startswith("## open questions"): start = i + 1 break if start is None: return [] questions: list[str] = [] for line in lines[start:]: if line.startswith("## "): break stripped = line.strip() if re.match(r"^(\d+\.|-)\s+", stripped): body = re.sub(r"^(\d+\.|-)\s+", "", stripped).strip() if body: questions.append(body) return questions def question_keywords(text: str) -> set[str]: words = re.findall(r"[A-Za-z][A-Za-z\-]{3,}", text.lower()) return {w for w in words if w not in STOPWORDS} def build_dangling_threads(papers: list[Document]) -> str: paper_map = {p.doc_id: p for p in papers} ordered_ids = sorted(paper_map.keys()) lines = ["# Dangling Threads", ""] found_any = False for doc_id in ordered_ids: paper = paper_map[doc_id] questions = extract_open_questions(paper) later = [paper_map[i] for i in ordered_ids if i > doc_id] for question in questions: kws = question_keywords(question) hits: list[str] = [] if kws: for other in later: lower = other.text.lower() overlap = sum(1 for kw in kws if kw in lower) if overlap >= 2: hits.append(other.doc_id) found_any = True if hits: lines.append( f"- Raised in **Paper {doc_id}**: {question} \n" f" Partially addressed in later papers: {', '.join(f'Paper {h}' for h in hits)}." ) else: lines.append( f"- Raised in **Paper {doc_id}**: {question} \n" " Partially addressed in later papers: none detected." ) if not found_any: lines.append("- No open-question sections were detected in the source files.") lines.append("") return "\n".join(lines) def build_concept_flow( papers: list[Document], intro: dict[str, str], usage: dict[str, set[str]], explicit_edges: list[dict] ) -> str: lines = ["# Concept Flow", ""] paper_ids = sorted(p.doc_id for p in papers) paper_map = {p.doc_id: p for p in papers} for concept in sorted(CONCEPT_CATALOG.keys()): introduced = intro.get(concept, "unknown") used_in = sorted(d for d in usage.get(concept, set()) if d in paper_ids) aliases = CONCEPT_CATALOG[concept]["aliases"] challenged: set[str] = set() revised: set[str] = set() for doc_id in used_in: has_concept_sentence = False for sent in sentence_chunks(paper_map[doc_id].text): lower_sent = sent.lower() if not any(a.lower() in lower_sent for a in aliases): continue has_concept_sentence = True if any(k in lower_sent for k in ("challenge", "critic", "rebuttal", "against", "unfalsifiable")): challenged.add(doc_id) if any(k in lower_sent for k in ("revision", "revised", "supersedes", "responds", "extends")): revised.add(doc_id) if not has_concept_sentence: continue challenged_list = sorted(challenged) revised_list = sorted(revised) current = used_in[-1] if used_in else "unknown" lines.append(f"## {concept.title()}") lines.append(f"- Introduced in: Paper {introduced}" if introduced != "unknown" else "- Introduced in: unknown") lines.append( f"- Challenged in: {', '.join(f'Paper {p}' for p in challenged_list)}" if challenged_list else "- Challenged in: none detected" ) lines.append( f"- Revised in: {', '.join(f'Paper {p}' for p in revised_list)}" if revised_list else "- Revised in: none detected" ) lines.append( f"- Referenced in: {', '.join(f'Paper {p}' for p in used_in)}" if used_in else "- Referenced in: none detected" ) lines.append(f"- Current standing: active in latest mention (Paper {current})." if current != "unknown" else "- Current standing: unclear.") lines.append("") return "\n".join(lines) def main() -> None: docs = read_documents() papers = [d for d in docs if d.kind == "paper"] intro, usage = collect_concept_presence(docs) edge_map = extract_explicit_edges(docs) extract_implicit_edges(docs, intro, usage, edge_map) nodes = build_nodes(docs, intro) edges = sorted(edge_map.values(), key=lambda e: (e["source"], e["target"], e["type"])) graph = {"nodes": nodes, "edges": edges} (OUT_DIR / "graph.json").write_text(json.dumps(graph, indent=2) + "\n", encoding="utf-8") write_mermaid(nodes, edges) dangling = build_dangling_threads(papers) (OUT_DIR / "dangling_threads.md").write_text(dangling, encoding="utf-8") flow = build_concept_flow(papers, intro, usage, edges) (OUT_DIR / "concept_flow.md").write_text(flow, encoding="utf-8") print(f"Wrote {OUT_DIR / 'graph.json'}") print(f"Wrote {OUT_DIR / 'graph.mermaid'}") print(f"Wrote {OUT_DIR / 'dangling_threads.md'}") print(f"Wrote {OUT_DIR / 'concept_flow.md'}") if __name__ == "__main__": main()