Files
Mortdecai f654b30de9 docs: integration tools — cross-reference graph, concept index, research digest
Codex-built tooling: cross-reference graph, concept index with build script,
and research integrator that extracted 142 scholars, 175 bibliography items,
4 contradiction topics, and coverage maps for Paper 009 planning.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 08:31:20 -04:00

471 lines
14 KiB
Python

#!/usr/bin/env python3
"""Build cross-reference artifacts for the VIBECODE-THEORY paper series.
Outputs:
- graph.json
- graph.mermaid
- dangling_threads.md
- concept_flow.md
"""
from __future__ import annotations
import json
import re
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
# Base directory that the glob patterns below are resolved against: two
# levels above the directory that contains this script.
ROOT = Path(__file__).resolve().parents[2]
# Generated artifacts are written into the directory that contains this script.
OUT_DIR = Path(__file__).resolve().parent
# Glob patterns (relative to ROOT) selecting paper files and allegory files.
PAPER_GLOB = "00*-*.md"
ALLEGORY_GLOB = "allegorical/*.md"
# Words that question_keywords() discards when turning an open question into
# a keyword set. Only words of four or more characters are ever considered
# there, so shorter words do not need to be listed.
STOPWORDS = {
    "about",
    "after",
    "again",
    "also",
    "because",
    "between",
    "could",
    "does",
    "from",
    "have",
    "into",
    "just",
    "like",
    "might",
    "more",
    "most",
    "over",
    "paper",
    "question",
    "series",
    "should",
    "than",
    "that",
    "their",
    "them",
    "then",
    "this",
    "those",
    "through",
    "what",
    "when",
    "which",
    "with",
    "would",
}
# Rank of every edge type. When add_edge() sees a second edge for the same
# (source, target) pair it replaces the stored edge only if the new type has
# a strictly higher rank.
RELATION_PRIORITY = {
    "supersedes": 8,
    "refutes": 7,
    "challenges": 6,
    "revises": 5,
    "extends": 4,
    "addresses": 3,
    "introduces concept used by": 2,
    "references": 1,
}
# Catalog of tracked concepts, keyed by concept name.
#   "aliases": phrases searched for (lower-cased, as plain substrings) in each
#              document's lower-cased text to decide whether the concept
#              appears there.
#   "intro":   id of the paper recorded as introducing the concept; it is used
#              whenever a paper with that id was loaded.
CONCEPT_CATALOG = {
    "vibe coding as social skill": {
        "aliases": ["vibe coding", "social skill", "meta-skill"],
        "intro": "001",
    },
    "cognitive surplus": {
        "aliases": ["cognitive surplus", "surplus"],
        "intro": "002",
    },
    "dependency trap": {
        "aliases": ["dependency trap", "systemic dependency"],
        "intro": "002",
    },
    "cognitive preference shift": {
        "aliases": ["cognitive preference shift", "preference shift"],
        "intro": "005",
    },
    "automation spiral": {
        "aliases": ["automation spiral"],
        "intro": "003",
    },
    "feedback loop": {
        "aliases": ["feedback loop", "uncomfortable middle"],
        "intro": "006",
    },
    "biological ratchet": {
        "aliases": ["biological ratchet", "ratchet"],
        "intro": "007",
    },
    "infrastructure threshold": {
        "aliases": ["infrastructure threshold", "application phase"],
        "intro": "007",
    },
    "premature dependency hibernation": {
        "aliases": ["premature dependencies", "hibernation"],
        "intro": "007",
    },
    "knowledge unification": {
        "aliases": ["knowledge unification", "defragmentation"],
        "intro": "008",
    },
    "ship of theseus identity problem": {
        "aliases": ["ship of theseus", "species identity"],
        "intro": "008",
    },
    "cheating frame": {
        "aliases": ['"cheating"', "cheating frame"],
        "intro": "008",
    },
    "dependency chain": {
        "aliases": ["dependency chain"],
        "intro": "007",
    },
}
@dataclass
class Document:
    """One source file of the series, loaded into memory."""

    # Node identifier: a paper number such as "001", or "A:<file stem>" for
    # an allegory (see read_documents).
    doc_id: str
    # Human-readable title taken from the file's heading, or derived from the
    # file name when no heading matches.
    title: str
    # Either "paper" or "allegory".
    kind: str
    # Location of the file on disk.
    path: Path
    # Full UTF-8 decoded file contents.
    text: str
def read_documents() -> list[Document]:
    """Load every paper and allegory file found under ``ROOT``.

    Papers (``PAPER_GLOB``) come first, followed by allegories
    (``ALLEGORY_GLOB``); each group is visited in sorted path order.

    Returns:
        One ``Document`` per file, papers before allegories.
    """
    paper_heading = re.compile(r"^#\s+Paper\s+(\d{3}):\s*(.+)$", flags=re.M)
    any_heading = re.compile(r"^#\s+(.+)$", flags=re.M)

    def load_paper(source: Path) -> Document:
        body = source.read_text(encoding="utf-8")
        heading = paper_heading.search(body)
        if heading is None:
            # No "# Paper NNN: Title" line: fall back to the file name.
            number = source.name.split("-", 1)[0]
            name = source.stem
        else:
            number = heading.group(1)
            name = heading.group(2).strip()
        return Document(doc_id=number, title=name, kind="paper", path=source, text=body)

    def load_allegory(source: Path) -> Document:
        body = source.read_text(encoding="utf-8")
        heading = any_heading.search(body)
        if heading is None:
            name = source.stem.replace("-", " ").title()
        else:
            name = heading.group(1).strip()
        return Document(
            doc_id=f"A:{source.stem}",
            title=name,
            kind="allegory",
            path=source,
            text=body,
        )

    loaded = [load_paper(source) for source in sorted(ROOT.glob(PAPER_GLOB))]
    loaded.extend(load_allegory(source) for source in sorted(ROOT.glob(ALLEGORY_GLOB)))
    return loaded
def sentence_chunks(text: str) -> Iterable[str]:
    """Yield sentence-like pieces of ``text`` with whitespace normalised.

    The text is cut after ``.``, ``!`` or ``?`` followed by whitespace, and at
    runs of two or more newlines. Inside each piece every run of whitespace is
    collapsed to a single space; pieces that end up empty are skipped.
    """
    pieces = re.split(r"(?<=[.!?])\s+|\n{2,}", text)
    normalised = (" ".join(piece.split()) for piece in pieces)
    yield from (piece for piece in normalised if piece)
def classify_relationship(text: str) -> str:
    """Pick the relationship label suggested by cue words in ``text``.

    Matching is a case-insensitive substring search. The rules are tried from
    top to bottom and the first rule with a matching cue wins; if no cue is
    present the label is ``"references"``.
    """
    haystack = text.lower()
    rules = (
        ("supersedes", ("supersed",)),
        ("refutes", ("refute", "rebuttal", "against")),
        ("challenges", ("challenge", "critic", "unfalsifiable")),
        ("revises", ("revis",)),
        ("extends", ("extend",)),
        ("addresses", ("respond", "address", "engage")),
    )
    for label, cues in rules:
        for cue in cues:
            if cue in haystack:
                return label
    return "references"
def find_paper_targets(text: str) -> list[str]:
    """Return the sorted, de-duplicated paper numbers mentioned in ``text``.

    Numbers are only collected when the word "paper" occurs somewhere in the
    text (any letter case). Only the stand-alone tokens ``001`` to ``008``
    are recognised.
    """
    mentions_paper = "paper" in text.lower()
    if not mentions_paper:
        return []
    unique_ids = {hit.group(0) for hit in re.finditer(r"\b00[1-8]\b", text)}
    return sorted(unique_ids)
def add_edge(edges: dict[tuple[str, str], dict], source: str, target: str, edge_type: str, context: str) -> None:
    """Record an edge in ``edges``, keeping one edge per (source, target) pair.

    Self-loops are ignored. If the pair already has an edge, it is replaced
    only when ``edge_type`` ranks strictly higher in ``RELATION_PRIORITY``
    than the stored edge's type; otherwise the stored edge is kept.

    Args:
        edges: Mapping from (source, target) to the edge dict; mutated in place.
        source: Id of the node the edge starts at.
        target: Id of the node the edge points to.
        edge_type: Relationship label; must be a key of ``RELATION_PRIORITY``
            whenever the pair already has an edge.
        context: Text stored with the edge.
    """
    if source == target:
        return
    pair = (source, target)
    incumbent = edges.get(pair)
    if incumbent and RELATION_PRIORITY[edge_type] <= RELATION_PRIORITY[incumbent["type"]]:
        # The stored edge is at least as strong: leave it untouched.
        return
    edges[pair] = {"source": source, "target": target, "type": edge_type, "context": context}
def extract_explicit_edges(docs: list[Document]) -> dict[tuple[str, str], dict]:
    """Build edges from explicit paper mentions found in each document.

    For every document:

    1. Each sentence that mentions paper numbers yields an edge to every
       mentioned paper, typed by ``classify_relationship``.
    2. Paper ``007`` additionally gets an ``"extends"`` edge to every allegory
       whose name (file stem with dashes turned into spaces) occurs in its
       text.
    3. Allegories get an ``"addresses"`` edge for every paper mention; through
       ``add_edge`` this only wins over a weaker stored type.

    Args:
        docs: All loaded documents (papers and allegories).

    Returns:
        Mapping from (source id, target id) to the edge dict kept for that pair.
    """
    edges: dict[tuple[str, str], dict] = {}
    # Names are lower-cased because they are searched in lower-cased text; an
    # un-normalised stem containing a capital letter could never match.
    allegory_name_to_id = {
        doc.path.stem.replace("-", " ").lower(): doc.doc_id for doc in docs if doc.kind == "allegory"
    }
    for doc in docs:
        # Scan the document once and keep only sentences that mention papers;
        # the allegory pass below replays the same list in the same order.
        mentions: list[tuple[str, list[str]]] = []
        for sent in sentence_chunks(doc.text):
            targets = find_paper_targets(sent)
            if targets:
                mentions.append((sent, targets))

        for sent, targets in mentions:
            rel = classify_relationship(sent)
            for target in targets:
                add_edge(edges, doc.doc_id, target, rel, sent[:220])

        if doc.kind == "paper" and doc.doc_id == "007":
            lower = doc.text.lower()
            for name, target_id in allegory_name_to_id.items():
                if name in lower:
                    add_edge(
                        edges,
                        doc.doc_id,
                        target_id,
                        "extends",
                        f"Paper 007 explicitly maps the {name.title()} allegory into the ratchet framework.",
                    )

        if doc.kind == "allegory":
            # Runs after the classified pass so that, on equal rank, the edge
            # stored by the first pass is the one that is kept.
            for sent, targets in mentions:
                for target in targets:
                    add_edge(edges, doc.doc_id, target, "addresses", sent[:220])
    return edges
def collect_concept_presence(docs: list[Document]) -> tuple[dict[str, str], dict[str, set[str]]]:
    """Find where each catalogued concept appears and where it was introduced.

    Documents are visited papers first (sorted by id), then allegories. A
    concept counts as present in a document when any of its aliases occurs in
    the document text (case-insensitive substring match).

    Args:
        docs: All loaded documents.

    Returns:
        ``(intro, usage)`` where ``usage`` maps a concept to the ids of the
        documents that mention it, and ``intro`` maps each mentioned concept
        to the catalog's ``"intro"`` paper when that paper was loaded, else to
        the first visited document that mentions the concept.
    """
    intro: dict[str, str] = {}
    usage: dict[str, set[str]] = defaultdict(set)

    papers = [d for d in docs if d.kind == "paper"]
    allegories = [d for d in docs if d.kind == "allegory"]
    # Loop invariants, computed once instead of once per concept hit.
    paper_ids = {d.doc_id for d in papers}
    catalog = [
        (concept, info["intro"], [alias.lower() for alias in info["aliases"]])
        for concept, info in CONCEPT_CATALOG.items()
    ]

    for doc in sorted(papers, key=lambda d: d.doc_id) + allegories:
        lower = doc.text.lower()
        for concept, expected_intro, aliases in catalog:
            if not any(alias in lower for alias in aliases):
                continue
            usage[concept].add(doc.doc_id)
            # setdefault: the first decision made for a concept is final.
            intro.setdefault(concept, expected_intro if expected_intro in paper_ids else doc.doc_id)
    return intro, usage
def extract_implicit_edges(
    docs: list[Document], intro: dict[str, str], usage: dict[str, set[str]], edges: dict[tuple[str, str], dict]
) -> None:
    """Add "introduces concept used by" edges from introducing to later papers.

    For each concept whose introducing document is a paper id (``001`` to
    ``008``), an edge is added to every paper id in the concept's usage set
    that sorts after the introducing paper. Non-paper ids are ignored.

    Args:
        docs: Not used by this function.
        intro: Concept -> id of the introducing document.
        usage: Concept -> ids of the documents mentioning it; indexed for
            every concept whose introducing document is a paper.
        edges: Edge mapping updated in place through ``add_edge``.
    """
    is_paper_id = re.compile(r"^00[1-8]$").match
    for concept, origin in intro.items():
        if is_paper_id(origin) is None:
            continue
        later_papers = [
            doc_id for doc_id in sorted(usage[concept]) if is_paper_id(doc_id) and doc_id > origin
        ]
        for later in later_papers:
            add_edge(
                edges,
                origin,
                later,
                "introduces concept used by",
                f"{concept} appears first in {origin} and recurs in {later}.",
            )
def build_nodes(docs: list[Document], intro: dict[str, str]) -> list[dict]:
    """Describe every document as a graph node.

    Args:
        docs: Documents to turn into nodes.
        intro: Concept -> id of the document that introduces it.

    Returns:
        Node dicts with ``id``, ``title``, ``kind`` and the sorted list of
        ``concepts_introduced``; papers come first, and within each group
        nodes are ordered by id.
    """
    introduced_in: dict[str, list[str]] = defaultdict(list)
    for concept, doc_id in intro.items():
        introduced_in[doc_id].append(concept)

    papers_first = sorted(docs, key=lambda d: (d.kind != "paper", d.doc_id))
    return [
        {
            "id": doc.doc_id,
            "title": doc.title,
            "kind": doc.kind,
            "concepts_introduced": sorted(introduced_in.get(doc.doc_id, [])),
        }
        for doc in papers_first
    ]
def write_mermaid(nodes: list[dict], edges: list[dict]) -> None:
    """Write the graph as a Mermaid flowchart to ``OUT_DIR / "graph.mermaid"``.

    Args:
        nodes: Node dicts providing ``"id"`` and ``"title"``.
        edges: Edge dicts providing ``"source"``, ``"target"`` and ``"type"``.
    """

    def mm_id(node_id: str) -> str:
        # Node ids are reduced to letters, digits and underscores.
        return re.sub(r"[^A-Za-z0-9_]", "_", node_id)

    def mm_label(text: str) -> str:
        # The label sits inside ["..."]; a raw double quote would end it early
        # and break the diagram, so it is written as Mermaid's entity code.
        return text.replace('"', "#quot;")

    lines = ["graph TD"]
    for node in nodes:
        nid = mm_id(node["id"])
        label = mm_label(f'{node["id"]}: {node["title"]}')
        lines.append(f' {nid}["{label}"]')
    for edge in edges:
        src = mm_id(edge["source"])
        dst = mm_id(edge["target"])
        rel = edge["type"].replace('"', "")
        lines.append(f" {src} -->|{rel}| {dst}")
    (OUT_DIR / "graph.mermaid").write_text("\n".join(lines) + "\n", encoding="utf-8")
def extract_open_questions(paper: Document) -> list[str]:
    """Collect the list items under a paper's "## Open Questions" heading.

    The section starts after the first line that (stripped, case-insensitive)
    begins with ``## open questions`` and ends at the next line starting with
    ``"## "``. Inside it, lines that begin with ``-`` or ``<digits>.``
    followed by whitespace are list items; the marker is removed.

    Returns:
        Item texts in document order, or an empty list when the heading is
        missing.
    """
    rows = paper.text.splitlines()
    heading_at = next(
        (idx for idx, row in enumerate(rows) if row.strip().lower().startswith("## open questions")),
        None,
    )
    if heading_at is None:
        return []

    item_prefix = re.compile(r"^(\d+\.|-)\s+")
    found: list[str] = []
    for row in rows[heading_at + 1:]:
        if row.startswith("## "):
            break
        item = row.strip()
        prefix = item_prefix.match(item)
        if prefix is None:
            continue
        remainder = item[prefix.end():].strip()
        if remainder:
            found.append(remainder)
    return found
def question_keywords(text: str) -> set[str]:
    """Return the distinct lower-cased keywords of ``text``.

    A keyword is a run of at least four characters that starts with a letter
    and continues with letters or hyphens, and that is not in ``STOPWORDS``.
    """
    keywords: set[str] = set()
    for token in re.findall(r"[A-Za-z][A-Za-z\-]{3,}", text.lower()):
        if token in STOPWORDS:
            continue
        keywords.add(token)
    return keywords
def build_dangling_threads(papers: list[Document]) -> str:
    """Render the "Dangling Threads" Markdown report.

    Every open question of every paper is listed together with the later
    papers (higher id) whose text contains at least two of the question's
    keywords as substrings.

    Args:
        papers: The paper documents to inspect.

    Returns:
        The Markdown text. When no paper yields any open question, a single
        bullet stating that no open-question sections were detected is
        emitted instead.
    """
    paper_map = {p.doc_id: p for p in papers}
    ordered_ids = sorted(paper_map)
    # Lower-case every paper once rather than once per question.
    lowered_text = {doc_id: paper.text.lower() for doc_id, paper in paper_map.items()}

    lines = ["# Dangling Threads", ""]
    found_any = False
    for doc_id in ordered_ids:
        later_ids = [i for i in ordered_ids if i > doc_id]
        for question in extract_open_questions(paper_map[doc_id]):
            # Set per listed question, so the fallback bullet below can never
            # appear next to questions that were actually reported.
            found_any = True
            kws = question_keywords(question)
            # An empty keyword set gives an overlap of 0, hence no hits.
            hits = [
                other_id
                for other_id in later_ids
                if sum(1 for kw in kws if kw in lowered_text[other_id]) >= 2
            ]
            if hits:
                lines.append(
                    f"- Raised in **Paper {doc_id}**: {question} \n"
                    f" Partially addressed in later papers: {', '.join(f'Paper {h}' for h in hits)}."
                )
            else:
                lines.append(
                    f"- Raised in **Paper {doc_id}**: {question} \n"
                    " Partially addressed in later papers: none detected."
                )
    if not found_any:
        lines.append("- No open-question sections were detected in the source files.")
    lines.append("")
    return "\n".join(lines)
def build_concept_flow(
    papers: list[Document], intro: dict[str, str], usage: dict[str, set[str]], explicit_edges: list[dict]
) -> str:
    """Render the "Concept Flow" Markdown report.

    For every concept in ``CONCEPT_CATALOG`` (sorted by name) the report
    lists where it was introduced, which papers mention it, and in which of
    those papers a sentence mentioning the concept also contains a challenge
    cue or a revision cue.

    Args:
        papers: The paper documents.
        intro: Concept -> id of the introducing document.
        usage: Concept -> ids of the documents mentioning it.
        explicit_edges: Accepted for interface compatibility; not used.

    Returns:
        The Markdown text.
    """
    challenge_cues = ("challenge", "critic", "rebuttal", "against", "unfalsifiable")
    revision_cues = ("revision", "revised", "supersedes", "responds", "extends")

    lines = ["# Concept Flow", ""]
    paper_ids = {p.doc_id for p in papers}
    # Split and lower-case every paper once instead of once per concept.
    sentences_by_paper = {p.doc_id: [s.lower() for s in sentence_chunks(p.text)] for p in papers}

    for concept in sorted(CONCEPT_CATALOG):
        introduced = intro.get(concept, "unknown")
        used_in = sorted(d for d in usage.get(concept, set()) if d in paper_ids)
        aliases = [a.lower() for a in CONCEPT_CATALOG[concept]["aliases"]]
        challenged: set[str] = set()
        revised: set[str] = set()
        for doc_id in used_in:
            for sent in sentences_by_paper[doc_id]:
                # Only sentences that mention the concept are inspected for cues.
                if not any(a in sent for a in aliases):
                    continue
                if any(k in sent for k in challenge_cues):
                    challenged.add(doc_id)
                if any(k in sent for k in revision_cues):
                    revised.add(doc_id)
        challenged_list = sorted(challenged)
        revised_list = sorted(revised)
        # The highest-sorting paper id that mentions the concept.
        current = used_in[-1] if used_in else "unknown"
        lines.append(f"## {concept.title()}")
        lines.append(f"- Introduced in: Paper {introduced}" if introduced != "unknown" else "- Introduced in: unknown")
        lines.append(
            f"- Challenged in: {', '.join(f'Paper {p}' for p in challenged_list)}"
            if challenged_list
            else "- Challenged in: none detected"
        )
        lines.append(
            f"- Revised in: {', '.join(f'Paper {p}' for p in revised_list)}"
            if revised_list
            else "- Revised in: none detected"
        )
        lines.append(
            f"- Referenced in: {', '.join(f'Paper {p}' for p in used_in)}" if used_in else "- Referenced in: none detected"
        )
        lines.append(
            f"- Current standing: active in latest mention (Paper {current})."
            if current != "unknown"
            else "- Current standing: unclear."
        )
        lines.append("")
    return "\n".join(lines)
def main() -> None:
    """Run the whole pipeline and write the four artifacts into ``OUT_DIR``.

    Writes ``graph.json``, ``graph.mermaid``, ``dangling_threads.md`` and
    ``concept_flow.md`` in that order, then prints one "Wrote ..." line per
    file.
    """
    documents = read_documents()
    paper_docs = [doc for doc in documents if doc.kind == "paper"]
    intro, usage = collect_concept_presence(documents)

    # Explicit mentions first, then concept-based edges on top of them.
    edge_map = extract_explicit_edges(documents)
    extract_implicit_edges(documents, intro, usage, edge_map)

    nodes = build_nodes(documents, intro)
    edges = list(edge_map.values())
    edges.sort(key=lambda edge: (edge["source"], edge["target"], edge["type"]))

    graph_path = OUT_DIR / "graph.json"
    graph_path.write_text(json.dumps({"nodes": nodes, "edges": edges}, indent=2) + "\n", encoding="utf-8")
    write_mermaid(nodes, edges)

    dangling_path = OUT_DIR / "dangling_threads.md"
    dangling_path.write_text(build_dangling_threads(paper_docs), encoding="utf-8")

    flow_path = OUT_DIR / "concept_flow.md"
    flow_path.write_text(build_concept_flow(paper_docs, intro, usage, edges), encoding="utf-8")

    for written in (graph_path, OUT_DIR / "graph.mermaid", dangling_path, flow_path):
        print(f"Wrote {written}")


if __name__ == "__main__":
    main()