f654b30de9
Codex-built tooling: cross-reference graph, concept index with build script, and research integrator that extracted 142 scholars, 175 bibliography items, 4 contradiction topics, and coverage maps for Paper 009 planning. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
471 lines
14 KiB
Python
471 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""Build cross-reference artifacts for the VIBECODE-THEORY paper series.
|
|
|
|
Outputs:
|
|
- graph.json
|
|
- graph.mermaid
|
|
- dangling_threads.md
|
|
- concept_flow.md
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Iterable
|
|
|
|
|
|
# Resolve this script's own location once; both directories derive from it.
_SCRIPT_PATH = Path(__file__).resolve()

# Generated artifacts are written next to this script.
OUT_DIR = _SCRIPT_PATH.parent
# Source documents are globbed relative to the directory two levels above OUT_DIR.
ROOT = _SCRIPT_PATH.parents[2]

# Glob patterns (relative to ROOT) selecting the papers and the allegories.
PAPER_GLOB = "00*-*.md"
ALLEGORY_GLOB = "allegorical/*.md"
|
|
|
|
# Common words that are discarded when keywords are extracted from an
# open question (see question_keywords).
STOPWORDS = set(
    """
    about after again also because between could does from have into just
    like might more most over paper question series should than that their
    them then this those through what when which with would
    """.split()
)
|
|
|
|
# Relationship types ordered from strongest to weakest. When two candidate
# edges compete for the same (source, target) pair, add_edge keeps the one
# whose type has the higher priority number.
_RELATIONS_STRONGEST_FIRST = (
    "supersedes",
    "refutes",
    "challenges",
    "revises",
    "extends",
    "addresses",
    "introduces concept used by",
    "references",
)

# Maps relationship type -> integer priority (8 = strongest ... 1 = weakest).
RELATION_PRIORITY = {
    relation: len(_RELATIONS_STRONGEST_FIRST) - position
    for position, relation in enumerate(_RELATIONS_STRONGEST_FIRST)
}
|
|
|
|
# One row per tracked concept: (concept name, id of the paper catalogued as
# introducing it, aliases searched for as case-insensitive substrings).
_CONCEPT_ROWS = (
    ("vibe coding as social skill", "001", ("vibe coding", "social skill", "meta-skill")),
    ("cognitive surplus", "002", ("cognitive surplus", "surplus")),
    ("dependency trap", "002", ("dependency trap", "systemic dependency")),
    ("cognitive preference shift", "005", ("cognitive preference shift", "preference shift")),
    ("automation spiral", "003", ("automation spiral",)),
    ("feedback loop", "006", ("feedback loop", "uncomfortable middle")),
    ("biological ratchet", "007", ("biological ratchet", "ratchet")),
    ("infrastructure threshold", "007", ("infrastructure threshold", "application phase")),
    ("premature dependency hibernation", "007", ("premature dependencies", "hibernation")),
    ("knowledge unification", "008", ("knowledge unification", "defragmentation")),
    ("ship of theseus identity problem", "008", ("ship of theseus", "species identity")),
    ("cheating frame", "008", ('"cheating"', "cheating frame")),
    ("dependency chain", "007", ("dependency chain",)),
)

# Maps concept name -> {"aliases": [...], "intro": "<paper id>"}.
CONCEPT_CATALOG = {
    concept: {"aliases": list(aliases), "intro": intro_paper}
    for concept, intro_paper, aliases in _CONCEPT_ROWS
}
|
|
|
|
|
|
@dataclass
class Document:
    """A single source document (paper or allegory) together with its text."""

    # Identifier used for graph nodes: the paper number (e.g. "007") for
    # papers, or "A:<file stem>" for allegories.
    doc_id: str
    # Human-readable title, taken from a Markdown heading when one is found.
    title: str
    # Either "paper" or "allegory".
    kind: str
    # Location of the Markdown file the document was read from.
    path: Path
    # Full file contents.
    text: str
|
|
|
|
|
|
def read_documents() -> list[Document]:
    """Load every paper and allegory found under ROOT.

    Papers (files matching PAPER_GLOB) come first, followed by allegories
    (files matching ALLEGORY_GLOB); each group is ordered by sorted path.

    Returns:
        The loaded documents, papers before allegories.
    """
    paper_heading = re.compile(r"^#\s+Paper\s+(\d{3}):\s*(.+)$", flags=re.M)
    first_heading = re.compile(r"^#\s+(.+)$", flags=re.M)

    def load_paper(path: Path) -> Document:
        body = path.read_text(encoding="utf-8")
        heading = paper_heading.search(body)
        if heading is None:
            # No "# Paper NNN: Title" heading: fall back to the file name.
            doc_id, title = path.name.split("-", 1)[0], path.stem
        else:
            doc_id, title = heading.group(1), heading.group(2).strip()
        return Document(doc_id=doc_id, title=title, kind="paper", path=path, text=body)

    def load_allegory(path: Path) -> Document:
        body = path.read_text(encoding="utf-8")
        heading = first_heading.search(body)
        if heading is None:
            # No "# Heading" line: derive a title from the file stem.
            title = path.stem.replace("-", " ").title()
        else:
            title = heading.group(1).strip()
        return Document(doc_id=f"A:{path.stem}", title=title, kind="allegory", path=path, text=body)

    papers = [load_paper(path) for path in sorted(ROOT.glob(PAPER_GLOB))]
    allegories = [load_allegory(path) for path in sorted(ROOT.glob(ALLEGORY_GLOB))]
    return papers + allegories
|
|
|
|
|
|
def sentence_chunks(text: str) -> Iterable[str]:
    """Yield the sentence-like pieces of *text* with whitespace normalised.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    at runs of two or more newlines. Inside each piece, every run of
    whitespace is collapsed to a single space; empty pieces are skipped.
    """
    pieces = re.split(r"(?<=[.!?])\s+|\n{2,}", text)
    # str.split() with no argument drops leading/trailing whitespace and
    # splits on whitespace runs, so re-joining normalises the piece.
    yield from filter(None, (" ".join(piece.split()) for piece in pieces))
|
|
|
|
|
|
def classify_relationship(text: str) -> str:
    """Return the relationship type suggested by keywords in *text*.

    Matching is by case-insensitive substring. The rules are tried from
    strongest relationship to weakest and the first hit wins; text matching
    no rule is classified as ``"references"``.
    """
    rules = (
        ("supersedes", ("supersed",)),
        ("refutes", ("refute", "rebuttal", "against")),
        ("challenges", ("challenge", "critic", "unfalsifiable")),
        ("revises", ("revis",)),
        ("extends", ("extend",)),
        ("addresses", ("respond", "address", "engage")),
    )
    haystack = text.lower()
    for relation, needles in rules:
        if any(needle in haystack for needle in needles):
            return relation
    return "references"
|
|
|
|
|
|
def find_paper_targets(text: str) -> list[str]:
    """Return the sorted, de-duplicated paper ids (001-008) named in *text*.

    Ids are only looked for when the text contains the word "paper" (in any
    letter case); otherwise the result is an empty list.
    """
    mentions_paper = "paper" in text.lower()
    found = set(re.findall(r"\b00[1-8]\b", text)) if mentions_paper else set()
    return sorted(found)
|
|
|
|
|
|
def add_edge(edges: dict[tuple[str, str], dict], source: str, target: str, edge_type: str, context: str) -> None:
    """Record an edge in *edges*, keeping one edge per (source, target) pair.

    Self-references are ignored. When the pair already has an edge, it is
    replaced only if *edge_type* has a strictly higher RELATION_PRIORITY
    than the stored type; on a tie the earlier edge is kept.

    Args:
        edges: Mapping from (source, target) to the edge record; mutated in place.
        source: Id of the document the edge starts from.
        target: Id of the document the edge points to.
        edge_type: Relationship type (a key of RELATION_PRIORITY).
        context: Text stored with the edge to explain it.
    """
    if source == target:
        return

    pair = (source, target)
    current = edges.get(pair)
    # The short-circuit keeps the priority lookup off the first-insert path.
    outranks = (not current) or RELATION_PRIORITY[edge_type] > RELATION_PRIORITY[current["type"]]
    if outranks:
        edges[pair] = {"source": source, "target": target, "type": edge_type, "context": context}
|
|
|
|
|
|
def extract_explicit_edges(docs: list[Document]) -> dict[tuple[str, str], dict]:
    """Build edges from sentences that explicitly name other papers.

    For each document, in order:

    1. Every sentence naming paper ids yields an edge to each id, typed by
       ``classify_relationship`` on that sentence.
    2. Paper 007 additionally gets an "extends" edge to every allegory whose
       name (file stem with hyphens replaced by spaces) occurs in its text.
    3. Allegories get an "addresses" edge for every paper id they name, which
       upgrades weaker edges recorded in step 1.

    Returns:
        Mapping from (source, target) to the winning edge record.
    """
    allegory_ids = {
        doc.path.stem.replace("-", " "): doc.doc_id for doc in docs if doc.kind == "allegory"
    }
    found: dict[tuple[str, str], dict] = {}

    for doc in docs:
        # Split once and reuse the result for both sentence passes below.
        mentions = [(sentence, find_paper_targets(sentence)) for sentence in sentence_chunks(doc.text)]

        for sentence, targets in mentions:
            if not targets:
                continue
            relation = classify_relationship(sentence)
            for target in targets:
                add_edge(found, doc.doc_id, target, relation, sentence[:220])

        is_paper_007 = doc.kind == "paper" and doc.doc_id == "007"
        if is_paper_007:
            body = doc.text.lower()
            for name, allegory_id in allegory_ids.items():
                if name not in body:
                    continue
                add_edge(
                    found,
                    doc.doc_id,
                    allegory_id,
                    "extends",
                    f"Paper 007 explicitly maps the {name.title()} allegory into the ratchet framework.",
                )

        if doc.kind != "allegory":
            continue
        for sentence, targets in mentions:
            for target in targets:
                add_edge(found, doc.doc_id, target, "addresses", sentence[:220])

    return found
|
|
|
|
|
|
def collect_concept_presence(docs: list[Document]) -> tuple[dict[str, str], dict[str, set[str]]]:
    """Record which documents mention each catalog concept and where it was introduced.

    A document uses a concept when any of the concept's aliases occurs in its
    text as a case-insensitive substring. Documents are scanned papers first
    (ordered by id), then allegories.

    The introducing document of a mentioned concept is the catalog's "intro"
    paper when a paper with that id was loaded; otherwise it is the first
    scanned document that mentions the concept. Concepts that no document
    mentions appear in neither mapping.

    Returns:
        ``(intro, usage)`` where ``intro`` maps concept -> introducing doc id
        and ``usage`` (a ``defaultdict(set)``) maps concept -> ids of all
        documents mentioning it.
    """
    intro: dict[str, str] = {}
    usage: dict[str, set[str]] = defaultdict(set)

    papers = sorted((d for d in docs if d.kind == "paper"), key=lambda d: d.doc_id)
    allegories = [d for d in docs if d.kind == "allegory"]

    # Loop invariants: computed once instead of once per (document, concept).
    known_paper_ids = {d.doc_id for d in papers}
    lowered_aliases = {
        concept: [alias.lower() for alias in info["aliases"]]
        for concept, info in CONCEPT_CATALOG.items()
    }

    for doc in papers + allegories:
        lower = doc.text.lower()
        for concept, info in CONCEPT_CATALOG.items():
            if not any(alias in lower for alias in lowered_aliases[concept]):
                continue
            usage[concept].add(doc.doc_id)
            expected_intro = info["intro"]
            # Trust the catalog when its intro paper exists; otherwise fall
            # back to the first document seen mentioning the concept.
            introducer = expected_intro if expected_intro in known_paper_ids else doc.doc_id
            intro.setdefault(concept, introducer)

    return intro, usage
|
|
|
|
|
|
def extract_implicit_edges(
    docs: list[Document], intro: dict[str, str], usage: dict[str, set[str]], edges: dict[tuple[str, str], dict]
) -> None:
    """Add "introduces concept used by" edges from concept reuse across papers.

    For every concept introduced by a paper (id 001-008), an edge is added
    from that paper to each later paper (id 001-008, greater than the
    introducing id) that also uses the concept. Edges go through
    ``add_edge``, so stronger existing edges are kept.

    Args:
        docs: Accepted for signature compatibility; not read here.
        intro: Concept -> id of the introducing document.
        usage: Concept -> ids of documents that mention it.
        edges: Edge mapping to extend in place.
    """
    paper_id = re.compile(r"^00[1-8]$")

    for concept, origin in intro.items():
        if paper_id.match(origin) is None:
            continue
        later_users = [
            user for user in sorted(usage[concept]) if paper_id.match(user) and user > origin
        ]
        for user in later_users:
            add_edge(
                edges,
                origin,
                user,
                "introduces concept used by",
                f"{concept} appears first in {origin} and recurs in {user}.",
            )
|
|
|
|
|
|
def build_nodes(docs: list[Document], intro: dict[str, str]) -> list[dict]:
    """Return one graph node per document, papers first, each group sorted by id.

    Args:
        docs: Documents to turn into nodes.
        intro: Concept -> id of the document that introduced it.

    Returns:
        Node dicts with keys ``id``, ``title``, ``kind`` and
        ``concepts_introduced`` (alphabetically sorted concept names).
    """
    introduced_by: dict[str, list[str]] = defaultdict(list)
    for concept, doc_id in intro.items():
        introduced_by[doc_id].append(concept)

    # False sorts before True, so papers precede every other kind.
    papers_first = sorted(docs, key=lambda d: (d.kind != "paper", d.doc_id))
    return [
        {
            "id": doc.doc_id,
            "title": doc.title,
            "kind": doc.kind,
            "concepts_introduced": sorted(introduced_by.get(doc.doc_id, [])),
        }
        for doc in papers_first
    ]
|
|
|
|
|
|
def write_mermaid(nodes: list[dict], edges: list[dict]) -> None:
    """Write the graph as a Mermaid flowchart to ``OUT_DIR / "graph.mermaid"``.

    Args:
        nodes: Node dicts providing ``id`` and ``title``.
        edges: Edge dicts providing ``source``, ``target`` and ``type``.
    """

    def mm_id(node_id: str) -> str:
        # Replace every character outside [A-Za-z0-9_] (e.g. the ":" and "-"
        # in allegory ids) with "_" to form the Mermaid node identifier.
        return re.sub(r"[^A-Za-z0-9_]", "_", node_id)

    lines = ["graph TD"]
    for node in nodes:
        nid = mm_id(node["id"])
        # The label sits inside ["..."]; a raw double quote in a title would
        # end the label early, so emit Mermaid's entity code for it instead.
        label = f'{node["id"]}: {node["title"]}'.replace('"', "#quot;")
        lines.append(f' {nid}["{label}"]')
    for edge in edges:
        src = mm_id(edge["source"])
        dst = mm_id(edge["target"])
        rel = edge["type"].replace('"', "")
        lines.append(f" {src} -->|{rel}| {dst}")
    (OUT_DIR / "graph.mermaid").write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
|
|
|
|
def extract_open_questions(paper: Document) -> list[str]:
    """Return the list items found under the paper's "## Open Questions" heading.

    The section starts after the first line that, once stripped and
    lower-cased, begins with ``## open questions``, and ends at the next line
    starting with ``## `` (or at end of text). Within it, a line counts as a
    question when it is a numbered item (``1. ...``) or a bullet item using
    any of the Markdown markers ``-``, ``*`` or ``+``; other lines are ignored.

    Returns:
        The question texts without their list markers, in document order;
        empty when the paper has no such section.
    """
    list_item = re.compile(r"^(?:\d+\.|[-*+])\s+(.+)$")

    remaining = iter(paper.text.splitlines())
    for line in remaining:
        if line.strip().lower().startswith("## open questions"):
            break
    else:
        # Heading never found.
        return []

    questions: list[str] = []
    # Continue on the same iterator, i.e. from the line after the heading.
    for line in remaining:
        if line.startswith("## "):
            break
        item = list_item.match(line.strip())
        if item is None:
            continue
        body = item.group(1).strip()
        if body:
            questions.append(body)
    return questions
|
|
|
|
|
|
def question_keywords(text: str) -> set[str]:
    """Return the distinct lower-cased keywords of *text*, minus STOPWORDS.

    A keyword is a letter followed by at least three more letters or hyphens,
    so words shorter than four characters are never returned.
    """
    candidates = set(re.findall(r"[A-Za-z][A-Za-z\-]{3,}", text.lower()))
    return candidates - STOPWORDS
|
|
|
|
|
|
def build_dangling_threads(papers: list[Document]) -> str:
    """Render the "Dangling Threads" Markdown report.

    Every open question of every paper (papers taken in id order) is listed
    together with the later papers that appear to pick it up. A later paper
    counts as a hit when at least two of the question's keywords occur in its
    text as case-insensitive substrings.

    Returns:
        The report as Markdown text ending in a newline. If no paper yields
        any open question, the report contains a single note saying so.
    """
    paper_map = {p.doc_id: p for p in papers}
    ordered_ids = sorted(paper_map)
    # Lower-case each paper once rather than once per (question, later paper).
    lowered_text = {doc_id: paper_map[doc_id].text.lower() for doc_id in ordered_ids}

    lines = ["# Dangling Threads", ""]
    found_any = False

    for position, doc_id in enumerate(ordered_ids):
        # ordered_ids is sorted and unique, so everything after this position
        # is exactly the set of papers with a greater id.
        later_ids = ordered_ids[position + 1 :]

        for question in extract_open_questions(paper_map[doc_id]):
            found_any = True
            kws = question_keywords(question)
            hits = [
                other_id
                for other_id in later_ids
                if kws and sum(1 for kw in kws if kw in lowered_text[other_id]) >= 2
            ]
            summary = ", ".join(f"Paper {h}" for h in hits) if hits else "none detected"
            lines.append(
                f"- Raised in **Paper {doc_id}**: {question} \n"
                f" Partially addressed in later papers: {summary}."
            )

    if not found_any:
        lines.append("- No open-question sections were detected in the source files.")
    lines.append("")
    return "\n".join(lines)
|
|
|
|
|
|
def build_concept_flow(
    papers: list[Document], intro: dict[str, str], usage: dict[str, set[str]], explicit_edges: list[dict]
) -> str:
    """Render the "Concept Flow" Markdown report.

    For each catalog concept (alphabetical order) the report lists where it
    was introduced, which papers challenge or revise it, which papers
    reference it, and the latest paper mentioning it. A paper challenges or
    revises a concept when one of its sentences contains both an alias of the
    concept and one of the corresponding marker words (case-insensitive
    substring match).

    Args:
        papers: Paper documents to analyse.
        intro: Concept -> id of the introducing document.
        usage: Concept -> ids of documents mentioning it; only ids of
            *papers* are considered.
        explicit_edges: Accepted for signature compatibility; not read here.

    Returns:
        The report as Markdown text.
    """
    challenge_markers = ("challenge", "critic", "rebuttal", "against", "unfalsifiable")
    revision_markers = ("revision", "revised", "supersedes", "responds", "extends")

    def paper_list(ids: list[str]) -> str:
        return ", ".join(f"Paper {p}" for p in ids)

    paper_map = {p.doc_id: p for p in papers}
    # Split and lower-case each paper once; the result is reused for every concept.
    lowered_sentences = {
        doc_id: [sent.lower() for sent in sentence_chunks(paper.text)]
        for doc_id, paper in paper_map.items()
    }

    lines = ["# Concept Flow", ""]

    for concept in sorted(CONCEPT_CATALOG):
        introduced = intro.get(concept, "unknown")
        used_in = sorted(d for d in usage.get(concept, set()) if d in paper_map)
        aliases = [alias.lower() for alias in CONCEPT_CATALOG[concept]["aliases"]]

        challenged: set[str] = set()
        revised: set[str] = set()
        for doc_id in used_in:
            for lower_sent in lowered_sentences[doc_id]:
                if not any(alias in lower_sent for alias in aliases):
                    continue
                if any(marker in lower_sent for marker in challenge_markers):
                    challenged.add(doc_id)
                if any(marker in lower_sent for marker in revision_markers):
                    revised.add(doc_id)

        challenged_list = sorted(challenged)
        revised_list = sorted(revised)
        # used_in is sorted, so its last entry is the highest paper id.
        current = used_in[-1] if used_in else "unknown"

        lines.append(f"## {concept.title()}")
        lines.append(f"- Introduced in: Paper {introduced}" if introduced != "unknown" else "- Introduced in: unknown")
        lines.append(
            f"- Challenged in: {paper_list(challenged_list)}"
            if challenged_list
            else "- Challenged in: none detected"
        )
        lines.append(
            f"- Revised in: {paper_list(revised_list)}"
            if revised_list
            else "- Revised in: none detected"
        )
        lines.append(
            f"- Referenced in: {paper_list(used_in)}" if used_in else "- Referenced in: none detected"
        )
        lines.append(
            f"- Current standing: active in latest mention (Paper {current})."
            if current != "unknown"
            else "- Current standing: unclear."
        )
        lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def main() -> None:
    """Read all documents, build the graph and reports, and write them to OUT_DIR.

    Writes graph.json, graph.mermaid, dangling_threads.md and concept_flow.md
    (in that order), then prints one "Wrote <path>" line per artifact.
    """
    docs = read_documents()
    papers = [doc for doc in docs if doc.kind == "paper"]

    intro, usage = collect_concept_presence(docs)
    edge_map = extract_explicit_edges(docs)
    # Implicit (concept-reuse) edges are merged into the same mapping in place.
    extract_implicit_edges(docs, intro, usage, edge_map)

    nodes = build_nodes(docs, intro)
    edges = sorted(edge_map.values(), key=lambda e: (e["source"], e["target"], e["type"]))

    graph_json = json.dumps({"nodes": nodes, "edges": edges}, indent=2) + "\n"
    (OUT_DIR / "graph.json").write_text(graph_json, encoding="utf-8")
    write_mermaid(nodes, edges)

    dangling = build_dangling_threads(papers)
    (OUT_DIR / "dangling_threads.md").write_text(dangling, encoding="utf-8")

    flow = build_concept_flow(papers, intro, usage, edges)
    (OUT_DIR / "concept_flow.md").write_text(flow, encoding="utf-8")

    for artifact in ("graph.json", "graph.mermaid", "dangling_threads.md", "concept_flow.md"):
        print(f"Wrote {OUT_DIR / artifact}")


if __name__ == "__main__":
    main()
|