docs: integration tools — cross-reference graph, concept index, research digest
Codex-built tooling: a cross-reference graph, a concept index with its build script, and a research integrator that extracted 142 scholars, 175 bibliography items, 4 contradiction topics, and coverage maps for Paper 009 planning.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,911 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integrate research markdown files into a unified digest for Paper 009 planning."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Markdown ATX heading on its own line: group 1 is the run of 1-6 '#'
# characters, group 2 the heading text without surrounding whitespace.
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$", re.MULTILINE)

# Single-line list bullet introduced by '-' or '*'; group 1 is the bullet text.
BULLET_RE = re.compile(r"^\s*[-*]\s+(.+?)\s*$")

# Single-line numbered item ("3. text"); group 1 is the number, group 2 the text.
NUMBERED_RE = re.compile(r"^\s*(\d+)\.\s+(.+?)\s*$")

# Split points between sentences: whitespace that follows '.', '!' or '?',
# or any run of newlines.
SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")

# One name token: dotted initials such as "J.R.R." or a capitalised word that
# may contain apostrophes and hyphens.
_NAME_TOKEN = r"(?:[A-Z](?:\.[A-Z])+\.?|[A-Z][a-zA-Z'-]+)"

# A candidate person name: two to four consecutive name tokens.
NAME_RE = re.compile(r"\b" + _NAME_TOKEN + r"(?:\s+" + _NAME_TOKEN + r"){1,3}\b")
|
||||
# Lower-case words that disqualify a capitalised phrase from being treated as
# a person name (they show up in headings and topic labels, not in names).
BAD_NAME_WORDS = {
    "analysis", "executive", "history", "logs", "paper",
    "physics", "pricing", "quantum", "society", "sources",
    "summary", "task", "technology",
}
|
||||
|
||||
|
||||
# Contradiction topics. Each rule lists lower-case phrases that signal support
# ("pro_markers") or challenge ("con_markers") for the topic. Markers are tried
# in list order, so the order within each list is significant.
TOPIC_RULES = [
    {
        "id": "determinism_vs_agency",
        "label": "Technological determinism vs social agency",
        "pro_markers": [
            "autonomous technique", "irreversible", "lock-in", "path dependence",
            "ratchet", "structurally fixed", "cannot reverse",
        ],
        "con_markers": [
            "social construct", "interpretive flexibility",
            "democratic rationalization", "human agency", "selective adoption",
            "tool taming", "re-shaped", "can change",
        ],
    },
    {
        "id": "unification_vs_homogenization",
        "label": "Knowledge unification vs statistical homogenization",
        "pro_markers": [
            "knowledge unification", "integration layer", "interconnectedness",
            "consilience", "compiled", "coherent",
        ],
        "con_markers": [
            "stochastic parrot", "homogenization", "illusion", "veneer", "lossy",
            "lacks understanding", "database lookup",
        ],
    },
    {
        "id": "cognition_commodity_vs_mimicry",
        "label": "AI cognition commodity vs token mimicry",
        "pro_markers": [
            "cognition as a commodity", "price of thinking", "task-based framework",
            "automation", "productivity", "cognitive offloading",
        ],
        "con_markers": [
            "stochastic parrot", "doesn't think", "mimicry", "predicts tokens",
            "no cognitive model",
        ],
    },
    {
        "id": "retrocausal_attractor",
        "label": "Teleological attractor vs unfalsifiable retrocausality",
        "pro_markers": [
            "teleological attractor", "retrocausal", "omega point", "final cause",
            "participatory universe", "transactional interpretation",
        ],
        "con_markers": [
            "unfalsifiability", "pseudoscience", "woo", "causality violation",
            "superdeterminism",
        ],
    },
    {
        "id": "efficiency_vs_jevons",
        "label": "Efficiency frees time vs Jevons expansion",
        "pro_markers": [
            "efficiency gains", "free up human time", "productivity", "surplus",
            "cost disease",
        ],
        "con_markers": [
            "jevons paradox", "increased consumption", "reasoning inflation",
            "more complex systems", "dependency",
        ],
    },
]
|
||||
|
||||
|
||||
# Weight added to a counterargument bullet's score for each lower-case phrase
# it contains.
CHALLENGE_KEYWORDS = {
    "unfalsifiable": 5,
    "dogma": 4,
    "pseudoscience": 5,
    "illusion": 4,
    "mimicry": 4,
    "lacks understanding": 4,
    "circular": 3,
    "causality violation": 4,
    "superdeterminism": 3,
    "lossy": 2,
    "stochastic parrot": 5,
}
|
||||
|
||||
|
||||
# Lower-case keywords used to score research sentences against each numbered
# open question; the dictionary key is the question number.
QUESTION_KEYWORDS = {
    1: [
        "falsifiable", "falsifiability", "unification", "replacement", "fragment",
        "distort", "evidence", "test", "stochastic",
    ],
    2: [
        "identity", "human", "consciousness", "agency", "values", "pragmatic",
        "continuity", "survival",
    ],
    3: [
        "individual", "workers", "labor", "skills", "strategy", "governance",
        "practical", "action",
    ],
    4: [
        "cheating", "tools", "dependency", "ratchet", "adoption", "ethics", "norm",
    ],
    5: [
        "timeline", "threshold", "when", "prediction", "curve", "years",
        "exponential", "phase",
    ],
}
|
||||
|
||||
|
||||
@dataclass
class Doc:
    """A single research markdown file held in memory."""

    # Location the file was read from.
    path: Path
    # Short identifier for the file (the loader uses the file name stem).
    slug: str
    # Human-readable title of the document.
    title: str
    # Complete, unmodified file content.
    text: str
    # Section bodies keyed by heading text.
    sections: dict[str, str]
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the integrator's command-line options.

    Options:
        --project-root  Project directory; defaults to the directory two
                        levels above the folder that contains this script.
        --research-dir  Directory with the research markdown files (no default).
        --paper-008     Path to the Paper 008 markdown file (no default).
        --out-dir       Output directory; defaults to this script's folder.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    script_path = Path(__file__).resolve()
    script_dir = script_path.parent

    # The default root is a guess based on the script's location. When the
    # script lives too close to the filesystem root that guess does not exist;
    # fall back to the script's folder instead of crashing, so an explicit
    # --project-root still works.
    try:
        default_root = script_path.parents[2]
    except IndexError:
        default_root = script_dir

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--project-root", type=Path, default=default_root)
    parser.add_argument("--research-dir", type=Path)
    parser.add_argument("--paper-008", type=Path)
    parser.add_argument("--out-dir", type=Path, default=script_dir)
    return parser.parse_args()
|
||||
|
||||
|
||||
def clean_inline_md(text: str) -> str:
    """Strip simple inline Markdown and collapse whitespace.

    Removes, in this order, the markers of inline code, bold, italic and
    ``[label](target)`` links (keeping the inner text or the label), then
    turns every whitespace run into one space and trims the result.
    """
    # Order matters: bold must be unwrapped before italic so that "**x**"
    # is not treated as an italic span.
    unwrap_patterns = (
        r"`([^`]+)`",
        r"\*\*([^*]+)\*\*",
        r"\*([^*]+)\*",
        r"\[(.*?)\]\((.*?)\)",
    )
    cleaned = text.strip()
    for pattern in unwrap_patterns:
        cleaned = re.sub(pattern, r"\1", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
|
||||
|
||||
|
||||
def extract_sections(text: str) -> dict[str, str]:
    """Split Markdown text into sections keyed by lower-cased heading text.

    A section body is the text between one heading and the next heading of
    any level (or the end of the text), stripped of surrounding whitespace.
    Text before the first heading is not returned.

    When the same heading occurs more than once, the bodies are joined with a
    blank line in document order instead of the later one replacing the
    earlier one, so repeated headings do not lose content.

    Returns:
        Mapping of cleaned, lower-cased heading to section body; empty when
        the text contains no headings.
    """
    headings = list(HEADING_RE.finditer(text))
    sections: dict[str, str] = {}

    for position, current in enumerate(headings):
        is_last = position + 1 == len(headings)
        body_end = len(text) if is_last else headings[position + 1].start()
        body = text[current.end():body_end].strip()
        key = clean_inline_md(current.group(2)).lower()

        previous = sections.get(key)
        if previous is None:
            sections[key] = body
        else:
            # Repeated heading: keep both bodies, skipping empty ones.
            sections[key] = "\n\n".join(part for part in (previous, body) if part)

    return sections
|
||||
|
||||
|
||||
def load_research_docs(research_dir: Path) -> list[Doc]:
    """Read every ``*.md`` file directly inside ``research_dir``.

    Files are visited in sorted path order and decoded as UTF-8. The title is
    the cleaned text of the first line that starts with ``"# "``, falling
    back to the file name stem when there is no such line.

    Returns:
        One ``Doc`` per file, in the order the files were visited.
    """
    loaded: list[Doc] = []
    for md_path in sorted(research_dir.glob("*.md")):
        content = md_path.read_text(encoding="utf-8")

        first_h1 = next(
            (row for row in content.splitlines() if row.startswith("# ")),
            None,
        )
        heading = md_path.stem if first_h1 is None else clean_inline_md(first_h1[2:])

        loaded.append(
            Doc(
                path=md_path,
                slug=md_path.stem,
                title=heading,
                text=content,
                sections=extract_sections(content),
            )
        )
    return loaded
|
||||
|
||||
|
||||
def normalize_person_name(name: str) -> str:
    """Return a lower-case key used to merge spellings of one person's name.

    Removes "et al.", turns "&" into "and", drops parenthesised text, replaces
    every character that is not a letter, space, period, apostrophe or hyphen
    with a space, collapses whitespace and lower-cases the result.

    Letters of any script are kept, so a name such as "Žižek" stays in one
    piece instead of being split at its non-ASCII letters.
    """
    stripped = re.sub(r"\bet al\.?", "", name, flags=re.IGNORECASE)
    stripped = stripped.replace("&", " and ")
    stripped = re.sub(r"\([^)]*\)", "", stripped)
    # str.isalpha() is Unicode-aware, unlike an [A-Za-z] character class.
    stripped = "".join(
        ch if ch.isalpha() or ch in " .'-" else " " for ch in stripped
    )
    stripped = re.sub(r"\s+", " ", stripped).strip()
    return stripped.lower()
|
||||
|
||||
|
||||
def split_possible_names(chunk: str) -> list[str]:
    """Pull candidate person names out of a short piece of bullet text.

    Only the text before the first colon is considered; parenthesised and
    double-quoted spans are removed and "&" becomes "and". Every ``NAME_RE``
    match with at least two words, none of them in ``BAD_NAME_WORDS``, is
    returned. If nothing qualifies, the remaining letters (plus apostrophes
    and hyphens) are returned as one name when they start with an upper-case
    letter, are longer than three characters and are not a bad word.
    """
    candidate_text = clean_inline_md(chunk).split(":", 1)[0]
    for noise in (r"\([^)]*\)", r'"[^"]+"'):
        candidate_text = re.sub(noise, "", candidate_text)
    candidate_text = candidate_text.replace("&", " and ")

    accepted: list[str] = []
    for hit in NAME_RE.findall(candidate_text):
        tidy = re.sub(r"\s+", " ", hit).strip(" .,:;")
        tokens = [tok for tok in tidy.split() if tok[0].isalpha()]
        if len(tokens) < 2:
            continue
        if any(tok.lower() in BAD_NAME_WORDS for tok in tokens):
            continue
        accepted.append(" ".join(tokens))
    if accepted:
        return accepted

    # Fallback for single-word entries such as a lone surname.
    letters_only = re.sub(r"[^A-Za-z'-]", "", candidate_text).strip()
    plausible = (
        letters_only[:1].isupper()
        and len(letters_only) > 3
        and letters_only.lower() not in BAD_NAME_WORDS
    )
    if plausible:
        accepted.append(letters_only)
    return accepted
|
||||
|
||||
|
||||
def extract_scholars(docs: list[Doc]) -> dict[str, dict[str, Any]]:
    """Collect scholars listed in "key scholars" sections and count mentions.

    For each document, bullets inside sections whose heading contains
    "key scholars" are scanned for names: the first bold span of the bullet
    when there is one, otherwise the text before the first colon. Names are
    merged across documents under ``normalize_person_name``.

    Mentions are counted per sentence of the document by searching for the
    surname (the last word of the name) as a whole word, so a short surname
    such as "Ong" is not counted inside "long" or "among". When no sentence
    matches, one placeholder snippet is recorded instead.

    Returns:
        Mapping of normalized name to a record with the keys ``name`` (first
        spelling seen), ``aliases`` and ``files`` (sorted lists),
        ``mention_count`` and ``contexts`` (at most three
        ``{"file", "snippet"}`` entries per document).
    """
    scholars: dict[str, dict[str, Any]] = {}

    for doc in docs:
        candidates: list[str] = []
        for heading, body in doc.sections.items():
            if "key scholars" not in heading:
                continue
            for line in body.splitlines():
                bullet_match = BULLET_RE.match(line)
                if not bullet_match:
                    continue
                raw = bullet_match.group(1)
                bold_match = re.search(r"\*\*([^*]+)\*\*", raw)
                if bold_match:
                    candidates.extend(split_possible_names(bold_match.group(1)))
                else:
                    lead = clean_inline_md(raw).split(":", 1)[0]
                    candidates.extend(split_possible_names(lead))

        unique = sorted(set(candidates))
        if not unique:
            continue

        # Split and clean the document once, not once per scholar.
        sentences = [clean_inline_md(s) for s in SENTENCE_SPLIT_RE.split(doc.text)]
        sentences_lower = [s.lower() for s in sentences]
        text_lower = doc.text.lower()

        for name in unique:
            key = normalize_person_name(name)
            if not key:
                continue
            entry = scholars.setdefault(
                key,
                {
                    "name": name,
                    "aliases": set(),
                    "files": set(),
                    "mention_count": 0,
                    "contexts": [],
                },
            )
            entry["aliases"].add(name)
            entry["files"].add(doc.slug)

            surname = name.split()[-1].lower().strip(".,")
            local_mentions: list[str] = []
            mentioned_in_raw_text = False
            if surname:
                # Whole-word match; the lookarounds also work for surnames
                # that contain apostrophes or hyphens.
                surname_re = re.compile(r"(?<!\w)" + re.escape(surname) + r"(?!\w)")
                local_mentions = [
                    sentence
                    for sentence, lowered in zip(sentences, sentences_lower)
                    if surname_re.search(lowered)
                ]
                mentioned_in_raw_text = surname_re.search(text_lower) is not None

            if not local_mentions:
                if mentioned_in_raw_text:
                    local_mentions = [f"Mentioned in {doc.slug}"]
                else:
                    local_mentions = [f"Listed in {doc.slug}"]

            entry["mention_count"] += len(local_mentions)
            for snippet in local_mentions[:3]:
                entry["contexts"].append({"file": doc.slug, "snippet": snippet})

    # Sets are not JSON-serialisable; hand back sorted lists.
    for entry in scholars.values():
        entry["aliases"] = sorted(entry["aliases"])
        entry["files"] = sorted(entry["files"])

    return scholars
|
||||
|
||||
|
||||
def extract_title_from_source_line(line: str) -> str:
    """Guess the work title contained in one bibliography bullet.

    Tried in order:
      1. the first span in double quotes (straight or curly);
      2. the first italic span written with single asterisks — bold spans
         (``**...**``) are skipped so a bold author is not taken for the title;
      3. the text after a ``(YYYY)`` year, up to the next period;
      4. the whole cleaned line.
    """
    quoted = re.search(r'["“]([^"“”]+)["”]', line)
    if quoted:
        return clean_inline_md(quoted.group(1))

    # The lookarounds reject asterisks that belong to a "**" bold marker.
    italic = re.search(r"(?<!\*)\*([^*]+)\*(?!\*)", line)
    if italic:
        return clean_inline_md(italic.group(1))

    line_clean = clean_inline_md(line)
    year_match = re.search(r"\(\d{4}\)\.?", line_clean)
    if year_match:
        tail = line_clean[year_match.end():].strip(" .:-")
        if tail:
            return tail.split(".", 1)[0].strip()

    return line_clean
|
||||
|
||||
|
||||
def extract_authors_from_source_line(line: str) -> list[str]:
    """Return the author fragments found in one bibliography bullet.

    The author part is the cleaned line up to the first ``(YYYY)`` year, or
    the whole cleaned line when there is no year. "&" is treated as "and",
    "et al." is dropped, and the text is split on the word "and" and on
    semicolons. Fragments are trimmed of spaces, commas, periods and hyphens;
    only fragments containing an ASCII letter are kept.
    """
    cleaned = clean_inline_md(line)
    year = re.search(r"\(\d{4}\)", cleaned)
    if year:
        author_part = cleaned[: year.start()].strip()
    else:
        author_part = cleaned
    author_part = author_part.replace("&", " and ")
    author_part = re.sub(r"\bet al\.?", "", author_part, flags=re.IGNORECASE)

    authors: list[str] = []
    for fragment in re.split(r"\band\b|;", author_part):
        candidate = fragment.strip(" ,.-")
        if candidate and re.search(r"[A-Za-z]", candidate):
            authors.append(candidate)
    return authors
|
||||
|
||||
|
||||
def normalize_title(title: str) -> str:
    """Return a lower-case key used to merge citations of the same work.

    Every character that is not a letter, a digit or a space becomes a space
    (underscores included), and whitespace runs collapse to one space.
    Letters and digits of any script are kept, so a non-Latin title yields a
    usable key instead of an empty string.
    """
    lowered = title.lower()
    # \w is Unicode-aware for str patterns; "_" is part of \w and is
    # therefore removed explicitly.
    lowered = re.sub(r"[^\w ]|_", " ", lowered)
    return re.sub(r"\s+", " ", lowered).strip()
|
||||
|
||||
|
||||
def extract_bibliography(docs: list[Doc]) -> dict[str, dict[str, Any]]:
    """Build a de-duplicated bibliography from the documents' source lists.

    Bullets inside sections whose heading starts with "sources" are treated
    as citations. Citations are merged by normalized title; each entry
    records the first title spelling seen, the author fragments, the citing
    files and every cleaned raw mention.

    Returns:
        Mapping of normalized title to a record with the keys ``title``,
        ``authors`` and ``files`` (sorted lists), ``raw_mentions`` and
        ``relevance`` (twice the number of citing files plus the number of
        mentions).
    """
    bibliography: dict[str, dict[str, Any]] = {}

    for doc in docs:
        for heading, body in doc.sections.items():
            if not heading.startswith("sources"):
                continue
            for row in body.splitlines():
                found = BULLET_RE.match(row)
                if found is None:
                    continue
                citation = found.group(1)

                title = extract_title_from_source_line(citation)
                key = normalize_title(title) if title else ""
                if not key:
                    continue

                if key not in bibliography:
                    bibliography[key] = {
                        "title": title,
                        "authors": set(),
                        "files": set(),
                        "raw_mentions": [],
                    }
                record = bibliography[key]
                record["files"].add(doc.slug)
                record["raw_mentions"].append(clean_inline_md(citation))
                record["authors"].update(extract_authors_from_source_line(citation))

    # Replace the working sets with sorted lists and attach the score.
    for record in bibliography.values():
        record["authors"] = sorted(record["authors"])
        record["files"] = sorted(record["files"])
        record["relevance"] = 2 * len(record["files"]) + len(record["raw_mentions"])

    return bibliography
|
||||
|
||||
|
||||
def first_sentence_with_marker(text: str, marker: str) -> str | None:
    """Return the first sentence of ``text`` that contains ``marker``.

    The marker is compared against the lower-cased sentence, so it is
    expected to be lower-case itself. The sentence is returned with inline
    Markdown removed; ``None`` is returned when no sentence matches.
    """
    hit = next(
        (piece for piece in SENTENCE_SPLIT_RE.split(text) if marker in piece.lower()),
        None,
    )
    if hit is None:
        return None
    return clean_inline_md(hit)
|
||||
|
||||
|
||||
def _first_marker_evidence(
    doc: Doc, text_lower: str, markers: list[str]
) -> dict[str, str] | None:
    """Return evidence for the first marker of ``markers`` found in ``doc``."""
    for marker in markers:
        if marker not in text_lower:
            continue
        snippet = first_sentence_with_marker(doc.text, marker)
        # A marker whose sentence cleans down to nothing is of no use as
        # evidence; try the next marker instead.
        if snippet:
            return {"file": doc.slug, "marker": marker, "snippet": snippet}
    return None


def detect_contradictions(docs: list[Doc]) -> list[dict[str, Any]]:
    """Report topics that some files support and other files challenge.

    For every rule in ``TOPIC_RULES`` each document contributes at most one
    supporting and one challenging piece of evidence: the first marker (in
    list order) that occurs in the document and yields a non-empty snippet.
    A topic is reported only when at least one file appears solely on the
    supporting side and at least one other file solely on the challenging
    side.

    Returns:
        One dict per reported topic with the keys ``topic``, ``topic_id``,
        ``supports`` and ``challenges`` (up to four evidence dicts each),
        ``supporting_files`` and ``challenging_files`` (sorted slugs).
    """
    # Lower-case each document once instead of once per rule.
    lowered_docs = [(doc, doc.text.lower()) for doc in docs]
    contradictions: list[dict[str, Any]] = []

    for rule in TOPIC_RULES:
        pro_evidence: list[dict[str, str]] = []
        con_evidence: list[dict[str, str]] = []

        for doc, text_lower in lowered_docs:
            support = _first_marker_evidence(doc, text_lower, rule["pro_markers"])
            if support is not None:
                pro_evidence.append(support)
            challenge = _first_marker_evidence(doc, text_lower, rule["con_markers"])
            if challenge is not None:
                con_evidence.append(challenge)

        pro_files = {item["file"] for item in pro_evidence}
        con_files = {item["file"] for item in con_evidence}
        # Require a file exclusive to each side; a single file that contains
        # both kinds of marker is not a cross-file contradiction.
        if pro_files - con_files and con_files - pro_files:
            contradictions.append(
                {
                    "topic": rule["label"],
                    "topic_id": rule["id"],
                    "supports": pro_evidence[:4],
                    "challenges": con_evidence[:4],
                    "supporting_files": sorted(pro_files),
                    "challenging_files": sorted(con_files),
                }
            )

    return contradictions
|
||||
|
||||
|
||||
def extract_open_questions(paper_008: Path) -> list[dict[str, Any]]:
    """Read the numbered open questions from the Paper 008 markdown file.

    The questions are the numbered list items found after the heading
    "## Open Questions for Paper 009" and before the next level-2 heading.

    Returns:
        One ``{"id": number, "text": cleaned text}`` dict per list item, in
        file order.

    Raises:
        RuntimeError: if the heading is missing or no numbered item follows it.
    """
    content = paper_008.read_text(encoding="utf-8")
    _, heading, section = content.partition("## Open Questions for Paper 009")
    if not heading:
        raise RuntimeError("Could not find 'Open Questions for Paper 009' in paper 008")

    following_heading = re.search(r"\n##\s+", section)
    if following_heading is not None:
        section = section[: following_heading.start()]

    questions: list[dict[str, Any]] = []
    for row in section.splitlines():
        item = NUMBERED_RE.match(row)
        if item is None:
            continue
        number, raw_text = item.groups()
        # Remove any bold markers that survived the inline clean-up.
        wording = clean_inline_md(raw_text).replace("**", "")
        questions.append({"id": int(number), "text": wording})

    if questions:
        return questions
    raise RuntimeError("No numbered open questions found in paper 008")
|
||||
|
||||
|
||||
def map_to_open_questions(
    docs: list[Doc], open_questions: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Score how well each research file covers each open question.

    A sentence scores one point for every keyword of the question (from
    ``QUESTION_KEYWORDS``) that occurs in it as a substring of the
    lower-cased, Markdown-cleaned sentence. A file's score is the sum over
    its sentences; files scoring zero are omitted. The level is "high" from
    12 points, "medium" from 6 points and "low" below that. Up to three
    matching sentences per file are kept as snippets.

    Returns:
        One dict per question, ordered by question id, with the keys
        ``question_id``, ``question``, ``total_score`` and
        ``supporting_files`` (sorted by descending score).
    """
    # Sentence splitting and cleaning do not depend on the question, so do
    # them once per document instead of once per question.
    prepared: list[tuple[str, list[tuple[str, str]]]] = []
    for doc in docs:
        cleaned = [clean_inline_md(s) for s in SENTENCE_SPLIT_RE.split(doc.text)]
        prepared.append((doc.slug, [(s, s.lower()) for s in cleaned]))

    coverage: list[dict[str, Any]] = []

    for question in open_questions:
        qid = question["id"]
        keywords = QUESTION_KEYWORDS.get(qid, [])
        file_scores: list[dict[str, Any]] = []
        total = 0

        for slug, sentences in prepared:
            score = 0
            snippets: list[str] = []
            for sentence, lowered in sentences:
                hits = sum(1 for kw in keywords if kw in lowered)
                if not hits:
                    continue
                score += hits
                if len(snippets) < 3:
                    snippets.append(sentence)

            if not score:
                continue
            total += score
            if score >= 12:
                level = "high"
            elif score >= 6:
                level = "medium"
            else:
                level = "low"
            file_scores.append(
                {"file": slug, "score": score, "level": level, "snippets": snippets}
            )

        file_scores.sort(key=lambda x: x["score"], reverse=True)
        coverage.append(
            {
                "question_id": qid,
                "question": question["text"],
                "total_score": total,
                "supporting_files": file_scores,
            }
        )

    coverage.sort(key=lambda x: x["question_id"])
    return coverage
|
||||
|
||||
|
||||
def extract_strongest_challenges(docs: list[Doc]) -> list[dict[str, Any]]:
    """Rank the counterargument bullets found across all documents.

    Bullets are taken from sections whose heading contains "counterarguments"
    or "critiques". Each bullet starts at one point and gains the weight of
    every ``CHALLENGE_KEYWORDS`` phrase it contains. Bullets with the same
    text (ignoring case) are merged: their scores are added and their files
    combined.

    Returns:
        The ten highest-scoring challenges as dicts with the keys ``text``,
        ``score`` and ``files`` (sorted slugs).
    """
    by_text: dict[str, dict[str, Any]] = {}

    for doc in docs:
        for heading, body in doc.sections.items():
            if "counterarguments" not in heading and "critiques" not in heading:
                continue
            for row in body.splitlines():
                found = BULLET_RE.match(row)
                if found is None:
                    continue
                wording = clean_inline_md(found.group(1))
                folded = wording.lower()
                points = 1 + sum(
                    weight
                    for phrase, weight in CHALLENGE_KEYWORDS.items()
                    if phrase in folded
                )

                bucket = by_text.get(folded)
                if bucket is None:
                    # First occurrence fixes the displayed spelling.
                    by_text[folded] = {
                        "text": wording,
                        "score": points,
                        "files": {doc.slug},
                    }
                else:
                    bucket["score"] += points
                    bucket["files"].add(doc.slug)

    ranked = [
        {"text": b["text"], "score": b["score"], "files": sorted(b["files"])}
        for b in by_text.values()
    ]
    ranked.sort(key=lambda entry: entry["score"], reverse=True)
    return ranked[:10]
|
||||
|
||||
|
||||
def detect_emergent_themes(docs: list[Doc]) -> list[dict[str, Any]]:
    """Score a fixed set of themes by keyword frequency across all documents.

    A theme's score is the total number of occurrences of its keywords in
    the lower-cased text of all documents (plain substring counting).

    Returns:
        ``{"theme", "score"}`` dicts for the themes with a positive score,
        highest score first.
    """
    theme_keywords = {
        "Governance and agency design": [
            "agency", "democratic", "community", "policy",
            "selective adoption", "governance",
        ],
        "Economic concentration and labor shift": [
            "labor", "capital", "commodity", "automation", "class", "pricing",
        ],
        "Epistemic reliability and grounding": [
            "understand", "stochastic", "illusion", "lossy", "falsifiable",
            "evidence",
        ],
        "Civilizational lock-in and resilience": [
            "lock-in", "path dependence", "retreat", "dependency", "ratchet",
            "reversal",
        ],
    }

    corpus = "\n".join(doc.text.lower() for doc in docs)

    signals = []
    for label, words in theme_keywords.items():
        occurrences = sum(corpus.count(word) for word in words)
        if occurrences > 0:
            signals.append({"theme": label, "score": occurrences})

    return sorted(signals, key=lambda entry: entry["score"], reverse=True)
|
||||
|
||||
|
||||
def build_structured_result(
    docs: list[Doc],
    scholars: dict[str, dict[str, Any]],
    bibliography: dict[str, dict[str, Any]],
    contradictions: list[dict[str, Any]],
    open_question_coverage: list[dict[str, Any]],
    strongest_challenges: list[dict[str, Any]],
    emergent_themes: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the individual analyses into one result dictionary.

    Scholars are ranked by number of files and then by mention count;
    bibliography items by relevance; both highest first. The remaining
    inputs are passed through unchanged.

    Returns:
        Dict with the keys ``meta``, ``scholars``, ``bibliography``,
        ``contradictions``, ``open_question_coverage``,
        ``strongest_challenges`` and ``emergent_themes``.
    """

    def scholar_rank(record: dict[str, Any]) -> tuple[int, int]:
        return (len(record["files"]), record["mention_count"])

    ranked_scholars = list(scholars.values())
    ranked_scholars.sort(key=scholar_rank, reverse=True)

    ranked_bibliography = list(bibliography.values())
    ranked_bibliography.sort(key=lambda record: record["relevance"], reverse=True)

    slugs = [doc.slug for doc in docs]

    return {
        "meta": {
            "research_files": slugs,
            "research_file_count": len(docs),
        },
        "scholars": ranked_scholars,
        "bibliography": ranked_bibliography,
        "contradictions": contradictions,
        "open_question_coverage": open_question_coverage,
        "strongest_challenges": strongest_challenges,
        "emergent_themes": emergent_themes,
    }
|
||||
|
||||
|
||||
def render_digest(result: dict[str, Any]) -> str:
    """Render the integrated result as a Markdown digest.

    Sections, in order: scope, the top 20 scholars, the top 40 bibliography
    items, the contradiction report, the per-question coverage map (up to
    four files and two snippets per file) and the strongest challenges.

    The "strongest level" shown for a question is the best of its files'
    levels in the order low < medium < high; "none" when no file supports
    the question. Nested bullets are indented by two spaces so that they
    render as children of the bullet above them.

    Returns:
        The Markdown text, ending with a newline.
    """
    # Comparing the level names as plain strings would rank "medium" above
    # "high", so rank them explicitly.
    level_rank = {"low": 0, "medium": 1, "high": 2}

    meta = result["meta"]
    lines: list[str] = [
        "# Integrated Research Digest",
        "",
        "## Scope",
        f"Processed {meta['research_file_count']} research file(s): "
        + ", ".join(meta["research_files"]),
        "",
    ]

    lines.append("## Scholars by Frequency")
    for scholar in result["scholars"][:20]:
        files = ", ".join(scholar["files"])
        lines.append(
            f"- **{scholar['name']}** — files: {len(scholar['files'])}; mentions: {scholar['mention_count']}; in: {files}"
        )
    lines.append("")

    lines.append("## Unified Bibliography")
    for item in result["bibliography"][:40]:
        authors = ", ".join(item["authors"]) if item["authors"] else "Unknown"
        files = ", ".join(item["files"])
        lines.append(
            f"- **{item['title']}** ({authors}) — relevance {item['relevance']}; cited in: {files}"
        )
    lines.append("")

    lines.append("## Contradiction Report")
    if not result["contradictions"]:
        lines.append("- No cross-file contradictions detected by the current heuristic.")
        lines.append("")
    for item in result["contradictions"]:
        lines.append(f"### {item['topic']}")
        lines.append("- Supporting evidence:")
        for support in item["supports"]:
            lines.append(
                f"  - `{support['file']}` ({support['marker']}): {support['snippet']}"
            )
        lines.append("- Challenging evidence:")
        for challenge in item["challenges"]:
            lines.append(
                f"  - `{challenge['file']}` ({challenge['marker']}): {challenge['snippet']}"
            )
        lines.append("")

    lines.append("## Paper 009 Coverage Map")
    for item in result["open_question_coverage"]:
        supporting = item["supporting_files"]
        if supporting:
            max_level = max(
                (fs["level"] for fs in supporting),
                key=lambda level: level_rank.get(level, -1),
            )
        else:
            max_level = "none"
        lines.append(
            f"### Q{item['question_id']} (total score {item['total_score']}, strongest level {max_level})"
        )
        lines.append(f"{item['question']}")
        if supporting:
            for fs in supporting[:4]:
                lines.append(f"- `{fs['file']}`: score {fs['score']} ({fs['level']})")
                for snip in fs["snippets"][:2]:
                    lines.append(f"  - {snip}")
        else:
            lines.append("- No supporting material detected.")
        lines.append("")

    lines.append("## Strongest Challenges")
    if not result["strongest_challenges"]:
        lines.append("- No challenge bullets detected.")
    for item in result["strongest_challenges"]:
        lines.append(
            f"- **Score {item['score']}** ({', '.join(item['files'])}): {item['text']}"
        )

    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def render_outline(result: dict[str, Any]) -> str:
    """Render a suggested Markdown outline for Paper 009.

    Questions are ordered by descending total coverage score. The two best
    covered questions are listed as "most supported"; the "least supported"
    list takes up to two questions from the remainder, so a question is never
    listed on both sides when there are fewer than four questions. One
    section is proposed per question, followed by the top five challenges
    and the top four emergent themes.

    Returns:
        The Markdown text, ending with a newline.
    """
    coverage_sorted = sorted(
        result["open_question_coverage"], key=lambda x: x["total_score"], reverse=True
    )
    most_covered = coverage_sorted[:2]
    # Pick from what is left after the top two to avoid listing a question twice.
    least_covered = coverage_sorted[2:][-2:]

    lines: list[str] = [
        "# Suggested Outline for Paper 009",
        "",
        "## Why This Sequence",
        "Order starts with heavily-supported questions, then closes with low-coverage questions that require new argumentation or new research.",
        "",
        "## Coverage Priorities",
    ]

    # Child bullets are indented by two spaces so they nest under their parent.
    lines.append("- Most supported open questions:")
    for item in most_covered:
        lines.append(
            f"  - Q{item['question_id']} (score {item['total_score']}): {item['question']}"
        )
    if least_covered:
        lines.append("- Least supported open questions:")
        for item in least_covered:
            lines.append(
                f"  - Q{item['question_id']} (score {item['total_score']}): {item['question']}"
            )
    lines.append("")

    lines.append("## Proposed Sections")
    for item in coverage_sorted:
        qid = item["question_id"]
        lines.append(f"### Section {qid}: Q{qid}")
        lines.append(item["question"])
        supporting = item["supporting_files"]
        if supporting:
            top_files = ", ".join(fs["file"] for fs in supporting[:3])
            lines.append(f"- Primary evidence files: {top_files}")
            # First snippet of each of the two best files.
            for fs in supporting[:2]:
                for snip in fs["snippets"][:1]:
                    lines.append(f"- Anchor claim: {snip}")
        else:
            lines.append("- Primary evidence files: none detected; requires fresh synthesis.")
        lines.append("")

    lines.append("## Cross-Cutting Counterarguments To Address Explicitly")
    for challenge in result["strongest_challenges"][:5]:
        lines.append(f"- {challenge['text']} ({', '.join(challenge['files'])})")
    lines.append("")

    lines.append("## New Themes To Add Beyond Original Open Questions")
    for theme in result["emergent_themes"][:4]:
        lines.append(f"- {theme['theme']} (signal score {theme['score']})")

    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def main() -> int:
    """Run the integrator end to end and write its three output files.

    Loads the research files and the open questions of Paper 008, runs every
    analysis, and writes ``integrated.json``, ``digest.md`` and
    ``009_outline_suggestion.md`` into the output directory. Progress is
    printed to standard output.

    Returns:
        0 on success; 1 when an input cannot be read or Paper 008 does not
        contain the expected open-questions list (the reason is printed to
        standard error).
    """
    import sys  # only needed for error reporting

    args = parse_args()
    project_root = args.project_root.resolve()
    research_dir = (args.research_dir or (project_root / "research")).resolve()
    paper_008 = (args.paper_008 or (project_root / "008-the-ship-of-theseus.md")).resolve()
    out_dir = args.out_dir.resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"[integrator] project root: {project_root}")
    print(f"[integrator] research dir: {research_dir}")
    print(f"[integrator] paper 008: {paper_008}")
    print(f"[integrator] output dir: {out_dir}")

    try:
        docs = load_research_docs(research_dir)
    except (OSError, UnicodeError) as exc:
        print(f"[integrator] error: cannot read research files: {exc}", file=sys.stderr)
        return 1
    print(f"[integrator] loaded {len(docs)} research file(s)")
    if not docs:
        print("[integrator] no research files found; writing empty digest/outline")

    try:
        open_questions = extract_open_questions(paper_008)
    except (OSError, UnicodeError, RuntimeError) as exc:
        print(f"[integrator] error: cannot use paper 008: {exc}", file=sys.stderr)
        return 1
    print(f"[integrator] extracted {len(open_questions)} open question(s) from Paper 008")

    scholars = extract_scholars(docs)
    print(f"[integrator] extracted {len(scholars)} unique scholar name(s)")

    bibliography = extract_bibliography(docs)
    print(f"[integrator] extracted {len(bibliography)} bibliography item(s)")

    contradictions = detect_contradictions(docs)
    print(f"[integrator] detected {len(contradictions)} contradiction topic(s)")

    coverage = map_to_open_questions(docs, open_questions)
    print("[integrator] mapped research evidence to Paper 008 open questions")

    strongest_challenges = extract_strongest_challenges(docs)
    print(f"[integrator] ranked {len(strongest_challenges)} strongest challenge(s)")

    emergent_themes = detect_emergent_themes(docs)
    print(f"[integrator] found {len(emergent_themes)} emergent theme(s)")

    result = build_structured_result(
        docs,
        scholars,
        bibliography,
        contradictions,
        coverage,
        strongest_challenges,
        emergent_themes,
    )

    json_path = out_dir / "integrated.json"
    digest_path = out_dir / "digest.md"
    outline_path = out_dir / "009_outline_suggestion.md"

    # The file is written as UTF-8, so keep non-ASCII characters readable
    # instead of escaping them as \uXXXX sequences.
    json_path.write_text(
        json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    digest_path.write_text(render_digest(result), encoding="utf-8")
    outline_path.write_text(render_outline(result), encoding="utf-8")

    print(f"[integrator] wrote {json_path}")
    print(f"[integrator] wrote {digest_path}")
    print(f"[integrator] wrote {outline_path}")

    return 0
|
||||
|
||||
|
||||
# Script entry point: run main() only when the file is executed directly and
# use its integer return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user