#!/usr/bin/env python3
"""Integrate research markdown files into a unified digest for Paper 009 planning."""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

# Horizontal whitespace only: ``\s`` would let an empty ``#`` line swallow the
# following line as its heading text.
HEADING_RE = re.compile(r"^(#{1,6})[ \t]+(.+?)[ \t]*$", re.MULTILINE)
BULLET_RE = re.compile(r"^\s*[-*]\s+(.+?)\s*$")
NUMBERED_RE = re.compile(r"^\s*(\d+)\.\s+(.+?)\s*$")
SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
NAME_RE = re.compile(
    r"\b(?:[A-Z](?:\.[A-Z])+\.?|[A-Z][a-zA-Z'-]+)"
    r"(?:\s+(?:[A-Z](?:\.[A-Z])+\.?|[A-Z][a-zA-Z'-]+)){1,3}\b"
)
# A single-star emphasis span; the lookarounds keep it from matching the inner
# stars of a ``**bold**`` span.
ITALIC_RE = re.compile(r"(?<!\*)\*([^*]+)\*(?!\*)")

# Capitalised words that look like names to NAME_RE but are section vocabulary.
BAD_NAME_WORDS = {
    "executive",
    "summary",
    "task",
    "sources",
    "paper",
    "physics",
    "technology",
    "society",
    "logs",
    "pricing",
    "history",
    "quantum",
    "analysis",
}

# Each rule pairs phrases that support a thesis with phrases that challenge it.
TOPIC_RULES = [
    {
        "id": "determinism_vs_agency",
        "label": "Technological determinism vs social agency",
        "pro_markers": [
            "autonomous technique",
            "irreversible",
            "lock-in",
            "path dependence",
            "ratchet",
            "structurally fixed",
            "cannot reverse",
        ],
        "con_markers": [
            "social construct",
            "interpretive flexibility",
            "democratic rationalization",
            "human agency",
            "selective adoption",
            "tool taming",
            "re-shaped",
            "can change",
        ],
    },
    {
        "id": "unification_vs_homogenization",
        "label": "Knowledge unification vs statistical homogenization",
        "pro_markers": [
            "knowledge unification",
            "integration layer",
            "interconnectedness",
            "consilience",
            "compiled",
            "coherent",
        ],
        "con_markers": [
            "stochastic parrot",
            "homogenization",
            "illusion",
            "veneer",
            "lossy",
            "lacks understanding",
            "database lookup",
        ],
    },
    {
        "id": "cognition_commodity_vs_mimicry",
        "label": "AI cognition commodity vs token mimicry",
        "pro_markers": [
            "cognition as a commodity",
            "price of thinking",
            "task-based framework",
            "automation",
            "productivity",
            "cognitive offloading",
        ],
        "con_markers": [
            "stochastic parrot",
            "doesn't think",
            "mimicry",
            "predicts tokens",
            "no cognitive model",
        ],
    },
    {
        "id": "retrocausal_attractor",
        "label": "Teleological attractor vs unfalsifiable retrocausality",
        "pro_markers": [
            "teleological attractor",
            "retrocausal",
            "omega point",
            "final cause",
            "participatory universe",
            "transactional interpretation",
        ],
        "con_markers": [
            "unfalsifiability",
            "pseudoscience",
            "woo",
            "causality violation",
            "superdeterminism",
        ],
    },
    {
        "id": "efficiency_vs_jevons",
        "label": "Efficiency frees time vs Jevons expansion",
        "pro_markers": [
            "efficiency gains",
            "free up human time",
            "productivity",
            "surplus",
            "cost disease",
        ],
        "con_markers": [
            "jevons paradox",
            "increased consumption",
            "reasoning inflation",
            "more complex systems",
            "dependency",
        ],
    },
]

# Phrase -> weight added to a counterargument bullet's score when present.
CHALLENGE_KEYWORDS = {
    "unfalsifiable": 5,
    "dogma": 4,
    "pseudoscience": 5,
    "illusion": 4,
    "mimicry": 4,
    "lacks understanding": 4,
    "circular": 3,
    "causality violation": 4,
    "superdeterminism": 3,
    "lossy": 2,
    "stochastic parrot": 5,
}

# Open-question number -> substrings counted as evidence for that question.
QUESTION_KEYWORDS = {
    1: [
        "falsifiable",
        "falsifiability",
        "unification",
        "replacement",
        "fragment",
        "distort",
        "evidence",
        "test",
        "stochastic",
    ],
    2: [
        "identity",
        "human",
        "consciousness",
        "agency",
        "values",
        "pragmatic",
        "continuity",
        "survival",
    ],
    3: [
        "individual",
        "workers",
        "labor",
        "skills",
        "strategy",
        "governance",
        "practical",
        "action",
    ],
    4: [
        "cheating",
        "tools",
        "dependency",
        "ratchet",
        "adoption",
        "ethics",
        "norm",
    ],
    5: [
        "timeline",
        "threshold",
        "when",
        "prediction",
        "curve",
        "years",
        "exponential",
        "phase",
    ],
}

# Ordering for coverage levels; plain string comparison would rank
# "medium" above "high".
_LEVEL_RANK = {"low": 1, "medium": 2, "high": 3}


@dataclass
class Doc:
    """One research markdown file loaded from disk."""

    path: Path  # location of the source file
    slug: str  # file stem, used as the short identifier in reports
    title: str  # text of the first "# " heading, or the stem if none
    text: str  # raw file contents
    sections: dict[str, str]  # lower-cased heading -> section body


def parse_args() -> argparse.Namespace:
    """Parse the command-line options.

    Returns:
        Namespace with ``project_root``, ``research_dir``, ``paper_008`` and
        ``out_dir``. ``research_dir`` and ``paper_008`` are ``None`` unless
        given; ``out_dir`` defaults to this script's directory.
    """
    script_path = Path(__file__).resolve()
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--project-root", type=Path, default=script_path.parents[2])
    parser.add_argument("--research-dir", type=Path)
    parser.add_argument("--paper-008", type=Path)
    parser.add_argument("--out-dir", type=Path, default=script_path.parent)
    return parser.parse_args()


def clean_inline_md(text: str) -> str:
    """Strip inline markdown (code, bold, italic, links) and collapse whitespace.

    Args:
        text: A markdown fragment.

    Returns:
        The fragment with backticks, ``**``/``*`` emphasis markers and link
        targets removed, and every whitespace run reduced to a single space.
    """
    text = text.strip()
    text = re.sub(r"`([^`]+)`", r"\1", text)
    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
    text = re.sub(r"\*([^*]+)\*", r"\1", text)
    text = re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", text)
    return re.sub(r"\s+", " ", text).strip()


def extract_sections(text: str) -> dict[str, str]:
    """Split markdown into sections keyed by cleaned, lower-cased heading.

    A section body runs from the end of its heading line to the start of the
    next heading of any level. Bodies of headings that occur more than once
    are joined with a blank line so that no occurrence is dropped.

    Args:
        text: Full markdown document.

    Returns:
        Mapping of heading text to stripped body; empty if there are no headings.
    """
    matches = list(HEADING_RE.finditer(text))
    sections: dict[str, str] = {}
    for idx, match in enumerate(matches):
        heading = clean_inline_md(match.group(2)).lower()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        body = text[match.end():end].strip()
        previous = sections.get(heading)
        if previous:
            # Repeated heading: keep the earlier body and append the new one.
            sections[heading] = f"{previous}\n\n{body}" if body else previous
        else:
            sections[heading] = body
    return sections


def load_research_docs(research_dir: Path) -> list[Doc]:
    """Load every ``*.md`` file directly inside *research_dir*, sorted by path.

    Args:
        research_dir: Directory containing the research notes.

    Returns:
        One ``Doc`` per file. The title is the first line starting with "# ",
        falling back to the file stem.
    """
    docs: list[Doc] = []
    for path in sorted(research_dir.glob("*.md")):
        raw = path.read_text(encoding="utf-8")
        title = next(
            (
                clean_inline_md(line[2:])
                for line in raw.splitlines()
                if line.startswith("# ")
            ),
            path.stem,
        )
        docs.append(
            Doc(
                path=path,
                slug=path.stem,
                title=title,
                text=raw,
                sections=extract_sections(raw),
            )
        )
    return docs


def normalize_person_name(name: str) -> str:
    """Return a lower-cased key for a person name, used for de-duplication.

    Removes "et al.", parenthesised text and any character that is not a
    letter, space, period, apostrophe or hyphen; "&" becomes "and".
    """
    stripped = re.sub(r"\bet al\.?", "", name, flags=re.IGNORECASE)
    stripped = stripped.replace("&", " and ")
    stripped = re.sub(r"\([^)]*\)", "", stripped)
    stripped = re.sub(r"[^A-Za-z .'-]", " ", stripped)
    stripped = re.sub(r"\s+", " ", stripped).strip()
    return stripped.lower()


def split_possible_names(chunk: str) -> list[str]:
    """Extract likely person names from the lead text of a bullet.

    Only the part before the first colon is considered; parenthesised and
    double-quoted text is removed first.

    Args:
        chunk: Markdown fragment expected to start with one or more names.

    Returns:
        Multi-word capitalised names that contain no word from
        ``BAD_NAME_WORDS``. If none are found, a single capitalised token
        longer than three characters is accepted as a fallback.
    """
    chunk = clean_inline_md(chunk)
    chunk = chunk.split(":", 1)[0]
    chunk = re.sub(r"\([^)]*\)", "", chunk)
    chunk = re.sub(r'"[^"]+"', "", chunk)
    chunk = chunk.replace("&", " and ")

    out: list[str] = []
    for name in NAME_RE.findall(chunk):
        name = re.sub(r"\s+", " ", name).strip(" .,:;")
        words = [w for w in name.split() if w and w[0].isalpha()]
        if len(words) >= 2 and not any(w.lower() in BAD_NAME_WORDS for w in words):
            out.append(" ".join(words))

    if not out:
        # Fallback for mononyms / surname-only bullets.
        single = re.sub(r"[^A-Za-z'-]", "", chunk).strip()
        if (
            single
            and single[0].isupper()
            and single.lower() not in BAD_NAME_WORDS
            and len(single) > 3
        ):
            out.append(single)
    return out


def extract_scholars(docs: list[Doc]) -> dict[str, dict[str, Any]]:
    """Collect scholars named in "key scholars" sections across all documents.

    A bullet contributes its first bold span if it has one, otherwise the text
    before its first colon. For each name, sentences of the same document that
    contain the surname (case-insensitive substring) are counted as mentions.

    Args:
        docs: Loaded research documents.

    Returns:
        Mapping of normalised name to an entry with ``name``, ``aliases``
        (sorted list), ``files`` (sorted list), ``mention_count`` and
        ``contexts`` (up to three snippets per file).
    """
    scholars: dict[str, dict[str, Any]] = {}
    for doc in docs:
        candidates: list[str] = []
        for heading, section in doc.sections.items():
            if "key scholars" not in heading:
                continue
            for line in section.splitlines():
                bullet_match = BULLET_RE.match(line)
                if not bullet_match:
                    continue
                raw = bullet_match.group(1)
                bold_match = re.search(r"\*\*([^*]+)\*\*", raw)
                if bold_match:
                    candidates.extend(split_possible_names(bold_match.group(1)))
                else:
                    lead = clean_inline_md(raw).split(":", 1)[0]
                    candidates.extend(split_possible_names(lead))

        unique = sorted(set(candidates))
        if not unique:
            continue

        # Split and clean the document once instead of once per scholar.
        text_lower = doc.text.lower()
        cleaned = [clean_inline_md(s) for s in SENTENCE_SPLIT_RE.split(doc.text)]
        sentences = [(sentence, sentence.lower()) for sentence in cleaned]

        for name in unique:
            key = normalize_person_name(name)
            if not key:
                continue
            entry = scholars.setdefault(
                key,
                {
                    "name": name,
                    "aliases": set(),
                    "files": set(),
                    "mention_count": 0,
                    "contexts": [],
                },
            )
            entry["aliases"].add(name)
            entry["files"].add(doc.slug)

            surname = name.split()[-1].lower().strip(".,")
            local_mentions = [
                sentence
                for sentence, lowered in sentences
                if surname and surname in lowered
            ]
            if not local_mentions:
                # Still count the listing itself so every scholar has a mention.
                if surname and surname in text_lower:
                    local_mentions = [f"Mentioned in {doc.slug}"]
                else:
                    local_mentions = [f"Listed in {doc.slug}"]

            entry["mention_count"] += len(local_mentions)
            for snippet in local_mentions[:3]:
                entry["contexts"].append({"file": doc.slug, "snippet": snippet})

    # Sets are not JSON-serialisable; convert to sorted lists.
    for entry in scholars.values():
        entry["aliases"] = sorted(entry["aliases"])
        entry["files"] = sorted(entry["files"])
    return scholars


def extract_title_from_source_line(line: str) -> str:
    """Guess the work title from one bibliography bullet.

    Tried in order: the first double-quoted span, the first single-star italic
    span, the text after a ``(YYYY)`` year up to the next period, and finally
    the whole cleaned line.

    Args:
        line: Raw bullet text (markdown).

    Returns:
        The extracted title with inline markdown removed.
    """
    quoted = re.findall(r'"([^"]+)"', line)
    if quoted:
        return clean_inline_md(quoted[0])

    # ITALIC_RE ignores ``**bold**`` so a bolded author is not taken as title.
    italic = ITALIC_RE.findall(line)
    if italic:
        return clean_inline_md(italic[0])

    line_clean = clean_inline_md(line)
    year_match = re.search(r"\(\d{4}\)\.?", line_clean)
    if year_match:
        tail = line_clean[year_match.end():].strip(" .:-")
        if tail:
            return tail.split(".", 1)[0].strip()
    return line_clean


def extract_authors_from_source_line(line: str) -> list[str]:
    """Extract author strings from one bibliography bullet.

    The text before the first ``(YYYY)`` year (or the whole line when there is
    no year) is split on "and", "&" and ";" after removing "et al.".

    Args:
        line: Raw bullet text (markdown).

    Returns:
        Author fragments that contain at least one letter.
    """
    line_clean = clean_inline_md(line)
    year_match = re.search(r"\(\d{4}\)", line_clean)
    head = line_clean[: year_match.start()].strip() if year_match else line_clean
    head = head.replace("&", " and ")
    head = re.sub(r"\bet al\.?", "", head, flags=re.IGNORECASE)
    parts = (part.strip(" ,.-") for part in re.split(r"\band\b|;", head))
    return [part for part in parts if part and re.search(r"[A-Za-z]", part)]


def normalize_title(title: str) -> str:
    """Return a lower-cased, punctuation-free key for matching titles."""
    title = title.lower()
    title = re.sub(r"[^a-z0-9 ]", " ", title)
    return re.sub(r"\s+", " ", title).strip()


def extract_bibliography(docs: list[Doc]) -> dict[str, dict[str, Any]]:
    """Merge the bullets of every section whose heading starts with "sources".

    Args:
        docs: Loaded research documents.

    Returns:
        Mapping of normalised title to an entry with ``title``, ``authors``
        (sorted list), ``files`` (sorted list), ``raw_mentions`` and
        ``relevance`` (two points per citing file plus one per mention).
    """
    bibliography: dict[str, dict[str, Any]] = {}
    for doc in docs:
        for heading, section in doc.sections.items():
            if not heading.startswith("sources"):
                continue
            for line in section.splitlines():
                bullet = BULLET_RE.match(line)
                if not bullet:
                    continue
                raw = bullet.group(1)
                title = extract_title_from_source_line(raw)
                key = normalize_title(title) if title else ""
                if not key:
                    continue
                entry = bibliography.setdefault(
                    key,
                    {
                        "title": title,
                        "authors": set(),
                        "files": set(),
                        "raw_mentions": [],
                    },
                )
                entry["files"].add(doc.slug)
                entry["raw_mentions"].append(clean_inline_md(raw))
                entry["authors"].update(extract_authors_from_source_line(raw))

    for entry in bibliography.values():
        entry["authors"] = sorted(entry["authors"])
        entry["files"] = sorted(entry["files"])
        entry["relevance"] = len(entry["files"]) * 2 + len(entry["raw_mentions"])
    return bibliography


def first_sentence_with_marker(text: str, marker: str) -> str | None:
    """Return the first sentence of *text* containing *marker*, cleaned.

    Args:
        text: Document text.
        marker: Lower-case phrase to look for (matched case-insensitively).

    Returns:
        The cleaned sentence, or ``None`` when no single sentence contains it.
    """
    for sentence in SENTENCE_SPLIT_RE.split(text):
        if marker in sentence.lower():
            return clean_inline_md(sentence)
    return None


def _first_marker_evidence(doc: Doc, markers: list[str]) -> dict[str, str] | None:
    """Return evidence for the first marker in *markers* found in *doc*.

    Only the first marker present in the text is considered; if no single
    sentence contains it, the document yields no evidence.
    """
    text_lower = doc.text.lower()
    for marker in markers:
        if marker in text_lower:
            snippet = first_sentence_with_marker(doc.text, marker)
            if snippet:
                return {"file": doc.slug, "marker": marker, "snippet": snippet}
            return None
    return None


def detect_contradictions(docs: list[Doc]) -> list[dict[str, Any]]:
    """Find topics where some files only support and others only challenge.

    For every rule in ``TOPIC_RULES`` each document contributes at most one
    supporting and one challenging snippet. A topic is reported when at least
    one file is exclusively supporting and at least one exclusively challenging.

    Args:
        docs: Loaded research documents.

    Returns:
        One dict per contradictory topic with up to four snippets per side and
        the sorted file lists for each side.
    """
    contradictions: list[dict[str, Any]] = []
    for rule in TOPIC_RULES:
        pro_evidence: list[dict[str, str]] = []
        con_evidence: list[dict[str, str]] = []
        for doc in docs:
            pro = _first_marker_evidence(doc, rule["pro_markers"])
            if pro:
                pro_evidence.append(pro)
            con = _first_marker_evidence(doc, rule["con_markers"])
            if con:
                con_evidence.append(con)

        pro_files = {item["file"] for item in pro_evidence}
        con_files = {item["file"] for item in con_evidence}
        if pro_files - con_files and con_files - pro_files:
            contradictions.append(
                {
                    "topic": rule["label"],
                    "topic_id": rule["id"],
                    "supports": pro_evidence[:4],
                    "challenges": con_evidence[:4],
                    "supporting_files": sorted(pro_files),
                    "challenging_files": sorted(con_files),
                }
            )
    return contradictions


def extract_open_questions(paper_008: Path) -> list[dict[str, Any]]:
    """Read the numbered open questions from Paper 008.

    Args:
        paper_008: Path to the Paper 008 markdown file.

    Returns:
        ``{"id": int, "text": str}`` for each numbered line of the
        "## Open Questions for Paper 009" section.

    Raises:
        RuntimeError: If the section is missing or holds no numbered items.
    """
    text = paper_008.read_text(encoding="utf-8")
    marker = "## Open Questions for Paper 009"
    if marker not in text:
        raise RuntimeError("Could not find 'Open Questions for Paper 009' in paper 008")

    section = text.split(marker, 1)[1]
    next_header = re.search(r"\n##\s+", section)
    if next_header:
        section = section[: next_header.start()]

    questions: list[dict[str, Any]] = []
    for line in section.splitlines():
        match = NUMBERED_RE.match(line)
        if not match:
            continue
        # Drop any unbalanced "**" that clean_inline_md could not pair up.
        body = clean_inline_md(match.group(2)).replace("**", "")
        questions.append({"id": int(match.group(1)), "text": body})

    if not questions:
        raise RuntimeError("No numbered open questions found in paper 008")
    return questions


def map_to_open_questions(
    docs: list[Doc], open_questions: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Score how much each document says about each open question.

    A document's score for a question is the number of keyword hits
    (substring matches from ``QUESTION_KEYWORDS``) summed over its sentences;
    12 or more is "high", 6 or more "medium", otherwise "low".

    Args:
        docs: Loaded research documents.
        open_questions: Output of ``extract_open_questions``.

    Returns:
        One dict per question, ordered by question id, with ``total_score``
        and ``supporting_files`` sorted by descending score.
    """
    # Clean and lower-case each sentence once; reused for every question.
    prepared = []
    for doc in docs:
        cleaned = [clean_inline_md(s) for s in SENTENCE_SPLIT_RE.split(doc.text)]
        prepared.append((doc, [(sentence, sentence.lower()) for sentence in cleaned]))

    coverage: list[dict[str, Any]] = []
    for question in open_questions:
        qid = question["id"]
        keywords = QUESTION_KEYWORDS.get(qid, [])
        file_scores: list[dict[str, Any]] = []
        total = 0
        for doc, sentences in prepared:
            score = 0
            snippets: list[str] = []
            for sentence, lowered in sentences:
                hits = sum(1 for kw in keywords if kw in lowered)
                if hits:
                    score += hits
                    if len(snippets) < 3:
                        snippets.append(sentence)
            if not score:
                continue
            total += score
            if score >= 12:
                level = "high"
            elif score >= 6:
                level = "medium"
            else:
                level = "low"
            file_scores.append(
                {
                    "file": doc.slug,
                    "score": score,
                    "level": level,
                    "snippets": snippets,
                }
            )
        file_scores.sort(key=lambda x: x["score"], reverse=True)
        coverage.append(
            {
                "question_id": qid,
                "question": question["text"],
                "total_score": total,
                "supporting_files": file_scores,
            }
        )
    coverage.sort(key=lambda x: x["question_id"])
    return coverage


def extract_strongest_challenges(docs: list[Doc]) -> list[dict[str, Any]]:
    """Rank counterargument bullets by weighted keyword score.

    Bullets come from sections whose heading contains "counterarguments" or
    "critiques". Each starts at score 1 plus the weight of every
    ``CHALLENGE_KEYWORDS`` phrase it contains; identical bullets
    (case-insensitive) from several files have their scores summed.

    Args:
        docs: Loaded research documents.

    Returns:
        Up to ten dicts with ``text``, ``score`` and sorted ``files``,
        highest score first.
    """
    merged: dict[str, dict[str, Any]] = {}
    for doc in docs:
        for heading, section in doc.sections.items():
            if "counterarguments" not in heading and "critiques" not in heading:
                continue
            for line in section.splitlines():
                bullet = BULLET_RE.match(line)
                if not bullet:
                    continue
                text = clean_inline_md(bullet.group(1))
                lower = text.lower()
                score = 1 + sum(
                    weight
                    for keyword, weight in CHALLENGE_KEYWORDS.items()
                    if keyword in lower
                )
                existing = merged.get(lower)
                if existing is None:
                    merged[lower] = {"text": text, "score": score, "files": {doc.slug}}
                else:
                    existing["score"] += score
                    existing["files"].add(doc.slug)

    ranked = sorted(
        (
            {
                "text": item["text"],
                "score": item["score"],
                "files": sorted(item["files"]),
            }
            for item in merged.values()
        ),
        key=lambda x: x["score"],
        reverse=True,
    )
    return ranked[:10]


def detect_emergent_themes(docs: list[Doc]) -> list[dict[str, Any]]:
    """Score fixed cross-cutting themes by keyword frequency over all documents.

    Args:
        docs: Loaded research documents.

    Returns:
        ``{"theme": str, "score": int}`` for every theme with at least one
        keyword occurrence, highest score first.
    """
    themes = {
        "Governance and agency design": [
            "agency",
            "democratic",
            "community",
            "policy",
            "selective adoption",
            "governance",
        ],
        "Economic concentration and labor shift": [
            "labor",
            "capital",
            "commodity",
            "automation",
            "class",
            "pricing",
        ],
        "Epistemic reliability and grounding": [
            "understand",
            "stochastic",
            "illusion",
            "lossy",
            "falsifiable",
            "evidence",
        ],
        "Civilizational lock-in and resilience": [
            "lock-in",
            "path dependence",
            "retreat",
            "dependency",
            "ratchet",
            "reversal",
        ],
    }
    corpus = "\n".join(doc.text.lower() for doc in docs)
    scored: list[dict[str, Any]] = []
    for theme, keywords in themes.items():
        score = sum(corpus.count(k) for k in keywords)
        if score > 0:
            scored.append({"theme": theme, "score": score})
    scored.sort(key=lambda x: x["score"], reverse=True)
    return scored


def build_structured_result(
    docs: list[Doc],
    scholars: dict[str, dict[str, Any]],
    bibliography: dict[str, dict[str, Any]],
    contradictions: list[dict[str, Any]],
    open_question_coverage: list[dict[str, Any]],
    strongest_challenges: list[dict[str, Any]],
    emergent_themes: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble all analysis outputs into one JSON-serialisable dict.

    Scholars are ranked by number of files, then mention count; bibliography
    items by relevance. The remaining inputs are passed through unchanged.
    """
    scholars_ranked = sorted(
        scholars.values(),
        key=lambda s: (len(s["files"]), s["mention_count"]),
        reverse=True,
    )
    bibliography_ranked = sorted(
        bibliography.values(), key=lambda b: b["relevance"], reverse=True
    )
    return {
        "meta": {
            "research_files": [doc.slug for doc in docs],
            "research_file_count": len(docs),
        },
        "scholars": scholars_ranked,
        "bibliography": bibliography_ranked,
        "contradictions": contradictions,
        "open_question_coverage": open_question_coverage,
        "strongest_challenges": strongest_challenges,
        "emergent_themes": emergent_themes,
    }


def render_digest(result: dict[str, Any]) -> str:
    """Render the structured result as the markdown digest.

    Args:
        result: Output of ``build_structured_result``.

    Returns:
        Markdown text ending with a newline. Nested bullets are indented two
        spaces so they render as children of the bullet above them.
    """
    lines: list[str] = []
    lines.append("# Integrated Research Digest")
    lines.append("")
    lines.append("## Scope")
    lines.append(
        f"Processed {result['meta']['research_file_count']} research file(s): "
        + ", ".join(result["meta"]["research_files"])
    )
    lines.append("")

    lines.append("## Scholars by Frequency")
    for scholar in result["scholars"][:20]:
        files = ", ".join(scholar["files"])
        lines.append(
            f"- **{scholar['name']}** — files: {len(scholar['files'])}; "
            f"mentions: {scholar['mention_count']}; in: {files}"
        )
    lines.append("")

    lines.append("## Unified Bibliography")
    for item in result["bibliography"][:40]:
        authors = ", ".join(item["authors"]) if item["authors"] else "Unknown"
        files = ", ".join(item["files"])
        lines.append(
            f"- **{item['title']}** ({authors}) — relevance {item['relevance']}; "
            f"cited in: {files}"
        )
    lines.append("")

    lines.append("## Contradiction Report")
    if not result["contradictions"]:
        lines.append("- No cross-file contradictions detected by the current heuristic.")
        lines.append("")
    for item in result["contradictions"]:
        lines.append(f"### {item['topic']}")
        lines.append("- Supporting evidence:")
        for support in item["supports"]:
            lines.append(
                f"  - `{support['file']}` ({support['marker']}): {support['snippet']}"
            )
        lines.append("- Challenging evidence:")
        for challenge in item["challenges"]:
            lines.append(
                f"  - `{challenge['file']}` ({challenge['marker']}): "
                f"{challenge['snippet']}"
            )
        lines.append("")

    lines.append("## Paper 009 Coverage Map")
    for item in result["open_question_coverage"]:
        supporting = item["supporting_files"]
        if supporting:
            # Rank explicitly: as plain strings "medium" sorts above "high".
            max_level = max(
                (fs["level"] for fs in supporting),
                key=lambda level: _LEVEL_RANK.get(level, 0),
            )
        else:
            max_level = "none"
        lines.append(
            f"### Q{item['question_id']} (total score {item['total_score']}, "
            f"strongest level {max_level})"
        )
        lines.append(f"{item['question']}")
        if supporting:
            for fs in supporting[:4]:
                lines.append(f"- `{fs['file']}`: score {fs['score']} ({fs['level']})")
                for snip in fs["snippets"][:2]:
                    lines.append(f"  - {snip}")
        else:
            lines.append("- No supporting material detected.")
        lines.append("")

    lines.append("## Strongest Challenges")
    if not result["strongest_challenges"]:
        lines.append("- No challenge bullets detected.")
    for item in result["strongest_challenges"]:
        lines.append(
            f"- **Score {item['score']}** ({', '.join(item['files'])}): {item['text']}"
        )
    return "\n".join(lines) + "\n"


def render_outline(result: dict[str, Any]) -> str:
    """Render a suggested Paper 009 outline ordered by evidence coverage.

    Args:
        result: Output of ``build_structured_result``.

    Returns:
        Markdown text ending with a newline: the two most and two least
        covered questions, one proposed section per question (best covered
        first), the top five challenges and the top four emergent themes.
    """
    coverage_sorted = sorted(
        result["open_question_coverage"], key=lambda x: x["total_score"], reverse=True
    )
    most_covered = coverage_sorted[:2]
    least_covered = coverage_sorted[-2:] if len(coverage_sorted) >= 2 else coverage_sorted

    lines: list[str] = []
    lines.append("# Suggested Outline for Paper 009")
    lines.append("")
    lines.append("## Why This Sequence")
    lines.append(
        "Order starts with heavily-supported questions, then closes with low-coverage "
        "questions that require new argumentation or new research."
    )
    lines.append("")

    lines.append("## Coverage Priorities")
    lines.append("- Most supported open questions:")
    for item in most_covered:
        lines.append(
            f"  - Q{item['question_id']} (score {item['total_score']}): {item['question']}"
        )
    lines.append("- Least supported open questions:")
    for item in least_covered:
        lines.append(
            f"  - Q{item['question_id']} (score {item['total_score']}): {item['question']}"
        )
    lines.append("")

    lines.append("## Proposed Sections")
    for item in coverage_sorted:
        qid = item["question_id"]
        lines.append(f"### Section {qid}: Q{qid}")
        lines.append(item["question"])
        if item["supporting_files"]:
            top_files = ", ".join(fs["file"] for fs in item["supporting_files"][:3])
            lines.append(f"- Primary evidence files: {top_files}")
            # One anchor snippet from each of the two best-scoring files.
            for fs in item["supporting_files"][:2]:
                for snip in fs["snippets"][:1]:
                    lines.append(f"- Anchor claim: {snip}")
        else:
            lines.append("- Primary evidence files: none detected; requires fresh synthesis.")
        lines.append("")

    lines.append("## Cross-Cutting Counterarguments To Address Explicitly")
    for challenge in result["strongest_challenges"][:5]:
        lines.append(f"- {challenge['text']} ({', '.join(challenge['files'])})")
    lines.append("")

    lines.append("## New Themes To Add Beyond Original Open Questions")
    for theme in result["emergent_themes"][:4]:
        lines.append(f"- {theme['theme']} (signal score {theme['score']})")
    return "\n".join(lines) + "\n"


def main() -> int:
    """Run the full pipeline and write the JSON, digest and outline files.

    Returns:
        Process exit status (always 0; failures propagate as exceptions).
    """
    args = parse_args()
    project_root = args.project_root.resolve()
    research_dir = (args.research_dir or (project_root / "research")).resolve()
    paper_008 = (args.paper_008 or (project_root / "008-the-ship-of-theseus.md")).resolve()
    out_dir = args.out_dir.resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"[integrator] project root: {project_root}")
    print(f"[integrator] research dir: {research_dir}")
    print(f"[integrator] paper 008: {paper_008}")
    print(f"[integrator] output dir: {out_dir}")

    docs = load_research_docs(research_dir)
    print(f"[integrator] loaded {len(docs)} research file(s)")
    if not docs:
        print("[integrator] no research files found; writing empty digest/outline")

    open_questions = extract_open_questions(paper_008)
    print(f"[integrator] extracted {len(open_questions)} open question(s) from Paper 008")

    scholars = extract_scholars(docs)
    print(f"[integrator] extracted {len(scholars)} unique scholar name(s)")

    bibliography = extract_bibliography(docs)
    print(f"[integrator] extracted {len(bibliography)} bibliography item(s)")

    contradictions = detect_contradictions(docs)
    print(f"[integrator] detected {len(contradictions)} contradiction topic(s)")

    coverage = map_to_open_questions(docs, open_questions)
    print("[integrator] mapped research evidence to Paper 008 open questions")

    strongest_challenges = extract_strongest_challenges(docs)
    print(f"[integrator] ranked {len(strongest_challenges)} strongest challenge(s)")

    emergent_themes = detect_emergent_themes(docs)
    print(f"[integrator] found {len(emergent_themes)} emergent theme(s)")

    result = build_structured_result(
        docs,
        scholars,
        bibliography,
        contradictions,
        coverage,
        strongest_challenges,
        emergent_themes,
    )

    json_path = out_dir / "integrated.json"
    digest_path = out_dir / "digest.md"
    outline_path = out_dir / "009_outline_suggestion.md"

    # The file is written as UTF-8, so keep non-ASCII text readable.
    json_path.write_text(
        json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    digest_path.write_text(render_digest(result), encoding="utf-8")
    outline_path.write_text(render_outline(result), encoding="utf-8")

    print(f"[integrator] wrote {json_path}")
    print(f"[integrator] wrote {digest_path}")
    print(f"[integrator] wrote {outline_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())