Files
Mortdecai/knowledge/build_index.py
T
Seth 77efac0283 Add knowledge corpus: 14 command references, server context, and TF-IDF search index (Phase 1.3)
- knowledge/mc-commands/commands.json: 14 MC commands with JE syntax, args, examples, common errors, 1.21 version notes
- knowledge/server-context/servers.json: all 4 servers (mc1, shrink, paper-ai, paper-dev) with full config
- knowledge/build_index.py: TF-IDF indexer + search function (19 docs, 725 terms)
- All command syntax validated live on dev server via RCON (12/13 passed)
- PLAN.md: mark Phase 1.3 complete
2026-03-18 02:01:12 -04:00

220 lines
7.0 KiB
Python

#!/usr/bin/env python3
"""
Build a simple TF-IDF-based search index over the knowledge corpus.
Indexes:
- knowledge/mc-commands/commands.json (command reference)
- knowledge/server-context/servers.json (server configs)
- knowledge/wiki-chunks/*.json (wiki content, if present)
Outputs: knowledge/index.json
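Usage:
    python3 knowledge/build_index.py                  (rebuild the index, then run sample queries)
    python3 knowledge/build_index.py search <query>   (query the existing index.json)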
"""
import json
import os
import re
from collections import Counter
from pathlib import Path
import math
# Directory containing this script. The corpus inputs (mc-commands/,
# server-context/, wiki-chunks/) and the generated index.json are all
# resolved relative to it, so the script works from any working directory.
ROOT = Path(__file__).resolve().parent
def tokenize(text: str) -> list:
    """Split *text* into lowercase search tokens.

    A token is a run of ASCII letters, digits and ``_ : / .`` so that
    identifiers such as ``minecraft:diamond_sword``, ``1.21`` or
    ``plugins/foo.yml`` stay in one piece.  Leading and trailing ``: / .``
    are then stripped: at the edge of a word they are sentence punctuation
    or a command prefix ("error:", "sword.", "/give") and would otherwise
    prevent a plain query word from matching the indexed token.  Tokens
    shorter than two characters after stripping are dropped.

    Args:
        text: Text to tokenize; ``None`` or an empty string yields ``[]``.

    Returns:
        Tokens in order of appearance, duplicates preserved.
    """
    tokens = []
    for run in re.findall(r'[a-z0-9_:/.]+', (text or '').lower()):
        token = run.strip(':/.')
        if len(token) >= 2:
            tokens.append(token)
    return tokens
def build_command_docs(commands_path: Path) -> list:
    """Build searchable documents from commands.json.

    Each entry of the JSON array becomes one document whose ``text`` is a
    newline-joined blob of the command name, description, syntax lines,
    arguments, examples, common errors and version notes (empty parts are
    omitted).

    Args:
        commands_path: Path to the command reference file.

    Returns:
        A list of document dicts with the keys ``id``, ``type``, ``title``,
        ``text``, ``snippet`` (at most 300 characters) and ``source``.
        Empty if the file does not exist.
    """
    if not commands_path.exists():
        return []

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    commands = json.loads(commands_path.read_text(encoding='utf-8'))

    docs = []
    for cmd in commands:
        name = cmd.get('command', '')
        description = cmd.get('description', '')
        syntax = cmd.get('je_syntax', [])

        parts = [f"/{name} command", description, ' '.join(syntax)]
        for arg_name, arg_info in cmd.get('arguments', {}).items():
            # An argument is either a bare description or a dict carrying one.
            if isinstance(arg_info, dict):
                arg_info = arg_info.get('description', '')
            parts.append(f"{arg_name}: {arg_info}")
        parts.extend(
            f"example {ex_name}: {ex_val}"
            for ex_name, ex_val in cmd.get('examples', {}).items()
        )
        parts.extend(f"common error: {err}" for err in cmd.get('common_errors', []))
        parts.append(cmd.get('version_notes', ''))

        # The snippet shows only the first two syntax variants to stay short.
        snippet = f"/{name}: {description}. Syntax: {'; '.join(syntax[:2])}"
        docs.append({
            'id': f'cmd_{name}',
            'type': 'command',
            'title': f'/{name}',
            'text': '\n'.join(p for p in parts if p),
            'snippet': snippet[:300],
            'source': 'mc-commands/commands.json',
        })
    return docs
def build_server_docs(servers_path: Path) -> list:
    """Build searchable documents from servers.json.

    Produces one document per entry under the top-level ``servers`` key,
    whose ``text`` is the pretty-printed JSON of that entry, plus one extra
    ``version_notes`` reference document when the top-level
    ``version_notes`` mapping is non-empty.

    Args:
        servers_path: Path to the server context file.

    Returns:
        A list of document dicts with the keys ``id``, ``type``, ``title``,
        ``text``, ``snippet`` (at most 300 characters) and ``source``.
        Empty if the file does not exist.
    """
    if not servers_path.exists():
        return []

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    data = json.loads(servers_path.read_text(encoding='utf-8'))
    source = 'server-context/servers.json'

    docs = []
    for srv in data.get('servers', []):
        name = srv.get('name', '')
        snippet = (
            f"Server '{name}': {srv.get('type', '')} {srv.get('version', '')} "
            f"on port {srv.get('game_port', '')}. {srv.get('notes', '')}"
        )
        docs.append({
            'id': f'srv_{name}',
            'type': 'server',
            'title': f'Server: {name}',
            # ensure_ascii=False keeps non-ASCII text readable instead of
            # turning it into \uXXXX escapes that tokenize into junk terms.
            'text': json.dumps(srv, indent=2, ensure_ascii=False),
            'snippet': snippet[:300],
            'source': source,
        })

    # Version notes are indexed as a separate reference document.
    version_notes = data.get('version_notes', {})
    if version_notes:
        text = '\n'.join(f"{k}: {v}" for k, v in version_notes.items())
        docs.append({
            'id': 'version_notes',
            'type': 'reference',
            'title': 'Minecraft 1.21 Version Notes',
            'text': text,
            'snippet': text[:300],
            'source': source,
        })
    return docs
def build_wiki_docs(wiki_dir: Path) -> list:
    """Build searchable documents from wiki chunk files.

    Every ``*.json`` file in *wiki_dir* that contains a JSON array yields one
    document per array element.  A dict element supplies ``text`` and
    ``title`` (the title falls back to the file stem); any other element is
    stringified and used as the text.

    Indexing is best-effort: a file that cannot be read or parsed is skipped
    with a warning on stderr, and a file whose top-level value is not a list
    is ignored.  A chunk with a missing, null or non-string ``text`` no
    longer aborts the rest of its file.

    Args:
        wiki_dir: Directory holding the wiki chunk files.

    Returns:
        A list of document dicts with the keys ``id``, ``type``, ``title``,
        ``text``, ``snippet`` (at most 300 characters) and ``source``,
        ordered by file name and then chunk position.  Empty if the
        directory does not exist.
    """
    import sys  # local import: the module only imports sys in its script section

    docs = []
    if not wiki_dir.exists():
        return docs

    # Sorted so document order does not depend on directory listing order.
    for path in sorted(wiki_dir.glob('*.json')):
        try:
            chunks = json.loads(path.read_text(encoding='utf-8'))
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"warning: skipping wiki file {path.name}: {exc}", file=sys.stderr)
            continue
        if not isinstance(chunks, list):
            continue

        for i, chunk in enumerate(chunks):
            if isinstance(chunk, dict):
                text = chunk.get('text') or ''
                title = chunk.get('title') or path.stem
            else:
                text, title = chunk, path.stem
            # Coerce so slicing here and tokenizing later cannot fail.
            text = text if isinstance(text, str) else str(text)
            title = title if isinstance(title, str) else str(title)
            docs.append({
                'id': f'wiki_{path.stem}_{i}',
                'type': 'wiki',
                'title': title,
                'text': text,
                'snippet': text[:300],
                'source': f'wiki-chunks/{path.name}',
            })
    return docs
def build_index():
    """Build the search index and write it to ``index.json`` next to this script.

    Collects documents from the command reference, the server context and
    the wiki chunks, tokenizes each document's text into a set of unique
    terms, and computes ``idf = log(N / (1 + df))`` per term, floored at 0.

    Every term is stored in the ``idf`` table.  Dropping low-IDF terms would
    make ``search()`` fall back to its default weight of 0.5 for exactly the
    most common terms, ranking them above moderately common ones; flooring
    at 0 likewise keeps a term present in (nearly) every document from
    contributing a negative score.

    Returns:
        The index dict that was written: ``generated_at`` (Unix timestamp),
        ``doc_count``, ``docs`` (id, type, title, snippet, source and the
        document's unique tokens) and ``idf`` (term -> weight rounded to
        four decimals).
    """
    import time  # local import: only needed for the build timestamp

    docs = []
    docs.extend(build_command_docs(ROOT / 'mc-commands' / 'commands.json'))
    docs.extend(build_server_docs(ROOT / 'server-context' / 'servers.json'))
    docs.extend(build_wiki_docs(ROOT / 'wiki-chunks'))

    # Document frequency: in how many documents each term occurs.
    doc_freq = Counter()
    for doc in docs:
        tokens = set(tokenize(doc['text']))
        # Sorted so index.json is reproducible; set iteration order is not.
        doc['_tokens'] = sorted(tokens)
        doc_freq.update(tokens)

    N = len(docs)
    idf = {t: max(math.log(N / (1 + df)), 0.0) for t, df in doc_freq.items()}

    index = {
        'generated_at': time.time(),
        'doc_count': N,
        'docs': [{
            'id': d['id'],
            'type': d['type'],
            'title': d['title'],
            'snippet': d['snippet'],
            'source': d['source'],
            'tokens': d['_tokens'],
        } for d in docs],
        'idf': {t: round(v, 4) for t, v in sorted(idf.items())},
    }

    out_path = ROOT / 'index.json'
    out_path.write_text(json.dumps(index, ensure_ascii=True, indent=2), encoding='utf-8')
    print(f"Index built: {N} documents, {len(idf)} unique terms -> {out_path}")
    return index
def search(query: str, index: dict = None, limit: int = 5) -> list:
    """Rank indexed documents against *query* and return the best matches.

    A document is a candidate when it shares at least one token with the
    query.  Its score is the sum of the IDF weights of the shared tokens
    (0.5 for a token missing from the IDF table) plus 2.0 for every query
    token that also appears in the document title.

    Args:
        query: Free-text query.
        index: Index dict as produced by ``build_index``; when ``None`` the
            index is loaded from ``index.json`` next to this script.
        limit: Maximum number of results to return.

    Returns:
        Up to *limit* dicts, best first, each holding ``score`` (rounded to
        two decimals) merged with the stored document fields.
    """
    if index is None:
        index = json.loads((ROOT / 'index.json').read_text())

    query_terms = set(tokenize(query))
    weights = index.get('idf', {})

    scored = []
    for entry in index.get('docs', []):
        shared = query_terms & set(entry.get('tokens', []))
        if shared:
            relevance = sum(weights.get(term, 0.5) for term in shared)
            # Title matches are a stronger signal than body matches.
            title_hits = query_terms & set(tokenize(entry.get('title', '')))
            relevance += len(title_hits) * 2.0
            scored.append((relevance, entry))

    # Stable sort: documents with equal scores keep their index order.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return [{'score': round(value, 2), **entry} for value, entry in ranked[:limit]]
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == 'search':
        # "search <words...>": query the index that is already on disk.
        for hit in search(' '.join(cli_args[1:])):
            print(f"[{hit['score']:.1f}] {hit['title']}: {hit['snippet'][:100]}")
    else:
        # Default: rebuild the index, then run a few sample queries against it.
        build_index()
        print()
        sample_queries = (
            "how to give enchanted sword",
            "effect speed player",
            "weather thunder storm",
            "execute as vs at position",
            "paper server port rcon",
            "1.21 enchantment syntax",
        )
        for sample in sample_queries:
            top_hits = search(sample)[:3]
            print(f"Query: '{sample}'")
            for hit in top_hits:
                print(f" [{hit['score']:.1f}] {hit['title']}")
            print()