#!/usr/bin/env python3
"""
Build a simple TF-IDF-based search index over the knowledge corpus.

Indexes:
- knowledge/mc-commands/commands.json (command reference)
- knowledge/server-context/servers.json (server configs)
- knowledge/wiki-chunks/*.json (wiki content, if present)

Outputs: knowledge/index.json

Usage:
    python3 knowledge/build_index.py                  # build, then run sample queries
    python3 knowledge/build_index.py search <query>   # query an existing index

Note: each document stores only its *set* of tokens, so scoring uses
term presence weighted by IDF; term frequency is not recorded.
"""

import json
import math
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Optional

ROOT = Path(__file__).resolve().parent

# Runs of two or more token characters. ':', '/' and '.' are allowed so that
# namespaced ids ("minecraft:stone"), paths and versions ("1.21") stay whole.
_TOKEN_RE = re.compile(r'[a-z0-9_:/.]{2,}')

# Punctuation that is meaningful inside a token but is noise at its edges.
_EDGE_PUNCT = ':/.'

# Queries printed after a build as a quick smoke test of the fresh index.
_SAMPLE_QUERIES = (
    "how to give enchanted sword",
    "effect speed player",
    "weather thunder storm",
    "execute as vs at position",
    "paper server port rcon",
    "1.21 enchantment syntax",
)


def tokenize(text: str) -> list:
    """Split *text* into lowercase tokens.

    Leading/trailing ':', '/' and '.' are stripped from every token so that
    "/give" and "sword." index as "give" and "sword"; otherwise a query word
    could never match a command title or a sentence-final word. Tokens that
    are shorter than two characters after stripping are dropped.

    Args:
        text: Text to tokenize; ``None`` or ``''`` yields an empty list.

    Returns:
        Tokens in order of appearance (duplicates preserved).
    """
    tokens = []
    for raw in _TOKEN_RE.findall((text or '').lower()):
        token = raw.strip(_EDGE_PUNCT)
        if len(token) >= 2:
            tokens.append(token)
    return tokens


def build_command_docs(commands_path: Path) -> list:
    """Build searchable documents from commands.json.

    Args:
        commands_path: Path to a JSON file holding a list of command dicts.

    Returns:
        One document dict per command (keys: id, type, title, text, snippet,
        source), or an empty list when the file does not exist.
    """
    docs = []
    if not commands_path.exists():
        return docs

    commands = json.loads(commands_path.read_text(encoding='utf-8'))
    for cmd in commands:
        name = cmd.get('command', '')
        description = cmd.get('description', '')
        # `or` fallbacks tolerate explicit JSON nulls as well as missing keys.
        syntax = cmd.get('je_syntax') or []

        # Build a text blob from all fields.
        parts = [f"/{name} command", description, ' '.join(syntax)]

        for arg_name, arg_info in (cmd.get('arguments') or {}).items():
            if isinstance(arg_info, dict):
                parts.append(f"{arg_name}: {arg_info.get('description', '')}")
            else:
                parts.append(f"{arg_name}: {arg_info}")

        for ex_name, ex_val in (cmd.get('examples') or {}).items():
            parts.append(f"example {ex_name}: {ex_val}")

        for err in cmd.get('common_errors') or []:
            parts.append(f"common error: {err}")

        parts.append(cmd.get('version_notes', ''))

        text = '\n'.join(p for p in parts if p)
        snippet = f"/{name}: {description}. Syntax: {'; '.join(syntax[:2])}"
        docs.append({
            'id': f'cmd_{name}',
            'type': 'command',
            'title': f'/{name}',
            'text': text,
            'snippet': snippet[:300],
            'source': 'mc-commands/commands.json',
        })
    return docs


def build_server_docs(servers_path: Path) -> list:
    """Build searchable documents from servers.json.

    Emits one document per entry under the top-level ``servers`` key, plus a
    single reference document when a non-empty ``version_notes`` mapping is
    present.

    Args:
        servers_path: Path to the servers JSON file.

    Returns:
        List of document dicts, or an empty list when the file does not exist.
    """
    docs = []
    if not servers_path.exists():
        return docs

    data = json.loads(servers_path.read_text(encoding='utf-8'))
    for srv in data.get('servers', []):
        name = srv.get('name', '')
        # The whole server record is indexed, so any field is searchable.
        text = json.dumps(srv, indent=2)
        snippet = (
            f"Server '{name}': {srv.get('type', '')} {srv.get('version', '')} "
            f"on port {srv.get('game_port', '')}. {srv.get('notes', '')}"
        )
        docs.append({
            'id': f'srv_{name}',
            'type': 'server',
            'title': f'Server: {name}',
            'text': text,
            'snippet': snippet[:300],
            'source': 'server-context/servers.json',
        })

    # Version notes as a separate doc.
    version_notes = data.get('version_notes', {})
    if version_notes:
        text = '\n'.join(f"{k}: {v}" for k, v in version_notes.items())
        docs.append({
            'id': 'version_notes',
            'type': 'reference',
            'title': 'Minecraft 1.21 Version Notes',
            'text': text,
            'snippet': text[:300],
            'source': 'server-context/servers.json',
        })
    return docs


def build_wiki_docs(wiki_dir: Path) -> list:
    """Build searchable documents from wiki chunk files.

    Every ``*.json`` file in *wiki_dir* is expected to contain a list of
    chunks; a chunk is either a dict (``text`` and optional ``title``) or any
    other value, which is stringified. Unreadable or malformed files are
    reported on stderr and skipped so one bad file cannot abort the build.

    Args:
        wiki_dir: Directory containing wiki chunk JSON files.

    Returns:
        List of document dicts, or an empty list when the directory is absent.
    """
    docs = []
    if not wiki_dir.exists():
        return docs

    # Sorted so document order (and the written index) is reproducible.
    for path in sorted(wiki_dir.glob('*.json')):
        try:
            chunks = json.loads(path.read_text(encoding='utf-8'))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"warning: skipping {path}: {exc}", file=sys.stderr)
            continue

        if not isinstance(chunks, list):
            print(f"warning: skipping {path}: expected a JSON list",
                  file=sys.stderr)
            continue

        for i, chunk in enumerate(chunks):
            if isinstance(chunk, dict):
                # Coerce so a null/non-string text cannot break slicing below.
                text = str(chunk.get('text') or '')
                title = chunk.get('title', path.stem)
            else:
                text = str(chunk)
                title = path.stem
            docs.append({
                'id': f'wiki_{path.stem}_{i}',
                'type': 'wiki',
                'title': title,
                'text': text,
                'snippet': text[:300],
                'source': f'wiki-chunks/{path.name}',
            })
    return docs


def build_index(root: Optional[Path] = None) -> dict:
    """Build and save the search index.

    Args:
        root: Knowledge directory to read the corpus from and to write
            ``index.json`` into. Defaults to the directory of this script.

    Returns:
        The index dict that was written: ``generated_at``, ``doc_count``,
        ``docs`` (metadata plus a sorted list of unique tokens per document)
        and ``idf`` (terms whose rounded-before weight exceeds 0.1).
    """
    root = ROOT if root is None else Path(root)

    docs = []
    docs.extend(build_command_docs(root / 'mc-commands' / 'commands.json'))
    docs.extend(build_server_docs(root / 'server-context' / 'servers.json'))
    docs.extend(build_wiki_docs(root / 'wiki-chunks'))

    # Document frequency per term. Tokens are stored sorted so the written
    # index is byte-stable across runs (set order varies with hash seeding).
    doc_freq = Counter()
    doc_tokens = []
    for doc in docs:
        tokens = sorted(set(tokenize(doc['text'])))
        doc_tokens.append(tokens)
        doc_freq.update(tokens)

    n_docs = len(docs)
    idf = {t: math.log(n_docs / (1 + df)) for t, df in doc_freq.items()}

    index = {
        'generated_at': time.time(),
        'doc_count': n_docs,
        'docs': [{
            'id': d['id'],
            'type': d['type'],
            'title': d['title'],
            'snippet': d['snippet'],
            'source': d['source'],
            'tokens': tokens,
        } for d, tokens in zip(docs, doc_tokens)],
        # Near-ubiquitous terms (weight <= 0.1) are pruned to keep the file
        # small; search() substitutes a default weight for them.
        'idf': {t: round(v, 4) for t, v in sorted(idf.items()) if v > 0.1},
    }

    out_path = root / 'index.json'
    out_path.write_text(json.dumps(index, ensure_ascii=True, indent=2),
                        encoding='utf-8')
    print(f"Index built: {n_docs} documents, {len(idf)} unique terms -> {out_path}")
    return index


def search(query: str, index: Optional[dict] = None, limit: int = 5) -> list:
    """Search the index. Returns top matches.

    A document's score is the sum of the IDF weights of the query terms it
    contains, plus 2.0 for every query term that also appears in its title.

    NOTE: terms missing from ``index['idf']`` (pruned as too common at build
    time) are scored 0.5, which can exceed the weight of some rarer terms.
    Kept as-is to preserve existing rankings.

    Args:
        query: Free-text query.
        index: Index dict as returned by build_index(); when ``None`` it is
            loaded from ``index.json`` next to this script.
        limit: Maximum number of results to return.

    Returns:
        Up to *limit* document dicts, best first, each with an added
        ``score`` key rounded to two decimals.

    Raises:
        FileNotFoundError: if *index* is ``None`` and no index file exists.
    """
    if index is None:
        idx_path = ROOT / 'index.json'
        index = json.loads(idx_path.read_text(encoding='utf-8'))

    q_tokens = set(tokenize(query))
    if not q_tokens:
        return []

    idf = index.get('idf', {})
    results = []
    for doc in index.get('docs', []):
        overlap = q_tokens.intersection(doc.get('tokens', []))
        if not overlap:
            continue
        score = sum(idf.get(t, 0.5) for t in overlap)
        # Boost title matches.
        title_overlap = q_tokens.intersection(tokenize(doc.get('title', '')))
        score += len(title_overlap) * 2.0
        results.append((score, doc))

    results.sort(key=lambda pair: pair[0], reverse=True)
    return [{'score': round(s, 2), **d} for s, d in results[:limit]]


def main(argv: Optional[list] = None) -> int:
    """Command-line entry point.

    ``search <words...>`` queries the index on disk; any other invocation
    rebuilds the index and prints the top hits for a few sample queries.

    Args:
        argv: Arguments without the program name; defaults to sys.argv[1:].

    Returns:
        Process exit code (0 on success, 1 when no index is available).
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if args and args[0] == 'search':
        query = ' '.join(args[1:])
        try:
            results = search(query)
        except FileNotFoundError:
            print(f"No index found at {ROOT / 'index.json'}; "
                  "run this script without arguments to build it first.",
                  file=sys.stderr)
            return 1
        for r in results:
            print(f"[{r['score']:.1f}] {r['title']}: {r['snippet'][:100]}")
        return 0

    index = build_index()
    # Run test queries against the index we just built (no re-read from disk).
    print()
    for q in _SAMPLE_QUERIES:
        results = search(q, index=index)
        print(f"Query: '{q}'")
        for r in results[:3]:
            print(f"  [{r['score']:.1f}] {r['title']}")
        print()
    return 0


if __name__ == '__main__':
    sys.exit(main())