77efac0283
- knowledge/mc-commands/commands.json: 14 MC commands with JE syntax, args, examples, common errors, 1.21 version notes - knowledge/server-context/servers.json: all 4 servers (mc1, shrink, paper-ai, paper-dev) with full config - knowledge/build_index.py: TF-IDF indexer + search function (19 docs, 725 terms) - All command syntax validated live on dev server via RCON (12/13 passed) - PLAN.md: mark Phase 1.3 complete
220 lines
7.0 KiB
Python
220 lines
7.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
Build a simple TF-IDF-based search index over the knowledge corpus.

Indexes:
  - knowledge/mc-commands/commands.json (command reference)
  - knowledge/server-context/servers.json (server configs)
  - knowledge/wiki-chunks/*.json (wiki content, if present)

Outputs: knowledge/index.json

Usage:
  python3 knowledge/build_index.py                 (rebuild the index, then run sample queries)
  python3 knowledge/build_index.py search QUERY    (search the existing index)
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
import math
|
|
|
|
ROOT = Path(__file__).resolve().parent
|
|
|
|
|
|
def tokenize(text: str) -> list:
    """Split *text* into lowercase search tokens.

    A token is a run of characters from ``a-z 0-9 _ : / .``, which keeps
    namespaced IDs (``minecraft:sharpness``), paths and version numbers
    (``1.21``) in one piece. Leading and trailing ``:``, ``/`` and ``.`` are
    stripped so that ``/give``, ``sword.`` and ``targets:`` index under the
    same term a user would type (``give``, ``sword``, ``targets``).

    Args:
        text: Text to tokenize; ``None`` or an empty string yields ``[]``.

    Returns:
        Tokens in order of appearance (duplicates kept), each at least two
        characters long after stripping.
    """
    tokens = []
    for raw in re.findall(r'[a-z0-9_:/.]{2,}', (text or '').lower()):
        # Punctuation is only meaningful *inside* a token (ids, versions);
        # at the edges it is sentence/syntax noise that prevents matches.
        token = raw.strip(':/.')
        if len(token) >= 2:
            tokens.append(token)
    return tokens
|
|
|
|
|
|
def build_command_docs(commands_path: Path) -> list:
    """Build searchable documents from commands.json.

    The file is expected to hold a JSON list of command objects. The fields
    read from each object are ``command``, ``description``, ``je_syntax``
    (list of strings), ``arguments`` (mapping), ``examples`` (mapping),
    ``common_errors`` (list) and ``version_notes``; all are optional.

    Args:
        commands_path: Location of commands.json.

    Returns:
        One document dict per command with keys ``id``, ``type``, ``title``,
        ``text``, ``snippet`` (at most 300 characters) and ``source``.
        Returns an empty list when the file does not exist.
    """
    if not commands_path.exists():
        return []

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    commands = json.loads(commands_path.read_text(encoding='utf-8'))

    docs = []
    for cmd in commands:
        name = cmd.get('command', '')
        description = cmd.get('description', '')
        syntax = cmd.get('je_syntax', [])

        # Gather every field into one text blob; this is what gets tokenized.
        parts = [f"/{name} command", description, ' '.join(syntax)]

        for arg_name, arg_info in cmd.get('arguments', {}).items():
            # An argument is either a dict carrying a description or a bare value.
            if isinstance(arg_info, dict):
                arg_info = arg_info.get('description', '')
            parts.append(f"{arg_name}: {arg_info}")

        parts.extend(
            f"example {ex_name}: {ex_val}"
            for ex_name, ex_val in cmd.get('examples', {}).items()
        )
        parts.extend(f"common error: {err}" for err in cmd.get('common_errors', []))
        parts.append(cmd.get('version_notes', ''))

        # Empty fields are dropped so they do not leave blank lines.
        text = '\n'.join(p for p in parts if p)
        # The snippet shows only the first two syntax forms.
        snippet = f"/{name}: {description}. Syntax: {'; '.join(syntax[:2])}"

        docs.append({
            'id': f'cmd_{name}',
            'type': 'command',
            'title': f'/{name}',
            'text': text,
            'snippet': snippet[:300],
            'source': 'mc-commands/commands.json',
        })
    return docs
|
|
|
|
|
|
def build_server_docs(servers_path: Path) -> list:
    """Build searchable documents from servers.json.

    The file is expected to hold a JSON object with an optional ``servers``
    list and an optional ``version_notes`` mapping.

    Args:
        servers_path: Location of servers.json.

    Returns:
        One ``server`` document per entry in ``servers`` (its text is the
        pretty-printed JSON of that entry), followed by a single
        ``reference`` document holding the version notes when that mapping
        is non-empty. Returns an empty list when the file does not exist.
    """
    if not servers_path.exists():
        return []

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    data = json.loads(servers_path.read_text(encoding='utf-8'))

    docs = []
    for srv in data.get('servers', []):
        name = srv.get('name', '')
        snippet = (
            f"Server '{name}': {srv.get('type', '')} {srv.get('version', '')} "
            f"on port {srv.get('game_port', '')}. {srv.get('notes', '')}"
        )
        docs.append({
            'id': f'srv_{name}',
            'type': 'server',
            'title': f'Server: {name}',
            # The whole config is indexed, so any key or value is searchable.
            'text': json.dumps(srv, indent=2),
            'snippet': snippet[:300],
            'source': 'server-context/servers.json',
        })

    # Version notes become one separate reference document.
    version_notes = data.get('version_notes', {})
    if version_notes:
        text = '\n'.join(f"{k}: {v}" for k, v in version_notes.items())
        docs.append({
            'id': 'version_notes',
            'type': 'reference',
            'title': 'Minecraft 1.21 Version Notes',
            'text': text,
            'snippet': text[:300],
            'source': 'server-context/servers.json',
        })
    return docs
|
|
|
|
|
|
def build_wiki_docs(wiki_dir: Path) -> list:
    """Build searchable documents from wiki chunk files.

    Every ``*.json`` file directly inside *wiki_dir* is expected to hold a
    JSON list. Each element becomes one document: a dict element supplies
    ``text`` and an optional ``title`` (defaulting to the file stem); any
    other element is indexed as its string form under the file stem.

    Files are processed in sorted order so document order is reproducible.
    A file that cannot be read or parsed is skipped with a warning on stderr
    instead of being dropped silently; a file whose top-level value is not a
    list is ignored.

    Args:
        wiki_dir: Directory containing the wiki chunk files.

    Returns:
        List of document dicts with keys ``id``, ``type``, ``title``,
        ``text``, ``snippet`` (first 300 characters of the text) and
        ``source``. Empty when *wiki_dir* does not exist.
    """
    import sys  # local import: only needed for the warning stream

    docs = []
    if not wiki_dir.exists():
        return docs

    for p in sorted(wiki_dir.glob('*.json')):
        # Keep the try body down to the calls that can actually fail.
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        try:
            chunks = json.loads(p.read_text(encoding='utf-8'))
        except (OSError, ValueError) as exc:
            print(f"Warning: skipping wiki chunk file {p.name}: {exc}", file=sys.stderr)
            continue

        if not isinstance(chunks, list):
            continue

        for i, chunk in enumerate(chunks):
            if isinstance(chunk, dict):
                raw_text = chunk.get('text', '')
                # A null or non-string "text" must not abort the rest of the file.
                text = '' if raw_text is None else str(raw_text)
                title = chunk.get('title', p.stem)
            else:
                text = str(chunk)
                title = p.stem
            docs.append({
                'id': f'wiki_{p.stem}_{i}',
                'type': 'wiki',
                'title': title,
                'text': text,
                'snippet': text[:300],
                'source': f'wiki-chunks/{p.name}',
            })
    return docs
|
|
|
|
|
|
def build_index():
    """Build the search index and write it to ``index.json`` next to this file.

    Collects documents from the command reference, the server configs and
    the wiki chunks, tokenizes each document's text, and computes one IDF
    weight per term. Term frequency inside a document is not used: each
    document stores only its set of distinct tokens.

    The weight is the smoothed IDF ``log((1 + N) / (1 + df)) + 1``. It is
    always positive and strictly decreasing in document frequency, so every
    term is stored and a more common term can never outweigh a rarer one.

    Returns:
        The index dict that was written, with keys ``generated_at`` (Unix
        timestamp), ``doc_count``, ``docs`` (id, type, title, snippet,
        source and tokens per document) and ``idf`` (term -> weight).
    """
    import time

    docs = []
    docs.extend(build_command_docs(ROOT / 'mc-commands' / 'commands.json'))
    docs.extend(build_server_docs(ROOT / 'server-context' / 'servers.json'))
    docs.extend(build_wiki_docs(ROOT / 'wiki-chunks'))

    # Document frequency: how many documents contain each term at least once.
    doc_freq = Counter()
    for doc in docs:
        # Sorted so index.json is byte-stable across runs (set order is not).
        tokens = sorted(set(tokenize(doc['text'])))
        doc['_tokens'] = tokens
        doc_freq.update(tokens)

    N = len(docs)
    idf = {t: math.log((1 + N) / (1 + df)) + 1.0 for t, df in doc_freq.items()}

    index = {
        'generated_at': time.time(),
        'doc_count': N,
        # The full text is deliberately left out; only tokens and snippet are stored.
        'docs': [{
            'id': d['id'],
            'type': d['type'],
            'title': d['title'],
            'snippet': d['snippet'],
            'source': d['source'],
            'tokens': d['_tokens'],
        } for d in docs],
        'idf': {t: round(idf[t], 4) for t in sorted(idf)},
    }

    out_path = ROOT / 'index.json'
    out_path.write_text(json.dumps(index, ensure_ascii=True, indent=2), encoding='utf-8')
    print(f"Index built: {N} documents, {len(idf)} unique terms -> {out_path}")
    return index
|
|
|
|
|
|
def search(query: str, index: dict = None, limit: int = 5) -> list:
    """Rank indexed documents against *query* and return the best matches.

    Args:
        query: Free-text query; it is tokenized the same way as the documents.
        index: A loaded index dict. When ``None``, ``index.json`` next to
            this file is read.
        limit: Maximum number of results to return.

    Returns:
        Up to *limit* dicts, highest score first. Each is the stored document
        entry plus a ``score`` key rounded to two decimals. Documents sharing
        no token with the query are left out.
    """
    if index is None:
        index = json.loads((ROOT / 'index.json').read_text())

    wanted = set(tokenize(query))
    weights = index.get('idf', {})

    scored = []
    for entry in index.get('docs', []):
        shared = wanted & set(entry.get('tokens', []))
        if not shared:
            continue
        # Sum of term weights; a term missing from the weight table counts 0.5.
        points = sum(weights.get(term, 0.5) for term in shared)
        # Every query token that also appears in the title adds a flat 2.0.
        title_hits = wanted & set(tokenize(entry.get('title', '')))
        points += len(title_hits) * 2.0
        scored.append((points, entry))

    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [{'score': round(points, 2), **entry} for points, entry in scored[:limit]]
|
|
|
|
|
|
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if cli_args[:1] == ['search']:
        # Search mode: everything after "search" forms the query.
        for hit in search(' '.join(cli_args[1:])):
            print(f"[{hit['score']:.1f}] {hit['title']}: {hit['snippet'][:100]}")
    else:
        # Default mode: rebuild the index, then print the top three hits
        # for a fixed set of sample queries as a smoke test.
        build_index()
        print()
        sample_queries = (
            "how to give enchanted sword",
            "effect speed player",
            "weather thunder storm",
            "execute as vs at position",
            "paper server port rcon",
            "1.21 enchantment syntax",
        )
        for sample in sample_queries:
            top_hits = search(sample)[:3]
            print(f"Query: '{sample}'")
            for hit in top_hits:
                print(f"  [{hit['score']:.1f}] {hit['title']}")
            print()
|