From 8d323c9393dfddf9aa2aa38f29d9793e4f37d0ad Mon Sep 17 00:00:00 2001 From: Seth Freiberg Date: Fri, 13 Mar 2026 05:29:34 +0000 Subject: [PATCH] Add SearchXNG and SethSearch project context docs --- .gitignore | 5 ++++ CONTEXT.md | 56 +++++++++++++++++++++++++++++++++++++++++ README.md | 28 +++++++++++++++++++++ api/README.md | 24 ++++++++++++++++++ context/ARCHITECTURE.md | 31 +++++++++++++++++++++++ context/OPERATIONS.md | 36 ++++++++++++++++++++++++++ searchxng/README.md | 19 ++++++++++++++ 7 files changed, 199 insertions(+) create mode 100644 .gitignore create mode 100644 CONTEXT.md create mode 100644 README.md create mode 100644 api/README.md create mode 100644 context/ARCHITECTURE.md create mode 100644 context/OPERATIONS.md create mode 100644 searchxng/README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..54a87fa --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.pyc +__pycache__/ +.DS_Store +*.log +.env diff --git a/CONTEXT.md b/CONTEXT.md new file mode 100644 index 0000000..803bf0c --- /dev/null +++ b/CONTEXT.md @@ -0,0 +1,56 @@ +# SearchXNG Context + +Last updated: 2026-03-13 05:27:14 UTC + +## Homelab placement + +- Cluster: `sethpc` +- SearXNG: + - CT: `119` + - Node: `pve173` + - URL: `https://searxng.sethpc.xyz` and `https://search.sethpc.xyz` + - Config: `/etc/searxng/settings.yml` +- SethSearch API: + - CT: `620` + - Node: `pve173` + - URL: `https://sethsearch.sethpc.xyz` + - Service: `sethsearch.service` + - App path: `/opt/sethsearch/sethsearch.py` + - Config: `/opt/sethsearch/config.json` +- Caddy: + - CT: `600` + - Node: `pve241` + - Config: `/etc/caddy/Caddyfile` + +## Search engines in use + +- `sethsearch` (`shortcut: ss`, category: `general`) + - URL: `https://sethsearch.sethpc.xyz/search?q={query}&source=general&limit=40` + - Weight: `5.0` +- `sethflix` (`shortcut: sfx`, category: `videos`) + - URL: `https://sethsearch.sethpc.xyz/search?q={query}&source=sethflix&limit=40` + - Weight: `5.0` +- `libretranslate` 
(`shortcut: lt`) + - Base URL: `https://translate.sethpc.xyz` + +## SethSearch sources + +- `sites`: Caddy host/domain catalog with tags. +- `gitea`: public repositories. +- `wikijs`: public crawl/fallback page catalog. +- `wordpress`: public pages/posts from `sethfreiberg.com`. +- `emby`: media discovery index (links require account session). +- `freshrss`: article index with stricter matching and lower weight. + +## Matching policy + +- General (`source=general`): includes Emby with stricter matching. +- Sethflix (`source=sethflix`): Emby only with liberal matching. +- FreshRSS: strict term matching and lower source weight. + +## API endpoints + +- Health: `GET /health` +- Search: `GET /search?q=&source=&limit=` +- Stats: `GET /stats` +- Manual sync: `POST /sync` diff --git a/README.md b/README.md new file mode 100644 index 0000000..84e00fd --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +# SearchXNG Project + +Unified workspace for: + +- `searchxng/`: SearXNG settings, engine wiring, and instance branding context. +- `api/`: SethSearch API context, indexing rules, and source integration details. +- `context/`: operational runbooks and architecture docs. + +## Stack + +- SearXNG instance: `searxng.sethpc.xyz` / `search.sethpc.xyz` (CT 119 on `pve173`) +- SethSearch API: `sethsearch.sethpc.xyz` (CT 620 on `pve173`) +- Reverse proxy: Caddy CT 600 (`pve241`) + +## Key behavior + +- `sethsearch` engine (general): searches homelab/site/content sources with strict FreshRSS and strict Emby matching. +- `sethflix` engine (videos): Emby-only search for media discovery. +- LibreTranslate integrated via `translate.sethpc.xyz`. 
+ +## Quick Ops + +- Restart SethSearch API: + - `ssh pve173 "pct exec 620 -- systemctl restart sethsearch"` +- Restart SearXNG: + - `ssh pve173 "pct exec 119 -- systemctl restart searxng"` +- Validate Caddy: + - `ssh pve241 "pct exec 600 -- caddy validate --config /etc/caddy/Caddyfile"` diff --git a/api/README.md b/api/README.md new file mode 100644 index 0000000..cb7f84c --- /dev/null +++ b/api/README.md @@ -0,0 +1,24 @@ +# SethSearch API Layer + +## Live deployment + +- Host CT: 620 (`sethsearch-api`) +- URL: `https://sethsearch.sethpc.xyz` +- App: `/opt/sethsearch/sethsearch.py` +- Config: `/opt/sethsearch/config.json` + +## Source groups + +- `source=general`: sites, gitea, wikijs, wordpress, freshrss, emby (strict) +- `source=sethflix`: emby (liberal) + +## Weighting overview + +- Higher: sites, gitea, wikijs, wordpress, emby +- Lower + strict: freshrss + +## Maintenance + +- Manual re-index: `POST /sync` +- Health check: `GET /health` +- Index summary: `GET /stats` diff --git a/context/ARCHITECTURE.md b/context/ARCHITECTURE.md new file mode 100644 index 0000000..d828c8b --- /dev/null +++ b/context/ARCHITECTURE.md @@ -0,0 +1,31 @@ +# Architecture + +## Request flow + +1. User query enters SearXNG (`search.sethpc.xyz`). +2. SearXNG's `json_engine` calls the `/search` endpoint of the SethSearch API. +3. SethSearch queries local SQLite FTS5 index and returns normalized results. +4. SearXNG merges SethSearch results with results from other engines and renders the result page. + +## Data plane + +- Index DB: `/opt/sethsearch/articles.db` +- Tables: + - `documents` (canonical indexed records) + - `documents_fts` (FTS5 virtual table) +- Source-level scoring and matching occur in SethSearch. + +## Source adapters + +- Caddy snapshot parser: domain discovery and tag generation. +- Gitea adapter: public repo metadata via REST. +- Wiki.js adapter: public crawl with fallback records. +- WordPress adapter: public posts/pages via `/wp-json/wp/v2/...`. 
+- Emby adapter: media index using server API token and deep links. +- FreshRSS adapter: GReader API article ingest. + +## Reliability model + +- SethSearch syncs sources independently. +- If one source fails, others continue and commit. +- Service runs under systemd with restart policy. diff --git a/context/OPERATIONS.md b/context/OPERATIONS.md new file mode 100644 index 0000000..642190d --- /dev/null +++ b/context/OPERATIONS.md @@ -0,0 +1,36 @@ +# Operations Runbook + +## Common commands + +- SethSearch service status: + - `ssh pve173 "pct exec 620 -- systemctl status sethsearch --no-pager"` +- SethSearch logs: + - `ssh pve173 "pct exec 620 -- journalctl -u sethsearch -n 100 --no-pager"` +- SearXNG service status: + - `ssh pve173 "pct exec 119 -- systemctl status searxng --no-pager"` +- SearXNG logs: + - `ssh pve173 "pct exec 119 -- journalctl -u searxng -n 100 --no-pager"` + +## Verify behavior + +- General search endpoint: + - `curl -s "https://sethsearch.sethpc.xyz/search?q=home&source=general&limit=5"` +- Sethflix endpoint: + - `curl -s "https://sethsearch.sethpc.xyz/search?q=always%20sunny&source=sethflix&limit=5"` +- Stats: + - `curl -s "https://sethsearch.sethpc.xyz/stats"` + +## Config touchpoints + +- SethSearch config: `/opt/sethsearch/config.json` +- SethSearch code: `/opt/sethsearch/sethsearch.py` +- SearXNG config: `/etc/searxng/settings.yml` +- Caddy config: `/etc/caddy/Caddyfile` + +## Change protocol + +1. Edit SethSearch code/config. +2. Restart SethSearch and verify `/health` and `/stats`. +3. Edit SearXNG engines (if needed). +4. Restart SearXNG and verify `/config` engine list. +5. Validate top query use-cases. diff --git a/searchxng/README.md b/searchxng/README.md new file mode 100644 index 0000000..1168129 --- /dev/null +++ b/searchxng/README.md @@ -0,0 +1,19 @@ +# SearXNG Layer + +This folder documents SearXNG-side integration with SethSearch. 
+ +## Active custom engines + +- `sethsearch` (general, highest weight) +- `sethflix` (videos, Emby-only) +- `libretranslate` (translate) + +## Live config location + +- `/etc/searxng/settings.yml` in CT 119 on `pve173` + +## Important notes + +- SearXNG blocks plain HTTP in engine requests; use HTTPS endpoints. +- Engine names should be lowercase to avoid startup warnings. +- `use_default_settings: true` lets `settings.yml` contain only the overrides on top of SearXNG's default settings.