commit 1bc6e5737473cabcdaa8d073cec9f5f0d938bbf6 Author: kianiadee Date: Sun May 17 23:29:55 2026 +0300 Initial: plan, log-proxy app, README, gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3e9e02a --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +.venv/ +venv/ +.env +.env.* +!.env.example + +# n8n local exports / dumps +n8n/exports/ +n8n/credentials.json + +# Editor +.vscode/ +.idea/ +*.swp +.DS_Store + +# Secrets — should never be committed +secrets/ +*.pem +*.key +api-keys.txt diff --git a/260517_docker_n8n_logging.md b/260517_docker_n8n_logging.md new file mode 100644 index 0000000..1813fb3 --- /dev/null +++ b/260517_docker_n8n_logging.md @@ -0,0 +1,209 @@ +# n8n Docker-log Alerting (ntfy + WhatsApp) + +## Context + +The user runs a Coolify host at `twala.rahamafresh.com` with ~50 containers across ~15 logically distinct services (tracksolid telemetry pipeline, Coolify itself, n8n stacks, Supabase, Chatwoot, Evolution API, Dekart, Forgejo, Ente, Garage, etc.). They want **n8n to read Docker logs directly, segment by service, apply per-service thresholds, and notify via ntfy and WhatsApp**. + +Dozzle is explicitly out of scope as an integration source — it stays as a human-facing log viewer. The integration design must not depend on it. + +Why this matters: today, errors in any container are invisible until someone opens Dozzle. Critical issues (panics, OOMs, ingest failures on the tracksolid pipeline) can sit unnoticed for hours. The goal is per-service alerting with severity-aware routing, with thresholds tunable per service so that noisy services don't drown out quiet ones. 
+ +## Decisions (locked with the user) + +| Choice | Decision | +| --- | --- | +| n8n instance | `n8n-o55elukmxacgp1s2xcwktyam` (queue mode: main + worker + task-runners + Postgres + Redis) | +| Docker log access | New **read-only log-proxy** container — n8n never touches `/var/run/docker.sock` | +| Service grouping | Auto-derive from each container's `COOLIFY_RESOURCE_UUID` env var | +| Channels | Self-hosted ntfy (new Coolify service) **+** existing Evolution API (WhatsApp) | +| Git | Workspace `/Users/kianiadee/Downloads/projects/03_dozzle_n8n` is **not** a git repo yet — user creates separate repo later | + +## Architecture + +``` + n8n queue-mode (o55elukmxacgp1s2xcwktyam) + ┌────────────────────────────────────────┐ + Docker Engine log-proxy │ Workflow: Poll & Evaluate (per group) │ + ┌──────────────┐ (new svc) │ 1. GET /logs/?since= │ + │ /var/run/ │ ◄─── RO socket ────► │ 2. regex → severity │ + │ docker.sock │ HTTP API │ 3. threshold + cooldown via │ + └──────────────┘ (internal net) ◄─┤ getWorkflowStaticData() │ + │ 4. emit Alert event │ + │ │ + │ Workflow: Notify (single, parametric) │ + │ severity=critical → ntfy + WhatsApp │ + │ severity=error → ntfy │ + │ severity=warn → ntfy (low prio) │ + └────────────────────┬───────────────────┘ + │ + ┌──────────────────────┴──────────────────────┐ + ▼ ▼ + ntfy (self-hosted via Coolify) Evolution API (api-vc4ok...) + POST / POST /message/sendText/ +``` + +## Components + +### 1. log-proxy (new container) + +**Purpose**: the only thing with `docker.sock` access. Dumb pipe — no alerting logic. + +**Image**: small Python/FastAPI or Node/Fastify app (~50 lines). Build from source in this repo. + +**Mount**: `/var/run/docker.sock` read-only. + +**Network**: joined to the n8n stack's Coolify network so n8n can reach it by hostname; **no Traefik route** (not publicly reachable). 
+ +**API** (no auth needed — internal only; optional bearer token for defence in depth): + +- `GET /services` — `[{ "group": "bo3no...", "name": "tracksolid", "containers": [...] }, ...]` + - Groups containers by `COOLIFY_RESOURCE_UUID` env var. + - Filtered to the allow-list in `/config/groups.yml` — UUIDs not listed are skipped entirely. +- `GET /logs/{group}?since={ts}&until={ts}&limit=2000` + - Calls Docker Engine API `GET /containers/{id}/logs?stdout=1&stderr=1&since=...&until=...&timestamps=1` for every container in the group. + - Returns NDJSON or JSON array of `{ container, ts, stream, line }`. + - `since` defaults to "now − 60s" if absent; `until` defaults to "now". +- `GET /healthz` + +**Why a proxy and not direct socket-into-n8n**: any n8n editor user becomes root-on-host if n8n has the socket. Proxy keeps blast radius small and the API surface inspectable. + +### 2. Self-hosted ntfy + +Deploy via Coolify's one-click marketplace (or as a Docker Compose service). + +- Suggested FQDN (matches your existing pattern): `ntfy.rahamafresh.com` +- Auth: enable `auth-default-access: deny-all`; create per-topic users (one publisher user for n8n, plus client users for each subscriber). +- Topics: one per service group, e.g. `tracksolid-alerts`, `coolify-alerts`, `evolution-api-alerts`. Subscribe on phones via the ntfy mobile app. + +### 3. n8n workflows (in `n8n-o55elukmxacgp1s2xcwktyam`) + +**A. Poll & Evaluate** (one workflow per service group — easiest to tune independently) + +Nodes: + +1. **Schedule Trigger** — every 30s (tunable per group). +2. **Static Data Read** — pull `last_cursor` from `$getWorkflowStaticData('global').cursor`. +3. **HTTP Request** — `GET http://log-proxy:8080/logs/{group}?since={last_cursor}`. +4. **Function (Pattern Match)** — for each line, run severity regexes (from workflow Variables) and emit `{ severity, pattern, container, ts, line, fingerprint }` where `fingerprint = sha256(group:pattern:container)` (used for cooldown). +5. 
**Function (Threshold + Cooldown)**: + - `critical`: emit immediately if not in cooldown. + - `error`: count rolling matches per fingerprint over `window` minutes; emit when threshold crossed. + - `warn`: same but larger window / threshold. + - Cooldown: `staticData.cooldowns[fingerprint] = now + cooldown_minutes`; skip while still hot. +6. **Static Data Write** — update `cursor = max(ts seen)` and `cooldowns`. +7. **Execute Workflow** — call the **Notify** workflow once per emitted Alert. + +**B. Notify** (single parametric workflow; called by each Poll workflow) + +Input: `{ group, severity, pattern, container, ts, line, fingerprint }` + +Nodes: + +1. **Switch** on `severity`. +2. **critical** branch: + - **HTTP Request** → ntfy: `POST https://ntfy.rahamafresh.com/{group}-alerts` with priority=5, tags=`rotating_light`. + - **HTTP Request** → Evolution API: `POST https://{evolution-host}/message/sendText/{instance}` with `{ number, text }`. Credentials via n8n credentials store. +3. **error** branch: ntfy only, priority=4. +4. **warn** branch: ntfy only, priority=3. +5. **Append-row** (Postgres node, optional) → `alerts_audit` table for history. + +### 4. Defaults (tunable per group via workflow Variables) + +| Severity | Default patterns | Threshold | Cooldown | Routing | +| --- | --- | --- | --- | --- | +| critical | `panic`, `FATAL`, `OOMKilled`, `out of memory`, `segmentation fault` | immediate (1 match) | 30 min | ntfy + WhatsApp | +| error | `\bERROR\b`, `Exception`, `Traceback`, `5\d\d ` (HTTP 5xx) | 10 / 5 min | 15 min | ntfy | +| warn | `\bWARN(ING)?\b`, `deadlock`, `timeout` | 50 / 15 min | 30 min | ntfy (low prio) | + +These live as a JSON object in each workflow's Variables, so per-group tuning is one edit. + +### 5. Group naming + +Friendly names mapped from Coolify resource UUID — sourced from `groups.yml` mounted into log-proxy. **`groups.yml` is also the allow-list**: only UUIDs listed here are monitored. 
Anything else the proxy sees on the host is ignored — non-mission-critical apps don't generate noise or burn polling cycles. + +```yaml +bo3nov2ija7g8wn9b1g2paxs: tracksolid +o55elukmxacgp1s2xcwktyam: n8n-prod +usoksgg8o40044g0cw08s8wc: n8n-simple +vc4ok84gw4s0kcgwwg8gooco: evolution-api +ks4sc8k4804swk0c0c4kk44c: chatwoot +foo048cw4skg8kswwsowwo0c: forgejo +u7rj0du43d33ncurig2t6ni1: dekart +e11bva63bu7swlq6zyfckxm3: rustfs +now8k08wcs044scwggos0wos: dozzle +# Coolify core, Supabase, shutterdiplomacy → handled as their own groups +# +# Explicitly NOT monitored (non-mission-critical, per user 2026-05-17): +# dy82njm7qgb5f2m573d1u3rh garage +# r77s24tgmfifmpfqe86xyqsp ente +# vw0wk0cg8gkwgwogsg4k0gsg excalidraw +``` + +Implication on the proxy: `GET /services` returns only allow-listed groups; `GET /logs/` 404s for non-allow-listed UUIDs. To start monitoring a service later, add a single line to `groups.yml` and clone a Poll workflow. + +## Workspace layout + +``` +/Users/kianiadee/Downloads/projects/03_dozzle_n8n/ ← no git yet +├── log-proxy/ +│ ├── Dockerfile +│ ├── app.py (FastAPI: /services, /logs/, /healthz) +│ ├── requirements.txt +│ └── groups.yml (UUID → friendly-name map) +├── ntfy/ +│ └── README.md (Coolify deploy notes + topic / user setup) +├── n8n/ +│ └── workflows/ +│ ├── poll-tracksolid.json +│ ├── poll-coolify.json +│ ├── poll-evolution.json +│ ├── poll-.json ← one per group, derived from a template +│ └── notify.json ← parametric fan-out +├── coolify/ +│ └── log-proxy.compose.yml (for Coolify "Docker Compose" service) +└── README.md (operating runbook: how to add a group, tune thresholds, rotate ntfy creds) +``` + +## Implementation steps (ordered) + +1. **Build log-proxy** locally (`log-proxy/`). Test against the remote docker socket via `docker context` or just deploy and iterate. +2. **Deploy log-proxy via Coolify** as a Docker Compose service. Attach to the same network as `n8n-o55...`. No Traefik route. 
Verify `GET /services` and `GET /logs/` from inside the n8n container (`docker exec n8n-o55... wget -qO- http://log-proxy:8080/services`). +3. **Deploy self-hosted ntfy via Coolify** at `ntfy.rahamafresh.com`. Configure deny-all default and one publisher user. Subscribe phones to test topic. +4. **Build the parametric Notify workflow** in n8n. Add credentials: `ntfy_publisher` (HTTP basic), `evolution_api` (header auth). Test by manually firing each branch. +5. **Build the Poll & Evaluate workflow** for **one group first** (suggest `tracksolid` — highest business value). Validate thresholds with a synthetic log line (`docker exec ingest_events-bo3no... sh -c 'echo FATAL test'` or similar). +6. **Clone the Poll workflow per remaining group**. Tune patterns / thresholds in Variables. +7. **Tune & quiet**: run for 24h, capture false positives, adjust regex / thresholds. +8. **Document** in `README.md` how to add a new group when Coolify spins up a new service. + +## Critical files + +- `log-proxy/app.py` — the only thing with docker.sock access. Treat as security-sensitive; no write endpoints, no shell-out. +- `log-proxy/groups.yml` — single source of truth for UUID → friendly name. Keep in sync as Coolify services are added. +- `n8n/workflows/notify.json` — fan-out logic; any new channel (Slack, email) is added here, not in each poll workflow. +- `n8n/workflows/poll-.json` — per-group thresholds. Variables block at the top is the only thing operators normally edit. +- `coolify/log-proxy.compose.yml` — controls log-proxy deployment + network attachment. Misconfiguring network = n8n can't reach proxy. + +## Reused / existing infrastructure + +- **n8n queue mode** `n8n-o55elukmxacgp1s2xcwktyam` — runs the workflows; its built-in Postgres + Redis cover persistence and queueing. No new DB needed. +- **Evolution API** `api-vc4ok84gw4s0kcgwwg8gooco` — already deployed; we only consume its REST API. 
+- **Coolify Sentinel** `coolify-sentinel` — left untouched; could later feed container-down events into the same Notify workflow if desired. +- **Coolify networks + Traefik** — handle internal service discovery and TLS for ntfy. +- **All Coolify-managed containers already carry `COOLIFY_RESOURCE_UUID`** — confirmed via `docker inspect` on the Dozzle container in the previous session. This is what makes auto-grouping possible without a hand-written container list. + +## Open items to gather at implementation time + +- `ntfy.rahamafresh.com` DNS record (or chosen FQDN). +- Evolution API: instance name, API key, target WhatsApp number(s). +- Confirmation of which Coolify network `n8n-o55...` runs on (read from `docker inspect` at implementation start). +- Optional: bearer token value for log-proxy if defence-in-depth is wanted. + +## Verification + +1. **log-proxy unit checks**: from inside n8n container, `curl http://log-proxy:8080/services` returns all allow-listed groups that have running containers; `curl "http://log-proxy:8080/logs/tracksolid?since=$(( $(date +%s) - 300 ))"` (portable "5 minutes ago"; `date -d` is GNU-only) returns recent lines from all tracksolid containers. +2. **End-to-end critical alert**: on the host, run `docker exec {tracksolid-container} sh -c 'echo "FATAL synthetic test from $(date)" > /proc/1/fd/1'` against an existing tracksolid container, so the line is written to the stdout of the container's main process and lands in its Docker log (a throwaway `docker run --rm alpine ...` container carries no `COOLIFY_RESOURCE_UUID` and would never be collected); within 30s, ntfy topic `tracksolid-alerts` receives a high-priority message AND WhatsApp number receives the same. +3. **Threshold smoke test**: emit 11 lines containing `ERROR` to a single container over 30s; expect exactly one ntfy notification, not eleven. +4. **Cooldown smoke test**: trigger the same critical alert twice within the cooldown window; expect only one notification. +5. **Cursor durability**: restart the n8n worker; confirm cursor in `getWorkflowStaticData` persisted in Postgres and no logs were re-processed or skipped. +6. **Per-group isolation**: deliberately spam errors in one group; confirm other groups' workflows are unaffected (separate workflow = separate static data, separate schedule). +7. 
**Read-only safety**: from inside n8n, attempt `POST http://log-proxy:8080/anything` — expect 404/405. Confirm `docker.sock` is not mounted inside n8n. diff --git a/README.md b/README.md new file mode 100644 index 0000000..af97772 --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +# dozzle_n8n_logging + +n8n-driven Docker log alerting for the Coolify host at `twala.rahamafresh.com`. Critical errors fan out to **ntfy** (self-hosted) and **WhatsApp** (via Evolution API); lower-severity events go to ntfy only. + +Dozzle stays as the human-facing log viewer. This project does **not** integrate with Dozzle — it reads Docker logs independently via a small read-only proxy. + +## Layout + +``` +log-proxy/ FastAPI app, only thing with docker.sock access. /services + /logs/{group} + /healthz. +coolify/ Coolify Docker Compose file for log-proxy. +ntfy/ Deploy notes for self-hosted ntfy. +n8n/ Exported workflow JSON (poll-{group}.json + notify.json). +``` + +## Architecture + +``` +Docker Engine log-proxy (RO sock) n8n queue-mode + socket ────► HTTP /logs/{group} ────► poll → match → threshold → notify + │ + ┌───────────────────┴───────────────────┐ + ▼ ▼ + ntfy.rahamafresh.com Evolution API (WhatsApp) +``` + +Service groups are auto-derived from each container's `COOLIFY_RESOURCE_UUID`. The allow-list (and friendly names) live in `log-proxy/groups.yml`. + +Severity defaults: + +| Severity | Threshold | Cooldown | Channels | +| --- | --- | --- | --- | +| critical | 1 match (immediate) | 30 min | ntfy + WhatsApp | +| error | 10 / 5 min | 15 min | ntfy | +| warn | 50 / 15 min | 30 min | ntfy (low prio) | + +See `260517_docker_n8n_logging.md` for the full design rationale. + +## Adding a new service group + +1. Append a line to `log-proxy/groups.yml`: `{coolify-resource-uuid}: {friendly-name}` +2. Restart the `log-proxy` Coolify service +3. 
In n8n: duplicate `poll-tracksolid` workflow, retarget its `group` Variable, tune severity patterns/thresholds, activate + +## Operating + +- Tune thresholds: edit the `severity` Variables block at the top of each `poll-` workflow. +- Silence during maintenance: deactivate the workflow in n8n (or set a global `silenced=true` flag in the Notify workflow's Variables). +- Rotate ntfy publisher credential: update the credential in n8n; restart workflows. + +## Status + +Bootstrap (2026-05-17): log-proxy code complete; n8n workflows, ntfy deploy, and Coolify deploy still pending. diff --git a/log-proxy/Dockerfile b/log-proxy/Dockerfile new file mode 100644 index 0000000..3e4577f --- /dev/null +++ b/log-proxy/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.12-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY app.py . + +ENV GROUPS_PATH=/config/groups.yml \ + PORT=8080 + +EXPOSE 8080 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD python -c "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8080/healthz', timeout=3).status==200 else 1)" + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080", "--log-level", "info"] diff --git a/log-proxy/app.py b/log-proxy/app.py new file mode 100644 index 0000000..0771e3e --- /dev/null +++ b/log-proxy/app.py @@ -0,0 +1,148 @@ +"""log-proxy: read-only Docker logs API for n8n. + +Only endpoint surface: + GET /healthz liveness + GET /services list allow-listed Coolify service groups + their containers + GET /logs/ pull recent log lines from every container in the group + +No write endpoints. No shell-out. Docker socket is RO-mounted at /var/run/docker.sock. 
+""" + +from __future__ import annotations + +import os +import re +import time +from datetime import datetime +from typing import Iterable + +import docker +import yaml +from fastapi import FastAPI, HTTPException, Query +from fastapi.responses import JSONResponse + +GROUPS_PATH = os.getenv("GROUPS_PATH", "/config/groups.yml") +DOCKER_SOCK = os.getenv("DOCKER_SOCK", "unix:///var/run/docker.sock") +COOLIFY_UUID_ENV = "COOLIFY_RESOURCE_UUID" + +app = FastAPI(title="log-proxy", version="0.1.0") +docker_client = docker.DockerClient(base_url=DOCKER_SOCK, timeout=30) + +# ISO timestamp prefix Docker emits when timestamps=True +TS_NANO_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z) (.*)$") + + +def load_groups() -> dict[str, str]: + """Read the UUID -> friendly-name allow-list. Empty if file missing.""" + try: + with open(GROUPS_PATH) as fh: + data = yaml.safe_load(fh) or {} + except FileNotFoundError: + return {} + return {k: v for k, v in data.items() if isinstance(k, str) and isinstance(v, str)} + + +def container_uuid(container) -> str | None: + env_list = container.attrs.get("Config", {}).get("Env") or [] + for item in env_list: + if item.startswith(f"{COOLIFY_UUID_ENV}="): + return item.split("=", 1)[1] + return None + + +def resolve_group(name_or_uuid: str, allowed: dict[str, str]) -> str | None: + """Accept either the UUID or the friendly name. Return UUID, or None if unknown.""" + if name_or_uuid in allowed: + return name_or_uuid + for uuid, friendly in allowed.items(): + if friendly == name_or_uuid: + return uuid + return None + + +def monitored_containers(allowed: dict[str, str]) -> Iterable[tuple[object, str]]: + """Yield (container, uuid) for every running container whose UUID is allow-listed.""" + for c in docker_client.containers.list(all=False): + uuid = container_uuid(c) + if uuid and uuid in allowed: + yield c, uuid + + +def parse_ts(prefix: str) -> float | None: + """Docker emits nanosecond precision; Python only takes microseconds. 
Truncate.""" + iso = prefix + if iso.endswith("Z"): + iso = iso[:-1] + "+00:00" + iso = re.sub(r"(\.\d{6})\d+", r"\1", iso) + try: + return datetime.fromisoformat(iso).timestamp() + except ValueError: + return None + + +@app.get("/healthz") +def healthz(): + try: + docker_client.ping() + except Exception as exc: + raise HTTPException(status_code=503, detail=f"docker unreachable: {exc}") from exc + return {"ok": True} + + +@app.get("/services") +def services(): + allowed = load_groups() + by_uuid: dict[str, dict] = {} + for c, uuid in monitored_containers(allowed): + entry = by_uuid.setdefault(uuid, {"group": uuid, "name": allowed[uuid], "containers": []}) + entry["containers"].append(c.name) + return JSONResponse([by_uuid[u] for u in sorted(by_uuid)]) + + +@app.get("/logs/{group}") +def logs( + group: str, + since: int | None = Query(None, description="Unix seconds; default now-60"), + until: int | None = Query(None, description="Unix seconds; default now"), + limit: int = Query(2000, ge=1, le=10000), +): + allowed = load_groups() + target_uuid = resolve_group(group, allowed) + if target_uuid is None: + raise HTTPException(status_code=404, detail=f"Unknown group: {group}") + + now = int(time.time()) + since_ts = since if since is not None else now - 60 + until_ts = until if until is not None else now + + out: list[dict] = [] + for c in docker_client.containers.list(all=False): + if container_uuid(c) != target_uuid: + continue + try: + raw = c.logs( + stdout=True, + stderr=True, + since=since_ts, + until=until_ts, + timestamps=True, + tail=limit, + ) + except Exception: + continue + if not raw: + continue + for raw_line in raw.decode("utf-8", errors="replace").splitlines(): + if not raw_line.strip(): + continue + match = TS_NANO_RE.match(raw_line) + if match: + ts_val = parse_ts(match.group(1)) or float(since_ts) + msg = match.group(2) + else: + ts_val = float(since_ts) + msg = raw_line + out.append({"container": c.name, "ts": ts_val, "line": msg}) + + 
out.sort(key=lambda m: m["ts"]) + return JSONResponse(out[:limit]) diff --git a/log-proxy/groups.yml b/log-proxy/groups.yml new file mode 100644 index 0000000..1662c88 --- /dev/null +++ b/log-proxy/groups.yml @@ -0,0 +1,19 @@ +# UUID -> friendly name. This file is also the allow-list: +# containers whose COOLIFY_RESOURCE_UUID is not listed here are ignored entirely. +# +# Add a line, restart log-proxy, clone a Poll workflow in n8n -> new group is live. + +bo3nov2ija7g8wn9b1g2paxs: tracksolid +o55elukmxacgp1s2xcwktyam: n8n-prod +usoksgg8o40044g0cw08s8wc: n8n-simple +vc4ok84gw4s0kcgwwg8gooco: evolution-api +ks4sc8k4804swk0c0c4kk44c: chatwoot +foo048cw4skg8kswwsowwo0c: forgejo +u7rj0du43d33ncurig2t6ni1: dekart +e11bva63bu7swlq6zyfckxm3: rustfs +now8k08wcs044scwggos0wos: dozzle + +# Explicitly NOT monitored (non-mission-critical, per user 2026-05-17): +# dy82njm7qgb5f2m573d1u3rh garage +# r77s24tgmfifmpfqe86xyqsp ente +# vw0wk0cg8gkwgwogsg4k0gsg excalidraw diff --git a/log-proxy/requirements.txt b/log-proxy/requirements.txt new file mode 100644 index 0000000..cf769bf --- /dev/null +++ b/log-proxy/requirements.txt @@ -0,0 +1,4 @@ +fastapi>=0.115,<1 +uvicorn[standard]>=0.32,<1 +docker>=7.1,<8 +PyYAML>=6.0,<7