infra(db-roles): validated Option A — shared tracksolid_owner for migrators

Discovery (live) corrected the design: webhook_receiver, ingest_worker, and worker all run run_migrations.py (DDL) and write telemetry — worker is the same image as ingest_worker, not a reader. Because they ALTER objects they must own them, so all three connect as the shared non-superuser tracksolid_owner (the role the repo already intends to own these schemas). dashboard_api backend stays a reader (dashboard_app). - app_roles_tracksolid_db.sql rewritten: tracksolid_owner LOGIN + CONNECTION LIMIT 30 + GUCs + USAGE/CREATE; Timescale-aware ownership reassignment (skips table-linked sequences, ALTER MATERIALIZED VIEW for continuous aggregates, leaves reporting.v_trips with reporting_refresher, reassigns functions); dashboard_app read role. - Reassignment validated in a rolled-back transaction on the live DB: reassigns the 31-chunk position_history hypertable + the v_mileage_daily_cagg continuous aggregate, and as tracksolid_owner can ALTER the hypertable and create/drop tables. - Runbook updated: discovery marked done, ownership folded into the apply (safe while apps still run as postgres — superuser bypasses ownership), corrected cutover order. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
infra(db-roles): dedicated non-superuser roles for the six apps on postgres
2026-06-20 00:08:52 +03:00 · 2026-06-19 23:51:52 +03:00
10 changed files with 324 additions and 1412 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,5 +1,3 @@
 # NOTE: uv.lock is intentionally NOT ignored — the Dockerfile copies it for
 # reproducible `uv sync --frozen` builds.
 .git
 .venv
 __pycache__
--- a/.gitignore
+++ b/.gitignore
@ -7,5 +7,4 @@ __pycache__/
 .DS_Store
 .ruff_cache/
 .mypy_cache/
-# uv.lock IS committed on purpose — the Docker build uses `uv sync --frozen` for
+uv.lock
 # reproducible installs, and the Coolify build clones the repo (it must include it).
--- a/17
+++ b/17
@ -11,26 +11,15 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 WORKDIR /app
 # Install ONLY dependencies (flat module — the project itself is not a package).
-# Copy the lockfile and build --frozen so rebuilds are reproducible: without it,
+COPY pyproject.toml ./
-# `uv sync` re-resolves the >= ranges in pyproject.toml and a redeploy could pull a
+RUN uv sync --no-dev --no-install-project
 # newer, behaviour-changed mcp/starlette and break the running server.
 COPY pyproject.toml uv.lock ./
 RUN uv sync --no-dev --no-install-project --frozen
 ENV PATH="/app/.venv/bin:$PATH"
 COPY analytics_mcp.py ./
 # Run as a non-root user (least privilege; nothing here needs root).
 RUN useradd -m -u 10001 app && chown -R app:app /app
 USER app
 EXPOSE 8892
 HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
  CMD python -c "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8892/healthz').status==200 else 1)" || exit 1
-# Single worker: this is a low-traffic read-only proxy for a handful of analysts, and
+CMD ["uvicorn", "analytics_mcp:app", "--host", "0.0.0.0", "--port", "8892", "--workers", "2"]
 # the DB connection budget = workers × MCP_POOL_MAX. One worker (× default pool 8) caps
 # the MCP at 8 backends instead of 16, which matters on a shared 100-connection DB.
 # Scale out by raising MCP_POOL_MAX, not workers, so the budget stays obvious.
 CMD ["uvicorn", "analytics_mcp:app", "--host", "0.0.0.0", "--port", "8892", "--workers", "1"]
--- a/analytics_mcp.py
+++ b/analytics_mcp.py
@ -26,12 +26,9 @@ Env:
 """
 from __future__ import annotations
 import contextvars
 import hmac
 import logging
 import os
 import re
 import threading
 import time
 from contextlib import contextmanager
@ -60,18 +57,11 @@ def _get_logger(name: str) -> logging.Logger:
        )
        root.addHandler(handler)
        root.setLevel(logging.INFO)
        root.propagate = False   # don't double-emit through uvicorn's root handler
    return root.getChild(name)
 log = _get_logger("server")
 # Per-request caller name, set by BearerAuth from the matched token so the tools can
 # attribute each query to an analyst in the logs. A ContextVar (not a tool arg) because
 # FastMCP tools never receive the HTTP request; anyio propagates the context into the
 # worker thread that runs each sync tool. Defaults to "?" if auth ever didn't run.
 _caller_var: contextvars.ContextVar[str] = contextvars.ContextVar("caller", default="?")
 DATABASE_URL = os.environ["DATABASE_URL"]          # analytics_ro DSN (set by deploy)
 MAX_ROWS_CEIL = int(os.getenv("MCP_MAX_ROWS", "10000"))
 # Schemas the introspection helpers (list_tables/describe_table/sample_table) expose.
@ -88,89 +78,26 @@ READABLE_SCHEMAS = tuple(
 # Force read-only + a statement timeout at the connection level (belt + braces;
 # the analytics_ro role already sets these, but a self-contained server is safer
 # in case it is ever pointed at a less-restricted DSN).
-POOL_MAX = int(os.getenv("MCP_POOL_MAX", "8"))
+_pool = psycopg2.pool.ThreadedConnectionPool(
-_POOL_OPTS = "-c default_transaction_read_only=on -c statement_timeout=30000 -c client_encoding=UTF8"
+    1,
-
+    int(os.getenv("MCP_POOL_MAX", "8")),
-
+    DATABASE_URL,
-def _init_pool(retries: int = 5, delay: float = 2.0) -> psycopg2.pool.ThreadedConnectionPool:
+    options="-c default_transaction_read_only=on -c statement_timeout=30000 -c client_encoding=UTF8",
    """Create the pool, retrying so a brief DB outage at deploy time doesn't crash
    the worker into a boot loop. minconn=0 → no eager connect at import (connections
    are opened lazily on first use)."""
    last: Exception | None = None
    for attempt in range(1, retries + 1):
        try:
            return psycopg2.pool.ThreadedConnectionPool(
                0, POOL_MAX, DATABASE_URL, options=_POOL_OPTS
            )
        except psycopg2.OperationalError as exc:
            last = exc
            log.warning("pool init attempt %d/%d failed: %s", attempt, retries, exc)
            if attempt < retries:
                time.sleep(delay)
    assert last is not None
    raise last
 _pool = _init_pool()
 # FastMCP runs each sync tool in an anyio worker thread (default ~40). Gate checkouts
 # behind a bounded semaphore so that >POOL_MAX concurrent queries QUEUE (block in
 # their worker thread) instead of overflowing the pool and raising PoolError — a 500
 # to the analyst. The event loop is never blocked; only surplus worker threads wait.
 _pool_slots = threading.BoundedSemaphore(POOL_MAX)
 # SQLSTATE classes that mean the CONNECTION is gone (vs. a query that merely failed,
 # e.g. a statement_timeout, which is QueryCanceled / 57014 and leaves the conn usable).
 # class 08 = connection exception; 57P01/02/03 = admin/crash shutdown.
 _DISCONNECT_SQLSTATES = frozenset(
    {"08000", "08003", "08006", "08001", "08004", "08007", "57P01", "57P02", "57P03"}
 )
 def _is_disconnect(exc: Exception) -> bool:
    """True only for a genuinely lost connection — so we discard/retry on a dropped
    socket but NOT on a statement_timeout or other in-session query error."""
    if isinstance(exc, psycopg2.InterfaceError):
        return True
    if isinstance(exc, psycopg2.OperationalError):
        code = getattr(exc, "pgcode", None)
        return code is None or code in _DISCONNECT_SQLSTATES   # None = socket-level failure
    return False
@contextmanager
 def _ro_conn():
-    """Read-only connection; the transaction is ALWAYS rolled back (never commits).
+    """Read-only connection; the transaction is ALWAYS rolled back (never commits)."""
    Dead connections (DB restart, network blip, crash) are DISCARDED rather than
    recycled — otherwise a single broken socket poisons the pool and every later
    query handed that connection fails until the container is recreated. A query that
    merely errors (e.g. statement_timeout) leaves the connection healthy, so it is
    rolled back and returned to the pool as normal."""
    _pool_slots.acquire()
    try:
    conn = _pool.getconn()
    except Exception:
        _pool_slots.release()
        raise
    broken = False
    try:
        conn.set_session(readonly=True, autocommit=False)
        yield conn
    except Exception as exc:
        broken = _is_disconnect(exc)
        raise
    finally:
        try:
            if broken or conn.closed:
                _pool.putconn(conn, close=True)
            else:
        try:
            conn.rollback()
                    _pool.putconn(conn)
                except (psycopg2.OperationalError, psycopg2.InterfaceError):
                    _pool.putconn(conn, close=True)
        finally:
-            _pool_slots.release()
+            _pool.putconn(conn)
 def _rows(cur) -> list[dict]:
@ -208,13 +135,6 @@ def _strip_comments(sql: str) -> str:
    return sql.strip()
 def _strip_literals(sql: str) -> str:
    """Blank out the contents of single-quoted string literals so the keyword guard
    does not fire on DATA (e.g. WHERE summary ILIKE '%please delete%'). '' escapes
    are handled so we don't mis-terminate a literal mid-string."""
    return re.sub(r"'(?:[^']|'')*'", "''", sql)
 def _guard(sql: str) -> str:
    """Validate a single read-only statement; return the cleaned statement."""
    stripped = _strip_comments(sql)
@ -226,11 +146,7 @@ def _guard(sql: str) -> str:
    stmt = parts[0].strip()
    if not re.match(r"^(select|with)\b", stmt, re.IGNORECASE):
        raise ValueError("Only SELECT / WITH queries are allowed.")
-    # Scan with string literals blanked: the blocklist's real job is to reject
+    if _FORBIDDEN.search(stmt):
    # data-modifying CTEs (WITH x AS (DELETE ... RETURNING ...)), not to trip over a
    # keyword that merely appears inside a filter value. (Writes are impossible anyway
    # via the analytics_ro GRANTs + the read-only, rolled-back transaction.)
    if _FORBIDDEN.search(_strip_literals(stmt)):
        raise ValueError("Query contains a forbidden (write/DDL) keyword.")
    return stmt
@ -263,44 +179,28 @@ mcp = FastMCP("fireside-analytics", stateless_http=True, transport_security=_tra
 def query(sql: str, max_rows: int = 1000) -> dict:
    """Run a read-only SELECT/WITH query against the fleet database.
-    Readable schemas are the analytics_ro grant surface (reporting, tracksolid,
+    Only the reporting.* and tracksolid.* schemas are readable. Single statement
-    tickets, fuel by default). Single statement only; write/DDL is rejected. Returns
+    only; write/DDL is rejected. Returns up to `max_rows` rows (default 1000, hard
-    up to `max_rows` rows (default 1000, hard cap 10000). A LIMIT is auto-applied when
+    cap 10000). A LIMIT is auto-applied when absent. Result: {row_count, truncated, rows}.
    absent. Result: {row_count, truncated, rows}.
    """
    stmt = _guard(sql)
    cap = max(1, min(int(max_rows), MAX_ROWS_CEIL))
    if not re.search(r"\blimit\b", stmt, re.IGNORECASE):
        stmt = f"{stmt}\nLIMIT {cap + 1}"   # +1 row to detect truncation
    t0 = time.monotonic()
    # Retry once on a dead connection (only): _ro_conn() discards broken sockets, so a
    # second attempt gets a fresh one — making a recycled-but-stale pool connection
    # invisible to the analyst. A real query error (statement_timeout, bad SQL) is NOT
    # retried; it surfaces immediately.
    rows = []
    for attempt in (1, 2):
        try:
    with _ro_conn() as conn, conn.cursor() as cur:
        cur.execute(stmt)
        rows = _rows(cur)
            break
        except Exception as exc:
            if attempt == 2 or not _is_disconnect(exc):
                raise
            log.warning("stale DB connection on attempt %d — retrying once", attempt)
    truncated = len(rows) > cap
    rows = rows[:cap]
    dur_ms = int((time.monotonic() - t0) * 1000)
-    log.info(
+    log.info("query rows=%d trunc=%s %dms :: %s", len(rows), truncated, dur_ms, sql[:200])
        "query caller=%s rows=%d trunc=%s %dms :: %s",
        _caller_var.get(), len(rows), truncated, dur_ms, sql[:200],
    )
    return {"row_count": len(rows), "truncated": truncated, "rows": rows}
@mcp.tool()
 def list_schemas() -> list[dict]:
-    """List the readable schemas (reporting, tracksolid, tickets, fuel) with object counts."""
+    """List the readable schemas (reporting, tracksolid) with their object counts."""
    with _ro_conn() as conn, conn.cursor() as cur:
        cur.execute(
            "SELECT table_schema AS schema, count(*) AS objects "
@ -313,7 +213,7 @@ def list_schemas() -> list[dict]:
@mcp.tool()
 def list_tables(schema: str) -> list[dict]:
-    """List tables + views in a schema (must be one of the readable schemas)."""
+    """List tables + views in a schema (must be reporting or tracksolid)."""
    if schema not in READABLE_SCHEMAS:
        raise ValueError(f"schema must be one of {READABLE_SCHEMAS}")
    with _ro_conn() as conn, conn.cursor() as cur:
@ -380,35 +280,21 @@ _TOKENS = {
 }
 def _lookup_token(token: str) -> str | None:
    """Constant-time token match: compare against every known token so the response
    time does not reveal how far a guessed prefix matched. Cheap for a handful of
    per-analyst tokens."""
    if not token:
        return None
    match = None
    for known, name in _TOKENS.items():
        if hmac.compare_digest(token, known):
            match = name
    return match
 class BearerAuth(BaseHTTPMiddleware):
    async def dispatch(self, request, call_next):
        if request.url.path == "/healthz":
            return await call_next(request)
        auth = request.headers.get("authorization", "")
        token = auth[7:] if auth.lower().startswith("bearer ") else ""
-        caller = _lookup_token(token)
+        caller = _TOKENS.get(token)
        if caller is None:
            return JSONResponse({"error": "unauthorized"}, status_code=401)
        request.state.caller = caller
        _caller_var.set(caller)   # so the tools can attribute the query in the logs
        return await call_next(request)
 async def healthz(_request):
-    return JSONResponse({"ok": True})
+    return JSONResponse({"ok": True, "tokens": len(_TOKENS)})
 app = mcp.streamable_http_app()
--- a/deploy.sh
+++ b/deploy.sh
@ -24,20 +24,6 @@ set -euo pipefail
 NAME=analytics_mcp
 PORT=8892
 HOST_DOMAIN="${HOST_DOMAIN:-fleetmcp.fivetitude.com}"   # prod: fleetmcp.rahamafresh.com
 # Comma-separated list of every domain this service answers on (defaults to
 # HOST_DOMAIN). All are folded into ONE Traefik router rule so a single cert
 # covers them and connectors on either domain keep working.
 HOST_DOMAINS="${HOST_DOMAINS:-$HOST_DOMAIN}"
 BT='`'
 RULE=""
 IFS=',' read -ra _DOMS <<< "$HOST_DOMAINS"
 for _d in "${_DOMS[@]}"; do
  _d="${_d// /}"
  if [ -n "$_d" ]; then
    seg="Host(${BT}${_d}${BT})"
    if [ -z "$RULE" ]; then RULE="$seg"; else RULE="$RULE || $seg"; fi
  fi
 done
 IMAGE="fleetanalytics-mcp:latest"
 ENV_FILE="$(pwd)/.deploy.env"
@ -67,15 +53,9 @@ RO_PW=$(cat "${ANALYTICS_RO_PW_FILE:-$HOME/.analytics_ro.pw}" 2>/dev/null || tru
 HOSTPART="${SRC_DB_URL#*@}"                       # host:port/dbname[?params]
 RO_DB_URL="postgresql://analytics_ro:${RO_PW}@${HOSTPART}"
-# Build the image from this repo (SKIP_BUILD=1 reuses the existing image for a
+# Build the image from this repo.
-# labels/env-only change — no new code is pulled in).
+echo "Building $IMAGE ..."
-if [ "${SKIP_BUILD:-0}" = "1" ]; then
+docker build -t "$IMAGE" .
  echo "SKIP_BUILD=1 — reusing existing $IMAGE (no rebuild)."
  docker image inspect "$IMAGE" >/dev/null 2>&1 || { echo "ERROR: $IMAGE not present"; exit 1; }
 else
  echo "Building $IMAGE ..."
  docker build -t "$IMAGE" .
 fi
 # Minimal env (read-only DSN + auth only — no Tracksolid ingestion secrets).
 { echo "DATABASE_URL=${RO_DB_URL}"; echo "MCP_AUTH_TOKENS=${MCP_AUTH_TOKENS}"; } > "$ENV_FILE"
@ -85,18 +65,14 @@ docker rm -f "$NAME" 2>/dev/null || true
 docker run -d --name "$NAME" --restart unless-stopped \
  --network "$APPNET" \
  --env-file "$ENV_FILE" \
  --log-opt max-size=10m --log-opt max-file=5 \
  --label 'traefik.enable=true' \
  --label 'traefik.docker.network=coolify' \
  --label 'traefik.http.middlewares.redirect-to-https.redirectscheme.scheme=https' \
  --label 'traefik.http.middlewares.fleetmcp-ratelimit.ratelimit.average=30' \
  --label 'traefik.http.middlewares.fleetmcp-ratelimit.ratelimit.burst=60' \
  --label "traefik.http.routers.http-0-fleetmcp.entryPoints=http" \
  --label "traefik.http.routers.http-0-fleetmcp.middlewares=redirect-to-https" \
-  --label "traefik.http.routers.http-0-fleetmcp.rule=${RULE}" \
+  --label "traefik.http.routers.http-0-fleetmcp.rule=Host(\`${HOST_DOMAIN}\`)" \
  --label "traefik.http.routers.https-0-fleetmcp.entryPoints=https" \
-  --label "traefik.http.routers.https-0-fleetmcp.rule=${RULE}" \
+  --label "traefik.http.routers.https-0-fleetmcp.rule=Host(\`${HOST_DOMAIN}\`)" \
  --label "traefik.http.routers.https-0-fleetmcp.middlewares=fleetmcp-ratelimit" \
  --label "traefik.http.routers.https-0-fleetmcp.tls=true" \
  --label "traefik.http.routers.https-0-fleetmcp.tls.certresolver=letsencrypt" \
  --label "traefik.http.services.fleetmcp.loadbalancer.server.port=${PORT}" \
--- a/scripts/MIGRATE_APPS_OFF_SUPERUSER.md
+++ b/scripts/MIGRATE_APPS_OFF_SUPERUSER.md
@ -0,0 +1,128 @@
 # Migrating the stack apps off the `postgres` superuser
 ## Why
 The Postgres server (`timescale_db`) has `max_connections = 100`. Six service
 connections run as the **`postgres` superuser**, each with a persistent pool that
 sits idle for hours. That's the root of the intermittent `FATAL: sorry, too many
 clients already`:
 - superuser sessions can use the **`superuser_reserved_connections`** slots, so the
  server can fill completely with no admin headroom;
 - you can't put a per-role **`CONNECTION LIMIT`** or enforce timeouts on them
  effectively;
 - and it's a standing least-privilege risk (any of these apps can read/write/DROP
  anything in any database).
 Giving each app a dedicated **NOSUPERUSER** role with a hard `CONNECTION LIMIT` fixes
 all three.
 ## The six connections (confirmed live 2026-06-20)
 | Service | Database | Current user | New role | Conn limit | Notes |
 |---|---|---|---|---|---|
 | `webhook_receiver` | tracksolid_db | postgres | **`tracksolid_owner`** | 30 (shared) | runs migrations |
 | `ingest_worker` | tracksolid_db | postgres | **`tracksolid_owner`** | (shared) | runs migrations |
 | `worker` | tracksolid_db | postgres | **`tracksolid_owner`** | (shared) | = ingest_worker image; runs migrations |
 | `dashboard_api` (prod backend) | tracksolid_db | postgres | `dashboard_app` (read) | 8 | reader |
 | `gateway` | **fleet_platform** | postgres | `gateway_app` | 15 | migration TBD |
 | `cron` | **fleet_platform** | postgres | `cron_app` | 5 | migration TBD |
 > **Migrators share `tracksolid_owner`.** `webhook_receiver`, `ingest_worker`, and
 > `worker` all run `run_migrations.py` (DDL) and write telemetry. Because they ALTER
 > objects, they must OWN them — so they connect as the single non-superuser
 > `tracksolid_owner` (the role the repo already intends to own these schemas). One
 > shared role = correct ownership, no app code change, one bounded connection cap.
 > `gateway`/`cron` use a **different database** (`fleet_platform`) on the same server —
 > still counted against the 100-slot ceiling; confirm whether they migrate before
 > cutover (apply the same owner pattern if so).
 ### Connection budget (keep the sum < ~95, leaving 3 reserved + admin headroom)
 ```
 tracksolid_owner 30 (shared by 3 migrators) + dashboard_app 8         = 38  (tracksolid_db)
 gateway_app 15 + cron_app 5                                            = 20  (fleet_platform)
 analytics_ro ~8 + dashboard_ro ~12 + grafana_ro ~5 + reporting_refresher ~3 = ~28  (existing)
                                                                  TOTAL ≈ 86  ✅
 ```
 Tune the `CONNECTION LIMIT`s to your real pool sizes; the point is the sum is now
 **bounded and visible**, not open-ended superuser pools.
 ## Step 1 — Discovery (DONE 2026-06-20)
 Confirmed live: `webhook_receiver`, `ingest_worker`, `worker` all start with
 `python run_migrations.py && …` → they run **DDL** and write telemetry (`worker` is
 the same image as `ingest_worker`). Writes span `tracksolid`, `reporting`, `tickets`.
 `dashboard_api` (prod backend) reads. `gateway`/`cron` are on `fleet_platform` and
 write `state`; their migration behaviour is **not yet confirmed** (opaque
 `entrypoint.sh`) — verify before cutover with:
 ```sql
 -- re-run after a deploy to see writes; or set log_statement='ddl' on fleet_platform.
 SELECT schemaname, sum(n_tup_ins+n_tup_upd+n_tup_del) FROM pg_stat_user_tables GROUP BY 1;
 ```
 ## Step 2 — Create roles + reassign ownership (no app impact yet)
 The ownership reassignment in `app_roles_tracksolid_db.sql` is **safe to run while the
 apps still connect as `postgres`** — superuser bypasses ownership, so nothing breaks
 until you flip a `DATABASE_URL`. It is Timescale-aware (skips linked sequences, uses
 `ALTER MATERIALIZED VIEW` for continuous aggregates, leaves `reporting.v_trips` with
 `reporting_refresher`) and idempotent — validated in a rolled-back transaction against
 the live DB.
 ```bash
 for r in tracksolid_owner dashboard_app gateway_app cron_app; do
  [ -s ~/.$r.pw ] || ( umask 077; openssl rand -hex 24 > ~/.$r.pw )
 done
 DB=$(docker ps --filter name=timescale_db --format '{{.Names}}' | head -1)
 # tracksolid_db: owner/migrator role + ownership reassignment + dashboard reader
 docker exec -i "$DB" psql -U postgres -d tracksolid_db -v ON_ERROR_STOP=1 \
  -v owner_pw="$(cat ~/.tracksolid_owner.pw)" -v dash_pw="$(cat ~/.dashboard_app.pw)" \
  < scripts/app_roles_tracksolid_db.sql
 # fleet_platform: gateway/cron roles (see that file's notes re: migrations)
 docker exec -i "$DB" psql -U postgres -d fleet_platform -v ON_ERROR_STOP=1 \
  -v gateway_pw="$(cat ~/.gateway_app.pw)" -v cron_pw="$(cat ~/.cron_app.pw)" \
  < scripts/app_roles_fleet_platform.sql
 ```
 > If `gateway`/`cron` run migrations, they need the same owner treatment on
 > `fleet_platform` (reassign its schemas to a `fleet_platform_owner` login role) — do
 > that before cutting them over. Until confirmed, leave them on `postgres`.
 ## Step 3 — Cut over one app at a time
 Change each service's `DATABASE_URL` user/password (same host/port/dbname), redeploy
 **just that one**, watch its logs for `permission denied` and the DB for the count:
 ```
 # the three migrators → the shared owner role:
 postgresql://tracksolid_owner:<owner_pw>@timescale_db:5432/tracksolid_db
 # the dashboard backend → the reader:
 postgresql://dashboard_app:<dash_pw>@timescale_db:5432/tracksolid_db
 ```
 ```bash
 docker exec -i "$DB" psql -U postgres -d tracksolid_db -c \
  "SELECT usename, count(*) FROM pg_stat_activity GROUP BY 1 ORDER BY 2 DESC;"
 ```
 **Order:** `dashboard_api` (reader, lowest risk) first → confirm → then the migrators
 one at a time (`ingest_worker`, then `worker`, then `webhook_receiver`), watching that
 `run_migrations.py` succeeds and ingestion resumes after each.
 ## Rollback (instant)
 Each app's only change is its `DATABASE_URL`. If anything misbehaves, set it back to
 the `postgres:…` DSN and redeploy that one app — no DB change required. The roles are
 additive; to remove one entirely: `DROP ROLE <app>;` (after nothing uses it).
 ## After all six are migrated
 - Add `idle_session_timeout` is already covered by the per-role GUCs above.
 - Consider **rotating the `postgres` superuser password** and restricting it to admin
  use only (it should no longer appear in any app's env).
 - Re-check the budget: `SELECT usename, count(*) FROM pg_stat_activity GROUP BY 1;`
  — no app should exceed its `CONNECTION LIMIT`, and the total should sit comfortably
  under 100. This is also when PgBouncer (separate PR) becomes optional rather than
  necessary.
--- a/scripts/analytics_ro_role.sql
+++ b/scripts/analytics_ro_role.sql
@ -62,9 +62,3 @@ ALTER DEFAULT PRIVILEGES FOR ROLE postgres         IN SCHEMA fuel       GRANT EX
 ALTER ROLE analytics_ro SET default_transaction_read_only = on;
 ALTER ROLE analytics_ro SET statement_timeout = '30s';
 ALTER ROLE analytics_ro SET idle_in_transaction_session_timeout = '60s';
 -- Cap idle POOLED connections too (these sit idle but NOT in a transaction, so the
 -- idle_in_transaction timeout never reaps them). On a shared 100-connection DB this
 -- returns slots the MCP isn't using. Safe with the server's dead-connection handling:
 -- a reaped pooled connection is discarded + transparently retried, not surfaced as an
 -- error. (Requires PostgreSQL 14+.)
 ALTER ROLE analytics_ro SET idle_session_timeout = '5min';
--- a/scripts/app_roles_fleet_platform.sql
+++ b/scripts/app_roles_fleet_platform.sql
@ -0,0 +1,71 @@
 -- app_roles_fleet_platform.sql — dedicated NON-SUPERUSER login roles for the apps
 -- that connect to the fleet_platform database as the `postgres` SUPERUSER.
 -- ─────────────────────────────────────────────────────────────────────────────
 -- Sibling of app_roles_tracksolid_db.sql, for the OTHER database on the same server.
 -- gateway + cron (the fleet_platform Coolify app) connect here as postgres. Same
 -- rationale: least privilege + a hard per-role CONNECTION LIMIT so they can't
 -- exhaust the server-wide 100-connection ceiling.
 --
 -- Schemas in fleet_platform: auth, domain, events, geo, ops, serve, slo, state
 -- (all owned by postgres). gateway (the API) and cron (scheduled jobs) almost
 -- certainly READ+WRITE app state across these, so they get DML; widen/narrow per
 -- the discovery step in MIGRATE_APPS_OFF_SUPERUSER.md. As with the sibling file,
 -- this does NOT change object ownership, so it does not grant DDL on existing
 -- (postgres-owned) objects — see step 3 of the runbook if these apps run migrations.
 --
 -- Run as the postgres SUPERUSER, on the fleet_platform database:
 --   docker exec -i <timescale_db> psql -U postgres -d fleet_platform -v ON_ERROR_STOP=1 \
 --     -v gateway_pw="$(cat ~/.gateway_app.pw)" \
 --     -v cron_pw="$(cat ~/.cron_app.pw)" \
 --     < scripts/app_roles_fleet_platform.sql
 \set ON_ERROR_STOP on
 -- ── 1. Capability group (read + write across the app schemas) ───────────────────
 DO $$ BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='fp_app_rw') THEN CREATE ROLE fp_app_rw NOLOGIN; END IF;
 END $$;
 DO $grants$
 DECLARE s text;
 BEGIN
  FOREACH s IN ARRAY ARRAY['auth','domain','events','geo','ops','serve','slo','state'] LOOP
    EXECUTE format('GRANT USAGE ON SCHEMA %I TO fp_app_rw', s);
    EXECUTE format('GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA %I TO fp_app_rw', s);
    EXECUTE format('GRANT USAGE, SELECT, UPDATE ON ALL SEQUENCES IN SCHEMA %I TO fp_app_rw', s);
    EXECUTE format('GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA %I TO fp_app_rw', s);
    EXECUTE format('ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA %I GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO fp_app_rw', s);
    EXECUTE format('ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA %I GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO fp_app_rw', s);
    EXECUTE format('ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA %I GRANT EXECUTE ON FUNCTIONS TO fp_app_rw', s);
  END LOOP;
 END $grants$;
 -- ── 2. Per-app LOGIN roles ──────────────────────────────────────────────────────
 -- gateway — the request-facing API (latency-sensitive: short statement_timeout).
 DO $$ BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='gateway_app') THEN
    CREATE ROLE gateway_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
  END IF; END $$;
 ALTER ROLE gateway_app WITH LOGIN PASSWORD :'gateway_pw' CONNECTION LIMIT 15;
 GRANT CONNECT ON DATABASE fleet_platform TO gateway_app;
 GRANT fp_app_rw TO gateway_app;
 ALTER ROLE gateway_app SET statement_timeout = '15s';
 ALTER ROLE gateway_app SET idle_in_transaction_session_timeout = '30s';
 ALTER ROLE gateway_app SET idle_session_timeout = '5min';
 ALTER ROLE gateway_app SET lock_timeout = '3s';
 -- cron — scheduled/background jobs (longer queries tolerated).
 DO $$ BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='cron_app') THEN
    CREATE ROLE cron_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
  END IF; END $$;
 ALTER ROLE cron_app WITH LOGIN PASSWORD :'cron_pw' CONNECTION LIMIT 5;
 GRANT CONNECT ON DATABASE fleet_platform TO cron_app;
 GRANT fp_app_rw TO cron_app;
 ALTER ROLE cron_app SET statement_timeout = '120s';
 ALTER ROLE cron_app SET idle_in_transaction_session_timeout = '120s';
 ALTER ROLE cron_app SET idle_session_timeout = '10min';
 ALTER ROLE cron_app SET lock_timeout = '5s';
 -- ── 3. Verify ───────────────────────────────────────────────────────────────────
 -- \du+
--- a/scripts/app_roles_tracksolid_db.sql
+++ b/scripts/app_roles_tracksolid_db.sql
@ -0,0 +1,95 @@
 -- app_roles_tracksolid_db.sql — get the tracksolid_db apps off the postgres SUPERUSER.
 -- ─────────────────────────────────────────────────────────────────────────────
 -- DESIGN (validated against the live DB, 2026-06-20):
 --   * webhook_receiver, ingest_worker, worker each run `run_migrations.py` (DDL) and
 --     write telemetry. `worker` is a second copy of the ingest_worker image. Because
 --     they run migrations, they need to OWN the objects they ALTER. They therefore
 --     connect as the shared, NON-SUPERUSER **tracksolid_owner** (the role the repo
 --     already intends to own these schemas — see analytics_ro_role.sql default privs).
 --   * the prod dashboard_api backend only reads → its own read role `dashboard_app`
 --     (or reuse the existing dashboard_ro).
 --
 -- This file is idempotent. Section 2 (ownership reassignment) is Timescale-aware:
 -- it skips table-linked sequences, uses ALTER MATERIALIZED VIEW for continuous
 -- aggregates, and leaves reporting.v_trips with reporting_refresher. Reassigning
 -- while the apps still run as postgres is SAFE — superuser bypasses ownership, so
 -- nothing breaks until you flip each app's DATABASE_URL (see the runbook).
 --
 -- Run as the postgres SUPERUSER, on tracksolid_db:
 --   docker exec -i <timescale_db> psql -U postgres -d tracksolid_db -v ON_ERROR_STOP=1 \
 --     -v owner_pw="$(cat ~/.tracksolid_owner.pw)" \
 --     -v dash_pw="$(cat ~/.dashboard_app.pw)" \
 --     < scripts/app_roles_tracksolid_db.sql
 \set ON_ERROR_STOP on
 -- ── 1. tracksolid_owner: the shared owner/migrator login for the ingestion apps ──
 DO $$ BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='tracksolid_owner') THEN
    CREATE ROLE tracksolid_owner LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
  END IF; END $$;
 -- LOGIN + password + a HARD connection cap (the real budget control). No
 -- statement_timeout: migrations (e.g. CREATE INDEX on a hypertable) can run long.
 ALTER ROLE tracksolid_owner WITH LOGIN PASSWORD :'owner_pw' CONNECTION LIMIT 30;
 ALTER ROLE tracksolid_owner SET idle_in_transaction_session_timeout = '5min';
 ALTER ROLE tracksolid_owner SET idle_session_timeout = '10min';
 ALTER ROLE tracksolid_owner SET lock_timeout = '10s';
 GRANT CONNECT ON DATABASE tracksolid_db TO tracksolid_owner;
 GRANT USAGE, CREATE ON SCHEMA tracksolid, reporting, tickets, fuel TO tracksolid_owner;
 -- ── 2. Reassign the app objects to tracksolid_owner (Timescale-aware, idempotent) ─
 DO $reassign$
 DECLARE r record; k text;
 BEGIN
  FOR r IN
    SELECT n.nspname, c.relname, c.relkind,
      EXISTS (SELECT 1 FROM timescaledb_information.continuous_aggregates ca
              WHERE ca.view_schema=n.nspname AND ca.view_name=c.relname) AS is_cagg
    FROM pg_class c JOIN pg_namespace n ON n.oid=c.relnamespace
    WHERE n.nspname IN ('tracksolid','reporting','tickets','fuel')
      AND c.relkind IN ('r','p','v','m','S')
      AND pg_get_userbyid(c.relowner) <> 'tracksolid_owner'                 -- idempotent
      AND NOT (n.nspname='reporting' AND c.relname='v_trips')               -- keep with refresher
      AND NOT (c.relkind='S' AND EXISTS (                                   -- skip linked seqs
        SELECT 1 FROM pg_depend d WHERE d.objid=c.oid AND d.deptype IN ('a','i')))
  LOOP
    k := CASE WHEN r.is_cagg OR r.relkind='m' THEN 'MATERIALIZED VIEW'
             WHEN r.relkind='v' THEN 'VIEW' WHEN r.relkind='S' THEN 'SEQUENCE' ELSE 'TABLE' END;
    EXECUTE format('ALTER %s %I.%I OWNER TO tracksolid_owner', k, r.nspname, r.relname);
  END LOOP;
 END $reassign$;
 DO $fns$
 DECLARE r record;
 BEGIN
  FOR r IN SELECT p.oid::regprocedure AS sig
    FROM pg_proc p JOIN pg_namespace n ON n.oid=p.pronamespace
    WHERE n.nspname IN ('tracksolid','reporting','tickets','fuel')
      AND pg_get_userbyid(p.proowner) <> 'tracksolid_owner'
  LOOP EXECUTE format('ALTER FUNCTION %s OWNER TO tracksolid_owner', r.sig); END LOOP;
 END $fns$;
 -- ── 3. dashboard_app: read-only role for the prod dashboard_api backend ──────────
 -- (If that backend turns out to also WRITE app state, widen via a write group like
 -- the fleet_platform file; start read-only.)
 DO $$ BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='dashboard_app') THEN
    CREATE ROLE dashboard_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
  END IF; END $$;
 ALTER ROLE dashboard_app WITH LOGIN PASSWORD :'dash_pw' CONNECTION LIMIT 8;
 GRANT CONNECT ON DATABASE tracksolid_db TO dashboard_app;
 GRANT USAGE ON SCHEMA tracksolid, reporting, tickets, fuel TO dashboard_app;
 GRANT SELECT ON ALL TABLES IN SCHEMA tracksolid, reporting, tickets, fuel TO dashboard_app;
 GRANT SELECT ON reporting.v_trips TO dashboard_app;
 GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA reporting TO dashboard_app;
 ALTER DEFAULT PRIVILEGES FOR ROLE tracksolid_owner IN SCHEMA tracksolid, reporting, tickets, fuel
  GRANT SELECT ON TABLES TO dashboard_app;        -- future objects (now owned by tracksolid_owner)
 ALTER ROLE dashboard_app SET statement_timeout = '30s';
 ALTER ROLE dashboard_app SET idle_in_transaction_session_timeout = '60s';
 ALTER ROLE dashboard_app SET idle_session_timeout = '5min';
 ALTER ROLE dashboard_app SET lock_timeout = '5s';
 -- ── 4. Verify ────────────────────────────────────────────────────────────────────
 -- \du+ tracksolid_owner    -- LOGIN + CONNECTION LIMIT 30
 -- SELECT pg_get_userbyid(relowner), count(*) FROM pg_class
 --   WHERE relnamespace IN (SELECT oid FROM pg_namespace WHERE nspname='tracksolid') GROUP BY 1;
--- a/uv.lock
+++ b/uv.lock