From e1472adc3ae50532f5fef84753677c6c4007c4a3 Mon Sep 17 00:00:00 2001 From: kiania Date: Fri, 19 Jun 2026 23:51:52 +0300 Subject: [PATCH] infra(db-roles): dedicated non-superuser roles for the six apps on postgres MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six service connections run as the postgres SUPERUSER across two databases on the shared 100-connection server — the root of the "too many connections" peaks and a standing least-privilege risk. Superuser sessions ignore per-role CONNECTION LIMIT and can consume the superuser-reserved slots. Drafts (apply as postgres; nothing applied here): - scripts/app_roles_tracksolid_db.sql — webhook_app, ingest_app, worker_app, dashboard_app. Capability groups (ts_app_read / ts_app_write), per-app NOSUPERUSER login roles with hard CONNECTION LIMIT + bounded GUCs (statement_timeout, idle_session_timeout, idle_in_transaction, lock_timeout). - scripts/app_roles_fleet_platform.sql — gateway_app, cron_app (the apps on the separate fleet_platform DB), fp_app_rw group over its schemas. - scripts/MIGRATE_APPS_OFF_SUPERUSER.md — runbook: discovery (what each app actually writes / whether it runs DDL), connection-budget table (sum ≈ 81 < 100), the object-ownership step for migration-running apps (reassign app schemas to the existing tracksolid_owner — scoped, never REASSIGN OWNED globally), one-at-a-time cutover, and instant rollback (DATABASE_URL only). Grants are best-effort by app function and explicitly call out where to verify before cutover; all objects are postgres-owned, so row DML works but DDL needs the ownership step. See the runbook. 
Co-Authored-By: Claude Opus 4.8 --- scripts/MIGRATE_APPS_OFF_SUPERUSER.md | 144 ++++++++++++++++++++++++++ scripts/app_roles_fleet_platform.sql | 71 +++++++++++++ scripts/app_roles_tracksolid_db.sql | 119 +++++++++++++++++++++ 3 files changed, 334 insertions(+) create mode 100644 scripts/MIGRATE_APPS_OFF_SUPERUSER.md create mode 100644 scripts/app_roles_fleet_platform.sql create mode 100644 scripts/app_roles_tracksolid_db.sql diff --git a/scripts/MIGRATE_APPS_OFF_SUPERUSER.md b/scripts/MIGRATE_APPS_OFF_SUPERUSER.md new file mode 100644 index 0000000..69ae4f2 --- /dev/null +++ b/scripts/MIGRATE_APPS_OFF_SUPERUSER.md @@ -0,0 +1,144 @@ +# Migrating the stack apps off the `postgres` superuser + +## Why + +The Postgres server (`timescale_db`) has `max_connections = 100`. Six service +connections run as the **`postgres` superuser**, each with a persistent pool that +sits idle for hours. That's the root of the intermittent `FATAL: sorry, too many +clients already`: + +- superuser sessions can use the **`superuser_reserved_connections`** slots, so the + server can fill completely with no admin headroom; +- you can't put a per-role **`CONNECTION LIMIT`** or enforce timeouts on them + effectively; +- and it's a standing least-privilege risk (any of these apps can read/write/DROP + anything in any database). + +Giving each app a dedicated **NOSUPERUSER** role with a hard `CONNECTION LIMIT` fixes +all three. 
+ +## The six connections (confirmed live) + +| Service | Database | Current user | New role | Conn limit | +|---|---|---|---|---| +| `webhook_receiver` | tracksolid_db | postgres | `webhook_app` | 10 | +| `ingest_worker` | tracksolid_db | postgres | `ingest_app` | 10 | +| `worker` | tracksolid_db | postgres | `worker_app` (read) | 5 | +| `dashboard_api` (prod backend) | tracksolid_db | postgres | `dashboard_app` (or reuse `dashboard_ro`) | 8 | +| `gateway` | **fleet_platform** | postgres | `gateway_app` | 15 | +| `cron` | **fleet_platform** | postgres | `cron_app` | 5 | + +> Note `gateway`/`cron` use a **different database** (`fleet_platform`) on the same +> server — they still count against the shared 100-slot ceiling. + +### Connection budget (keep the sum < ~95, leaving 3 reserved + admin headroom) + +``` +webhook_app 10 + ingest_app 10 + worker_app 5 + dashboard_app 8 = 33 (tracksolid_db) +gateway_app 15 + cron_app 5 = 20 (fleet_platform) +analytics_ro ~8 + dashboard_ro ~12 + grafana_ro ~5 + reporting_refresher ~3 = ~28 (existing) + TOTAL ≈ 81 ✅ +``` +Tune the `CONNECTION LIMIT`s in the SQL to your real pool sizes; the point is the sum +is now **bounded and visible**, not open-ended superuser pools. + +## Step 1 — Discover what each app actually needs (do NOT skip) + +The drafted grants are best-effort (ingestion = write telemetry; gateway/cron = RW +app state; worker/dashboard = read). Confirm before cutover: + +```sql +-- (a) Which tables does each app WRITE? Reset stats, run the app for a bit, re-check: +SELECT schemaname, relname, n_tup_ins, n_tup_upd, n_tup_del +FROM pg_stat_user_tables +WHERE n_tup_ins + n_tup_upd + n_tup_del > 0 +ORDER BY 1,2; + +-- (b) Does the app run DDL/migrations at deploy? Check its code/entrypoint for +-- CREATE/ALTER/DROP or a migrations runner (e.g. run_migrations.py, alembic). +-- If yes → it needs object OWNERSHIP, see Step 3. +``` +Or temporarily set `log_statement = 'ddl'` (or `'mod'`) and watch one deploy cycle. 
+ +## Step 2 — Create the roles (no app impact yet) + +Generate a password per role (host-only, 0600), then apply the SQL as postgres: + +```bash +for r in webhook_app ingest_app worker_app dashboard_app gateway_app cron_app; do + [ -s ~/.$r.pw ] || ( umask 077; openssl rand -hex 24 > ~/.$r.pw ) +done +DB=$(docker ps --filter name=timescale_db --format '{{.Names}}' | head -1) + +docker exec -i "$DB" psql -U postgres -d tracksolid_db -v ON_ERROR_STOP=1 \ + -v webhook_pw="$(cat ~/.webhook_app.pw)" -v ingest_pw="$(cat ~/.ingest_app.pw)" \ + -v worker_pw="$(cat ~/.worker_app.pw)" -v dash_pw="$(cat ~/.dashboard_app.pw)" \ + < scripts/app_roles_tracksolid_db.sql + +docker exec -i "$DB" psql -U postgres -d fleet_platform -v ON_ERROR_STOP=1 \ + -v gateway_pw="$(cat ~/.gateway_app.pw)" -v cron_pw="$(cat ~/.cron_app.pw)" \ + < scripts/app_roles_fleet_platform.sql +``` + +## Step 3 — (Only if an app runs migrations) give its role object ownership + +All objects are owned by `postgres`, so a non-superuser role can write **rows** but +not `ALTER`/`DROP` existing tables. If discovery showed an app issues DDL, reassign +the **app schemas** to the existing non-superuser owner role and add the app role to +it. **Scope this to the app schemas — never `REASSIGN OWNED BY postgres` globally** +(that would also try to move TimescaleDB/system objects). + +```sql +-- tracksolid_db: make tracksolid_owner own the app objects, then add the ingestors. 
+DO $$
+DECLARE r record;
+BEGIN
+  FOR r IN
+    SELECT n.nspname, c.relname,
+           CASE c.relkind WHEN 'v' THEN 'VIEW' WHEN 'm' THEN 'MATERIALIZED VIEW' ELSE 'TABLE' END AS kind
+    FROM pg_class c JOIN pg_namespace n ON n.oid=c.relnamespace
+    WHERE n.nspname IN ('tracksolid','reporting') AND c.relkind IN ('r','p','v','m')
+  LOOP
+    EXECUTE format('ALTER %s %I.%I OWNER TO tracksolid_owner', r.kind, r.nspname, r.relname);
+  END LOOP;
+END $$;
+GRANT CREATE ON SCHEMA tracksolid, reporting TO tracksolid_owner;
+GRANT tracksolid_owner TO webhook_app, ingest_app;  -- they inherit ownership rights
+```
+(Do the analogous reassignment in `fleet_platform` to a `fleet_platform_owner` role
+if `gateway`/`cron` run migrations. Keep `reporting.v_trips` owned by
+`reporting_refresher` if that role refreshes it.)
+
+Test one deploy/migration as the new role **before** cutting over all apps.
+
+## Step 4 — Cut over one app at a time
+
+For each service, change its `DATABASE_URL` user/password from `postgres:…` to the new
+role (same host/port/dbname), redeploy **just that one**, and watch its logs for
+`permission denied` (→ widen the group grant) and the DB for connection count:
+
+```bash
+# in the app's env (Coolify secret or compose):
+#   tracksolid_db:  postgresql://webhook_app:<password>@timescale_db:5432/tracksolid_db
+#   fleet_platform: postgresql://gateway_app:<password>@timescale_db:5432/fleet_platform
+docker exec -i "$DB" psql -U postgres -d tracksolid_db -c \
+  "SELECT usename, count(*) FROM pg_stat_activity GROUP BY 1 ORDER BY 2 DESC;"
+```
+Order: start with the **lowest-risk reader** (`worker`/`dashboard_api`), then the
+ingestors, then `gateway`/`cron`.
+
+## Rollback (instant)
+
+Each app's only change is its `DATABASE_URL`. If anything misbehaves, set it back to
+the `postgres:…` DSN and redeploy that one app — no DB change required. The roles are
+additive; to remove one entirely: `DROP ROLE <role>;` (after nothing uses it and its grants, e.g. `CONNECT` on the database, are revoked — Postgres refuses to drop a role that still holds privileges).
+
+## After all six are migrated
+
+- `idle_session_timeout` needs no extra step: it is already covered by the per-role GUCs set by the role scripts.
+- Consider **rotating the `postgres` superuser password** and restricting it to admin
+  use only (it should no longer appear in any app's env).
+- Re-check the budget: `SELECT usename, count(*) FROM pg_stat_activity GROUP BY 1;`
+  — no app should exceed its `CONNECTION LIMIT`, and the total should sit comfortably
+  under 100. This is also when PgBouncer (separate PR) becomes optional rather than
+  necessary.
diff --git a/scripts/app_roles_fleet_platform.sql b/scripts/app_roles_fleet_platform.sql
new file mode 100644
index 0000000..b156a16
--- /dev/null
+++ b/scripts/app_roles_fleet_platform.sql
@@ -0,0 +1,71 @@
+-- app_roles_fleet_platform.sql — dedicated NON-SUPERUSER login roles for the apps
+-- that connect to the fleet_platform database as the `postgres` SUPERUSER.
+-- ─────────────────────────────────────────────────────────────────────────────
+-- Sibling of app_roles_tracksolid_db.sql, for the OTHER database on the same server.
+-- gateway + cron (the fleet_platform Coolify app) connect here as postgres. Same
+-- rationale: least privilege + a hard per-role CONNECTION LIMIT so they can't
+-- exhaust the server-wide 100-connection ceiling.
+--
+-- Schemas in fleet_platform: auth, domain, events, geo, ops, serve, slo, state
+-- (all owned by postgres). gateway (the API) and cron (scheduled jobs) almost
+-- certainly READ+WRITE app state across these, so they get DML; widen/narrow per
+-- the discovery step in MIGRATE_APPS_OFF_SUPERUSER.md. As with the sibling file,
+-- this does NOT change object ownership, so it does not grant DDL on existing
+-- (postgres-owned) objects — see step 3 of the runbook if these apps run migrations.
+--
+-- Run as the postgres SUPERUSER, on the fleet_platform database:
+--   docker exec -i <container> psql -U postgres -d fleet_platform -v ON_ERROR_STOP=1 \
+--     -v gateway_pw="$(cat ~/.gateway_app.pw)" \
+--     -v cron_pw="$(cat ~/.cron_app.pw)" \
+--     < scripts/app_roles_fleet_platform.sql
+
+\set ON_ERROR_STOP on
+
+-- ── 1. Capability group (read + write across the app schemas) ───────────────────
+DO $$ BEGIN  -- create the NOLOGIN group only if it is missing, so re-runs are no-ops
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='fp_app_rw') THEN CREATE ROLE fp_app_rw NOLOGIN; END IF;
+END $$;
+
+DO $grants$  -- per schema: grant on existing objects, then default privileges for future postgres-created ones
+DECLARE s text;  -- schema name for the current iteration
+BEGIN
+  FOREACH s IN ARRAY ARRAY['auth','domain','events','geo','ops','serve','slo','state'] LOOP
+    EXECUTE format('GRANT USAGE ON SCHEMA %I TO fp_app_rw', s);
+    EXECUTE format('GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA %I TO fp_app_rw', s);
+    EXECUTE format('GRANT USAGE, SELECT, UPDATE ON ALL SEQUENCES IN SCHEMA %I TO fp_app_rw', s);
+    EXECUTE format('GRANT EXECUTE ON ALL ROUTINES IN SCHEMA %I TO fp_app_rw', s);  -- ROUTINES = functions + procedures (ALL FUNCTIONS skips procedures)
+    EXECUTE format('ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA %I GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO fp_app_rw', s);
+    EXECUTE format('ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA %I GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO fp_app_rw', s);
+    EXECUTE format('ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA %I GRANT EXECUTE ON FUNCTIONS TO fp_app_rw', s);  -- in this command FUNCTIONS already includes procedures
+  END LOOP;
+END $grants$;
+
+-- ── 2. Per-app LOGIN roles ──────────────────────────────────────────────────────
+-- gateway — the request-facing API (latency-sensitive: short statement_timeout).
+DO $$ BEGIN
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='gateway_app') THEN
+    CREATE ROLE gateway_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
+  END IF; END $$;
+ALTER ROLE gateway_app WITH LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE NOREPLICATION NOBYPASSRLS PASSWORD :'gateway_pw' CONNECTION LIMIT 15;  -- re-assert attributes so a pre-existing role converges on re-run
+GRANT CONNECT ON DATABASE fleet_platform TO gateway_app;
+GRANT fp_app_rw TO gateway_app;
+ALTER ROLE gateway_app SET statement_timeout = '15s';
+ALTER ROLE gateway_app SET idle_in_transaction_session_timeout = '30s';
+ALTER ROLE gateway_app SET idle_session_timeout = '5min';
+ALTER ROLE gateway_app SET lock_timeout = '3s';
+
+-- cron — scheduled/background jobs (longer queries tolerated).
+DO $$ BEGIN
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='cron_app') THEN
+    CREATE ROLE cron_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
+  END IF; END $$;
+ALTER ROLE cron_app WITH LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE NOREPLICATION NOBYPASSRLS PASSWORD :'cron_pw' CONNECTION LIMIT 5;  -- re-assert attributes so a pre-existing role converges on re-run
+GRANT CONNECT ON DATABASE fleet_platform TO cron_app;
+GRANT fp_app_rw TO cron_app;
+ALTER ROLE cron_app SET statement_timeout = '120s';
+ALTER ROLE cron_app SET idle_in_transaction_session_timeout = '120s';
+ALTER ROLE cron_app SET idle_session_timeout = '10min';
+ALTER ROLE cron_app SET lock_timeout = '5s';
+
+-- ── 3. Verify ───────────────────────────────────────────────────────────────────
+-- \du+
diff --git a/scripts/app_roles_tracksolid_db.sql b/scripts/app_roles_tracksolid_db.sql
new file mode 100644
index 0000000..5916c72
--- /dev/null
+++ b/scripts/app_roles_tracksolid_db.sql
@@ -0,0 +1,119 @@
+-- app_roles_tracksolid_db.sql — dedicated NON-SUPERUSER login roles for the apps
+-- that currently connect to tracksolid_db as the `postgres` SUPERUSER.
+-- ─────────────────────────────────────────────────────────────────────────────
+-- WHY: six stack services connect to this Postgres server as the postgres superuser
+-- (webhook_receiver, ingest_worker, worker, the prod dashboard_api backend on
+-- tracksolid_db; gateway + cron on fleet_platform — see the sibling file). That is
+-- both a least-privilege problem AND the root of the "too many connections" error:
+-- superuser sessions ignore per-role connection caps and can exhaust the 100-slot
+-- ceiling (incl. the superuser-reserved slots). Dedicated roles let us pin a hard
+-- CONNECTION LIMIT and timeouts per app.
+--
+-- WHAT THIS DOES (run as the postgres SUPERUSER, on tracksolid_db):
+--   * creates capability GROUP roles (NOLOGIN) for read vs. read-write,
+--   * creates one LOGIN role per app, NOSUPERUSER, with a CONNECTION LIMIT and
+--     bounded GUCs, as a member of the group it needs,
+--   * grants the groups SELECT / DML on the operational schemas.
+--
+-- WHAT IT DOES *NOT* DO: change object ownership. All objects here are owned by
+-- `postgres`, so a non-superuser role can write ROWS but cannot ALTER/DROP existing
+-- tables (i.e. run migrations). If an app runs DDL at deploy, see step 3 in
+-- MIGRATE_APPS_OFF_SUPERUSER.md (reassign the app schemas to `tracksolid_owner` and
+-- add the app role to it). Roles here INHERIT, so membership grants apply directly.
+--
+-- Idempotent. Passwords are supplied as psql vars (never stored in the repo):
+--   docker exec -i <container> psql -U postgres -d tracksolid_db -v ON_ERROR_STOP=1 \
+--     -v webhook_pw="$(cat ~/.webhook_app.pw)" \
+--     -v ingest_pw="$(cat ~/.ingest_app.pw)" \
+--     -v worker_pw="$(cat ~/.worker_app.pw)" \
+--     -v dash_pw="$(cat ~/.dashboard_app.pw)" \
+--     < scripts/app_roles_tracksolid_db.sql
+
+\set ON_ERROR_STOP on
+
+-- ── 1. Capability groups (NOLOGIN; apps inherit privileges via membership) ──────
+DO $$
+BEGIN
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='ts_app_read') THEN CREATE ROLE ts_app_read NOLOGIN; END IF;
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='ts_app_write') THEN CREATE ROLE ts_app_write NOLOGIN; END IF;
+END $$;
+
+-- Read surface: telemetry + curated reporting layer.
+GRANT USAGE ON SCHEMA tracksolid, reporting TO ts_app_read;
+GRANT SELECT ON ALL TABLES IN SCHEMA tracksolid, reporting TO ts_app_read;
+GRANT SELECT ON reporting.v_trips TO ts_app_read;  -- matview; kept explicit (ALL TABLES above is expected to cover views/matviews already — harmless if redundant)
+GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA reporting TO ts_app_read;
+ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA tracksolid, reporting GRANT SELECT ON TABLES TO ts_app_read;
+
+-- Write surface for ingestion: row DML on telemetry (NOT DDL — see header).
+GRANT ts_app_read TO ts_app_write;  -- write implies read
+GRANT INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA tracksolid TO ts_app_write;
+GRANT USAGE, SELECT, UPDATE ON ALL SEQUENCES IN SCHEMA tracksolid TO ts_app_write;
+ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA tracksolid
+  GRANT INSERT, UPDATE, DELETE ON TABLES TO ts_app_write;
+ALTER DEFAULT PRIVILEGES FOR ROLE postgres IN SCHEMA tracksolid
+  GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO ts_app_write;
+
+-- ── 2. Per-app LOGIN roles ──────────────────────────────────────────────────────
+-- CONNECTION LIMIT is the hard budget cap (sum across all roles must stay < 100).
+-- GUCs are belt-and-braces and tunable per app.
+
+-- webhook_receiver — ingests Tracksolid webhooks (writes telemetry; may run migrations).
+DO $$ BEGIN
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='webhook_app') THEN
+    CREATE ROLE webhook_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
+  END IF; END $$;
+ALTER ROLE webhook_app WITH LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE NOREPLICATION NOBYPASSRLS PASSWORD :'webhook_pw' CONNECTION LIMIT 10;  -- re-assert attributes so a pre-existing role converges on re-run
+GRANT CONNECT ON DATABASE tracksolid_db TO webhook_app;
+GRANT ts_app_write TO webhook_app;
+ALTER ROLE webhook_app SET statement_timeout = '120s';  -- bulk inserts
+ALTER ROLE webhook_app SET idle_in_transaction_session_timeout = '120s';
+ALTER ROLE webhook_app SET idle_session_timeout = '10min';
+ALTER ROLE webhook_app SET lock_timeout = '5s';
+
+-- ingest_worker — background ingestion/normalisation (writes telemetry).
+DO $$ BEGIN
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='ingest_app') THEN
+    CREATE ROLE ingest_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
+  END IF; END $$;
+ALTER ROLE ingest_app WITH LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE NOREPLICATION NOBYPASSRLS PASSWORD :'ingest_pw' CONNECTION LIMIT 10;  -- re-assert attributes so a pre-existing role converges on re-run
+GRANT CONNECT ON DATABASE tracksolid_db TO ingest_app;
+GRANT ts_app_write TO ingest_app;
+ALTER ROLE ingest_app SET statement_timeout = '120s';
+ALTER ROLE ingest_app SET idle_in_transaction_session_timeout = '120s';
+ALTER ROLE ingest_app SET idle_session_timeout = '10min';
+ALTER ROLE ingest_app SET lock_timeout = '5s';
+-- If ingestion REFRESHes reporting.v_trips, add it to the existing refresher role:
+--   GRANT reporting_refresher TO ingest_app;   -- (uncomment after confirming)
+
+-- worker — fleet_platform worker that also reads tracksolid_db. Assumed READ-ONLY
+-- here; widen to ts_app_write only if it actually writes telemetry.
+DO $$ BEGIN
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='worker_app') THEN
+    CREATE ROLE worker_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
+  END IF; END $$;
+ALTER ROLE worker_app WITH LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE NOREPLICATION NOBYPASSRLS PASSWORD :'worker_pw' CONNECTION LIMIT 5;  -- re-assert attributes so a pre-existing role converges on re-run
+GRANT CONNECT ON DATABASE tracksolid_db TO worker_app;
+GRANT ts_app_read TO worker_app;
+ALTER ROLE worker_app SET statement_timeout = '60s';
+ALTER ROLE worker_app SET idle_in_transaction_session_timeout = '60s';
+ALTER ROLE worker_app SET idle_session_timeout = '10min';
+ALTER ROLE worker_app SET lock_timeout = '5s';
+
+-- dashboard_api (PROD backend, currently postgres). If it only reads, prefer the
+-- existing dashboard_ro. This role is for a backend that ALSO writes app state;
+-- start read-only and widen per discovery.
+DO $$ BEGIN
+  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='dashboard_app') THEN
+    CREATE ROLE dashboard_app LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE;
+  END IF; END $$;
+ALTER ROLE dashboard_app WITH LOGIN INHERIT NOSUPERUSER NOCREATEDB NOCREATEROLE NOREPLICATION NOBYPASSRLS PASSWORD :'dash_pw' CONNECTION LIMIT 8;  -- re-assert attributes so a pre-existing role converges on re-run
+GRANT CONNECT ON DATABASE tracksolid_db TO dashboard_app;
+GRANT ts_app_read TO dashboard_app;
+ALTER ROLE dashboard_app SET statement_timeout = '30s';
+ALTER ROLE dashboard_app SET idle_in_transaction_session_timeout = '60s';
+ALTER ROLE dashboard_app SET idle_session_timeout = '5min';
+ALTER ROLE dashboard_app SET lock_timeout = '5s';
+
+-- ── 3. Verify ───────────────────────────────────────────────────────────────────
+-- \du+   -- inspect roles, CONNECTION LIMIT, and memberships