diff --git a/10_pgbouncer_auth.sql b/10_pgbouncer_auth.sql new file mode 100644 index 0000000..00c1a5c --- /dev/null +++ b/10_pgbouncer_auth.sql @@ -0,0 +1,32 @@ +-- 10_pgbouncer_auth.sql +-- pgbouncer SCRAM passthrough auth: dedicated role + user_lookup() function. +-- Runbook: 260507_pgbouncer_deployment.md +-- +-- Idempotent. Re-applying is a no-op: +-- * Role created only when missing (placeholder password, replaced on every +-- container startup by run_migrations.py:sync_role_passwords from +-- PGBOUNCER_AUTH_PASSWORD). +-- * Function uses CREATE OR REPLACE; search_path is pinned because it runs as SECURITY DEFINER. +-- * GRANT/REVOKE are safe to re-run. + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'pgbouncer') THEN + CREATE ROLE pgbouncer LOGIN PASSWORD 'SET_PASSWORD_IN_ENV'; + END IF; +END +$$; + +CREATE OR REPLACE FUNCTION public.user_lookup(in_user text, + OUT uname text, OUT phash text) RETURNS record AS $$ +BEGIN + SELECT usename, passwd + FROM pg_catalog.pg_shadow + WHERE usename = in_user + INTO uname, phash; + RETURN; +END; +$$ LANGUAGE plpgsql SECURITY DEFINER SET search_path = pg_catalog, pg_temp; + +REVOKE ALL ON FUNCTION public.user_lookup(text) FROM public; +GRANT EXECUTE ON FUNCTION public.user_lookup(text) TO pgbouncer; diff --git a/260507_pgbouncer_deployment.md b/260507_pgbouncer_deployment.md new file mode 100644 index 0000000..7decf0f --- /dev/null +++ b/260507_pgbouncer_deployment.md @@ -0,0 +1,328 @@ +# pgbouncer + pgAdmin4 sidecar deployment + +**Date:** 2026-05-07 +**Branch:** `quality-program-2026-04-12` +**Status:** Plan approved; implementation pending + +--- + +## Context + +**Driver:** pgAdmin4 running on the maintainer's laptop has been exhausting +`tracksolid_db`'s `max_connections`. Each Query Tool tab in pgAdmin holds its +own long-lived backend connection; combined with the existing peak of ~50–60 +connections from the ingest pipeline, the budget tips over and cascades — +pgcli (and anything else trying to connect) starts failing. 
+ +**Goal:** Add pgbouncer in front of `timescale_db` to enforce a connection +budget via transaction-mode pooling, and deploy pgAdmin4 as a Coolify-managed +sidecar that connects through pgbouncer over the Docker network. Net effect: +pgAdmin sprawl is multiplexed onto a small fixed pool of backends, admin +tooling moves on-VM (lower latency, persistent workspace, smaller external +attack surface), and host port 5433 becomes optional/closeable in a follow-up. + +**Frozen scope (unchanged this round):** +- DWH bronze pipeline (`dwh/*.sql`, `tracksolid_dwh@31.97.44.246:5888`) +- n8n DWH workflows (`n8n-workflows/dwh_extract*`, `dwh_load_bronze*`) +- Grafana provisioning (`grafana/provisioning/datasources/...`) +- Python ingest containers (`ingest_movement_rev.py`, `ingest_events_rev.py`, + `webhook_receiver_rev.py`) — they keep talking to `timescale_db:5432` + directly. Cutover, if desired, is a separate plan. +- `db_backup` sidecar — `pg_dump` is incompatible with transaction-mode + pooling and stays on `timescale_db:5432`. + +--- + +## Phase 1 — pgbouncer sidecar, no client cutover + +Add a new service to `docker-compose.yaml`. Internal Docker network only; +no host port binding. 
+ +```yaml + pgbouncer: + image: edoburu/pgbouncer:1.23.1 + restart: always + depends_on: + timescale_db: + condition: service_healthy + env_file: .env + environment: + - DB_HOST=timescale_db + - DB_PORT=5432 + - DB_USER=${POSTGRES_USER} + - DB_PASSWORD=${POSTGRES_PASSWORD} + - DB_NAME=${POSTGRES_DB} + - POOL_MODE=transaction + - AUTH_TYPE=scram-sha-256 + - MAX_CLIENT_CONN=200 + - DEFAULT_POOL_SIZE=15 + - MIN_POOL_SIZE=2 + - RESERVE_POOL_SIZE=5 + - SERVER_RESET_QUERY=DISCARD ALL + - SERVER_IDLE_TIMEOUT=600 + - ADMIN_USERS=${POSTGRES_USER} + - LISTEN_PORT=6432 + - AUTH_USER=pgbouncer + - AUTH_QUERY=SELECT uname, phash FROM public.user_lookup($$1) + healthcheck: + test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 6432 -U ${POSTGRES_USER}"] + interval: 30s + timeout: 5s + retries: 3 +``` + +**Why these values:** +- `POOL_MODE=transaction` — recycles backend on every transaction boundary. + Cuts pgAdmin's per-tab idle conn from 1 backend → ~0 when idle. +- `DEFAULT_POOL_SIZE=15` — total backend slots per (user, db) pair. Sits + comfortably under Postgres `max_connections` (default 100) leaving room for + ingest's existing ~50–60. +- `MAX_CLIENT_CONN=200` — pgAdmin can open as many tabs as it wants; they + queue rather than fail. +- `RESERVE_POOL_SIZE=5` — emergency slack when `default_pool_size` saturates. +- `SERVER_RESET_QUERY=DISCARD ALL` — query pgbouncer uses to wipe session state when a backend is released. + In transaction mode it only runs when `server_reset_query_always=1`; without that, leaked `SET`s from one client can bleed into the next. + +### Auth: SCRAM passthrough via `auth_query` + +Avoids hand-maintaining `userlist.txt`. pgbouncer authenticates as a +dedicated `pgbouncer` Postgres role and looks up SCRAM hashes for the +requesting user via a SECURITY DEFINER function. + +New migration `10_pgbouncer_auth.sql` (08 and 09 are taken by +`08_analytics_config.sql` and `09_trips_enrichment.sql`): + +```sql +-- Role created with placeholder password; run_migrations.py:sync_role_passwords +-- replaces it with PGBOUNCER_AUTH_PASSWORD on every container startup. 
+-- Same convention used today for grafana_ro. +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'pgbouncer') THEN + CREATE ROLE pgbouncer LOGIN PASSWORD 'SET_PASSWORD_IN_ENV'; + END IF; +END +$$; + +CREATE OR REPLACE FUNCTION public.user_lookup(in_user text, + OUT uname text, OUT phash text) RETURNS record AS $$ +BEGIN + SELECT usename, passwd FROM pg_catalog.pg_shadow + WHERE usename = in_user INTO uname, phash; + RETURN; +END; +$$ LANGUAGE plpgsql SECURITY DEFINER; + +REVOKE ALL ON FUNCTION public.user_lookup(text) FROM public; +GRANT EXECUTE ON FUNCTION public.user_lookup(text) TO pgbouncer; +``` + +Two changes to `run_migrations.py`: +1. Append `"10_pgbouncer_auth.sql"` to `MIGRATIONS`. +2. Extend `sync_role_passwords()` `roles` dict with + `"pgbouncer": os.getenv("PGBOUNCER_AUTH_PASSWORD")`. + +The migration is applied by the next ingest container restart and recorded +in `tracksolid.schema_migrations`. `sync_role_passwords` then runs `ALTER ROLE` with +the password from the env var, replacing the placeholder in the same startup. Set `PGBOUNCER_AUTH_PASSWORD` before that restart so the placeholder cannot remain in effect. + +### New env vars in `.env` + +- `PGBOUNCER_AUTH_PASSWORD` — password for the new `pgbouncer` Postgres role +- (existing vars reused: `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_DB`) + +### Phase 1 verification + +1. Apply migration via ingest container restart; confirm in + `tracksolid.schema_migrations` that `10_pgbouncer_auth.sql` is recorded. +2. `docker compose up -d pgbouncer`. +3. From inside any compose service: + ```bash + psql -h pgbouncer -p 6432 -U postgres -d tracksolid_db -c 'SELECT 1' + ``` +4. From the pgbouncer container's admin console: + ```bash + psql -h 127.0.0.1 -p 6432 -U postgres -d pgbouncer -c 'SHOW POOLS;' + ``` + Confirm pool mode = `transaction`, server connections within + `default_pool_size`. +5. `SHOW STATS;` and `SHOW CLIENTS;` should both respond. +6. Confirm no client has cut over: `tracksolid.ingestion_log` continues + accumulating; Grafana panels keep refreshing. 
+ +--- + +## Phase 2 — pgAdmin4 sidecar pointed at pgbouncer + +Coolify UI maps an HTTPS subdomain (e.g. `pgadmin.stage.rahamafresh.com`) to +internal port 80, mirroring the Grafana pattern at `docker-compose.yaml:78–80`. + +```yaml + pgadmin: + image: dpage/pgadmin4:8.14 + restart: always + depends_on: + pgbouncer: + condition: service_healthy + env_file: .env + environment: + - PGADMIN_DEFAULT_EMAIL=${PGADMIN_DEFAULT_EMAIL} + - PGADMIN_DEFAULT_PASSWORD=${PGADMIN_DEFAULT_PASSWORD} + - PGADMIN_CONFIG_SERVER_MODE=True + - PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED=False + - PGADMIN_DISABLE_POSTFIX=True + volumes: + - pgadmin-data:/var/lib/pgadmin + - ./pgadmin/servers.json:/pgadmin4/servers.json:ro + # COOLIFY DOMAIN LOGIC: + # Set the actual URL in the Coolify UI; service exposes port 80 internally. +``` + +Add `pgadmin-data` to the `volumes:` block at the bottom of the compose file. + +### Pre-registered server (`pgadmin/servers.json`) + +```json +{ + "Servers": { + "1": { + "Name": "tracksolid_db (via pgbouncer)", + "Group": "Servers", + "Host": "pgbouncer", + "Port": 6432, + "MaintenanceDB": "tracksolid_db", + "Username": "postgres", + "SSLMode": "disable", + "ConnectionParameters": { + "sslmode": "disable", + "connect_timeout": 10 + } + } + } +} +``` + +### New env vars in `.env` + +- `PGADMIN_DEFAULT_EMAIL` +- `PGADMIN_DEFAULT_PASSWORD` + +### Phase 2 verification + +1. In the Coolify UI, point a subdomain at the `pgadmin` service, port 80. +2. Open the URL, log in with `PGADMIN_DEFAULT_EMAIL` / + `PGADMIN_DEFAULT_PASSWORD`. +3. The pre-registered "tracksolid_db (via pgbouncer)" server appears in the + left tree. Connect; provide the `postgres` password when prompted (pgAdmin + stores it in its own keyring after first use). +4. Open a Query Tool, run `SELECT now(), current_user;`. +5. 
From the `pgbouncer` container admin console: + ```sql + SHOW POOLS; + ``` + `cl_active` should reflect the open pgAdmin tab(s); `sv_active` / + `sv_idle` should sum to ≤ `default_pool_size` (15). +6. **Stress test:** open ~30 Query Tool tabs, run `SELECT pg_sleep(0.1);` in + each. Confirm `tracksolid_db` total connection count stays bounded: + ```sql + SELECT count(*) FROM pg_stat_activity; + ``` + Should be `default_pool_size + reserve_pool_size + (other clients)`, + not the number of pgAdmin tabs. + +--- + +## Files to modify / create + +| Path | Change | +|---|---| +| `260507_pgbouncer_deployment.md` | THIS FILE — runbook for the rollout | +| `docker-compose.yaml` | Add `pgbouncer` and `pgadmin` services; add `pgadmin-data` volume | +| `10_pgbouncer_auth.sql` | NEW — creates `pgbouncer` role + `public.user_lookup` SECURITY DEFINER function | +| `pgadmin/servers.json` | NEW — pre-registers `pgbouncer:6432` as the default server | +| `.env` | Add `PGBOUNCER_AUTH_PASSWORD`, `PGADMIN_DEFAULT_EMAIL`, `PGADMIN_DEFAULT_PASSWORD` (do not commit values) | +| `docs/CONNECTIONS.md` | Add a "pgbouncer + pgAdmin" section: pool mode, exposure, who uses it, how to connect for ad-hoc admin | +| `CLAUDE.md` §3 / §4 | Note that admin tooling now goes through `pgbouncer:6432`; ingest/grafana/backup remain direct; reference this runbook | + +## Files NOT to modify (frozen scope) + +- `grafana/provisioning/datasources/tracksolid_postgres.yaml` +- `n8n-workflows/dwh_extract*.json`, `n8n-workflows/dwh_load_bronze*.json` +- `dwh/*.sql` +- `ingest_movement_rev.py`, `ingest_events_rev.py`, `webhook_receiver_rev.py`, + `ts_shared_rev.py` +- `backup/` — `pg_dump` keeps using `timescale_db:5432` directly + +--- + +## Reused conventions and utilities + +- `run_migrations.py` already applies new `NN_*.sql` files in order against + `tracksolid_db` and tracks them in `tracksolid.schema_migrations`. Phase 1 + adds `10_pgbouncer_auth.sql` to this flow — no new tooling needed. 
+- `env_file: .env` + `depends_on: condition: service_healthy` mirrors + the existing pattern at `docker-compose.yaml:28–31, 39–42, 50–53, 67–70, + 87–90`. +- Coolify domain-via-UI mirrors the Grafana comment at + `docker-compose.yaml:78–80` and the webhook_receiver comment at + `docker-compose.yaml:54–55`. +- Container-name resolution rule from CLAUDE.md §3 still applies for any + `docker exec` against the new services: + ```bash + docker ps --filter name=pgbouncer --format "{{.Names}}" | head -1 + docker ps --filter name=pgadmin --format "{{.Names}}" | head -1 + ``` + +--- + +## Out-of-scope follow-ups (separate plans) + +1. **Cut over Python ingest to pgbouncer.** Change `DATABASE_URL` in `.env` + from `timescale_db:5432` to `pgbouncer:6432`. Requires verifying psycopg2 + pool + SAVEPOINTs against transaction-mode pgbouncer (low risk per + exploration — no LISTEN/NOTIFY, no advisory locks across statements, no + prepared statements in the codebase). +2. **Close host port 5433** on `timescale_db` once pgAdmin web UI is the + established admin path. Removes the public-IP Postgres exposure entirely. +3. **Rotate `dwh_owner` / `grafana_ro` plaintext passwords** still in + `dwh/260423_dwh_ddl_v1.sql` (pre-existing item from CLAUDE.md §10). + +--- + +## Rollback + +If pgbouncer or pgAdmin misbehaves: + +1. **Stop the new services without touching the rest of the stack:** + ```bash + docker compose stop pgbouncer pgadmin + docker compose rm -f pgbouncer pgadmin + ``` + Ingest, Grafana, webhook, backup are unaffected — they were never cut + over. +2. **Revert the SQL migration if needed:** + ```sql + DROP FUNCTION public.user_lookup(text); + DROP ROLE pgbouncer; + DELETE FROM tracksolid.schema_migrations + WHERE filename = '10_pgbouncer_auth.sql'; + ``` +3. **Revert compose changes** by checking out the prior `docker-compose.yaml`. 
+ +--- + +## End-to-end verification checklist + +- [ ] `10_pgbouncer_auth.sql` applied — visible in + `tracksolid.schema_migrations` +- [ ] `pgbouncer` service healthy — `docker compose ps` shows `healthy` +- [ ] `psql -h pgbouncer -p 6432 -U postgres -d tracksolid_db -c 'SELECT 1'` + from inside the network +- [ ] `SHOW POOLS;` in pgbouncer admin shows `transaction` mode +- [ ] `pgadmin` service healthy — Coolify domain reachable over HTTPS +- [ ] Login + query through pgAdmin succeeds +- [ ] `SELECT count(*) FROM pg_stat_activity;` stays bounded under + 30-tab stress test +- [ ] Existing pipelines unaffected: `tracksolid.ingestion_log` continues + growing at current rate; Grafana dashboards still render +- [ ] pgcli no longer hits "too many connections" when used alongside pgAdmin diff --git a/docker-compose.yaml b/docker-compose.yaml index 882b3d0..d672140 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -79,6 +79,42 @@ services: # You will set the actual URL in the Coolify UI, # but the service needs to expose port 3000 internally. + pgbouncer: + # Connection pooler in front of timescale_db. + # Runbook: 260507_pgbouncer_deployment.md + # Internal Docker network only — no host port. SCRAM passthrough via + # auth_query against the public.user_lookup() function (migration 10). + image: edoburu/pgbouncer:1.23.1 + restart: always + depends_on: + timescale_db: + condition: service_healthy + env_file: .env + environment: + - DB_HOST=timescale_db + - DB_PORT=5432 + - DB_USER=${POSTGRES_USER} + - DB_PASSWORD=${POSTGRES_PASSWORD} + - DB_NAME=${POSTGRES_DB} + - POOL_MODE=transaction + - AUTH_TYPE=scram-sha-256 + - AUTH_USER=pgbouncer + # $$1 escapes docker-compose interpolation; pgbouncer sees literal $1. 
+ - AUTH_QUERY=SELECT uname, phash FROM public.user_lookup($$1) + - MAX_CLIENT_CONN=200 + - DEFAULT_POOL_SIZE=15 + - MIN_POOL_SIZE=2 + - RESERVE_POOL_SIZE=5 + - SERVER_RESET_QUERY=DISCARD ALL # NOTE(review): pgbouncer docs say server_reset_query is skipped in transaction pooling unless server_reset_query_always=1 — confirm intent + - SERVER_IDLE_TIMEOUT=600 + - ADMIN_USERS=${POSTGRES_USER} + - LISTEN_PORT=6432 + healthcheck: + test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 6432 -U ${POSTGRES_USER}"] + interval: 30s + timeout: 5s + retries: 3 + db_backup: build: context: ./backup diff --git a/run_migrations.py b/run_migrations.py index 143f71e..0618a55 100644 --- a/run_migrations.py +++ b/run_migrations.py @@ -33,6 +33,7 @@ MIGRATIONS = [ "07_analytics_views.sql", # Grafana-facing views in tracksolid.* "08_analytics_config.sql", # ops.cost_rates, ops.kpi_targets + seed data "09_trips_enrichment.sql", # trips.route_geom + addresses + plate + v_trips_enriched + "10_pgbouncer_auth.sql", # pgbouncer role + user_lookup() for SCRAM passthrough ] # ── Tables that must exist before the service is allowed to start ───────────── @@ -180,6 +181,7 @@ def sync_role_passwords(conn): """ roles = { "grafana_ro": os.getenv("GRAFANA_DB_RO_PASSWORD"), + "pgbouncer": os.getenv("PGBOUNCER_AUTH_PASSWORD"), } with conn.cursor() as cur: for role, password in roles.items():