feat(stack): consolidate 7→4 services (merge pollers, drop pgbouncer/grafana)
Collapse the backend from 7 Coolify services to 4 app services + the DB. - Merge ingest_movement + ingest_events into a single ingest_worker: split each poller's main() into reusable startup_catchup()/register_jobs() and drive both from one schedule loop in new ingest_worker_rev.py (standalone entrypoints retained for local debug). - docker-compose.yaml: replace the two poller services with ingest_worker; remove the pgbouncer service (dormant; transaction-mode pooling is unsafe for the advisory-lock'd v_trips refresher) and the grafana service + grafana-data volume (redundant with the FleetOps SPA). - Add reporting.v_ingest_health (migration 19) + dashboard_api GET /health/ingest as the pipeline-freshness surface that replaces Grafana's health panels. webhook_receiver stays isolated so a poller fault can't drop inbound pushes. timescale_db and db_backup are unchanged. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
2f6ab1ba3b
commit
76f6915e61
7 changed files with 204 additions and 80 deletions
|
|
@ -180,6 +180,33 @@ def health():
|
||||||
return {"status": "ok"}
|
return {"status": "ok"}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Ingest pipeline freshness ────────────────────────────────────────────────
|
||||||
|
# Replaces the Grafana pipeline-health panels (Grafana removed 2026-06-10).
|
||||||
|
# Reads reporting.v_ingest_health (migration 19) — one row per ingest endpoint
|
||||||
|
# with last-run age + freshness verdict (ok|stale|error). Lets FleetOps show
|
||||||
|
# whether the ingest_worker pollers are alive without a separate dashboard product.
|
||||||
|
@app.get("/health/ingest")
def ingest_health():
    """Report ingest-pipeline freshness, one entry per ingest endpoint.

    Reads every row of reporting.v_ingest_health and returns a JSON body of
    the form {"overall": <verdict>, "endpoints": [<row>, ...]}.

    "overall" is the worst per-row ``freshness`` value: "error" if any row
    is "error", otherwise "stale" if any row is "stale", otherwise "ok".
    It is "unknown" when the view returns no rows.

    Any exception (DB unavailable, view missing, ...) is logged and answered
    with an "unknown" payload plus an error envelope rather than propagated.
    """
    try:
        with get_conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute("SELECT * FROM reporting.v_ingest_health")
                rows = cur.fetchall()

        verdicts = {r["freshness"] for r in rows}
        if not rows:
            worst = "unknown"
        elif "error" in verdicts:
            worst = "error"
        elif "stale" in verdicts:
            worst = "stale"
        else:
            worst = "ok"

        # The view exposes last_run_at as a timestamp, which arrives here as a
        # datetime. The standard json encoder cannot serialise datetime, so the
        # response would raise and fall into the except branch below on every
        # call with data. Render temporal values as ISO-8601 strings first.
        # NOTE(review): assumes JSONResponse is the stock Starlette/FastAPI
        # class with no custom encoder — confirm against this file's imports.
        endpoints = [
            {
                key: value.isoformat() if hasattr(value, "isoformat") else value
                for key, value in r.items()
            }
            for r in rows
        ]
        return JSONResponse({"overall": worst, "endpoints": endpoints})
    except Exception:
        log.exception("ingest-health failed")
        return JSONResponse(
            {"overall": "unknown", "endpoints": [],
             "error": {"type": "unknown",
                       "message": "Ingest-health feed is unavailable. Try again in a few seconds."}}
        )
|
||||||
|
|
||||||
|
|
||||||
# ── Live positions (#004) ───────────────────────────────────────────────────
|
# ── Live positions (#004) ───────────────────────────────────────────────────
|
||||||
|
|
||||||
@app.get("/webhook/live-positions")
|
@app.get("/webhook/live-positions")
|
||||||
|
|
|
||||||
|
|
@ -19,22 +19,14 @@ services:
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
ingest_movement:
|
ingest_worker:
|
||||||
|
# Merged movement + events pollers (was ingest_movement + ingest_events).
|
||||||
|
# Both pipelines run in one process via ingest_worker_rev.py — same image,
|
||||||
|
# same shared connection pool, one `schedule` loop. See ingest_worker_rev.py.
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
command: sh -c "python run_migrations.py && python ingest_movement_rev.py"
|
command: sh -c "python run_migrations.py && python ingest_worker_rev.py"
|
||||||
restart: always
|
|
||||||
depends_on:
|
|
||||||
timescale_db:
|
|
||||||
condition: service_healthy
|
|
||||||
env_file: .env
|
|
||||||
|
|
||||||
ingest_events:
|
|
||||||
build:
|
|
||||||
context: .
|
|
||||||
dockerfile: Dockerfile
|
|
||||||
command: sh -c "python run_migrations.py && python ingest_events_rev.py"
|
|
||||||
restart: always
|
restart: always
|
||||||
depends_on:
|
depends_on:
|
||||||
timescale_db:
|
timescale_db:
|
||||||
|
|
@ -84,61 +76,19 @@ services:
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 3
|
retries: 3
|
||||||
|
|
||||||
grafana:
|
# grafana — REMOVED 2026-06-10. Fleet visualisation/KPIs are now served by the
|
||||||
build:
|
# FleetOps SPA (own repo) via the dashboard_api read layer. Pipeline freshness
|
||||||
context: ./grafana
|
# (the one thing only Grafana surfaced) is replaced by reporting.v_ingest_health
|
||||||
dockerfile: Dockerfile
|
# (migration 19) exposed on the read-API. The grafana_ro role + reporting.*
|
||||||
restart: always
|
# grants are retained (harmless, reusable). Provisioning kept in ./grafana for
|
||||||
depends_on:
|
# reference. To restore, re-add this service block.
|
||||||
timescale_db:
|
|
||||||
condition: service_healthy
|
|
||||||
env_file: .env
|
|
||||||
environment:
|
|
||||||
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
|
|
||||||
- GF_USERS_DEFAULT_THEME=dark
|
|
||||||
- GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/etc/grafana/provisioning/dashboards-json/noc_fleet_dashboard.json
|
|
||||||
volumes:
|
|
||||||
- grafana-data:/var/lib/grafana
|
|
||||||
# Provisioning is baked into the image via grafana/Dockerfile — no bind mount needed.
|
|
||||||
# COOLIFY DOMAIN LOGIC:
|
|
||||||
# You will set the actual URL in the Coolify UI,
|
|
||||||
# but the service needs to expose port 3000 internally.
|
|
||||||
|
|
||||||
pgbouncer:
|
# pgbouncer — REMOVED 2026-06-10. It was deployed but dormant (zero clients
|
||||||
# Connection pooler in front of timescale_db.
|
# pointed at :6432; every service connects directly to timescale_db:5432).
|
||||||
# Runbook: docs/reference/260507_pgbouncer_deployment.md
|
# In-process pooling (ts_shared_rev ThreadedConnectionPool) is more than
|
||||||
# Internal Docker network only — no host port. SCRAM passthrough via
|
# sufficient at this scale, and transaction-mode pooling is unsafe for the
|
||||||
# auth_query against the public.user_lookup() function (migration 10).
|
# advisory-lock'd v_trips refresher (FIX-D02). Migration 10 (pgbouncer role +
|
||||||
image: edoburu/pgbouncer
|
# user_lookup()) is left applied but inert. To restore, re-add this service block.
|
||||||
restart: always
|
|
||||||
depends_on:
|
|
||||||
timescale_db:
|
|
||||||
condition: service_healthy
|
|
||||||
env_file: .env
|
|
||||||
environment:
|
|
||||||
- DB_HOST=timescale_db
|
|
||||||
- DB_PORT=5432
|
|
||||||
- DB_USER=${POSTGRES_USER}
|
|
||||||
- DB_PASSWORD=${POSTGRES_PASSWORD}
|
|
||||||
- DB_NAME=${POSTGRES_DB}
|
|
||||||
- POOL_MODE=transaction
|
|
||||||
- AUTH_TYPE=scram-sha-256
|
|
||||||
- AUTH_USER=pgbouncer
|
|
||||||
# $$1 escapes docker-compose interpolation; pgbouncer sees literal $1.
|
|
||||||
- AUTH_QUERY=SELECT uname, phash FROM public.user_lookup($$1)
|
|
||||||
- MAX_CLIENT_CONN=200
|
|
||||||
- DEFAULT_POOL_SIZE=15
|
|
||||||
- MIN_POOL_SIZE=2
|
|
||||||
- RESERVE_POOL_SIZE=5
|
|
||||||
- SERVER_RESET_QUERY=DISCARD ALL
|
|
||||||
- SERVER_IDLE_TIMEOUT=600
|
|
||||||
- ADMIN_USERS=${POSTGRES_USER}
|
|
||||||
- LISTEN_PORT=6432
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 6432 -U ${POSTGRES_USER}"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 3
|
|
||||||
|
|
||||||
db_backup:
|
db_backup:
|
||||||
build:
|
build:
|
||||||
|
|
@ -164,5 +114,4 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
timescale-data:
|
timescale-data:
|
||||||
name: timescale-data
|
name: timescale-data
|
||||||
grafana-data:
|
# grafana-data removed with the grafana service (2026-06-10).
|
||||||
name: grafana-data
|
|
||||||
|
|
|
||||||
|
|
@ -105,16 +105,24 @@ def poll_alarms():
|
||||||
|
|
||||||
# ── Main Loop ─────────────────────────────────────────────────────────────────
|
# ── Main Loop ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def main():
|
def startup_catchup():
|
||||||
log.info("Starting EVENTS PIPELINE (v2.1)...")
|
"""Run the alarm poll once on boot. Split out of main() so the merged
|
||||||
# OBD removed: Data arrives via webhook push (/pushobd), not polling.
|
ingest_worker can reuse it (DRY).
|
||||||
|
OBD removed: data arrives via webhook push (/pushobd), not polling."""
|
||||||
# Startup catch-up
|
|
||||||
safe_task(poll_alarms, log)()
|
safe_task(poll_alarms, log)()
|
||||||
|
|
||||||
# Schedule
|
|
||||||
|
def register_jobs():
|
||||||
|
"""Register the events jobs on the global `schedule` scheduler.
|
||||||
|
Reused by both this module's main() and ingest_worker_rev.main()."""
|
||||||
schedule.every(5).minutes.do(safe_task(poll_alarms, log))
|
schedule.every(5).minutes.do(safe_task(poll_alarms, log))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
log.info("Starting EVENTS PIPELINE (v2.1)...")
|
||||||
|
startup_catchup()
|
||||||
|
register_jobs()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
schedule.run_pending()
|
schedule.run_pending()
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
|
||||||
|
|
@ -671,10 +671,9 @@ def poll_stale_locations():
|
||||||
|
|
||||||
# ── Main Loop ─────────────────────────────────────────────────────────────────
|
# ── Main Loop ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def main():
|
def startup_catchup():
|
||||||
log.info("Starting MOVEMENT PIPELINE (v2.2)...")
|
"""Run every movement task once on boot so the DB is warm immediately.
|
||||||
|
Split out of main() so the merged ingest_worker can reuse it (DRY)."""
|
||||||
# Startup catch-up
|
|
||||||
safe_task(sync_devices, log)()
|
safe_task(sync_devices, log)()
|
||||||
safe_task(poll_live_positions, log)()
|
safe_task(poll_live_positions, log)()
|
||||||
safe_task(poll_trips, log)()
|
safe_task(poll_trips, log)()
|
||||||
|
|
@ -682,7 +681,10 @@ def main():
|
||||||
safe_task(poll_track_list, log)()
|
safe_task(poll_track_list, log)()
|
||||||
safe_task(poll_stale_locations, log)()
|
safe_task(poll_stale_locations, log)()
|
||||||
|
|
||||||
# Schedule
|
|
||||||
|
def register_jobs():
|
||||||
|
"""Register the movement jobs on the global `schedule` scheduler.
|
||||||
|
Reused by both this module's main() and ingest_worker_rev.main()."""
|
||||||
schedule.every(60).seconds.do(safe_task(poll_live_positions, log))
|
schedule.every(60).seconds.do(safe_task(poll_live_positions, log))
|
||||||
schedule.every(15).minutes.do(safe_task(poll_trips, log))
|
schedule.every(15).minutes.do(safe_task(poll_trips, log))
|
||||||
schedule.every(15).minutes.do(safe_task(poll_parking, log))
|
schedule.every(15).minutes.do(safe_task(poll_parking, log))
|
||||||
|
|
@ -690,6 +692,12 @@ def main():
|
||||||
schedule.every(10).minutes.do(safe_task(poll_stale_locations, log)) # [FIX-M21]
|
schedule.every(10).minutes.do(safe_task(poll_stale_locations, log)) # [FIX-M21]
|
||||||
schedule.every().day.at("02:00").do(safe_task(sync_devices, log))
|
schedule.every().day.at("02:00").do(safe_task(sync_devices, log))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
log.info("Starting MOVEMENT PIPELINE (v2.2)...")
|
||||||
|
startup_catchup()
|
||||||
|
register_jobs()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
schedule.run_pending()
|
schedule.run_pending()
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
|
||||||
52
ingest_worker_rev.py
Normal file
52
ingest_worker_rev.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
"""
|
||||||
|
ingest_worker_rev.py — Fireside Communications · Merged Ingest Worker
|
||||||
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||||
|
RESPONSIBILITY: Run the movement and events polling pipelines in a single
|
||||||
|
process. Consolidates the former `ingest_movement` and `ingest_events`
|
||||||
|
containers into one `ingest_worker` service.
|
||||||
|
|
||||||
|
WHY ONE PROCESS: both pipelines were identical in shape — blocking
|
||||||
|
`while True: schedule.run_pending()` daemons that register jobs onto the
|
||||||
|
`schedule` library's module-global default scheduler and share the same
|
||||||
|
ts_shared_rev ThreadedConnectionPool. Driving every job from one
|
||||||
|
run_pending() loop keeps the same jobs and cadences as running them separately (jobs now run one at a time, so a slow poll delays the others), with
|
||||||
|
one fewer container, one log stream, and one connection pool.
|
||||||
|
|
||||||
|
The inbound `webhook_receiver` is deliberately NOT merged here: pushed
|
||||||
|
device data is unrecoverable, so it stays isolated from poller faults.
|
||||||
|
|
||||||
|
Standalone entrypoints (`python ingest_movement_rev.py`,
|
||||||
|
`python ingest_events_rev.py`) remain intact for local debugging — this
|
||||||
|
module only reuses their startup_catchup()/register_jobs() helpers.
|
||||||
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import schedule
|
||||||
|
|
||||||
|
from ts_shared_rev import get_logger, setup_shutdown
|
||||||
|
import ingest_movement_rev as mv
|
||||||
|
import ingest_events_rev as ev
|
||||||
|
|
||||||
|
log = get_logger("ingest_worker")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the movement and events pipelines from a single scheduler loop.

    Installs the shutdown handler, runs each pipeline's startup catch-up
    (movement first, then events), registers each pipeline's jobs on the
    global `schedule` scheduler in the same order, and then polls the
    scheduler once per second. Does not return.
    """
    log.info("Starting INGEST WORKER — merged MOVEMENT + EVENTS pipelines")
    setup_shutdown(log)  # single SIGTERM/SIGINT handler for the shared DB pool

    pipelines = (mv, ev)

    # Warm both pipelines before any job is scheduled.
    for pipeline in pipelines:
        pipeline.startup_catchup()

    # All jobs land on the one module-global `schedule` scheduler.
    for pipeline in pipelines:
        pipeline.register_jobs()

    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == "__main__":
    main()
|
||||||
79
migrations/19_v_ingest_health.sql
Normal file
79
migrations/19_v_ingest_health.sql
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
-- 19_v_ingest_health.sql
-- reporting.v_ingest_health — per-endpoint ingest freshness for FleetOps.
--
-- CONTEXT: Grafana was removed (2026-06-10) as redundant with the FleetOps SPA.
-- The one signal only Grafana surfaced was pipeline freshness. This view replaces
-- it with a read-API-friendly surface derived from the existing
-- tracksolid.ingestion_log (every poll already writes a row via log_ingestion()),
-- so FleetOps can show "is the ingest pipeline alive / stale / erroring" per
-- endpoint without a separate dashboard product. Exposed by dashboard_api as
-- GET /health/ingest.
--
-- One row per endpoint: the latest run, how long ago, last success/error, 1-hour
-- run + failure counts, and a coarse freshness verdict. Each endpoint's expected
-- cadence mirrors the ingest_worker schedule (ingest_worker_rev.py); 'stale' fires
-- only past 3x that cadence. Endpoints without an explicit entry in the CASE
-- below fall back to a 1-hour cadence (stale after 3 hours), so any job that
-- runs less often than that needs its own entry to avoid being false-flagged.
-- Guarded + idempotent -> safe to re-apply.

CREATE OR REPLACE VIEW reporting.v_ingest_health AS
WITH last_run AS (
    -- Latest log row per endpoint.
    SELECT DISTINCT ON (endpoint)
           endpoint, run_at, success, error_code, error_message,
           rows_inserted, rows_upserted, imei_count, duration_ms
    FROM tracksolid.ingestion_log
    ORDER BY endpoint, run_at DESC
),
agg AS (
    -- Only the last hour feeds the counters, so restrict the scan before
    -- grouping instead of aggregating the whole log and filtering per row.
    -- Endpoints with no run in the window are absent here and come out as 0
    -- through the LEFT JOIN + COALESCE below.
    SELECT endpoint,
           count(*)                            AS runs_1h,
           count(*) FILTER (WHERE NOT success) AS failures_1h
    FROM tracksolid.ingestion_log
    WHERE run_at > now() - interval '1 hour'
    GROUP BY endpoint
)
SELECT
    lr.endpoint,
    lr.run_at                                        AS last_run_at,
    EXTRACT(EPOCH FROM (now() - lr.run_at))::int     AS seconds_ago,
    lr.success                                       AS last_success,
    lr.error_code,
    lr.error_message,
    lr.rows_inserted,
    lr.rows_upserted,
    COALESCE(a.runs_1h, 0)                           AS runs_1h,
    COALESCE(a.failures_1h, 0)                       AS failures_1h,
    ex.expected_interval_s,
    CASE
        -- Staleness is checked first: an old failure is reported as 'stale'.
        WHEN EXTRACT(EPOCH FROM (now() - lr.run_at)) > 3 * ex.expected_interval_s THEN 'stale'
        WHEN NOT lr.success THEN 'error'
        ELSE 'ok'
    END                                              AS freshness
FROM last_run lr
LEFT JOIN agg a USING (endpoint)
CROSS JOIN LATERAL (
    SELECT CASE lr.endpoint
        WHEN 'jimi.user.device.location.list'    THEN 60     -- live sweep (60s)
        WHEN 'jimi.device.alarm.list'            THEN 300    -- alarms (5m)
        WHEN 'jimi.device.track.mileage'         THEN 900    -- trips (15m)
        WHEN 'jimi.open.platform.report.parking' THEN 900    -- parking (15m)
        WHEN 'jimi.device.track.list'            THEN 1800   -- high-res trail (30m)
        ELSE 3600                                            -- default (1h)
    END AS expected_interval_s
) ex
ORDER BY seconds_ago DESC;

COMMENT ON VIEW reporting.v_ingest_health IS
    'Per-endpoint ingest freshness from tracksolid.ingestion_log. Replaces the '
    'Grafana pipeline-health panels (Grafana removed 2026-06-10). Surfaced by '
    'dashboard_api GET /health/ingest. freshness = ok|stale|error (stale = past 3x '
    'the ingest_worker_rev.py cadence).';

-- Read-only access for the dashboard roles (guarded; idempotent).
DO $grants$
BEGIN
    IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'grafana_ro') THEN
        GRANT SELECT ON reporting.v_ingest_health TO grafana_ro;
    END IF;
    IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'dashboard_ro') THEN
        GRANT SELECT ON reporting.v_ingest_health TO dashboard_ro;
    END IF;
END $grants$;
|
||||||
|
|
@ -42,6 +42,7 @@ MIGRATIONS = [
|
||||||
"16_live_feed_vehicle_type.sql", # add vehicle_type + fleet_segment to fn_live_positions feed
|
"16_live_feed_vehicle_type.sql", # add vehicle_type + fleet_segment to fn_live_positions feed
|
||||||
"17_fleetops_fuel_view.sql", # reporting.v_fuel_daily — FleetOps GET /analytics/fuel source
|
"17_fleetops_fuel_view.sql", # reporting.v_fuel_daily — FleetOps GET /analytics/fuel source
|
||||||
"18_grant_reporting_ro.sql", # grant SELECT on reporting.* to grafana_ro (staging read-only role)
|
"18_grant_reporting_ro.sql", # grant SELECT on reporting.* to grafana_ro (staging read-only role)
|
||||||
|
"19_v_ingest_health.sql", # reporting.v_ingest_health — pipeline freshness (replaces Grafana panels)
|
||||||
]
|
]
|
||||||
|
|
||||||
# ── Tables that must exist before the service is allowed to start ─────────────
|
# ── Tables that must exist before the service is allowed to start ─────────────
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue