Security:
- .dockerignore + Dockerfile: stop baking .env / the 346MB OSM pbf into image
layers; install pinned from uv.lock (reproducible builds) (SEC-04/05).
- docker-compose: DB port binds ${DB_BIND_ADDR:-127.0.0.1} — loopback-only by
default; remote tooling moves to an SSH tunnel (SEC-01).
- webhook_receiver: CRITICAL startup warning + WEBHOOK_REQUIRE_TOKEN=1 fail-closed
when JIMI_WEBHOOK_TOKEN is empty (SEC-02 / FIX-W01).
Correctness:
- FIX-M22/E07: capture cur.rowcount BEFORE RELEASE SAVEPOINT in poll_alarms/
poll_trips/poll_parking — the RELEASE reported -1, producing "Alarms: -4 new
events inserted" logs and negative ingestion_log.rows_inserted.
- FIX-W02: parse application/json push bodies (were silently dropped).
- FIX-W03: move webhook DB work off the event loop via asyncio.to_thread.
- FIX-M23: poll_trips phased so no txn/connection is held across Tracksolid +
Nominatim (1 req/s) network calls.
- FIX-M24: sync_devices disables devices absent from every target (guarded).
- FIX-W04: reject device-clock-garbage alarm_time (2019 timestamps observed).
- get_token(): don't relabel already-aware timestamptz expiries (BUG-P9).
Observability/lifecycle:
- migration 21: v_ingest_health restricted to active pipeline endpoints so
one-shot tools stop wedging /health/ingest at 'stale' (dry-run verified).
- FIX-M25: daily purge_audit_logs() trims ingestion_log (90d) + refresh_log (180d).
- remove orphaned duplicate migrations/10_driver_clock_views.sql; ruff lint config.
+5 webhook tests (82 pass). Report/plan/work-log in docs/reports/260702_*.
Local only; not deployed. CLAUDE.md fix-history edits left uncommitted (that file
also carries unrelated in-progress edits).
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
97 lines
4.5 KiB
SQL
97 lines
4.5 KiB
SQL
-- 21_ingest_health_active_only.sql
-- BUG-P5 (260702 audit): reporting.v_ingest_health included EVERY endpoint ever
-- written to tracksolid.ingestion_log. One-shot operator tools (e.g.
-- backfill_trips_enrichment, last run 2026-05-01) therefore sat at 'stale'
-- forever, wedging dashboard_api GET /health/ingest at "overall": "stale" even
-- when every real poller was healthy — the FleetOps freshness panel signal was
-- useless.
--
-- Fix: restrict the view to the ALLOW-LIST of endpoints the running pipeline
-- actually emits (ingest_worker_rev.py schedule + webhook_receiver_rev.py).
-- Adding a new pipeline endpoint => extend the list here (and the cadence CASE).
-- Everything else about the view (columns, verdict logic) is unchanged, so the
-- dashboard_api endpoint and its consumers need no changes.
-- Idempotent — CREATE OR REPLACE.
|
|
|
|
-- reporting.v_ingest_health: one row per allow-listed endpoint that has at
-- least one row in tracksolid.ingestion_log. Each row carries the endpoint's
-- newest run, its run/failure counts over the trailing hour, the cadence the
-- endpoint is expected to keep, and a freshness verdict ('ok'|'stale'|'error').
CREATE OR REPLACE VIEW reporting.v_ingest_health AS
WITH pipeline_endpoints (endpoint) AS (
    -- Allow-list: log rows for any endpoint not named here are ignored.
    VALUES
        ('jimi.user.device.location.list'),     -- live sweep (60s)
        ('jimi.device.alarm.list'),             -- alarms (5m)
        ('jimi.device.track.mileage'),          -- trips (15m)
        ('jimi.open.platform.report.parking'),  -- parking (15m)
        ('jimi.device.track.list'),             -- high-res trail (30m)
        ('jimi.user.device.list+detail'),       -- registry sync (daily)
        ('webhook/pushobd'),
        ('webhook/pushfaultinfo'),
        ('webhook/pushalarm'),
        ('webhook/pushgps'),
        ('webhook/pushhb'),
        ('webhook/pushtripreport'),
        ('webhook/pushevent')
),
newest_run AS (
    -- DISTINCT ON together with run_at DESC keeps only the most recent log
    -- row for each allow-listed endpoint.
    -- NOTE(review): an allow-listed endpoint with no log rows produces no row
    -- here, so it is absent from the view rather than flagged — confirm that
    -- is the intended behavior.
    SELECT DISTINCT ON (ilog.endpoint)
        ilog.endpoint,
        ilog.run_at,
        ilog.success,
        ilog.error_code,
        ilog.error_message,
        ilog.rows_inserted,
        ilog.rows_upserted,
        ilog.imei_count,
        ilog.duration_ms
    FROM tracksolid.ingestion_log AS ilog
    INNER JOIN pipeline_endpoints AS allowed USING (endpoint)
    ORDER BY ilog.endpoint, ilog.run_at DESC
),
last_hour AS (
    -- The WHERE clause already limits the input to the trailing hour, so the
    -- aggregates only have to separate failed runs from all runs.
    SELECT
        ilog.endpoint,
        count(*) AS runs_1h,
        count(*) FILTER (WHERE NOT ilog.success) AS failures_1h
    FROM tracksolid.ingestion_log AS ilog
    INNER JOIN pipeline_endpoints AS allowed USING (endpoint)
    WHERE ilog.run_at > now() - interval '1 hour'
    GROUP BY ilog.endpoint
)
SELECT
    nr.endpoint,
    nr.run_at AS last_run_at,
    EXTRACT(EPOCH FROM (now() - nr.run_at))::int AS seconds_ago,
    nr.success AS last_success,
    nr.error_code,
    nr.error_message,
    nr.rows_inserted,
    nr.rows_upserted,
    -- Endpoints with no run in the last hour have no last_hour row: report 0.
    COALESCE(lh.runs_1h, 0) AS runs_1h,
    COALESCE(lh.failures_1h, 0) AS failures_1h,
    cadence.expected_interval_s,
    -- Staleness is checked first, so an old failed run reads 'stale', not 'error'.
    CASE
        WHEN EXTRACT(EPOCH FROM (now() - nr.run_at)) > 3 * cadence.expected_interval_s THEN 'stale'
        WHEN NOT nr.success THEN 'error'
        ELSE 'ok'
    END AS freshness
FROM newest_run AS nr
LEFT JOIN last_hour AS lh USING (endpoint)
CROSS JOIN LATERAL (
    -- Expected seconds between runs for this endpoint; anything not listed
    -- explicitly (i.e. the webhook endpoints) falls back to one hour.
    SELECT
        CASE
            WHEN nr.endpoint = 'jimi.user.device.location.list' THEN 60        -- live sweep (60s)
            WHEN nr.endpoint = 'jimi.device.alarm.list' THEN 300               -- alarms (5m)
            WHEN nr.endpoint = 'jimi.device.track.mileage' THEN 900            -- trips (15m)
            WHEN nr.endpoint = 'jimi.open.platform.report.parking' THEN 900    -- parking (15m)
            WHEN nr.endpoint = 'jimi.device.track.list' THEN 1800              -- high-res trail (30m)
            WHEN nr.endpoint = 'jimi.user.device.list+detail' THEN 86400       -- registry sync (daily)
            ELSE 3600                                                          -- webhooks / default (1h)
        END AS expected_interval_s
) AS cadence
ORDER BY seconds_ago DESC;
|
|
|
|
-- Catalog description for the view. The adjacent string literals below are
-- separated only by newlines, so they are concatenated into a single comment.
COMMENT ON VIEW reporting.v_ingest_health IS
    'Per-endpoint ingest freshness from tracksolid.ingestion_log, restricted to '
    'the active pipeline endpoints (migration 21) so one-shot tools cannot wedge '
    'the verdict at stale. Surfaced by dashboard_api GET /health/ingest. '
    'freshness = ok|stale|error (stale = past 3x the ingest_worker_rev.py cadence).';
|
|
|
|
-- Read-only access for the dashboard roles (guarded; idempotent).
-- Only roles that actually exist in pg_roles receive the grant, so the
-- migration does not fail on a database where one of them is missing.
DO $grants$
DECLARE
    ro_role name;
BEGIN
    FOR ro_role IN
        SELECT r.rolname
        FROM pg_roles AS r
        WHERE r.rolname IN ('grafana_ro', 'dashboard_ro')
    LOOP
        -- %I quotes the role name as an identifier for the dynamic GRANT.
        EXECUTE format('GRANT SELECT ON reporting.v_ingest_health TO %I', ro_role);
    END LOOP;
END
$grants$;
|