tracksolid_timescale_grafan.../migrations/21_ingest_health_active_only.sql
david kiania b11294009b
Some checks are pending
Static Analysis / static (push) Waiting to run
Tests / test (push) Waiting to run
Static Analysis / static (pull_request) Waiting to run
Tests / test (pull_request) Waiting to run
fix(security,ingest): 260702 audit — secure the stack, correct poller counters
Security:
- .dockerignore + Dockerfile: stop baking .env / the 346MB OSM pbf into image
  layers; install pinned from uv.lock (reproducible builds) (SEC-04/05).
- docker-compose: DB port binds ${DB_BIND_ADDR:-127.0.0.1} — loopback-only by
  default; remote tooling moves to an SSH tunnel (SEC-01).
- webhook_receiver: CRITICAL startup warning + WEBHOOK_REQUIRE_TOKEN=1 fail-closed
  when JIMI_WEBHOOK_TOKEN is empty (SEC-02 / FIX-W01).

Correctness:
- FIX-M22/E07: capture cur.rowcount BEFORE RELEASE SAVEPOINT in poll_alarms/
  poll_trips/poll_parking — the RELEASE reported -1, producing "Alarms: -4 new
  events inserted" logs and negative ingestion_log.rows_inserted.
- FIX-W02: parse application/json push bodies (were silently dropped).
- FIX-W03: move webhook DB work off the event loop via asyncio.to_thread.
- FIX-M23: poll_trips phased so no txn/connection is held across Tracksolid +
  Nominatim (1 req/s) network calls.
- FIX-M24: sync_devices disables devices absent from every target (guarded).
- FIX-W04: reject device-clock-garbage alarm_time (2019 timestamps observed).
- get_token(): don't relabel already-aware timestamptz expiries (BUG-P9).

Observability/lifecycle:
- migration 21: v_ingest_health restricted to active pipeline endpoints so
  one-shot tools stop wedging /health/ingest at 'stale' (dry-run verified).
- FIX-M25: daily purge_audit_logs() trims ingestion_log (90d) + refresh_log (180d).
- remove orphaned duplicate migrations/10_driver_clock_views.sql; ruff lint config.

+5 webhook tests (82 pass). Report/plan/work-log in docs/reports/260702_*.
Local only; not deployed. CLAUDE.md fix-history edits left uncommitted (that file
also carries unrelated in-progress edits).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 09:51:02 +03:00

97 lines
4.5 KiB
SQL

-- 21_ingest_health_active_only.sql
-- BUG-P5 (260702 audit): reporting.v_ingest_health included EVERY endpoint ever
-- written to tracksolid.ingestion_log. One-shot operator tools (e.g.
-- backfill_trips_enrichment, last run 2026-05-01) therefore sat at 'stale'
-- forever, wedging dashboard_api GET /health/ingest at "overall": "stale" even
-- when every real poller was healthy — the FleetOps freshness panel signal was
-- useless.
--
-- Fix: restrict the view to the ALLOW-LIST of endpoints the running pipeline
-- actually emits (ingest_worker_rev.py schedule + webhook_receiver_rev.py).
-- The allow-list and each endpoint's expected cadence live together in the
-- pipeline_endpoints CTE, so adding a new pipeline endpoint is a one-row change
-- there (there is no separate cadence mapping to keep in sync).
-- Everything else about the view (columns, verdict logic) is unchanged, so the
-- dashboard_api endpoint and its consumers need no changes.
-- Idempotent — CREATE OR REPLACE.
--
-- NOTE(review): an allow-listed endpoint with NO rows in
-- tracksolid.ingestion_log yields no row in this view (inner join below), so it
-- is absent rather than reported as 'stale'. Confirm consumers treat a missing
-- endpoint appropriately.
CREATE OR REPLACE VIEW reporting.v_ingest_health AS
WITH pipeline_endpoints (endpoint, expected_interval_s) AS (
    -- Single source of truth: allow-listed endpoint + expected cadence (seconds).
    VALUES
        ('jimi.user.device.location.list',    60),     -- live sweep (60s)
        ('jimi.device.alarm.list',            300),    -- alarms (5m)
        ('jimi.device.track.mileage',         900),    -- trips (15m)
        ('jimi.open.platform.report.parking', 900),    -- parking (15m)
        ('jimi.device.track.list',            1800),   -- high-res trail (30m)
        ('jimi.user.device.list+detail',      86400),  -- registry sync (daily)
        ('webhook/pushobd',                   3600),   -- webhooks (1h)
        ('webhook/pushfaultinfo',             3600),
        ('webhook/pushalarm',                 3600),
        ('webhook/pushgps',                   3600),
        ('webhook/pushhb',                    3600),
        ('webhook/pushtripreport',            3600),
        ('webhook/pushevent',                 3600)
),
last_run AS (
    -- Most recent ingestion_log row per allow-listed endpoint
    -- (DISTINCT ON keeps the first row of each endpoint under the ORDER BY).
    SELECT DISTINCT ON (il.endpoint)
        il.endpoint,
        il.run_at,
        il.success,
        il.error_code,
        il.error_message,
        il.rows_inserted,
        il.rows_upserted,
        pe.expected_interval_s
    FROM tracksolid.ingestion_log AS il
    INNER JOIN pipeline_endpoints AS pe
        ON pe.endpoint = il.endpoint
    ORDER BY il.endpoint, il.run_at DESC
),
recent_activity AS (
    -- Run / failure counts over the trailing hour. The WHERE clause already
    -- bounds the window, so the aggregates need no per-count time filter.
    SELECT
        il.endpoint,
        count(*) AS runs_1h,
        count(*) FILTER (WHERE NOT il.success) AS failures_1h
    FROM tracksolid.ingestion_log AS il
    INNER JOIN pipeline_endpoints AS pe
        ON pe.endpoint = il.endpoint
    WHERE il.run_at > now() - interval '1 hour'
    GROUP BY il.endpoint
)
SELECT
    lr.endpoint,
    lr.run_at AS last_run_at,
    EXTRACT(EPOCH FROM (now() - lr.run_at))::int AS seconds_ago,
    lr.success AS last_success,
    lr.error_code,
    lr.error_message,
    lr.rows_inserted,
    lr.rows_upserted,
    -- Endpoints with no runs in the last hour have no recent_activity row.
    COALESCE(ra.runs_1h, 0) AS runs_1h,
    COALESCE(ra.failures_1h, 0) AS failures_1h,
    lr.expected_interval_s,
    -- 'stale' is checked first, so an overdue endpoint reports 'stale' even
    -- when its last run also failed.
    CASE
        WHEN EXTRACT(EPOCH FROM (now() - lr.run_at)) > 3 * lr.expected_interval_s THEN 'stale'
        WHEN NOT lr.success THEN 'error'
        ELSE 'ok'
    END AS freshness
FROM last_run AS lr
LEFT JOIN recent_activity AS ra
    ON ra.endpoint = lr.endpoint
ORDER BY seconds_ago DESC;
-- Catalog description for the view. The adjacent string constants below are
-- separated only by newlines, so PostgreSQL concatenates them into a single
-- comment string; each fragment's trailing space is what separates the words.
COMMENT ON VIEW reporting.v_ingest_health IS
'Per-endpoint ingest freshness from tracksolid.ingestion_log, restricted to '
'the active pipeline endpoints (migration 21) so one-shot tools cannot wedge '
'the verdict at stale. Surfaced by dashboard_api GET /health/ingest. '
'freshness = ok|stale|error (stale = past 3x the ingest_worker_rev.py cadence).';
-- Grant SELECT on the view to each dashboard reader role that exists in this
-- cluster. Roles that are absent are skipped, so the migration is safe to
-- re-run and safe on environments that lack one or both roles.
DO $grants$
DECLARE
    reader_role text;
BEGIN
    FOREACH reader_role IN ARRAY ARRAY['grafana_ro', 'dashboard_ro'] LOOP
        IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = reader_role) THEN
            -- %I quotes the role name as an identifier.
            EXECUTE format(
                'GRANT SELECT ON reporting.v_ingest_health TO %I',
                reader_role
            );
        END IF;
    END LOOP;
END
$grants$;