98 lines
4.5 KiB
MySQL
98 lines
4.5 KiB
MySQL
|
|
-- 21_ingest_health_active_only.sql
-- BUG-P5 (260702 audit): reporting.v_ingest_health included EVERY endpoint ever
-- written to tracksolid.ingestion_log. One-shot operator tools (e.g.
-- backfill_trips_enrichment, last run 2026-05-01) therefore sat at 'stale'
-- forever, wedging dashboard_api GET /health/ingest at "overall": "stale" even
-- when every real poller was healthy — the FleetOps freshness panel signal was
-- useless.
--
-- Fix: restrict the view to the ALLOW-LIST of endpoints the running pipeline
-- actually emits (ingest_worker_rev.py schedule + webhook_receiver_rev.py).
-- Adding a new pipeline endpoint => extend the list here (and the cadence CASE).
-- Everything else about the view (columns, verdict logic) is unchanged, so the
-- dashboard_api endpoint and its consumers need no changes.
-- Idempotent — CREATE OR REPLACE.
|
||
|
|
|
||
|
|
-- reporting.v_ingest_health
-- One row per allow-listed endpoint that has at least one row in
-- tracksolid.ingestion_log, describing its most recent run, its run/failure
-- counts over the last hour, its expected cadence and a freshness verdict.
--
-- Output columns (names, order and types must stay stable because the view is
-- replaced in place with CREATE OR REPLACE):
--   endpoint, last_run_at, seconds_ago, last_success, error_code,
--   error_message, rows_inserted, rows_upserted, runs_1h, failures_1h,
--   expected_interval_s, freshness
--
-- NOTE(review): the output is driven by the latest-run CTE, so an allow-listed
-- endpoint with no rows at all in ingestion_log yields no row here rather than
-- a 'stale' row.
CREATE OR REPLACE VIEW reporting.v_ingest_health AS
WITH
-- Allow-list of endpoints considered part of the running pipeline.
-- Every endpoint without its own arm in the cadence CASE below (currently the
-- webhook/* entries) takes the 3600 s default.
pipeline_endpoints (endpoint) AS (
    VALUES
        ('jimi.user.device.location.list'),     -- live sweep (60s)
        ('jimi.device.alarm.list'),             -- alarms (5m)
        ('jimi.device.track.mileage'),          -- trips (15m)
        ('jimi.open.platform.report.parking'),  -- parking (15m)
        ('jimi.device.track.list'),             -- high-res trail (30m)
        ('jimi.user.device.list+detail'),       -- registry sync (daily)
        ('webhook/pushobd'),
        ('webhook/pushfaultinfo'),
        ('webhook/pushalarm'),
        ('webhook/pushgps'),
        ('webhook/pushhb'),
        ('webhook/pushtripreport'),
        ('webhook/pushevent')
),
-- Most recent log row per allow-listed endpoint: DISTINCT ON keeps the first
-- row of each endpoint group under the ORDER BY, i.e. the greatest run_at.
latest AS (
    SELECT DISTINCT ON (entry.endpoint)
        entry.endpoint,
        entry.run_at,
        entry.success,
        entry.error_code,
        entry.error_message,
        entry.rows_inserted,
        entry.rows_upserted
    FROM tracksolid.ingestion_log AS entry
    INNER JOIN pipeline_endpoints AS allowed
        ON allowed.endpoint = entry.endpoint
    ORDER BY entry.endpoint, entry.run_at DESC
),
-- Run and failure counts over the trailing hour. The WHERE clause already
-- limits the input to that window, so runs_1h is a plain count(*).
hourly AS (
    SELECT
        entry.endpoint,
        count(*) AS runs_1h,
        count(*) FILTER (WHERE NOT entry.success) AS failures_1h
    FROM tracksolid.ingestion_log AS entry
    INNER JOIN pipeline_endpoints AS allowed
        ON allowed.endpoint = entry.endpoint
    WHERE entry.run_at > now() - interval '1 hour'
    GROUP BY entry.endpoint
),
-- Latest run plus its age in (fractional) seconds and the expected cadence.
measured AS (
    SELECT
        lt.endpoint,
        lt.run_at,
        lt.success,
        lt.error_code,
        lt.error_message,
        lt.rows_inserted,
        lt.rows_upserted,
        EXTRACT(EPOCH FROM (now() - lt.run_at)) AS age_s,
        CASE lt.endpoint
            WHEN 'jimi.user.device.location.list'    THEN 60     -- live sweep (60s)
            WHEN 'jimi.device.alarm.list'            THEN 300    -- alarms (5m)
            WHEN 'jimi.device.track.mileage'         THEN 900    -- trips (15m)
            WHEN 'jimi.open.platform.report.parking' THEN 900    -- parking (15m)
            WHEN 'jimi.device.track.list'            THEN 1800   -- high-res trail (30m)
            WHEN 'jimi.user.device.list+detail'      THEN 86400  -- registry sync (daily)
            ELSE 3600                                            -- webhooks / default (1h)
        END AS expected_interval_s
    FROM latest AS lt
)
SELECT
    m.endpoint,
    m.run_at                   AS last_run_at,
    m.age_s::int               AS seconds_ago,
    m.success                  AS last_success,
    m.error_code,
    m.error_message,
    m.rows_inserted,
    m.rows_upserted,
    -- No hourly row means no runs inside the window: report zeros, not NULLs.
    COALESCE(h.runs_1h, 0)     AS runs_1h,
    COALESCE(h.failures_1h, 0) AS failures_1h,
    m.expected_interval_s,
    -- 'stale' is tested first, so it wins over 'error'. The comparison uses the
    -- un-rounded age, not the integer seconds_ago.
    -- NOTE(review): if success can be NULL, NOT success is NULL and the row
    -- falls through to 'ok' — confirm the column is NOT NULL.
    CASE
        WHEN m.age_s > 3 * m.expected_interval_s THEN 'stale'
        WHEN NOT m.success THEN 'error'
        ELSE 'ok'
    END AS freshness
FROM measured AS m
LEFT JOIN hourly AS h
    ON h.endpoint = m.endpoint
ORDER BY seconds_ago DESC;
|
||
|
|
|
||
|
|
-- Catalog description for the view. The adjacent string constants below are
-- separated only by whitespace that includes a newline, so they are
-- concatenated into one literal; each segment ends with a space to keep the
-- sentences apart.
COMMENT ON VIEW reporting.v_ingest_health
    IS
    'Per-endpoint ingest freshness from tracksolid.ingestion_log, restricted to '
    'the active pipeline endpoints (migration 21) so one-shot tools cannot wedge '
    'the verdict at stale. Surfaced by dashboard_api GET /health/ingest. '
    'freshness = ok|stale|error (stale = past 3x the ingest_worker_rev.py cadence).';
|
||
|
|
|
||
|
|
-- Read-only access for the dashboard roles (guarded; idempotent).
|
||
|
|
DO $grants$
DECLARE
    -- Role currently being considered for the read-only grant.
    reader_role text;
BEGIN
    -- Grant SELECT on the view to each listed role, but only when that role
    -- exists in pg_roles; roles that are absent are skipped without error.
    FOREACH reader_role IN ARRAY ARRAY['grafana_ro', 'dashboard_ro'] LOOP
        IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = reader_role) THEN
            -- %I quotes the role name as an identifier.
            EXECUTE format(
                'GRANT SELECT ON reporting.v_ingest_health TO %I',
                reader_role
            );
        END IF;
    END LOOP;
END
$grants$;
|