91 lines
3.1 KiB
Python
91 lines
3.1 KiB
Python
|
|
"""SLO measurement worker — task #12.
|
||
|
|
|
||
|
|
Every TRACKSOLID_POLL_INTERVAL_SEC (default 60s) we compute the current value
|
||
|
|
of each metric in `slo.targets` and append one row to `slo.measurements`. The
|
||
|
|
`slo.v_current_status` view does the rest — joining the latest measurement to
|
||
|
|
the target threshold so the dashboard panel can render green/red badges.
|
||
|
|
|
||
|
|
Metrics computed in P1:
|
||
|
|
fix_freshness_pct_60s → % of active devices whose latest fix is within
|
||
|
|
90 seconds (threshold 95)
|
||
|
|
parser_lag_p95_sec → p95 of (events.parsed.inserted_at - events.raw.received_at)
|
||
|
|
for events.parsed rows inserted in the last 5 minutes
|
||
|
|
(threshold 30s)
|
||
|
|
|
||
|
|
Not yet computed (waiting on dependent work):
|
||
|
|
contract_drift_days → needs the daily Tracksolid contract checker (#13)
|
||
|
|
and an ops.contract_check_log table
|
||
|
|
"""
|
||
|
|
|
||
|
|
import structlog
|
||
|
|
|
||
|
|
from app.db import get_pool
|
||
|
|
|
||
|
|
log = structlog.get_logger("worker.slo")
|
||
|
|
|
||
|
|
|
||
|
|
async def _record(metric: str, value: float) -> None:
|
||
|
|
pool = await get_pool()
|
||
|
|
async with pool.connection() as conn, conn.cursor() as cur:
|
||
|
|
await cur.execute(
|
||
|
|
"INSERT INTO slo.measurements (metric, value) VALUES (%s, %s)",
|
||
|
|
(metric, value),
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
async def _fix_freshness_pct_60s() -> float | None:
|
||
|
|
pool = await get_pool()
|
||
|
|
async with pool.connection() as conn, conn.cursor() as cur:
|
||
|
|
await cur.execute(
|
||
|
|
"""
|
||
|
|
SELECT 100.0
|
||
|
|
* count(*) FILTER (
|
||
|
|
WHERE lp.occurred_at > now() - interval '90 seconds'
|
||
|
|
)
|
||
|
|
/ NULLIF(count(*), 0) AS pct
|
||
|
|
FROM state.live_positions lp
|
||
|
|
JOIN domain.devices d ON d.imei = lp.imei
|
||
|
|
WHERE d.lifecycle = 'active'
|
||
|
|
"""
|
||
|
|
)
|
||
|
|
row = await cur.fetchone()
|
||
|
|
if row is None or row[0] is None:
|
||
|
|
return None
|
||
|
|
return float(row[0])
|
||
|
|
|
||
|
|
|
||
|
|
async def _parser_lag_p95_sec() -> float | None:
|
||
|
|
pool = await get_pool()
|
||
|
|
async with pool.connection() as conn, conn.cursor() as cur:
|
||
|
|
await cur.execute(
|
||
|
|
"""
|
||
|
|
SELECT EXTRACT(EPOCH FROM percentile_disc(0.95) WITHIN GROUP (
|
||
|
|
ORDER BY (p.inserted_at - r.received_at)
|
||
|
|
))
|
||
|
|
FROM events.parsed p
|
||
|
|
JOIN events.raw r
|
||
|
|
ON r.event_id = p.raw_event_id
|
||
|
|
AND r.received_at = p.raw_received_at
|
||
|
|
WHERE p.inserted_at > now() - interval '5 minutes'
|
||
|
|
"""
|
||
|
|
)
|
||
|
|
row = await cur.fetchone()
|
||
|
|
if row is None or row[0] is None:
|
||
|
|
return None
|
||
|
|
return float(row[0])
|
||
|
|
|
||
|
|
|
||
|
|
async def record_all() -> None:
|
||
|
|
measured: dict[str, float | None] = {
|
||
|
|
"fix_freshness_pct_60s": await _fix_freshness_pct_60s(),
|
||
|
|
"parser_lag_p95_sec": await _parser_lag_p95_sec(),
|
||
|
|
}
|
||
|
|
written: dict[str, float] = {}
|
||
|
|
for metric, value in measured.items():
|
||
|
|
if value is None:
|
||
|
|
continue
|
||
|
|
await _record(metric, value)
|
||
|
|
written[metric] = round(value, 2)
|
||
|
|
if written:
|
||
|
|
log.info("slo.recorded", **{k: written[k] for k in written})
|