tracksolid_timescale_grafan.../08_fix_etl_vehicle_key.sql
david kiania 7bc0a2ce87
Some checks failed
Static Analysis / static (push) Has been cancelled
Tests / test (push) Has been cancelled
Static Analysis / static (pull_request) Has been cancelled
Tests / test (pull_request) Has been cancelled
fix(BUG-01b): aggregate trips and alarms in separate CTEs to avoid cartesian explosion
The original refresh_daily_metrics() joined trips × alarms in one SELECT,
producing one row per (trip, alarm) pair. Every SUM/COUNT over trip
columns was multiplied by the per-IMEI alarm count, so spot-checks
showed total_trips identical to alarm_count, drive_hours > 1000/day,
and distance_km in the tens of thousands per vehicle per day.

Migration 08 carried that flawed join forward when fixing the
TEXT→INTEGER vehicle_key crash. Rewriting the function so trip_agg and
alarm_agg are computed in separate CTEs and then joined on imei
restores correct per-vehicle aggregates: total_trips reflects real trip
count, drive_hours ≤ 24, alarms are counted once.

This bug is being fixed in the same migration file (08) before PR #12
merges; no deploy has applied the prior version, so no second migration
is needed.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 16:44:40 +03:00

134 lines
6.2 KiB
PL/PgSQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- 08_fix_etl_vehicle_key.sql
-- Fixes two distinct bugs in dwh_gold.refresh_daily_metrics():
--
-- BUG-01a (type crash): the original function inserted t.imei (TEXT) into
-- fact_daily_fleet_metrics.vehicle_key (INTEGER REFERENCES dim_vehicles),
-- so every nightly call raised "invalid input syntax for type integer".
--
-- BUG-01b (cartesian explosion): the original function joined
-- trips × alarms in a single SELECT. For every trip row it produced one
-- output row per matching alarm, multiplying every SUM/COUNT over trip
-- columns by the per-IMEI alarm count. Spot-checking the broken output
-- showed total_trips identical to alarm_count and drive_hours > 1000/day.
--
-- The fix has three parts:
-- 1. Seed dwh_gold.dim_vehicles from tracksolid.devices so every IMEI
-- has a serial vehicle_key to point at.
-- 2. Rewrite refresh_daily_metrics() so trip aggregates and alarm
-- aggregates are computed in separate CTEs and then joined on imei.
-- 3. Map IMEI → vehicle_key via dim_vehicles inside the same statement.
BEGIN;
-- ── 1. Backfill dim_vehicles ─────────────────────────────────────────────────
-- One row per device. is_active mirrors enabled_flag; vehicle_number tracks
-- the plate so dashboards can label charts without joining back to devices.
INSERT INTO dwh_gold.dim_vehicles (imei, vehicle_number, is_active)
SELECT
d.imei,
d.vehicle_number,
COALESCE(d.enabled_flag, 1) = 1
FROM tracksolid.devices d
ON CONFLICT (imei) DO UPDATE SET
vehicle_number = EXCLUDED.vehicle_number,
is_active = EXCLUDED.is_active;
-- ── 2. Rewrite ETL function ──────────────────────────────────────────────────
CREATE OR REPLACE FUNCTION dwh_gold.refresh_daily_metrics(target_date DATE)
RETURNS void LANGUAGE plpgsql AS $$
BEGIN
-- Sync dim_vehicles first so any IMEI seen in trips has a vehicle_key.
-- Without this, a brand-new device would have trip rows but no
-- dim_vehicles entry, and the JOIN below would drop its metrics.
INSERT INTO dwh_gold.dim_vehicles (imei, vehicle_number, is_active)
SELECT
d.imei,
d.vehicle_number,
COALESCE(d.enabled_flag, 1) = 1
FROM tracksolid.devices d
ON CONFLICT (imei) DO UPDATE SET
vehicle_number = EXCLUDED.vehicle_number,
is_active = EXCLUDED.is_active;
-- Aggregate trips and alarms in separate CTEs to avoid the cartesian
-- multiplication that the original single-SELECT version produced.
WITH trip_agg AS (
SELECT
t.imei,
SUM(t.distance_km) AS total_distance_km,
COUNT(*) AS total_trips,
SUM(t.driving_time_s) AS total_drive_seconds,
SUM(t.idle_time_s) AS total_idle_seconds,
SUM(t.fuel_consumed_l) AS fuel_consumed_l,
MIN(t.start_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_start_time,
MAX(t.end_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_end_time,
AVG(t.avg_speed_kmh) AS avg_speed_kmh,
MAX(t.max_speed_kmh) AS peak_speed_kmh
FROM tracksolid.trips t
WHERE DATE(t.start_time AT TIME ZONE 'Africa/Nairobi') = target_date
AND t.end_time IS NOT NULL
GROUP BY t.imei
),
alarm_agg AS (
SELECT
a.imei,
COUNT(*) AS alarm_count,
COUNT(*) FILTER (WHERE a.alarm_type ILIKE '%speed%') AS overspeed_count
FROM tracksolid.alarms a
WHERE DATE(a.alarm_time AT TIME ZONE 'Africa/Nairobi') = target_date
GROUP BY a.imei
)
INSERT INTO dwh_gold.fact_daily_fleet_metrics (
day,
vehicle_key,
total_distance_km,
total_trips,
total_drive_hours,
total_idle_hours,
fuel_consumed_l,
alarm_count,
overspeed_count,
day_start_time,
day_end_time,
avg_speed_kmh,
peak_speed_kmh
)
SELECT
target_date AS day,
dv.vehicle_key AS vehicle_key,
ROUND(tr.total_distance_km::numeric, 3) AS total_distance_km,
tr.total_trips AS total_trips,
ROUND((tr.total_drive_seconds / 3600.0)::numeric, 2) AS total_drive_hours,
ROUND((tr.total_idle_seconds / 3600.0)::numeric, 2) AS total_idle_hours,
ROUND(tr.fuel_consumed_l::numeric, 3) AS fuel_consumed_l,
COALESCE(al.alarm_count, 0) AS alarm_count,
COALESCE(al.overspeed_count, 0) AS overspeed_count,
tr.day_start_time AS day_start_time,
tr.day_end_time AS day_end_time,
ROUND(tr.avg_speed_kmh::numeric, 2) AS avg_speed_kmh,
tr.peak_speed_kmh AS peak_speed_kmh
FROM trip_agg tr
JOIN dwh_gold.dim_vehicles dv ON dv.imei = tr.imei
LEFT JOIN alarm_agg al ON al.imei = tr.imei
ON CONFLICT (day, vehicle_key) DO UPDATE SET
total_distance_km = EXCLUDED.total_distance_km,
total_trips = EXCLUDED.total_trips,
total_drive_hours = EXCLUDED.total_drive_hours,
total_idle_hours = EXCLUDED.total_idle_hours,
fuel_consumed_l = EXCLUDED.fuel_consumed_l,
alarm_count = EXCLUDED.alarm_count,
overspeed_count = EXCLUDED.overspeed_count,
day_start_time = EXCLUDED.day_start_time,
day_end_time = EXCLUDED.day_end_time,
avg_speed_kmh = EXCLUDED.avg_speed_kmh,
peak_speed_kmh = EXCLUDED.peak_speed_kmh;
END;
$$;
COMMENT ON FUNCTION dwh_gold.refresh_daily_metrics(DATE)
IS 'Populates or refreshes fact_daily_fleet_metrics for the given date. '
'Trips and alarms are aggregated in separate CTEs to avoid cartesian '
'multiplication. Maps IMEI → vehicle_key via dwh_gold.dim_vehicles. '
'Call nightly: SELECT dwh_gold.refresh_daily_metrics(CURRENT_DATE - 1);';
COMMIT;