From 7bc0a2ce8730396fdc309b1c68caaf1b1b96c5e6 Mon Sep 17 00:00:00 2001 From: david kiania Date: Fri, 15 May 2026 16:44:40 +0300 Subject: [PATCH] fix(BUG-01b): aggregate trips and alarms in separate CTEs to avoid cartesian explosion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original refresh_daily_metrics() joined trips × alarms in one SELECT, producing one row per (trip, alarm) pair. Every SUM/COUNT over trip columns was multiplied by the per-IMEI alarm count, so spot-checks showed total_trips identical to alarm_count, drive_hours > 1000/day, and distance_km in the tens of thousands per vehicle per day. Migration 08 carried that flawed join forward when fixing the TEXT→INTEGER vehicle_key crash. Rewriting the function so trip_agg and alarm_agg are computed in separate CTEs and then joined on imei restores correct per-vehicle aggregates: total_trips reflects real trip count, drive_hours ≤ 24, alarms are counted once. This bug is being fixed in the same migration file (08) before PR #12 merges; no deploy has applied the prior version, so no second migration is needed. Co-Authored-By: Claude Opus 4.7 --- 08_fix_etl_vehicle_key.sql | 96 +++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 33 deletions(-) diff --git a/08_fix_etl_vehicle_key.sql b/08_fix_etl_vehicle_key.sql index c49053b..9a1621a 100644 --- a/08_fix_etl_vehicle_key.sql +++ b/08_fix_etl_vehicle_key.sql @@ -1,15 +1,22 @@ -- 08_fix_etl_vehicle_key.sql --- Fixes BUG-01: dwh_gold.refresh_daily_metrics() inserted t.imei (TEXT) into --- fact_daily_fleet_metrics.vehicle_key (INTEGER REFERENCES dim_vehicles), so --- every nightly call raised "invalid input syntax for type integer". +-- Fixes two distinct bugs in dwh_gold.refresh_daily_metrics(): -- --- The fix has two parts: --- 1. Seed dwh_gold.dim_vehicles from tracksolid.devices so every IMEI has --- a serial vehicle_key to point at. --- 2. Rewrite refresh_daily_metrics() to JOIN through dim_vehicles and --- SELECT the serial key instead of the raw IMEI. The function also --- upserts dim_vehicles at the top of each run so newly-registered --- devices appear in the warehouse without manual intervention. +-- BUG-01a (type crash): the original function inserted t.imei (TEXT) into +-- fact_daily_fleet_metrics.vehicle_key (INTEGER REFERENCES dim_vehicles), +-- so every nightly call raised "invalid input syntax for type integer". +-- +-- BUG-01b (cartesian explosion): the original function joined +-- trips × alarms in a single SELECT. For every trip row it produced one +-- output row per matching alarm, multiplying every SUM/COUNT over trip +-- columns by the per-IMEI alarm count. Spot-checking the broken output +-- showed total_trips identical to alarm_count and drive_hours > 1000/day. +-- +-- The fix has three parts: +-- 1. Seed dwh_gold.dim_vehicles from tracksolid.devices so every IMEI +-- has a serial vehicle_key to point at. +-- 2. Rewrite refresh_daily_metrics() so trip aggregates and alarm +-- aggregates are computed in separate CTEs and then joined on imei. +-- 3. Map IMEI → vehicle_key via dim_vehicles inside the same statement. BEGIN; @@ -43,6 +50,34 @@ BEGIN vehicle_number = EXCLUDED.vehicle_number, is_active = EXCLUDED.is_active; + -- Aggregate trips and alarms in separate CTEs to avoid the cartesian + -- multiplication that the original single-SELECT version produced. + WITH trip_agg AS ( + SELECT + t.imei, + SUM(t.distance_km) AS total_distance_km, + COUNT(*) AS total_trips, + SUM(t.driving_time_s) AS total_drive_seconds, + SUM(t.idle_time_s) AS total_idle_seconds, + SUM(t.fuel_consumed_l) AS fuel_consumed_l, + MIN(t.start_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_start_time, + MAX(t.end_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_end_time, + AVG(t.avg_speed_kmh) AS avg_speed_kmh, + MAX(t.max_speed_kmh) AS peak_speed_kmh + FROM tracksolid.trips t + WHERE DATE(t.start_time AT TIME ZONE 'Africa/Nairobi') = target_date + AND t.end_time IS NOT NULL + GROUP BY t.imei + ), + alarm_agg AS ( + SELECT + a.imei, + COUNT(*) AS alarm_count, + COUNT(*) FILTER (WHERE a.alarm_type ILIKE '%speed%') AS overspeed_count + FROM tracksolid.alarms a + WHERE DATE(a.alarm_time AT TIME ZONE 'Africa/Nairobi') = target_date + GROUP BY a.imei + ) INSERT INTO dwh_gold.fact_daily_fleet_metrics ( day, vehicle_key, @@ -59,28 +94,22 @@ BEGIN peak_speed_kmh ) SELECT - target_date AS day, - dv.vehicle_key AS vehicle_key, - ROUND(SUM(t.distance_km)::numeric, 3) AS total_distance_km, - COUNT(*) AS total_trips, - ROUND((SUM(t.driving_time_s) / 3600.0)::numeric, 2) AS total_drive_hours, - ROUND((SUM(t.idle_time_s) / 3600.0)::numeric, 2) AS total_idle_hours, - ROUND(SUM(t.fuel_consumed_l)::numeric, 3) AS fuel_consumed_l, - COUNT(a.id) AS alarm_count, - COUNT(a.id) FILTER (WHERE a.alarm_type ILIKE '%speed%') AS overspeed_count, - MIN(t.start_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_start_time, - MAX(t.end_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_end_time, - ROUND(AVG(t.avg_speed_kmh)::numeric, 2) AS avg_speed_kmh, - MAX(t.max_speed_kmh) AS peak_speed_kmh - FROM tracksolid.trips t - JOIN dwh_gold.dim_vehicles dv - ON dv.imei = t.imei - LEFT JOIN tracksolid.alarms a - ON a.imei = t.imei - AND DATE(a.alarm_time AT TIME ZONE 'Africa/Nairobi') = target_date - WHERE DATE(t.start_time AT TIME ZONE 'Africa/Nairobi') = target_date - AND t.end_time IS NOT NULL - GROUP BY dv.vehicle_key + target_date AS day, + dv.vehicle_key AS vehicle_key, + ROUND(tr.total_distance_km::numeric, 3) AS total_distance_km, + tr.total_trips AS total_trips, + ROUND((tr.total_drive_seconds / 3600.0)::numeric, 2) AS total_drive_hours, + ROUND((tr.total_idle_seconds / 3600.0)::numeric, 2) AS total_idle_hours, + ROUND(tr.fuel_consumed_l::numeric, 3) AS fuel_consumed_l, + COALESCE(al.alarm_count, 0) AS alarm_count, + COALESCE(al.overspeed_count, 0) AS overspeed_count, + tr.day_start_time AS day_start_time, + tr.day_end_time AS day_end_time, + ROUND(tr.avg_speed_kmh::numeric, 2) AS avg_speed_kmh, + tr.peak_speed_kmh AS peak_speed_kmh + FROM trip_agg tr + JOIN dwh_gold.dim_vehicles dv ON dv.imei = tr.imei + LEFT JOIN alarm_agg al ON al.imei = tr.imei ON CONFLICT (day, vehicle_key) DO UPDATE SET total_distance_km = EXCLUDED.total_distance_km, total_trips = EXCLUDED.total_trips, @@ -98,7 +127,8 @@ $$; COMMENT ON FUNCTION dwh_gold.refresh_daily_metrics(DATE) IS 'Populates or refreshes fact_daily_fleet_metrics for the given date. ' - 'Joins tracksolid.trips through dwh_gold.dim_vehicles to map IMEI → vehicle_key. ' + 'Trips and alarms are aggregated in separate CTEs to avoid cartesian ' + 'multiplication. Maps IMEI → vehicle_key via dwh_gold.dim_vehicles. ' 'Call nightly: SELECT dwh_gold.refresh_daily_metrics(CURRENT_DATE - 1);'; COMMIT;