fix(BUG-01b): aggregate trips and alarms in separate CTEs to avoid cartesian explosion
The original refresh_daily_metrics() joined trips × alarms in one SELECT, producing one row per (trip, alarm) pair. Every SUM/COUNT over trip columns was multiplied by the per-IMEI alarm count, and, symmetrically, alarm_count and overspeed_count were multiplied by the per-IMEI trip count, so spot-checks showed total_trips identical to alarm_count, drive_hours > 1000/day, and distance_km in the tens of thousands per vehicle per day. Migration 08 carried that flawed join forward when fixing the TEXT→INTEGER vehicle_key crash. Rewriting the function so trip_agg and alarm_agg are computed in separate CTEs and then joined on imei restores correct per-vehicle aggregates: total_trips reflects real trip count, drive_hours ≤ 24, alarms are counted once. This bug is being fixed in the same migration file (08) before PR #12 merges; no deploy has applied the prior version, so no second migration is needed.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8d386bf27a
commit
7bc0a2ce87
1 changed file with 63 additions and 33 deletions
|
|
@ -1,15 +1,22 @@
|
|||
-- 08_fix_etl_vehicle_key.sql
|
||||
-- Fixes BUG-01: dwh_gold.refresh_daily_metrics() inserted t.imei (TEXT) into
|
||||
-- fact_daily_fleet_metrics.vehicle_key (INTEGER REFERENCES dim_vehicles), so
|
||||
-- every nightly call raised "invalid input syntax for type integer".
|
||||
-- Fixes two distinct bugs in dwh_gold.refresh_daily_metrics():
|
||||
--
|
||||
-- The fix has two parts:
|
||||
-- 1. Seed dwh_gold.dim_vehicles from tracksolid.devices so every IMEI has
|
||||
-- a serial vehicle_key to point at.
|
||||
-- 2. Rewrite refresh_daily_metrics() to JOIN through dim_vehicles and
|
||||
-- SELECT the serial key instead of the raw IMEI. The function also
|
||||
-- upserts dim_vehicles at the top of each run so newly-registered
|
||||
-- devices appear in the warehouse without manual intervention.
|
||||
-- BUG-01a (type crash): the original function inserted t.imei (TEXT) into
|
||||
-- fact_daily_fleet_metrics.vehicle_key (INTEGER REFERENCES dim_vehicles),
|
||||
-- so every nightly call raised "invalid input syntax for type integer".
|
||||
--
|
||||
-- BUG-01b (cartesian explosion): the original function joined
|
||||
-- trips × alarms in a single SELECT. For every trip row it produced one
|
||||
-- output row per matching alarm, multiplying every SUM/COUNT over trip
|
||||
-- columns by the per-IMEI alarm count. Spot-checking the broken output
|
||||
-- showed total_trips identical to alarm_count and drive_hours > 1000/day.
|
||||
--
|
||||
-- The fix has three parts:
|
||||
-- 1. Seed dwh_gold.dim_vehicles from tracksolid.devices so every IMEI
|
||||
-- has a serial vehicle_key to point at.
|
||||
-- 2. Rewrite refresh_daily_metrics() so trip aggregates and alarm
|
||||
-- aggregates are computed in separate CTEs and then joined on imei.
|
||||
-- 3. Map IMEI → vehicle_key via dim_vehicles inside the same statement.
|
||||
|
||||
BEGIN;
|
||||
|
||||
|
|
@ -43,6 +50,34 @@ BEGIN
|
|||
vehicle_number = EXCLUDED.vehicle_number,
|
||||
is_active = EXCLUDED.is_active;
|
||||
|
||||
-- Aggregate trips and alarms in separate CTEs to avoid the cartesian
|
||||
-- multiplication that the original single-SELECT version produced.
|
||||
WITH trip_agg AS (
|
||||
SELECT
|
||||
t.imei,
|
||||
SUM(t.distance_km) AS total_distance_km,
|
||||
COUNT(*) AS total_trips,
|
||||
SUM(t.driving_time_s) AS total_drive_seconds,
|
||||
SUM(t.idle_time_s) AS total_idle_seconds,
|
||||
SUM(t.fuel_consumed_l) AS fuel_consumed_l,
|
||||
MIN(t.start_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_start_time,
|
||||
MAX(t.end_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_end_time,
|
||||
AVG(t.avg_speed_kmh) AS avg_speed_kmh,
|
||||
MAX(t.max_speed_kmh) AS peak_speed_kmh
|
||||
FROM tracksolid.trips t
|
||||
WHERE DATE(t.start_time AT TIME ZONE 'Africa/Nairobi') = target_date
|
||||
AND t.end_time IS NOT NULL
|
||||
GROUP BY t.imei
|
||||
),
|
||||
alarm_agg AS (
|
||||
SELECT
|
||||
a.imei,
|
||||
COUNT(*) AS alarm_count,
|
||||
COUNT(*) FILTER (WHERE a.alarm_type ILIKE '%speed%') AS overspeed_count
|
||||
FROM tracksolid.alarms a
|
||||
WHERE DATE(a.alarm_time AT TIME ZONE 'Africa/Nairobi') = target_date
|
||||
GROUP BY a.imei
|
||||
)
|
||||
INSERT INTO dwh_gold.fact_daily_fleet_metrics (
|
||||
day,
|
||||
vehicle_key,
|
||||
|
|
@ -59,28 +94,22 @@ BEGIN
|
|||
peak_speed_kmh
|
||||
)
|
||||
SELECT
|
||||
target_date AS day,
|
||||
dv.vehicle_key AS vehicle_key,
|
||||
ROUND(SUM(t.distance_km)::numeric, 3) AS total_distance_km,
|
||||
COUNT(*) AS total_trips,
|
||||
ROUND((SUM(t.driving_time_s) / 3600.0)::numeric, 2) AS total_drive_hours,
|
||||
ROUND((SUM(t.idle_time_s) / 3600.0)::numeric, 2) AS total_idle_hours,
|
||||
ROUND(SUM(t.fuel_consumed_l)::numeric, 3) AS fuel_consumed_l,
|
||||
COUNT(a.id) AS alarm_count,
|
||||
COUNT(a.id) FILTER (WHERE a.alarm_type ILIKE '%speed%') AS overspeed_count,
|
||||
MIN(t.start_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_start_time,
|
||||
MAX(t.end_time AT TIME ZONE 'Africa/Nairobi')::TIME AS day_end_time,
|
||||
ROUND(AVG(t.avg_speed_kmh)::numeric, 2) AS avg_speed_kmh,
|
||||
MAX(t.max_speed_kmh) AS peak_speed_kmh
|
||||
FROM tracksolid.trips t
|
||||
JOIN dwh_gold.dim_vehicles dv
|
||||
ON dv.imei = t.imei
|
||||
LEFT JOIN tracksolid.alarms a
|
||||
ON a.imei = t.imei
|
||||
AND DATE(a.alarm_time AT TIME ZONE 'Africa/Nairobi') = target_date
|
||||
WHERE DATE(t.start_time AT TIME ZONE 'Africa/Nairobi') = target_date
|
||||
AND t.end_time IS NOT NULL
|
||||
GROUP BY dv.vehicle_key
|
||||
target_date AS day,
|
||||
dv.vehicle_key AS vehicle_key,
|
||||
ROUND(tr.total_distance_km::numeric, 3) AS total_distance_km,
|
||||
tr.total_trips AS total_trips,
|
||||
ROUND((tr.total_drive_seconds / 3600.0)::numeric, 2) AS total_drive_hours,
|
||||
ROUND((tr.total_idle_seconds / 3600.0)::numeric, 2) AS total_idle_hours,
|
||||
ROUND(tr.fuel_consumed_l::numeric, 3) AS fuel_consumed_l,
|
||||
COALESCE(al.alarm_count, 0) AS alarm_count,
|
||||
COALESCE(al.overspeed_count, 0) AS overspeed_count,
|
||||
tr.day_start_time AS day_start_time,
|
||||
tr.day_end_time AS day_end_time,
|
||||
ROUND(tr.avg_speed_kmh::numeric, 2) AS avg_speed_kmh,
|
||||
tr.peak_speed_kmh AS peak_speed_kmh
|
||||
FROM trip_agg tr
|
||||
JOIN dwh_gold.dim_vehicles dv ON dv.imei = tr.imei
|
||||
LEFT JOIN alarm_agg al ON al.imei = tr.imei
|
||||
ON CONFLICT (day, vehicle_key) DO UPDATE SET
|
||||
total_distance_km = EXCLUDED.total_distance_km,
|
||||
total_trips = EXCLUDED.total_trips,
|
||||
|
|
@ -98,7 +127,8 @@ $$;
|
|||
|
||||
COMMENT ON FUNCTION dwh_gold.refresh_daily_metrics(DATE)
|
||||
IS 'Populates or refreshes fact_daily_fleet_metrics for the given date. '
|
||||
'Joins tracksolid.trips through dwh_gold.dim_vehicles to map IMEI → vehicle_key. '
|
||||
'Trips and alarms are aggregated in separate CTEs to avoid cartesian '
|
||||
'multiplication. Maps IMEI → vehicle_key via dwh_gold.dim_vehicles. '
|
||||
'Call nightly: SELECT dwh_gold.refresh_daily_metrics(CURRENT_DATE - 1);';
|
||||
|
||||
COMMIT;
|
||||
|
|
|
|||
Loading…
Reference in a new issue