diff --git a/.dockerignore b/.dockerignore index 368cfac..80eb0b2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,4 +5,3 @@ __pycache__/ *.csv .env .DS_Store -uv.lock diff --git a/Dockerfile b/Dockerfile index 4660d77..5fc8261 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,9 +18,20 @@ RUN apt-get update \ WORKDIR /app -# Install from pyproject.toml (single source of truth for deps — no manual mirror). +# Pinned, reproducible installs from uv.lock (FT-SEC-02): uv export --frozen fails +# the build if the lockfile drifts from pyproject.toml. Runtime imports straight +# from /app via `python -m inc.import_inc` — the project itself needs no install. +COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv +COPY pyproject.toml uv.lock ./ +RUN uv export --frozen --no-dev --no-emit-project --format requirements-txt -o /tmp/requirements.txt \ + && uv pip install --system -r /tmp/requirements.txt \ + && rm /tmp/requirements.txt + COPY . . -RUN pip install . + +# Non-privileged runtime user (Coolify Scheduled Tasks exec as this user too). +RUN useradd -m tickets-user +USER tickets-user # Keep the container alive so Coolify Scheduled Tasks can exec into it. CMD ["tail", "-f", "/dev/null"] diff --git a/docs/260702_audit_report.md b/docs/260702_audit_report.md new file mode 100644 index 0000000..161c5ba --- /dev/null +++ b/docs/260702_audit_report.md @@ -0,0 +1,83 @@ +# fleettickets — Platform Audit Report (2026-07-02) + +Part of the 2026-07-02 cross-repo audit (see the tracksolid repo's +`docs/reports/260702_platform_audit_report.md` for the DB/host-wide findings). +Scope here: this repo's code, its migrations, the live `tickets` schema contents, +and the deployed Coolify container. Companion docs: `260702_fix_plan.md`, +`260702_work_done.md`. + +--- + +## Critical + +### FT-BUG-01 — Precise location geocodes are never applied at ingest time (and get clobbered) +Verified live: `tickets.geo_locations` holds **808 geocoded locations**, yet only +**51 of 114k geocoded INC tickets** have `geo_source='location'` — and **0 of 42k CRQ**. +Two compounding defects: + +1. The `tg_ticket_geom` trigger (migration 01) resolves feed coords → **cluster + centroid** → none. It **never consults `tickets.geo_locations`**, so a freshly + upserted ticket can only ever get cluster precision. +2. `tickets.resolve_ticket_geoms()` (which does honour geo_locations) only runs at + the end of the manual `--geocode-*` commands. Every 20-minute delta ingest + re-upserts changed rows, the trigger recomputes geom, and any previously + resolved `'location'` geom is silently downgraded back to `'cluster'`. + +Net effect: the LocationIQ budget spent geocoding 808 locations produces almost no +map precision. The "cluster-gazetteer geocoding is the critical path" goal is +effectively defeated by the write path. + +### FT-SEC-01 — Live container connects as the postgres superuser over the PUBLIC port +The deployed container's `DATABASE_URL` is +`postgres://postgres:…@twala.rahamafresh.com:5433/tracksolid_db` — the superuser, +over the internet-exposed host port, without TLS. The repo's own `.env.example` +prescribes `tracksolid_owner@timescale_db:5432` (internal Docker network). This +must be fixed in the Coolify app env **before** the tracksolid stack's new +loopback-only port binding deploys, or ingestion breaks. (Operational — see plan +Phase B.) + +## High + +### FT-SEC-02 — Unpinned image builds, root runtime +`Dockerfile` does `pip install .` from `pyproject.toml` version ranges; `uv.lock` +exists but is **excluded by `.dockerignore`**, so builds are unpinned and +unreproducible. The container also runs as root (no `USER`). + +### FT-BUG-02 — New clusters/locations only geocode when someone remembers to run the command +`--geocode-clusters` / `--geocode-locations` are manual. Live data shows the lag: +156 INC + 71 CRQ tickets sit at `geo_source='none'`, and location coverage is 808 +keys vs thousands of distinct location_names. Both functions are already +incremental (NOT EXISTS guards), so chaining a cluster-geocode pass into the +scheduled ingest is nearly free (0 API calls on a quiet run). + +## Medium / notes + +- **FT-PERF-01** — `tickets.inc` = 766 MB / `tickets.crq` = 486 MB, dominated by the + `raw` jsonb (avg 754 B/row) and **134 MB of never-scanned geo indexes** + (`ix_inc_geog` 83 MB, `ix_inc_geom` 51 MB, `ix_crq_geom` 49 MB — 0 scans; the map + path reads `fn_tickets_for_map`, which planner-serves off the raw/actionable + indexes). A separate spawned task is already preparing the index-drop migration — + not duplicated here. **Numbering note:** that task landed `17_drop_unused_geo_indexes.sql`; this + audit's trigger fix is therefore numbered `18_trigger_location_geom.sql`. +- **FT-OPS-01** — `20260618_bug.txt` (untracked, repo root) reviews a file + (`import_tickets.py`) that no longer exists; its live findings (ETag skip, + meta-outside-txn) were fixed in the pipeline rewrite. Stale — remove. +- **FT-OPS-02** — migration numbering gap (no `11_*.sql`) — harmless (lexical + ordering), just don't reuse 11. +- **FT-OPS-03** — `run_migrations.py` is manual (not run on container start, unlike + the tracksolid stack). Acceptable for a batch worker, but document it in deploys. +- **FT-NOTE-01** — straggler archiving: when nothing is pending, `ingest()` archives + *every* file still in `changes/` (including pre-watermark leftovers) — files are + moved to `processed/`, not destroyed, so this is acceptable; noted for awareness. +- **FT-NOTE-02** — the geocode loops open a fresh psycopg2 connection per written + row (`shared.get_conn` has no pool). Fine at batch cadence; revisit only if + geocode volumes grow 10×. + +## In good shape +- The CDC drain design is solid: per-file watermark + archive in one txn each, + crash-safe resume, `--reseed` for bucket cutovers, raw-first schema resilient to + source drift. +- `capture_history()` gives durable closure/backlog history despite the + current-state upsert model. +- Sentinel/alarm-row filtering, place extraction, and the viewbox+distance-guarded + two-pass geocoder are careful, well-commented work. diff --git a/docs/260702_fix_plan.md b/docs/260702_fix_plan.md new file mode 100644 index 0000000..ab1a514 --- /dev/null +++ b/docs/260702_fix_plan.md @@ -0,0 +1,30 @@ +# fleettickets — Fix Plan (2026-07-02) + +Companion to `260702_audit_report.md` (findings) and `260702_work_done.md` +(execution log). + +## Phase A — repo changes (implemented in this session) + +| Step | Finding | Change | +|---|---|---| +| A1 | FT-BUG-01 | Migration `18_trigger_location_geom.sql`: `tg_ticket_geom` resolves feed coords → **geo_locations (precise)** → cluster centroid → none, so upserts stop clobbering precise geoms; the migration ends with one `resolve_ticket_geoms()` call to repair the existing 114k+42k rows immediately. | +| A2 | FT-BUG-01 | `pipeline.ingest()`: after an `--apply` run that ingested files, call `_resolve()` — belt-and-braces so location geoms self-heal even on a DB where migration 18 hasn't landed yet. | +| A3 | FT-BUG-02 | `run_ingest.sh`: chain an incremental `--geocode-clusters --apply` after the two ingests (0 API calls when no new clusters). Location geocoding stays manual (budget control). | +| A4 | FT-SEC-02 | Dockerfile: install pinned deps from `uv.lock` (`uv export --frozen`), stop `pip install .` (runtime imports from /app via `-m`), run as a non-root user. Remove `uv.lock` from `.dockerignore`. | +| A5 | FT-OPS-01 | Delete the stale `20260618_bug.txt` (its findings are fixed or superseded). | + +## Phase B — operational actions (need operator confirmation) + +| # | Finding | Action | +|---|---|---| +| B1 | FT-SEC-01 | In the Coolify app env, change `DATABASE_URL` to `postgresql://tracksolid_owner:@timescale_db:5432/tracksolid_db` and attach the container to the tracksolid stack's Docker network. Must land before/with the tracksolid stack's loopback port-binding deploy. Verify `tracksolid_owner` has the needed GRANTs on the `tickets` schema (it should own it; check `\dn+ tickets`). | +| B2 | FT-BUG-01/02 | After merging Phase A: `python run_migrations.py` (applies 18), redeploy the container, and update the Coolify Scheduled Tasks to also run the chained cluster geocode (or keep using `run_ingest.sh`). One-time: run `python -m inc.import_inc --geocode-locations --apply` to extend location coverage now that it will actually stick. | +| B3 | FT-PERF-01 | The index-drop migration landed as `17_drop_unused_geo_indexes.sql` (background task, committed on branch chore/drop-unused-geo-indexes); this audit's trigger fix is renumbered to `18_trigger_location_geom.sql` so the two coexist. | + +## Verification +- `python -m py_compile pipeline.py shared.py inc/import_inc.py crq/import_crq.py`; `ruff check .`. +- Migration 18 dry-run in a rolled-back transaction against the live DB; after real + apply, expect `SELECT geo_source, count(*) FROM tickets.inc GROUP BY 1` to show + `location` ≫ 51 (should approach the number of tickets whose location_name is in + the 808-key gazetteer). +- Next scheduled ingest run: confirm `location` counts don't regress after deltas. diff --git a/docs/260702_work_done.md b/docs/260702_work_done.md new file mode 100644 index 0000000..5bec621 --- /dev/null +++ b/docs/260702_work_done.md @@ -0,0 +1,33 @@ +# fleettickets — Work Done (2026-07-02) + +Execution log for `260702_fix_plan.md` Phase A. **Local changes only — nothing +committed, pushed, or applied to the live DB** (dry-runs were wrapped in rolled-back +transactions). + +## Changes + +| Finding | Files | What changed | +|---|---|---| +| FT-BUG-01 | `migrations/18_trigger_location_geom.sql` (new) | `tg_ticket_geom` now resolves feed coords → **precise geo_locations** → cluster centroid → none (was skipping geo_locations entirely, so 20-min delta upserts clobbered precise geoms back to cluster centroids). Migration ends with one `resolve_ticket_geoms()` to repair the backlog. **Dry-run verified against the live DB (rolled back): 7,481 rows repaired — INC `geo_source='location'` 51 → 5,339; CRQ 0 → 2,193.** | +| FT-BUG-01 | `pipeline.py` | `ingest()` now calls `_resolve()` after every applied run that ingested files — geoms self-heal when a location was geocoded after a ticket's last upsert, and on any DB where migration 18 hasn't landed yet. No-op runs skip it (no extra load on the 20-min quiet cycles). | +| FT-BUG-02 | `run_ingest.sh` | Chains `python -m inc.import_inc --geocode-clusters --apply` after the two ingests. Incremental (NOT EXISTS-guarded): zero geocoder calls when no new clusters. Location-level geocoding stays manual for budget control. If you use Coolify Scheduled Tasks instead of this wrapper, add the same command as a third task (or switch the tasks to `run_ingest.sh`). | +| FT-SEC-02 | `Dockerfile`, `.dockerignore` | Dependencies now install pinned from `uv.lock` (`uv export --frozen` → hash-checked; build fails on lockfile drift; verified locally: 12 pinned packages). `uv.lock` removed from `.dockerignore` so it reaches the build. Container now runs as non-root `tickets-user`. Dropped the unnecessary `pip install .` (runtime imports from `/app` via `-m`). | +| FT-OPS-01 | `20260618_bug.txt` (deleted) | Stale review of the pre-rewrite `import_tickets.py`; its live findings were fixed by the pipeline.py rewrite (watermark drain replaced the ETag skip; meta now commits in the upsert txn). | + +## Verification +- `python -m py_compile` clean on all five modules; `ruff check .` shows the same + 14 pre-existing style items as before the changes (nothing new introduced). +- Migration 18 executed against the live DB inside `BEGIN … ROLLBACK`: applies + cleanly, resolve repaired 7,481 rows (counts above). +- `uv export --frozen` succeeds against the committed lockfile. + +## NOT done — operational (Phase B, needs operator confirmation) +1. **Change the Coolify app's `DATABASE_URL`** off the postgres superuser / public + `twala:5433` route → `tracksolid_owner@timescale_db:5432` on the internal + network (FT-SEC-01). Required anyway before the tracksolid stack's + loopback-only port binding deploys. +2. Merge + push, `python run_migrations.py` (applies 18), redeploy the container. +3. One-time `--geocode-locations --apply` run to extend location coverage now that + precise geoms persist. +4. The separately-spawned background task is preparing the unused-geo-index drop + (~134 MB) landed as migration 17; this trigger fix was renumbered to 18 so both coexist. diff --git a/migrations/18_trigger_location_geom.sql b/migrations/18_trigger_location_geom.sql new file mode 100644 index 0000000..ba17197 --- /dev/null +++ b/migrations/18_trigger_location_geom.sql @@ -0,0 +1,55 @@ +-- 18_trigger_location_geom.sql — fleettickets · precise geoms survive re-upserts. +-- (migration 17 is 17_drop_unused_geo_indexes.sql — a parallel 260702 change.) +-- ───────────────────────────────────────────────────────────────────────────── +-- FT-BUG-01 (260702 audit): tg_ticket_geom resolved feed coords → cluster +-- centroid → none, NEVER consulting tickets.geo_locations. So the precise +-- location geocodes (808 keys, paid LocationIQ budget) were only applied by the +-- manual --geocode-* commands' resolve pass — and the very next 20-minute delta +-- ingest re-upserted changed rows and the trigger downgraded them back to the +-- cluster centroid. Live effect: 51 of 114k INC rows (and 0 CRQ) carried +-- geo_source='location'. +-- +-- Fix: the trigger now resolves feed → LOCATION (geo_locations by +-- norm_cluster(location_name)) → cluster → none, mirroring +-- resolve_ticket_geoms()'s precedence exactly. Then one resolve pass repairs the +-- whole backlog immediately. +-- Idempotent — CREATE OR REPLACE + a re-runnable resolve. +-- ───────────────────────────────────────────────────────────────────────────── + +CREATE OR REPLACE FUNCTION tickets.tg_ticket_geom() + RETURNS trigger LANGUAGE plpgsql AS $fn$ +DECLARE + v_lat double precision := NULLIF(NEW.raw->>'latitude','')::double precision; + v_lng double precision := NULLIF(NEW.raw->>'longitude','')::double precision; + g geometry(Point, 4326); +BEGIN + IF TG_OP = 'UPDATE' AND NEW.raw IS NOT DISTINCT FROM OLD.raw THEN + RETURN NEW; -- geom/geo_source-only update — keep caller's value + END IF; + IF v_lat IS NOT NULL AND v_lng IS NOT NULL + AND v_lat BETWEEN -90 AND 90 AND v_lng BETWEEN -180 AND 180 + AND NOT (v_lat = 0 AND v_lng = 0) THEN + NEW.geom := ST_SetSRID(ST_MakePoint(v_lng, v_lat), 4326); + NEW.geo_source := 'feed'; + RETURN NEW; + END IF; + -- [mig-18] precise location geocode BEFORE the coarse cluster centroid + SELECT gl.geom INTO g FROM tickets.geo_locations gl + WHERE gl.query_key = tickets.norm_cluster(NEW.raw->>'location_name') + AND gl.geom IS NOT NULL LIMIT 1; + IF g IS NOT NULL THEN + NEW.geom := g; NEW.geo_source := 'location'; + RETURN NEW; + END IF; + SELECT gc.geom INTO g FROM tickets.geo_clusters gc + WHERE gc.cluster_key = tickets.norm_cluster(NEW.raw->>'cluster') + AND gc.geom IS NOT NULL LIMIT 1; + IF g IS NOT NULL THEN NEW.geom := g; NEW.geo_source := 'cluster'; + ELSE NEW.geom := NULL; NEW.geo_source := 'none'; END IF; + RETURN NEW; +END $fn$; + +-- Repair the backlog now: re-resolve every non-feed row against the gazetteer +-- (resolve_ticket_geoms already prefers geo_locations over geo_clusters and only +-- touches rows whose geom/geo_source would actually change). +SELECT tickets.resolve_ticket_geoms(); diff --git a/pipeline.py b/pipeline.py index 4233038..e349c55 100644 --- a/pipeline.py +++ b/pipeline.py @@ -330,8 +330,16 @@ def ingest(ds: Dataset, args) -> None: else: log.info("DRY-RUN — would archive %s to %s", key, ds.processed_prefix) log.info("ingested %d %s change file(s); %d rows kept in total", len(pending), ds.name, total) - if args.apply and ds.post_apply: - ds.post_apply() + if args.apply: + # FT-BUG-01 (260702): re-resolve geoms after every applied ingest so rows + # whose location was geocoded AFTER their last upsert regain their precise + # geom (and, on a DB without migration 17, repair trigger downgrades). + # Cheap: resolve only touches rows whose geom/geo_source would change. + n = _resolve() + if n: + log.info("post-ingest resolve: %d %s geoms updated", n, ds.name) + if ds.post_apply: + ds.post_apply() # ── place extraction (strip network codes, keep the real place) ─────────────── diff --git a/run_ingest.sh b/run_ingest.sh index 06d19b3..5ffd604 100755 --- a/run_ingest.sh +++ b/run_ingest.sh @@ -32,3 +32,8 @@ PY="python" # resolve the packages alongside pipeline.py + shared.py. "$PY" -m inc.import_inc --from-bucket --apply "$PY" -m crq.import_crq --from-bucket --apply + +# Incremental cluster geocode (FT-BUG-02): NOT-EXISTS-guarded, so a run with no +# new clusters makes zero geocoder calls. Location-level geocoding stays a manual +# command (budget control): python -m inc.import_inc --geocode-locations --apply +"$PY" -m inc.import_inc --geocode-clusters --apply