feat(geocode): two-pass estate fallback for building-level location_names

Building-level names (e.g. 'KAHAWA WENDANI ALVO HOUSE') aren't in OSM, so the
precise forward-geocode 404s and tickets stay on the bare cluster centroid
(observed 0/133 placed). geocode_locations now tries an ordered set of
candidates per location (compose_queries): full precise -> estate (leading 2
tokens) -> leading token, each constrained by the existing cluster viewbox +
25km distance check, accepting the FIRST in-range hit. This places tickets in
the right neighbourhood (e.g. 'KAHAWA WENDANI', 'BAMBURI') instead of the broad
cluster centroid. Wrong-area matches for ambiguous coarse tokens are rejected by
the distance check and fall through; genuinely unmatchable tickets keep the
honest cluster-centroid fallback (no pure-cluster candidate, which would only
mislabel the centroid as geo_source='location'). Verified the cascade finds
hits against live LocationIQ on real samples.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
david kiania 2026-06-18 18:51:58 +03:00
parent cdb6186dca
commit e71c8914f1

View file

@ -357,6 +357,38 @@ def compose_query(location_name: str | None, cluster: str | None, region: str |
return ", ".join(dict.fromkeys(parts)) # de-dupe while preserving order
def compose_queries(location_name: str | None, cluster: str | None,
                    region: str | None) -> list[str]:
    """Return ordered geocode candidates, most → least specific (estate fallback).

    Building-level location_names (e.g. 'KAHAWA WENDANI ALVO HOUSE') aren't in OSM, so
    the precise query 404s. We then fall back to the estate (leading tokens of the
    place). The caller still constrains every candidate to the cluster viewbox +
    distance check, so a coarse hit lands in the right neighbourhood (tighter than the
    bare cluster centroid).

    We deliberately do NOT emit a pure-cluster candidate: that would just reproduce
    the cluster centroid while mislabelling it geo_source='location'; a truly
    unmatchable ticket should keep its honest cluster-centroid fallback. For the same
    reason an empty list is returned when no place can be extracted from
    ``location_name`` — otherwise the "full precise" candidate would collapse to
    "<cluster>, <region>, Kenya", which is exactly that pure-cluster query.

    Args:
        location_name: raw ticket location; passed through ``extract_place``.
        cluster: cluster name; passed through ``clean``. Used only in candidate 1.
        region: region name; passed through ``clean``. Appended to every candidate.

    Returns:
        De-duplicated query strings in the order they should be tried; possibly empty.

    e.g. 'KAHAWA WENDANI ALVO HOUSE' -> ['KAHAWA WENDANI ALVO HOUSE, WENDANI, nairobi,
    Kenya', 'KAHAWA WENDANI, nairobi, Kenya', 'KAHAWA, nairobi, Kenya']
    """
    region_part, cluster_part = clean(region), clean(cluster)
    place = extract_place(location_name)
    # No usable place: emit nothing rather than a cluster/region-only query (also
    # guards ``.split()`` should the helper hand back None).
    if not place or not place.strip():
        return []

    toks = place.split()
    out: list[str] = []

    def add(*parts: str | None) -> None:
        # Drop empty parts, always end with the country, and de-dupe repeated parts
        # (e.g. cluster == region) while preserving order.
        q = ", ".join(dict.fromkeys([p for p in parts if p] + ["Kenya"]))
        # Skip a country-only query and any candidate already queued.
        if q and q != "Kenya" and q not in out:
            out.append(q)

    add(place, cluster_part, region_part)        # 1. full precise
    if len(toks) > 2:
        add(" ".join(toks[:2]), region_part)     # 2. estate (leading 2 tokens)
    if len(toks) > 1:
        add(toks[0], region_part)                # 3. leading token (broad estate)
    return out
# ── keyed geocoder ────────────────────────────────────────────────────────────
def _throttle() -> None:
global _last_geocode_at
@ -496,26 +528,32 @@ def geocode_locations(apply: bool) -> None:
log.info("%d actionable-INC locations to geocode (provider=%s)", len(todo), _PROVIDER)
if not apply:
for key, loc, cluster, region, clat, clng in todo[:50]:
log.info(" %s -> %r", key, compose_query(loc, cluster, region))
log.info(" %s -> %s", key, " | ".join(compose_queries(loc, cluster, region)))
return
written = rejected = 0
written = missed = coarse = 0
for key, loc, cluster, region, clat, clng in todo:
query = compose_query(loc, cluster, region)
viewbox = None
if clat is not None and clng is not None:
viewbox = (clng - _VIEWBOX_DEG, clat - _VIEWBOX_DEG, clng + _VIEWBOX_DEG, clat + _VIEWBOX_DEG)
hit = geocode(query, viewbox)
# two-pass: precise → estate → cluster; accept the FIRST in-range hit. A wrong-area
# match (> MAX_KM from the cluster centroid) is skipped so we try a coarser query.
hit = used = None
for i, cand in enumerate(compose_queries(loc, cluster, region)):
g = geocode(cand, viewbox)
if not g:
continue
lat, lng, conf = g
if (clat is not None and clng is not None
and _haversine_km(lat, lng, clat, clng) > _MAX_KM_FROM_CLUSTER):
continue
hit, used = g, cand
if i > 0:
coarse += 1
break
if not hit:
missed += 1 # no match even coarsely — keeps cluster-centroid fallback
continue
lat, lng, conf = hit
# distance sanity: a result far from the cluster centroid is a wrong-city
# match — drop it so the ticket keeps the cluster-centroid fallback.
if clat is not None and clng is not None:
km = _haversine_km(lat, lng, clat, clng)
if km > _MAX_KM_FROM_CLUSTER:
rejected += 1
log.info(" reject (%.0f km from cluster): %s", km, query)
continue
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute(
@ -526,13 +564,14 @@ def geocode_locations(apply: bool) -> None:
SET location_name = EXCLUDED.location_name, cluster = EXCLUDED.cluster,
region = EXCLUDED.region, query = EXCLUDED.query, lat = EXCLUDED.lat,
lng = EXCLUDED.lng, confidence = EXCLUDED.confidence, provider = EXCLUDED.provider""",
(key, loc, cluster, region, query, lat, lng, conf, _PROVIDER),
(key, loc, cluster, region, used, lat, lng, conf, _PROVIDER),
)
written += 1
log.info(" geocoded %s -> %.5f, %.5f", query, lat, lng)
log.info(" geocoded %s -> %.5f, %.5f", used, lat, lng)
n = _resolve()
log.info("locations: %d accepted, %d rejected (too far); re-resolved geom on %d tickets "
"(unverified — review tickets.geo_locations)", written, rejected, n)
log.info("locations: %d accepted (%d via estate/cluster fallback), %d unmatched; "
"re-resolved geom on %d tickets (unverified — review tickets.geo_locations)",
written, coarse, missed, n)
def _resolve() -> int: