feat(geocode): two-pass estate fallback for building-level location_names
Building-level names (e.g. 'KAHAWA WENDANI ALVO HOUSE') aren't in OSM, so the precise forward-geocode 404s and tickets stay on the bare cluster centroid (observed 0/133 placed). geocode_locations now tries an ordered set of candidates per location (compose_queries): full precise -> estate (leading 2 tokens) -> leading token, each constrained by the existing cluster viewbox + 25km distance check, accepting the FIRST in-range hit. This places tickets in the right neighbourhood (e.g. 'KAHAWA WENDANI', 'BAMBURI') instead of the broad cluster centroid. Wrong-area matches for ambiguous coarse tokens are rejected by the distance check and fall through; genuinely unmatchable tickets keep the honest cluster-centroid fallback (no pure-cluster candidate, which would only mislabel the centroid as geo_source='location'). Verified the cascade finds hits against live LocationIQ on real samples. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
cdb6186dca
commit
e71c8914f1
1 changed files with 55 additions and 16 deletions
|
|
@ -357,6 +357,38 @@ def compose_query(location_name: str | None, cluster: str | None, region: str |
|
|||
return ", ".join(dict.fromkeys(parts)) # de-dupe while preserving order
|
||||
|
||||
|
||||
def compose_queries(location_name: str | None, cluster: str | None,
                    region: str | None) -> list[str]:
    """Return ordered geocode candidates, most → least specific (estate fallback).

    Building-level location_names (e.g. 'KAHAWA WENDANI ALVO HOUSE') aren't in OSM, so
    the precise query 404s. We then fall back to the estate (leading tokens of the
    place) — each still constrained to the cluster viewbox + distance check by the
    caller, so a coarse hit lands in the right neighbourhood (tighter than the bare
    cluster centroid).

    Candidates, in order (duplicates dropped, every query suffixed with "Kenya"):
      1. full place + cluster + region
      2. leading two tokens of the place + region   (only if the place has > 2 tokens)
      3. leading token of the place + region        (only if the place has > 1 token)

    We deliberately do NOT add a pure-cluster candidate: that would just reproduce
    the cluster centroid while mislabelling it geo_source='location'; a truly
    unmatchable ticket should keep its honest cluster-centroid fallback. For the
    same reason an empty list is returned when no place can be extracted from
    ``location_name`` — otherwise candidate 1 would degrade to "cluster, region,
    Kenya", which is exactly that pure-cluster query.

    e.g. 'KAHAWA WENDANI ALVO HOUSE' -> ['KAHAWA WENDANI ALVO HOUSE, WENDANI, nairobi,
    Kenya', 'KAHAWA WENDANI, nairobi, Kenya', 'KAHAWA, nairobi, Kenya']

    Args:
        location_name: raw location name of the ticket; may be None.
        cluster: cluster name, passed through ``clean`` before use; may be None.
        region: region name, passed through ``clean`` before use; may be None.

    Returns:
        De-duplicated query strings in the order they should be tried; empty when
        there is no place to query for.
    """
    region_part, cluster_part = clean(region), clean(cluster)
    # extract_place is defined elsewhere in this file; `or ""` guards against it
    # returning None for a missing location_name (TODO confirm its contract).
    place = extract_place(location_name) or ""
    toks = place.split()
    if not toks:
        # No place component: any query built from cluster/region alone would be
        # the pure-cluster candidate this function promises not to produce.
        return []

    out: list[str] = []

    def add(*parts: str | None) -> None:
        # dict.fromkeys de-dupes repeated parts (e.g. place == cluster) while
        # preserving order; falsy parts are dropped before joining.
        q = ", ".join(dict.fromkeys([p for p in parts if p] + ["Kenya"]))
        if q != "Kenya" and q not in out:
            out.append(q)

    add(place, cluster_part, region_part)        # 1. full precise
    if len(toks) > 2:
        add(" ".join(toks[:2]), region_part)     # 2. estate (leading 2 tokens)
    if len(toks) > 1:
        add(toks[0], region_part)                # 3. leading token (broad estate)
    return out
|
||||
|
||||
|
||||
# ── keyed geocoder ────────────────────────────────────────────────────────────
|
||||
def _throttle() -> None:
|
||||
global _last_geocode_at
|
||||
|
|
@ -496,26 +528,32 @@ def geocode_locations(apply: bool) -> None:
|
|||
log.info("%d actionable-INC locations to geocode (provider=%s)", len(todo), _PROVIDER)
|
||||
if not apply:
|
||||
for key, loc, cluster, region, clat, clng in todo[:50]:
|
||||
log.info(" %s -> %r", key, compose_query(loc, cluster, region))
|
||||
log.info(" %s -> %s", key, " | ".join(compose_queries(loc, cluster, region)))
|
||||
return
|
||||
written = rejected = 0
|
||||
written = missed = coarse = 0
|
||||
for key, loc, cluster, region, clat, clng in todo:
|
||||
query = compose_query(loc, cluster, region)
|
||||
viewbox = None
|
||||
if clat is not None and clng is not None:
|
||||
viewbox = (clng - _VIEWBOX_DEG, clat - _VIEWBOX_DEG, clng + _VIEWBOX_DEG, clat + _VIEWBOX_DEG)
|
||||
hit = geocode(query, viewbox)
|
||||
# cascade: precise → estate → leading token (no pure-cluster candidate); accept the FIRST in-range hit. A wrong-area
|
||||
# match (> MAX_KM from the cluster centroid) is skipped so we try a coarser query.
|
||||
hit = used = None
|
||||
for i, cand in enumerate(compose_queries(loc, cluster, region)):
|
||||
g = geocode(cand, viewbox)
|
||||
if not g:
|
||||
continue
|
||||
lat, lng, conf = g
|
||||
if (clat is not None and clng is not None
|
||||
and _haversine_km(lat, lng, clat, clng) > _MAX_KM_FROM_CLUSTER):
|
||||
continue
|
||||
hit, used = g, cand
|
||||
if i > 0:
|
||||
coarse += 1
|
||||
break
|
||||
if not hit:
|
||||
missed += 1 # no match even coarsely — keeps cluster-centroid fallback
|
||||
continue
|
||||
lat, lng, conf = hit
|
||||
# distance sanity: a result far from the cluster centroid is a wrong-city
|
||||
# match — drop it so the ticket keeps the cluster-centroid fallback.
|
||||
if clat is not None and clng is not None:
|
||||
km = _haversine_km(lat, lng, clat, clng)
|
||||
if km > _MAX_KM_FROM_CLUSTER:
|
||||
rejected += 1
|
||||
log.info(" reject (%.0f km from cluster): %s", km, query)
|
||||
continue
|
||||
with get_conn() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
|
|
@ -526,13 +564,14 @@ def geocode_locations(apply: bool) -> None:
|
|||
SET location_name = EXCLUDED.location_name, cluster = EXCLUDED.cluster,
|
||||
region = EXCLUDED.region, query = EXCLUDED.query, lat = EXCLUDED.lat,
|
||||
lng = EXCLUDED.lng, confidence = EXCLUDED.confidence, provider = EXCLUDED.provider""",
|
||||
(key, loc, cluster, region, query, lat, lng, conf, _PROVIDER),
|
||||
(key, loc, cluster, region, used, lat, lng, conf, _PROVIDER),
|
||||
)
|
||||
written += 1
|
||||
log.info(" geocoded %s -> %.5f, %.5f", query, lat, lng)
|
||||
log.info(" geocoded %s -> %.5f, %.5f", used, lat, lng)
|
||||
n = _resolve()
|
||||
log.info("locations: %d accepted, %d rejected (too far); re-resolved geom on %d tickets "
|
||||
"(unverified — review tickets.geo_locations)", written, rejected, n)
|
||||
log.info("locations: %d accepted (%d via estate/cluster fallback), %d unmatched; "
|
||||
"re-resolved geom on %d tickets (unverified — review tickets.geo_locations)",
|
||||
written, coarse, missed, n)
|
||||
|
||||
|
||||
def _resolve() -> int:
|
||||
|
|
|
|||
Loading…
Reference in a new issue