feat(geocode): two-pass estate fallback for building-level location_names
Building-level names (e.g. 'KAHAWA WENDANI ALVO HOUSE') aren't in OSM, so the precise forward-geocode 404s and tickets stay on the bare cluster centroid (observed 0/133 placed). geocode_locations now tries an ordered set of candidates per location (compose_queries): full precise -> estate (leading 2 tokens) -> leading token, each constrained by the existing cluster viewbox + 25km distance check, accepting the FIRST in-range hit. This places tickets in the right neighbourhood (e.g. 'KAHAWA WENDANI', 'BAMBURI') instead of the broad cluster centroid. Wrong-area matches for ambiguous coarse tokens are rejected by the distance check and fall through; genuinely unmatchable tickets keep the honest cluster-centroid fallback (no pure-cluster candidate, which would only mislabel the centroid as geo_source='location'). Verified the cascade finds hits against live LocationIQ on real samples. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
cdb6186dca
commit
e71c8914f1
1 changed files with 55 additions and 16 deletions
|
|
@ -357,6 +357,38 @@ def compose_query(location_name: str | None, cluster: str | None, region: str |
|
||||||
return ", ".join(dict.fromkeys(parts)) # de-dupe while preserving order
|
return ", ".join(dict.fromkeys(parts)) # de-dupe while preserving order
|
||||||
|
|
||||||
|
|
||||||
|
def compose_queries(location_name: str | None, cluster: str | None,
                    region: str | None) -> list[str]:
    """Return ordered geocode candidates, most → least specific.

    Building-level location_names (e.g. 'KAHAWA WENDANI ALVO HOUSE') aren't in OSM, so
    the precise query 404s. The coarser candidates fall back to the estate (leading
    tokens of the place); the caller is expected to keep constraining each candidate
    to the cluster viewbox + distance check, so a coarse hit still lands in the right
    neighbourhood (tighter than the bare cluster centroid).

    Candidates, in order (duplicates are dropped, "Kenya" is always appended):
      1. full precise  — place, cluster, region
      2. estate        — leading 2 tokens of the place, region (only if > 2 tokens)
      3. leading token — first token of the place, region (only if > 1 token)

    A pure-cluster candidate is deliberately never produced: it would just reproduce
    the cluster centroid while mislabelling it geo_source='location'. For the same
    reason an empty list is returned when no place text can be extracted — otherwise
    candidate 1 would degenerate to "cluster, region, Kenya". A truly unmatchable
    ticket should keep its honest cluster-centroid fallback.

    e.g. 'KAHAWA WENDANI ALVO HOUSE' (cluster 'WENDANI', region 'nairobi') ->
        ['KAHAWA WENDANI ALVO HOUSE, WENDANI, nairobi, Kenya',
         'KAHAWA WENDANI, nairobi, Kenya', 'KAHAWA, nairobi, Kenya']

    Args:
        location_name: raw location name of the ticket; may be None.
        cluster: cluster name, used only in the full precise candidate.
        region: region name, appended to every candidate.

    Returns:
        De-duplicated query strings in the order they should be tried; empty when
        there is no place text to query.
    """
    region_part, cluster_part = clean(region), clean(cluster)
    place = extract_place(location_name)
    # `or ""` guards against extract_place handing back None for a missing name.
    toks = (place or "").split()
    if not toks:
        # No place text: every candidate would be a cluster/region-only query, which
        # is exactly the mislabelled-centroid case this function must not emit.
        return []

    out: list[str] = []

    def add(*parts: str | None) -> None:
        # dict.fromkeys de-dupes the parts while preserving their order.
        q = ", ".join(dict.fromkeys([p for p in parts if p] + ["Kenya"]))
        if q != "Kenya" and q not in out:
            out.append(q)

    add(place, cluster_part, region_part)        # 1. full precise
    if len(toks) > 2:
        add(" ".join(toks[:2]), region_part)     # 2. estate (leading 2 tokens)
    if len(toks) > 1:
        add(toks[0], region_part)                # 3. leading token (broad estate)
    return out
|
||||||
|
|
||||||
|
|
||||||
# ── keyed geocoder ────────────────────────────────────────────────────────────
|
# ── keyed geocoder ────────────────────────────────────────────────────────────
|
||||||
def _throttle() -> None:
|
def _throttle() -> None:
|
||||||
global _last_geocode_at
|
global _last_geocode_at
|
||||||
|
|
@ -496,26 +528,32 @@ def geocode_locations(apply: bool) -> None:
|
||||||
log.info("%d actionable-INC locations to geocode (provider=%s)", len(todo), _PROVIDER)
|
log.info("%d actionable-INC locations to geocode (provider=%s)", len(todo), _PROVIDER)
|
||||||
if not apply:
|
if not apply:
|
||||||
for key, loc, cluster, region, clat, clng in todo[:50]:
|
for key, loc, cluster, region, clat, clng in todo[:50]:
|
||||||
log.info(" %s -> %r", key, compose_query(loc, cluster, region))
|
log.info(" %s -> %s", key, " | ".join(compose_queries(loc, cluster, region)))
|
||||||
return
|
return
|
||||||
written = rejected = 0
|
written = missed = coarse = 0
|
||||||
for key, loc, cluster, region, clat, clng in todo:
|
for key, loc, cluster, region, clat, clng in todo:
|
||||||
query = compose_query(loc, cluster, region)
|
|
||||||
viewbox = None
|
viewbox = None
|
||||||
if clat is not None and clng is not None:
|
if clat is not None and clng is not None:
|
||||||
viewbox = (clng - _VIEWBOX_DEG, clat - _VIEWBOX_DEG, clng + _VIEWBOX_DEG, clat + _VIEWBOX_DEG)
|
viewbox = (clng - _VIEWBOX_DEG, clat - _VIEWBOX_DEG, clng + _VIEWBOX_DEG, clat + _VIEWBOX_DEG)
|
||||||
hit = geocode(query, viewbox)
|
# cascade: precise → estate → leading token; accept the FIRST in-range hit. A wrong-area
|
||||||
|
# match (> MAX_KM from the cluster centroid) is skipped so we try a coarser query.
|
||||||
|
hit = used = None
|
||||||
|
for i, cand in enumerate(compose_queries(loc, cluster, region)):
|
||||||
|
g = geocode(cand, viewbox)
|
||||||
|
if not g:
|
||||||
|
continue
|
||||||
|
lat, lng, conf = g
|
||||||
|
if (clat is not None and clng is not None
|
||||||
|
and _haversine_km(lat, lng, clat, clng) > _MAX_KM_FROM_CLUSTER):
|
||||||
|
continue
|
||||||
|
hit, used = g, cand
|
||||||
|
if i > 0:
|
||||||
|
coarse += 1
|
||||||
|
break
|
||||||
if not hit:
|
if not hit:
|
||||||
|
missed += 1 # no match even coarsely — keeps cluster-centroid fallback
|
||||||
continue
|
continue
|
||||||
lat, lng, conf = hit
|
lat, lng, conf = hit
|
||||||
# distance sanity: a result far from the cluster centroid is a wrong-city
|
|
||||||
# match — drop it so the ticket keeps the cluster-centroid fallback.
|
|
||||||
if clat is not None and clng is not None:
|
|
||||||
km = _haversine_km(lat, lng, clat, clng)
|
|
||||||
if km > _MAX_KM_FROM_CLUSTER:
|
|
||||||
rejected += 1
|
|
||||||
log.info(" reject (%.0f km from cluster): %s", km, query)
|
|
||||||
continue
|
|
||||||
with get_conn() as conn:
|
with get_conn() as conn:
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
|
|
@ -526,13 +564,14 @@ def geocode_locations(apply: bool) -> None:
|
||||||
SET location_name = EXCLUDED.location_name, cluster = EXCLUDED.cluster,
|
SET location_name = EXCLUDED.location_name, cluster = EXCLUDED.cluster,
|
||||||
region = EXCLUDED.region, query = EXCLUDED.query, lat = EXCLUDED.lat,
|
region = EXCLUDED.region, query = EXCLUDED.query, lat = EXCLUDED.lat,
|
||||||
lng = EXCLUDED.lng, confidence = EXCLUDED.confidence, provider = EXCLUDED.provider""",
|
lng = EXCLUDED.lng, confidence = EXCLUDED.confidence, provider = EXCLUDED.provider""",
|
||||||
(key, loc, cluster, region, query, lat, lng, conf, _PROVIDER),
|
(key, loc, cluster, region, used, lat, lng, conf, _PROVIDER),
|
||||||
)
|
)
|
||||||
written += 1
|
written += 1
|
||||||
log.info(" geocoded %s -> %.5f, %.5f", query, lat, lng)
|
log.info(" geocoded %s -> %.5f, %.5f", used, lat, lng)
|
||||||
n = _resolve()
|
n = _resolve()
|
||||||
log.info("locations: %d accepted, %d rejected (too far); re-resolved geom on %d tickets "
|
log.info("locations: %d accepted (%d via estate/cluster fallback), %d unmatched; "
|
||||||
"(unverified — review tickets.geo_locations)", written, rejected, n)
|
"re-resolved geom on %d tickets (unverified — review tickets.geo_locations)",
|
||||||
|
written, coarse, missed, n)
|
||||||
|
|
||||||
|
|
||||||
def _resolve() -> int:
|
def _resolve() -> int:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue