NOTE(review): line breaks and indentation in this patch were reconstructed from a
whitespace-collapsed copy. Context/removed lines (SQL line breaks, spacing inside string
literals, statement indentation) are best-effort — confirm against import_tickets.py
before applying. The post-image blob id on the index line predates the review fixes.

diff --git a/import_tickets.py b/import_tickets.py
index 353048a..97bbcca 100644
--- a/import_tickets.py
+++ b/import_tickets.py
@@ -357,6 +357,41 @@ def compose_query(location_name: str | None, cluster: str | None, region: str |
     return ", ".join(dict.fromkeys(parts))  # de-dupe while preserving order
 
 
+def compose_queries(location_name: str | None, cluster: str | None,
+                    region: str | None) -> list[str]:
+    """Ordered geocode candidates, most → least specific (two-pass estate fallback).
+
+    Building-level location_names (e.g. 'KAHAWA WENDANI ALVO HOUSE') aren't in OSM, so
+    the precise query 404s. We then fall back to the estate (leading tokens of the
+    place) — each still constrained to the cluster viewbox + distance check by the
+    caller, so a coarse hit lands in the right neighbourhood (tighter than the bare
+    cluster centroid). We deliberately do NOT add a pure-cluster candidate: that would
+    just reproduce the cluster centroid while mislabelling it geo_source='location';
+    a truly unmatchable ticket should keep its honest cluster-centroid fallback.
+    An empty place therefore yields no candidates at all.
+    e.g. 'KAHAWA WENDANI ALVO HOUSE' -> ['KAHAWA WENDANI ALVO HOUSE, WENDANI, nairobi,
+    Kenya', 'KAHAWA WENDANI, nairobi, Kenya', 'KAHAWA, nairobi, Kenya']
+    """
+    region_part, cluster_part = clean(region), clean(cluster)
+    place = extract_place(location_name)
+    toks = (place or "").split()
+    if not toks:
+        return []  # only the bare cluster would be left — ruled out above
+    out: list[str] = []
+
+    def add(*parts: str | None) -> None:
+        q = ", ".join(dict.fromkeys([p for p in parts if p] + ["Kenya"]))
+        if q and q != "Kenya" and q not in out:
+            out.append(q)
+
+    add(place, cluster_part, region_part)  # 1. full precise
+    if len(toks) > 2:
+        add(" ".join(toks[:2]), region_part)  # 2. estate (leading 2 tokens)
+    if len(toks) > 1:
+        add(toks[0], region_part)  # 3. leading token (broad estate)
+    return out
+
+
 # ── keyed geocoder ────────────────────────────────────────────────────────────
 def _throttle() -> None:
     global _last_geocode_at
@@ -496,26 +531,32 @@ def geocode_locations(apply: bool) -> None:
     log.info("%d actionable-INC locations to geocode (provider=%s)", len(todo), _PROVIDER)
     if not apply:
         for key, loc, cluster, region, clat, clng in todo[:50]:
-            log.info(" %s -> %r", key, compose_query(loc, cluster, region))
+            log.info(" %s -> %s", key, " | ".join(compose_queries(loc, cluster, region)))
         return
-    written = rejected = 0
+    written = missed = coarse = 0
     for key, loc, cluster, region, clat, clng in todo:
-        query = compose_query(loc, cluster, region)
         viewbox = None
         if clat is not None and clng is not None:
             viewbox = (clng - _VIEWBOX_DEG, clat - _VIEWBOX_DEG, clng + _VIEWBOX_DEG, clat + _VIEWBOX_DEG)
-        hit = geocode(query, viewbox)
+        # two-pass: precise → estate → leading token; accept the FIRST in-range hit. A
+        # wrong-area match (> MAX_KM from cluster centroid) is skipped for a coarser query.
+        hit = used = None
+        for i, cand in enumerate(compose_queries(loc, cluster, region)):
+            g = geocode(cand, viewbox)
+            if not g:
+                continue
+            lat, lng, conf = g
+            if (clat is not None and clng is not None
+                    and _haversine_km(lat, lng, clat, clng) > _MAX_KM_FROM_CLUSTER):
+                continue
+            hit, used = g, cand
+            if i > 0:
+                coarse += 1
+            break
         if not hit:
+            missed += 1  # no match even coarsely — keeps cluster-centroid fallback
             continue
         lat, lng, conf = hit
-        # distance sanity: a result far from the cluster centroid is a wrong-city
-        # match — drop it so the ticket keeps the cluster-centroid fallback.
-        if clat is not None and clng is not None:
-            km = _haversine_km(lat, lng, clat, clng)
-            if km > _MAX_KM_FROM_CLUSTER:
-                rejected += 1
-                log.info(" reject (%.0f km from cluster): %s", km, query)
-                continue
         with get_conn() as conn:
             with conn.cursor() as cur:
                 cur.execute(
@@ -526,13 +567,14 @@ def geocode_locations(apply: bool) -> None:
                        SET location_name = EXCLUDED.location_name, cluster = EXCLUDED.cluster,
                            region = EXCLUDED.region, query = EXCLUDED.query, lat = EXCLUDED.lat,
                            lng = EXCLUDED.lng, confidence = EXCLUDED.confidence, provider = EXCLUDED.provider""",
-                    (key, loc, cluster, region, query, lat, lng, conf, _PROVIDER),
+                    (key, loc, cluster, region, used, lat, lng, conf, _PROVIDER),
                 )
         written += 1
-        log.info(" geocoded %s -> %.5f, %.5f", query, lat, lng)
+        log.info(" geocoded %s -> %.5f, %.5f", used, lat, lng)
     n = _resolve()
-    log.info("locations: %d accepted, %d rejected (too far); re-resolved geom on %d tickets "
-             "(unverified — review tickets.geo_locations)", written, rejected, n)
+    log.info("locations: %d accepted (%d via estate fallback), %d unmatched; "
+             "re-resolved geom on %d tickets (unverified — review tickets.geo_locations)",
+             written, coarse, missed, n)
 
 
 def _resolve() -> int: