From 8b9dcc73f219e3ed8e39e9208bafcffa6ebb4d60 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 5 Jun 2026 17:24:17 +0000 Subject: [PATCH 1/2] fix --- .../bulk_upload_finaliser_orchestrator.py | 51 ++++++++++++++++--- ...test_bulk_upload_finaliser_orchestrator.py | 24 ++++++++- 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/orchestration/bulk_upload_finaliser_orchestrator.py b/orchestration/bulk_upload_finaliser_orchestrator.py index 4d16d7f4..4aa49ab8 100644 --- a/orchestration/bulk_upload_finaliser_orchestrator.py +++ b/orchestration/bulk_upload_finaliser_orchestrator.py @@ -107,8 +107,18 @@ def _parse_uprn(raw: Any) -> Optional[int]: return None try: return int(val) + except ValueError: + pass + # The combiner writes the UPRN column via pandas; when any row in the batch is + # unmatched (NaN), pandas coerces the whole column to float64, so a UPRN + # arrives as e.g. "100020933699.0". Recover the integer from that float form + # (otherwise every matched row in a mixed batch parses to None — the cause of + # NULL property.uprn and empty property_overrides for matched rows). + try: + as_float = float(val) except ValueError: return None + return int(as_float) if as_float.is_integer() else None def _parse_lexiscore(raw: Any) -> Optional[float]: @@ -189,15 +199,22 @@ class BulkUploadFinaliserOrchestrator: multi_entry_ordering: Optional[dict[str, list[int]]], column_mapping: Optional[dict[str, str]], ) -> None: - # Nothing to do without the classifier inputs or the override - # collaborators (e.g. an upload with no classifier columns mapped). + # Nothing to do without the override collaborators (the v1 property-only + # path / its tests construct without them). if ( - not classifier_rows - or not column_mapping - or self._property_override_repo is None + self._property_override_repo is None or self._landlord_override_reader is None ): return + # Log WHY when we write nothing, so an empty property_overrides is never a + # silent mystery (e.g. an upload whose CSVs predate the source_row_id join + # key, or one with no classifier columns mapped). + if not classifier_rows: + logger.info("Finalise: no classifier CSV rows; no property_overrides.") + return + if not column_mapping: + logger.info("Finalise: empty column_mapping; no property_overrides.") + return override_rows = self._build_overrides( combiner_rows, classifier_rows, @@ -205,7 +222,8 @@ class BulkUploadFinaliserOrchestrator: column_mapping, portfolio_id, ) - self._property_override_repo.upsert_all(override_rows) + affected = self._property_override_repo.upsert_all(override_rows) + logger.info("Finalise: upserted %d property_overrides.", affected) def _build_overrides( self, @@ -224,12 +242,33 @@ class BulkUploadFinaliserOrchestrator: # UPRN-matched combiner rows: source_row_id -> uprn. uprn_by_row_id: dict[str, int] = {} + combiner_with_srid = 0 + combiner_with_uprn = 0 for row in combiner_rows: row_id = _normalize(row.get(SOURCE_ROW_ID_COL)) uprn = _parse_uprn(row.get(UPRN_COL)) + if row_id: + combiner_with_srid += 1 + if uprn is not None: + combiner_with_uprn += 1 if row_id and uprn is not None: uprn_by_row_id[row_id] = uprn if not uprn_by_row_id: + classifier_with_srid = sum( + 1 for r in classifier_rows if _normalize(r.get(SOURCE_ROW_ID_COL)) + ) + logger.warning( + "Finalise: 0 override candidates — %d combiner rows (%d with " + "source_row_id, %d with a UPRN), %d/%d classifier rows with " + "source_row_id. If source_row_id counts are 0, the upload's CSVs " + "predate the join-key change (ADR-0006) — re-run from " + "start-address-matching so both CSVs carry it.", + len(combiner_rows), + combiner_with_srid, + combiner_with_uprn, + classifier_with_srid, + len(classifier_rows), + ) return [] ids_by_uprn = self._property_repo.ids_by_uprn( diff --git a/tests/orchestration/test_bulk_upload_finaliser_orchestrator.py b/tests/orchestration/test_bulk_upload_finaliser_orchestrator.py index ae285fae..94742dca 100644 --- a/tests/orchestration/test_bulk_upload_finaliser_orchestrator.py +++ b/tests/orchestration/test_bulk_upload_finaliser_orchestrator.py @@ -135,6 +135,27 @@ def test_finalise_falls_back_to_user_input_when_unmatched() -> None: assert row.lexiscore is None +def test_finalise_parses_pandas_float_uprn() -> None: + # The combiner writes UPRNs as float strings ("100023.0") whenever the batch + # has any unmatched rows; the finaliser must still recover the integer, else + # matched properties get NULL uprn and no overrides. + orchestrator, prop, _status = _orchestrator() + rows = [ + { + "Address 1": "1 Some Street", + "postcode": "A0 0AA", + "address2uprn_uprn": "100023.0", + "address2uprn_address": "1 SOME STREET, TOWN, A0 0AA", + "address2uprn_lexiscore": "0.9", + }, + ] + + orchestrator.finalise(rows, portfolio_id=7, task_id=uuid4()) + + (row,) = prop.inserted + assert row.uprn == 100023 + + # --- v2: property_overrides (ADR-0006) ------------------------------------ @@ -158,7 +179,8 @@ def test_finalise_writes_overrides_for_uprn_rows_splitting_by_part() -> None: combiner = [ { "Address 1": "1 Some Street", - "address2uprn_uprn": "100023", + # pandas float form, as the combiner actually writes it in a mixed batch. + "address2uprn_uprn": "100023.0", "source_row_id": "row-a", }, # A no-UPRN row: must get NO overrides (deferred to v3). From e60ca6ee5deec26f4b17982c4a7b86acc8889a4f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 5 Jun 2026 19:03:33 +0000 Subject: [PATCH 2/2] source of the problem in address2uprn --- backend/address2UPRN/main.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 02eb27dc..136f4344 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -510,6 +510,17 @@ def handler(event, context, local=False): # Create results DataFrame result_df = pd.DataFrame(results_data) + # The UPRN is integer-valued, but the no-match rows append None, so the + # mixed column lands as float64 and would serialise as "100020933699.0". + # Coerce to a nullable integer so it round-trips as "100020933699" + # (empty when missing) — the form the finaliser and the combined-results + # UI expect. `to_numeric(errors="coerce")` also folds the + # "invalid postcode" sentinel + blanks to NA (read back as missing). + if "address2uprn_uprn" in result_df.columns: + result_df["address2uprn_uprn"] = pd.to_numeric( + result_df["address2uprn_uprn"], errors="coerce" + ).astype("Int64") + # Save results to S3 try: save_results_to_s3(result_df, str(task_id), str(subtask_id))