Merge branch 'main' into feature/handle-new-magicplan-response-structure

This commit is contained in:
Daniel Roth 2026-06-08 09:57:26 +00:00
commit c22ee3821b
4 changed files with 83 additions and 10 deletions

View file

@ -510,6 +510,17 @@ def handler(event, context, local=False):
# Create results DataFrame
result_df = pd.DataFrame(results_data)
# The UPRN is integer-valued, but the no-match rows append None, so the
# mixed column lands as float64 and would serialise as "100020933699.0".
# Coerce to a nullable integer so it round-trips as "100020933699"
# (empty when missing) — the form the finaliser and the combined-results
# UI expect. `to_numeric(errors="coerce")` also folds the
# "invalid postcode" sentinel + blanks to NA (read back as missing).
if "address2uprn_uprn" in result_df.columns:
result_df["address2uprn_uprn"] = pd.to_numeric(
result_df["address2uprn_uprn"], errors="coerce"
).astype("Int64")
# Save results to S3
try:
save_results_to_s3(result_df, str(task_id), str(subtask_id))

View file

@ -572,8 +572,9 @@ module "bulk_upload_finaliser_registry" {
stage = var.stage
}
# The finaliser only reads the combiner output (bulk_final_outputs) to insert
# property rows; it writes to Postgres, not S3.
# The finaliser reads the combiner output (bulk_final_outputs) to insert property
# rows, and for v2 (ADR-0006) the classifier CSV (bulk_onboarding_inputs) to
# populate property_overrides. It writes to Postgres, not S3.
module "bulk_upload_finaliser_s3_read" {
source = "../modules/s3_iam_policy"
@ -581,7 +582,7 @@ module "bulk_upload_finaliser_s3_read" {
policy_description = "Allow bulk_upload_finaliser Lambda to read combiner output from retrofit-data bucket"
bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"]
actions = ["s3:GetObject", "s3:ListBucket"]
resource_paths = ["/bulk_final_outputs/*"]
resource_paths = ["/bulk_final_outputs/*", "/bulk_onboarding_inputs/*"]
}
output "bulk_upload_finaliser_s3_read_arn" {

View file

@ -107,8 +107,18 @@ def _parse_uprn(raw: Any) -> Optional[int]:
return None
try:
return int(val)
except ValueError:
pass
# The combiner writes the UPRN column via pandas; when any row in the batch is
# unmatched (NaN), pandas coerces the whole column to float64, so a UPRN
# arrives as e.g. "100020933699.0". Recover the integer from that float form
# (otherwise every matched row in a mixed batch parses to None — the cause of
# NULL property.uprn and empty property_overrides for matched rows).
try:
as_float = float(val)
except ValueError:
return None
return int(as_float) if as_float.is_integer() else None
def _parse_lexiscore(raw: Any) -> Optional[float]:
@ -189,15 +199,22 @@ class BulkUploadFinaliserOrchestrator:
multi_entry_ordering: Optional[dict[str, list[int]]],
column_mapping: Optional[dict[str, str]],
) -> None:
# Nothing to do without the classifier inputs or the override
# collaborators (e.g. an upload with no classifier columns mapped).
# Nothing to do without the override collaborators (the v1 property-only
# path / its tests construct without them).
if (
not classifier_rows
or not column_mapping
or self._property_override_repo is None
self._property_override_repo is None
or self._landlord_override_reader is None
):
return
# Log WHY when we write nothing, so an empty property_overrides is never a
# silent mystery (e.g. an upload whose CSVs predate the source_row_id join
# key, or one with no classifier columns mapped).
if not classifier_rows:
logger.info("Finalise: no classifier CSV rows; no property_overrides.")
return
if not column_mapping:
logger.info("Finalise: empty column_mapping; no property_overrides.")
return
override_rows = self._build_overrides(
combiner_rows,
classifier_rows,
@ -205,7 +222,8 @@ class BulkUploadFinaliserOrchestrator:
column_mapping,
portfolio_id,
)
self._property_override_repo.upsert_all(override_rows)
affected = self._property_override_repo.upsert_all(override_rows)
logger.info("Finalise: upserted %d property_overrides.", affected)
def _build_overrides(
self,
@ -224,12 +242,33 @@ class BulkUploadFinaliserOrchestrator:
# UPRN-matched combiner rows: source_row_id -> uprn.
uprn_by_row_id: dict[str, int] = {}
combiner_with_srid = 0
combiner_with_uprn = 0
for row in combiner_rows:
row_id = _normalize(row.get(SOURCE_ROW_ID_COL))
uprn = _parse_uprn(row.get(UPRN_COL))
if row_id:
combiner_with_srid += 1
if uprn is not None:
combiner_with_uprn += 1
if row_id and uprn is not None:
uprn_by_row_id[row_id] = uprn
if not uprn_by_row_id:
classifier_with_srid = sum(
1 for r in classifier_rows if _normalize(r.get(SOURCE_ROW_ID_COL))
)
logger.warning(
"Finalise: 0 override candidates — %d combiner rows (%d with "
"source_row_id, %d with a UPRN), %d/%d classifier rows with "
"source_row_id. If source_row_id counts are 0, the upload's CSVs "
"predate the join-key change (ADR-0006) — re-run from "
"start-address-matching so both CSVs carry it.",
len(combiner_rows),
combiner_with_srid,
combiner_with_uprn,
classifier_with_srid,
len(classifier_rows),
)
return []
ids_by_uprn = self._property_repo.ids_by_uprn(

View file

@ -135,6 +135,27 @@ def test_finalise_falls_back_to_user_input_when_unmatched() -> None:
assert row.lexiscore is None
def test_finalise_parses_pandas_float_uprn() -> None:
# The combiner writes UPRNs as float strings ("100023.0") whenever the batch
# has any unmatched rows; the finaliser must still recover the integer, else
# matched properties get NULL uprn and no overrides.
orchestrator, prop, _status = _orchestrator()
rows = [
{
"Address 1": "1 Some Street",
"postcode": "A0 0AA",
"address2uprn_uprn": "100023.0",
"address2uprn_address": "1 SOME STREET, TOWN, A0 0AA",
"address2uprn_lexiscore": "0.9",
},
]
orchestrator.finalise(rows, portfolio_id=7, task_id=uuid4())
(row,) = prop.inserted
assert row.uprn == 100023
# --- v2: property_overrides (ADR-0006) ------------------------------------
@ -158,7 +179,8 @@ def test_finalise_writes_overrides_for_uprn_rows_splitting_by_part() -> None:
combiner = [
{
"Address 1": "1 Some Street",
"address2uprn_uprn": "100023",
# pandas float form, as the combiner actually writes it in a mixed batch.
"address2uprn_uprn": "100023.0",
"source_row_id": "row-a",
},
# A no-UPRN row: must get NO overrides (deferred to v3).