From 8bb90a5aa5beb495de799c481c2faa7899e6c5de Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 12:57:03 +0000 Subject: [PATCH] sanitisation of postcode --- backend/bulk_address2uprn_combiner/main.py | 14 +----- backend/ordnanceSurvey/main.py | 12 +---- domain/addresses/postcode_batching.py | 8 ++-- domain/addresses/user_address.py | 20 ++++---- domain/postcode.py | 40 ++++++++++++++++ domain/postcodes/__init__.py | 0 domain/postcodes/sanitise.py | 23 --------- .../user_address_csv_s3_repository.py | 5 +- .../user_address/user_address_repository.py | 4 +- .../addresses/test_postcode_batching.py | 17 ++++--- tests/domain/addresses/test_user_address.py | 48 +++++++++++-------- tests/domain/postcodes/__init__.py | 0 tests/domain/postcodes/test_sanitise.py | 28 ----------- tests/domain/test_postcode.py | 48 +++++++++++++++++++ .../test_user_address_csv_s3_repository.py | 11 +++-- 15 files changed, 153 insertions(+), 125 deletions(-) create mode 100644 domain/postcode.py delete mode 100644 domain/postcodes/__init__.py delete mode 100644 domain/postcodes/sanitise.py delete mode 100644 tests/domain/postcodes/__init__.py delete mode 100644 tests/domain/postcodes/test_sanitise.py create mode 100644 tests/domain/test_postcode.py diff --git a/backend/bulk_address2uprn_combiner/main.py b/backend/bulk_address2uprn_combiner/main.py index 37136e52..44f0b3f9 100644 --- a/backend/bulk_address2uprn_combiner/main.py +++ b/backend/bulk_address2uprn_combiner/main.py @@ -2,7 +2,7 @@ import os import boto3 import pandas as pd from io import BytesIO -from typing import Any, Optional +from typing import Any from uuid import UUID from datetime import datetime, timezone @@ -12,7 +12,6 @@ from backend.app.db.functions.bulk_address_uploads_functions import ( set_combined_output_s3_uri, set_combining_status, ) -from orchestration.task_orchestrator import TaskOrchestrator logger = setup_logger() @@ -36,16 +35,7 @@ def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame: @subtask_handler() -def handler( - body: dict[str, Any], - context: Any, - orchestrator: Optional[TaskOrchestrator] = None, -) -> str: - # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler - # decorator; unused here but accepted so the contract is uniform across - # callers (see issue #1103). - del orchestrator - +def handler(body: dict[str, Any], context: Any) -> str: task_id_str: str = body.get("task_id", "") if not task_id_str: diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index 18c4e2f2..6e82b468 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -16,7 +16,6 @@ from backend.ordnanceSurvey.helpers import ( os_places_results_to_dataframe, ) from backend.app.config import get_settings -from orchestration.task_orchestrator import TaskOrchestrator from sqlalchemy import select from datetime import datetime import uuid @@ -106,16 +105,7 @@ def save_results_to_s3( @subtask_handler() # This assumes task_id and subtask_id is defined in event.Records.body -def handler( - body: dict[str, Any], - context: Any, - orchestrator: Optional[TaskOrchestrator] = None, - local: bool = False, -) -> None: - # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler - # decorator; unused here but accepted so the contract is uniform across - # callers (see issue #1103). - del orchestrator +def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # delete this line after test # local = True diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index 209e0784..b73dc1bb 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -22,6 +22,7 @@ from __future__ import annotations from collections.abc import Iterable, Iterator from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode def iter_postcode_grouped_batches( @@ -75,13 +76,14 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( addresses: Iterable[UserAddress], -) -> dict[str, list[UserAddress]]: +) -> dict[Postcode, list[UserAddress]]: """Group addresses by ``postcode`` preserving first-seen order. Python dicts retain insertion order since 3.7, so a plain dict suffices - for the same effect as pandas ``groupby(..., sort=False)``. + for the same effect as pandas ``groupby(..., sort=False)``. ``Postcode`` + is a frozen value object, hence hashable and usable as the dict key. """ - groups: dict[str, list[UserAddress]] = {} + groups: dict[Postcode, list[UserAddress]] = {} for address in addresses: groups.setdefault(address.postcode, []).append(address) return groups diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py index 120a3659..672b2c54 100644 --- a/domain/addresses/user_address.py +++ b/domain/addresses/user_address.py @@ -1,9 +1,9 @@ """The :class:`UserAddress` value object. A frozen dataclass capturing the splitter's domain entity: the raw input -address line, a sanitised postcode, and an optional internal reference from -the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no -caller can construct an instance with an un-normalised postcode. +address line, a :class:`~domain.postcode.Postcode`, and an optional internal +reference from the customer dataset. The postcode is a value object that is +canonical by construction, so no caller can hold an un-normalised postcode. """ from __future__ import annotations @@ -11,7 +11,7 @@ from __future__ import annotations from dataclasses import dataclass, field from typing import Optional -from domain.postcodes.sanitise import sanitise_postcode +from domain.postcode import Postcode def _empty_source_row() -> dict[str, str]: @@ -25,9 +25,9 @@ class UserAddress: Attributes: user_address: The free-text address string as supplied upstream. - postcode: The postcode; always stored in canonical form - (uppercased, whitespace stripped). Sanitisation is enforced by - :meth:`__post_init__`. + postcode: The postcode as a :class:`~domain.postcode.Postcode` value + object -- canonical (uppercased, whitespace stripped) by + construction. internal_reference: Optional customer-side identifier preserved for traceability through the matching pipeline. source_row: The complete original CSV row this address was parsed @@ -39,12 +39,8 @@ class UserAddress: """ user_address: str - postcode: str + postcode: Postcode internal_reference: Optional[str] = None source_row: dict[str, str] = field( default_factory=_empty_source_row, compare=False ) - - def __post_init__(self) -> None: - # Frozen dataclass: bypass the descriptor with object.__setattr__. - object.__setattr__(self, "postcode", sanitise_postcode(self.postcode)) diff --git a/domain/postcode.py b/domain/postcode.py new file mode 100644 index 00000000..514e1a39 --- /dev/null +++ b/domain/postcode.py @@ -0,0 +1,40 @@ +"""The :class:`Postcode` value object. + +A frozen value object that owns postcode sanitisation. Constructing a +``Postcode`` always yields the canonical form -- uppercase with all +whitespace removed -- so no part of the domain can hold an un-normalised +postcode. This matches the legacy splitter's +``df["postcode"].str.upper().str.replace(" ", "")``. + +``Postcode`` is the single sanitisation point: anywhere a postcode crosses a +domain boundary it should be wrapped in one, and ``str(postcode)`` gives the +canonical string back for serialisation. +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Postcode: + """A postcode held in canonical form. + + The ``value`` passed to the constructor is sanitised eagerly in + :meth:`__post_init__` -- uppercased, with all whitespace (spaces, tabs, + newlines) removed -- so every ``Postcode`` instance is canonical by + construction. Two postcodes that differ only in surface whitespace or + case therefore compare equal. + + Attributes: + value: The canonical postcode string (e.g. ``"SW1A1AA"``). + """ + + value: str + + def __post_init__(self) -> None: + # Frozen dataclass: bypass the descriptor with object.__setattr__. + object.__setattr__(self, "value", "".join(self.value.split()).upper()) + + def __str__(self) -> str: + return self.value diff --git a/domain/postcodes/__init__.py b/domain/postcodes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/domain/postcodes/sanitise.py b/domain/postcodes/sanitise.py deleted file mode 100644 index 94b0dcf7..00000000 --- a/domain/postcodes/sanitise.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Canonical postcode sanitisation for the domain layer. - -The legacy postcode_splitter normalises postcodes inline with -``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes -that operation to a pure, reusable function so the same canonical form is -applied wherever a postcode crosses a domain boundary -- including -:class:`domain.addresses.user_address.UserAddress` construction and future -migrations. -""" - -from __future__ import annotations - - -def sanitise_postcode(s: str) -> str: - """Return the canonical form of a postcode. - - The canonical form is uppercase with all whitespace removed. This matches - the legacy splitter's ``str.upper().str.replace(" ", "")`` for the - overwhelmingly common case of space-separated postcodes (e.g. ``"sw1a 1aa"`` - becomes ``"SW1A1AA"``) while also tolerating tabs/newlines that can creep - in from CSV ingestion. - """ - return "".join(s.split()).upper() diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index 7cd10bac..2432d8e9 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -20,6 +20,7 @@ from datetime import datetime, timezone from typing import Optional from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_repository import UserAddressRepository @@ -77,7 +78,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): addresses.append( UserAddress( user_address=user_address, - postcode=postcode, + postcode=Postcode(postcode), internal_reference=internal_reference, source_row=row, ) @@ -96,7 +97,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): Returns the full ``s3://bucket/key`` URI. """ rows: list[dict[str, str]] = [ - {**addr.source_row, _POSTCODE_CLEAN_COLUMN: addr.postcode} + {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)} for addr in addresses ] filename = ( diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py index d8c12855..ab9b6671 100644 --- a/repositories/user_address/user_address_repository.py +++ b/repositories/user_address/user_address_repository.py @@ -17,8 +17,8 @@ class UserAddressRepository(ABC): Implementations choose the underlying storage (S3 CSV, Postgres, in-memory, ...) but must preserve the canonical column semantics: - the address text, postcode (sanitised by ``UserAddress.__post_init__``), - and an optional internal reference. + the address text, postcode (a :class:`~domain.postcode.Postcode` value + object), and an optional internal reference. """ @abstractmethod diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 2dac46cc..6e52b581 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -2,12 +2,15 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode def _addrs(postcode: str, n: int) -> list[UserAddress]: """Build ``n`` addresses sharing a postcode, with distinct address lines.""" return [ - UserAddress(user_address=f"{i} {postcode} Street", postcode=postcode) + UserAddress( + user_address=f"{i} {postcode} Street", postcode=Postcode(postcode) + ) for i in range(n) ] @@ -38,8 +41,8 @@ def test_flush_on_overflow_before_adding_next_postcode() -> None: addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3) batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) assert len(batches) == 2 - assert [a.postcode for a in batches[0]] == ["AA11AA"] * 3 - assert [a.postcode for a in batches[1]] == ["BB22BB"] * 3 + assert [str(a.postcode) for a in batches[0]] == ["AA11AA"] * 3 + assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 3 def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None: @@ -61,9 +64,9 @@ def test_oversize_group_flushes_existing_buffer_first() -> None: iter_postcode_grouped_batches(small + big + tail, max_batch_size=5) ) assert len(batches) == 3 - assert [a.postcode for a in batches[0]] == ["AA11AA", "AA11AA"] - assert [a.postcode for a in batches[1]] == ["BB22BB"] * 7 - assert [a.postcode for a in batches[2]] == ["CC33CC"] + assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"] + assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 7 + assert [str(a.postcode) for a in batches[2]] == ["CC33CC"] def test_final_flush_yields_remaining_buffer() -> None: @@ -80,7 +83,7 @@ def test_postcode_grouping_preserves_first_seen_order() -> None: b1, b2 = _addrs("AA1 1AA", 2) batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2])) assert len(batches) == 1 - assert [a.postcode for a in batches[0]] == [ + assert [str(a.postcode) for a in batches[0]] == [ "ZZ99ZZ", "ZZ99ZZ", "AA11AA", diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py index 4d8322da..fa44ad61 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_user_address.py @@ -3,69 +3,77 @@ import dataclasses import pytest from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode -def test_user_address_sanitises_postcode_on_construction() -> None: - addr = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") - assert addr.postcode == "SW1A1AA" +def test_user_address_holds_postcode_value_object() -> None: + addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + assert addr.postcode == Postcode("SW1A1AA") def test_user_address_preserves_user_address_verbatim() -> None: # The free-text user_address string is intentionally NOT normalised -- - # only the postcode is canonicalised at the boundary. - addr = UserAddress(user_address=" 1 The Street ", postcode="sw1a 1aa") + # only the postcode is canonicalised, and that happens inside Postcode. + addr = UserAddress( + user_address=" 1 The Street ", postcode=Postcode("SW1A1AA") + ) assert addr.user_address == " 1 The Street " def test_user_address_internal_reference_defaults_to_none() -> None: - addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) assert addr.internal_reference is None def test_user_address_internal_reference_accepted() -> None: addr = UserAddress( user_address="1 The Street", - postcode="SW1A1AA", + postcode=Postcode("SW1A1AA"), internal_reference="cust-42", ) assert addr.internal_reference == "cust-42" def test_user_address_is_frozen() -> None: - addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) with pytest.raises(dataclasses.FrozenInstanceError): - addr.postcode = "OTHER" # type: ignore[misc] + addr.postcode = Postcode("OTHER") # type: ignore[misc] -def test_user_address_equality_uses_sanitised_postcode() -> None: - # Two instances constructed with different surface forms of the same - # postcode must compare equal because sanitisation runs eagerly. - a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") - b = UserAddress(user_address="1 The Street", postcode="SW1A1AA") +def test_user_address_equality_uses_canonical_postcode() -> None: + # Postcode sanitises eagerly, so addresses built from different surface + # forms of the same postcode compare equal. + a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) assert a == b def test_user_address_source_row_defaults_to_empty_dict() -> None: - addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) assert addr.source_row == {} def test_user_address_carries_source_row() -> None: row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} addr = UserAddress( - user_address="1 The Street", postcode="SW1A 1AA", source_row=row + user_address="1 The Street", + postcode=Postcode("SW1A 1AA"), + source_row=row, ) assert addr.source_row == row def test_user_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays - # defined by the parsed fields, so two addresses parsed from rows with - # different incidental columns still compare equal. + # defined by the parsed fields. a = UserAddress( - user_address="1 The Street", postcode="SW1A1AA", source_row={"x": "1"} + user_address="1 The Street", + postcode=Postcode("SW1A1AA"), + source_row={"x": "1"}, ) b = UserAddress( - user_address="1 The Street", postcode="SW1A1AA", source_row={"y": "2"} + user_address="1 The Street", + postcode=Postcode("SW1A1AA"), + source_row={"y": "2"}, ) assert a == b diff --git a/tests/domain/postcodes/__init__.py b/tests/domain/postcodes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/domain/postcodes/test_sanitise.py b/tests/domain/postcodes/test_sanitise.py deleted file mode 100644 index edd1679c..00000000 --- a/tests/domain/postcodes/test_sanitise.py +++ /dev/null @@ -1,28 +0,0 @@ -from domain.postcodes.sanitise import sanitise_postcode - - -def test_sanitise_uppercases() -> None: - assert sanitise_postcode("sw1a1aa") == "SW1A1AA" - - -def test_sanitise_strips_internal_spaces() -> None: - assert sanitise_postcode("sw1a 1aa") == "SW1A1AA" - - -def test_sanitise_strips_leading_and_trailing_whitespace() -> None: - assert sanitise_postcode(" sw1a 1aa ") == "SW1A1AA" - - -def test_sanitise_strips_tabs_and_newlines() -> None: - # CSV ingestion occasionally introduces stray whitespace characters; the - # canonical form must absorb them just like literal spaces. - assert sanitise_postcode("sw1a\t1aa\n") == "SW1A1AA" - - -def test_sanitise_already_canonical_is_idempotent() -> None: - assert sanitise_postcode("SW1A1AA") == "SW1A1AA" - assert sanitise_postcode(sanitise_postcode("sw1a 1aa")) == "SW1A1AA" - - -def test_sanitise_empty_string() -> None: - assert sanitise_postcode("") == "" diff --git a/tests/domain/test_postcode.py b/tests/domain/test_postcode.py new file mode 100644 index 00000000..89d5cdc8 --- /dev/null +++ b/tests/domain/test_postcode.py @@ -0,0 +1,48 @@ +import dataclasses + +import pytest + +from domain.postcode import Postcode + + +def test_postcode_uppercases() -> None: + assert Postcode("sw1a1aa").value == "SW1A1AA" + + +def test_postcode_strips_internal_spaces() -> None: + assert Postcode("sw1a 1aa").value == "SW1A1AA" + + +def test_postcode_strips_leading_and_trailing_whitespace() -> None: + assert Postcode(" sw1a 1aa ").value == "SW1A1AA" + + +def test_postcode_strips_tabs_and_newlines() -> None: + # CSV ingestion occasionally introduces stray whitespace characters; the + # canonical form must absorb them just like literal spaces. + assert Postcode("sw1a\t1aa\n").value == "SW1A1AA" + + +def test_postcode_construction_is_idempotent() -> None: + once = Postcode("sw1a 1aa") + assert Postcode(once.value).value == "SW1A1AA" + + +def test_postcode_empty_string() -> None: + assert Postcode("").value == "" + + +def test_postcode_str_returns_canonical_value() -> None: + assert str(Postcode("sw1a 1aa")) == "SW1A1AA" + + +def test_postcode_equality_ignores_surface_form() -> None: + # Differing case / whitespace sanitise to the same canonical value, so + # the value objects compare equal. + assert Postcode("sw1a 1aa") == Postcode("SW1A1AA") + + +def test_postcode_is_frozen() -> None: + postcode = Postcode("SW1A1AA") + with pytest.raises(dataclasses.FrozenInstanceError): + postcode.value = "OTHER" # type: ignore[misc] diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py index 48733b55..c1acee32 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -4,6 +4,7 @@ import pytest from moto import mock_aws from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( UserAddressCsvS3Repository, @@ -47,7 +48,7 @@ def test_load_batch_parses_address_postcode_and_reference( assert len(addresses) == 1 address = addresses[0] assert address.user_address == "1 High Street, Flat 2, Townville" - assert address.postcode == "SW1A1AA" + assert address.postcode == Postcode("SW1A1AA") assert address.internal_reference == "REF-001" @@ -69,7 +70,7 @@ def test_load_batch_uses_only_address_1_when_others_missing( assert len(addresses) == 1 assert addresses[0].user_address == "10 Cardiff Road" - assert addresses[0].postcode == "CF101AA" + assert addresses[0].postcode == Postcode("CF101AA") assert addresses[0].internal_reference == "REF-002" @@ -91,7 +92,7 @@ def test_load_batch_handles_missing_internal_reference( assert len(addresses) == 1 assert addresses[0].user_address == "5 Park Lane" - assert addresses[0].postcode == "M11AA" + assert addresses[0].postcode == Postcode("M11AA") assert addresses[0].internal_reference is None @@ -154,7 +155,7 @@ def test_save_batch_returns_uri_under_path_prefix( addresses = [ UserAddress( user_address="1 High Street", - postcode="SW1A 1AA", + postcode=Postcode("SW1A 1AA"), source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, ), ] @@ -199,7 +200,7 @@ def test_save_batch_uses_unique_filename_per_call( addresses = [ UserAddress( user_address="1 High Street", - postcode="SW1A 1AA", + postcode=Postcode("SW1A 1AA"), source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, ), ]