sanitisation of postcode

2026-07-27 23:35:01 +00:00 · 2026-05-20 12:57:03 +00:00 · 2026-05-20 12:57:03 +00:00 · 8bb90a5aa5
commit 8bb90a5aa5
parent 914a8ed51e
15 changed files with 153 additions and 125 deletions
--- a/backend/bulk_address2uprn_combiner/main.py
+++ b/backend/bulk_address2uprn_combiner/main.py
@ -2,7 +2,7 @@ import os
 import boto3
 import pandas as pd
 from io import BytesIO
-from typing import Any, Optional
+from typing import Any
 from uuid import UUID
 from datetime import datetime, timezone

@ -12,7 +12,6 @@ from backend.app.db.functions.bulk_address_uploads_functions import (
    set_combined_output_s3_uri,
    set_combining_status,
 )
-from orchestration.task_orchestrator import TaskOrchestrator

 logger = setup_logger()

@ -36,16 +35,7 @@ def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame:


@subtask_handler()
-def handler(
-    body: dict[str, Any],
-    context: Any,
-    orchestrator: Optional[TaskOrchestrator] = None,
-) -> str:
-    # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler
-    # decorator; unused here but accepted so the contract is uniform across
-    # callers (see issue #1103).
-    del orchestrator
-
+def handler(body: dict[str, Any], context: Any) -> str:
    task_id_str: str = body.get("task_id", "")

    if not task_id_str:
--- a/backend/ordnanceSurvey/main.py
+++ b/backend/ordnanceSurvey/main.py
@ -16,7 +16,6 @@ from backend.ordnanceSurvey.helpers import (
    os_places_results_to_dataframe,
 )
 from backend.app.config import get_settings
-from orchestration.task_orchestrator import TaskOrchestrator
 from sqlalchemy import select
 from datetime import datetime
 import uuid
@ -106,16 +105,7 @@ def save_results_to_s3(


@subtask_handler()  # This assumes task_id and subtask_id is defined in event.Records.body
-def handler(
-    body: dict[str, Any],
-    context: Any,
-    orchestrator: Optional[TaskOrchestrator] = None,
-    local: bool = False,
-) -> None:
-    # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler
-    # decorator; unused here but accepted so the contract is uniform across
-    # callers (see issue #1103).
-    del orchestrator
+def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:

    # delete this line after test
    # local = True
--- a/domain/addresses/postcode_batching.py
+++ b/domain/addresses/postcode_batching.py
@ -22,6 +22,7 @@ from __future__ import annotations
 from collections.abc import Iterable, Iterator

 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode


 def iter_postcode_grouped_batches(
@ -75,13 +76,14 @@ def iter_postcode_grouped_batches(

 def _group_by_postcode_in_order(
    addresses: Iterable[UserAddress],
-) -> dict[str, list[UserAddress]]:
+) -> dict[Postcode, list[UserAddress]]:
    """Group addresses by ``postcode`` preserving first-seen order.

    Python dicts retain insertion order since 3.7, so a plain dict suffices
-    for the same effect as pandas ``groupby(..., sort=False)``.
+    for the same effect as pandas ``groupby(..., sort=False)``. ``Postcode``
+    is a frozen value object, hence hashable and usable as the dict key.
    """
-    groups: dict[str, list[UserAddress]] = {}
+    groups: dict[Postcode, list[UserAddress]] = {}
    for address in addresses:
        groups.setdefault(address.postcode, []).append(address)
    return groups
--- a/domain/addresses/user_address.py
+++ b/domain/addresses/user_address.py
@ -1,9 +1,9 @@
 """The :class:`UserAddress` value object.

 A frozen dataclass capturing the splitter's domain entity: the raw input
-address line, a sanitised postcode, and an optional internal reference from
-the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no
-caller can construct an instance with an un-normalised postcode.
+address line, a :class:`~domain.postcode.Postcode`, and an optional internal
+reference from the customer dataset. The postcode is a value object that is
+canonical by construction, so no caller can hold an un-normalised postcode.
 """

 from __future__ import annotations
@ -11,7 +11,7 @@ from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Optional

-from domain.postcodes.sanitise import sanitise_postcode
+from domain.postcode import Postcode


 def _empty_source_row() -> dict[str, str]:
@ -25,9 +25,9 @@ class UserAddress:

    Attributes:
        user_address: The free-text address string as supplied upstream.
-        postcode: The postcode; always stored in canonical form
-            (uppercased, whitespace stripped). Sanitisation is enforced by
-            :meth:`__post_init__`.
+        postcode: The postcode as a :class:`~domain.postcode.Postcode` value
+            object -- canonical (uppercased, whitespace stripped) by
+            construction.
        internal_reference: Optional customer-side identifier preserved for
            traceability through the matching pipeline.
        source_row: The complete original CSV row this address was parsed
@ -39,12 +39,8 @@ class UserAddress:
    """

    user_address: str
-    postcode: str
+    postcode: Postcode
    internal_reference: Optional[str] = None
    source_row: dict[str, str] = field(
        default_factory=_empty_source_row, compare=False
    )
-
-    def __post_init__(self) -> None:
-        # Frozen dataclass: bypass the descriptor with object.__setattr__.
-        object.__setattr__(self, "postcode", sanitise_postcode(self.postcode))
--- a/domain/postcode.py
+++ b/domain/postcode.py
@ -0,0 +1,40 @@
+"""The :class:`Postcode` value object.
+
+A frozen value object that owns postcode sanitisation. Constructing a
+``Postcode`` always yields the canonical form -- uppercase with all
+whitespace removed -- so no part of the domain can hold an un-normalised
+postcode. This matches the legacy splitter's
+``df["postcode"].str.upper().str.replace(" ", "")``.
+
+``Postcode`` is the single sanitisation point: anywhere a postcode crosses a
+domain boundary it should be wrapped in one, and ``str(postcode)`` gives the
+canonical string back for serialisation.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Postcode:
+    """A postcode held in canonical form.
+
+    The ``value`` passed to the constructor is sanitised eagerly in
+    :meth:`__post_init__` -- uppercased, with all whitespace (spaces, tabs,
+    newlines) removed -- so every ``Postcode`` instance is canonical by
+    construction. Two postcodes that differ only in surface whitespace or
+    case therefore compare equal.
+
+    Attributes:
+        value: The canonical postcode string (e.g. ``"SW1A1AA"``).
+    """
+
+    value: str
+
+    def __post_init__(self) -> None:
+        # Frozen dataclass: bypass the descriptor with object.__setattr__.
+        object.__setattr__(self, "value", "".join(self.value.split()).upper())
+
+    def __str__(self) -> str:
+        return self.value
--- a/domain/postcodes/init.py
+++ b/domain/postcodes/init.py
--- a/domain/postcodes/sanitise.py
+++ b/domain/postcodes/sanitise.py
@ -1,23 +0,0 @@
-"""Canonical postcode sanitisation for the domain layer.
-
-The legacy postcode_splitter normalises postcodes inline with
-``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes
-that operation to a pure, reusable function so the same canonical form is
-applied wherever a postcode crosses a domain boundary -- including
-:class:`domain.addresses.user_address.UserAddress` construction and future
-migrations.
-"""
-
-from __future__ import annotations
-
-
-def sanitise_postcode(s: str) -> str:
-    """Return the canonical form of a postcode.
-
-    The canonical form is uppercase with all whitespace removed. This matches
-    the legacy splitter's ``str.upper().str.replace(" ", "")`` for the
-    overwhelmingly common case of space-separated postcodes (e.g. ``"sw1a 1aa"``
-    becomes ``"SW1A1AA"``) while also tolerating tabs/newlines that can creep
-    in from CSV ingestion.
-    """
-    return "".join(s.split()).upper()
--- a/repositories/user_address/user_address_csv_s3_repository.py
+++ b/repositories/user_address/user_address_csv_s3_repository.py
@ -20,6 +20,7 @@ from datetime import datetime, timezone
 from typing import Optional

 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode
 from infrastructure.csv_s3_client import CsvS3Client
 from repositories.user_address.user_address_repository import UserAddressRepository

@ -77,7 +78,7 @@ class UserAddressCsvS3Repository(UserAddressRepository):
            addresses.append(
                UserAddress(
                    user_address=user_address,
-                    postcode=postcode,
+                    postcode=Postcode(postcode),
                    internal_reference=internal_reference,
                    source_row=row,
                )
@ -96,7 +97,7 @@ class UserAddressCsvS3Repository(UserAddressRepository):
        Returns the full ``s3://bucket/key`` URI.
        """
        rows: list[dict[str, str]] = [
-            {**addr.source_row, _POSTCODE_CLEAN_COLUMN: addr.postcode}
+            {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)}
            for addr in addresses
        ]
        filename = (
--- a/repositories/user_address/user_address_repository.py
+++ b/repositories/user_address/user_address_repository.py
@ -17,8 +17,8 @@ class UserAddressRepository(ABC):

    Implementations choose the underlying storage (S3 CSV, Postgres,
    in-memory, ...) but must preserve the canonical column semantics:
-    the address text, postcode (sanitised by ``UserAddress.__post_init__``),
-    and an optional internal reference.
+    the address text, postcode (a :class:`~domain.postcode.Postcode` value
+    object), and an optional internal reference.
    """

    @abstractmethod
--- a/tests/domain/addresses/test_postcode_batching.py
+++ b/tests/domain/addresses/test_postcode_batching.py
@ -2,12 +2,15 @@ import pytest

 from domain.addresses.postcode_batching import iter_postcode_grouped_batches
 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode


 def _addrs(postcode: str, n: int) -> list[UserAddress]:
    """Build ``n`` addresses sharing a postcode, with distinct address lines."""
    return [
-        UserAddress(user_address=f"{i} {postcode} Street", postcode=postcode)
+        UserAddress(
+            user_address=f"{i} {postcode} Street", postcode=Postcode(postcode)
+        )
        for i in range(n)
    ]

@ -38,8 +41,8 @@ def test_flush_on_overflow_before_adding_next_postcode() -> None:
    addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3)
    batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5))
    assert len(batches) == 2
-    assert [a.postcode for a in batches[0]] == ["AA11AA"] * 3
-    assert [a.postcode for a in batches[1]] == ["BB22BB"] * 3
+    assert [str(a.postcode) for a in batches[0]] == ["AA11AA"] * 3
+    assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 3


 def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None:
@ -61,9 +64,9 @@ def test_oversize_group_flushes_existing_buffer_first() -> None:
        iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)
    )
    assert len(batches) == 3
-    assert [a.postcode for a in batches[0]] == ["AA11AA", "AA11AA"]
-    assert [a.postcode for a in batches[1]] == ["BB22BB"] * 7
-    assert [a.postcode for a in batches[2]] == ["CC33CC"]
+    assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"]
+    assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 7
+    assert [str(a.postcode) for a in batches[2]] == ["CC33CC"]


 def test_final_flush_yields_remaining_buffer() -> None:
@ -80,7 +83,7 @@ def test_postcode_grouping_preserves_first_seen_order() -> None:
    b1, b2 = _addrs("AA1 1AA", 2)
    batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2]))
    assert len(batches) == 1
-    assert [a.postcode for a in batches[0]] == [
+    assert [str(a.postcode) for a in batches[0]] == [
        "ZZ99ZZ",
        "ZZ99ZZ",
        "AA11AA",
--- a/tests/domain/addresses/test_user_address.py
+++ b/tests/domain/addresses/test_user_address.py
@ -3,69 +3,77 @@ import dataclasses
 import pytest

 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode


-def test_user_address_sanitises_postcode_on_construction() -> None:
-    addr = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
-    assert addr.postcode == "SW1A1AA"
+def test_user_address_holds_postcode_value_object() -> None:
+    addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
+    assert addr.postcode == Postcode("SW1A1AA")


 def test_user_address_preserves_user_address_verbatim() -> None:
    # The free-text user_address string is intentionally NOT normalised --
-    # only the postcode is canonicalised at the boundary.
-    addr = UserAddress(user_address="  1 The   Street  ", postcode="sw1a 1aa")
+    # only the postcode is canonicalised, and that happens inside Postcode.
+    addr = UserAddress(
+        user_address="  1 The   Street  ", postcode=Postcode("SW1A1AA")
+    )
    assert addr.user_address == "  1 The   Street  "


 def test_user_address_internal_reference_defaults_to_none() -> None:
-    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
    assert addr.internal_reference is None


 def test_user_address_internal_reference_accepted() -> None:
    addr = UserAddress(
        user_address="1 The Street",
-        postcode="SW1A1AA",
+        postcode=Postcode("SW1A1AA"),
        internal_reference="cust-42",
    )
    assert addr.internal_reference == "cust-42"


 def test_user_address_is_frozen() -> None:
-    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
    with pytest.raises(dataclasses.FrozenInstanceError):
-        addr.postcode = "OTHER"  # type: ignore[misc]
+        addr.postcode = Postcode("OTHER")  # type: ignore[misc]


-def test_user_address_equality_uses_sanitised_postcode() -> None:
-    # Two instances constructed with different surface forms of the same
-    # postcode must compare equal because sanitisation runs eagerly.
-    a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
-    b = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+def test_user_address_equality_uses_canonical_postcode() -> None:
+    # Postcode sanitises eagerly, so addresses built from different surface
+    # forms of the same postcode compare equal.
+    a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
+    b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
    assert a == b


 def test_user_address_source_row_defaults_to_empty_dict() -> None:
-    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
    assert addr.source_row == {}


 def test_user_address_carries_source_row() -> None:
    row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
    addr = UserAddress(
-        user_address="1 The Street", postcode="SW1A 1AA", source_row=row
+        user_address="1 The Street",
+        postcode=Postcode("SW1A 1AA"),
+        source_row=row,
    )
    assert addr.source_row == row


 def test_user_address_equality_ignores_source_row() -> None:
    # source_row is excluded from equality (and hashing): identity stays
-    # defined by the parsed fields, so two addresses parsed from rows with
-    # different incidental columns still compare equal.
+    # defined by the parsed fields.
    a = UserAddress(
-        user_address="1 The Street", postcode="SW1A1AA", source_row={"x": "1"}
+        user_address="1 The Street",
+        postcode=Postcode("SW1A1AA"),
+        source_row={"x": "1"},
    )
    b = UserAddress(
-        user_address="1 The Street", postcode="SW1A1AA", source_row={"y": "2"}
+        user_address="1 The Street",
+        postcode=Postcode("SW1A1AA"),
+        source_row={"y": "2"},
    )
    assert a == b
--- a/tests/domain/postcodes/init.py
+++ b/tests/domain/postcodes/init.py
--- a/tests/domain/postcodes/test_sanitise.py
+++ b/tests/domain/postcodes/test_sanitise.py
@ -1,28 +0,0 @@
-from domain.postcodes.sanitise import sanitise_postcode
-
-
-def test_sanitise_uppercases() -> None:
-    assert sanitise_postcode("sw1a1aa") == "SW1A1AA"
-
-
-def test_sanitise_strips_internal_spaces() -> None:
-    assert sanitise_postcode("sw1a 1aa") == "SW1A1AA"
-
-
-def test_sanitise_strips_leading_and_trailing_whitespace() -> None:
-    assert sanitise_postcode("  sw1a 1aa  ") == "SW1A1AA"
-
-
-def test_sanitise_strips_tabs_and_newlines() -> None:
-    # CSV ingestion occasionally introduces stray whitespace characters; the
-    # canonical form must absorb them just like literal spaces.
-    assert sanitise_postcode("sw1a\t1aa\n") == "SW1A1AA"
-
-
-def test_sanitise_already_canonical_is_idempotent() -> None:
-    assert sanitise_postcode("SW1A1AA") == "SW1A1AA"
-    assert sanitise_postcode(sanitise_postcode("sw1a 1aa")) == "SW1A1AA"
-
-
-def test_sanitise_empty_string() -> None:
-    assert sanitise_postcode("") == ""
--- a/tests/domain/test_postcode.py
+++ b/tests/domain/test_postcode.py
@ -0,0 +1,48 @@
+import dataclasses
+
+import pytest
+
+from domain.postcode import Postcode
+
+
+def test_postcode_uppercases() -> None:
+    assert Postcode("sw1a1aa").value == "SW1A1AA"
+
+
+def test_postcode_strips_internal_spaces() -> None:
+    assert Postcode("sw1a 1aa").value == "SW1A1AA"
+
+
+def test_postcode_strips_leading_and_trailing_whitespace() -> None:
+    assert Postcode("  sw1a 1aa  ").value == "SW1A1AA"
+
+
+def test_postcode_strips_tabs_and_newlines() -> None:
+    # CSV ingestion occasionally introduces stray whitespace characters; the
+    # canonical form must absorb them just like literal spaces.
+    assert Postcode("sw1a\t1aa\n").value == "SW1A1AA"
+
+
+def test_postcode_construction_is_idempotent() -> None:
+    once = Postcode("sw1a 1aa")
+    assert Postcode(once.value).value == "SW1A1AA"
+
+
+def test_postcode_empty_string() -> None:
+    assert Postcode("").value == ""
+
+
+def test_postcode_str_returns_canonical_value() -> None:
+    assert str(Postcode("sw1a 1aa")) == "SW1A1AA"
+
+
+def test_postcode_equality_ignores_surface_form() -> None:
+    # Differing case / whitespace sanitise to the same canonical value, so
+    # the value objects compare equal.
+    assert Postcode("sw1a 1aa") == Postcode("SW1A1AA")
+
+
+def test_postcode_is_frozen() -> None:
+    postcode = Postcode("SW1A1AA")
+    with pytest.raises(dataclasses.FrozenInstanceError):
+        postcode.value = "OTHER"  # type: ignore[misc]
--- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py
+++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
@ -4,6 +4,7 @@ import pytest
 from moto import mock_aws

 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode
 from infrastructure.csv_s3_client import CsvS3Client
 from repositories.user_address.user_address_csv_s3_repository import (
    UserAddressCsvS3Repository,
@ -47,7 +48,7 @@ def test_load_batch_parses_address_postcode_and_reference(
    assert len(addresses) == 1
    address = addresses[0]
    assert address.user_address == "1 High Street, Flat 2, Townville"
-    assert address.postcode == "SW1A1AA"
+    assert address.postcode == Postcode("SW1A1AA")
    assert address.internal_reference == "REF-001"


@ -69,7 +70,7 @@ def test_load_batch_uses_only_address_1_when_others_missing(

    assert len(addresses) == 1
    assert addresses[0].user_address == "10 Cardiff Road"
-    assert addresses[0].postcode == "CF101AA"
+    assert addresses[0].postcode == Postcode("CF101AA")
    assert addresses[0].internal_reference == "REF-002"


@ -91,7 +92,7 @@ def test_load_batch_handles_missing_internal_reference(

    assert len(addresses) == 1
    assert addresses[0].user_address == "5 Park Lane"
-    assert addresses[0].postcode == "M11AA"
+    assert addresses[0].postcode == Postcode("M11AA")
    assert addresses[0].internal_reference is None


@ -154,7 +155,7 @@ def test_save_batch_returns_uri_under_path_prefix(
    addresses = [
        UserAddress(
            user_address="1 High Street",
-            postcode="SW1A 1AA",
+            postcode=Postcode("SW1A 1AA"),
            source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
        ),
    ]
@ -199,7 +200,7 @@ def test_save_batch_uses_unique_filename_per_call(
    addresses = [
        UserAddress(
            user_address="1 High Street",
-            postcode="SW1A 1AA",
+            postcode=Postcode("SW1A 1AA"),
            source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
        ),
    ]