sanitisation of postcode

This commit is contained in:
Jun-te Kim 2026-05-20 12:57:03 +00:00
parent 914a8ed51e
commit 8bb90a5aa5
15 changed files with 153 additions and 125 deletions

View file

@ -2,7 +2,7 @@ import os
import boto3
import pandas as pd
from io import BytesIO
from typing import Any, Optional
from typing import Any
from uuid import UUID
from datetime import datetime, timezone
@ -12,7 +12,6 @@ from backend.app.db.functions.bulk_address_uploads_functions import (
set_combined_output_s3_uri,
set_combining_status,
)
from orchestration.task_orchestrator import TaskOrchestrator
logger = setup_logger()
@ -36,16 +35,7 @@ def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame:
@subtask_handler()
def handler(
body: dict[str, Any],
context: Any,
orchestrator: Optional[TaskOrchestrator] = None,
) -> str:
# `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler
# decorator; unused here but accepted so the contract is uniform across
# callers (see issue #1103).
del orchestrator
def handler(body: dict[str, Any], context: Any) -> str:
task_id_str: str = body.get("task_id", "")
if not task_id_str:

View file

@ -16,7 +16,6 @@ from backend.ordnanceSurvey.helpers import (
os_places_results_to_dataframe,
)
from backend.app.config import get_settings
from orchestration.task_orchestrator import TaskOrchestrator
from sqlalchemy import select
from datetime import datetime
import uuid
@ -106,16 +105,7 @@ def save_results_to_s3(
@subtask_handler() # This assumes task_id and subtask_id is defined in event.Records.body
def handler(
body: dict[str, Any],
context: Any,
orchestrator: Optional[TaskOrchestrator] = None,
local: bool = False,
) -> None:
# `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler
# decorator; unused here but accepted so the contract is uniform across
# callers (see issue #1103).
del orchestrator
def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
# delete this line after test
# local = True

View file

@ -22,6 +22,7 @@ from __future__ import annotations
from collections.abc import Iterable, Iterator
from domain.addresses.user_address import UserAddress
from domain.postcode import Postcode
def iter_postcode_grouped_batches(
@ -75,13 +76,14 @@ def iter_postcode_grouped_batches(
def _group_by_postcode_in_order(
addresses: Iterable[UserAddress],
) -> dict[str, list[UserAddress]]:
) -> dict[Postcode, list[UserAddress]]:
"""Group addresses by ``postcode`` preserving first-seen order.
Python dicts retain insertion order since 3.7, so a plain dict suffices
for the same effect as pandas ``groupby(..., sort=False)``.
for the same effect as pandas ``groupby(..., sort=False)``. ``Postcode``
is a frozen value object, hence hashable and usable as the dict key.
"""
groups: dict[str, list[UserAddress]] = {}
groups: dict[Postcode, list[UserAddress]] = {}
for address in addresses:
groups.setdefault(address.postcode, []).append(address)
return groups

View file

@ -1,9 +1,9 @@
"""The :class:`UserAddress` value object.
A frozen dataclass capturing the splitter's domain entity: the raw input
address line, a sanitised postcode, and an optional internal reference from
the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no
caller can construct an instance with an un-normalised postcode.
address line, a :class:`~domain.postcode.Postcode`, and an optional internal
reference from the customer dataset. The postcode is a value object that is
canonical by construction, so no caller can hold an un-normalised postcode.
"""
from __future__ import annotations
@ -11,7 +11,7 @@ from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional
from domain.postcodes.sanitise import sanitise_postcode
from domain.postcode import Postcode
def _empty_source_row() -> dict[str, str]:
@ -25,9 +25,9 @@ class UserAddress:
Attributes:
user_address: The free-text address string as supplied upstream.
postcode: The postcode; always stored in canonical form
(uppercased, whitespace stripped). Sanitisation is enforced by
:meth:`__post_init__`.
postcode: The postcode as a :class:`~domain.postcode.Postcode` value
object -- canonical (uppercased, whitespace stripped) by
construction.
internal_reference: Optional customer-side identifier preserved for
traceability through the matching pipeline.
source_row: The complete original CSV row this address was parsed
@ -39,12 +39,8 @@ class UserAddress:
"""
user_address: str
postcode: str
postcode: Postcode
internal_reference: Optional[str] = None
source_row: dict[str, str] = field(
default_factory=_empty_source_row, compare=False
)
def __post_init__(self) -> None:
# Frozen dataclass: bypass the descriptor with object.__setattr__.
object.__setattr__(self, "postcode", sanitise_postcode(self.postcode))

40
domain/postcode.py Normal file
View file

@ -0,0 +1,40 @@
"""The :class:`Postcode` value object.
A frozen value object that owns postcode sanitisation. Constructing a
``Postcode`` always yields the canonical form -- uppercase with all
whitespace removed -- so no part of the domain can hold an un-normalised
postcode. This matches the legacy splitter's
``df["postcode"].str.upper().str.replace(" ", "")``.
``Postcode`` is the single sanitisation point: anywhere a postcode crosses a
domain boundary it should be wrapped in one, and ``str(postcode)`` gives the
canonical string back for serialisation.
"""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class Postcode:
"""A postcode held in canonical form.
The ``value`` passed to the constructor is sanitised eagerly in
:meth:`__post_init__` -- uppercased, with all whitespace (spaces, tabs,
newlines) removed -- so every ``Postcode`` instance is canonical by
construction. Two postcodes that differ only in surface whitespace or
case therefore compare equal.
Attributes:
value: The canonical postcode string (e.g. ``"SW1A1AA"``).
"""
value: str
def __post_init__(self) -> None:
# Frozen dataclass: bypass the descriptor with object.__setattr__.
object.__setattr__(self, "value", "".join(self.value.split()).upper())
def __str__(self) -> str:
return self.value

View file

@ -1,23 +0,0 @@
"""Canonical postcode sanitisation for the domain layer.
The legacy postcode_splitter normalises postcodes inline with
``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes
that operation to a pure, reusable function so the same canonical form is
applied wherever a postcode crosses a domain boundary -- including
:class:`domain.addresses.user_address.UserAddress` construction and future
migrations.
"""
from __future__ import annotations
def sanitise_postcode(s: str) -> str:
"""Return the canonical form of a postcode.
The canonical form is uppercase with all whitespace removed. This matches
the legacy splitter's ``str.upper().str.replace(" ", "")`` for the
overwhelmingly common case of space-separated postcodes (e.g. ``"sw1a 1aa"``
becomes ``"SW1A1AA"``) while also tolerating tabs/newlines that can creep
in from CSV ingestion.
"""
return "".join(s.split()).upper()

View file

@ -20,6 +20,7 @@ from datetime import datetime, timezone
from typing import Optional
from domain.addresses.user_address import UserAddress
from domain.postcode import Postcode
from infrastructure.csv_s3_client import CsvS3Client
from repositories.user_address.user_address_repository import UserAddressRepository
@ -77,7 +78,7 @@ class UserAddressCsvS3Repository(UserAddressRepository):
addresses.append(
UserAddress(
user_address=user_address,
postcode=postcode,
postcode=Postcode(postcode),
internal_reference=internal_reference,
source_row=row,
)
@ -96,7 +97,7 @@ class UserAddressCsvS3Repository(UserAddressRepository):
Returns the full ``s3://bucket/key`` URI.
"""
rows: list[dict[str, str]] = [
{**addr.source_row, _POSTCODE_CLEAN_COLUMN: addr.postcode}
{**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)}
for addr in addresses
]
filename = (

View file

@ -17,8 +17,8 @@ class UserAddressRepository(ABC):
Implementations choose the underlying storage (S3 CSV, Postgres,
in-memory, ...) but must preserve the canonical column semantics:
the address text, postcode (sanitised by ``UserAddress.__post_init__``),
and an optional internal reference.
the address text, postcode (a :class:`~domain.postcode.Postcode` value
object), and an optional internal reference.
"""
@abstractmethod

View file

@ -2,12 +2,15 @@ import pytest
from domain.addresses.postcode_batching import iter_postcode_grouped_batches
from domain.addresses.user_address import UserAddress
from domain.postcode import Postcode
def _addrs(postcode: str, n: int) -> list[UserAddress]:
"""Build ``n`` addresses sharing a postcode, with distinct address lines."""
return [
UserAddress(user_address=f"{i} {postcode} Street", postcode=postcode)
UserAddress(
user_address=f"{i} {postcode} Street", postcode=Postcode(postcode)
)
for i in range(n)
]
@ -38,8 +41,8 @@ def test_flush_on_overflow_before_adding_next_postcode() -> None:
addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3)
batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5))
assert len(batches) == 2
assert [a.postcode for a in batches[0]] == ["AA11AA"] * 3
assert [a.postcode for a in batches[1]] == ["BB22BB"] * 3
assert [str(a.postcode) for a in batches[0]] == ["AA11AA"] * 3
assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 3
def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None:
@ -61,9 +64,9 @@ def test_oversize_group_flushes_existing_buffer_first() -> None:
iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)
)
assert len(batches) == 3
assert [a.postcode for a in batches[0]] == ["AA11AA", "AA11AA"]
assert [a.postcode for a in batches[1]] == ["BB22BB"] * 7
assert [a.postcode for a in batches[2]] == ["CC33CC"]
assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"]
assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 7
assert [str(a.postcode) for a in batches[2]] == ["CC33CC"]
def test_final_flush_yields_remaining_buffer() -> None:
@ -80,7 +83,7 @@ def test_postcode_grouping_preserves_first_seen_order() -> None:
b1, b2 = _addrs("AA1 1AA", 2)
batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2]))
assert len(batches) == 1
assert [a.postcode for a in batches[0]] == [
assert [str(a.postcode) for a in batches[0]] == [
"ZZ99ZZ",
"ZZ99ZZ",
"AA11AA",

View file

@ -3,69 +3,77 @@ import dataclasses
import pytest
from domain.addresses.user_address import UserAddress
from domain.postcode import Postcode
def test_user_address_sanitises_postcode_on_construction() -> None:
addr = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
assert addr.postcode == "SW1A1AA"
def test_user_address_holds_postcode_value_object() -> None:
addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
assert addr.postcode == Postcode("SW1A1AA")
def test_user_address_preserves_user_address_verbatim() -> None:
# The free-text user_address string is intentionally NOT normalised --
# only the postcode is canonicalised at the boundary.
addr = UserAddress(user_address=" 1 The Street ", postcode="sw1a 1aa")
# only the postcode is canonicalised, and that happens inside Postcode.
addr = UserAddress(
user_address=" 1 The Street ", postcode=Postcode("SW1A1AA")
)
assert addr.user_address == " 1 The Street "
def test_user_address_internal_reference_defaults_to_none() -> None:
addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
assert addr.internal_reference is None
def test_user_address_internal_reference_accepted() -> None:
addr = UserAddress(
user_address="1 The Street",
postcode="SW1A1AA",
postcode=Postcode("SW1A1AA"),
internal_reference="cust-42",
)
assert addr.internal_reference == "cust-42"
def test_user_address_is_frozen() -> None:
addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
with pytest.raises(dataclasses.FrozenInstanceError):
addr.postcode = "OTHER" # type: ignore[misc]
addr.postcode = Postcode("OTHER") # type: ignore[misc]
def test_user_address_equality_uses_sanitised_postcode() -> None:
# Two instances constructed with different surface forms of the same
# postcode must compare equal because sanitisation runs eagerly.
a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
b = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
def test_user_address_equality_uses_canonical_postcode() -> None:
# Postcode sanitises eagerly, so addresses built from different surface
# forms of the same postcode compare equal.
a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
assert a == b
def test_user_address_source_row_defaults_to_empty_dict() -> None:
addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
assert addr.source_row == {}
def test_user_address_carries_source_row() -> None:
row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
addr = UserAddress(
user_address="1 The Street", postcode="SW1A 1AA", source_row=row
user_address="1 The Street",
postcode=Postcode("SW1A 1AA"),
source_row=row,
)
assert addr.source_row == row
def test_user_address_equality_ignores_source_row() -> None:
# source_row is excluded from equality (and hashing): identity stays
# defined by the parsed fields, so two addresses parsed from rows with
# different incidental columns still compare equal.
# defined by the parsed fields.
a = UserAddress(
user_address="1 The Street", postcode="SW1A1AA", source_row={"x": "1"}
user_address="1 The Street",
postcode=Postcode("SW1A1AA"),
source_row={"x": "1"},
)
b = UserAddress(
user_address="1 The Street", postcode="SW1A1AA", source_row={"y": "2"}
user_address="1 The Street",
postcode=Postcode("SW1A1AA"),
source_row={"y": "2"},
)
assert a == b

View file

@ -1,28 +0,0 @@
from domain.postcodes.sanitise import sanitise_postcode
def test_sanitise_uppercases() -> None:
assert sanitise_postcode("sw1a1aa") == "SW1A1AA"
def test_sanitise_strips_internal_spaces() -> None:
assert sanitise_postcode("sw1a 1aa") == "SW1A1AA"
def test_sanitise_strips_leading_and_trailing_whitespace() -> None:
assert sanitise_postcode(" sw1a 1aa ") == "SW1A1AA"
def test_sanitise_strips_tabs_and_newlines() -> None:
# CSV ingestion occasionally introduces stray whitespace characters; the
# canonical form must absorb them just like literal spaces.
assert sanitise_postcode("sw1a\t1aa\n") == "SW1A1AA"
def test_sanitise_already_canonical_is_idempotent() -> None:
assert sanitise_postcode("SW1A1AA") == "SW1A1AA"
assert sanitise_postcode(sanitise_postcode("sw1a 1aa")) == "SW1A1AA"
def test_sanitise_empty_string() -> None:
assert sanitise_postcode("") == ""

View file

@ -0,0 +1,48 @@
import dataclasses
import pytest
from domain.postcode import Postcode
def test_postcode_uppercases() -> None:
assert Postcode("sw1a1aa").value == "SW1A1AA"
def test_postcode_strips_internal_spaces() -> None:
assert Postcode("sw1a 1aa").value == "SW1A1AA"
def test_postcode_strips_leading_and_trailing_whitespace() -> None:
assert Postcode(" sw1a 1aa ").value == "SW1A1AA"
def test_postcode_strips_tabs_and_newlines() -> None:
# CSV ingestion occasionally introduces stray whitespace characters; the
# canonical form must absorb them just like literal spaces.
assert Postcode("sw1a\t1aa\n").value == "SW1A1AA"
def test_postcode_construction_is_idempotent() -> None:
once = Postcode("sw1a 1aa")
assert Postcode(once.value).value == "SW1A1AA"
def test_postcode_empty_string() -> None:
assert Postcode("").value == ""
def test_postcode_str_returns_canonical_value() -> None:
assert str(Postcode("sw1a 1aa")) == "SW1A1AA"
def test_postcode_equality_ignores_surface_form() -> None:
# Differing case / whitespace sanitise to the same canonical value, so
# the value objects compare equal.
assert Postcode("sw1a 1aa") == Postcode("SW1A1AA")
def test_postcode_is_frozen() -> None:
postcode = Postcode("SW1A1AA")
with pytest.raises(dataclasses.FrozenInstanceError):
postcode.value = "OTHER" # type: ignore[misc]

View file

@ -4,6 +4,7 @@ import pytest
from moto import mock_aws
from domain.addresses.user_address import UserAddress
from domain.postcode import Postcode
from infrastructure.csv_s3_client import CsvS3Client
from repositories.user_address.user_address_csv_s3_repository import (
UserAddressCsvS3Repository,
@ -47,7 +48,7 @@ def test_load_batch_parses_address_postcode_and_reference(
assert len(addresses) == 1
address = addresses[0]
assert address.user_address == "1 High Street, Flat 2, Townville"
assert address.postcode == "SW1A1AA"
assert address.postcode == Postcode("SW1A1AA")
assert address.internal_reference == "REF-001"
@ -69,7 +70,7 @@ def test_load_batch_uses_only_address_1_when_others_missing(
assert len(addresses) == 1
assert addresses[0].user_address == "10 Cardiff Road"
assert addresses[0].postcode == "CF101AA"
assert addresses[0].postcode == Postcode("CF101AA")
assert addresses[0].internal_reference == "REF-002"
@ -91,7 +92,7 @@ def test_load_batch_handles_missing_internal_reference(
assert len(addresses) == 1
assert addresses[0].user_address == "5 Park Lane"
assert addresses[0].postcode == "M11AA"
assert addresses[0].postcode == Postcode("M11AA")
assert addresses[0].internal_reference is None
@ -154,7 +155,7 @@ def test_save_batch_returns_uri_under_path_prefix(
addresses = [
UserAddress(
user_address="1 High Street",
postcode="SW1A 1AA",
postcode=Postcode("SW1A 1AA"),
source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
),
]
@ -199,7 +200,7 @@ def test_save_batch_uses_unique_filename_per_call(
addresses = [
UserAddress(
user_address="1 High Street",
postcode="SW1A 1AA",
postcode=Postcode("SW1A 1AA"),
source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
),
]