diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md index 1765cbc8..c3074c02 100644 --- a/UBIQUITOUS_LANGUAGE.md +++ b/UBIQUITOUS_LANGUAGE.md @@ -23,7 +23,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve |------|------------|------------------| | **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" | | **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" | -| **User Address** | A free-text address string provided by a user or imported from a customer dataset, before any normalisation or matching. | "user input", "raw address", "user_inputed_address" | +| **User Address** | A structured dataclass (`domain.addresses.user_address.UserAddress`) capturing a customer-supplied address: a free-text `user_address` line, a canonical `postcode` (sanitised on construction), and an optional `internal_reference`. The bare string sense -- the raw free-text address line as it arrives from upstream ingestion, before being wrapped -- remains valid when discussing CSV columns, API payloads, or other upstream contexts; in domain code, prefer the dataclass. | "user input", "raw address", "user_inputed_address" | | **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" | ## Address Matching @@ -72,7 +72,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve ## Flagged ambiguities -- **"address"** appears as both the raw **User Address** (free-text from customer data) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". 
+- **"address"** appears as both the **User Address** (the raw free-text string from customer data, or the structured `UserAddress` dataclass that wraps it) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". Within `domain/`, **User Address** specifically means the `UserAddress` dataclass; in upstream ingestion contexts (CSV columns, SQS payloads) it can still mean the raw free-text string. - **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments. - **"user_inputed_address"** in `backend/address2UPRN/main.py` is a misspelling and a synonym for **User Address** — the canonical term. New code should use `user_address`. - **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter. diff --git a/domain/addresses/__init__.py b/domain/addresses/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py new file mode 100644 index 00000000..209e0784 --- /dev/null +++ b/domain/addresses/postcode_batching.py @@ -0,0 +1,87 @@ +"""Pure-Python postcode-grouped batching. + +This module preserves the batching invariants from the legacy postcode +splitter (``backend/postcode_splitter/main.py``) without touching pandas, +S3, or SQS: + + * Addresses are grouped by **Postcode** in *insertion order* -- the first + Postcode seen produces the first group. + * A Postcode group is never split across two batches. + * If a single Postcode group is larger than ``max_batch_size``, it is + flushed as its own oversize batch (any buffered groups go out first, + untouched).
+ * Adding a group that would push the buffer past ``max_batch_size`` first + flushes the existing buffer, then starts a new buffer with the group. + * Whatever remains in the buffer after the input is exhausted is flushed + as the final batch. + * Empty input yields no batches. +""" + +from __future__ import annotations + +from collections.abc import Iterable, Iterator + +from domain.addresses.user_address import UserAddress + + +def iter_postcode_grouped_batches( + addresses: Iterable[UserAddress], + *, + max_batch_size: int = 500, +) -> Iterator[list[UserAddress]]: + """Yield batches of ``UserAddress`` grouped by Postcode. + + Args: + addresses: An iterable of :class:`UserAddress`. Order is preserved + within each Postcode group, and groups are yielded in the order + their first member was seen. + max_batch_size: The soft upper bound on batch size, in number of + addresses. A single Postcode group larger than this cap is + dispatched whole (the cap is never used to split a group). + + Yields: + Lists of ``UserAddress``. Each list is non-empty. + """ + if max_batch_size < 1: + raise ValueError("max_batch_size must be >= 1") + + groups = _group_by_postcode_in_order(addresses) + + buffer: list[UserAddress] = [] + for group in groups.values(): + group_len = len(group) + + # Oversize single-Postcode group: flush buffer first, then dispatch + # the group as its own batch. Mirrors the legacy + # ``if group_len >= batch_size`` branch. + if group_len >= max_batch_size: + if buffer: + yield buffer + buffer = [] + yield group + continue + + # Adding this group would overflow: flush buffer before appending. + if len(buffer) + group_len > max_batch_size: + yield buffer + buffer = [] + + buffer.extend(group) + + # Final flush. + if buffer: + yield buffer + + +def _group_by_postcode_in_order( + addresses: Iterable[UserAddress], +) -> dict[str, list[UserAddress]]: + """Group addresses by ``postcode`` preserving first-seen order. 
+ + Python dicts retain insertion order since 3.7, so a plain dict suffices + for the same effect as pandas ``groupby(..., sort=False)``. + """ + groups: dict[str, list[UserAddress]] = {} + for address in addresses: + groups.setdefault(address.postcode, []).append(address) + return groups diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py new file mode 100644 index 00000000..e48dfdec --- /dev/null +++ b/domain/addresses/user_address.py @@ -0,0 +1,36 @@ +"""The :class:`UserAddress` value object. + +A frozen dataclass capturing the splitter's domain entity: the raw input +address line, a sanitised postcode, and an optional internal reference from +the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no +caller can construct an instance with an un-normalised postcode. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + +from domain.postcodes.sanitise import sanitise_postcode + + +@dataclass(frozen=True) +class UserAddress: + """A user-supplied address paired with its canonical postcode. + + Attributes: + user_address: The free-text address string as supplied upstream. + postcode: The postcode; always stored in canonical form + (uppercased, whitespace stripped). Sanitisation is enforced by + :meth:`__post_init__`. + internal_reference: Optional customer-side identifier preserved for + traceability through the matching pipeline. + """ + + user_address: str + postcode: str + internal_reference: Optional[str] = None + + def __post_init__(self) -> None: + # Frozen dataclass: bypass the descriptor with object.__setattr__. 
+ object.__setattr__(self, "postcode", sanitise_postcode(self.postcode)) diff --git a/domain/postcodes/__init__.py b/domain/postcodes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/domain/postcodes/sanitise.py b/domain/postcodes/sanitise.py new file mode 100644 index 00000000..94b0dcf7 --- /dev/null +++ b/domain/postcodes/sanitise.py @@ -0,0 +1,23 @@ +"""Canonical postcode sanitisation for the domain layer. + +The legacy postcode_splitter normalises postcodes inline with +``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes +that operation to a pure, reusable function so the same canonical form is +applied wherever a postcode crosses a domain boundary -- including +:class:`domain.addresses.user_address.UserAddress` construction and future +migrations. +""" + +from __future__ import annotations + + +def sanitise_postcode(s: str) -> str: + """Return the canonical form of a postcode. + + The canonical form is uppercase with all whitespace removed. This matches + the legacy splitter's ``str.upper().str.replace(" ", "")`` for the + overwhelmingly common case of space-separated postcodes (e.g. ``"sw1a 1aa"`` + becomes ``"SW1A1AA"``) while also tolerating tabs/newlines that can creep + in from CSV ingestion. 
+ """ + return "".join(s.split()).upper() diff --git a/tests/domain/addresses/__init__.py b/tests/domain/addresses/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py new file mode 100644 index 00000000..2dac46cc --- /dev/null +++ b/tests/domain/addresses/test_postcode_batching.py @@ -0,0 +1,93 @@ +import pytest + +from domain.addresses.postcode_batching import iter_postcode_grouped_batches +from domain.addresses.user_address import UserAddress + + +def _addrs(postcode: str, n: int) -> list[UserAddress]: + """Build ``n`` addresses sharing a postcode, with distinct address lines.""" + return [ + UserAddress(user_address=f"{i} {postcode} Street", postcode=postcode) + for i in range(n) + ] + + +def test_empty_input_yields_no_batches() -> None: + assert list(iter_postcode_grouped_batches([])) == [] + + +def test_single_batch_under_cap() -> None: + addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 2) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500)) + assert len(batches) == 1 + assert batches[0] == addrs + + +def test_multiple_postcodes_packed_into_one_batch_up_to_cap() -> None: + # Two groups whose total exactly equals the cap pack into a single + # batch -- no premature flush. + addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 2) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + assert len(batches) == 1 + assert len(batches[0]) == 5 + + +def test_flush_on_overflow_before_adding_next_postcode() -> None: + # Cap is 5. First group fills 3 slots; second group of 3 would overflow, + # so the buffer is flushed first and the next group starts a fresh batch. 
+ addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + assert len(batches) == 2 + assert [a.postcode for a in batches[0]] == ["AA11AA"] * 3 + assert [a.postcode for a in batches[1]] == ["BB22BB"] * 3 + + +def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None: + # An oversize single-postcode group goes out as one batch larger than + # the cap -- the cap never splits a postcode. + addrs = _addrs("AA1 1AA", 7) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + assert len(batches) == 1 + assert len(batches[0]) == 7 + + +def test_oversize_group_flushes_existing_buffer_first() -> None: + # Mirrors the legacy ``if buffer: flush`` branch when an oversize group + # is encountered: buffered work must not be lost or interleaved. + small = _addrs("AA1 1AA", 2) + big = _addrs("BB2 2BB", 7) + tail = _addrs("CC3 3CC", 1) + batches = list( + iter_postcode_grouped_batches(small + big + tail, max_batch_size=5) + ) + assert len(batches) == 3 + assert [a.postcode for a in batches[0]] == ["AA11AA", "AA11AA"] + assert [a.postcode for a in batches[1]] == ["BB22BB"] * 7 + assert [a.postcode for a in batches[2]] == ["CC33CC"] + + +def test_final_flush_yields_remaining_buffer() -> None: + # No overflow ever happens, but the trailing buffer must still come out. + addrs = _addrs("AA1 1AA", 2) + _addrs("BB2 2BB", 2) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500)) + assert batches == [addrs] + + +def test_postcode_grouping_preserves_first_seen_order() -> None: + # Interleaved input must still group by postcode and emit in first-seen + # order -- never alphabetical. 
+ a1, a2 = _addrs("ZZ9 9ZZ", 2) + b1, b2 = _addrs("AA1 1AA", 2) + batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2])) + assert len(batches) == 1 + assert [a.postcode for a in batches[0]] == [ + "ZZ99ZZ", + "ZZ99ZZ", + "AA11AA", + "AA11AA", + ] + + +def test_invalid_max_batch_size_raises() -> None: + with pytest.raises(ValueError, match="max_batch_size"): + list(iter_postcode_grouped_batches([], max_batch_size=0)) diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py new file mode 100644 index 00000000..e722077d --- /dev/null +++ b/tests/domain/addresses/test_user_address.py @@ -0,0 +1,45 @@ +import dataclasses + +import pytest + +from domain.addresses.user_address import UserAddress + + +def test_user_address_sanitises_postcode_on_construction() -> None: + addr = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") + assert addr.postcode == "SW1A1AA" + + +def test_user_address_preserves_user_address_verbatim() -> None: + # The free-text user_address string is intentionally NOT normalised -- + # only the postcode is canonicalised at the boundary. 
+ addr = UserAddress(user_address=" 1 The Street ", postcode="sw1a 1aa") + assert addr.user_address == " 1 The Street " + + +def test_user_address_internal_reference_defaults_to_none() -> None: + addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + assert addr.internal_reference is None + + +def test_user_address_internal_reference_accepted() -> None: + addr = UserAddress( + user_address="1 The Street", + postcode="SW1A1AA", + internal_reference="cust-42", + ) + assert addr.internal_reference == "cust-42" + + +def test_user_address_is_frozen() -> None: + addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + with pytest.raises(dataclasses.FrozenInstanceError): + addr.postcode = "OTHER" # type: ignore[misc] + + +def test_user_address_equality_uses_sanitised_postcode() -> None: + # Two instances constructed with different surface forms of the same + # postcode must compare equal because sanitisation runs eagerly. + a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") + b = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + assert a == b diff --git a/tests/domain/postcodes/__init__.py b/tests/domain/postcodes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/domain/postcodes/test_sanitise.py b/tests/domain/postcodes/test_sanitise.py new file mode 100644 index 00000000..edd1679c --- /dev/null +++ b/tests/domain/postcodes/test_sanitise.py @@ -0,0 +1,28 @@ +from domain.postcodes.sanitise import sanitise_postcode + + +def test_sanitise_uppercases() -> None: + assert sanitise_postcode("sw1a1aa") == "SW1A1AA" + + +def test_sanitise_strips_internal_spaces() -> None: + assert sanitise_postcode("sw1a 1aa") == "SW1A1AA" + + +def test_sanitise_strips_leading_and_trailing_whitespace() -> None: + assert sanitise_postcode(" sw1a 1aa ") == "SW1A1AA" + + +def test_sanitise_strips_tabs_and_newlines() -> None: + # CSV ingestion occasionally introduces stray whitespace characters; the + # canonical form 
must absorb them just like literal spaces. + assert sanitise_postcode("sw1a\t1aa\n") == "SW1A1AA" + + +def test_sanitise_already_canonical_is_idempotent() -> None: + assert sanitise_postcode("SW1A1AA") == "SW1A1AA" + assert sanitise_postcode(sanitise_postcode("sw1a 1aa")) == "SW1A1AA" + + +def test_sanitise_empty_string() -> None: + assert sanitise_postcode("") == ""