mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
postcode_splitter: pure domain (UserAddress, sanitise_postcode, postcode_batching)
Slice 1/6 of the postcode_splitter refactor (Hestia-Homes/Model#1100). Introduces the pure-domain foundation under domain/, with no AWS, Postgres, or pandas. UserAddress is a frozen dataclass that sanitises its postcode in __post_init__ via the canonical sanitise_postcode helper, and iter_postcode_grouped_batches preserves the legacy splitter's batching invariants (group-by-postcode in insertion order, never split a group, oversize single-postcode groups dispatched whole, final flush). Updates UBIQUITOUS_LANGUAGE.md so the User Address term covers both the dataclass sense (preferred in domain code) and the raw upstream-string sense. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
54a674b5c8
commit
6198d7a46d
11 changed files with 314 additions and 2 deletions
|
|
@ -23,7 +23,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve
|
|||
|------|------------|------------------|
|
||||
| **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" |
|
||||
| **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" |
|
||||
| **User Address** | A free-text address string provided by a user or imported from a customer dataset, before any normalisation or matching. | "user input", "raw address", "user_inputed_address" |
|
||||
| **User Address** | A structured dataclass (`domain.addresses.user_address.UserAddress`) capturing a customer-supplied address: a free-text `user_address` line, a canonical `postcode` (sanitised on construction), and an optional `internal_reference`. The bare string sense -- the raw free-text address line as it arrives from upstream ingestion, before being wrapped -- remains valid when discussing CSV columns, API payloads, or other upstream contexts; in domain code, prefer the dataclass. | "user input", "raw address", "user_inputed_address" |
|
||||
| **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" |
|
||||
|
||||
## Address Matching
|
||||
|
|
@ -72,7 +72,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve
|
|||
|
||||
## Flagged ambiguities
|
||||
|
||||
- **"address"** appears as both the raw **User Address** (free-text from customer data) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1".
|
||||
- **"address"** appears as both the raw **User Address** (free-text from customer data, or the structured `UserAddress` dataclass that wraps it) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". Within `domain/`, **User Address** specifically means the `UserAddress` dataclass; in upstream ingestion contexts (CSV columns, SQS payloads) it can still mean the raw string sense.
|
||||
- **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments.
|
||||
- **"user_inputed_address"** in `backend/address2UPRN/main.py` is a misspelling and a synonym for **User Address** — the canonical term. New code should use `user_address`.
|
||||
- **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter.
|
||||
|
|
|
|||
0
domain/addresses/__init__.py
Normal file
0
domain/addresses/__init__.py
Normal file
87
domain/addresses/postcode_batching.py
Normal file
87
domain/addresses/postcode_batching.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
"""Pure-Python postcode-grouped batching.
|
||||
|
||||
This module preserves the batching invariants from the legacy postcode
|
||||
splitter (``backend/postcode_splitter/main.py``) without touching pandas,
|
||||
S3, or SQS:
|
||||
|
||||
* Addresses are grouped by **Postcode** in *insertion order* -- the first
|
||||
Postcode seen produces the first group.
|
||||
* A Postcode group is never split across two batches.
|
||||
* If a single Postcode group is larger than ``max_batch_size``, it is
|
||||
flushed as its own oversize batch (any buffered groups go out first,
|
||||
untouched).
|
||||
* Adding a group that would push the buffer past ``max_batch_size`` first
|
||||
flushes the existing buffer, then starts a new buffer with the group.
|
||||
* Whatever remains in the buffer after the input is exhausted is flushed
|
||||
as the final batch.
|
||||
* Empty input yields no batches.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterable, Iterator
|
||||
|
||||
from domain.addresses.user_address import UserAddress
|
||||
|
||||
|
||||
def iter_postcode_grouped_batches(
    addresses: Iterable[UserAddress],
    *,
    max_batch_size: int = 500,
) -> Iterator[list[UserAddress]]:
    """Yield batches of ``UserAddress`` grouped by Postcode.

    ``max_batch_size`` is validated as soon as this function is called, not
    when the returned iterator is first advanced, so an invalid cap surfaces
    at the call site. ``addresses`` itself is not read until iteration
    starts.

    Args:
        addresses: An iterable of :class:`UserAddress`. Order is preserved
            within each Postcode group, and groups are yielded in the order
            their first member was seen.
        max_batch_size: The soft upper bound on batch size, in number of
            addresses. A single Postcode group that reaches or exceeds this
            cap is dispatched whole (the cap is never used to split a
            group).

    Returns:
        An iterator over lists of ``UserAddress``. Each list is non-empty.

    Raises:
        ValueError: If ``max_batch_size`` is less than 1.
    """
    # This function is deliberately NOT a generator: a ``yield`` in this body
    # would defer the check below until the first ``next()`` call.
    if max_batch_size < 1:
        raise ValueError("max_batch_size must be >= 1")
    return _iter_batches(addresses, max_batch_size)


def _iter_batches(
    addresses: Iterable[UserAddress],
    max_batch_size: int,
) -> Iterator[list[UserAddress]]:
    """Generate the batches for an already-validated ``max_batch_size``."""
    buffer: list[UserAddress] = []
    for group in _group_by_postcode_in_order(addresses).values():
        group_len = len(group)

        # A group that fills or exceeds the cap on its own: flush whatever is
        # buffered first, then dispatch the group as its own batch. Mirrors
        # the legacy ``if group_len >= batch_size`` branch.
        if group_len >= max_batch_size:
            if buffer:
                yield buffer
                buffer = []
            yield group
            continue

        # Adding this group would overflow: flush the buffer before
        # appending. The buffer cannot be empty here because ``group_len``
        # alone is below the cap, so no empty batch is ever yielded.
        if len(buffer) + group_len > max_batch_size:
            yield buffer
            buffer = []

        buffer.extend(group)

    # Final flush of whatever is left once the input is exhausted.
    if buffer:
        yield buffer


def _group_by_postcode_in_order(
    addresses: Iterable[UserAddress],
) -> dict[str, list[UserAddress]]:
    """Group addresses by ``postcode`` preserving first-seen order.

    Python dicts retain insertion order since 3.7, so a plain dict suffices
    for the same effect as pandas ``groupby(..., sort=False)``.

    Args:
        addresses: The addresses to group; consumed in a single pass.

    Returns:
        A dict mapping each ``postcode`` value to the addresses that carry
        it, in the order they were encountered.
    """
    groups: dict[str, list[UserAddress]] = {}
    for address in addresses:
        groups.setdefault(address.postcode, []).append(address)
    return groups
|
||||
36
domain/addresses/user_address.py
Normal file
36
domain/addresses/user_address.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
"""The :class:`UserAddress` value object.
|
||||
|
||||
A frozen dataclass capturing the splitter's domain entity: the raw input
|
||||
address line, a sanitised postcode, and an optional internal reference from
|
||||
the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no
|
||||
caller can construct an instance with an un-normalised postcode.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from domain.postcodes.sanitise import sanitise_postcode
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class UserAddress:
    """Immutable record of one customer-supplied address.

    Attributes:
        user_address: Free-text address line; stored exactly as passed in.
        postcode: The postcode. Whatever value is passed to the constructor
            is replaced by the result of ``sanitise_postcode`` inside
            :meth:`__post_init__`, so instances only ever hold that
            canonical form.
        internal_reference: Customer-side identifier carried along for
            traceability; ``None`` when not supplied.
    """

    user_address: str
    postcode: str
    internal_reference: Optional[str] = None

    def __post_init__(self) -> None:
        canonical = sanitise_postcode(self.postcode)
        # ``frozen=True`` makes ordinary attribute assignment raise, so the
        # canonical value is written via ``object.__setattr__`` instead.
        object.__setattr__(self, "postcode", canonical)
|
||||
0
domain/postcodes/__init__.py
Normal file
0
domain/postcodes/__init__.py
Normal file
23
domain/postcodes/sanitise.py
Normal file
23
domain/postcodes/sanitise.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
"""Canonical postcode sanitisation for the domain layer.
|
||||
|
||||
The legacy postcode_splitter normalises postcodes inline with
|
||||
``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes
|
||||
that operation to a pure, reusable function so the same canonical form is
|
||||
applied wherever a postcode crosses a domain boundary -- including
|
||||
:class:`domain.addresses.user_address.UserAddress` construction and future
|
||||
migrations.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def sanitise_postcode(s: str) -> str:
    """Convert a postcode to its canonical form.

    Canonical means uppercase with every run of whitespace dropped, e.g.
    ``"sw1a 1aa"`` becomes ``"SW1A1AA"``. For ordinary space-separated
    postcodes this gives the same result as the legacy splitter's
    ``str.upper().str.replace(" ", "")``; tabs and newlines (which can
    arrive via CSV ingestion) are removed as well.
    """
    # ``split()`` with no argument breaks on any whitespace and discards
    # leading/trailing runs, leaving only the non-whitespace fragments.
    fragments = s.split()
    return "".join(fragments).upper()
|
||||
0
tests/domain/addresses/__init__.py
Normal file
0
tests/domain/addresses/__init__.py
Normal file
93
tests/domain/addresses/test_postcode_batching.py
Normal file
93
tests/domain/addresses/test_postcode_batching.py
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
import pytest
|
||||
|
||||
from domain.addresses.postcode_batching import iter_postcode_grouped_batches
|
||||
from domain.addresses.user_address import UserAddress
|
||||
|
||||
|
||||
def _addrs(postcode: str, n: int) -> list[UserAddress]:
    """Return ``n`` addresses for one postcode, each with its own address line."""
    built: list[UserAddress] = []
    for index in range(n):
        line = f"{index} {postcode} Street"
        built.append(UserAddress(user_address=line, postcode=postcode))
    return built
|
||||
|
||||
|
||||
def test_empty_input_yields_no_batches() -> None:
    """An empty input list produces no batches at all."""
    batches = list(iter_postcode_grouped_batches([]))
    assert batches == []
|
||||
|
||||
|
||||
def test_single_batch_under_cap() -> None:
    """Five addresses under a cap of 500 come out as one batch in input order."""
    first_group = _addrs("AA1 1AA", 3)
    second_group = _addrs("BB2 2BB", 2)
    addrs = first_group + second_group
    batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500))
    assert len(batches) == 1
    assert batches[0] == addrs
|
||||
|
||||
|
||||
def test_multiple_postcodes_packed_into_one_batch_up_to_cap() -> None:
    """Groups of 3 and 2 fill a cap of 5 exactly and share one batch."""
    # Hitting the cap exactly must not trigger an early flush.
    cap = 5
    addrs = [*_addrs("AA1 1AA", 3), *_addrs("BB2 2BB", 2)]
    batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=cap))
    assert len(batches) == 1
    assert len(batches[0]) == 5
|
||||
|
||||
|
||||
def test_flush_on_overflow_before_adding_next_postcode() -> None:
    """With a cap of 5, two groups of 3 cannot share a batch."""
    # 3 + 3 exceeds the cap, so the first group is flushed on its own and
    # the second group opens a fresh batch.
    addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3)
    batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5))
    assert len(batches) == 2
    first_batch, second_batch = batches
    assert [a.postcode for a in first_batch] == ["AA11AA"] * 3
    assert [a.postcode for a in second_batch] == ["BB22BB"] * 3
|
||||
|
||||
|
||||
def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None:
    """Seven addresses on one postcode leave as a single batch despite a cap of 5."""
    # The cap bounds how groups are packed together; it never cuts a
    # postcode group in two.
    oversize = _addrs("AA1 1AA", 7)
    batches = list(iter_postcode_grouped_batches(oversize, max_batch_size=5))
    batch_sizes = [len(batch) for batch in batches]
    assert batch_sizes == [7]
|
||||
|
||||
|
||||
def test_oversize_group_flushes_existing_buffer_first() -> None:
    """Buffered addresses go out before an oversize group, and the tail follows."""
    # Mirrors the legacy ``if buffer: flush`` branch: work already buffered
    # must be neither dropped nor mixed into the oversize batch.
    small = _addrs("AA1 1AA", 2)
    big = _addrs("BB2 2BB", 7)
    tail = _addrs("CC3 3CC", 1)
    everything = small + big + tail
    batches = list(iter_postcode_grouped_batches(everything, max_batch_size=5))
    postcodes_by_batch = [[a.postcode for a in batch] for batch in batches]
    assert postcodes_by_batch == [
        ["AA11AA", "AA11AA"],
        ["BB22BB"] * 7,
        ["CC33CC"],
    ]
|
||||
|
||||
|
||||
def test_final_flush_yields_remaining_buffer() -> None:
    """Addresses still buffered when the input ends are emitted as a last batch."""
    # The cap of 500 is never approached, so the only flush is the final one.
    addrs = [*_addrs("AA1 1AA", 2), *_addrs("BB2 2BB", 2)]
    batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500))
    assert batches == [addrs]
|
||||
|
||||
|
||||
def test_postcode_grouping_preserves_first_seen_order() -> None:
    """Interleaved postcodes are regrouped, ordered by first appearance."""
    # "ZZ..." is seen before "AA...", so it must come out first; an
    # alphabetical sort would put them the other way round.
    z_first, z_second = _addrs("ZZ9 9ZZ", 2)
    a_first, a_second = _addrs("AA1 1AA", 2)
    interleaved = [z_first, a_first, z_second, a_second]
    batches = list(iter_postcode_grouped_batches(interleaved))
    assert len(batches) == 1
    emitted_postcodes = [a.postcode for a in batches[0]]
    assert emitted_postcodes == ["ZZ99ZZ", "ZZ99ZZ", "AA11AA", "AA11AA"]
|
||||
|
||||
|
||||
def test_invalid_max_batch_size_raises() -> None:
    """A cap of zero is rejected with a ``ValueError`` mentioning the parameter."""
    with pytest.raises(ValueError, match="max_batch_size"):
        batches = iter_postcode_grouped_batches([], max_batch_size=0)
        list(batches)
|
||||
45
tests/domain/addresses/test_user_address.py
Normal file
45
tests/domain/addresses/test_user_address.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
import dataclasses
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.addresses.user_address import UserAddress
|
||||
|
||||
|
||||
def test_user_address_sanitises_postcode_on_construction() -> None:
    """A lowercase, spaced postcode is stored uppercased with the space removed."""
    raw_postcode = "sw1a 1aa"
    addr = UserAddress(user_address="1 The Street", postcode=raw_postcode)
    assert addr.postcode == "SW1A1AA"
|
||||
|
||||
|
||||
def test_user_address_preserves_user_address_verbatim() -> None:
    """Surrounding whitespace in the address line survives construction."""
    # Only the postcode is canonicalised; the free-text line is left alone
    # on purpose.
    padded_line = " 1 The Street "
    addr = UserAddress(postcode="sw1a 1aa", user_address=padded_line)
    assert addr.user_address == " 1 The Street "
|
||||
|
||||
|
||||
def test_user_address_internal_reference_defaults_to_none() -> None:
    """Omitting ``internal_reference`` leaves it as ``None``."""
    addr = UserAddress(postcode="SW1A1AA", user_address="1 The Street")
    assert addr.internal_reference is None
|
||||
|
||||
|
||||
def test_user_address_internal_reference_accepted() -> None:
    """A supplied ``internal_reference`` is stored unchanged."""
    reference = "cust-42"
    addr = UserAddress(
        internal_reference=reference,
        postcode="SW1A1AA",
        user_address="1 The Street",
    )
    assert addr.internal_reference == "cust-42"
|
||||
|
||||
|
||||
def test_user_address_is_frozen() -> None:
    """Assigning to a field after construction raises ``FrozenInstanceError``."""
    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
    with pytest.raises(dataclasses.FrozenInstanceError):
        # ``setattr`` goes through the same ``__setattr__`` hook as a plain
        # attribute assignment would.
        setattr(addr, "postcode", "OTHER")
|
||||
|
||||
|
||||
def test_user_address_equality_uses_sanitised_postcode() -> None:
    """Different spellings of one postcode yield equal instances."""
    # Sanitisation happens during construction, so by the time ``==`` runs
    # both instances hold the same canonical postcode.
    spaced_lower = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
    canonical = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
    assert spaced_lower == canonical
|
||||
0
tests/domain/postcodes/__init__.py
Normal file
0
tests/domain/postcodes/__init__.py
Normal file
28
tests/domain/postcodes/test_sanitise.py
Normal file
28
tests/domain/postcodes/test_sanitise.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
from domain.postcodes.sanitise import sanitise_postcode
|
||||
|
||||
|
||||
def test_sanitise_uppercases() -> None:
    """Lowercase input comes back uppercased."""
    result = sanitise_postcode("sw1a1aa")
    assert result == "SW1A1AA"
|
||||
|
||||
|
||||
def test_sanitise_strips_internal_spaces() -> None:
    """A space between the two halves of the postcode is removed."""
    result = sanitise_postcode("sw1a 1aa")
    assert result == "SW1A1AA"
|
||||
|
||||
|
||||
def test_sanitise_strips_leading_and_trailing_whitespace() -> None:
    """Whitespace before and after the postcode is removed."""
    result = sanitise_postcode(" sw1a 1aa ")
    assert result == "SW1A1AA"
|
||||
|
||||
|
||||
def test_sanitise_strips_tabs_and_newlines() -> None:
    """Tabs and newlines are removed just like ordinary spaces."""
    # Stray whitespace of this kind can arrive via CSV ingestion.
    messy = "sw1a\t1aa\n"
    assert sanitise_postcode(messy) == "SW1A1AA"
|
||||
|
||||
|
||||
def test_sanitise_already_canonical_is_idempotent() -> None:
    """Canonical input is unchanged, and sanitising twice equals sanitising once."""
    assert sanitise_postcode("SW1A1AA") == "SW1A1AA"
    once = sanitise_postcode("sw1a 1aa")
    twice = sanitise_postcode(once)
    assert twice == "SW1A1AA"
|
||||
|
||||
|
||||
def test_sanitise_empty_string() -> None:
    """The empty string sanitises to the empty string."""
    result = sanitise_postcode("")
    assert result == ""
|
||||
Loading…
Add table
Reference in a new issue