mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Slice 1/6 of the postcode_splitter refactor (Hestia-Homes/Model#1100). Introduces the pure-domain foundation under domain/, with no AWS, Postgres, or pandas. UserAddress is a frozen dataclass that sanitises its postcode in __post_init__ via the canonical sanitise_postcode helper, and iter_postcode_grouped_batches preserves the legacy splitter's batching invariants (group-by-postcode in insertion order, never split a group, oversize single-postcode groups dispatched whole, final flush). Updates UBIQUITOUS_LANGUAGE.md so the User Address term covers both the dataclass sense (preferred in domain code) and the raw upstream-string sense. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
87 lines
2.9 KiB
Python
87 lines
2.9 KiB
Python
"""Pure-Python postcode-grouped batching.
|
|
|
|
This module preserves the batching invariants from the legacy postcode
splitter (``backend/postcode_splitter/main.py``) without touching pandas,
S3, or SQS:

* Addresses are grouped by **Postcode** in *insertion order* -- the first
  Postcode seen produces the first group.
* A Postcode group is never split across two batches.
* If a single Postcode group is larger than ``max_batch_size``, it is
  flushed as its own oversize batch (any buffered groups go out first,
  untouched).
* Adding a group that would push the buffer past ``max_batch_size`` first
  flushes the existing buffer, then starts a new buffer with the group.
* Whatever remains in the buffer after the input is exhausted is flushed
  as the final batch.
* Empty input yields no batches.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections.abc import Iterable, Iterator
|
|
|
|
from domain.addresses.user_address import UserAddress
|
|
|
|
|
|
def iter_postcode_grouped_batches(
    addresses: Iterable[UserAddress],
    *,
    max_batch_size: int = 500,
) -> Iterator[list[UserAddress]]:
    """Return an iterator of ``UserAddress`` batches grouped by Postcode.

    ``max_batch_size`` is validated eagerly, at call time, so a bad value
    fails at the call site rather than on the first ``next()``. The
    ``addresses`` iterable itself is still consumed lazily: nothing is read
    from it until the returned iterator is first advanced.

    Args:
        addresses: An iterable of :class:`UserAddress`. Order is preserved
            within each Postcode group, and groups are yielded in the order
            their first member was seen.
        max_batch_size: The soft upper bound on batch size, in number of
            addresses. A single Postcode group that reaches or exceeds this
            cap is dispatched whole (the cap is never used to split a
            group).

    Returns:
        An iterator of lists of ``UserAddress``. Each list is non-empty.
        Empty input produces no batches.

    Raises:
        ValueError: If ``max_batch_size`` is less than 1.
    """
    # Validate outside the generator body: code inside a generator does not
    # run until the first ``next()``, which would defer this error to
    # wherever the iterator happens to be consumed.
    if max_batch_size < 1:
        raise ValueError("max_batch_size must be >= 1")

    def _batches() -> Iterator[list[UserAddress]]:
        groups = _group_by_postcode_in_order(addresses)

        buffer: list[UserAddress] = []
        for group in groups.values():
            group_len = len(group)

            # Oversize single-Postcode group: flush buffer first, then
            # dispatch the group as its own batch. Mirrors the legacy
            # ``if group_len >= batch_size`` branch.
            if group_len >= max_batch_size:
                if buffer:
                    yield buffer
                    buffer = []
                yield group
                continue

            # Adding this group would overflow: flush buffer before
            # appending. The buffer cannot be empty here, because
            # ``group_len < max_batch_size`` on this path.
            if len(buffer) + group_len > max_batch_size:
                yield buffer
                buffer = []

            buffer.extend(group)

        # Final flush of whatever is still buffered.
        if buffer:
            yield buffer

    return _batches()
|
|
|
|
|
|
def _group_by_postcode_in_order(
    addresses: Iterable[UserAddress],
) -> dict[str, list[UserAddress]]:
    """Bucket addresses under their ``postcode``, keeping first-seen order.

    A plain ``dict`` is enough here: since Python 3.7 dicts iterate in
    insertion order, which gives the same grouping order as pandas
    ``groupby(..., sort=False)``. Within each bucket, addresses stay in the
    order they were encountered.
    """
    by_postcode: dict[str, list[UserAddress]] = {}
    for entry in addresses:
        key = entry.postcode
        if key in by_postcode:
            by_postcode[key].append(entry)
        else:
            # First sighting of this postcode: its position in the dict
            # fixes the group's place in the output order.
            by_postcode[key] = [entry]
    return by_postcode
|