# Model/domain/addresses/postcode_batching.py

"""Pure-Python postcode-grouped batching.

This module preserves the batching invariants from the legacy postcode
splitter (``backend/postcode_splitter/main.py``) without touching pandas,
S3, or SQS:

  * Addresses are grouped by **Postcode** in *insertion order* -- the first
    Postcode seen produces the first group.
  * A Postcode group is never split across two batches.
  * If a single Postcode group reaches or exceeds ``max_batch_size``, it is
    flushed as its own batch (any buffered groups go out first,
    untouched).
  * Adding a group that would push the buffer past ``max_batch_size`` first
    flushes the existing buffer, then starts a new buffer with the group.
  * Whatever remains in the buffer after the input is exhausted is flushed
    as the final batch.
  * Empty input yields no batches.
"""

from __future__ import annotations

from collections.abc import Iterable, Iterator

from domain.addresses.user_address import UserAddress


def iter_postcode_grouped_batches(
    addresses: Iterable[UserAddress],
    *,
    max_batch_size: int = 500,
) -> Iterator[list[UserAddress]]:
    """Return an iterator over batches of ``UserAddress`` grouped by Postcode.

    ``max_batch_size`` is validated when this function is *called*, not when
    the first batch is requested, so an invalid cap fails at the call site.
    ``addresses`` itself is not consumed until iteration begins.

    Args:
        addresses: An iterable of :class:`UserAddress`. Order is preserved
            within each Postcode group, and groups are emitted in the order
            their first member was seen.
        max_batch_size: The soft upper bound on batch size, in number of
            addresses. A single Postcode group whose size reaches or exceeds
            this cap is dispatched whole as its own batch (the cap is never
            used to split a group).

    Returns:
        An iterator of lists of ``UserAddress``. Each list is non-empty.

    Raises:
        ValueError: If ``max_batch_size`` is less than 1.
    """
    # Validate eagerly: inside a generator body this check would be deferred
    # until the first ``next()`` call.
    if max_batch_size < 1:
        raise ValueError("max_batch_size must be >= 1")
    return _generate_postcode_batches(addresses, max_batch_size)


def _generate_postcode_batches(
    addresses: Iterable[UserAddress],
    max_batch_size: int,
) -> Iterator[list[UserAddress]]:
    """Lazily yield the batches for an already-validated ``max_batch_size``."""
    groups = _group_by_postcode_in_order(addresses)

    buffer: list[UserAddress] = []
    for group in groups.values():
        group_len = len(group)

        # A group at or above the cap goes out alone: flush whatever is
        # buffered first, then dispatch the group as its own batch. Mirrors
        # the legacy ``if group_len >= batch_size`` branch.
        if group_len >= max_batch_size:
            if buffer:
                yield buffer
                buffer = []
            yield group
            continue

        # Adding this group would overflow: flush the buffer before
        # appending. The buffer cannot be empty here because
        # ``group_len < max_batch_size`` on this path.
        if len(buffer) + group_len > max_batch_size:
            yield buffer
            buffer = []

        buffer.extend(group)

    # Final flush of whatever is left once the input is exhausted.
    if buffer:
        yield buffer


def _group_by_postcode_in_order(
    addresses: Iterable[UserAddress],
) -> dict[str, list[UserAddress]]:
    """Group addresses by ``postcode`` preserving first-seen order.

    Python dicts retain insertion order since 3.7, so a plain dict suffices
    for the same effect as pandas ``groupby(..., sort=False)``.
    """
    groups: dict[str, list[UserAddress]] = {}
    for address in addresses:
        groups.setdefault(address.postcode, []).append(address)
    return groups