"""Pure-Python postcode-grouped batching.

This module preserves the batching invariants from the legacy postcode
splitter (``backend/postcode_splitter/main.py``) without touching pandas,
S3, or SQS:

* Addresses are grouped by **Postcode** in *insertion order* -- the first
  Postcode seen produces the first group.
* A Postcode group is never split across two batches.
* If a single Postcode group is at least as large as ``max_batch_size``,
  it is flushed as its own batch (any buffered groups go out first,
  untouched). Groups strictly larger than the cap therefore produce an
  oversize batch.
* Adding a group that would push the buffer past ``max_batch_size`` first
  flushes the existing buffer, then starts a new buffer with the group.
* Whatever remains in the buffer after the input is exhausted is flushed
  as the final batch.
* Empty input yields no batches.
"""

from __future__ import annotations

from collections.abc import Iterable, Iterator

from domain.addresses.user_address import UserAddress


def iter_postcode_grouped_batches(
    addresses: Iterable[UserAddress],
    *,
    max_batch_size: int = 500,
) -> Iterator[list[UserAddress]]:
    """Yield batches of ``UserAddress`` grouped by Postcode.

    ``max_batch_size`` is validated immediately, at call time. The
    ``addresses`` iterable itself is not touched until the returned
    iterator is first advanced; at that point it is consumed in full,
    because grouping by Postcode needs to see every address before the
    first batch can be emitted.

    Args:
        addresses: An iterable of :class:`UserAddress`. Order is preserved
            within each Postcode group, and groups are yielded in the
            order their first member was seen.
        max_batch_size: The soft upper bound on batch size, in number of
            addresses. A single Postcode group that reaches or exceeds
            this cap is dispatched whole as its own batch (the cap is
            never used to split a group).

    Returns:
        An iterator over lists of ``UserAddress``. Each list is non-empty.

    Raises:
        ValueError: If ``max_batch_size`` is less than 1.
    """
    # Validate here rather than inside the generator: a generator body does
    # not run until the first ``next()``, which would defer (or entirely
    # hide) the error for a bad argument.
    if max_batch_size < 1:
        raise ValueError("max_batch_size must be >= 1")
    return _iter_batches(addresses, max_batch_size)


def _iter_batches(
    addresses: Iterable[UserAddress],
    max_batch_size: int,
) -> Iterator[list[UserAddress]]:
    """Generate the batches for an already-validated ``max_batch_size``."""
    groups = _group_by_postcode_in_order(addresses)

    buffer: list[UserAddress] = []
    for group in groups.values():
        group_len = len(group)

        # Group fills or exceeds a whole batch on its own: flush the buffer
        # first, then dispatch the group as its own batch. Mirrors the
        # legacy ``if group_len >= batch_size`` branch.
        if group_len >= max_batch_size:
            if buffer:
                yield buffer
                buffer = []
            yield group
            continue

        # Adding this group would overflow: flush buffer before appending.
        # The buffer cannot be empty here, since ``group_len`` alone is
        # below the cap on this path.
        if len(buffer) + group_len > max_batch_size:
            yield buffer
            # Rebind instead of clearing so the list just handed to the
            # consumer is never mutated afterwards.
            buffer = []

        buffer.extend(group)

    # Final flush.
    if buffer:
        yield buffer


def _group_by_postcode_in_order(
    addresses: Iterable[UserAddress],
) -> dict[str, list[UserAddress]]:
    """Group addresses by ``postcode`` preserving first-seen order.

    Python dicts retain insertion order since 3.7, so a plain dict suffices
    for the same effect as pandas ``groupby(..., sort=False)``.

    Args:
        addresses: The addresses to group; the iterable is consumed fully.

    Returns:
        A dict keyed by each address's ``postcode`` attribute, in the order
        each key was first seen. Each value lists that key's addresses in
        input order.
    """
    groups: dict[str, list[UserAddress]] = {}
    for address in addresses:
        groups.setdefault(address.postcode, []).append(address)
    return groups