diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 65297dac..2655beb9 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -7,7 +7,7 @@ from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( UserAddressCsvS3Repository, ) -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList def handler( @@ -32,7 +32,7 @@ def handler( user_address_repo=user_address_repo, ) - list_of_user_address: list[UserAddress] = orchestrator.get_user_address( + list_of_user_address: list[LandlordAssetList] = orchestrator.get_user_address( input_s3_uri=s3_uri ) diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index 44e4d967..d4d04b00 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -2,21 +2,21 @@ from __future__ import annotations from collections.abc import Iterable, Iterator -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode def iter_postcode_grouped_batches( - addresses: Iterable[UserAddress], + addresses: Iterable[LandlordAssetList], *, max_batch_size: int = 500, -) -> Iterator[list[UserAddress]]: +) -> Iterator[list[LandlordAssetList]]: if max_batch_size < 1: raise ValueError("max_batch_size must be >= 1") groups = _group_by_postcode_in_order(addresses) - buffer: list[UserAddress] = [] + buffer: list[LandlordAssetList] = [] for group in groups.values(): group_len = len(group) @@ -43,9 +43,9 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( - addresses: Iterable[UserAddress], -) -> dict[Postcode, list[UserAddress]]: - groups: dict[Postcode, list[UserAddress]] = {} + addresses: Iterable[LandlordAssetList], +) -> dict[Postcode, list[LandlordAssetList]]: + groups: dict[Postcode, list[LandlordAssetList]] = {} for address in addresses: groups.setdefault(address.postcode, []).append(address) return groups diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py index b6deb2e4..c93f46e5 100644 --- a/domain/addresses/user_address.py +++ b/domain/addresses/user_address.py @@ -11,7 +11,7 @@ def _empty_source_row() -> dict[str, str]: @dataclass(frozen=True) -class UserAddress: +class LandlordAssetList: user_address: str postcode: Postcode internal_reference: Optional[str] = None diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py index 7f3c3396..9321994d 100644 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -1,5 +1,5 @@ from repositories.user_address.user_address_repository import UserAddressRepository -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList class LandlordDescriptionOverridesOrchestrator: @@ -9,14 +9,15 @@ class LandlordDescriptionOverridesOrchestrator: def get_user_address( self, input_s3_uri: str, - ) -> list[UserAddress]: + ) -> list[LandlordAssetList]: return self._user_address_repo.load_batch(input_s3_uri) def get_col_to_description_mappings( - self, list_of_user_address: list[UserAddress] + self, list_of_user_address: list[LandlordAssetList] ) -> dict[str, set[str]]: mappings: dict[str, set[str]] = {} for user_address in list_of_user_address: for key, value in user_address.landlord_additional_info.items(): - mappings.setdefault(key, set()).add(value) + # Lower-case so case-only typos collapse to one variant. + mappings.setdefault(key, set()).add(value.lower()) return mappings diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index 0b54d360..612a52ec 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -4,7 +4,7 @@ import uuid from datetime import datetime, timezone from typing import Optional -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_repository import UserAddressRepository @@ -20,14 +20,14 @@ class UserAddressCsvS3Repository(UserAddressRepository): self._csv_client = csv_client self._bucket = bucket - def load_batch(self, s3_uri: str) -> list[UserAddress]: + def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: rows = self._csv_client.read_rows(s3_uri) if rows and _POSTCODE_COLUMN not in rows[0]: raise ValueError( f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; " f"columns present: {sorted(rows[0])}" ) - addresses: list[UserAddress] = [] + addresses: list[LandlordAssetList] = [] for row in rows: parts = [ row[col].strip() @@ -39,7 +39,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() internal_reference: Optional[str] = raw_ref or None addresses.append( - UserAddress( + LandlordAssetList( user_address=user_address, postcode=Postcode(postcode), internal_reference=internal_reference, @@ -48,7 +48,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): ) return addresses - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: + def save_batch(self, addresses: list[LandlordAssetList], path_prefix: str) -> str: rows: list[dict[str, str]] = [ { **addr.landlord_additional_info, diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py index b2c0f866..b89247c5 100644 --- a/repositories/user_address/user_address_repository.py +++ b/repositories/user_address/user_address_repository.py @@ -2,12 +2,14 @@ from __future__ import annotations from abc import ABC, abstractmethod -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList class UserAddressRepository(ABC): @abstractmethod - def load_batch(self, s3_uri: str) -> list[UserAddress]: ... + def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: ... @abstractmethod - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: ... + def save_batch( + self, addresses: list[LandlordAssetList], path_prefix: str + ) -> str: ... diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 8ffcf1b5..82e5ced7 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -1,13 +1,13 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode -def _addrs(postcode: str, n: int) -> list[UserAddress]: +def _addrs(postcode: str, n: int) -> list[LandlordAssetList]: return [ - UserAddress( + LandlordAssetList( user_address=f"{i} {postcode} Street", postcode=Postcode(postcode) ) for i in range(n) @@ -74,9 +74,7 @@ def test_oversize_group_flushes_existing_buffer_first() -> None: big = _addrs("BB2 2BB", 7) tail = _addrs("CC3 3CC", 1) # act - batches = list( - iter_postcode_grouped_batches(small + big + tail, max_batch_size=5) - ) + batches = list(iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)) # assert assert len(batches) == 3 assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"] diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py index 21e5050d..39c52283 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_user_address.py @@ -2,13 +2,13 @@ import dataclasses import pytest -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode def test_user_address_holds_postcode_value_object() -> None: # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) # assert assert addr.postcode == Postcode("SW1A1AA") @@ -17,21 +17,23 @@ def test_user_address_preserves_user_address_verbatim() -> None: # The free-text user_address string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. # act - addr = UserAddress(user_address=" 1 The Street ", postcode=Postcode("SW1A1AA")) + addr = LandlordAssetList( + user_address=" 1 The Street ", postcode=Postcode("SW1A1AA") + ) # assert assert addr.user_address == " 1 The Street " def test_user_address_internal_reference_defaults_to_none() -> None: # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.internal_reference is None def test_user_address_internal_reference_accepted() -> None: # act - addr = UserAddress( + addr = LandlordAssetList( user_address="1 The Street", postcode=Postcode("SW1A1AA"), internal_reference="cust-42", @@ -42,7 +44,7 @@ def test_user_address_internal_reference_accepted() -> None: def test_user_address_is_frozen() -> None: # arrange - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert with pytest.raises(dataclasses.FrozenInstanceError): addr.postcode = Postcode("OTHER") # type: ignore[misc] @@ -52,15 +54,15 @@ def test_user_address_equality_uses_canonical_postcode() -> None: # Postcode sanitises eagerly, so addresses built from different surface # forms of the same postcode compare equal. # arrange - a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) - b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + a = LandlordAssetList(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert assert a == b def test_user_address_source_row_defaults_to_empty_dict() -> None: # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.landlord_additional_info == {} @@ -69,7 +71,7 @@ def test_user_address_carries_source_row() -> None: # arrange row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} # act - addr = UserAddress( + addr = LandlordAssetList( user_address="1 The Street", postcode=Postcode("SW1A 1AA"), landlord_additional_info=row, @@ -82,12 +84,12 @@ def test_user_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays # defined by the parsed fields. # arrange - a = UserAddress( + a = LandlordAssetList( user_address="1 The Street", postcode=Postcode("SW1A1AA"), landlord_additional_info={"x": "1"}, ) - b = UserAddress( + b = LandlordAssetList( user_address="1 The Street", postcode=Postcode("SW1A1AA"), landlord_additional_info={"y": "2"}, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 4f241423..c7197071 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -1,6 +1,6 @@ from __future__ import annotations -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode from orchestration.landlord_description_overrides_orchestrator import ( LandlordDescriptionOverridesOrchestrator, @@ -11,15 +11,15 @@ from repositories.user_address.user_address_repository import UserAddressReposit class _StubUserAddressRepository(UserAddressRepository): """``get_col_to_description_mappings`` never touches the repo.""" - def load_batch(self, s3_uri: str) -> list[UserAddress]: + def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: raise NotImplementedError() - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: + def save_batch(self, addresses: list[LandlordAssetList], path_prefix: str) -> str: raise NotImplementedError() -def _make_user_address(landlord_additional_info: dict[str, str]) -> UserAddress: - return UserAddress( +def _make_user_address(landlord_additional_info: dict[str, str]) -> LandlordAssetList: + return LandlordAssetList( user_address="1 High St", postcode=Postcode("AA1 1AA"), landlord_additional_info=landlord_additional_info, @@ -65,6 +65,21 @@ def test_repeated_values_collapse_to_one_variant() -> None: assert mappings == {"description": {"cosy", "bright"}} +def test_case_only_variants_collapse_to_one() -> None: + # arrange: the same description typed with inconsistent casing. + addresses = [ + _make_user_address({"description": "Cosy"}), + _make_user_address({"description": "cosy"}), + _make_user_address({"description": "COSY"}), + ] + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert: lower-casing folds the casing typos into one variant. + assert mappings == {"description": {"cosy"}} + + def test_empty_address_list_yields_empty_mapping() -> None: # arrange / act mappings = _orchestrator().get_col_to_description_mappings([]) diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py index 0f630923..9d53b35b 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -3,7 +3,7 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( @@ -173,7 +173,7 @@ def test_save_batch_returns_uri_under_path_prefix( ) -> None: # arrange addresses = [ - UserAddress( + LandlordAssetList( user_address="1 High Street", postcode=Postcode("SW1A 1AA"), landlord_additional_info={ @@ -229,7 +229,7 @@ def test_save_batch_uses_unique_filename_per_call( ) -> None: # arrange addresses = [ - UserAddress( + LandlordAssetList( user_address="1 High Street", postcode=Postcode("SW1A 1AA"), landlord_additional_info={