From 61efcad27b5ac309fcc1dd87dddee610fa9f1a1e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 10:13:32 +0000 Subject: [PATCH] standardise Address --- UBIQUITOUS_LANGUAGE.md | 22 ++++++---- applications/SAL/handler.py | 25 ++++++++--- applications/postcode_splitter/handler.py | 8 ++-- domain/addresses/postcode_batching.py | 6 +-- domain/addresses/standardised_address_list.py | 21 +++++++++ ...d_address.py => unstandardised_address.py} | 4 +- .../postcode_splitter_orchestrator.py | 12 ++--- orchestration/sal_orchestrator.py | 20 ++++----- .../__init__.py | 0 ...ardised_address_list_csv_s3_repository.py} | 14 +++--- ...unstandardised_address_list_repository.py} | 4 +- .../addresses/test_postcode_batching.py | 4 +- ...ress.py => test_unstandardised_address.py} | 44 +++++++++---------- ...lord_description_overrides_orchestrator.py | 34 +++++++------- .../test_postcode_splitter_orchestrator.py | 10 ++--- .../__init__.py | 0 .../conftest.py | 0 ...ardised_address_list_csv_s3_repository.py} | 36 +++++++-------- 18 files changed, 151 insertions(+), 113 deletions(-) create mode 100644 domain/addresses/standardised_address_list.py rename domain/addresses/{unsanitised_address.py => unstandardised_address.py} (84%) rename repositories/{unsanitised_address => unstandardised_address}/__init__.py (100%) rename repositories/{unsanitised_address/unsanitised_address_list_csv_s3_repository.py => unstandardised_address/unstandardised_address_list_csv_s3_repository.py} (83%) rename repositories/{unsanitised_address/unsanitised_address_list_repository.py => unstandardised_address/unstandardised_address_list_repository.py} (69%) rename tests/domain/addresses/{test_unsanitised_address.py => test_unstandardised_address.py} (52%) rename tests/repositories/{unsanitised_address => unstandardised_address}/__init__.py (100%) rename tests/repositories/{unsanitised_address => unstandardised_address}/conftest.py (100%) rename 
tests/repositories/{unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py => unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py} (85%) diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md index c3074c02..d2fde99a 100644 --- a/UBIQUITOUS_LANGUAGE.md +++ b/UBIQUITOUS_LANGUAGE.md @@ -23,16 +23,18 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve |------|------------|------------------| | **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" | | **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" | -| **User Address** | A structured dataclass (`domain.addresses.user_address.UserAddress`) capturing a customer-supplied address: a free-text `user_address` line, a canonical `postcode` (sanitised on construction), and an optional `internal_reference`. The bare string sense -- the raw free-text address line as it arrives from upstream ingestion, before being wrapped -- remains valid when discussing CSV columns, API payloads, or other upstream contexts; in domain code, prefer the dataclass. | "user input", "raw address", "user_inputed_address" | +| **Unstandardised Address** | A frozen dataclass (`domain.addresses.unstandardised_address.UnstandardisedAddress`) capturing a single address exactly as a customer supplied it, before any standardisation: a free-text `address` line (intentionally NOT normalised), a canonical `postcode` (a `Postcode` value object, sanitised on construction), an optional `org_reference` (the customer's own identifier for the property), and `additional_info` (the full source row — every column of the customer's upload, preserved verbatim). 
| "user address", "asset list", "raw address", "landlord address", "Hyde address" | +| **Address List** | A nominal `NewType` over `list[UnstandardisedAddress]` (`domain.addresses.unstandardised_address.AddressList`) — a batch of unstandardised addresses, such as one customer's bulk-onboarding upload or a postcode-grouped sub-batch produced for downstream processing. Being nominal, it is constructed explicitly: `AddressList([...])`. It is the raw *input* to ingestion; the standardised *output* is a **Standardised Asset List**. | "asset list", "Hyde address list", "user addresses" | +| **Standardised Asset List (SAL)** | A customer's property portfolio after ingestion has cleaned and standardised it — each property carrying a canonical field set (UPRN, standardised address, postcode, property type, built form, …). It is the standardised *output* of the pipeline whose raw *input* is an **Address List** of **Unstandardised Addresses**; generated by the `SALOrchestrator`. (Legacy implementation: `asset_list.AssetList` via `load_standardised_asset_list`.) | "address list" (that is the raw input), "asset register", "portfolio list" | | **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" | ## Address Matching | Term | Definition | Aliases to avoid | |------|------------|------------------| -| **Lexiscore** | A similarity score in [0, 1] between a user address and a candidate EPC address; combines token overlap and character-level similarity. | "score", "match score", "similarity" | +| **Lexiscore** | A similarity score in [0, 1] between an unstandardised address and a candidate EPC address; combines token overlap and character-level similarity. | "score", "match score", "similarity" | | **Lexirank** | Dense rank of candidates sorted by lexiscore descending; rank 1 = best match. 
| "rank", "position" | -| **UPRN Candidate** | An EPC search result that is a plausible match for a given user address, before scoring decides the winner. | "match candidate", "result" | +| **UPRN Candidate** | An EPC search result that is a plausible match for a given unstandardised address, before scoring decides the winner. | "match candidate", "result" | | **Score Threshold** | The minimum lexiscore (currently 0.6) below which no match is returned even if a candidate exists. | "minimum score", "cutoff" | | **Ambiguous Match** | A matching outcome where two or more candidates share lexirank 1, making it impossible to select a unique winner. | "tie", "draw", "duplicate" | | **Best Match** | The single UPRN candidate with lexirank 1 that meets or exceeds the score threshold. | "winner", "top result" | @@ -53,14 +55,16 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve - A **Dwelling** may have multiple **EPCs** across time; the one with the most recent **Registration Date** is the current one. - A **UPRN** identifies a **Dwelling** permanently; it does not change when the property changes owner. - An **EPC Search Result** is a summary; it points to a full **EPC** via its **Certificate Number**. -- **Address Matching** uses a **User Address** and **Postcode** to find a **UPRN** by scoring **UPRN Candidates** from an EPC search. +- An **Address List** is an ordered batch of **Unstandardised Addresses**; a customer's bulk-onboarding upload arrives as one. +- Ingestion turns an **Address List** (raw input) into a **Standardised Asset List** (standardised output) — the **SAL Orchestrator** drives this. +- **Address Matching** uses an **Unstandardised Address** and **Postcode** to find a **UPRN** by scoring **UPRN Candidates** from an EPC search. - A **Lexirank** of 1 with no **Ambiguous Match** and a **Lexiscore** ≥ the **Score Threshold** produces a **Best Match**. ## Example dialogue -> **Dev:** "We have a user address and postcode. 
How do we find the UPRN?" +> **Dev:** "We have an unstandardised address and postcode. How do we find the UPRN?" -> **Domain expert:** "Search the **New EPC API** by **Postcode** — you get back a list of **EPC Search Results** for that area. Each one has an address and a **UPRN**. Score each against the **User Address** using the **Lexiscore**. If the top **UPRN Candidate** scores above the **Score Threshold** and there's no **Ambiguous Match**, that's your **Best Match**." +> **Domain expert:** "Search the **New EPC API** by **Postcode** — you get back a list of **EPC Search Results** for that area. Each one has an address and a **UPRN**. Score each against the **Unstandardised Address** using the **Lexiscore**. If the top **UPRN Candidate** scores above the **Score Threshold** and there's no **Ambiguous Match**, that's your **Best Match**." > **Dev:** "What if two results share the same address line 1?" @@ -72,7 +76,9 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve ## Flagged ambiguities -- **"address"** appears as both the raw **User Address** (free-text from customer data, or the structured `UserAddress` dataclass that wraps it) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". Within `domain/`, **User Address** specifically means the `UserAddress` dataclass; in upstream ingestion contexts (CSV columns, SQS payloads) it can still mean the raw string sense. +- **"address"** appears in several senses: the **Unstandardised Address** dataclass (one customer-supplied address before standardisation), its free-text `address` field, and the normalised address lines on an **EPC Search Result**. Always qualify: "unstandardised address" vs "EPC address" or "address line 1". 
Within `domain/addresses/`, the dataclass is **Unstandardised Address**; in upstream ingestion contexts (CSV columns, SQS payloads) "address" may still mean the bare free-text string. - **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments. -- **"user_inputed_address"** in `backend/address2UPRN/main.py` is a misspelling and a synonym for **User Address** — the canonical term. New code should use `user_address`. +- **"user_inputed_address"** (and `user_address`) in `backend/address2UPRN/` is legacy naming — a misspelled synonym for what is now the **Unstandardised Address**. That address-matching code has not been renamed; new code should use **Unstandardised Address**. +- **"Hyde address list"** — "Hyde" is the name of one customer, not a domain concept. A domain expert may say "the Hyde address list" because Hyde is the customer in front of them, but the generalised term is **Address List** (and **Unstandardised Address** for a single item). A customer's identity is data — it belongs in `org_reference` or `additional_info`, never in a type or module name. +- **"address list"** vs **"asset list"** — opposite ends of the ingestion pipeline; do not conflate them. An **Address List** is the raw *input* (unstandardised addresses as the customer supplied them); a **Standardised Asset List** is the standardised *output*. The historical `AssetList` dataclass (now **Unstandardised Address**) misnamed the input an "asset list" — that mistake is what the rename corrected. - **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter. 
diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py index fbed3b83..6076a662 100644 --- a/applications/SAL/handler.py +++ b/applications/SAL/handler.py @@ -4,10 +4,10 @@ from orchestration.sal_orchestrator import ( SALOrchestrator, ) from infrastructure.csv_s3_client import CsvS3Client -from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( - UnsanitisedAddressListCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) -from domain.addresses.unsanitised_address import AddressList +from domain.addresses.unstandardised_address import AddressList def handler( @@ -24,16 +24,16 @@ def handler( boto_s3: Any = boto3_client("s3") csv_client = CsvS3Client(boto_s3, bucket) - unsanitised_address_repo = UnsanitisedAddressListCsvS3Repository(csv_client, bucket) + unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(csv_client, bucket) sal = SALOrchestrator( - unsanitised_address_repo=unsanitised_address_repo, + unstandardised_address_repo=unstandardised_address_repo, ) - addressList: AddressList = sal.get_unsanitised_addresses(input_s3_uri=s3_uri) + addressList: AddressList = sal.get_unstandardised_addresses(input_s3_uri=s3_uri) col_to_desc_map = sal.get_col_to_description_mappings( - list_of_unsanitised_address=addressList + list_of_unstandardised_address=addressList ) # Read csv of user input @@ -41,4 +41,15 @@ def handler( # { walls: "wall variation 1", "wall varition 2"} # Call chatgpt(input from landlord, our way of understanding the mapping) Retrun -> lanlordMapped + + # ENUM Walls: + # cavity_wall_1976: 1 + + # 1) Could download site notes from pashub and get + # 2) Open Data Communities API -> + # 3) new api + + # User story: + # cavity: asbuilt (1976 - 1982): + return {"hello world": ["hello world"]} diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py 
index 6614ecda..ac2c4e99 100644 --- a/applications/postcode_splitter/handler.py +++ b/applications/postcode_splitter/handler.py @@ -12,8 +12,8 @@ from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from infrastructure.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator -from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( - UnsanitisedAddressListCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) from utilities.aws_lambda.subtask_handler import subtask_handler @@ -36,12 +36,12 @@ def handler( boto_sqs: Any = boto3_client("sqs") csv_client = CsvS3Client(boto_s3, bucket) - unsanitised_address_repo = UnsanitisedAddressListCsvS3Repository(csv_client, bucket) + unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(csv_client, bucket) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - unsanitised_address_repo=unsanitised_address_repo, + unstandardised_address_repo=unstandardised_address_repo, queue_client=queue_client, ) diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index 18135dbd..ca4cd752 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -2,12 +2,12 @@ from __future__ import annotations from collections.abc import Iterable, Iterator -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode def iter_postcode_grouped_batches( - addresses: Iterable[UnsanitisedAddress], + addresses: Iterable[UnstandardisedAddress], *, max_batch_size: int = 
500, ) -> Iterator[AddressList]: @@ -43,7 +43,7 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( - addresses: Iterable[UnsanitisedAddress], + addresses: Iterable[UnstandardisedAddress], ) -> dict[Postcode, AddressList]: groups: dict[Postcode, AddressList] = {} for address in addresses: diff --git a/domain/addresses/standardised_address_list.py b/domain/addresses/standardised_address_list.py new file mode 100644 index 00000000..8e3f4fc7 --- /dev/null +++ b/domain/addresses/standardised_address_list.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import NewType, Optional + +from domain.postcode import Postcode + + +def _empty_source_row() -> dict[str, str]: + return {} + + +@dataclass(frozen=True) +class StandardisedAddress: + address: str + postcode: Postcode + org_reference: Optional[str] = None + + +# Standardised Asset List -- the cleaned output counterpart to AddressList. +SAL = NewType("SAL", list[StandardisedAddress]) diff --git a/domain/addresses/unsanitised_address.py b/domain/addresses/unstandardised_address.py similarity index 84% rename from domain/addresses/unsanitised_address.py rename to domain/addresses/unstandardised_address.py index a33f0d88..8917bdf4 100644 --- a/domain/addresses/unsanitised_address.py +++ b/domain/addresses/unstandardised_address.py @@ -11,7 +11,7 @@ def _empty_source_row() -> dict[str, str]: @dataclass(frozen=True) -class UnsanitisedAddress: +class UnstandardisedAddress: address: str postcode: Postcode org_reference: Optional[str] = None @@ -21,4 +21,4 @@ class UnsanitisedAddress: # A batch of raw, pre-standardisation addresses as supplied by a landlord. 
-AddressList = NewType("AddressList", list[UnsanitisedAddress]) +AddressList = NewType("AddressList", list[UnstandardisedAddress]) diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py index d8d81c65..1a7277d5 100644 --- a/orchestration/postcode_splitter_orchestrator.py +++ b/orchestration/postcode_splitter_orchestrator.py @@ -5,8 +5,8 @@ from uuid import UUID from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from orchestration.task_orchestrator import TaskOrchestrator from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from repositories.unsanitised_address.unsanitised_address_list_repository import ( - UnsanitisedAddressListRepository, +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, ) @@ -14,12 +14,12 @@ class PostcodeSplitterOrchestrator: def __init__( self, task_orchestrator: TaskOrchestrator, - unsanitised_address_repo: UnsanitisedAddressListRepository, + unstandardised_address_repo: UnstandardisedAddressListRepository, queue_client: Address2UprnQueueClient, max_batch_size: int = 500, ) -> None: self._task_orchestrator = task_orchestrator - self._unsanitised_address_repo = unsanitised_address_repo + self._unstandardised_address_repo = unstandardised_address_repo self._queue_client = queue_client self._max_batch_size = max_batch_size @@ -30,7 +30,7 @@ class PostcodeSplitterOrchestrator: parent_subtask_id: UUID, input_s3_uri: str, ) -> list[UUID]: - addresses = self._unsanitised_address_repo.load_batch(input_s3_uri) + addresses = self._unstandardised_address_repo.load_batch(input_s3_uri) path_prefix = ( f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}" ) @@ -39,7 +39,7 @@ class PostcodeSplitterOrchestrator: for batch in iter_postcode_grouped_batches( addresses, max_batch_size=self._max_batch_size ): - batch_uri = 
self._unsanitised_address_repo.save_batch(batch, path_prefix) + batch_uri = self._unstandardised_address_repo.save_batch(batch, path_prefix) child = self._task_orchestrator.create_child_subtask( parent_task_id, inputs={ diff --git a/orchestration/sal_orchestrator.py b/orchestration/sal_orchestrator.py index 1eb768de..8ad21388 100644 --- a/orchestration/sal_orchestrator.py +++ b/orchestration/sal_orchestrator.py @@ -1,25 +1,25 @@ -from repositories.unsanitised_address.unsanitised_address_list_repository import ( - UnsanitisedAddressListRepository, +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, ) -from domain.addresses.unsanitised_address import AddressList +from domain.addresses.unstandardised_address import AddressList class SALOrchestrator: - def __init__(self, unsanitised_address_repo: UnsanitisedAddressListRepository) -> None: - self._unsanitised_address_repo = unsanitised_address_repo + def __init__(self, unstandardised_address_repo: UnstandardisedAddressListRepository) -> None: + self._unstandardised_address_repo = unstandardised_address_repo - def get_unsanitised_addresses( + def get_unstandardised_addresses( self, input_s3_uri: str, ) -> AddressList: - return self._unsanitised_address_repo.load_batch(input_s3_uri) + return self._unstandardised_address_repo.load_batch(input_s3_uri) def get_col_to_description_mappings( - self, list_of_unsanitised_address: AddressList + self, list_of_unstandardised_address: AddressList ) -> dict[str, set[str]]: mappings: dict[str, set[str]] = {} - for unsanitised_address in list_of_unsanitised_address: - for key, value in unsanitised_address.additional_info.items(): + for unstandardised_address in list_of_unstandardised_address: + for key, value in unstandardised_address.additional_info.items(): # Lower-case so case-only typos collapse to one variant. 
mappings.setdefault(key, set()).add(value.lower()) return mappings diff --git a/repositories/unsanitised_address/__init__.py b/repositories/unstandardised_address/__init__.py similarity index 100% rename from repositories/unsanitised_address/__init__.py rename to repositories/unstandardised_address/__init__.py diff --git a/repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py b/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py similarity index 83% rename from repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py rename to repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py index 6c382df0..260fce1d 100644 --- a/repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py +++ b/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py @@ -4,11 +4,11 @@ import uuid from datetime import datetime, timezone from typing import Optional -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.unsanitised_address.unsanitised_address_list_repository import ( - UnsanitisedAddressListRepository, +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, ) _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") @@ -17,7 +17,7 @@ _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference" _POSTCODE_CLEAN_COLUMN: str = "postcode_clean" -class UnsanitisedAddressListCsvS3Repository(UnsanitisedAddressListRepository): +class UnstandardisedAddressListCsvS3Repository(UnstandardisedAddressListRepository): def __init__(self, csv_client: CsvS3Client, bucket: str) -> None: self._csv_client = csv_client self._bucket = bucket 
@@ -36,13 +36,13 @@ class UnsanitisedAddressListCsvS3Repository(UnsanitisedAddressListRepository): for col in _ADDRESS_COLUMNS if col in row and row[col].strip() ] - unsanitised_address = ", ".join(parts) + unstandardised_address = ", ".join(parts) postcode = row.get(_POSTCODE_COLUMN, "") raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() internal_reference: Optional[str] = raw_ref or None addresses.append( - UnsanitisedAddress( - address=unsanitised_address, + UnstandardisedAddress( + address=unstandardised_address, postcode=Postcode(postcode), org_reference=internal_reference, additional_info=row, diff --git a/repositories/unsanitised_address/unsanitised_address_list_repository.py b/repositories/unstandardised_address/unstandardised_address_list_repository.py similarity index 69% rename from repositories/unsanitised_address/unsanitised_address_list_repository.py rename to repositories/unstandardised_address/unstandardised_address_list_repository.py index 2f842fcd..4d446304 100644 --- a/repositories/unsanitised_address/unsanitised_address_list_repository.py +++ b/repositories/unstandardised_address/unstandardised_address_list_repository.py @@ -2,10 +2,10 @@ from __future__ import annotations from abc import ABC, abstractmethod -from domain.addresses.unsanitised_address import AddressList +from domain.addresses.unstandardised_address import AddressList -class UnsanitisedAddressListRepository(ABC): +class UnstandardisedAddressListRepository(ABC): @abstractmethod def load_batch(self, s3_uri: str) -> AddressList: ... 
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 443e43df..e5b3e186 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -1,14 +1,14 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode def _addrs(postcode: str, n: int) -> AddressList: return AddressList( [ - UnsanitisedAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) + UnstandardisedAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) for i in range(n) ] ) diff --git a/tests/domain/addresses/test_unsanitised_address.py b/tests/domain/addresses/test_unstandardised_address.py similarity index 52% rename from tests/domain/addresses/test_unsanitised_address.py rename to tests/domain/addresses/test_unstandardised_address.py index aa6d0071..dd4eabdb 100644 --- a/tests/domain/addresses/test_unsanitised_address.py +++ b/tests/domain/addresses/test_unstandardised_address.py @@ -2,36 +2,36 @@ import dataclasses import pytest -from domain.addresses.unsanitised_address import UnsanitisedAddress +from domain.addresses.unstandardised_address import UnstandardisedAddress from domain.postcode import Postcode -def test_unsanitised_address_holds_postcode_value_object() -> None: +def test_unstandardised_address_holds_postcode_value_object() -> None: # act - addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) # assert assert addr.postcode == Postcode("SW1A1AA") -def test_unsanitised_address_preserves_unsanitised_address_verbatim() -> None: - # The free-text unsanitised_address string is intentionally NOT 
normalised -- +def test_unstandardised_address_preserves_unstandardised_address_verbatim() -> None: + # The free-text unstandardised_address string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. # act - addr = UnsanitisedAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) + addr = UnstandardisedAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) # assert assert addr.address == " 1 The Street " -def test_unsanitised_address_internal_reference_defaults_to_none() -> None: +def test_unstandardised_address_internal_reference_defaults_to_none() -> None: # act - addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.org_reference is None -def test_unsanitised_address_internal_reference_accepted() -> None: +def test_unstandardised_address_internal_reference_accepted() -> None: # act - addr = UnsanitisedAddress( + addr = UnstandardisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), org_reference="cust-42", @@ -40,36 +40,36 @@ def test_unsanitised_address_internal_reference_accepted() -> None: assert addr.org_reference == "cust-42" -def test_unsanitised_address_is_frozen() -> None: +def test_unstandardised_address_is_frozen() -> None: # arrange - addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert with pytest.raises(dataclasses.FrozenInstanceError): addr.postcode = Postcode("OTHER") # type: ignore[misc] -def test_unsanitised_address_equality_uses_canonical_postcode() -> None: +def test_unstandardised_address_equality_uses_canonical_postcode() -> None: # Postcode sanitises eagerly, so addresses built from different surface # forms of the same postcode compare equal. 
# arrange - a = UnsanitisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) - b = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + a = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert assert a == b -def test_unsanitised_address_source_row_defaults_to_empty_dict() -> None: +def test_unstandardised_address_source_row_defaults_to_empty_dict() -> None: # act - addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.additional_info == {} -def test_unsanitised_address_carries_source_row() -> None: +def test_unstandardised_address_carries_source_row() -> None: # arrange row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} # act - addr = UnsanitisedAddress( + addr = UnstandardisedAddress( address="1 The Street", postcode=Postcode("SW1A 1AA"), additional_info=row, @@ -78,16 +78,16 @@ def test_unsanitised_address_carries_source_row() -> None: assert addr.additional_info == row -def test_unsanitised_address_equality_ignores_source_row() -> None: +def test_unstandardised_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays # defined by the parsed fields. 
# arrange - a = UnsanitisedAddress( + a = UnstandardisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), additional_info={"x": "1"}, ) - b = UnsanitisedAddress( + b = UnstandardisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), additional_info={"y": "2"}, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 7e2c5167..b3658014 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -1,16 +1,16 @@ from __future__ import annotations -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode from orchestration.sal_orchestrator import ( SALOrchestrator, ) -from repositories.unsanitised_address.unsanitised_address_list_repository import ( - UnsanitisedAddressListRepository, +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, ) -class _StubUnsanitisedAddressRepository(UnsanitisedAddressListRepository): +class _StubUnstandardisedAddressRepository(UnstandardisedAddressListRepository): """``get_col_to_description_mappings`` never touches the repo.""" def load_batch(self, s3_uri: str) -> AddressList: @@ -20,8 +20,8 @@ class _StubUnsanitisedAddressRepository(UnsanitisedAddressListRepository): raise NotImplementedError() -def _make_unsanitised_address(landlord_additional_info: dict[str, str]) -> UnsanitisedAddress: - return UnsanitisedAddress( +def _make_unstandardised_address(landlord_additional_info: dict[str, str]) -> UnstandardisedAddress: + return UnstandardisedAddress( address="1 High St", postcode=Postcode("AA1 1AA"), additional_info=landlord_additional_info, @@ -29,16 +29,16 @@ def 
_make_unsanitised_address(landlord_additional_info: dict[str, str]) -> Unsan def _orchestrator() -> SALOrchestrator: - return SALOrchestrator(unsanitised_address_repo=_StubUnsanitisedAddressRepository()) + return SALOrchestrator(unstandardised_address_repo=_StubUnstandardisedAddressRepository()) def test_collects_every_value_per_shared_key() -> None: # arrange: every address carries the same keys, all values distinct. addresses = AddressList( [ - _make_unsanitised_address({"description": "cosy", "condition": "new"}), - _make_unsanitised_address({"description": "spacious", "condition": "worn"}), - _make_unsanitised_address({"description": "bright", "condition": "fair"}), + _make_unstandardised_address({"description": "cosy", "condition": "new"}), + _make_unstandardised_address({"description": "spacious", "condition": "worn"}), + _make_unstandardised_address({"description": "bright", "condition": "fair"}), ] ) @@ -56,9 +56,9 @@ def test_repeated_values_collapse_to_one_variant() -> None: # arrange: two addresses share the same wall description. addresses = AddressList( [ - _make_unsanitised_address({"description": "cosy"}), - _make_unsanitised_address({"description": "cosy"}), - _make_unsanitised_address({"description": "bright"}), + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "bright"}), ] ) @@ -73,9 +73,9 @@ def test_case_only_variants_collapse_to_one() -> None: # arrange: the same description typed with inconsistent casing. 
addresses = AddressList( [ - _make_unsanitised_address({"description": "Cosy"}), - _make_unsanitised_address({"description": "cosy"}), - _make_unsanitised_address({"description": "COSY"}), + _make_unstandardised_address({"description": "Cosy"}), + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "COSY"}), ] ) @@ -96,7 +96,7 @@ def test_empty_address_list_yields_empty_mapping() -> None: def test_single_address_yields_single_value_per_key() -> None: # arrange - addresses = AddressList([_make_unsanitised_address({"description": "cosy"})]) + addresses = AddressList([_make_unstandardised_address({"description": "cosy"})]) # act mappings = _orchestrator().get_col_to_description_mappings(addresses) diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index 4317156c..d21bcfba 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -18,8 +18,8 @@ from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchest from orchestration.task_orchestrator import TaskOrchestrator from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository from repositories.tasks.task_postgres_repository import TaskPostgresRepository -from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( - UnsanitisedAddressListCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) BUCKET = "splitter-bucket" @@ -64,7 +64,7 @@ class Harness: csv_client: CsvS3Client boto_sqs: Any queue_url: str - repo: UnsanitisedAddressListCsvS3Repository + repo: UnstandardisedAddressListCsvS3Repository @pytest.fixture @@ -78,7 +78,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: queue_url = cast(str, queue["QueueUrl"]) csv_client = 
CsvS3Client(boto_s3, BUCKET) - repo = UnsanitisedAddressListCsvS3Repository(csv_client, BUCKET) + repo = UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) # DB: ephemeral PostgreSQL TaskOrchestrator @@ -91,7 +91,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - unsanitised_address_repo=repo, + unstandardised_address_repo=repo, queue_client=queue_client, max_batch_size=3, ) diff --git a/tests/repositories/unsanitised_address/__init__.py b/tests/repositories/unstandardised_address/__init__.py similarity index 100% rename from tests/repositories/unsanitised_address/__init__.py rename to tests/repositories/unstandardised_address/__init__.py diff --git a/tests/repositories/unsanitised_address/conftest.py b/tests/repositories/unstandardised_address/conftest.py similarity index 100% rename from tests/repositories/unsanitised_address/conftest.py rename to tests/repositories/unstandardised_address/conftest.py diff --git a/tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py b/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py similarity index 85% rename from tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py rename to tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py index ff26f08a..866d6f2d 100644 --- a/tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py +++ b/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py @@ -3,11 +3,11 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress 
from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( - UnsanitisedAddressListCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) from tests.infrastructure import make_boto_client @@ -15,22 +15,22 @@ BUCKET = "user-address-bucket" @pytest.fixture -def repo() -> Iterator[UnsanitisedAddressListCsvS3Repository]: +def repo() -> Iterator[UnstandardisedAddressListCsvS3Repository]: with mock_aws(): boto_client = make_boto_client("s3") boto_client.create_bucket(Bucket=BUCKET) csv_client = CsvS3Client(boto_client, BUCKET) - yield UnsanitisedAddressListCsvS3Repository(csv_client, BUCKET) + yield UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET) def _upload_csv( - repo: UnsanitisedAddressListCsvS3Repository, rows: list[dict[str, str]], key: str + repo: UnstandardisedAddressListCsvS3Repository, rows: list[dict[str, str]], key: str ) -> str: return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage] def test_load_batch_parses_address_postcode_and_reference( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -56,7 +56,7 @@ def test_load_batch_parses_address_postcode_and_reference( def test_load_batch_uses_only_address_1_when_others_missing( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -81,7 +81,7 @@ def test_load_batch_uses_only_address_1_when_others_missing( def test_load_batch_handles_missing_internal_reference( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -106,10 +106,10 @@ def test_load_batch_handles_missing_internal_reference( def 
test_load_batch_captures_full_source_row( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # A raw EPC-export-shaped row: the splitter must preserve every column, - # not just the ones it parses into UnsanitisedAddress fields. + # not just the ones it parses into UnstandardisedAddress fields. # arrange row = { "Asset Reference": "511", @@ -128,7 +128,7 @@ def test_load_batch_captures_full_source_row( def test_load_batch_raises_when_postcode_column_absent( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}] @@ -140,7 +140,7 @@ def test_load_batch_raises_when_postcode_column_absent( def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange row = { @@ -169,12 +169,12 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( def test_save_batch_returns_uri_under_path_prefix( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange addresses = AddressList( [ - UnsanitisedAddress( + UnstandardisedAddress( address="1 High Street", postcode=Postcode("SW1A 1AA"), additional_info={ @@ -194,7 +194,7 @@ def test_save_batch_returns_uri_under_path_prefix( def test_save_then_reload_round_trip_preserves_columns( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -227,12 +227,12 @@ def test_save_then_reload_round_trip_preserves_columns( def test_save_batch_uses_unique_filename_per_call( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange addresses = AddressList( [ - UnsanitisedAddress( + UnstandardisedAddress( address="1 High Street", 
postcode=Postcode("SW1A 1AA"), additional_info={