From aea7251107ed1e0136b83e5ba6421b71ab0ee98b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 14:21:50 +0000 Subject: [PATCH 01/29] added files for landlord_overrides --- applications/landlord_overrides/Dockerfile | 34 +++++++++++++++++++ applications/landlord_overrides/handler.py | 9 +++++ .../local_handler/.env.local.example | 5 +++ .../local_handler/docker-compose.yml | 9 +++++ .../local_handler/invoke_local_lambda.py | 16 +++++++++ .../local_handler/run_local.sh | 12 +++++++ .../landlord_overrides/requirements.txt | 4 +++ 7 files changed, 89 insertions(+) create mode 100644 applications/landlord_overrides/Dockerfile create mode 100644 applications/landlord_overrides/handler.py create mode 100644 applications/landlord_overrides/local_handler/.env.local.example create mode 100644 applications/landlord_overrides/local_handler/docker-compose.yml create mode 100755 applications/landlord_overrides/local_handler/invoke_local_lambda.py create mode 100755 applications/landlord_overrides/local_handler/run_local.sh create mode 100644 applications/landlord_overrides/requirements.txt diff --git a/applications/landlord_overrides/Dockerfile b/applications/landlord_overrides/Dockerfile new file mode 100644 index 00000000..ef19f379 --- /dev/null +++ b/applications/landlord_overrides/Dockerfile @@ -0,0 +1,34 @@ +FROM public.ecr.aws/lambda/python:3.11 + +# Postgres host/port/database are baked into the image at build time from +# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets), +# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the +# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT +# baked in -- Terraform injects those as Lambda env vars from Secrets Manager. 
+ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV POSTGRES_HOST=${DEV_DB_HOST} +ENV POSTGRES_PORT=${DEV_DB_PORT} +ENV POSTGRES_DATABASE=${DEV_DB_NAME} + +WORKDIR /var/task + +COPY applications/postcode_splitter/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the layered source the handler imports from. The new splitter pulls +# only DDD-shaped packages — no pandas, no legacy backend/. +COPY domain/ domain/ +COPY infrastructure/ infrastructure/ +COPY orchestration/ orchestration/ +COPY repositories/ repositories/ +COPY utilities/ utilities/ +COPY applications/ applications/ + +# Place the handler at the Lambda task root so the runtime can resolve +# ``main.handler`` without an extra package prefix. +COPY applications/landlord_overrides/handler.py /var/task/main.py + +CMD ["main.handler"] diff --git a/applications/landlord_overrides/handler.py b/applications/landlord_overrides/handler.py new file mode 100644 index 00000000..f998da1d --- /dev/null +++ b/applications/landlord_overrides/handler.py @@ -0,0 +1,9 @@ +from typing import Any + + +def handler( + body: dict[str, Any], + context: Any, +) -> dict[str, list[str]]: + print("hello world") + return {"hello world": ["hello world"]} diff --git a/applications/landlord_overrides/local_handler/.env.local.example b/applications/landlord_overrides/local_handler/.env.local.example new file mode 100644 index 00000000..a78a797f --- /dev/null +++ b/applications/landlord_overrides/local_handler/.env.local.example @@ -0,0 +1,5 @@ +POSTGRES_HOST= +POSTGRES_PORT=5432 +POSTGRES_USERNAME= +POSTGRES_PASSWORD= +POSTGRES_DATABASE= \ No newline at end of file diff --git a/applications/landlord_overrides/local_handler/docker-compose.yml b/applications/landlord_overrides/local_handler/docker-compose.yml new file mode 100644 index 00000000..d217ded6 --- /dev/null +++ b/applications/landlord_overrides/local_handler/docker-compose.yml @@ -0,0 +1,9 @@ +services: + landlord_overrides: + build: + 
context: ../../../ + dockerfile: applications/landlord_overrides/Dockerfile + ports: + - "9002:8080" + env_file: + - .env.local diff --git a/applications/landlord_overrides/local_handler/invoke_local_lambda.py b/applications/landlord_overrides/local_handler/invoke_local_lambda.py new file mode 100755 index 00000000..4514495f --- /dev/null +++ b/applications/landlord_overrides/local_handler/invoke_local_lambda.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9002" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = {"Records": [{"body": json.dumps({})}]} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) diff --git a/applications/landlord_overrides/local_handler/run_local.sh b/applications/landlord_overrides/local_handler/run_local.sh new file mode 100755 index 00000000..345b60ee --- /dev/null +++ b/applications/landlord_overrides/local_handler/run_local.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail +cd "$(dirname "$0")" + +if [ ! -f .env.local ]; then + cp .env.local.example .env.local + echo "Created .env.local from the template — fill it in, then re-run." 
>&2 + exit 1 +fi + +docker compose build --no-cache +docker compose up --force-recreate diff --git a/applications/landlord_overrides/requirements.txt b/applications/landlord_overrides/requirements.txt new file mode 100644 index 00000000..6a85a255 --- /dev/null +++ b/applications/landlord_overrides/requirements.txt @@ -0,0 +1,4 @@ +boto3 +pydantic +sqlmodel +psycopg2-binary From 68809a68c12cd411c0a9b26df39ca95025001f13 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 14:26:05 +0000 Subject: [PATCH 02/29] renamed to landlord description overrides --- .../Dockerfile | 2 +- .../handler.py | 0 .../local_handler/.env.local.example | 0 .../local_handler/docker-compose.yml | 2 +- .../local_handler/invoke_local_lambda.py | 0 .../local_handler/run_local.sh | 0 .../requirements.txt | 0 7 files changed, 2 insertions(+), 2 deletions(-) rename applications/{landlord_overrides => landlord_description_overrides}/Dockerfile (93%) rename applications/{landlord_overrides => landlord_description_overrides}/handler.py (100%) rename applications/{landlord_overrides => landlord_description_overrides}/local_handler/.env.local.example (100%) rename applications/{landlord_overrides => landlord_description_overrides}/local_handler/docker-compose.yml (64%) rename applications/{landlord_overrides => landlord_description_overrides}/local_handler/invoke_local_lambda.py (100%) rename applications/{landlord_overrides => landlord_description_overrides}/local_handler/run_local.sh (100%) rename applications/{landlord_overrides => landlord_description_overrides}/requirements.txt (100%) diff --git a/applications/landlord_overrides/Dockerfile b/applications/landlord_description_overrides/Dockerfile similarity index 93% rename from applications/landlord_overrides/Dockerfile rename to applications/landlord_description_overrides/Dockerfile index ef19f379..e2456b81 100644 --- a/applications/landlord_overrides/Dockerfile +++ b/applications/landlord_description_overrides/Dockerfile @@ -29,6 
+29,6 @@ COPY applications/ applications/ # Place the handler at the Lambda task root so the runtime can resolve # ``main.handler`` without an extra package prefix. -COPY applications/landlord_overrides/handler.py /var/task/main.py +COPY applications/landlord_description_overrides/handler.py /var/task/main.py CMD ["main.handler"] diff --git a/applications/landlord_overrides/handler.py b/applications/landlord_description_overrides/handler.py similarity index 100% rename from applications/landlord_overrides/handler.py rename to applications/landlord_description_overrides/handler.py diff --git a/applications/landlord_overrides/local_handler/.env.local.example b/applications/landlord_description_overrides/local_handler/.env.local.example similarity index 100% rename from applications/landlord_overrides/local_handler/.env.local.example rename to applications/landlord_description_overrides/local_handler/.env.local.example diff --git a/applications/landlord_overrides/local_handler/docker-compose.yml b/applications/landlord_description_overrides/local_handler/docker-compose.yml similarity index 64% rename from applications/landlord_overrides/local_handler/docker-compose.yml rename to applications/landlord_description_overrides/local_handler/docker-compose.yml index d217ded6..6ead2e33 100644 --- a/applications/landlord_overrides/local_handler/docker-compose.yml +++ b/applications/landlord_description_overrides/local_handler/docker-compose.yml @@ -2,7 +2,7 @@ services: landlord_overrides: build: context: ../../../ - dockerfile: applications/landlord_overrides/Dockerfile + dockerfile: applications/landlord_description_overrides/Dockerfile ports: - "9002:8080" env_file: diff --git a/applications/landlord_overrides/local_handler/invoke_local_lambda.py b/applications/landlord_description_overrides/local_handler/invoke_local_lambda.py similarity index 100% rename from applications/landlord_overrides/local_handler/invoke_local_lambda.py rename to 
applications/landlord_description_overrides/local_handler/invoke_local_lambda.py diff --git a/applications/landlord_overrides/local_handler/run_local.sh b/applications/landlord_description_overrides/local_handler/run_local.sh similarity index 100% rename from applications/landlord_overrides/local_handler/run_local.sh rename to applications/landlord_description_overrides/local_handler/run_local.sh diff --git a/applications/landlord_overrides/requirements.txt b/applications/landlord_description_overrides/requirements.txt similarity index 100% rename from applications/landlord_overrides/requirements.txt rename to applications/landlord_description_overrides/requirements.txt From 4830f82b589da75760aafd1f1c878bd02b956f31 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 16:32:15 +0000 Subject: [PATCH 03/29] test: add failing tests for get_col_to_description_mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drive the contract for LandlordDescriptionOverridesOrchestrator. get_col_to_description_mappings: given a list of UserAddress sharing the same landlord_additional_info keys, return each key mapped to the list of values found across all addresses. Tests are red — the method still raises NotImplementedError. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...lord_description_overrides_orchestrator.py | 19 +++++ ...lord_description_overrides_orchestrator.py | 69 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 orchestration/landlord_description_overrides_orchestrator.py create mode 100644 tests/orchestration/test_landlord_description_overrides_orchestrator.py diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py new file mode 100644 index 00000000..fb3fc61b --- /dev/null +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -0,0 +1,19 @@ +from repositories.user_address.user_address_repository import UserAddressRepository +from domain.addresses.user_address import UserAddress + + +class LandlordDescriptionOverridesOrchestrator: + def __init__(self, user_address_repo: UserAddressRepository) -> None: + self._user_address_repo = user_address_repo + + def get_user_address( + self, + input_s3_uri: str, + ) -> list[UserAddress]: + return self._user_address_repo.load_batch(input_s3_uri) + + def get_col_to_description_mappings( + self, list_of_user_address: list[UserAddress] + ) -> dict[str, list[str]]: + + raise NotImplementedError() diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py new file mode 100644 index 00000000..5660bf78 --- /dev/null +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode +from orchestration.landlord_description_overrides_orchestrator import ( + LandlordDescriptionOverridesOrchestrator, +) +from repositories.user_address.user_address_repository import UserAddressRepository + + +class _StubUserAddressRepository(UserAddressRepository): + 
"""``get_col_to_description_mappings`` never touches the repo.""" + + def load_batch(self, s3_uri: str) -> list[UserAddress]: + raise NotImplementedError() + + def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: + raise NotImplementedError() + + +def _make_user_address(landlord_additional_info: dict[str, str]) -> UserAddress: + return UserAddress( + user_address="1 High St", + postcode=Postcode("AA1 1AA"), + landlord_additional_info=landlord_additional_info, + ) + + +def _orchestrator() -> LandlordDescriptionOverridesOrchestrator: + return LandlordDescriptionOverridesOrchestrator( + user_address_repo=_StubUserAddressRepository() + ) + + +def test_collects_every_value_per_shared_key() -> None: + # arrange: every address carries the same keys, all values distinct. + addresses = [ + _make_user_address({"description": "cosy", "condition": "new"}), + _make_user_address({"description": "spacious", "condition": "worn"}), + _make_user_address({"description": "bright", "condition": "fair"}), + ] + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert + assert mappings == { + "description": ["cosy", "spacious", "bright"], + "condition": ["new", "worn", "fair"], + } + + +def test_empty_address_list_yields_empty_mapping() -> None: + # arrange / act + mappings = _orchestrator().get_col_to_description_mappings([]) + + # assert + assert mappings == {} + + +def test_single_address_yields_single_value_per_key() -> None: + # arrange + addresses = [_make_user_address({"description": "cosy"})] + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert + assert mappings == {"description": ["cosy"]} From b14f98788e05b6c4964817be27fc83f35725b4e5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 16:32:50 +0000 Subject: [PATCH 04/29] added landlord orchestration --- .../landlord_description_overrides/handler.py | 38 ++++++++++++++++++- asset_list/app.py | 13 +++---- 
domain/addresses/user_address.py | 4 +- .../user_address_csv_s3_repository.py | 7 +++- tests/domain/addresses/test_user_address.py | 14 +++---- .../test_user_address_csv_s3_repository.py | 20 +++++++--- 6 files changed, 72 insertions(+), 24 deletions(-) diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index f998da1d..003bd4d3 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -1,9 +1,45 @@ from typing import Any +import boto3 +from orchestration.landlord_description_overrides_orchestrator import ( + LandlordDescriptionOverridesOrchestrator, +) +from infrastructure.csv_s3_client import CsvS3Client +from repositories.user_address.user_address_csv_s3_repository import ( + UserAddressCsvS3Repository, +) def handler( body: dict[str, Any], context: Any, ) -> dict[str, list[str]]: - print("hello world") + + s3_uri = "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2 (1).csv" + bucket = "retrofit-data-dev" + + # boto3.client is overloaded per-service in the installed stubs; cast + # to Any so the strict-mode checker treats it as opaque. 
+ boto3_client: Any = ( + boto3.client + ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + boto_s3: Any = boto3_client("s3") + + csv_client = CsvS3Client(boto_s3, bucket) + user_address_repo = UserAddressCsvS3Repository(csv_client, bucket) + + orchestrator = LandlordDescriptionOverridesOrchestrator( + user_address_repo=user_address_repo, + ) + + list_of_user_address = orchestrator.get_user_address(input_s3_uri=s3_uri) + + for each_user_address in list_of_user_address: + print(each_user_address.landlord_additional_info.keys()) + break + + # Read csv of user input + # get the column and unique variations of each description + # { walls: "wall variation 1", "wall varition 2"} + # Call chatgpt(input from landlord, our way of understanding the mapping) Retrun -> lanlordMapped + return {"hello world": ["hello world"]} diff --git a/asset_list/app.py b/asset_list/app.py index 424f4df6..aef410e6 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -79,17 +79,17 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - data_filename = "hyde.xlsx" - sheet_name = "AddressProfilingResults" - postcode_column = "Postcode" - address1_column = "Address" + data_filename = "asset_list (8).xlsx" + sheet_name = "Standardised Asset List" + postcode_column = "postcode" + address1_column = "domna_address_1" address1_method = None - fulladdress_column = "Postcode" + fulladdress_column = "domna_address_1" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = "Property Type" # Good to include if landlord gave + landlord_property_type = "landlord_property_id" # Good to include if landlord gave landlord_built_form = None # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None @@ -468,4 +468,3 @@ def app(): asset_list.duplicated_addresses.to_excel( writer, sheet_name="Duplicate Properties", index=False ) - diff --git 
a/domain/addresses/user_address.py b/domain/addresses/user_address.py index 9a28751b..b6deb2e4 100644 --- a/domain/addresses/user_address.py +++ b/domain/addresses/user_address.py @@ -15,4 +15,6 @@ class UserAddress: user_address: str postcode: Postcode internal_reference: Optional[str] = None - source_row: dict[str, str] = field(default_factory=_empty_source_row, compare=False) + landlord_additional_info: dict[str, str] = field( + default_factory=_empty_source_row, compare=False + ) diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index 058fd5a5..0b54d360 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -43,14 +43,17 @@ class UserAddressCsvS3Repository(UserAddressRepository): user_address=user_address, postcode=Postcode(postcode), internal_reference=internal_reference, - source_row=row, + landlord_additional_info=row, ) ) return addresses def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: rows: list[dict[str, str]] = [ - {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)} + { + **addr.landlord_additional_info, + _POSTCODE_CLEAN_COLUMN: str(addr.postcode), + } for addr in addresses ] diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py index 8d092df3..21e5050d 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_user_address.py @@ -17,9 +17,7 @@ def test_user_address_preserves_user_address_verbatim() -> None: # The free-text user_address string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. 
# act - addr = UserAddress( - user_address=" 1 The Street ", postcode=Postcode("SW1A1AA") - ) + addr = UserAddress(user_address=" 1 The Street ", postcode=Postcode("SW1A1AA")) # assert assert addr.user_address == " 1 The Street " @@ -64,7 +62,7 @@ def test_user_address_source_row_defaults_to_empty_dict() -> None: # act addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) # assert - assert addr.source_row == {} + assert addr.landlord_additional_info == {} def test_user_address_carries_source_row() -> None: @@ -74,10 +72,10 @@ def test_user_address_carries_source_row() -> None: addr = UserAddress( user_address="1 The Street", postcode=Postcode("SW1A 1AA"), - source_row=row, + landlord_additional_info=row, ) # assert - assert addr.source_row == row + assert addr.landlord_additional_info == row def test_user_address_equality_ignores_source_row() -> None: @@ -87,12 +85,12 @@ def test_user_address_equality_ignores_source_row() -> None: a = UserAddress( user_address="1 The Street", postcode=Postcode("SW1A1AA"), - source_row={"x": "1"}, + landlord_additional_info={"x": "1"}, ) b = UserAddress( user_address="1 The Street", postcode=Postcode("SW1A1AA"), - source_row={"y": "2"}, + landlord_additional_info={"y": "2"}, ) # act / assert assert a == b diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py index 9ffb250a..0f630923 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -124,7 +124,7 @@ def test_load_batch_captures_full_source_row( addresses = repo.load_batch(uri) # assert - assert addresses[0].source_row == row + assert addresses[0].landlord_additional_info == row def test_load_batch_raises_when_postcode_column_absent( @@ -154,7 +154,9 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( # act saved_uri = 
repo.save_batch(addresses, "tasks/passthrough") - saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] + saved_rows = repo._csv_client.read_rows( + saved_uri + ) # pyright: ignore[reportPrivateUsage] # assert assert len(saved_rows) == 1 @@ -174,7 +176,10 @@ def test_save_batch_returns_uri_under_path_prefix( UserAddress( user_address="1 High Street", postcode=Postcode("SW1A 1AA"), - source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, + landlord_additional_info={ + "Address 1": "1 High Street", + "postcode": "SW1A 1AA", + }, ), ] @@ -207,7 +212,9 @@ def test_save_then_reload_round_trip_preserves_columns( # act saved_uri = repo.save_batch(addresses, "tasks/round-trip") - saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] + saved_rows = repo._csv_client.read_rows( + saved_uri + ) # pyright: ignore[reportPrivateUsage] # assert # Original columns come back verbatim; postcode_clean is the only addition. @@ -225,7 +232,10 @@ def test_save_batch_uses_unique_filename_per_call( UserAddress( user_address="1 High Street", postcode=Postcode("SW1A 1AA"), - source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, + landlord_additional_info={ + "Address 1": "1 High Street", + "postcode": "SW1A 1AA", + }, ), ] From c833a3c91b5a1615d418694f779ab4c721d1a3e5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 16:33:54 +0000 Subject: [PATCH 05/29] feat: implement get_col_to_description_mappings Collect, per shared landlord_additional_info key, the list of values across all UserAddress entries. Preserves first-seen key order and input order of values. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../landlord_description_overrides_orchestrator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py index fb3fc61b..0751975a 100644 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -15,5 +15,8 @@ class LandlordDescriptionOverridesOrchestrator: def get_col_to_description_mappings( self, list_of_user_address: list[UserAddress] ) -> dict[str, list[str]]: - - raise NotImplementedError() + mappings: dict[str, list[str]] = {} + for user_address in list_of_user_address: + for key, value in user_address.landlord_additional_info.items(): + mappings.setdefault(key, []).append(value) + return mappings From 8baa4c82aace31092bb9940a8888512589ec7439 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 16:57:14 +0000 Subject: [PATCH 06/29] save correct progress --- .../landlord_description_overrides/handler.py | 11 ++++--- infrastructure/csv_s3_client.py | 31 ++++++++++++++++- ...lord_description_overrides_orchestrator.py | 6 ++-- tests/infrastructure/test_csv_s3_client.py | 33 +++++++++++++++++++ ...lord_description_overrides_orchestrator.py | 21 ++++++++++-- 5 files changed, 91 insertions(+), 11 deletions(-) diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 003bd4d3..65297dac 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -7,6 +7,7 @@ from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( UserAddressCsvS3Repository, ) +from domain.addresses.user_address import UserAddress def handler( @@ -31,11 +32,13 @@ def handler( user_address_repo=user_address_repo, ) 
- list_of_user_address = orchestrator.get_user_address(input_s3_uri=s3_uri) + list_of_user_address: list[UserAddress] = orchestrator.get_user_address( + input_s3_uri=s3_uri + ) - for each_user_address in list_of_user_address: - print(each_user_address.landlord_additional_info.keys()) - break + col_to_desc_map = orchestrator.get_col_to_description_mappings( + list_of_user_address=list_of_user_address + ) # Read csv of user input # get the column and unique variations of each description diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py index 8af8de73..d058ba53 100644 --- a/infrastructure/csv_s3_client.py +++ b/infrastructure/csv_s3_client.py @@ -5,6 +5,30 @@ from infrastructure.s3_client import S3Client from infrastructure.s3_uri import parse_s3_uri +def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]: + """Disambiguate repeated CSV headers by appending an index. + + The first occurrence keeps its name; each later one becomes + ``name_1``, ``name_2`` … so duplicate columns survive as distinct + keys instead of collapsing onto one (last-wins) dict entry. + """ + deduped: list[str] = [] + counts: dict[str, int] = {} + for name in fieldnames: + if name not in counts: + counts[name] = 0 + deduped.append(name) + continue + counts[name] += 1 + candidate = f"{name}_{counts[name]}" + while candidate in counts: + counts[name] += 1 + candidate = f"{name}_{counts[name]}" + counts[candidate] = 0 + deduped.append(candidate) + return deduped + + class CsvS3Client(S3Client): def read_rows(self, s3_uri: str) -> list[dict[str, str]]: bucket, key = parse_s3_uri(s3_uri) @@ -19,7 +43,12 @@ class CsvS3Client(S3Client): # Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8. 
text = raw.decode("cp1252") - reader = csv.DictReader(StringIO(text)) + buffer = StringIO(text) + header = next(csv.reader(buffer), None) + if header is None: + return [] + fieldnames = _dedupe_fieldnames(header) + reader = csv.DictReader(buffer, fieldnames=fieldnames) return [dict(row) for row in reader] def save_rows(self, rows: list[dict[str, str]], key: str) -> str: diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py index 0751975a..7f3c3396 100644 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -14,9 +14,9 @@ class LandlordDescriptionOverridesOrchestrator: def get_col_to_description_mappings( self, list_of_user_address: list[UserAddress] - ) -> dict[str, list[str]]: - mappings: dict[str, list[str]] = {} + ) -> dict[str, set[str]]: + mappings: dict[str, set[str]] = {} for user_address in list_of_user_address: for key, value in user_address.landlord_additional_info.items(): - mappings.setdefault(key, []).append(value) + mappings.setdefault(key, set()).add(value) return mappings diff --git a/tests/infrastructure/test_csv_s3_client.py b/tests/infrastructure/test_csv_s3_client.py index 30e27164..e7ec7eab 100644 --- a/tests/infrastructure/test_csv_s3_client.py +++ b/tests/infrastructure/test_csv_s3_client.py @@ -49,3 +49,36 @@ def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None: # act / assert with pytest.raises(ValueError, match="does not match client bucket"): csv_client.read_rows("s3://other-bucket/uploads/addresses.csv") + + +def test_read_rows_indexes_duplicate_column_names(csv_client: CsvS3Client) -> None: + # arrange: the Hyde export has two columns both headed "Walls" — a + # description and a score. Without disambiguation csv.DictReader would + # collapse them onto one key and the description would be lost. 
+ raw = "Address 1,Walls,Roofs,Walls\n1 High St,Cavity: Filled,Pitched 300mm,9.6\n" + uri = csv_client.put_object("uploads/dup.csv", raw.encode("utf-8")) + + # act + rows = csv_client.read_rows(uri) + + # assert: the first occurrence keeps its name, the second gets an index. + assert rows == [ + { + "Address 1": "1 High St", + "Walls": "Cavity: Filled", + "Roofs": "Pitched 300mm", + "Walls_1": "9.6", + } + ] + + +def test_read_rows_indexes_each_repeat_of_a_column(csv_client: CsvS3Client) -> None: + # arrange: three columns sharing one header. + raw = "Walls,Walls,Walls\nfirst,second,third\n" + uri = csv_client.put_object("uploads/triple.csv", raw.encode("utf-8")) + + # act + rows = csv_client.read_rows(uri) + + # assert + assert rows == [{"Walls": "first", "Walls_1": "second", "Walls_2": "third"}] diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 5660bf78..4f241423 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -45,11 +45,26 @@ def test_collects_every_value_per_shared_key() -> None: # assert assert mappings == { - "description": ["cosy", "spacious", "bright"], - "condition": ["new", "worn", "fair"], + "description": {"cosy", "spacious", "bright"}, + "condition": {"new", "worn", "fair"}, } +def test_repeated_values_collapse_to_one_variant() -> None: + # arrange: two addresses share the same wall description. + addresses = [ + _make_user_address({"description": "cosy"}), + _make_user_address({"description": "cosy"}), + _make_user_address({"description": "bright"}), + ] + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert: a set keeps one entry per distinct variant. 
+ assert mappings == {"description": {"cosy", "bright"}} + + def test_empty_address_list_yields_empty_mapping() -> None: # arrange / act mappings = _orchestrator().get_col_to_description_mappings([]) @@ -66,4 +81,4 @@ def test_single_address_yields_single_value_per_key() -> None: mappings = _orchestrator().get_col_to_description_mappings(addresses) # assert - assert mappings == {"description": ["cosy"]} + assert mappings == {"description": {"cosy"}} From 94cbf5f5166df1dff1030e6788243197036d13e0 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 16:59:57 +0000 Subject: [PATCH 07/29] changed useraddress landlordasset list --- .../landlord_description_overrides/handler.py | 4 +-- domain/addresses/postcode_batching.py | 14 +++++----- domain/addresses/user_address.py | 2 +- ...lord_description_overrides_orchestrator.py | 9 ++++--- .../user_address_csv_s3_repository.py | 10 +++---- .../user_address/user_address_repository.py | 8 +++--- .../addresses/test_postcode_batching.py | 10 +++---- tests/domain/addresses/test_user_address.py | 26 ++++++++++--------- ...lord_description_overrides_orchestrator.py | 25 ++++++++++++++---- .../test_user_address_csv_s3_repository.py | 6 ++--- 10 files changed, 66 insertions(+), 48 deletions(-) diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 65297dac..2655beb9 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -7,7 +7,7 @@ from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( UserAddressCsvS3Repository, ) -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList def handler( @@ -32,7 +32,7 @@ def handler( user_address_repo=user_address_repo, ) - list_of_user_address: list[UserAddress] = orchestrator.get_user_address( + 
list_of_user_address: list[LandlordAssetList] = orchestrator.get_user_address( input_s3_uri=s3_uri ) diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index 44e4d967..d4d04b00 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -2,21 +2,21 @@ from __future__ import annotations from collections.abc import Iterable, Iterator -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode def iter_postcode_grouped_batches( - addresses: Iterable[UserAddress], + addresses: Iterable[LandlordAssetList], *, max_batch_size: int = 500, -) -> Iterator[list[UserAddress]]: +) -> Iterator[list[LandlordAssetList]]: if max_batch_size < 1: raise ValueError("max_batch_size must be >= 1") groups = _group_by_postcode_in_order(addresses) - buffer: list[UserAddress] = [] + buffer: list[LandlordAssetList] = [] for group in groups.values(): group_len = len(group) @@ -43,9 +43,9 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( - addresses: Iterable[UserAddress], -) -> dict[Postcode, list[UserAddress]]: - groups: dict[Postcode, list[UserAddress]] = {} + addresses: Iterable[LandlordAssetList], +) -> dict[Postcode, list[LandlordAssetList]]: + groups: dict[Postcode, list[LandlordAssetList]] = {} for address in addresses: groups.setdefault(address.postcode, []).append(address) return groups diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py index b6deb2e4..c93f46e5 100644 --- a/domain/addresses/user_address.py +++ b/domain/addresses/user_address.py @@ -11,7 +11,7 @@ def _empty_source_row() -> dict[str, str]: @dataclass(frozen=True) -class UserAddress: +class LandlordAssetList: user_address: str postcode: Postcode internal_reference: Optional[str] = None diff --git a/orchestration/landlord_description_overrides_orchestrator.py 
b/orchestration/landlord_description_overrides_orchestrator.py index 7f3c3396..9321994d 100644 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -1,5 +1,5 @@ from repositories.user_address.user_address_repository import UserAddressRepository -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList class LandlordDescriptionOverridesOrchestrator: @@ -9,14 +9,15 @@ class LandlordDescriptionOverridesOrchestrator: def get_user_address( self, input_s3_uri: str, - ) -> list[UserAddress]: + ) -> list[LandlordAssetList]: return self._user_address_repo.load_batch(input_s3_uri) def get_col_to_description_mappings( - self, list_of_user_address: list[UserAddress] + self, list_of_user_address: list[LandlordAssetList] ) -> dict[str, set[str]]: mappings: dict[str, set[str]] = {} for user_address in list_of_user_address: for key, value in user_address.landlord_additional_info.items(): - mappings.setdefault(key, set()).add(value) + # Lower-case so case-only typos collapse to one variant. 
+ mappings.setdefault(key, set()).add(value.lower()) return mappings diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index 0b54d360..612a52ec 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -4,7 +4,7 @@ import uuid from datetime import datetime, timezone from typing import Optional -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_repository import UserAddressRepository @@ -20,14 +20,14 @@ class UserAddressCsvS3Repository(UserAddressRepository): self._csv_client = csv_client self._bucket = bucket - def load_batch(self, s3_uri: str) -> list[UserAddress]: + def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: rows = self._csv_client.read_rows(s3_uri) if rows and _POSTCODE_COLUMN not in rows[0]: raise ValueError( f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; " f"columns present: {sorted(rows[0])}" ) - addresses: list[UserAddress] = [] + addresses: list[LandlordAssetList] = [] for row in rows: parts = [ row[col].strip() @@ -39,7 +39,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() internal_reference: Optional[str] = raw_ref or None addresses.append( - UserAddress( + LandlordAssetList( user_address=user_address, postcode=Postcode(postcode), internal_reference=internal_reference, @@ -48,7 +48,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): ) return addresses - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: + def save_batch(self, addresses: list[LandlordAssetList], path_prefix: str) -> str: rows: list[dict[str, str]] = [ { **addr.landlord_additional_info, diff --git 
a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py index b2c0f866..b89247c5 100644 --- a/repositories/user_address/user_address_repository.py +++ b/repositories/user_address/user_address_repository.py @@ -2,12 +2,14 @@ from __future__ import annotations from abc import ABC, abstractmethod -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList class UserAddressRepository(ABC): @abstractmethod - def load_batch(self, s3_uri: str) -> list[UserAddress]: ... + def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: ... @abstractmethod - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: ... + def save_batch( + self, addresses: list[LandlordAssetList], path_prefix: str + ) -> str: ... diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 8ffcf1b5..82e5ced7 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -1,13 +1,13 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode -def _addrs(postcode: str, n: int) -> list[UserAddress]: +def _addrs(postcode: str, n: int) -> list[LandlordAssetList]: return [ - UserAddress( + LandlordAssetList( user_address=f"{i} {postcode} Street", postcode=Postcode(postcode) ) for i in range(n) @@ -74,9 +74,7 @@ def test_oversize_group_flushes_existing_buffer_first() -> None: big = _addrs("BB2 2BB", 7) tail = _addrs("CC3 3CC", 1) # act - batches = list( - iter_postcode_grouped_batches(small + big + tail, max_batch_size=5) - ) + batches = list(iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)) # assert assert len(batches) == 3 assert [str(a.postcode) for a in 
batches[0]] == ["AA11AA", "AA11AA"] diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py index 21e5050d..39c52283 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_user_address.py @@ -2,13 +2,13 @@ import dataclasses import pytest -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode def test_user_address_holds_postcode_value_object() -> None: # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) # assert assert addr.postcode == Postcode("SW1A1AA") @@ -17,21 +17,23 @@ def test_user_address_preserves_user_address_verbatim() -> None: # The free-text user_address string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. # act - addr = UserAddress(user_address=" 1 The Street ", postcode=Postcode("SW1A1AA")) + addr = LandlordAssetList( + user_address=" 1 The Street ", postcode=Postcode("SW1A1AA") + ) # assert assert addr.user_address == " 1 The Street " def test_user_address_internal_reference_defaults_to_none() -> None: # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.internal_reference is None def test_user_address_internal_reference_accepted() -> None: # act - addr = UserAddress( + addr = LandlordAssetList( user_address="1 The Street", postcode=Postcode("SW1A1AA"), internal_reference="cust-42", @@ -42,7 +44,7 @@ def test_user_address_internal_reference_accepted() -> None: def test_user_address_is_frozen() -> None: # arrange - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = LandlordAssetList(user_address="1 The Street", 
postcode=Postcode("SW1A1AA")) # act / assert with pytest.raises(dataclasses.FrozenInstanceError): addr.postcode = Postcode("OTHER") # type: ignore[misc] @@ -52,15 +54,15 @@ def test_user_address_equality_uses_canonical_postcode() -> None: # Postcode sanitises eagerly, so addresses built from different surface # forms of the same postcode compare equal. # arrange - a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) - b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + a = LandlordAssetList(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert assert a == b def test_user_address_source_row_defaults_to_empty_dict() -> None: # act - addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.landlord_additional_info == {} @@ -69,7 +71,7 @@ def test_user_address_carries_source_row() -> None: # arrange row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} # act - addr = UserAddress( + addr = LandlordAssetList( user_address="1 The Street", postcode=Postcode("SW1A 1AA"), landlord_additional_info=row, @@ -82,12 +84,12 @@ def test_user_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays # defined by the parsed fields. 
# arrange - a = UserAddress( + a = LandlordAssetList( user_address="1 The Street", postcode=Postcode("SW1A1AA"), landlord_additional_info={"x": "1"}, ) - b = UserAddress( + b = LandlordAssetList( user_address="1 The Street", postcode=Postcode("SW1A1AA"), landlord_additional_info={"y": "2"}, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 4f241423..c7197071 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -1,6 +1,6 @@ from __future__ import annotations -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode from orchestration.landlord_description_overrides_orchestrator import ( LandlordDescriptionOverridesOrchestrator, @@ -11,15 +11,15 @@ from repositories.user_address.user_address_repository import UserAddressReposit class _StubUserAddressRepository(UserAddressRepository): """``get_col_to_description_mappings`` never touches the repo.""" - def load_batch(self, s3_uri: str) -> list[UserAddress]: + def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: raise NotImplementedError() - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: + def save_batch(self, addresses: list[LandlordAssetList], path_prefix: str) -> str: raise NotImplementedError() -def _make_user_address(landlord_additional_info: dict[str, str]) -> UserAddress: - return UserAddress( +def _make_user_address(landlord_additional_info: dict[str, str]) -> LandlordAssetList: + return LandlordAssetList( user_address="1 High St", postcode=Postcode("AA1 1AA"), landlord_additional_info=landlord_additional_info, @@ -65,6 +65,21 @@ def test_repeated_values_collapse_to_one_variant() -> None: assert mappings == {"description": {"cosy", "bright"}} +def 
test_case_only_variants_collapse_to_one() -> None: + # arrange: the same description typed with inconsistent casing. + addresses = [ + _make_user_address({"description": "Cosy"}), + _make_user_address({"description": "cosy"}), + _make_user_address({"description": "COSY"}), + ] + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert: lower-casing folds the casing typos into one variant. + assert mappings == {"description": {"cosy"}} + + def test_empty_address_list_yields_empty_mapping() -> None: # arrange / act mappings = _orchestrator().get_col_to_description_mappings([]) diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py index 0f630923..9d53b35b 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -3,7 +3,7 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from domain.addresses.user_address import UserAddress +from domain.addresses.user_address import LandlordAssetList from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( @@ -173,7 +173,7 @@ def test_save_batch_returns_uri_under_path_prefix( ) -> None: # arrange addresses = [ - UserAddress( + LandlordAssetList( user_address="1 High Street", postcode=Postcode("SW1A 1AA"), landlord_additional_info={ @@ -229,7 +229,7 @@ def test_save_batch_uses_unique_filename_per_call( ) -> None: # arrange addresses = [ - UserAddress( + LandlordAssetList( user_address="1 High Street", postcode=Postcode("SW1A 1AA"), landlord_additional_info={ From acb306f7b9dc7a67fc9b3d371df08abdbf471961 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 07:34:50 +0000 Subject: [PATCH 08/29] asset list from landlord --- 
.../landlord_description_overrides/handler.py | 4 +- .../{user_address.py => asset_list.py} | 8 +-- domain/addresses/postcode_batching.py | 14 ++--- ...lord_description_overrides_orchestrator.py | 8 +-- .../user_address_csv_s3_repository.py | 18 +++---- .../user_address/user_address_repository.py | 8 ++- .../addresses/test_postcode_batching.py | 8 ++- tests/domain/addresses/test_user_address.py | 52 +++++++++---------- ...lord_description_overrides_orchestrator.py | 14 ++--- .../test_user_address_csv_s3_repository.py | 28 +++++----- 10 files changed, 78 insertions(+), 84 deletions(-) rename domain/addresses/{user_address.py => asset_list.py} (68%) diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 2655beb9..2691d6d2 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -7,7 +7,7 @@ from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( UserAddressCsvS3Repository, ) -from domain.addresses.user_address import LandlordAssetList +from domain.addresses.user_address import AssetList def handler( @@ -32,7 +32,7 @@ def handler( user_address_repo=user_address_repo, ) - list_of_user_address: list[LandlordAssetList] = orchestrator.get_user_address( + list_of_user_address: list[AssetList] = orchestrator.get_user_address( input_s3_uri=s3_uri ) diff --git a/domain/addresses/user_address.py b/domain/addresses/asset_list.py similarity index 68% rename from domain/addresses/user_address.py rename to domain/addresses/asset_list.py index c93f46e5..1332aa2e 100644 --- a/domain/addresses/user_address.py +++ b/domain/addresses/asset_list.py @@ -11,10 +11,10 @@ def _empty_source_row() -> dict[str, str]: @dataclass(frozen=True) -class LandlordAssetList: - user_address: str +class AssetList: + address: str postcode: Postcode - internal_reference: Optional[str] = None 
- landlord_additional_info: dict[str, str] = field( + org_reference: Optional[str] = None + additional_info: dict[str, str] = field( default_factory=_empty_source_row, compare=False ) diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index d4d04b00..fe63605e 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -2,21 +2,21 @@ from __future__ import annotations from collections.abc import Iterable, Iterator -from domain.addresses.user_address import LandlordAssetList +from domain.addresses.user_address import AssetList from domain.postcode import Postcode def iter_postcode_grouped_batches( - addresses: Iterable[LandlordAssetList], + addresses: Iterable[AssetList], *, max_batch_size: int = 500, -) -> Iterator[list[LandlordAssetList]]: +) -> Iterator[list[AssetList]]: if max_batch_size < 1: raise ValueError("max_batch_size must be >= 1") groups = _group_by_postcode_in_order(addresses) - buffer: list[LandlordAssetList] = [] + buffer: list[AssetList] = [] for group in groups.values(): group_len = len(group) @@ -43,9 +43,9 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( - addresses: Iterable[LandlordAssetList], -) -> dict[Postcode, list[LandlordAssetList]]: - groups: dict[Postcode, list[LandlordAssetList]] = {} + addresses: Iterable[AssetList], +) -> dict[Postcode, list[AssetList]]: + groups: dict[Postcode, list[AssetList]] = {} for address in addresses: groups.setdefault(address.postcode, []).append(address) return groups diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py index 9321994d..18132667 100644 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -1,5 +1,5 @@ from repositories.user_address.user_address_repository import UserAddressRepository -from domain.addresses.user_address import 
LandlordAssetList +from domain.addresses.user_address import AssetList class LandlordDescriptionOverridesOrchestrator: @@ -9,15 +9,15 @@ class LandlordDescriptionOverridesOrchestrator: def get_user_address( self, input_s3_uri: str, - ) -> list[LandlordAssetList]: + ) -> list[AssetList]: return self._user_address_repo.load_batch(input_s3_uri) def get_col_to_description_mappings( - self, list_of_user_address: list[LandlordAssetList] + self, list_of_user_address: list[AssetList] ) -> dict[str, set[str]]: mappings: dict[str, set[str]] = {} for user_address in list_of_user_address: - for key, value in user_address.landlord_additional_info.items(): + for key, value in user_address.additional_info.items(): # Lower-case so case-only typos collapse to one variant. mappings.setdefault(key, set()).add(value.lower()) return mappings diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index 612a52ec..adbbfe3e 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -4,7 +4,7 @@ import uuid from datetime import datetime, timezone from typing import Optional -from domain.addresses.user_address import LandlordAssetList +from domain.addresses.user_address import AssetList from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_repository import UserAddressRepository @@ -20,14 +20,14 @@ class UserAddressCsvS3Repository(UserAddressRepository): self._csv_client = csv_client self._bucket = bucket - def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: + def load_batch(self, s3_uri: str) -> list[AssetList]: rows = self._csv_client.read_rows(s3_uri) if rows and _POSTCODE_COLUMN not in rows[0]: raise ValueError( f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; " f"columns present: {sorted(rows[0])}" ) - addresses: 
list[LandlordAssetList] = [] + addresses: list[AssetList] = [] for row in rows: parts = [ row[col].strip() @@ -39,19 +39,19 @@ class UserAddressCsvS3Repository(UserAddressRepository): raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() internal_reference: Optional[str] = raw_ref or None addresses.append( - LandlordAssetList( - user_address=user_address, + AssetList( + address=user_address, postcode=Postcode(postcode), - internal_reference=internal_reference, - landlord_additional_info=row, + org_reference=internal_reference, + additional_info=row, ) ) return addresses - def save_batch(self, addresses: list[LandlordAssetList], path_prefix: str) -> str: + def save_batch(self, addresses: list[AssetList], path_prefix: str) -> str: rows: list[dict[str, str]] = [ { - **addr.landlord_additional_info, + **addr.additional_info, _POSTCODE_CLEAN_COLUMN: str(addr.postcode), } for addr in addresses diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py index b89247c5..eafd0e1d 100644 --- a/repositories/user_address/user_address_repository.py +++ b/repositories/user_address/user_address_repository.py @@ -2,14 +2,12 @@ from __future__ import annotations from abc import ABC, abstractmethod -from domain.addresses.user_address import LandlordAssetList +from domain.addresses.user_address import AssetList class UserAddressRepository(ABC): @abstractmethod - def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: ... + def load_batch(self, s3_uri: str) -> list[AssetList]: ... @abstractmethod - def save_batch( - self, addresses: list[LandlordAssetList], path_prefix: str - ) -> str: ... + def save_batch(self, addresses: list[AssetList], path_prefix: str) -> str: ... 
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 82e5ced7..4aaeef10 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -1,15 +1,13 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from domain.addresses.user_address import LandlordAssetList +from domain.addresses.user_address import AssetList from domain.postcode import Postcode -def _addrs(postcode: str, n: int) -> list[LandlordAssetList]: +def _addrs(postcode: str, n: int) -> list[AssetList]: return [ - LandlordAssetList( - user_address=f"{i} {postcode} Street", postcode=Postcode(postcode) - ) + AssetList(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) for i in range(n) ] diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py index 39c52283..be065995 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_user_address.py @@ -2,13 +2,13 @@ import dataclasses import pytest -from domain.addresses.user_address import LandlordAssetList +from domain.addresses.user_address import AssetList from domain.postcode import Postcode def test_user_address_holds_postcode_value_object() -> None: # act - addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + addr = AssetList(address="1 The Street", postcode=Postcode("sw1a 1aa")) # assert assert addr.postcode == Postcode("SW1A1AA") @@ -17,34 +17,32 @@ def test_user_address_preserves_user_address_verbatim() -> None: # The free-text user_address string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. 
# act - addr = LandlordAssetList( - user_address=" 1 The Street ", postcode=Postcode("SW1A1AA") - ) + addr = AssetList(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) # assert - assert addr.user_address == " 1 The Street " + assert addr.address == " 1 The Street " def test_user_address_internal_reference_defaults_to_none() -> None: # act - addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = AssetList(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert - assert addr.internal_reference is None + assert addr.org_reference is None def test_user_address_internal_reference_accepted() -> None: # act - addr = LandlordAssetList( - user_address="1 The Street", + addr = AssetList( + address="1 The Street", postcode=Postcode("SW1A1AA"), - internal_reference="cust-42", + org_reference="cust-42", ) # assert - assert addr.internal_reference == "cust-42" + assert addr.org_reference == "cust-42" def test_user_address_is_frozen() -> None: # arrange - addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = AssetList(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert with pytest.raises(dataclasses.FrozenInstanceError): addr.postcode = Postcode("OTHER") # type: ignore[misc] @@ -54,45 +52,45 @@ def test_user_address_equality_uses_canonical_postcode() -> None: # Postcode sanitises eagerly, so addresses built from different surface # forms of the same postcode compare equal. 
# arrange - a = LandlordAssetList(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) - b = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + a = AssetList(address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = AssetList(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert assert a == b def test_user_address_source_row_defaults_to_empty_dict() -> None: # act - addr = LandlordAssetList(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = AssetList(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert - assert addr.landlord_additional_info == {} + assert addr.additional_info == {} def test_user_address_carries_source_row() -> None: # arrange row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} # act - addr = LandlordAssetList( - user_address="1 The Street", + addr = AssetList( + address="1 The Street", postcode=Postcode("SW1A 1AA"), - landlord_additional_info=row, + additional_info=row, ) # assert - assert addr.landlord_additional_info == row + assert addr.additional_info == row def test_user_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays # defined by the parsed fields. 
# arrange - a = LandlordAssetList( - user_address="1 The Street", + a = AssetList( + address="1 The Street", postcode=Postcode("SW1A1AA"), - landlord_additional_info={"x": "1"}, + additional_info={"x": "1"}, ) - b = LandlordAssetList( - user_address="1 The Street", + b = AssetList( + address="1 The Street", postcode=Postcode("SW1A1AA"), - landlord_additional_info={"y": "2"}, + additional_info={"y": "2"}, ) # act / assert assert a == b diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index c7197071..26cf46b4 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -1,6 +1,6 @@ from __future__ import annotations -from domain.addresses.user_address import LandlordAssetList +from domain.addresses.user_address import AssetList from domain.postcode import Postcode from orchestration.landlord_description_overrides_orchestrator import ( LandlordDescriptionOverridesOrchestrator, @@ -11,18 +11,18 @@ from repositories.user_address.user_address_repository import UserAddressReposit class _StubUserAddressRepository(UserAddressRepository): """``get_col_to_description_mappings`` never touches the repo.""" - def load_batch(self, s3_uri: str) -> list[LandlordAssetList]: + def load_batch(self, s3_uri: str) -> list[AssetList]: raise NotImplementedError() - def save_batch(self, addresses: list[LandlordAssetList], path_prefix: str) -> str: + def save_batch(self, addresses: list[AssetList], path_prefix: str) -> str: raise NotImplementedError() -def _make_user_address(landlord_additional_info: dict[str, str]) -> LandlordAssetList: - return LandlordAssetList( - user_address="1 High St", +def _make_user_address(landlord_additional_info: dict[str, str]) -> AssetList: + return AssetList( + address="1 High St", postcode=Postcode("AA1 1AA"), - 
landlord_additional_info=landlord_additional_info, + additional_info=landlord_additional_info, ) diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py index 9d53b35b..dc97f0e3 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -3,7 +3,7 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from domain.addresses.user_address import LandlordAssetList +from domain.addresses.user_address import AssetList from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( @@ -50,9 +50,9 @@ def test_load_batch_parses_address_postcode_and_reference( # assert assert len(addresses) == 1 address = addresses[0] - assert address.user_address == "1 High Street, Flat 2, Townville" + assert address.address == "1 High Street, Flat 2, Townville" assert address.postcode == Postcode("SW1A1AA") - assert address.internal_reference == "REF-001" + assert address.org_reference == "REF-001" def test_load_batch_uses_only_address_1_when_others_missing( @@ -75,9 +75,9 @@ def test_load_batch_uses_only_address_1_when_others_missing( # assert assert len(addresses) == 1 - assert addresses[0].user_address == "10 Cardiff Road" + assert addresses[0].address == "10 Cardiff Road" assert addresses[0].postcode == Postcode("CF101AA") - assert addresses[0].internal_reference == "REF-002" + assert addresses[0].org_reference == "REF-002" def test_load_batch_handles_missing_internal_reference( @@ -100,9 +100,9 @@ def test_load_batch_handles_missing_internal_reference( # assert assert len(addresses) == 1 - assert addresses[0].user_address == "5 Park Lane" + assert addresses[0].address == "5 Park Lane" assert addresses[0].postcode == Postcode("M11AA") - assert 
addresses[0].internal_reference is None + assert addresses[0].org_reference is None def test_load_batch_captures_full_source_row( @@ -124,7 +124,7 @@ def test_load_batch_captures_full_source_row( addresses = repo.load_batch(uri) # assert - assert addresses[0].landlord_additional_info == row + assert addresses[0].additional_info == row def test_load_batch_raises_when_postcode_column_absent( @@ -173,10 +173,10 @@ def test_save_batch_returns_uri_under_path_prefix( ) -> None: # arrange addresses = [ - LandlordAssetList( - user_address="1 High Street", + AssetList( + address="1 High Street", postcode=Postcode("SW1A 1AA"), - landlord_additional_info={ + additional_info={ "Address 1": "1 High Street", "postcode": "SW1A 1AA", }, @@ -229,10 +229,10 @@ def test_save_batch_uses_unique_filename_per_call( ) -> None: # arrange addresses = [ - LandlordAssetList( - user_address="1 High Street", + AssetList( + address="1 High Street", postcode=Postcode("SW1A 1AA"), - landlord_additional_info={ + additional_info={ "Address 1": "1 High Street", "postcode": "SW1A 1AA", }, From cf14a4e3aaf151c6a472b55483855ffc9ca4aca0 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 08:14:46 +0000 Subject: [PATCH 09/29] rename to SAL and AssetList and RawAddresses --- .../landlord_description_overrides/Dockerfile | 34 --------- .../landlord_description_overrides/handler.py | 48 ------------ .../local_handler/.env.local.example | 5 -- .../local_handler/docker-compose.yml | 9 --- .../local_handler/invoke_local_lambda.py | 16 ---- .../local_handler/run_local.sh | 12 --- .../requirements.txt | 4 - applications/postcode_splitter/handler.py | 12 +-- domain/addresses/postcode_batching.py | 20 ++--- .../{asset_list.py => raw_address.py} | 8 +- ...lord_description_overrides_orchestrator.py | 23 ------ .../postcode_splitter_orchestrator.py | 10 +-- .../{user_address => raw_address}/__init__.py | 0 .../raw_address_csv_s3_repository.py} | 18 ++--- .../raw_address/raw_address_repository.py | 13 ++++ 
.../user_address/user_address_repository.py | 13 ---- .../addresses/test_postcode_batching.py | 14 ++-- ...st_user_address.py => test_raw_address.py} | 44 +++++------ ...lord_description_overrides_orchestrator.py | 62 ++++++++------- .../test_postcode_splitter_orchestrator.py | 38 ++++------ .../{user_address => raw_address}/__init__.py | 0 .../{user_address => raw_address}/conftest.py | 0 .../test_raw_address_csv_s3_repository.py} | 76 ++++++++++--------- 23 files changed, 169 insertions(+), 310 deletions(-) delete mode 100644 applications/landlord_description_overrides/Dockerfile delete mode 100644 applications/landlord_description_overrides/handler.py delete mode 100644 applications/landlord_description_overrides/local_handler/.env.local.example delete mode 100644 applications/landlord_description_overrides/local_handler/docker-compose.yml delete mode 100755 applications/landlord_description_overrides/local_handler/invoke_local_lambda.py delete mode 100755 applications/landlord_description_overrides/local_handler/run_local.sh delete mode 100644 applications/landlord_description_overrides/requirements.txt rename domain/addresses/{asset_list.py => raw_address.py} (67%) delete mode 100644 orchestration/landlord_description_overrides_orchestrator.py rename repositories/{user_address => raw_address}/__init__.py (100%) rename repositories/{user_address/user_address_csv_s3_repository.py => raw_address/raw_address_csv_s3_repository.py} (80%) create mode 100644 repositories/raw_address/raw_address_repository.py delete mode 100644 repositories/user_address/user_address_repository.py rename tests/domain/addresses/{test_user_address.py => test_raw_address.py} (55%) rename tests/repositories/{user_address => raw_address}/__init__.py (100%) rename tests/repositories/{user_address => raw_address}/conftest.py (100%) rename tests/repositories/{user_address/test_user_address_csv_s3_repository.py => raw_address/test_raw_address_csv_s3_repository.py} (80%) diff --git 
a/applications/landlord_description_overrides/Dockerfile b/applications/landlord_description_overrides/Dockerfile deleted file mode 100644 index e2456b81..00000000 --- a/applications/landlord_description_overrides/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -FROM public.ecr.aws/lambda/python:3.11 - -# Postgres host/port/database are baked into the image at build time from -# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets), -# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the -# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT -# baked in -- Terraform injects those as Lambda env vars from Secrets Manager. -ARG DEV_DB_HOST -ARG DEV_DB_PORT -ARG DEV_DB_NAME - -ENV POSTGRES_HOST=${DEV_DB_HOST} -ENV POSTGRES_PORT=${DEV_DB_PORT} -ENV POSTGRES_DATABASE=${DEV_DB_NAME} - -WORKDIR /var/task - -COPY applications/postcode_splitter/requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the layered source the handler imports from. The new splitter pulls -# only DDD-shaped packages — no pandas, no legacy backend/. -COPY domain/ domain/ -COPY infrastructure/ infrastructure/ -COPY orchestration/ orchestration/ -COPY repositories/ repositories/ -COPY utilities/ utilities/ -COPY applications/ applications/ - -# Place the handler at the Lambda task root so the runtime can resolve -# ``main.handler`` without an extra package prefix. 
-COPY applications/landlord_description_overrides/handler.py /var/task/main.py - -CMD ["main.handler"] diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py deleted file mode 100644 index 2691d6d2..00000000 --- a/applications/landlord_description_overrides/handler.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Any -import boto3 -from orchestration.landlord_description_overrides_orchestrator import ( - LandlordDescriptionOverridesOrchestrator, -) -from infrastructure.csv_s3_client import CsvS3Client -from repositories.user_address.user_address_csv_s3_repository import ( - UserAddressCsvS3Repository, -) -from domain.addresses.user_address import AssetList - - -def handler( - body: dict[str, Any], - context: Any, -) -> dict[str, list[str]]: - - s3_uri = "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2 (1).csv" - bucket = "retrofit-data-dev" - - # boto3.client is overloaded per-service in the installed stubs; cast - # to Any so the strict-mode checker treats it as opaque. 
- boto3_client: Any = ( - boto3.client - ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] - boto_s3: Any = boto3_client("s3") - - csv_client = CsvS3Client(boto_s3, bucket) - user_address_repo = UserAddressCsvS3Repository(csv_client, bucket) - - orchestrator = LandlordDescriptionOverridesOrchestrator( - user_address_repo=user_address_repo, - ) - - list_of_user_address: list[AssetList] = orchestrator.get_user_address( - input_s3_uri=s3_uri - ) - - col_to_desc_map = orchestrator.get_col_to_description_mappings( - list_of_user_address=list_of_user_address - ) - - # Read csv of user input - # get the column and unique variations of each description - # { walls: "wall variation 1", "wall varition 2"} - # Call chatgpt(input from landlord, our way of understanding the mapping) Retrun -> lanlordMapped - - return {"hello world": ["hello world"]} diff --git a/applications/landlord_description_overrides/local_handler/.env.local.example b/applications/landlord_description_overrides/local_handler/.env.local.example deleted file mode 100644 index a78a797f..00000000 --- a/applications/landlord_description_overrides/local_handler/.env.local.example +++ /dev/null @@ -1,5 +0,0 @@ -POSTGRES_HOST= -POSTGRES_PORT=5432 -POSTGRES_USERNAME= -POSTGRES_PASSWORD= -POSTGRES_DATABASE= \ No newline at end of file diff --git a/applications/landlord_description_overrides/local_handler/docker-compose.yml b/applications/landlord_description_overrides/local_handler/docker-compose.yml deleted file mode 100644 index 6ead2e33..00000000 --- a/applications/landlord_description_overrides/local_handler/docker-compose.yml +++ /dev/null @@ -1,9 +0,0 @@ -services: - landlord_overrides: - build: - context: ../../../ - dockerfile: applications/landlord_description_overrides/Dockerfile - ports: - - "9002:8080" - env_file: - - .env.local diff --git a/applications/landlord_description_overrides/local_handler/invoke_local_lambda.py 
b/applications/landlord_description_overrides/local_handler/invoke_local_lambda.py deleted file mode 100755 index 4514495f..00000000 --- a/applications/landlord_description_overrides/local_handler/invoke_local_lambda.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python3 -import json -import requests - -HOST = "localhost" -PORT = "9002" - -LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" - -payload = {"Records": [{"body": json.dumps({})}]} - -response = requests.post(LAMBDA_URL, json=payload) - -print("Status code:", response.status_code) -print("Response:") -print(response.text) diff --git a/applications/landlord_description_overrides/local_handler/run_local.sh b/applications/landlord_description_overrides/local_handler/run_local.sh deleted file mode 100755 index 345b60ee..00000000 --- a/applications/landlord_description_overrides/local_handler/run_local.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -cd "$(dirname "$0")" - -if [ ! -f .env.local ]; then - cp .env.local.example .env.local - echo "Created .env.local from the template — fill it in, then re-run." 
>&2 - exit 1 -fi - -docker compose build --no-cache -docker compose up --force-recreate diff --git a/applications/landlord_description_overrides/requirements.txt b/applications/landlord_description_overrides/requirements.txt deleted file mode 100644 index 6a85a255..00000000 --- a/applications/landlord_description_overrides/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -boto3 -pydantic -sqlmodel -psycopg2-binary diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py index 9fb3ca6a..1f453858 100644 --- a/applications/postcode_splitter/handler.py +++ b/applications/postcode_splitter/handler.py @@ -12,8 +12,8 @@ from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from infrastructure.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator -from repositories.user_address.user_address_csv_s3_repository import ( - UserAddressCsvS3Repository, +from repositories.raw_address.raw_address_csv_s3_repository import ( + RawAddressCsvS3Repository, ) from utilities.aws_lambda.subtask_handler import subtask_handler @@ -29,17 +29,19 @@ def handler( # boto3.client is overloaded per-service in the installed stubs; cast # to Any so the strict-mode checker treats it as opaque. 
- boto3_client: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + boto3_client: Any = ( + boto3.client + ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] boto_s3: Any = boto3_client("s3") boto_sqs: Any = boto3_client("sqs") csv_client = CsvS3Client(boto_s3, bucket) - user_address_repo = UserAddressCsvS3Repository(csv_client, bucket) + raw_address_repo = RawAddressCsvS3Repository(csv_client, bucket) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - user_address_repo=user_address_repo, + raw_address_repo=raw_address_repo, queue_client=queue_client, ) diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index fe63605e..dd7203b1 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -2,21 +2,21 @@ from __future__ import annotations from collections.abc import Iterable, Iterator -from domain.addresses.user_address import AssetList +from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode def iter_postcode_grouped_batches( - addresses: Iterable[AssetList], + addresses: Iterable[RawAddress], *, max_batch_size: int = 500, -) -> Iterator[list[AssetList]]: +) -> Iterator[AddressList]: if max_batch_size < 1: raise ValueError("max_batch_size must be >= 1") groups = _group_by_postcode_in_order(addresses) - buffer: list[AssetList] = [] + buffer: AddressList = AddressList([]) for group in groups.values(): group_len = len(group) @@ -26,14 +26,14 @@ def iter_postcode_grouped_batches( if group_len >= max_batch_size: if buffer: yield buffer - buffer = [] + buffer = AddressList([]) yield group continue # Adding this group would overflow: flush buffer before appending. 
if len(buffer) + group_len > max_batch_size: yield buffer - buffer = [] + buffer = AddressList([]) buffer.extend(group) @@ -43,9 +43,9 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( - addresses: Iterable[AssetList], -) -> dict[Postcode, list[AssetList]]: - groups: dict[Postcode, list[AssetList]] = {} + addresses: Iterable[RawAddress], +) -> dict[Postcode, AddressList]: + groups: dict[Postcode, AddressList] = {} for address in addresses: - groups.setdefault(address.postcode, []).append(address) + groups.setdefault(address.postcode, AddressList([])).append(address) return groups diff --git a/domain/addresses/asset_list.py b/domain/addresses/raw_address.py similarity index 67% rename from domain/addresses/asset_list.py rename to domain/addresses/raw_address.py index 1332aa2e..f9a2789e 100644 --- a/domain/addresses/asset_list.py +++ b/domain/addresses/raw_address.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Optional +from typing import Optional, NewType from domain.postcode import Postcode @@ -11,10 +11,14 @@ def _empty_source_row() -> dict[str, str]: @dataclass(frozen=True) -class AssetList: +class RawAddress: address: str postcode: Postcode org_reference: Optional[str] = None additional_info: dict[str, str] = field( default_factory=_empty_source_row, compare=False ) + + +# A batch of raw, pre-standardisation addresses as supplied by a landlord. 
+AddressList = NewType("AddressList", list[RawAddress]) diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py deleted file mode 100644 index 18132667..00000000 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ /dev/null @@ -1,23 +0,0 @@ -from repositories.user_address.user_address_repository import UserAddressRepository -from domain.addresses.user_address import AssetList - - -class LandlordDescriptionOverridesOrchestrator: - def __init__(self, user_address_repo: UserAddressRepository) -> None: - self._user_address_repo = user_address_repo - - def get_user_address( - self, - input_s3_uri: str, - ) -> list[AssetList]: - return self._user_address_repo.load_batch(input_s3_uri) - - def get_col_to_description_mappings( - self, list_of_user_address: list[AssetList] - ) -> dict[str, set[str]]: - mappings: dict[str, set[str]] = {} - for user_address in list_of_user_address: - for key, value in user_address.additional_info.items(): - # Lower-case so case-only typos collapse to one variant. 
- mappings.setdefault(key, set()).add(value.lower()) - return mappings diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py index 36f4b515..f7ea520c 100644 --- a/orchestration/postcode_splitter_orchestrator.py +++ b/orchestration/postcode_splitter_orchestrator.py @@ -5,19 +5,19 @@ from uuid import UUID from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from orchestration.task_orchestrator import TaskOrchestrator from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from repositories.user_address.user_address_repository import UserAddressRepository +from repositories.raw_address.raw_address_repository import RawAddressRepository class PostcodeSplitterOrchestrator: def __init__( self, task_orchestrator: TaskOrchestrator, - user_address_repo: UserAddressRepository, + raw_address_repo: RawAddressRepository, queue_client: Address2UprnQueueClient, max_batch_size: int = 500, ) -> None: self._task_orchestrator = task_orchestrator - self._user_address_repo = user_address_repo + self._raw_address_repo = raw_address_repo self._queue_client = queue_client self._max_batch_size = max_batch_size @@ -28,7 +28,7 @@ class PostcodeSplitterOrchestrator: parent_subtask_id: UUID, input_s3_uri: str, ) -> list[UUID]: - addresses = self._user_address_repo.load_batch(input_s3_uri) + addresses = self._raw_address_repo.load_batch(input_s3_uri) path_prefix = ( f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}" ) @@ -37,7 +37,7 @@ class PostcodeSplitterOrchestrator: for batch in iter_postcode_grouped_batches( addresses, max_batch_size=self._max_batch_size ): - batch_uri = self._user_address_repo.save_batch(batch, path_prefix) + batch_uri = self._raw_address_repo.save_batch(batch, path_prefix) child = self._task_orchestrator.create_child_subtask( parent_task_id, inputs={ diff --git a/repositories/user_address/__init__.py b/repositories/raw_address/__init__.py similarity 
index 100% rename from repositories/user_address/__init__.py rename to repositories/raw_address/__init__.py diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/raw_address/raw_address_csv_s3_repository.py similarity index 80% rename from repositories/user_address/user_address_csv_s3_repository.py rename to repositories/raw_address/raw_address_csv_s3_repository.py index adbbfe3e..5b47438d 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/raw_address/raw_address_csv_s3_repository.py @@ -4,10 +4,10 @@ import uuid from datetime import datetime, timezone from typing import Optional -from domain.addresses.user_address import AssetList +from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.user_address.user_address_repository import UserAddressRepository +from repositories.raw_address.raw_address_repository import RawAddressRepository _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") _POSTCODE_COLUMN: str = "postcode" @@ -15,32 +15,32 @@ _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference" _POSTCODE_CLEAN_COLUMN: str = "postcode_clean" -class UserAddressCsvS3Repository(UserAddressRepository): +class RawAddressCsvS3Repository(RawAddressRepository): def __init__(self, csv_client: CsvS3Client, bucket: str) -> None: self._csv_client = csv_client self._bucket = bucket - def load_batch(self, s3_uri: str) -> list[AssetList]: + def load_batch(self, s3_uri: str) -> AddressList: rows = self._csv_client.read_rows(s3_uri) if rows and _POSTCODE_COLUMN not in rows[0]: raise ValueError( f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; " f"columns present: {sorted(rows[0])}" ) - addresses: list[AssetList] = [] + addresses: AddressList = AddressList([]) for row in rows: parts = [ row[col].strip() for col in _ADDRESS_COLUMNS if col in row and 
row[col].strip() ] - user_address = ", ".join(parts) + raw_address = ", ".join(parts) postcode = row.get(_POSTCODE_COLUMN, "") raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() internal_reference: Optional[str] = raw_ref or None addresses.append( - AssetList( - address=user_address, + RawAddress( + address=raw_address, postcode=Postcode(postcode), org_reference=internal_reference, additional_info=row, @@ -48,7 +48,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): ) return addresses - def save_batch(self, addresses: list[AssetList], path_prefix: str) -> str: + def save_batch(self, addresses: AddressList, path_prefix: str) -> str: rows: list[dict[str, str]] = [ { **addr.additional_info, diff --git a/repositories/raw_address/raw_address_repository.py b/repositories/raw_address/raw_address_repository.py new file mode 100644 index 00000000..c79d6c4a --- /dev/null +++ b/repositories/raw_address/raw_address_repository.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from domain.addresses.raw_address import AddressList + + +class RawAddressRepository(ABC): + @abstractmethod + def load_batch(self, s3_uri: str) -> AddressList: ... + + @abstractmethod + def save_batch(self, addresses: AddressList, path_prefix: str) -> str: ... diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py deleted file mode 100644 index eafd0e1d..00000000 --- a/repositories/user_address/user_address_repository.py +++ /dev/null @@ -1,13 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod - -from domain.addresses.user_address import AssetList - - -class UserAddressRepository(ABC): - @abstractmethod - def load_batch(self, s3_uri: str) -> list[AssetList]: ... - - @abstractmethod - def save_batch(self, addresses: list[AssetList], path_prefix: str) -> str: ... 
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 4aaeef10..c7bb2d00 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -1,15 +1,17 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from domain.addresses.user_address import AssetList +from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode -def _addrs(postcode: str, n: int) -> list[AssetList]: - return [ - AssetList(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) - for i in range(n) - ] +def _addrs(postcode: str, n: int) -> AddressList: + return AddressList( + [ + RawAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) + for i in range(n) + ] + ) def test_empty_input_yields_no_batches() -> None: diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_raw_address.py similarity index 55% rename from tests/domain/addresses/test_user_address.py rename to tests/domain/addresses/test_raw_address.py index be065995..0309b45e 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_raw_address.py @@ -2,36 +2,36 @@ import dataclasses import pytest -from domain.addresses.user_address import AssetList +from domain.addresses.raw_address import RawAddress from domain.postcode import Postcode -def test_user_address_holds_postcode_value_object() -> None: +def test_raw_address_holds_postcode_value_object() -> None: # act - addr = AssetList(address="1 The Street", postcode=Postcode("sw1a 1aa")) + addr = RawAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) # assert assert addr.postcode == Postcode("SW1A1AA") -def test_user_address_preserves_user_address_verbatim() -> None: - # The free-text user_address string is intentionally NOT normalised -- +def test_raw_address_preserves_raw_address_verbatim() 
-> None: + # The free-text raw_address string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. # act - addr = AssetList(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) + addr = RawAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) # assert assert addr.address == " 1 The Street " -def test_user_address_internal_reference_defaults_to_none() -> None: +def test_raw_address_internal_reference_defaults_to_none() -> None: # act - addr = AssetList(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = RawAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.org_reference is None -def test_user_address_internal_reference_accepted() -> None: +def test_raw_address_internal_reference_accepted() -> None: # act - addr = AssetList( + addr = RawAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), org_reference="cust-42", @@ -40,36 +40,36 @@ def test_user_address_internal_reference_accepted() -> None: assert addr.org_reference == "cust-42" -def test_user_address_is_frozen() -> None: +def test_raw_address_is_frozen() -> None: # arrange - addr = AssetList(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = RawAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert with pytest.raises(dataclasses.FrozenInstanceError): addr.postcode = Postcode("OTHER") # type: ignore[misc] -def test_user_address_equality_uses_canonical_postcode() -> None: +def test_raw_address_equality_uses_canonical_postcode() -> None: # Postcode sanitises eagerly, so addresses built from different surface # forms of the same postcode compare equal. 
# arrange - a = AssetList(address="1 The Street", postcode=Postcode("sw1a 1aa")) - b = AssetList(address="1 The Street", postcode=Postcode("SW1A1AA")) + a = RawAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = RawAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert assert a == b -def test_user_address_source_row_defaults_to_empty_dict() -> None: +def test_raw_address_source_row_defaults_to_empty_dict() -> None: # act - addr = AssetList(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = RawAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.additional_info == {} -def test_user_address_carries_source_row() -> None: +def test_raw_address_carries_source_row() -> None: # arrange row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} # act - addr = AssetList( + addr = RawAddress( address="1 The Street", postcode=Postcode("SW1A 1AA"), additional_info=row, @@ -78,16 +78,16 @@ def test_user_address_carries_source_row() -> None: assert addr.additional_info == row -def test_user_address_equality_ignores_source_row() -> None: +def test_raw_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays # defined by the parsed fields. 
# arrange - a = AssetList( + a = RawAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), additional_info={"x": "1"}, ) - b = AssetList( + b = RawAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), additional_info={"y": "2"}, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 26cf46b4..58790cc6 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -1,44 +1,44 @@ from __future__ import annotations -from domain.addresses.user_address import AssetList +from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode from orchestration.landlord_description_overrides_orchestrator import ( - LandlordDescriptionOverridesOrchestrator, + SALOrchestrator, ) -from repositories.user_address.user_address_repository import UserAddressRepository +from repositories.raw_address.raw_address_repository import RawAddressRepository -class _StubUserAddressRepository(UserAddressRepository): +class _StubRawAddressRepository(RawAddressRepository): """``get_col_to_description_mappings`` never touches the repo.""" - def load_batch(self, s3_uri: str) -> list[AssetList]: + def load_batch(self, s3_uri: str) -> AddressList: raise NotImplementedError() - def save_batch(self, addresses: list[AssetList], path_prefix: str) -> str: + def save_batch(self, addresses: AddressList, path_prefix: str) -> str: raise NotImplementedError() -def _make_user_address(landlord_additional_info: dict[str, str]) -> AssetList: - return AssetList( +def _make_raw_address(landlord_additional_info: dict[str, str]) -> RawAddress: + return RawAddress( address="1 High St", postcode=Postcode("AA1 1AA"), additional_info=landlord_additional_info, ) -def _orchestrator() -> LandlordDescriptionOverridesOrchestrator: - return 
LandlordDescriptionOverridesOrchestrator( - user_address_repo=_StubUserAddressRepository() - ) +def _orchestrator() -> SALOrchestrator: + return SALOrchestrator(raw_address_repo=_StubRawAddressRepository()) def test_collects_every_value_per_shared_key() -> None: # arrange: every address carries the same keys, all values distinct. - addresses = [ - _make_user_address({"description": "cosy", "condition": "new"}), - _make_user_address({"description": "spacious", "condition": "worn"}), - _make_user_address({"description": "bright", "condition": "fair"}), - ] + addresses = AddressList( + [ + _make_raw_address({"description": "cosy", "condition": "new"}), + _make_raw_address({"description": "spacious", "condition": "worn"}), + _make_raw_address({"description": "bright", "condition": "fair"}), + ] + ) # act mappings = _orchestrator().get_col_to_description_mappings(addresses) @@ -52,11 +52,13 @@ def test_collects_every_value_per_shared_key() -> None: def test_repeated_values_collapse_to_one_variant() -> None: # arrange: two addresses share the same wall description. - addresses = [ - _make_user_address({"description": "cosy"}), - _make_user_address({"description": "cosy"}), - _make_user_address({"description": "bright"}), - ] + addresses = AddressList( + [ + _make_raw_address({"description": "cosy"}), + _make_raw_address({"description": "cosy"}), + _make_raw_address({"description": "bright"}), + ] + ) # act mappings = _orchestrator().get_col_to_description_mappings(addresses) @@ -67,11 +69,13 @@ def test_repeated_values_collapse_to_one_variant() -> None: def test_case_only_variants_collapse_to_one() -> None: # arrange: the same description typed with inconsistent casing. 
- addresses = [ - _make_user_address({"description": "Cosy"}), - _make_user_address({"description": "cosy"}), - _make_user_address({"description": "COSY"}), - ] + addresses = AddressList( + [ + _make_raw_address({"description": "Cosy"}), + _make_raw_address({"description": "cosy"}), + _make_raw_address({"description": "COSY"}), + ] + ) # act mappings = _orchestrator().get_col_to_description_mappings(addresses) @@ -82,7 +86,7 @@ def test_case_only_variants_collapse_to_one() -> None: def test_empty_address_list_yields_empty_mapping() -> None: # arrange / act - mappings = _orchestrator().get_col_to_description_mappings([]) + mappings = _orchestrator().get_col_to_description_mappings(AddressList([])) # assert assert mappings == {} @@ -90,7 +94,7 @@ def test_empty_address_list_yields_empty_mapping() -> None: def test_single_address_yields_single_value_per_key() -> None: # arrange - addresses = [_make_user_address({"description": "cosy"})] + addresses = AddressList([_make_raw_address({"description": "cosy"})]) # act mappings = _orchestrator().get_col_to_description_mappings(addresses) diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index a718ffbc..36039fca 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -18,8 +18,8 @@ from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchest from orchestration.task_orchestrator import TaskOrchestrator from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository from repositories.tasks.task_postgres_repository import TaskPostgresRepository -from repositories.user_address.user_address_csv_s3_repository import ( - UserAddressCsvS3Repository, +from repositories.raw_address.raw_address_csv_s3_repository import ( + RawAddressCsvS3Repository, ) BUCKET = "splitter-bucket" @@ -27,7 +27,9 @@ REGION = "us-east-1" def 
_make_boto_client(service_name: str) -> Any: - factory: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + factory: Any = ( + boto3.client + ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] return factory(service_name, region_name=REGION) @@ -62,7 +64,7 @@ class Harness: csv_client: CsvS3Client boto_sqs: Any queue_url: str - repo: UserAddressCsvS3Repository + repo: RawAddressCsvS3Repository @pytest.fixture @@ -76,7 +78,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: queue_url = cast(str, queue["QueueUrl"]) csv_client = CsvS3Client(boto_s3, BUCKET) - repo = UserAddressCsvS3Repository(csv_client, BUCKET) + repo = RawAddressCsvS3Repository(csv_client, BUCKET) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) # DB: ephemeral PostgreSQL TaskOrchestrator @@ -89,7 +91,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - user_address_repo=repo, + raw_address_repo=repo, queue_client=queue_client, max_batch_size=3, ) @@ -169,10 +171,8 @@ def test_split_and_dispatch_creates_three_children_for_fixture( harness: Harness, ) -> None: # arrange - parent_task, parent_subtask = ( - harness.task_orchestrator.create_task_with_subtask( - task_source="manual:postcode-splitter-int" - ) + parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" ) input_uri = _upload_fixture_csv(harness.csv_client) @@ -197,10 +197,8 @@ def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri( harness: Harness, ) -> None: # arrange - parent_task, parent_subtask = ( - harness.task_orchestrator.create_task_with_subtask( - task_source="manual:postcode-splitter-int" - ) + parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" ) input_uri = _upload_fixture_csv(harness.csv_client) @@ -230,10 
+228,8 @@ def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids( harness: Harness, ) -> None: # arrange - parent_task, parent_subtask = ( - harness.task_orchestrator.create_task_with_subtask( - task_source="manual:postcode-splitter-int" - ) + parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" ) input_uri = _upload_fixture_csv(harness.csv_client) @@ -267,10 +263,8 @@ def test_split_and_dispatch_returns_child_ids_in_dispatch_order( harness: Harness, ) -> None: # arrange - parent_task, parent_subtask = ( - harness.task_orchestrator.create_task_with_subtask( - task_source="manual:postcode-splitter-int" - ) + parent_task, parent_subtask = harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" ) input_uri = _upload_fixture_csv(harness.csv_client) diff --git a/tests/repositories/user_address/__init__.py b/tests/repositories/raw_address/__init__.py similarity index 100% rename from tests/repositories/user_address/__init__.py rename to tests/repositories/raw_address/__init__.py diff --git a/tests/repositories/user_address/conftest.py b/tests/repositories/raw_address/conftest.py similarity index 100% rename from tests/repositories/user_address/conftest.py rename to tests/repositories/raw_address/conftest.py diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/raw_address/test_raw_address_csv_s3_repository.py similarity index 80% rename from tests/repositories/user_address/test_user_address_csv_s3_repository.py rename to tests/repositories/raw_address/test_raw_address_csv_s3_repository.py index dc97f0e3..09fc8fc5 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/raw_address/test_raw_address_csv_s3_repository.py @@ -3,11 +3,11 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from 
domain.addresses.user_address import AssetList +from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.user_address.user_address_csv_s3_repository import ( - UserAddressCsvS3Repository, +from repositories.raw_address.raw_address_csv_s3_repository import ( + RawAddressCsvS3Repository, ) from tests.infrastructure import make_boto_client @@ -15,22 +15,22 @@ BUCKET = "user-address-bucket" @pytest.fixture -def repo() -> Iterator[UserAddressCsvS3Repository]: +def repo() -> Iterator[RawAddressCsvS3Repository]: with mock_aws(): boto_client = make_boto_client("s3") boto_client.create_bucket(Bucket=BUCKET) csv_client = CsvS3Client(boto_client, BUCKET) - yield UserAddressCsvS3Repository(csv_client, BUCKET) + yield RawAddressCsvS3Repository(csv_client, BUCKET) def _upload_csv( - repo: UserAddressCsvS3Repository, rows: list[dict[str, str]], key: str + repo: RawAddressCsvS3Repository, rows: list[dict[str, str]], key: str ) -> str: return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage] def test_load_batch_parses_address_postcode_and_reference( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # arrange rows = [ @@ -56,7 +56,7 @@ def test_load_batch_parses_address_postcode_and_reference( def test_load_batch_uses_only_address_1_when_others_missing( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # arrange rows = [ @@ -81,7 +81,7 @@ def test_load_batch_uses_only_address_1_when_others_missing( def test_load_batch_handles_missing_internal_reference( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # arrange rows = [ @@ -106,10 +106,10 @@ def test_load_batch_handles_missing_internal_reference( def test_load_batch_captures_full_source_row( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # A raw EPC-export-shaped 
row: the splitter must preserve every column, - # not just the ones it parses into UserAddress fields. + # not just the ones it parses into RawAddress fields. # arrange row = { "Asset Reference": "511", @@ -128,7 +128,7 @@ def test_load_batch_captures_full_source_row( def test_load_batch_raises_when_postcode_column_absent( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # arrange rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}] @@ -140,7 +140,7 @@ def test_load_batch_raises_when_postcode_column_absent( def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # arrange row = { @@ -169,19 +169,21 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( def test_save_batch_returns_uri_under_path_prefix( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # arrange - addresses = [ - AssetList( - address="1 High Street", - postcode=Postcode("SW1A 1AA"), - additional_info={ - "Address 1": "1 High Street", - "postcode": "SW1A 1AA", - }, - ), - ] + addresses = AddressList( + [ + RawAddress( + address="1 High Street", + postcode=Postcode("SW1A 1AA"), + additional_info={ + "Address 1": "1 High Street", + "postcode": "SW1A 1AA", + }, + ), + ] + ) # act uri = repo.save_batch(addresses, "tasks/abc/batches") @@ -192,7 +194,7 @@ def test_save_batch_returns_uri_under_path_prefix( def test_save_then_reload_round_trip_preserves_columns( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # arrange rows = [ @@ -225,19 +227,21 @@ def test_save_then_reload_round_trip_preserves_columns( def test_save_batch_uses_unique_filename_per_call( - repo: UserAddressCsvS3Repository, + repo: RawAddressCsvS3Repository, ) -> None: # arrange - addresses = [ - AssetList( - address="1 High Street", - postcode=Postcode("SW1A 1AA"), - additional_info={ - "Address 1": "1 High 
Street", - "postcode": "SW1A 1AA", - }, - ), - ] + addresses = AddressList( + [ + RawAddress( + address="1 High Street", + postcode=Postcode("SW1A 1AA"), + additional_info={ + "Address 1": "1 High Street", + "postcode": "SW1A 1AA", + }, + ), + ] + ) # act uri_1 = repo.save_batch(addresses, "tasks/uniqueness") From 5b677dedbec75d85faa0dad510be4c15d91b7741 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 08:15:11 +0000 Subject: [PATCH 10/29] SAL --- applications/SAL/Dockerfile | 34 ++++++++++++++ applications/SAL/handler.py | 46 +++++++++++++++++++ .../SAL/local_handler/.env.local.example | 5 ++ .../SAL/local_handler/docker-compose.yml | 9 ++++ .../SAL/local_handler/invoke_local_lambda.py | 16 +++++++ applications/SAL/local_handler/run_local.sh | 12 +++++ applications/SAL/requirements.txt | 4 ++ orchestration/sal_orchestrator.py | 23 ++++++++++ 8 files changed, 149 insertions(+) create mode 100644 applications/SAL/Dockerfile create mode 100644 applications/SAL/handler.py create mode 100644 applications/SAL/local_handler/.env.local.example create mode 100644 applications/SAL/local_handler/docker-compose.yml create mode 100755 applications/SAL/local_handler/invoke_local_lambda.py create mode 100755 applications/SAL/local_handler/run_local.sh create mode 100644 applications/SAL/requirements.txt create mode 100644 orchestration/sal_orchestrator.py diff --git a/applications/SAL/Dockerfile b/applications/SAL/Dockerfile new file mode 100644 index 00000000..e2456b81 --- /dev/null +++ b/applications/SAL/Dockerfile @@ -0,0 +1,34 @@ +FROM public.ecr.aws/lambda/python:3.11 + +# Postgres host/port/database are baked into the image at build time from +# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets), +# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the +# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT +# baked in -- Terraform injects those as Lambda env vars from Secrets Manager. 
+ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV POSTGRES_HOST=${DEV_DB_HOST} +ENV POSTGRES_PORT=${DEV_DB_PORT} +ENV POSTGRES_DATABASE=${DEV_DB_NAME} + +WORKDIR /var/task + +COPY applications/postcode_splitter/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the layered source the handler imports from. The new splitter pulls +# only DDD-shaped packages — no pandas, no legacy backend/. +COPY domain/ domain/ +COPY infrastructure/ infrastructure/ +COPY orchestration/ orchestration/ +COPY repositories/ repositories/ +COPY utilities/ utilities/ +COPY applications/ applications/ + +# Place the handler at the Lambda task root so the runtime can resolve +# ``main.handler`` without an extra package prefix. +COPY applications/landlord_description_overrides/handler.py /var/task/main.py + +CMD ["main.handler"] diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py new file mode 100644 index 00000000..73dffd5a --- /dev/null +++ b/applications/SAL/handler.py @@ -0,0 +1,46 @@ +from typing import Any +import boto3 +from orchestration.landlord_description_overrides_orchestrator import ( + SALOrchestrator, +) +from infrastructure.csv_s3_client import CsvS3Client +from repositories.raw_address.raw_address_csv_s3_repository import ( + RawAddressCsvS3Repository, +) +from domain.addresses.raw_address import AddressList + + +def handler( + body: dict[str, Any], + context: Any, +) -> dict[str, list[str]]: + + s3_uri = "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2 (1).csv" + bucket = "retrofit-data-dev" + + # boto3.client is overloaded per-service in the installed stubs; cast + # to Any so the strict-mode checker treats it as opaque. 
+ boto3_client: Any = boto3.client # noqa + boto_s3: Any = boto3_client("s3") + + csv_client = CsvS3Client(boto_s3, bucket) + raw_address_repo = RawAddressCsvS3Repository(csv_client, bucket) + + orchestrator = SALOrchestrator( + raw_address_repo=raw_address_repo, + ) + + list_of_raw_address: AddressList = orchestrator.get_raw_addresses( + input_s3_uri=s3_uri + ) + + col_to_desc_map = orchestrator.get_col_to_description_mappings( + list_of_raw_address=list_of_raw_address + ) + + # Read csv of user input + # get the column and unique variations of each description + # { walls: "wall variation 1", "wall varition 2"} + # Call chatgpt(input from landlord, our way of understanding the mapping) Retrun -> lanlordMapped + + return {"hello world": ["hello world"]} diff --git a/applications/SAL/local_handler/.env.local.example b/applications/SAL/local_handler/.env.local.example new file mode 100644 index 00000000..a78a797f --- /dev/null +++ b/applications/SAL/local_handler/.env.local.example @@ -0,0 +1,5 @@ +POSTGRES_HOST= +POSTGRES_PORT=5432 +POSTGRES_USERNAME= +POSTGRES_PASSWORD= +POSTGRES_DATABASE= \ No newline at end of file diff --git a/applications/SAL/local_handler/docker-compose.yml b/applications/SAL/local_handler/docker-compose.yml new file mode 100644 index 00000000..6ead2e33 --- /dev/null +++ b/applications/SAL/local_handler/docker-compose.yml @@ -0,0 +1,9 @@ +services: + landlord_overrides: + build: + context: ../../../ + dockerfile: applications/landlord_description_overrides/Dockerfile + ports: + - "9002:8080" + env_file: + - .env.local diff --git a/applications/SAL/local_handler/invoke_local_lambda.py b/applications/SAL/local_handler/invoke_local_lambda.py new file mode 100755 index 00000000..4514495f --- /dev/null +++ b/applications/SAL/local_handler/invoke_local_lambda.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9002" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" 
+ +payload = {"Records": [{"body": json.dumps({})}]} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) diff --git a/applications/SAL/local_handler/run_local.sh b/applications/SAL/local_handler/run_local.sh new file mode 100755 index 00000000..345b60ee --- /dev/null +++ b/applications/SAL/local_handler/run_local.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail +cd "$(dirname "$0")" + +if [ ! -f .env.local ]; then + cp .env.local.example .env.local + echo "Created .env.local from the template — fill it in, then re-run." >&2 + exit 1 +fi + +docker compose build --no-cache +docker compose up --force-recreate diff --git a/applications/SAL/requirements.txt b/applications/SAL/requirements.txt new file mode 100644 index 00000000..6a85a255 --- /dev/null +++ b/applications/SAL/requirements.txt @@ -0,0 +1,4 @@ +boto3 +pydantic +sqlmodel +psycopg2-binary diff --git a/orchestration/sal_orchestrator.py b/orchestration/sal_orchestrator.py new file mode 100644 index 00000000..e9584aa1 --- /dev/null +++ b/orchestration/sal_orchestrator.py @@ -0,0 +1,23 @@ +from repositories.raw_address.raw_address_repository import RawAddressRepository +from domain.addresses.raw_address import AddressList + + +class SALOrchestrator: + def __init__(self, raw_address_repo: RawAddressRepository) -> None: + self._raw_address_repo = raw_address_repo + + def get_raw_addresses( + self, + input_s3_uri: str, + ) -> AddressList: + return self._raw_address_repo.load_batch(input_s3_uri) + + def get_col_to_description_mappings( + self, list_of_raw_address: AddressList + ) -> dict[str, set[str]]: + mappings: dict[str, set[str]] = {} + for raw_address in list_of_raw_address: + for key, value in raw_address.additional_info.items(): + # Lower-case so case-only typos collapse to one variant. 
+ mappings.setdefault(key, set()).add(value.lower()) + return mappings From 84098e28ff5937c012a18f72bd7217339f91d33c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 08:17:37 +0000 Subject: [PATCH 11/29] raw address list repo --- applications/SAL/handler.py | 6 ++--- .../postcode_splitter_orchestrator.py | 4 +-- orchestration/sal_orchestrator.py | 4 +-- ... => raw_address_list_csv_s3_repository.py} | 4 +-- ...tory.py => raw_address_list_repository.py} | 2 +- ...lord_description_overrides_orchestrator.py | 6 ++--- .../test_postcode_splitter_orchestrator.py | 6 ++--- .../test_raw_address_csv_s3_repository.py | 26 +++++++++---------- 8 files changed, 29 insertions(+), 29 deletions(-) rename repositories/raw_address/{raw_address_csv_s3_repository.py => raw_address_list_csv_s3_repository.py} (96%) rename repositories/raw_address/{raw_address_repository.py => raw_address_list_repository.py} (89%) diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py index 73dffd5a..c975a039 100644 --- a/applications/SAL/handler.py +++ b/applications/SAL/handler.py @@ -1,11 +1,11 @@ from typing import Any import boto3 -from orchestration.landlord_description_overrides_orchestrator import ( +from orchestration.sal_orchestrator import ( SALOrchestrator, ) from infrastructure.csv_s3_client import CsvS3Client from repositories.raw_address.raw_address_csv_s3_repository import ( - RawAddressCsvS3Repository, + RawAddressListCsvS3Repository, ) from domain.addresses.raw_address import AddressList @@ -24,7 +24,7 @@ def handler( boto_s3: Any = boto3_client("s3") csv_client = CsvS3Client(boto_s3, bucket) - raw_address_repo = RawAddressCsvS3Repository(csv_client, bucket) + raw_address_repo = RawAddressListCsvS3Repository(csv_client, bucket) orchestrator = SALOrchestrator( raw_address_repo=raw_address_repo, diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py index f7ea520c..d1530e9f 100644 --- 
a/orchestration/postcode_splitter_orchestrator.py +++ b/orchestration/postcode_splitter_orchestrator.py @@ -5,14 +5,14 @@ from uuid import UUID from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from orchestration.task_orchestrator import TaskOrchestrator from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from repositories.raw_address.raw_address_repository import RawAddressRepository +from repositories.raw_address.raw_address_repository import RawAddressListRepository class PostcodeSplitterOrchestrator: def __init__( self, task_orchestrator: TaskOrchestrator, - raw_address_repo: RawAddressRepository, + raw_address_repo: RawAddressListRepository, queue_client: Address2UprnQueueClient, max_batch_size: int = 500, ) -> None: diff --git a/orchestration/sal_orchestrator.py b/orchestration/sal_orchestrator.py index e9584aa1..1154befc 100644 --- a/orchestration/sal_orchestrator.py +++ b/orchestration/sal_orchestrator.py @@ -1,9 +1,9 @@ -from repositories.raw_address.raw_address_repository import RawAddressRepository +from repositories.raw_address.raw_address_repository import RawAddressListRepository from domain.addresses.raw_address import AddressList class SALOrchestrator: - def __init__(self, raw_address_repo: RawAddressRepository) -> None: + def __init__(self, raw_address_repo: RawAddressListRepository) -> None: self._raw_address_repo = raw_address_repo def get_raw_addresses( diff --git a/repositories/raw_address/raw_address_csv_s3_repository.py b/repositories/raw_address/raw_address_list_csv_s3_repository.py similarity index 96% rename from repositories/raw_address/raw_address_csv_s3_repository.py rename to repositories/raw_address/raw_address_list_csv_s3_repository.py index 5b47438d..b0c2eec7 100644 --- a/repositories/raw_address/raw_address_csv_s3_repository.py +++ b/repositories/raw_address/raw_address_list_csv_s3_repository.py @@ -7,7 +7,7 @@ from typing import Optional from domain.addresses.raw_address import 
AddressList, RawAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.raw_address.raw_address_repository import RawAddressRepository +from repositories.raw_address.raw_address_repository import RawAddressListRepository _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") _POSTCODE_COLUMN: str = "postcode" @@ -15,7 +15,7 @@ _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference" _POSTCODE_CLEAN_COLUMN: str = "postcode_clean" -class RawAddressCsvS3Repository(RawAddressRepository): +class RawAddressListCsvS3Repository(RawAddressListRepository): def __init__(self, csv_client: CsvS3Client, bucket: str) -> None: self._csv_client = csv_client self._bucket = bucket diff --git a/repositories/raw_address/raw_address_repository.py b/repositories/raw_address/raw_address_list_repository.py similarity index 89% rename from repositories/raw_address/raw_address_repository.py rename to repositories/raw_address/raw_address_list_repository.py index c79d6c4a..8abb96be 100644 --- a/repositories/raw_address/raw_address_repository.py +++ b/repositories/raw_address/raw_address_list_repository.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from domain.addresses.raw_address import AddressList -class RawAddressRepository(ABC): +class RawAddressListRepository(ABC): @abstractmethod def load_batch(self, s3_uri: str) -> AddressList: ... 
diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 58790cc6..bb79df6c 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -2,13 +2,13 @@ from __future__ import annotations from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode -from orchestration.landlord_description_overrides_orchestrator import ( +from orchestration.sal_orchestrator import ( SALOrchestrator, ) -from repositories.raw_address.raw_address_repository import RawAddressRepository +from repositories.raw_address.raw_address_repository import RawAddressListRepository -class _StubRawAddressRepository(RawAddressRepository): +class _StubRawAddressRepository(RawAddressListRepository): """``get_col_to_description_mappings`` never touches the repo.""" def load_batch(self, s3_uri: str) -> AddressList: diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index 36039fca..0ce81781 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -19,7 +19,7 @@ from orchestration.task_orchestrator import TaskOrchestrator from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository from repositories.tasks.task_postgres_repository import TaskPostgresRepository from repositories.raw_address.raw_address_csv_s3_repository import ( - RawAddressCsvS3Repository, + RawAddressListCsvS3Repository, ) BUCKET = "splitter-bucket" @@ -64,7 +64,7 @@ class Harness: csv_client: CsvS3Client boto_sqs: Any queue_url: str - repo: RawAddressCsvS3Repository + repo: RawAddressListCsvS3Repository @pytest.fixture @@ -78,7 +78,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: queue_url = 
cast(str, queue["QueueUrl"]) csv_client = CsvS3Client(boto_s3, BUCKET) - repo = RawAddressCsvS3Repository(csv_client, BUCKET) + repo = RawAddressListCsvS3Repository(csv_client, BUCKET) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) # DB: ephemeral PostgreSQL TaskOrchestrator diff --git a/tests/repositories/raw_address/test_raw_address_csv_s3_repository.py b/tests/repositories/raw_address/test_raw_address_csv_s3_repository.py index 09fc8fc5..99284ec5 100644 --- a/tests/repositories/raw_address/test_raw_address_csv_s3_repository.py +++ b/tests/repositories/raw_address/test_raw_address_csv_s3_repository.py @@ -7,7 +7,7 @@ from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.raw_address.raw_address_csv_s3_repository import ( - RawAddressCsvS3Repository, + RawAddressListCsvS3Repository, ) from tests.infrastructure import make_boto_client @@ -15,22 +15,22 @@ BUCKET = "user-address-bucket" @pytest.fixture -def repo() -> Iterator[RawAddressCsvS3Repository]: +def repo() -> Iterator[RawAddressListCsvS3Repository]: with mock_aws(): boto_client = make_boto_client("s3") boto_client.create_bucket(Bucket=BUCKET) csv_client = CsvS3Client(boto_client, BUCKET) - yield RawAddressCsvS3Repository(csv_client, BUCKET) + yield RawAddressListCsvS3Repository(csv_client, BUCKET) def _upload_csv( - repo: RawAddressCsvS3Repository, rows: list[dict[str, str]], key: str + repo: RawAddressListCsvS3Repository, rows: list[dict[str, str]], key: str ) -> str: return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage] def test_load_batch_parses_address_postcode_and_reference( - repo: RawAddressCsvS3Repository, + repo: RawAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -56,7 +56,7 @@ def test_load_batch_parses_address_postcode_and_reference( def test_load_batch_uses_only_address_1_when_others_missing( - repo: RawAddressCsvS3Repository, 
+ repo: RawAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -81,7 +81,7 @@ def test_load_batch_uses_only_address_1_when_others_missing( def test_load_batch_handles_missing_internal_reference( - repo: RawAddressCsvS3Repository, + repo: RawAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -106,7 +106,7 @@ def test_load_batch_handles_missing_internal_reference( def test_load_batch_captures_full_source_row( - repo: RawAddressCsvS3Repository, + repo: RawAddressListCsvS3Repository, ) -> None: # A raw EPC-export-shaped row: the splitter must preserve every column, # not just the ones it parses into RawAddress fields. @@ -128,7 +128,7 @@ def test_load_batch_captures_full_source_row( def test_load_batch_raises_when_postcode_column_absent( - repo: RawAddressCsvS3Repository, + repo: RawAddressListCsvS3Repository, ) -> None: # arrange rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}] @@ -140,7 +140,7 @@ def test_load_batch_raises_when_postcode_column_absent( def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( - repo: RawAddressCsvS3Repository, + repo: RawAddressListCsvS3Repository, ) -> None: # arrange row = { @@ -169,7 +169,7 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( def test_save_batch_returns_uri_under_path_prefix( - repo: RawAddressCsvS3Repository, + repo: RawAddressListCsvS3Repository, ) -> None: # arrange addresses = AddressList( @@ -194,7 +194,7 @@ def test_save_batch_returns_uri_under_path_prefix( def test_save_then_reload_round_trip_preserves_columns( - repo: RawAddressCsvS3Repository, + repo: RawAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -227,7 +227,7 @@ def test_save_then_reload_round_trip_preserves_columns( def test_save_batch_uses_unique_filename_per_call( - repo: RawAddressCsvS3Repository, + repo: RawAddressListCsvS3Repository, ) -> None: # arrange addresses = AddressList( From 91bb4b6571402b96e2573e8cd194b71c7c16fd18 Mon Sep 17 00:00:00 2001 From: 
Jun-te Kim Date: Fri, 22 May 2026 08:22:13 +0000 Subject: [PATCH 12/29] address list --- applications/SAL/handler.py | 2 +- applications/postcode_splitter/handler.py | 8 ++++---- orchestration/postcode_splitter_orchestrator.py | 4 +++- orchestration/sal_orchestrator.py | 4 +++- .../raw_address/raw_address_list_csv_s3_repository.py | 4 +++- .../test_landlord_description_overrides_orchestrator.py | 4 +++- .../orchestration/test_postcode_splitter_orchestrator.py | 2 +- ...tory.py => test_raw_address_list_csv_s3_repository.py} | 2 +- 8 files changed, 19 insertions(+), 11 deletions(-) rename tests/repositories/raw_address/{test_raw_address_csv_s3_repository.py => test_raw_address_list_csv_s3_repository.py} (98%) diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py index c975a039..69f4c04d 100644 --- a/applications/SAL/handler.py +++ b/applications/SAL/handler.py @@ -4,7 +4,7 @@ from orchestration.sal_orchestrator import ( SALOrchestrator, ) from infrastructure.csv_s3_client import CsvS3Client -from repositories.raw_address.raw_address_csv_s3_repository import ( +from repositories.raw_address.raw_address_list_csv_s3_repository import ( RawAddressListCsvS3Repository, ) from domain.addresses.raw_address import AddressList diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py index 1f453858..071ff6f9 100644 --- a/applications/postcode_splitter/handler.py +++ b/applications/postcode_splitter/handler.py @@ -12,8 +12,8 @@ from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from infrastructure.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator -from repositories.raw_address.raw_address_csv_s3_repository import ( - RawAddressCsvS3Repository, +from repositories.raw_address.raw_address_list_csv_s3_repository import ( + RawAddressListCsvS3Repository, ) from 
utilities.aws_lambda.subtask_handler import subtask_handler @@ -36,12 +36,12 @@ def handler( boto_sqs: Any = boto3_client("sqs") csv_client = CsvS3Client(boto_s3, bucket) - raw_address_repo = RawAddressCsvS3Repository(csv_client, bucket) + user_address_repo = RawAddressListCsvS3Repository(csv_client, bucket) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - raw_address_repo=raw_address_repo, + user_address_repo=user_address_repo, queue_client=queue_client, ) diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py index d1530e9f..20145524 100644 --- a/orchestration/postcode_splitter_orchestrator.py +++ b/orchestration/postcode_splitter_orchestrator.py @@ -5,7 +5,9 @@ from uuid import UUID from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from orchestration.task_orchestrator import TaskOrchestrator from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from repositories.raw_address.raw_address_repository import RawAddressListRepository +from repositories.raw_address.raw_address_list_repository import ( + RawAddressListRepository, +) class PostcodeSplitterOrchestrator: diff --git a/orchestration/sal_orchestrator.py b/orchestration/sal_orchestrator.py index 1154befc..f55947e7 100644 --- a/orchestration/sal_orchestrator.py +++ b/orchestration/sal_orchestrator.py @@ -1,4 +1,6 @@ -from repositories.raw_address.raw_address_repository import RawAddressListRepository +from repositories.raw_address.raw_address_list_repository import ( + RawAddressListRepository, +) from domain.addresses.raw_address import AddressList diff --git a/repositories/raw_address/raw_address_list_csv_s3_repository.py b/repositories/raw_address/raw_address_list_csv_s3_repository.py index b0c2eec7..a636b17b 100644 --- a/repositories/raw_address/raw_address_list_csv_s3_repository.py +++ 
b/repositories/raw_address/raw_address_list_csv_s3_repository.py @@ -7,7 +7,9 @@ from typing import Optional from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.raw_address.raw_address_repository import RawAddressListRepository +from repositories.raw_address.raw_address_list_repository import ( + RawAddressListRepository, +) _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") _POSTCODE_COLUMN: str = "postcode" diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index bb79df6c..133d5b39 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -5,7 +5,9 @@ from domain.postcode import Postcode from orchestration.sal_orchestrator import ( SALOrchestrator, ) -from repositories.raw_address.raw_address_repository import RawAddressListRepository +from repositories.raw_address.raw_address_list_repository import ( + RawAddressListRepository, +) class _StubRawAddressRepository(RawAddressListRepository): diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index 0ce81781..1540112f 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -18,7 +18,7 @@ from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchest from orchestration.task_orchestrator import TaskOrchestrator from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository from repositories.tasks.task_postgres_repository import TaskPostgresRepository -from repositories.raw_address.raw_address_csv_s3_repository import ( +from 
repositories.raw_address.raw_address_list_csv_s3_repository import ( RawAddressListCsvS3Repository, ) diff --git a/tests/repositories/raw_address/test_raw_address_csv_s3_repository.py b/tests/repositories/raw_address/test_raw_address_list_csv_s3_repository.py similarity index 98% rename from tests/repositories/raw_address/test_raw_address_csv_s3_repository.py rename to tests/repositories/raw_address/test_raw_address_list_csv_s3_repository.py index 99284ec5..8870b29a 100644 --- a/tests/repositories/raw_address/test_raw_address_csv_s3_repository.py +++ b/tests/repositories/raw_address/test_raw_address_list_csv_s3_repository.py @@ -6,7 +6,7 @@ from moto import mock_aws from domain.addresses.raw_address import AddressList, RawAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.raw_address.raw_address_csv_s3_repository import ( +from repositories.raw_address.raw_address_list_csv_s3_repository import ( RawAddressListCsvS3Repository, ) from tests.infrastructure import make_boto_client From 0dee917094057da947dd0ff3ec9b28833d48cd9b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 08:27:59 +0000 Subject: [PATCH 13/29] unsanistiesed address list instead of raw address lit --- applications/SAL/handler.py | 20 ++++----- applications/postcode_splitter/handler.py | 8 ++-- domain/addresses/postcode_batching.py | 6 +-- ...{raw_address.py => unsanitised_address.py} | 4 +- .../postcode_splitter_orchestrator.py | 12 ++--- orchestration/sal_orchestrator.py | 20 ++++----- .../__init__.py | 0 ...nitised_address_list_csv_s3_repository.py} | 14 +++--- .../unsanitised_address_list_repository.py} | 4 +- .../addresses/test_postcode_batching.py | 4 +- ...address.py => test_unsanitised_address.py} | 44 +++++++++---------- ...lord_description_overrides_orchestrator.py | 34 +++++++------- .../test_postcode_splitter_orchestrator.py | 10 ++--- .../__init__.py | 0 .../conftest.py | 0 
...nitised_address_list_csv_s3_repository.py} | 36 +++++++-------- 16 files changed, 107 insertions(+), 109 deletions(-) rename domain/addresses/{raw_address.py => unsanitised_address.py} (84%) rename repositories/{raw_address => unsanitised_address}/__init__.py (100%) rename repositories/{raw_address/raw_address_list_csv_s3_repository.py => unsanitised_address/unsanitised_address_list_csv_s3_repository.py} (84%) rename repositories/{raw_address/raw_address_list_repository.py => unsanitised_address/unsanitised_address_list_repository.py} (70%) rename tests/domain/addresses/{test_raw_address.py => test_unsanitised_address.py} (51%) rename tests/repositories/{raw_address => unsanitised_address}/__init__.py (100%) rename tests/repositories/{raw_address => unsanitised_address}/conftest.py (100%) rename tests/repositories/{raw_address/test_raw_address_list_csv_s3_repository.py => unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py} (86%) diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py index 69f4c04d..fbed3b83 100644 --- a/applications/SAL/handler.py +++ b/applications/SAL/handler.py @@ -4,10 +4,10 @@ from orchestration.sal_orchestrator import ( SALOrchestrator, ) from infrastructure.csv_s3_client import CsvS3Client -from repositories.raw_address.raw_address_list_csv_s3_repository import ( - RawAddressListCsvS3Repository, +from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( + UnsanitisedAddressListCsvS3Repository, ) -from domain.addresses.raw_address import AddressList +from domain.addresses.unsanitised_address import AddressList def handler( @@ -24,18 +24,16 @@ def handler( boto_s3: Any = boto3_client("s3") csv_client = CsvS3Client(boto_s3, bucket) - raw_address_repo = RawAddressListCsvS3Repository(csv_client, bucket) + unsanitised_address_repo = UnsanitisedAddressListCsvS3Repository(csv_client, bucket) - orchestrator = SALOrchestrator( - raw_address_repo=raw_address_repo, + sal = 
SALOrchestrator( + unsanitised_address_repo=unsanitised_address_repo, ) - list_of_raw_address: AddressList = orchestrator.get_raw_addresses( - input_s3_uri=s3_uri - ) + addressList: AddressList = sal.get_unsanitised_addresses(input_s3_uri=s3_uri) - col_to_desc_map = orchestrator.get_col_to_description_mappings( - list_of_raw_address=list_of_raw_address + col_to_desc_map = sal.get_col_to_description_mappings( + list_of_unsanitised_address=addressList ) # Read csv of user input diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py index 071ff6f9..6614ecda 100644 --- a/applications/postcode_splitter/handler.py +++ b/applications/postcode_splitter/handler.py @@ -12,8 +12,8 @@ from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from infrastructure.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator -from repositories.raw_address.raw_address_list_csv_s3_repository import ( - RawAddressListCsvS3Repository, +from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( + UnsanitisedAddressListCsvS3Repository, ) from utilities.aws_lambda.subtask_handler import subtask_handler @@ -36,12 +36,12 @@ def handler( boto_sqs: Any = boto3_client("sqs") csv_client = CsvS3Client(boto_s3, bucket) - user_address_repo = RawAddressListCsvS3Repository(csv_client, bucket) + unsanitised_address_repo = UnsanitisedAddressListCsvS3Repository(csv_client, bucket) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - user_address_repo=user_address_repo, + unsanitised_address_repo=unsanitised_address_repo, queue_client=queue_client, ) diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index dd7203b1..18135dbd 100644 --- 
a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -2,12 +2,12 @@ from __future__ import annotations from collections.abc import Iterable, Iterator -from domain.addresses.raw_address import AddressList, RawAddress +from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress from domain.postcode import Postcode def iter_postcode_grouped_batches( - addresses: Iterable[RawAddress], + addresses: Iterable[UnsanitisedAddress], *, max_batch_size: int = 500, ) -> Iterator[AddressList]: @@ -43,7 +43,7 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( - addresses: Iterable[RawAddress], + addresses: Iterable[UnsanitisedAddress], ) -> dict[Postcode, AddressList]: groups: dict[Postcode, AddressList] = {} for address in addresses: diff --git a/domain/addresses/raw_address.py b/domain/addresses/unsanitised_address.py similarity index 84% rename from domain/addresses/raw_address.py rename to domain/addresses/unsanitised_address.py index f9a2789e..a33f0d88 100644 --- a/domain/addresses/raw_address.py +++ b/domain/addresses/unsanitised_address.py @@ -11,7 +11,7 @@ def _empty_source_row() -> dict[str, str]: @dataclass(frozen=True) -class RawAddress: +class UnsanitisedAddress: address: str postcode: Postcode org_reference: Optional[str] = None @@ -21,4 +21,4 @@ class RawAddress: # A batch of raw, pre-standardisation addresses as supplied by a landlord. 
-AddressList = NewType("AddressList", list[RawAddress]) +AddressList = NewType("AddressList", list[UnsanitisedAddress]) diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py index 20145524..d8d81c65 100644 --- a/orchestration/postcode_splitter_orchestrator.py +++ b/orchestration/postcode_splitter_orchestrator.py @@ -5,8 +5,8 @@ from uuid import UUID from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from orchestration.task_orchestrator import TaskOrchestrator from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from repositories.raw_address.raw_address_list_repository import ( - RawAddressListRepository, +from repositories.unsanitised_address.unsanitised_address_list_repository import ( + UnsanitisedAddressListRepository, ) @@ -14,12 +14,12 @@ class PostcodeSplitterOrchestrator: def __init__( self, task_orchestrator: TaskOrchestrator, - raw_address_repo: RawAddressListRepository, + unsanitised_address_repo: UnsanitisedAddressListRepository, queue_client: Address2UprnQueueClient, max_batch_size: int = 500, ) -> None: self._task_orchestrator = task_orchestrator - self._raw_address_repo = raw_address_repo + self._unsanitised_address_repo = unsanitised_address_repo self._queue_client = queue_client self._max_batch_size = max_batch_size @@ -30,7 +30,7 @@ class PostcodeSplitterOrchestrator: parent_subtask_id: UUID, input_s3_uri: str, ) -> list[UUID]: - addresses = self._raw_address_repo.load_batch(input_s3_uri) + addresses = self._unsanitised_address_repo.load_batch(input_s3_uri) path_prefix = ( f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}" ) @@ -39,7 +39,7 @@ class PostcodeSplitterOrchestrator: for batch in iter_postcode_grouped_batches( addresses, max_batch_size=self._max_batch_size ): - batch_uri = self._raw_address_repo.save_batch(batch, path_prefix) + batch_uri = self._unsanitised_address_repo.save_batch(batch, path_prefix) child = 
self._task_orchestrator.create_child_subtask( parent_task_id, inputs={ diff --git a/orchestration/sal_orchestrator.py b/orchestration/sal_orchestrator.py index f55947e7..1eb768de 100644 --- a/orchestration/sal_orchestrator.py +++ b/orchestration/sal_orchestrator.py @@ -1,25 +1,25 @@ -from repositories.raw_address.raw_address_list_repository import ( - RawAddressListRepository, +from repositories.unsanitised_address.unsanitised_address_list_repository import ( + UnsanitisedAddressListRepository, ) -from domain.addresses.raw_address import AddressList +from domain.addresses.unsanitised_address import AddressList class SALOrchestrator: - def __init__(self, raw_address_repo: RawAddressListRepository) -> None: - self._raw_address_repo = raw_address_repo + def __init__(self, unsanitised_address_repo: UnsanitisedAddressListRepository) -> None: + self._unsanitised_address_repo = unsanitised_address_repo - def get_raw_addresses( + def get_unsanitised_addresses( self, input_s3_uri: str, ) -> AddressList: - return self._raw_address_repo.load_batch(input_s3_uri) + return self._unsanitised_address_repo.load_batch(input_s3_uri) def get_col_to_description_mappings( - self, list_of_raw_address: AddressList + self, list_of_unsanitised_address: AddressList ) -> dict[str, set[str]]: mappings: dict[str, set[str]] = {} - for raw_address in list_of_raw_address: - for key, value in raw_address.additional_info.items(): + for unsanitised_address in list_of_unsanitised_address: + for key, value in unsanitised_address.additional_info.items(): # Lower-case so case-only typos collapse to one variant. 
mappings.setdefault(key, set()).add(value.lower()) return mappings diff --git a/repositories/raw_address/__init__.py b/repositories/unsanitised_address/__init__.py similarity index 100% rename from repositories/raw_address/__init__.py rename to repositories/unsanitised_address/__init__.py diff --git a/repositories/raw_address/raw_address_list_csv_s3_repository.py b/repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py similarity index 84% rename from repositories/raw_address/raw_address_list_csv_s3_repository.py rename to repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py index a636b17b..6c382df0 100644 --- a/repositories/raw_address/raw_address_list_csv_s3_repository.py +++ b/repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py @@ -4,11 +4,11 @@ import uuid from datetime import datetime, timezone from typing import Optional -from domain.addresses.raw_address import AddressList, RawAddress +from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.raw_address.raw_address_list_repository import ( - RawAddressListRepository, +from repositories.unsanitised_address.unsanitised_address_list_repository import ( + UnsanitisedAddressListRepository, ) _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") @@ -17,7 +17,7 @@ _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference" _POSTCODE_CLEAN_COLUMN: str = "postcode_clean" -class RawAddressListCsvS3Repository(RawAddressListRepository): +class UnsanitisedAddressListCsvS3Repository(UnsanitisedAddressListRepository): def __init__(self, csv_client: CsvS3Client, bucket: str) -> None: self._csv_client = csv_client self._bucket = bucket @@ -36,13 +36,13 @@ class RawAddressListCsvS3Repository(RawAddressListRepository): for col in _ADDRESS_COLUMNS if col in row and row[col].strip() ] - raw_address = 
", ".join(parts) + unsanitised_address = ", ".join(parts) postcode = row.get(_POSTCODE_COLUMN, "") raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() internal_reference: Optional[str] = raw_ref or None addresses.append( - RawAddress( - address=raw_address, + UnsanitisedAddress( + address=unsanitised_address, postcode=Postcode(postcode), org_reference=internal_reference, additional_info=row, diff --git a/repositories/raw_address/raw_address_list_repository.py b/repositories/unsanitised_address/unsanitised_address_list_repository.py similarity index 70% rename from repositories/raw_address/raw_address_list_repository.py rename to repositories/unsanitised_address/unsanitised_address_list_repository.py index 8abb96be..2f842fcd 100644 --- a/repositories/raw_address/raw_address_list_repository.py +++ b/repositories/unsanitised_address/unsanitised_address_list_repository.py @@ -2,10 +2,10 @@ from __future__ import annotations from abc import ABC, abstractmethod -from domain.addresses.raw_address import AddressList +from domain.addresses.unsanitised_address import AddressList -class RawAddressListRepository(ABC): +class UnsanitisedAddressListRepository(ABC): @abstractmethod def load_batch(self, s3_uri: str) -> AddressList: ... 
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index c7bb2d00..443e43df 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -1,14 +1,14 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from domain.addresses.raw_address import AddressList, RawAddress +from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress from domain.postcode import Postcode def _addrs(postcode: str, n: int) -> AddressList: return AddressList( [ - RawAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) + UnsanitisedAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) for i in range(n) ] ) diff --git a/tests/domain/addresses/test_raw_address.py b/tests/domain/addresses/test_unsanitised_address.py similarity index 51% rename from tests/domain/addresses/test_raw_address.py rename to tests/domain/addresses/test_unsanitised_address.py index 0309b45e..aa6d0071 100644 --- a/tests/domain/addresses/test_raw_address.py +++ b/tests/domain/addresses/test_unsanitised_address.py @@ -2,36 +2,36 @@ import dataclasses import pytest -from domain.addresses.raw_address import RawAddress +from domain.addresses.unsanitised_address import UnsanitisedAddress from domain.postcode import Postcode -def test_raw_address_holds_postcode_value_object() -> None: +def test_unsanitised_address_holds_postcode_value_object() -> None: # act - addr = RawAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) # assert assert addr.postcode == Postcode("SW1A1AA") -def test_raw_address_preserves_raw_address_verbatim() -> None: - # The free-text raw_address string is intentionally NOT normalised -- +def test_unsanitised_address_preserves_unsanitised_address_verbatim() -> None: + # The free-text unsanitised_address 
string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. # act - addr = RawAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) + addr = UnsanitisedAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) # assert assert addr.address == " 1 The Street " -def test_raw_address_internal_reference_defaults_to_none() -> None: +def test_unsanitised_address_internal_reference_defaults_to_none() -> None: # act - addr = RawAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.org_reference is None -def test_raw_address_internal_reference_accepted() -> None: +def test_unsanitised_address_internal_reference_accepted() -> None: # act - addr = RawAddress( + addr = UnsanitisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), org_reference="cust-42", @@ -40,36 +40,36 @@ def test_raw_address_internal_reference_accepted() -> None: assert addr.org_reference == "cust-42" -def test_raw_address_is_frozen() -> None: +def test_unsanitised_address_is_frozen() -> None: # arrange - addr = RawAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert with pytest.raises(dataclasses.FrozenInstanceError): addr.postcode = Postcode("OTHER") # type: ignore[misc] -def test_raw_address_equality_uses_canonical_postcode() -> None: +def test_unsanitised_address_equality_uses_canonical_postcode() -> None: # Postcode sanitises eagerly, so addresses built from different surface # forms of the same postcode compare equal. 
# arrange - a = RawAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) - b = RawAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + a = UnsanitisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert assert a == b -def test_raw_address_source_row_defaults_to_empty_dict() -> None: +def test_unsanitised_address_source_row_defaults_to_empty_dict() -> None: # act - addr = RawAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.additional_info == {} -def test_raw_address_carries_source_row() -> None: +def test_unsanitised_address_carries_source_row() -> None: # arrange row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} # act - addr = RawAddress( + addr = UnsanitisedAddress( address="1 The Street", postcode=Postcode("SW1A 1AA"), additional_info=row, @@ -78,16 +78,16 @@ def test_raw_address_carries_source_row() -> None: assert addr.additional_info == row -def test_raw_address_equality_ignores_source_row() -> None: +def test_unsanitised_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays # defined by the parsed fields. 
# arrange - a = RawAddress( + a = UnsanitisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), additional_info={"x": "1"}, ) - b = RawAddress( + b = UnsanitisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), additional_info={"y": "2"}, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 133d5b39..7e2c5167 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -1,16 +1,16 @@ from __future__ import annotations -from domain.addresses.raw_address import AddressList, RawAddress +from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress from domain.postcode import Postcode from orchestration.sal_orchestrator import ( SALOrchestrator, ) -from repositories.raw_address.raw_address_list_repository import ( - RawAddressListRepository, +from repositories.unsanitised_address.unsanitised_address_list_repository import ( + UnsanitisedAddressListRepository, ) -class _StubRawAddressRepository(RawAddressListRepository): +class _StubUnsanitisedAddressRepository(UnsanitisedAddressListRepository): """``get_col_to_description_mappings`` never touches the repo.""" def load_batch(self, s3_uri: str) -> AddressList: @@ -20,8 +20,8 @@ class _StubRawAddressRepository(RawAddressListRepository): raise NotImplementedError() -def _make_raw_address(landlord_additional_info: dict[str, str]) -> RawAddress: - return RawAddress( +def _make_unsanitised_address(landlord_additional_info: dict[str, str]) -> UnsanitisedAddress: + return UnsanitisedAddress( address="1 High St", postcode=Postcode("AA1 1AA"), additional_info=landlord_additional_info, @@ -29,16 +29,16 @@ def _make_raw_address(landlord_additional_info: dict[str, str]) -> RawAddress: def _orchestrator() -> SALOrchestrator: - return 
SALOrchestrator(raw_address_repo=_StubRawAddressRepository()) + return SALOrchestrator(unsanitised_address_repo=_StubUnsanitisedAddressRepository()) def test_collects_every_value_per_shared_key() -> None: # arrange: every address carries the same keys, all values distinct. addresses = AddressList( [ - _make_raw_address({"description": "cosy", "condition": "new"}), - _make_raw_address({"description": "spacious", "condition": "worn"}), - _make_raw_address({"description": "bright", "condition": "fair"}), + _make_unsanitised_address({"description": "cosy", "condition": "new"}), + _make_unsanitised_address({"description": "spacious", "condition": "worn"}), + _make_unsanitised_address({"description": "bright", "condition": "fair"}), ] ) @@ -56,9 +56,9 @@ def test_repeated_values_collapse_to_one_variant() -> None: # arrange: two addresses share the same wall description. addresses = AddressList( [ - _make_raw_address({"description": "cosy"}), - _make_raw_address({"description": "cosy"}), - _make_raw_address({"description": "bright"}), + _make_unsanitised_address({"description": "cosy"}), + _make_unsanitised_address({"description": "cosy"}), + _make_unsanitised_address({"description": "bright"}), ] ) @@ -73,9 +73,9 @@ def test_case_only_variants_collapse_to_one() -> None: # arrange: the same description typed with inconsistent casing. 
addresses = AddressList( [ - _make_raw_address({"description": "Cosy"}), - _make_raw_address({"description": "cosy"}), - _make_raw_address({"description": "COSY"}), + _make_unsanitised_address({"description": "Cosy"}), + _make_unsanitised_address({"description": "cosy"}), + _make_unsanitised_address({"description": "COSY"}), ] ) @@ -96,7 +96,7 @@ def test_empty_address_list_yields_empty_mapping() -> None: def test_single_address_yields_single_value_per_key() -> None: # arrange - addresses = AddressList([_make_raw_address({"description": "cosy"})]) + addresses = AddressList([_make_unsanitised_address({"description": "cosy"})]) # act mappings = _orchestrator().get_col_to_description_mappings(addresses) diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index 1540112f..4317156c 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -18,8 +18,8 @@ from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchest from orchestration.task_orchestrator import TaskOrchestrator from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository from repositories.tasks.task_postgres_repository import TaskPostgresRepository -from repositories.raw_address.raw_address_list_csv_s3_repository import ( - RawAddressListCsvS3Repository, +from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( + UnsanitisedAddressListCsvS3Repository, ) BUCKET = "splitter-bucket" @@ -64,7 +64,7 @@ class Harness: csv_client: CsvS3Client boto_sqs: Any queue_url: str - repo: RawAddressListCsvS3Repository + repo: UnsanitisedAddressListCsvS3Repository @pytest.fixture @@ -78,7 +78,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: queue_url = cast(str, queue["QueueUrl"]) csv_client = CsvS3Client(boto_s3, BUCKET) - repo = RawAddressListCsvS3Repository(csv_client, BUCKET) + 
repo = UnsanitisedAddressListCsvS3Repository(csv_client, BUCKET) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) # DB: ephemeral PostgreSQL TaskOrchestrator @@ -91,7 +91,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - raw_address_repo=repo, + unsanitised_address_repo=repo, queue_client=queue_client, max_batch_size=3, ) diff --git a/tests/repositories/raw_address/__init__.py b/tests/repositories/unsanitised_address/__init__.py similarity index 100% rename from tests/repositories/raw_address/__init__.py rename to tests/repositories/unsanitised_address/__init__.py diff --git a/tests/repositories/raw_address/conftest.py b/tests/repositories/unsanitised_address/conftest.py similarity index 100% rename from tests/repositories/raw_address/conftest.py rename to tests/repositories/unsanitised_address/conftest.py diff --git a/tests/repositories/raw_address/test_raw_address_list_csv_s3_repository.py b/tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py similarity index 86% rename from tests/repositories/raw_address/test_raw_address_list_csv_s3_repository.py rename to tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py index 8870b29a..ff26f08a 100644 --- a/tests/repositories/raw_address/test_raw_address_list_csv_s3_repository.py +++ b/tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py @@ -3,11 +3,11 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from domain.addresses.raw_address import AddressList, RawAddress +from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.raw_address.raw_address_list_csv_s3_repository import ( - RawAddressListCsvS3Repository, +from 
repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( + UnsanitisedAddressListCsvS3Repository, ) from tests.infrastructure import make_boto_client @@ -15,22 +15,22 @@ BUCKET = "user-address-bucket" @pytest.fixture -def repo() -> Iterator[RawAddressListCsvS3Repository]: +def repo() -> Iterator[UnsanitisedAddressListCsvS3Repository]: with mock_aws(): boto_client = make_boto_client("s3") boto_client.create_bucket(Bucket=BUCKET) csv_client = CsvS3Client(boto_client, BUCKET) - yield RawAddressListCsvS3Repository(csv_client, BUCKET) + yield UnsanitisedAddressListCsvS3Repository(csv_client, BUCKET) def _upload_csv( - repo: RawAddressListCsvS3Repository, rows: list[dict[str, str]], key: str + repo: UnsanitisedAddressListCsvS3Repository, rows: list[dict[str, str]], key: str ) -> str: return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage] def test_load_batch_parses_address_postcode_and_reference( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -56,7 +56,7 @@ def test_load_batch_parses_address_postcode_and_reference( def test_load_batch_uses_only_address_1_when_others_missing( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -81,7 +81,7 @@ def test_load_batch_uses_only_address_1_when_others_missing( def test_load_batch_handles_missing_internal_reference( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -106,10 +106,10 @@ def test_load_batch_handles_missing_internal_reference( def test_load_batch_captures_full_source_row( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # A raw EPC-export-shaped row: the splitter must preserve every column, - # not just the ones it parses into RawAddress fields. + # not just the ones it parses into UnsanitisedAddress fields. 
# arrange row = { "Asset Reference": "511", @@ -128,7 +128,7 @@ def test_load_batch_captures_full_source_row( def test_load_batch_raises_when_postcode_column_absent( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # arrange rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}] @@ -140,7 +140,7 @@ def test_load_batch_raises_when_postcode_column_absent( def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # arrange row = { @@ -169,12 +169,12 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( def test_save_batch_returns_uri_under_path_prefix( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # arrange addresses = AddressList( [ - RawAddress( + UnsanitisedAddress( address="1 High Street", postcode=Postcode("SW1A 1AA"), additional_info={ @@ -194,7 +194,7 @@ def test_save_batch_returns_uri_under_path_prefix( def test_save_then_reload_round_trip_preserves_columns( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -227,12 +227,12 @@ def test_save_then_reload_round_trip_preserves_columns( def test_save_batch_uses_unique_filename_per_call( - repo: RawAddressListCsvS3Repository, + repo: UnsanitisedAddressListCsvS3Repository, ) -> None: # arrange addresses = AddressList( [ - RawAddress( + UnsanitisedAddress( address="1 High Street", postcode=Postcode("SW1A 1AA"), additional_info={ From 61efcad27b5ac309fcc1dd87dddee610fa9f1a1e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 10:13:32 +0000 Subject: [PATCH 14/29] standardist Address --- UBIQUITOUS_LANGUAGE.md | 22 ++++++---- applications/SAL/handler.py | 25 ++++++++--- applications/postcode_splitter/handler.py | 8 ++-- domain/addresses/postcode_batching.py | 6 +-- 
domain/addresses/standardised_address_list.py | 21 +++++++++ ...d_address.py => unstandardised_address.py} | 4 +- .../postcode_splitter_orchestrator.py | 12 ++--- orchestration/sal_orchestrator.py | 20 ++++----- .../__init__.py | 0 ...ardised_address_list_csv_s3_repository.py} | 14 +++--- ...unstandardised_address_list_repository.py} | 4 +- .../addresses/test_postcode_batching.py | 4 +- ...ress.py => test_unstandardised_address.py} | 44 +++++++++---------- ...lord_description_overrides_orchestrator.py | 34 +++++++------- .../test_postcode_splitter_orchestrator.py | 10 ++--- .../__init__.py | 0 .../conftest.py | 0 ...ardised_address_list_csv_s3_repository.py} | 36 +++++++-------- 18 files changed, 151 insertions(+), 113 deletions(-) create mode 100644 domain/addresses/standardised_address_list.py rename domain/addresses/{unsanitised_address.py => unstandardised_address.py} (84%) rename repositories/{unsanitised_address => unstandardised_address}/__init__.py (100%) rename repositories/{unsanitised_address/unsanitised_address_list_csv_s3_repository.py => unstandardised_address/unstandardised_address_list_csv_s3_repository.py} (83%) rename repositories/{unsanitised_address/unsanitised_address_list_repository.py => unstandardised_address/unstandardised_address_list_repository.py} (69%) rename tests/domain/addresses/{test_unsanitised_address.py => test_unstandardised_address.py} (52%) rename tests/repositories/{unsanitised_address => unstandardised_address}/__init__.py (100%) rename tests/repositories/{unsanitised_address => unstandardised_address}/conftest.py (100%) rename tests/repositories/{unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py => unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py} (85%) diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md index c3074c02..d2fde99a 100644 --- a/UBIQUITOUS_LANGUAGE.md +++ b/UBIQUITOUS_LANGUAGE.md @@ -23,16 +23,18 @@ Invoke `/ubiquitous-language` in any session to extract 
new terms from the conve |------|------------|------------------| | **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" | | **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" | -| **User Address** | A structured dataclass (`domain.addresses.user_address.UserAddress`) capturing a customer-supplied address: a free-text `user_address` line, a canonical `postcode` (sanitised on construction), and an optional `internal_reference`. The bare string sense -- the raw free-text address line as it arrives from upstream ingestion, before being wrapped -- remains valid when discussing CSV columns, API payloads, or other upstream contexts; in domain code, prefer the dataclass. | "user input", "raw address", "user_inputed_address" | +| **Unstandardised Address** | A frozen dataclass (`domain.addresses.unstandardised_address.UnstandardisedAddress`) capturing a single address exactly as a customer supplied it, before any standardisation: a free-text `address` line (intentionally NOT normalised), a canonical `postcode` (a `Postcode` value object, sanitised on construction), an optional `org_reference` (the customer's own identifier for the property), and `additional_info` (the full source row — every column of the customer's upload, preserved verbatim). | "user address", "asset list", "raw address", "landlord address", "Hyde address" | +| **Address List** | A nominal `NewType` over `list[UnstandardisedAddress]` (`domain.addresses.unstandardised_address.AddressList`) — a batch of unstandardised addresses, such as one customer's bulk-onboarding upload or a postcode-grouped sub-batch produced for downstream processing. Being nominal, it is constructed explicitly: `AddressList([...])`. It is the raw *input* to ingestion; the standardised *output* is a **Standardised Asset List**. 
| "asset list", "Hyde address list", "user addresses" | +| **Standardised Asset List (SAL)** | A customer's property portfolio after ingestion has cleaned and standardised it — each property carrying a canonical field set (UPRN, standardised address, postcode, property type, built form, …). It is the standardised *output* of the pipeline whose raw *input* is an **Address List** of **Unstandardised Addresses**; generated by the `SALOrchestrator`. (Legacy implementation: `asset_list.AssetList` via `load_standardised_asset_list`.) | "address list" (that is the raw input), "asset register", "portfolio list" | | **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" | ## Address Matching | Term | Definition | Aliases to avoid | |------|------------|------------------| -| **Lexiscore** | A similarity score in [0, 1] between a user address and a candidate EPC address; combines token overlap and character-level similarity. | "score", "match score", "similarity" | +| **Lexiscore** | A similarity score in [0, 1] between an unstandardised address and a candidate EPC address; combines token overlap and character-level similarity. | "score", "match score", "similarity" | | **Lexirank** | Dense rank of candidates sorted by lexiscore descending; rank 1 = best match. | "rank", "position" | -| **UPRN Candidate** | An EPC search result that is a plausible match for a given user address, before scoring decides the winner. | "match candidate", "result" | +| **UPRN Candidate** | An EPC search result that is a plausible match for a given unstandardised address, before scoring decides the winner. | "match candidate", "result" | | **Score Threshold** | The minimum lexiscore (currently 0.6) below which no match is returned even if a candidate exists. | "minimum score", "cutoff" | | **Ambiguous Match** | A matching outcome where two or more candidates share lexirank 1, making it impossible to select a unique winner. 
| "tie", "draw", "duplicate" | | **Best Match** | The single UPRN candidate with lexirank 1 that meets or exceeds the score threshold. | "winner", "top result" | @@ -53,14 +55,16 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve - A **Dwelling** may have multiple **EPCs** across time; the one with the most recent **Registration Date** is the current one. - A **UPRN** identifies a **Dwelling** permanently; it does not change when the property changes owner. - An **EPC Search Result** is a summary; it points to a full **EPC** via its **Certificate Number**. -- **Address Matching** uses a **User Address** and **Postcode** to find a **UPRN** by scoring **UPRN Candidates** from an EPC search. +- An **Address List** is an ordered batch of **Unstandardised Addresses**; a customer's bulk-onboarding upload arrives as one. +- Ingestion turns an **Address List** (raw input) into a **Standardised Asset List** (standardised output) — the **SAL Orchestrator** drives this. +- **Address Matching** uses an **Unstandardised Address** and **Postcode** to find a **UPRN** by scoring **UPRN Candidates** from an EPC search. - A **Lexirank** of 1 with no **Ambiguous Match** and a **Lexiscore** ≥ the **Score Threshold** produces a **Best Match**. ## Example dialogue -> **Dev:** "We have a user address and postcode. How do we find the UPRN?" +> **Dev:** "We have an unstandardised address and postcode. How do we find the UPRN?" -> **Domain expert:** "Search the **New EPC API** by **Postcode** — you get back a list of **EPC Search Results** for that area. Each one has an address and a **UPRN**. Score each against the **User Address** using the **Lexiscore**. If the top **UPRN Candidate** scores above the **Score Threshold** and there's no **Ambiguous Match**, that's your **Best Match**." +> **Domain expert:** "Search the **New EPC API** by **Postcode** — you get back a list of **EPC Search Results** for that area. Each one has an address and a **UPRN**. 
Score each against the **Unstandardised Address** using the **Lexiscore**. If the top **UPRN Candidate** scores above the **Score Threshold** and there's no **Ambiguous Match**, that's your **Best Match**." > **Dev:** "What if two results share the same address line 1?" @@ -72,7 +76,9 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve ## Flagged ambiguities -- **"address"** appears as both the raw **User Address** (free-text from customer data, or the structured `UserAddress` dataclass that wraps it) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". Within `domain/`, **User Address** specifically means the `UserAddress` dataclass; in upstream ingestion contexts (CSV columns, SQS payloads) it can still mean the raw string sense. +- **"address"** appears in several senses: the **Unstandardised Address** dataclass (one customer-supplied address before standardisation), its free-text `address` field, and the normalised address lines on an **EPC Search Result**. Always qualify: "unstandardised address" vs "EPC address" or "address line 1". Within `domain/addresses/`, the dataclass is **Unstandardised Address**; in upstream ingestion contexts (CSV columns, SQS payloads) "address" may still mean the bare free-text string. - **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments. -- **"user_inputed_address"** in `backend/address2UPRN/main.py` is a misspelling and a synonym for **User Address** — the canonical term. New code should use `user_address`. +- **"user_inputed_address"** (and `user_address`) in `backend/address2UPRN/` is legacy naming — a misspelled synonym for what is now the **Unstandardised Address**. 
That address-matching code has not been renamed; new code should use **Unstandardised Address**. +- **"Hyde address list"** — "Hyde" is the name of one customer, not a domain concept. A domain expert may say "the Hyde address list" because Hyde is the customer in front of them, but the generalised term is **Address List** (and **Unstandardised Address** for a single item). A customer's identity is data — it belongs in `org_reference` or `additional_info`, never in a type or module name. +- **"address list"** vs **"asset list"** — opposite ends of the ingestion pipeline; do not conflate them. An **Address List** is the raw *input* (unstandardised addresses as the customer supplied them); a **Standardised Asset List** is the standardised *output*. The historical `AssetList` dataclass (now **Unstandardised Address**) misnamed the input an "asset list" — that mistake is what the rename corrected. - **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter. 
diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py index fbed3b83..6076a662 100644 --- a/applications/SAL/handler.py +++ b/applications/SAL/handler.py @@ -4,10 +4,10 @@ from orchestration.sal_orchestrator import ( SALOrchestrator, ) from infrastructure.csv_s3_client import CsvS3Client -from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( - UnsanitisedAddressListCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) -from domain.addresses.unsanitised_address import AddressList +from domain.addresses.unstandardised_address import AddressList def handler( @@ -24,16 +24,16 @@ def handler( boto_s3: Any = boto3_client("s3") csv_client = CsvS3Client(boto_s3, bucket) - unsanitised_address_repo = UnsanitisedAddressListCsvS3Repository(csv_client, bucket) + unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(csv_client, bucket) sal = SALOrchestrator( - unsanitised_address_repo=unsanitised_address_repo, + unstandardised_address_repo=unstandardised_address_repo, ) - addressList: AddressList = sal.get_unsanitised_addresses(input_s3_uri=s3_uri) + addressList: AddressList = sal.get_unstandardised_addresses(input_s3_uri=s3_uri) col_to_desc_map = sal.get_col_to_description_mappings( - list_of_unsanitised_address=addressList + list_of_unstandardised_address=addressList ) # Read csv of user input @@ -41,4 +41,15 @@ def handler( # { walls: "wall variation 1", "wall varition 2"} # Call chatgpt(input from landlord, our way of understanding the mapping) Retrun -> lanlordMapped + + # ENUM Walls: + # cavity_wall_1976: 1 + + # 1) Could download site notes from pashub and get + # 2) Open Data Communities API -> + # 3) new api + + # User story: + # cavity: asbuilt (1976 - 1982): + return {"hello world": ["hello world"]} diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py 
index 6614ecda..ac2c4e99 100644 --- a/applications/postcode_splitter/handler.py +++ b/applications/postcode_splitter/handler.py @@ -12,8 +12,8 @@ from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from infrastructure.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator -from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( - UnsanitisedAddressListCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) from utilities.aws_lambda.subtask_handler import subtask_handler @@ -36,12 +36,12 @@ def handler( boto_sqs: Any = boto3_client("sqs") csv_client = CsvS3Client(boto_s3, bucket) - unsanitised_address_repo = UnsanitisedAddressListCsvS3Repository(csv_client, bucket) + unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(csv_client, bucket) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - unsanitised_address_repo=unsanitised_address_repo, + unstandardised_address_repo=unstandardised_address_repo, queue_client=queue_client, ) diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index 18135dbd..ca4cd752 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -2,12 +2,12 @@ from __future__ import annotations from collections.abc import Iterable, Iterator -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode def iter_postcode_grouped_batches( - addresses: Iterable[UnsanitisedAddress], + addresses: Iterable[UnstandardisedAddress], *, max_batch_size: int = 
500, ) -> Iterator[AddressList]: @@ -43,7 +43,7 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( - addresses: Iterable[UnsanitisedAddress], + addresses: Iterable[UnstandardisedAddress], ) -> dict[Postcode, AddressList]: groups: dict[Postcode, AddressList] = {} for address in addresses: diff --git a/domain/addresses/standardised_address_list.py b/domain/addresses/standardised_address_list.py new file mode 100644 index 00000000..8e3f4fc7 --- /dev/null +++ b/domain/addresses/standardised_address_list.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import NewType, Optional + +from domain.postcode import Postcode + + +def _empty_source_row() -> dict[str, str]: + return {} + + +@dataclass(frozen=True) +class StandardisedAddress: + address: str + postcode: Postcode + org_reference: Optional[str] = None + + +# Standardised Asset List -- the cleaned output counterpart to AddressList. +SAL = NewType("SAL", list[StandardisedAddress]) diff --git a/domain/addresses/unsanitised_address.py b/domain/addresses/unstandardised_address.py similarity index 84% rename from domain/addresses/unsanitised_address.py rename to domain/addresses/unstandardised_address.py index a33f0d88..8917bdf4 100644 --- a/domain/addresses/unsanitised_address.py +++ b/domain/addresses/unstandardised_address.py @@ -11,7 +11,7 @@ def _empty_source_row() -> dict[str, str]: @dataclass(frozen=True) -class UnsanitisedAddress: +class UnstandardisedAddress: address: str postcode: Postcode org_reference: Optional[str] = None @@ -21,4 +21,4 @@ class UnsanitisedAddress: # A batch of raw, pre-standardisation addresses as supplied by a landlord. 
-AddressList = NewType("AddressList", list[UnsanitisedAddress]) +AddressList = NewType("AddressList", list[UnstandardisedAddress]) diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py index d8d81c65..1a7277d5 100644 --- a/orchestration/postcode_splitter_orchestrator.py +++ b/orchestration/postcode_splitter_orchestrator.py @@ -5,8 +5,8 @@ from uuid import UUID from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from orchestration.task_orchestrator import TaskOrchestrator from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from repositories.unsanitised_address.unsanitised_address_list_repository import ( - UnsanitisedAddressListRepository, +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, ) @@ -14,12 +14,12 @@ class PostcodeSplitterOrchestrator: def __init__( self, task_orchestrator: TaskOrchestrator, - unsanitised_address_repo: UnsanitisedAddressListRepository, + unstandardised_address_repo: UnstandardisedAddressListRepository, queue_client: Address2UprnQueueClient, max_batch_size: int = 500, ) -> None: self._task_orchestrator = task_orchestrator - self._unsanitised_address_repo = unsanitised_address_repo + self._unstandardised_address_repo = unstandardised_address_repo self._queue_client = queue_client self._max_batch_size = max_batch_size @@ -30,7 +30,7 @@ class PostcodeSplitterOrchestrator: parent_subtask_id: UUID, input_s3_uri: str, ) -> list[UUID]: - addresses = self._unsanitised_address_repo.load_batch(input_s3_uri) + addresses = self._unstandardised_address_repo.load_batch(input_s3_uri) path_prefix = ( f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}" ) @@ -39,7 +39,7 @@ class PostcodeSplitterOrchestrator: for batch in iter_postcode_grouped_batches( addresses, max_batch_size=self._max_batch_size ): - batch_uri = 
self._unsanitised_address_repo.save_batch(batch, path_prefix) + batch_uri = self._unstandardised_address_repo.save_batch(batch, path_prefix) child = self._task_orchestrator.create_child_subtask( parent_task_id, inputs={ diff --git a/orchestration/sal_orchestrator.py b/orchestration/sal_orchestrator.py index 1eb768de..8ad21388 100644 --- a/orchestration/sal_orchestrator.py +++ b/orchestration/sal_orchestrator.py @@ -1,25 +1,25 @@ -from repositories.unsanitised_address.unsanitised_address_list_repository import ( - UnsanitisedAddressListRepository, +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, ) -from domain.addresses.unsanitised_address import AddressList +from domain.addresses.unstandardised_address import AddressList class SALOrchestrator: - def __init__(self, unsanitised_address_repo: UnsanitisedAddressListRepository) -> None: - self._unsanitised_address_repo = unsanitised_address_repo + def __init__(self, unstandardised_address_repo: UnstandardisedAddressListRepository) -> None: + self._unstandardised_address_repo = unstandardised_address_repo - def get_unsanitised_addresses( + def get_unstandardised_addresses( self, input_s3_uri: str, ) -> AddressList: - return self._unsanitised_address_repo.load_batch(input_s3_uri) + return self._unstandardised_address_repo.load_batch(input_s3_uri) def get_col_to_description_mappings( - self, list_of_unsanitised_address: AddressList + self, list_of_unstandardised_address: AddressList ) -> dict[str, set[str]]: mappings: dict[str, set[str]] = {} - for unsanitised_address in list_of_unsanitised_address: - for key, value in unsanitised_address.additional_info.items(): + for unstandardised_address in list_of_unstandardised_address: + for key, value in unstandardised_address.additional_info.items(): # Lower-case so case-only typos collapse to one variant. 
mappings.setdefault(key, set()).add(value.lower()) return mappings diff --git a/repositories/unsanitised_address/__init__.py b/repositories/unstandardised_address/__init__.py similarity index 100% rename from repositories/unsanitised_address/__init__.py rename to repositories/unstandardised_address/__init__.py diff --git a/repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py b/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py similarity index 83% rename from repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py rename to repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py index 6c382df0..260fce1d 100644 --- a/repositories/unsanitised_address/unsanitised_address_list_csv_s3_repository.py +++ b/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py @@ -4,11 +4,11 @@ import uuid from datetime import datetime, timezone from typing import Optional -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.unsanitised_address.unsanitised_address_list_repository import ( - UnsanitisedAddressListRepository, +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, ) _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") @@ -17,7 +17,7 @@ _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference" _POSTCODE_CLEAN_COLUMN: str = "postcode_clean" -class UnsanitisedAddressListCsvS3Repository(UnsanitisedAddressListRepository): +class UnstandardisedAddressListCsvS3Repository(UnstandardisedAddressListRepository): def __init__(self, csv_client: CsvS3Client, bucket: str) -> None: self._csv_client = csv_client self._bucket = bucket 
@@ -36,13 +36,13 @@ class UnsanitisedAddressListCsvS3Repository(UnsanitisedAddressListRepository): for col in _ADDRESS_COLUMNS if col in row and row[col].strip() ] - unsanitised_address = ", ".join(parts) + unstandardised_address = ", ".join(parts) postcode = row.get(_POSTCODE_COLUMN, "") raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() internal_reference: Optional[str] = raw_ref or None addresses.append( - UnsanitisedAddress( - address=unsanitised_address, + UnstandardisedAddress( + address=unstandardised_address, postcode=Postcode(postcode), org_reference=internal_reference, additional_info=row, diff --git a/repositories/unsanitised_address/unsanitised_address_list_repository.py b/repositories/unstandardised_address/unstandardised_address_list_repository.py similarity index 69% rename from repositories/unsanitised_address/unsanitised_address_list_repository.py rename to repositories/unstandardised_address/unstandardised_address_list_repository.py index 2f842fcd..4d446304 100644 --- a/repositories/unsanitised_address/unsanitised_address_list_repository.py +++ b/repositories/unstandardised_address/unstandardised_address_list_repository.py @@ -2,10 +2,10 @@ from __future__ import annotations from abc import ABC, abstractmethod -from domain.addresses.unsanitised_address import AddressList +from domain.addresses.unstandardised_address import AddressList -class UnsanitisedAddressListRepository(ABC): +class UnstandardisedAddressListRepository(ABC): @abstractmethod def load_batch(self, s3_uri: str) -> AddressList: ... 
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 443e43df..e5b3e186 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -1,14 +1,14 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode def _addrs(postcode: str, n: int) -> AddressList: return AddressList( [ - UnsanitisedAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) + UnstandardisedAddress(address=f"{i} {postcode} Street", postcode=Postcode(postcode)) for i in range(n) ] ) diff --git a/tests/domain/addresses/test_unsanitised_address.py b/tests/domain/addresses/test_unstandardised_address.py similarity index 52% rename from tests/domain/addresses/test_unsanitised_address.py rename to tests/domain/addresses/test_unstandardised_address.py index aa6d0071..dd4eabdb 100644 --- a/tests/domain/addresses/test_unsanitised_address.py +++ b/tests/domain/addresses/test_unstandardised_address.py @@ -2,36 +2,36 @@ import dataclasses import pytest -from domain.addresses.unsanitised_address import UnsanitisedAddress +from domain.addresses.unstandardised_address import UnstandardisedAddress from domain.postcode import Postcode -def test_unsanitised_address_holds_postcode_value_object() -> None: +def test_unstandardised_address_holds_postcode_value_object() -> None: # act - addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) # assert assert addr.postcode == Postcode("SW1A1AA") -def test_unsanitised_address_preserves_unsanitised_address_verbatim() -> None: - # The free-text unsanitised_address string is intentionally NOT 
normalised -- +def test_unstandardised_address_preserves_unstandardised_address_verbatim() -> None: + # The free-text unstandardised_address string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. # act - addr = UnsanitisedAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) + addr = UnstandardisedAddress(address=" 1 The Street ", postcode=Postcode("SW1A1AA")) # assert assert addr.address == " 1 The Street " -def test_unsanitised_address_internal_reference_defaults_to_none() -> None: +def test_unstandardised_address_internal_reference_defaults_to_none() -> None: # act - addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.org_reference is None -def test_unsanitised_address_internal_reference_accepted() -> None: +def test_unstandardised_address_internal_reference_accepted() -> None: # act - addr = UnsanitisedAddress( + addr = UnstandardisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), org_reference="cust-42", @@ -40,36 +40,36 @@ def test_unsanitised_address_internal_reference_accepted() -> None: assert addr.org_reference == "cust-42" -def test_unsanitised_address_is_frozen() -> None: +def test_unstandardised_address_is_frozen() -> None: # arrange - addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert with pytest.raises(dataclasses.FrozenInstanceError): addr.postcode = Postcode("OTHER") # type: ignore[misc] -def test_unsanitised_address_equality_uses_canonical_postcode() -> None: +def test_unstandardised_address_equality_uses_canonical_postcode() -> None: # Postcode sanitises eagerly, so addresses built from different surface # forms of the same postcode compare equal. 
# arrange - a = UnsanitisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) - b = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + a = UnstandardisedAddress(address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # act / assert assert a == b -def test_unsanitised_address_source_row_defaults_to_empty_dict() -> None: +def test_unstandardised_address_source_row_defaults_to_empty_dict() -> None: # act - addr = UnsanitisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) + addr = UnstandardisedAddress(address="1 The Street", postcode=Postcode("SW1A1AA")) # assert assert addr.additional_info == {} -def test_unsanitised_address_carries_source_row() -> None: +def test_unstandardised_address_carries_source_row() -> None: # arrange row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} # act - addr = UnsanitisedAddress( + addr = UnstandardisedAddress( address="1 The Street", postcode=Postcode("SW1A 1AA"), additional_info=row, @@ -78,16 +78,16 @@ def test_unsanitised_address_carries_source_row() -> None: assert addr.additional_info == row -def test_unsanitised_address_equality_ignores_source_row() -> None: +def test_unstandardised_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays # defined by the parsed fields. 
# arrange - a = UnsanitisedAddress( + a = UnstandardisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), additional_info={"x": "1"}, ) - b = UnsanitisedAddress( + b = UnstandardisedAddress( address="1 The Street", postcode=Postcode("SW1A1AA"), additional_info={"y": "2"}, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 7e2c5167..b3658014 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -1,16 +1,16 @@ from __future__ import annotations -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode from orchestration.sal_orchestrator import ( SALOrchestrator, ) -from repositories.unsanitised_address.unsanitised_address_list_repository import ( - UnsanitisedAddressListRepository, +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, ) -class _StubUnsanitisedAddressRepository(UnsanitisedAddressListRepository): +class _StubUnstandardisedAddressRepository(UnstandardisedAddressListRepository): """``get_col_to_description_mappings`` never touches the repo.""" def load_batch(self, s3_uri: str) -> AddressList: @@ -20,8 +20,8 @@ class _StubUnsanitisedAddressRepository(UnsanitisedAddressListRepository): raise NotImplementedError() -def _make_unsanitised_address(landlord_additional_info: dict[str, str]) -> UnsanitisedAddress: - return UnsanitisedAddress( +def _make_unstandardised_address(landlord_additional_info: dict[str, str]) -> UnstandardisedAddress: + return UnstandardisedAddress( address="1 High St", postcode=Postcode("AA1 1AA"), additional_info=landlord_additional_info, @@ -29,16 +29,16 @@ def 
_make_unsanitised_address(landlord_additional_info: dict[str, str]) -> Unsan def _orchestrator() -> SALOrchestrator: - return SALOrchestrator(unsanitised_address_repo=_StubUnsanitisedAddressRepository()) + return SALOrchestrator(unstandardised_address_repo=_StubUnstandardisedAddressRepository()) def test_collects_every_value_per_shared_key() -> None: # arrange: every address carries the same keys, all values distinct. addresses = AddressList( [ - _make_unsanitised_address({"description": "cosy", "condition": "new"}), - _make_unsanitised_address({"description": "spacious", "condition": "worn"}), - _make_unsanitised_address({"description": "bright", "condition": "fair"}), + _make_unstandardised_address({"description": "cosy", "condition": "new"}), + _make_unstandardised_address({"description": "spacious", "condition": "worn"}), + _make_unstandardised_address({"description": "bright", "condition": "fair"}), ] ) @@ -56,9 +56,9 @@ def test_repeated_values_collapse_to_one_variant() -> None: # arrange: two addresses share the same wall description. addresses = AddressList( [ - _make_unsanitised_address({"description": "cosy"}), - _make_unsanitised_address({"description": "cosy"}), - _make_unsanitised_address({"description": "bright"}), + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "bright"}), ] ) @@ -73,9 +73,9 @@ def test_case_only_variants_collapse_to_one() -> None: # arrange: the same description typed with inconsistent casing. 
addresses = AddressList( [ - _make_unsanitised_address({"description": "Cosy"}), - _make_unsanitised_address({"description": "cosy"}), - _make_unsanitised_address({"description": "COSY"}), + _make_unstandardised_address({"description": "Cosy"}), + _make_unstandardised_address({"description": "cosy"}), + _make_unstandardised_address({"description": "COSY"}), ] ) @@ -96,7 +96,7 @@ def test_empty_address_list_yields_empty_mapping() -> None: def test_single_address_yields_single_value_per_key() -> None: # arrange - addresses = AddressList([_make_unsanitised_address({"description": "cosy"})]) + addresses = AddressList([_make_unstandardised_address({"description": "cosy"})]) # act mappings = _orchestrator().get_col_to_description_mappings(addresses) diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index 4317156c..d21bcfba 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -18,8 +18,8 @@ from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchest from orchestration.task_orchestrator import TaskOrchestrator from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository from repositories.tasks.task_postgres_repository import TaskPostgresRepository -from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( - UnsanitisedAddressListCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) BUCKET = "splitter-bucket" @@ -64,7 +64,7 @@ class Harness: csv_client: CsvS3Client boto_sqs: Any queue_url: str - repo: UnsanitisedAddressListCsvS3Repository + repo: UnstandardisedAddressListCsvS3Repository @pytest.fixture @@ -78,7 +78,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: queue_url = cast(str, queue["QueueUrl"]) csv_client = 
CsvS3Client(boto_s3, BUCKET) - repo = UnsanitisedAddressListCsvS3Repository(csv_client, BUCKET) + repo = UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) # DB: ephemeral PostgreSQL TaskOrchestrator @@ -91,7 +91,7 @@ def harness(db_engine: Engine) -> Iterator[Harness]: splitter = PostcodeSplitterOrchestrator( task_orchestrator=task_orchestrator, - unsanitised_address_repo=repo, + unstandardised_address_repo=repo, queue_client=queue_client, max_batch_size=3, ) diff --git a/tests/repositories/unsanitised_address/__init__.py b/tests/repositories/unstandardised_address/__init__.py similarity index 100% rename from tests/repositories/unsanitised_address/__init__.py rename to tests/repositories/unstandardised_address/__init__.py diff --git a/tests/repositories/unsanitised_address/conftest.py b/tests/repositories/unstandardised_address/conftest.py similarity index 100% rename from tests/repositories/unsanitised_address/conftest.py rename to tests/repositories/unstandardised_address/conftest.py diff --git a/tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py b/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py similarity index 85% rename from tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py rename to tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py index ff26f08a..866d6f2d 100644 --- a/tests/repositories/unsanitised_address/test_unsanitised_address_list_csv_s3_repository.py +++ b/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py @@ -3,11 +3,11 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from domain.addresses.unsanitised_address import AddressList, UnsanitisedAddress +from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress 
from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client -from repositories.unsanitised_address.unsanitised_address_list_csv_s3_repository import ( - UnsanitisedAddressListCsvS3Repository, +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, ) from tests.infrastructure import make_boto_client @@ -15,22 +15,22 @@ BUCKET = "user-address-bucket" @pytest.fixture -def repo() -> Iterator[UnsanitisedAddressListCsvS3Repository]: +def repo() -> Iterator[UnstandardisedAddressListCsvS3Repository]: with mock_aws(): boto_client = make_boto_client("s3") boto_client.create_bucket(Bucket=BUCKET) csv_client = CsvS3Client(boto_client, BUCKET) - yield UnsanitisedAddressListCsvS3Repository(csv_client, BUCKET) + yield UnstandardisedAddressListCsvS3Repository(csv_client, BUCKET) def _upload_csv( - repo: UnsanitisedAddressListCsvS3Repository, rows: list[dict[str, str]], key: str + repo: UnstandardisedAddressListCsvS3Repository, rows: list[dict[str, str]], key: str ) -> str: return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage] def test_load_batch_parses_address_postcode_and_reference( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -56,7 +56,7 @@ def test_load_batch_parses_address_postcode_and_reference( def test_load_batch_uses_only_address_1_when_others_missing( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -81,7 +81,7 @@ def test_load_batch_uses_only_address_1_when_others_missing( def test_load_batch_handles_missing_internal_reference( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -106,10 +106,10 @@ def test_load_batch_handles_missing_internal_reference( def 
test_load_batch_captures_full_source_row( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # A raw EPC-export-shaped row: the splitter must preserve every column, - # not just the ones it parses into UnsanitisedAddress fields. + # not just the ones it parses into UnstandardisedAddress fields. # arrange row = { "Asset Reference": "511", @@ -128,7 +128,7 @@ def test_load_batch_captures_full_source_row( def test_load_batch_raises_when_postcode_column_absent( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}] @@ -140,7 +140,7 @@ def test_load_batch_raises_when_postcode_column_absent( def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange row = { @@ -169,12 +169,12 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( def test_save_batch_returns_uri_under_path_prefix( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange addresses = AddressList( [ - UnsanitisedAddress( + UnstandardisedAddress( address="1 High Street", postcode=Postcode("SW1A 1AA"), additional_info={ @@ -194,7 +194,7 @@ def test_save_batch_returns_uri_under_path_prefix( def test_save_then_reload_round_trip_preserves_columns( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange rows = [ @@ -227,12 +227,12 @@ def test_save_then_reload_round_trip_preserves_columns( def test_save_batch_uses_unique_filename_per_call( - repo: UnsanitisedAddressListCsvS3Repository, + repo: UnstandardisedAddressListCsvS3Repository, ) -> None: # arrange addresses = AddressList( [ - UnsanitisedAddress( + UnstandardisedAddress( address="1 High Street", 
postcode=Postcode("SW1A 1AA"), additional_info={ From 675aa089c937c51aa6c6b59df52aa19814e9a3de Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 14:00:33 +0000 Subject: [PATCH 15/29] updated rdsap option; seperated s3 location in infrastrucutre; added open ai api --- applications/SAL/handler.py | 37 +-- applications/postcode_splitter/handler.py | 2 +- datatypes/epc/domain/epc_property_data.py | 22 +- datatypes/epc/schema/rdsap_schema_17_0.py | 2 +- datatypes/epc/schema/rdsap_schema_17_1.py | 2 +- datatypes/epc/schema/rdsap_schema_18_0.py | 3 +- datatypes/epc/schema/rdsap_schema_19_0.py | 2 +- datatypes/epc/schema/rdsap_schema_20_0_0.py | 3 +- datatypes/epc/schema/rdsap_schema_21_0_0.py | 4 +- datatypes/epc/schema/rdsap_schema_21_0_1.py | 4 +- domain/epc/__init__.py | 4 + domain/epc/epc_record.py | 21 ++ domain/epc/property_type.py | 9 + infrastructure/epc/__init__.py | 13 ++ infrastructure/epc/epc_client.py | 41 ++++ infrastructure/epc/exceptions.py | 17 ++ infrastructure/epc/gov_uk/__init__.py | 6 + infrastructure/epc/gov_uk/_retry.py | 34 +++ .../epc/gov_uk/gov_uk_epc_client.py | 132 +++++++++++ .../epc/gov_uk/gov_uk_property_type.py | 25 +++ .../__init__.py | 5 + ...orical_open_data_communities_epc_client.py | 24 ++ infrastructure/openai/__init__.py | 0 infrastructure/openai/exceptions.py | 2 + infrastructure/openai/openai_client.py | 60 +++++ infrastructure/s3/__init__.py | 0 infrastructure/{ => s3}/csv_s3_client.py | 4 +- infrastructure/{ => s3}/s3_client.py | 0 infrastructure/{ => s3}/s3_uri.py | 0 ...dardised_address_list_csv_s3_repository.py | 2 +- tests/infrastructure/epc/__init__.py | 0 tests/infrastructure/epc/gov_uk/__init__.py | 0 tests/infrastructure/epc/gov_uk/conftest.py | 49 ++++ .../epc/gov_uk/test_gov_uk_epc_client.py | 211 ++++++++++++++++++ tests/infrastructure/test_csv_s3_client.py | 2 +- tests/infrastructure/test_s3_client.py | 2 +- tests/infrastructure/test_s3_uri.py | 2 +- .../test_postcode_splitter_orchestrator.py | 2 +- 
...dardised_address_list_csv_s3_repository.py | 2 +- 39 files changed, 709 insertions(+), 41 deletions(-) create mode 100644 domain/epc/__init__.py create mode 100644 domain/epc/epc_record.py create mode 100644 domain/epc/property_type.py create mode 100644 infrastructure/epc/__init__.py create mode 100644 infrastructure/epc/epc_client.py create mode 100644 infrastructure/epc/exceptions.py create mode 100644 infrastructure/epc/gov_uk/__init__.py create mode 100644 infrastructure/epc/gov_uk/_retry.py create mode 100644 infrastructure/epc/gov_uk/gov_uk_epc_client.py create mode 100644 infrastructure/epc/gov_uk/gov_uk_property_type.py create mode 100644 infrastructure/epc/historical_open_data_communities/__init__.py create mode 100644 infrastructure/epc/historical_open_data_communities/historical_open_data_communities_epc_client.py create mode 100644 infrastructure/openai/__init__.py create mode 100644 infrastructure/openai/exceptions.py create mode 100644 infrastructure/openai/openai_client.py create mode 100644 infrastructure/s3/__init__.py rename infrastructure/{ => s3}/csv_s3_client.py (95%) rename infrastructure/{ => s3}/s3_client.py (100%) rename infrastructure/{ => s3}/s3_uri.py (100%) create mode 100644 tests/infrastructure/epc/__init__.py create mode 100644 tests/infrastructure/epc/gov_uk/__init__.py create mode 100644 tests/infrastructure/epc/gov_uk/conftest.py create mode 100644 tests/infrastructure/epc/gov_uk/test_gov_uk_epc_client.py diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py index 6076a662..f354171c 100644 --- a/applications/SAL/handler.py +++ b/applications/SAL/handler.py @@ -3,12 +3,14 @@ import boto3 from orchestration.sal_orchestrator import ( SALOrchestrator, ) -from infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( UnstandardisedAddressListCsvS3Repository, ) from 
domain.addresses.unstandardised_address import AddressList +from infrastructure.epc.gov_uk import GovUkEpcClient + def handler( body: dict[str, Any], @@ -24,7 +26,9 @@ def handler( boto_s3: Any = boto3_client("s3") csv_client = CsvS3Client(boto_s3, bucket) - unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository(csv_client, bucket) + unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository( + csv_client, bucket + ) sal = SALOrchestrator( unstandardised_address_repo=unstandardised_address_repo, @@ -36,20 +40,17 @@ def handler( list_of_unstandardised_address=addressList ) - # Read csv of user input - # get the column and unique variations of each description - # { walls: "wall variation 1", "wall varition 2"} - # Call chatgpt(input from landlord, our way of understanding the mapping) Retrun -> lanlordMapped + """ + ---- + # TODO Property Type: + # 1) Make a small enum with all property types (5 enum) + # 2) Make an interface with ChatGPTAi to get wall field description and map it to enum + # 3) Stroe in landlord overrides + # TODO Wall Type: + # 1) Make a small enum with all property types (5 enum) + # 2) Make an interface with ChatGPTAi to get wall field description and map it to enum + # 3) Stroe in landlord overrides + --- + """ - - ENUM Walls: - cavity_wall_1976: 1 - - # 1) COuld download site notes from pashub and get - # 2) Open Data communites API -> - # 3) new api - - # User story: - # cavity: asbuilt (1976 - 1982): - - return {"hello world": ["hello world"]} + return {"hello": ["200"]} diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py index ac2c4e99..e34a6af3 100644 --- a/applications/postcode_splitter/handler.py +++ b/applications/postcode_splitter/handler.py @@ -9,7 +9,7 @@ from applications.postcode_splitter.postcode_splitter_trigger_body import ( PostcodeSplitterTriggerBody, ) from infrastructure.address2uprn_queue_client import Address2UprnQueueClient -from 
infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( diff --git a/datatypes/epc/domain/epc_property_data.py b/datatypes/epc/domain/epc_property_data.py index 8795b389..68a25205 100644 --- a/datatypes/epc/domain/epc_property_data.py +++ b/datatypes/epc/domain/epc_property_data.py @@ -29,7 +29,9 @@ class MainHeatingDetail: boiler_flue_type: Optional[int] = None # TODO: make enum? boiler_ignition_type: Optional[int] = None # TODO: make enum? central_heating_pump_age: Optional[int] = None - central_heating_pump_age_str: Optional[str] = None # str from site notes e.g. "Unknown", "Pre 2013" + central_heating_pump_age_str: Optional[str] = ( + None # str from site notes e.g. "Unknown", "Pre 2013" + ) main_heating_index_number: Optional[int] = None sap_main_heating_code: Optional[int] = None # TODO: make enum? 
main_heating_number: Optional[int] = None @@ -54,7 +56,7 @@ class ShowerOutlets: @dataclass class SapHeating: - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] has_fixed_air_conditioning: bool cylinder_size: Optional[Union[int, str]] = ( @@ -67,7 +69,9 @@ class SapHeating: cylinder_insulation_type: Optional[Union[int, str]] = None cylinder_thermostat: Optional[str] = None secondary_fuel_type: Optional[int] = None - secondary_heating_type: Optional[Union[int, str]] = None # int from API; str from site notes + secondary_heating_type: Optional[Union[int, str]] = ( + None # int from API; str from site notes + ) cylinder_insulation_thickness_mm: Optional[int] = None @@ -75,7 +79,9 @@ class SapHeating: class SapVentilation: ventilation_type: Optional[str] = None draught_lobby: Optional[bool] = None - pressure_test: Optional[str] = None # str from site notes e.g. "No test"; int in API via mechanical_ventilation + pressure_test: Optional[str] = ( + None # str from site notes e.g. "No test"; int in API via mechanical_ventilation + ) open_flues_count: Optional[int] = None closed_flues_count: Optional[int] = None boiler_flues_count: Optional[int] = None @@ -219,8 +225,12 @@ class SapBuildingPart: None # TODO: make enum/mapping? ) floor_type: Optional[str] = None # str from site notes e.g. "Ground Floor" - floor_construction_type: Optional[str] = None # str from site notes; distinct from floor_construction: int in SapFloorDimension - floor_insulation_type_str: Optional[str] = None # str from site notes e.g. "As Built" + floor_construction_type: Optional[str] = ( + None # str from site notes; distinct from floor_construction: int in SapFloorDimension + ) + floor_insulation_type_str: Optional[str] = ( + None # str from site notes e.g. 
"As Built" + ) floor_u_value_known: Optional[bool] = None roof_construction: Optional[int] = None diff --git a/datatypes/epc/schema/rdsap_schema_17_0.py b/datatypes/epc/schema/rdsap_schema_17_0.py index 22aaded4..9cbedf97 100644 --- a/datatypes/epc/schema/rdsap_schema_17_0.py +++ b/datatypes/epc/schema/rdsap_schema_17_0.py @@ -37,7 +37,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] cylinder_insulation_type: int diff --git a/datatypes/epc/schema/rdsap_schema_17_1.py b/datatypes/epc/schema/rdsap_schema_17_1.py index a4c007ed..b0af07e6 100644 --- a/datatypes/epc/schema/rdsap_schema_17_1.py +++ b/datatypes/epc/schema/rdsap_schema_17_1.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] cylinder_insulation_type: int diff --git a/datatypes/epc/schema/rdsap_schema_18_0.py b/datatypes/epc/schema/rdsap_schema_18_0.py index a038dc9b..4ce2f887 100644 --- a/datatypes/epc/schema/rdsap_schema_18_0.py +++ b/datatypes/epc/schema/rdsap_schema_18_0.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str @@ -86,6 +86,7 @@ class SapFloorDimension: @dataclass class SapRoomInRoof: """Room-in-roof details. 
floor_area is a Measurement object in schema 18.0.""" + floor_area: Measurement insulation: str roof_room_connected: str diff --git a/datatypes/epc/schema/rdsap_schema_19_0.py b/datatypes/epc/schema/rdsap_schema_19_0.py index b94d9bb3..b3c77ec4 100644 --- a/datatypes/epc/schema/rdsap_schema_19_0.py +++ b/datatypes/epc/schema/rdsap_schema_19_0.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str diff --git a/datatypes/epc/schema/rdsap_schema_20_0_0.py b/datatypes/epc/schema/rdsap_schema_20_0_0.py index 8f3986a2..9deb235e 100644 --- a/datatypes/epc/schema/rdsap_schema_20_0_0.py +++ b/datatypes/epc/schema/rdsap_schema_20_0_0.py @@ -49,7 +49,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str @@ -103,6 +103,7 @@ class SapFloorDimension: @dataclass class SapRoomInRoof: """Room-in-roof details. 
floor_area is a plain number in schema 20.0.0 (not a Measurement object).""" + floor_area: Union[int, float] insulation: str roof_room_connected: str diff --git a/datatypes/epc/schema/rdsap_schema_21_0_0.py b/datatypes/epc/schema/rdsap_schema_21_0_0.py index eee00cb8..8d19e5f9 100644 --- a/datatypes/epc/schema/rdsap_schema_21_0_0.py +++ b/datatypes/epc/schema/rdsap_schema_21_0_0.py @@ -33,6 +33,7 @@ class ShowerOutlets: @dataclass class InstantaneousWwhrs: """Changed in 21.0.0: references WWHRS product index numbers instead of room counts.""" + wwhrs_index_number1: Optional[int] = None wwhrs_index_number2: Optional[int] = None @@ -61,7 +62,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str @@ -154,6 +155,7 @@ class SapFloorDimension: @dataclass class SapRoomInRoof: """Room-in-roof details. 
insulation and roof_room_connected removed in schema 21.0.0.""" + floor_area: Union[int, float] construction_age_band: str diff --git a/datatypes/epc/schema/rdsap_schema_21_0_1.py b/datatypes/epc/schema/rdsap_schema_21_0_1.py index 9b3dbd1d..f6be7cc3 100644 --- a/datatypes/epc/schema/rdsap_schema_21_0_1.py +++ b/datatypes/epc/schema/rdsap_schema_21_0_1.py @@ -50,7 +50,7 @@ class MainHeatingDetail: main_heating_fraction: int main_heating_data_source: int boiler_flue_type: Optional[int] = None - fan_flue_present: Optional[str] = None # TODO: make bool + fan_flue_present: Optional[str] = None # TODO: make bool boiler_ignition_type: Optional[int] = None central_heating_pump_age: Optional[int] = None main_heating_index_number: Optional[int] = None @@ -62,7 +62,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: InstantaneousWwhrs + instantaneous_wwhrs: Optional[InstantaneousWwhrs] main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str diff --git a/domain/epc/__init__.py b/domain/epc/__init__.py new file mode 100644 index 00000000..e49fea42 --- /dev/null +++ b/domain/epc/__init__.py @@ -0,0 +1,4 @@ +from domain.epc.epc_record import EpcRecord +from domain.epc.property_type import PropertyType + +__all__ = ["EpcRecord", "PropertyType"] diff --git a/domain/epc/epc_record.py b/domain/epc/epc_record.py new file mode 100644 index 00000000..7194d1d6 --- /dev/null +++ b/domain/epc/epc_record.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + +from domain.epc.property_type import PropertyType + + +@dataclass(frozen=True) +class EpcRecord: + """A streamlined record of EPC property data. + + A focused subset of the full ``EpcPropertyData``: a property's identity + plus its typed property type. Grow this with further fields as the + domain needs them. 
+ """ + + address_line_1: str + postcode: str + uprn: Optional[int] + property_type: PropertyType diff --git a/domain/epc/property_type.py b/domain/epc/property_type.py new file mode 100644 index 00000000..707988aa --- /dev/null +++ b/domain/epc/property_type.py @@ -0,0 +1,9 @@ +from enum import Enum + + +class PropertyType(Enum): + HOUSE = "House" + BUNGALOW = "Bungalow" + FLAT = "Flat" + MAISONETTE = "Maisonette" + PARK_HOME = "Park home" diff --git a/infrastructure/epc/__init__.py b/infrastructure/epc/__init__.py new file mode 100644 index 00000000..f99a7cb3 --- /dev/null +++ b/infrastructure/epc/__init__.py @@ -0,0 +1,13 @@ +from infrastructure.epc.epc_client import EpcClient +from infrastructure.epc.exceptions import ( + EpcApiError, + EpcNotFoundError, + EpcRateLimitError, +) + +__all__ = [ + "EpcApiError", + "EpcClient", + "EpcNotFoundError", + "EpcRateLimitError", +] diff --git a/infrastructure/epc/epc_client.py b/infrastructure/epc/epc_client.py new file mode 100644 index 00000000..d1f8639c --- /dev/null +++ b/infrastructure/epc/epc_client.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Optional + +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.search import EpcSearchResult + + +class EpcClient(ABC): + """Interface for retrieving EPC (Energy Performance Certificate) data. + + Implementations fetch from a data source and return domain objects; + callers depend only on this interface, not on a concrete transport. + """ + + @abstractmethod + def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: + """Return the EPC certificates registered at ``postcode``. + + Returns an empty list when the postcode has no certificates. + """ + ... + + @abstractmethod + def get_by_certificate_number( + self, certificate_number: str + ) -> EpcPropertyData: + """Return the full EPC record for a certificate number. 
+ + Raises EpcNotFoundError when no such certificate exists. + """ + ... + + @abstractmethod + def get_by_uprn(self, uprn: int) -> Optional[EpcPropertyData]: + """Return the most recent EPC record for ``uprn``. + + Returns None when the UPRN has no certificates. + """ + ... diff --git a/infrastructure/epc/exceptions.py b/infrastructure/epc/exceptions.py new file mode 100644 index 00000000..8e2e5165 --- /dev/null +++ b/infrastructure/epc/exceptions.py @@ -0,0 +1,17 @@ +from typing import Optional + + +class EpcApiError(Exception): + """Base for all EPC client errors.""" + + +class EpcNotFoundError(EpcApiError): + """Raised when the API returns 404 for a resource that must exist.""" + + +class EpcRateLimitError(EpcApiError): + """Raised when the API returns 429 and all retries are exhausted.""" + + def __init__(self, message: str, retry_after: Optional[float] = None) -> None: + super().__init__(message) + self.retry_after = retry_after diff --git a/infrastructure/epc/gov_uk/__init__.py b/infrastructure/epc/gov_uk/__init__.py new file mode 100644 index 00000000..d491a1ef --- /dev/null +++ b/infrastructure/epc/gov_uk/__init__.py @@ -0,0 +1,6 @@ +from infrastructure.epc.gov_uk.gov_uk_epc_client import GovUkEpcClient +from infrastructure.epc.gov_uk.gov_uk_property_type import ( + property_type_from_gov_uk_code, +) + +__all__ = ["GovUkEpcClient", "property_type_from_gov_uk_code"] diff --git a/infrastructure/epc/gov_uk/_retry.py b/infrastructure/epc/gov_uk/_retry.py new file mode 100644 index 00000000..db92b131 --- /dev/null +++ b/infrastructure/epc/gov_uk/_retry.py @@ -0,0 +1,34 @@ +import time +from typing import Callable, Optional, TypeVar + +from infrastructure.epc.exceptions import EpcRateLimitError + +T = TypeVar("T") + + +def call_with_retry( + fn: Callable[[], T], + max_retries: int = 5, + backoff_base: float = 1.0, + backoff_multiplier: float = 2.0, + max_backoff: float = 60.0, +) -> T: + """Call ``fn``, retrying on EpcRateLimitError with exponential backoff. 
+ + Honours the API's ``Retry-After`` header when present, otherwise backs off + ``backoff_base * backoff_multiplier ** attempt`` (capped at ``max_backoff``). + """ + last_exc: Optional[EpcRateLimitError] = None + for attempt in range(max_retries + 1): + try: + return fn() + except EpcRateLimitError as exc: + last_exc = exc + if attempt < max_retries: + if exc.retry_after is not None: + delay = exc.retry_after + else: + delay = backoff_base * (backoff_multiplier**attempt) + time.sleep(min(delay, max_backoff)) + assert last_exc is not None + raise last_exc diff --git a/infrastructure/epc/gov_uk/gov_uk_epc_client.py b/infrastructure/epc/gov_uk/gov_uk_epc_client.py new file mode 100644 index 00000000..ac0db09f --- /dev/null +++ b/infrastructure/epc/gov_uk/gov_uk_epc_client.py @@ -0,0 +1,132 @@ +# Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml +from __future__ import annotations + +from typing import Any, Optional + +import httpx + +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.domain.mapper import EpcPropertyDataMapper +from datatypes.epc.search import EpcSearchResult +from infrastructure.epc.epc_client import EpcClient +from infrastructure.epc.exceptions import ( + EpcApiError, + EpcNotFoundError, + EpcRateLimitError, +) +from infrastructure.epc.gov_uk._retry import call_with_retry + + +class GovUkEpcClient(EpcClient): + """EpcClient backed by the live gov.uk EPC API. 
+ + Endpoint: https://api.get-energy-performance-data.communities.gov.uk + """ + + BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" + REQUEST_TIMEOUT = 10.0 + + def __init__(self, auth_token: str) -> None: + self._headers = { + "Authorization": f"Bearer {auth_token}", + "Accept": "application/json", + } + + def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: + normalised = self._normalise_postcode(postcode) + return call_with_retry(lambda: self._search(postcode=normalised)) + + def get_by_certificate_number( + self, certificate_number: str + ) -> EpcPropertyData: + raw = call_with_retry(lambda: self._fetch_certificate(certificate_number)) + return EpcPropertyDataMapper.from_api_response(raw) + + def get_by_uprn(self, uprn: int) -> Optional[EpcPropertyData]: + results = call_with_retry(lambda: self._search(uprn=uprn)) + if not results: + return None + latest = max(results, key=lambda r: r.registration_date) + return self.get_by_certificate_number(latest.certificate_number) + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + @staticmethod + def _normalise_postcode(postcode: str) -> str: + """Return the postcode with all spaces removed and uppercased.""" + return postcode.replace(" ", "").upper() + + @staticmethod + def _parse_retry_after(resp: httpx.Response) -> Optional[float]: + header = resp.headers.get("Retry-After") + if header is None: + return None + try: + return float(header) + except (TypeError, ValueError): + return None + + def _fetch_certificate(self, certificate_number: str) -> dict[str, Any]: + resp = httpx.get( + f"{self.BASE_URL}/api/certificate", + params={"certificate_number": certificate_number}, + headers=self._headers, + timeout=self.REQUEST_TIMEOUT, + ) + if resp.status_code == 404: + raise EpcNotFoundError(certificate_number) + if resp.status_code == 429: + raise EpcRateLimitError( + "Rate 
limited by EPC API", + retry_after=self._parse_retry_after(resp), + ) + if not resp.is_success: + raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") + return resp.json()["data"] + + def _search( + self, + postcode: Optional[str] = None, + uprn: Optional[int] = None, + ) -> list[EpcSearchResult]: + params: dict[str, str | int] = {} + if postcode: + params["postcode"] = postcode + if uprn is not None: + params["uprn"] = uprn + + resp = httpx.get( + f"{self.BASE_URL}/api/domestic/search", + params=params, + headers=self._headers, + timeout=self.REQUEST_TIMEOUT, + ) + if resp.status_code == 404: + return [] + if resp.status_code == 429: + raise EpcRateLimitError( + "Rate limited by EPC API", + retry_after=self._parse_retry_after(resp), + ) + if not resp.is_success: + raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") + + rows = resp.json().get("data", []) + return [self._parse_search_result(row) for row in rows] + + @staticmethod + def _parse_search_result(row: dict[str, Any]) -> EpcSearchResult: + return EpcSearchResult( + certificate_number=row["certificateNumber"], + address_line_1=row["addressLine1"], + address_line_2=row.get("addressLine2"), + address_line_3=row.get("addressLine3"), + address_line_4=row.get("addressLine4"), + postcode=row["postcode"], + post_town=row["postTown"], + uprn=row.get("uprn"), + current_energy_efficiency_band=row["currentEnergyEfficiencyBand"], + registration_date=row["registrationDate"], + ) diff --git a/infrastructure/epc/gov_uk/gov_uk_property_type.py b/infrastructure/epc/gov_uk/gov_uk_property_type.py new file mode 100644 index 00000000..a0f4a7a3 --- /dev/null +++ b/infrastructure/epc/gov_uk/gov_uk_property_type.py @@ -0,0 +1,25 @@ +from domain.epc.property_type import PropertyType + +# GOV.UK EPC API ``property_type`` integer codes mapped to the domain type. 
+# This translation is GOV.UK-specific and lives in the infrastructure layer so +# the domain ``PropertyType`` stays free of any source encoding. +_PROPERTY_TYPE_BY_GOV_UK_CODE: dict[int, PropertyType] = { + 0: PropertyType.HOUSE, + 1: PropertyType.BUNGALOW, + 2: PropertyType.FLAT, + 3: PropertyType.MAISONETTE, + 4: PropertyType.PARK_HOME, +} + + +def property_type_from_gov_uk_code(code: int) -> PropertyType: + """Translate a GOV.UK EPC ``property_type`` code to the domain PropertyType. + + Raises ValueError for a code GOV.UK has not been mapped here yet. + """ + try: + return _PROPERTY_TYPE_BY_GOV_UK_CODE[code] + except KeyError: + raise ValueError( + f"Unknown GOV.UK EPC property type code: {code}" + ) from None diff --git a/infrastructure/epc/historical_open_data_communities/__init__.py b/infrastructure/epc/historical_open_data_communities/__init__.py new file mode 100644 index 00000000..88a69081 --- /dev/null +++ b/infrastructure/epc/historical_open_data_communities/__init__.py @@ -0,0 +1,5 @@ +from infrastructure.epc.historical_open_data_communities.historical_open_data_communities_epc_client import ( + HistoricalOpenDataCommunitiesEpcClient, +) + +__all__ = ["HistoricalOpenDataCommunitiesEpcClient"] diff --git a/infrastructure/epc/historical_open_data_communities/historical_open_data_communities_epc_client.py b/infrastructure/epc/historical_open_data_communities/historical_open_data_communities_epc_client.py new file mode 100644 index 00000000..d8c7f9ac --- /dev/null +++ b/infrastructure/epc/historical_open_data_communities/historical_open_data_communities_epc_client.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from typing import Optional + +from domain.epc.epc_record import EpcRecord + + +class HistoricalOpenDataCommunitiesEpcClient: + """EPC client backed by Open Data Communities' historical EPC data. + + Stub — not yet implemented. Every method raises NotImplementedError for + now. 
Unlike GovUkEpcClient it returns the domain ``EpcRecord`` directly; + once the ``EpcClient`` port is migrated to return ``EpcRecord``, this + adapter should implement it. + """ + + def search_by_postcode(self, postcode: str) -> list[EpcRecord]: + raise NotImplementedError + + def get_by_certificate_number(self, certificate_number: str) -> EpcRecord: + raise NotImplementedError + + def get_by_uprn(self, uprn: int) -> Optional[EpcRecord]: + raise NotImplementedError diff --git a/infrastructure/openai/__init__.py b/infrastructure/openai/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/openai/exceptions.py b/infrastructure/openai/exceptions.py new file mode 100644 index 00000000..14cf95a2 --- /dev/null +++ b/infrastructure/openai/exceptions.py @@ -0,0 +1,2 @@ +class OpenAiClientError(Exception): + """Base for all OpenAI client errors.""" diff --git a/infrastructure/openai/openai_client.py b/infrastructure/openai/openai_client.py new file mode 100644 index 00000000..34af4290 --- /dev/null +++ b/infrastructure/openai/openai_client.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import os +from typing import Optional + +from openai import OpenAI +from openai.types.chat import ChatCompletionMessageParam + +from infrastructure.openai.exceptions import OpenAiClientError + + +class OpenAiChatClient: + """Thin wrapper over the OpenAI Chat Completions API. + + Sends a single prompt and returns the assistant's reply as plain text. + """ + + DEFAULT_MODEL = "gpt-4o-mini" + + def __init__( + self, + api_key: Optional[str] = None, + model: Optional[str] = None, + ) -> None: + key = api_key or os.environ.get("OPENAI_API_KEY") + if not key: + raise OpenAiClientError( + "No OpenAI API key provided. " + "Pass api_key or set the OPENAI_API_KEY environment variable." 
+ ) + self._client = OpenAI(api_key=key) + self._model = model or self.DEFAULT_MODEL + + def generate( + self, + prompt: str, + system_prompt: Optional[str] = None, + ) -> str: + """Send a prompt to the model and return its reply text. + + Args: + prompt: The user message to send. + system_prompt: Optional instruction that sets the model's behaviour. + + Raises: + OpenAiClientError: If the model returns an empty response. + """ + messages: list[ChatCompletionMessageParam] = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + response = self._client.chat.completions.create( + model=self._model, + messages=messages, + ) + content = response.choices[0].message.content + if content is None: + raise OpenAiClientError("OpenAI returned an empty response.") + return content diff --git a/infrastructure/s3/__init__.py b/infrastructure/s3/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/csv_s3_client.py b/infrastructure/s3/csv_s3_client.py similarity index 95% rename from infrastructure/csv_s3_client.py rename to infrastructure/s3/csv_s3_client.py index d058ba53..67c9a8d4 100644 --- a/infrastructure/csv_s3_client.py +++ b/infrastructure/s3/csv_s3_client.py @@ -1,8 +1,8 @@ import csv from io import StringIO -from infrastructure.s3_client import S3Client -from infrastructure.s3_uri import parse_s3_uri +from infrastructure.s3.s3_client import S3Client +from infrastructure.s3.s3_uri import parse_s3_uri def _dedupe_fieldnames(fieldnames: list[str]) -> list[str]: diff --git a/infrastructure/s3_client.py b/infrastructure/s3/s3_client.py similarity index 100% rename from infrastructure/s3_client.py rename to infrastructure/s3/s3_client.py diff --git a/infrastructure/s3_uri.py b/infrastructure/s3/s3_uri.py similarity index 100% rename from infrastructure/s3_uri.py rename to infrastructure/s3/s3_uri.py diff --git 
a/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py b/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py index 260fce1d..20bae20c 100644 --- a/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py +++ b/repositories/unstandardised_address/unstandardised_address_list_csv_s3_repository.py @@ -6,7 +6,7 @@ from typing import Optional from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode -from infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from repositories.unstandardised_address.unstandardised_address_list_repository import ( UnstandardisedAddressListRepository, ) diff --git a/tests/infrastructure/epc/__init__.py b/tests/infrastructure/epc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/infrastructure/epc/gov_uk/__init__.py b/tests/infrastructure/epc/gov_uk/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/infrastructure/epc/gov_uk/conftest.py b/tests/infrastructure/epc/gov_uk/conftest.py new file mode 100644 index 00000000..8fbd3094 --- /dev/null +++ b/tests/infrastructure/epc/gov_uk/conftest.py @@ -0,0 +1,49 @@ +import json +import pathlib + +import pytest + +from infrastructure.epc.gov_uk.gov_uk_epc_client import GovUkEpcClient + +SAMPLES_DIR = pathlib.Path("backend/epc_api/json_samples") + + +@pytest.fixture +def rdsap_21_0_0_cert(): + return json.loads((SAMPLES_DIR / "RdSAP-Schema-21.0.0/epc.json").read_text()) + + +@pytest.fixture +def rdsap_21_0_1_cert(): + return json.loads((SAMPLES_DIR / "RdSAP-Schema-21.0.1/epc.json").read_text()) + + +@pytest.fixture +def epc_client(): + return GovUkEpcClient(auth_token="test-token") + + +def make_search_row( + cert_num="CERT-001", + address_line_1="1 Test Street", + postcode="SW1A 1AA", + post_town="London", + uprn=100023336956, + band="D", + 
registration_date="2024-01-01", + address_line_2=None, + address_line_3=None, + address_line_4=None, +): + return { + "certificateNumber": cert_num, + "addressLine1": address_line_1, + "addressLine2": address_line_2, + "addressLine3": address_line_3, + "addressLine4": address_line_4, + "postcode": postcode, + "postTown": post_town, + "uprn": uprn, + "currentEnergyEfficiencyBand": band, + "registrationDate": registration_date, + } diff --git a/tests/infrastructure/epc/gov_uk/test_gov_uk_epc_client.py b/tests/infrastructure/epc/gov_uk/test_gov_uk_epc_client.py new file mode 100644 index 00000000..46164a0e --- /dev/null +++ b/tests/infrastructure/epc/gov_uk/test_gov_uk_epc_client.py @@ -0,0 +1,211 @@ +from unittest.mock import MagicMock, call, patch + +import pytest + +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.search import EpcSearchResult +from infrastructure.epc.exceptions import EpcNotFoundError +from tests.infrastructure.epc.gov_uk.conftest import make_search_row + +_SLEEP = "infrastructure.epc.gov_uk._retry.time.sleep" + + +def _mock_response(status_code=200, json_data=None, headers=None): + resp = MagicMock() + resp.status_code = status_code + resp.is_success = 200 <= status_code < 300 + resp.json.return_value = json_data or {} + resp.text = str(json_data) + resp.headers = headers or {} + return resp + + +# --------------------------------------------------------------------------- +# Test 1: get_by_certificate_number happy path +# --------------------------------------------------------------------------- + + +def test_get_by_certificate_number_returns_epc_property_data( + epc_client, rdsap_21_0_1_cert +): + cert_response = {"data": rdsap_21_0_1_cert} + with patch("httpx.get", return_value=_mock_response(200, cert_response)): + result = epc_client.get_by_certificate_number("CERT-001") + + assert isinstance(result, EpcPropertyData) + + +# --------------------------------------------------------------------------- +# 
Test 2: get_by_certificate_number 404 -> EpcNotFoundError +# --------------------------------------------------------------------------- + + +def test_get_by_certificate_number_404_raises_not_found(epc_client): + with patch("httpx.get", return_value=_mock_response(404)): + with pytest.raises(EpcNotFoundError): + epc_client.get_by_certificate_number("BAD-CERT") + + +# --------------------------------------------------------------------------- +# Test 3: 429 retried, succeeds on 3rd attempt +# --------------------------------------------------------------------------- + + +def test_get_by_certificate_number_retries_on_429_and_succeeds( + epc_client, rdsap_21_0_1_cert +): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429), + _mock_response(429), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch(_SLEEP): + result = epc_client.get_by_certificate_number("CERT-001") + + assert isinstance(result, EpcPropertyData) + + +# --------------------------------------------------------------------------- +# Test 3b: 429 with Retry-After header -> sleeps for that value +# --------------------------------------------------------------------------- + + +def test_429_retry_after_header_drives_sleep_duration( + epc_client, rdsap_21_0_1_cert +): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429, headers={"Retry-After": "7"}), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch(_SLEEP) as mock_sleep: + epc_client.get_by_certificate_number("CERT-001") + + mock_sleep.assert_called_once_with(7.0) + + +# --------------------------------------------------------------------------- +# Test 3c: 429 without Retry-After -> falls back to exponential backoff +# --------------------------------------------------------------------------- + + +def test_429_without_retry_after_uses_exponential_backoff( + epc_client, rdsap_21_0_1_cert +): + 
cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429), + _mock_response(429), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch(_SLEEP) as mock_sleep: + epc_client.get_by_certificate_number("CERT-001") + + assert mock_sleep.call_args_list == [call(1.0), call(2.0)] + + +# --------------------------------------------------------------------------- +# Test 3d: malformed Retry-After header -> falls back to exponential backoff +# --------------------------------------------------------------------------- + + +def test_429_malformed_retry_after_falls_back_to_backoff( + epc_client, rdsap_21_0_1_cert +): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429, headers={"Retry-After": "Wed, 21 Oct 2026 07:28:00 GMT"}), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch(_SLEEP) as mock_sleep: + epc_client.get_by_certificate_number("CERT-001") + + mock_sleep.assert_called_once_with(1.0) + + +# --------------------------------------------------------------------------- +# Test 3e: Retry-After capped by max_backoff to avoid hostile/buggy values +# --------------------------------------------------------------------------- + + +def test_429_retry_after_capped_by_max_backoff(epc_client, rdsap_21_0_1_cert): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429, headers={"Retry-After": "9999"}), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch(_SLEEP) as mock_sleep: + epc_client.get_by_certificate_number("CERT-001") + + mock_sleep.assert_called_once_with(60.0) + + +# --------------------------------------------------------------------------- +# Test 4: get_by_uprn empty search -> None +# --------------------------------------------------------------------------- + + +def test_get_by_uprn_returns_none_when_no_results(epc_client): + with patch("httpx.get", 
return_value=_mock_response(200, {"data": []})): + result = epc_client.get_by_uprn(100023336956) + + assert result is None + + +# --------------------------------------------------------------------------- +# Test 5: get_by_uprn multiple results -> fetches latest by registration_date +# --------------------------------------------------------------------------- + + +def test_get_by_uprn_picks_most_recent_certificate(epc_client, rdsap_21_0_1_cert): + search_rows = [ + make_search_row(cert_num="CERT-OLD", registration_date="2022-01-01"), + make_search_row(cert_num="CERT-NEW", registration_date="2024-06-01"), + make_search_row(cert_num="CERT-MID", registration_date="2023-03-15"), + ] + cert_response = {"data": rdsap_21_0_1_cert} + + def fake_get(url, params=None, **kwargs): + if "search" in url: + return _mock_response(200, {"data": search_rows}) + return _mock_response(200, cert_response) + + with patch("httpx.get", side_effect=fake_get) as mock_get: + result = epc_client.get_by_uprn(100023336956) + + assert isinstance(result, EpcPropertyData) + # Second call must be for the most recent cert + cert_call = mock_get.call_args_list[1] + assert cert_call.kwargs["params"]["certificate_number"] == "CERT-NEW" + + +# --------------------------------------------------------------------------- +# Test 6: search_by_postcode returns list[EpcSearchResult] +# --------------------------------------------------------------------------- + + +def test_search_by_postcode_returns_results(epc_client): + rows = [ + make_search_row(cert_num="CERT-A", address_line_1="1 High Street"), + make_search_row(cert_num="CERT-B", address_line_1="2 High Street"), + ] + with patch("httpx.get", return_value=_mock_response(200, {"data": rows})): + results = epc_client.search_by_postcode("SW1A 1AA") + + assert len(results) == 2 + assert all(isinstance(r, EpcSearchResult) for r in results) + assert results[0].certificate_number == "CERT-A" + assert results[1].address_line_1 == "2 High Street" + + +# 
--------------------------------------------------------------------------- +# Test 7: search_by_postcode 404 -> empty list +# --------------------------------------------------------------------------- + + +def test_search_by_postcode_404_returns_empty_list(epc_client): + with patch("httpx.get", return_value=_mock_response(404)): + results = epc_client.search_by_postcode("ZZ9 9ZZ") + + assert results == [] diff --git a/tests/infrastructure/test_csv_s3_client.py b/tests/infrastructure/test_csv_s3_client.py index e7ec7eab..048a1cbe 100644 --- a/tests/infrastructure/test_csv_s3_client.py +++ b/tests/infrastructure/test_csv_s3_client.py @@ -3,7 +3,7 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from tests.infrastructure import make_boto_client BUCKET = "csv-bucket" diff --git a/tests/infrastructure/test_s3_client.py b/tests/infrastructure/test_s3_client.py index 67db4f58..bdac6be1 100644 --- a/tests/infrastructure/test_s3_client.py +++ b/tests/infrastructure/test_s3_client.py @@ -3,7 +3,7 @@ from collections.abc import Iterator import pytest from moto import mock_aws -from infrastructure.s3_client import S3Client +from infrastructure.s3.s3_client import S3Client from tests.infrastructure import make_boto_client BUCKET = "test-bucket" diff --git a/tests/infrastructure/test_s3_uri.py b/tests/infrastructure/test_s3_uri.py index 32fd710f..f0865865 100644 --- a/tests/infrastructure/test_s3_uri.py +++ b/tests/infrastructure/test_s3_uri.py @@ -1,6 +1,6 @@ import pytest -from infrastructure.s3_uri import parse_s3_uri +from infrastructure.s3.s3_uri import parse_s3_uri def test_parses_simple_s3_uri() -> None: diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index d21bcfba..9ad56094 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py 
+++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -13,7 +13,7 @@ from sqlalchemy import Engine from sqlmodel import Session from infrastructure.address2uprn_queue_client import Address2UprnQueueClient -from infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator from orchestration.task_orchestrator import TaskOrchestrator from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository diff --git a/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py b/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py index 866d6f2d..f86878c3 100644 --- a/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py +++ b/tests/repositories/unstandardised_address/test_unstandardised_address_list_csv_s3_repository.py @@ -5,7 +5,7 @@ from moto import mock_aws from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode -from infrastructure.csv_s3_client import CsvS3Client +from infrastructure.s3.csv_s3_client import CsvS3Client from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( UnstandardisedAddressListCsvS3Repository, ) From c887153292e2581c217f9374648d5181ae84b260 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 14:07:10 +0000 Subject: [PATCH 16/29] renamed to chatgpt --- infrastructure/{openai => chatgpt}/__init__.py | 0 .../{openai/openai_client.py => chatgpt/chatgpt.py} | 8 ++++---- infrastructure/{openai => chatgpt}/exceptions.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename infrastructure/{openai => chatgpt}/__init__.py (100%) rename infrastructure/{openai/openai_client.py => chatgpt/chatgpt.py} (89%) rename infrastructure/{openai => 
chatgpt}/exceptions.py (54%) diff --git a/infrastructure/openai/__init__.py b/infrastructure/chatgpt/__init__.py similarity index 100% rename from infrastructure/openai/__init__.py rename to infrastructure/chatgpt/__init__.py diff --git a/infrastructure/openai/openai_client.py b/infrastructure/chatgpt/chatgpt.py similarity index 89% rename from infrastructure/openai/openai_client.py rename to infrastructure/chatgpt/chatgpt.py index 34af4290..ee2a5b39 100644 --- a/infrastructure/openai/openai_client.py +++ b/infrastructure/chatgpt/chatgpt.py @@ -6,10 +6,10 @@ from typing import Optional from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam -from infrastructure.openai.exceptions import OpenAiClientError +from infrastructure.chatgpt.exceptions import ChatGPTClientError -class OpenAiChatClient: +class ChatGPT: """Thin wrapper over the OpenAI Chat Completions API. Sends a single prompt and returns the assistant's reply as plain text. @@ -24,7 +24,7 @@ class OpenAiChatClient: ) -> None: key = api_key or os.environ.get("OPENAI_API_KEY") if not key: - raise OpenAiClientError( + raise ChatGPTClientError( "No OpenAI API key provided. " "Pass api_key or set the OPENAI_API_KEY environment variable." 
) @@ -56,5 +56,5 @@ class OpenAiChatClient: ) content = response.choices[0].message.content if content is None: - raise OpenAiClientError("OpenAI returned an empty response.") + raise ChatGPTClientError("ChatGPT returned an empty response.") return content diff --git a/infrastructure/openai/exceptions.py b/infrastructure/chatgpt/exceptions.py similarity index 54% rename from infrastructure/openai/exceptions.py rename to infrastructure/chatgpt/exceptions.py index 14cf95a2..31663f3d 100644 --- a/infrastructure/openai/exceptions.py +++ b/infrastructure/chatgpt/exceptions.py @@ -1,2 +1,2 @@ -class OpenAiClientError(Exception): +class ChatGPTClientError(Exception): """Base for all OpenAI client errors.""" From e23bcd7e138c08205471b49675faf2f5fa433068 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 14:51:28 +0000 Subject: [PATCH 17/29] chatgpt interface scaffold --- UBIQUITOUS_LANGUAGE.md | 6 ++++++ applications/SAL/handler.py | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md index d2fde99a..34dc3115 100644 --- a/UBIQUITOUS_LANGUAGE.md +++ b/UBIQUITOUS_LANGUAGE.md @@ -49,6 +49,12 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve | **New EPC API** | The replacement government API (`api.get-energy-performance-data.communities.gov.uk`) using Bearer token auth. | "new API", "current API" | | **Bearer Token** | The auth credential required by the new EPC API; stored in the `EPC_AUTH_TOKEN` environment variable. | "API key", "auth token", "secret" | +## Methodology + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **DDD** | Domain-Driven Design — the design approach this glossary supports, modelling software around a shared domain language. | "domain design", "driven design" | + ## Relationships - An **EPC** belongs to exactly one **Dwelling** and has one **Certificate Number**. 
diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py index f354171c..af3aa90f 100644 --- a/applications/SAL/handler.py +++ b/applications/SAL/handler.py @@ -9,8 +9,6 @@ from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repo ) from domain.addresses.unstandardised_address import AddressList -from infrastructure.epc.gov_uk import GovUkEpcClient - def handler( body: dict[str, Any], @@ -36,6 +34,11 @@ def handler( addressList: AddressList = sal.get_unstandardised_addresses(input_s3_uri=s3_uri) + column_mapping = { + # "Wall Description": "Walls", + "Property Type": "Property Type", + } + col_to_desc_map = sal.get_col_to_description_mappings( list_of_unstandardised_address=addressList ) From d0e5aa9e3f7ccb8c63b1799671b7fb54f2af6862 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 14:53:31 +0000 Subject: [PATCH 18/29] =?UTF-8?q?Classify=20a=20landlord=20description=20i?= =?UTF-8?q?nto=20a=20SAL=20property=20type=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- domain/sal/__init__.py | 0 domain/sal/property_type.py | 25 ++++++++++++ domain/sal/property_type_classifier.py | 27 +++++++++++++ .../chatgpt_property_type_classifier.py | 38 +++++++++++++++++++ tests/infrastructure/chatgpt/__init__.py | 0 .../test_chatgpt_property_type_classifier.py | 33 ++++++++++++++++ 6 files changed, 123 insertions(+) create mode 100644 domain/sal/__init__.py create mode 100644 domain/sal/property_type.py create mode 100644 domain/sal/property_type_classifier.py create mode 100644 infrastructure/chatgpt/chatgpt_property_type_classifier.py create mode 100644 tests/infrastructure/chatgpt/__init__.py create mode 100644 tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py diff --git a/domain/sal/__init__.py b/domain/sal/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/domain/sal/property_type.py b/domain/sal/property_type.py new file mode 
100644 index 00000000..9659639a --- /dev/null +++ b/domain/sal/property_type.py @@ -0,0 +1,25 @@ +from enum import Enum + + +class PropertyType(Enum): + """A landlord-supplied property type, as resolved by the SAL context. + + Distinct from the EPC context's ``PropertyType``: a landlord CSV value + may be unresolvable, so this enum carries an explicit ``UNKNOWN`` member. + """ + + HOUSE = "House" + BUNGALOW = "Bungalow" + FLAT = "Flat" + MAISONETTE = "Maisonette" + PARK_HOME = "Park home" + UNKNOWN = "Unknown" + + +class PropertyTypeClassificationError(Exception): + """Raised when property-type classification fails wholesale. + + A whole-batch failure (the AI backend is unreachable, or returns a reply + that cannot be parsed) raises this. A single description that merely + cannot be resolved is not an error -- it maps to ``PropertyType.UNKNOWN``. + """ diff --git a/domain/sal/property_type_classifier.py b/domain/sal/property_type_classifier.py new file mode 100644 index 00000000..af941e83 --- /dev/null +++ b/domain/sal/property_type_classifier.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from domain.sal.property_type import PropertyType + + +class PropertyTypeClassifier(ABC): + """Port: resolves free-text descriptions into SAL ``PropertyType`` values. + + Implementations decide *how* (an LLM, a lookup table, a rules engine); + ``SALOrchestrator`` depends only on this interface. + """ + + @abstractmethod + def classify(self, descriptions: set[str]) -> dict[str, PropertyType]: + """Classify each description into a ``PropertyType``. + + Every input description appears as a key in the result. A description + that cannot be resolved maps to ``PropertyType.UNKNOWN``. + + Raises: + PropertyTypeClassificationError: If the classification call fails + wholesale (e.g. the backend is unreachable or returns an + unparseable response). + """ + ... 
diff --git a/infrastructure/chatgpt/chatgpt_property_type_classifier.py b/infrastructure/chatgpt/chatgpt_property_type_classifier.py new file mode 100644 index 00000000..d4f0c060 --- /dev/null +++ b/infrastructure/chatgpt/chatgpt_property_type_classifier.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import json +from typing import Any + +from domain.sal.property_type import PropertyType +from domain.sal.property_type_classifier import PropertyTypeClassifier +from infrastructure.chatgpt.chatgpt import ChatGPT + + +class ChatGptPropertyTypeClassifier(PropertyTypeClassifier): + """PropertyTypeClassifier backed by the ChatGPT client.""" + + _CATEGORIES = ", ".join( + member.value + for member in PropertyType + if member is not PropertyType.UNKNOWN + ) + _SYSTEM_PROMPT = ( + "Classify each UK property description into exactly one category. " + f"Categories: {_CATEGORIES}. " + "Reply with only a JSON object mapping each original description " + "to its category, and nothing else." 
+ ) + + def __init__(self, chat_gpt: ChatGPT) -> None: + self._chat_gpt = chat_gpt + + def classify(self, descriptions: set[str]) -> dict[str, PropertyType]: + reply = self._chat_gpt.generate( + prompt=json.dumps(sorted(descriptions)), + system_prompt=self._SYSTEM_PROMPT, + ) + raw: dict[str, Any] = json.loads(reply) + return { + description: PropertyType(raw[description]) + for description in descriptions + } diff --git a/tests/infrastructure/chatgpt/__init__.py b/tests/infrastructure/chatgpt/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py new file mode 100644 index 00000000..8c697eb2 --- /dev/null +++ b/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from typing import Optional + +from domain.sal.property_type import PropertyType +from infrastructure.chatgpt.chatgpt import ChatGPT +from infrastructure.chatgpt.chatgpt_property_type_classifier import ( + ChatGptPropertyTypeClassifier, +) + + +class _FakeChatGPT(ChatGPT): + """Hand-written ChatGPT stand-in: returns a canned reply, records prompts.""" + + def __init__(self, reply: str = "{}") -> None: + self.prompts: list[str] = [] + self._reply = reply + + def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str: + self.prompts.append(prompt) + return self._reply + + +def test_classifies_description_into_property_type() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}') + classifier = ChatGptPropertyTypeClassifier(chat_gpt) + + # Act + result = classifier.classify({"semi-detached"}) + + # Assert + assert result == {"semi-detached": PropertyType.HOUSE} From 11a498ba4e76a56a6797b2f99081f2bf84d8fb0f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 14:55:01 +0000 Subject: [PATCH 19/29] 
=?UTF-8?q?Map=20an=20unrecognised=20classification?= =?UTF-8?q?=20reply=20to=20UNKNOWN=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../chatgpt/chatgpt_property_type_classifier.py | 10 +++++++++- .../chatgpt/test_chatgpt_property_type_classifier.py | 12 ++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/infrastructure/chatgpt/chatgpt_property_type_classifier.py b/infrastructure/chatgpt/chatgpt_property_type_classifier.py index d4f0c060..75ec1556 100644 --- a/infrastructure/chatgpt/chatgpt_property_type_classifier.py +++ b/infrastructure/chatgpt/chatgpt_property_type_classifier.py @@ -33,6 +33,14 @@ class ChatGptPropertyTypeClassifier(PropertyTypeClassifier): ) raw: dict[str, Any] = json.loads(reply) return { - description: PropertyType(raw[description]) + description: self._to_property_type(raw[description]) for description in descriptions } + + @staticmethod + def _to_property_type(value: Any) -> PropertyType: + """Map a reply value to a PropertyType, defaulting to UNKNOWN.""" + try: + return PropertyType(value) + except ValueError: + return PropertyType.UNKNOWN diff --git a/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py index 8c697eb2..d4801154 100644 --- a/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py +++ b/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py @@ -31,3 +31,15 @@ def test_classifies_description_into_property_type() -> None: # Assert assert result == {"semi-detached": PropertyType.HOUSE} + + +def test_unrecognised_category_maps_to_unknown() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply='{"garden shed": "Shed"}') + classifier = ChatGptPropertyTypeClassifier(chat_gpt) + + # Act + result = classifier.classify({"garden shed"}) + + # Assert + assert result == {"garden shed": PropertyType.UNKNOWN} From 
a747534f377ecaab6e518b6e8eb186fde4c6bfde Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 15:28:26 +0000 Subject: [PATCH 20/29] refactored to allow multiple column types --- applications/SAL/handler.py | 48 ++++--- domain/sal/column_classifier.py | 39 +++++ domain/sal/property_type.py | 9 -- domain/sal/property_type_classifier.py | 27 ---- domain/sal/wall_type.py | 15 ++ .../chatgpt/chatgpt_column_classifier.py | 85 +++++++++++ .../chatgpt_property_type_classifier.py | 46 ------ orchestration/sal_orchestrator.py | 39 ++++- .../chatgpt/test_chatgpt_column_classifier.py | 135 ++++++++++++++++++ .../test_chatgpt_property_type_classifier.py | 45 ------ ...lord_description_overrides_orchestrator.py | 85 ++++++++++- 11 files changed, 420 insertions(+), 153 deletions(-) create mode 100644 domain/sal/column_classifier.py delete mode 100644 domain/sal/property_type_classifier.py create mode 100644 domain/sal/wall_type.py create mode 100644 infrastructure/chatgpt/chatgpt_column_classifier.py delete mode 100644 infrastructure/chatgpt/chatgpt_property_type_classifier.py create mode 100644 tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py delete mode 100644 tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py index af3aa90f..c1d73827 100644 --- a/applications/SAL/handler.py +++ b/applications/SAL/handler.py @@ -1,4 +1,6 @@ +import logging from typing import Any + import boto3 from orchestration.sal_orchestrator import ( SALOrchestrator, @@ -8,6 +10,15 @@ from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repo UnstandardisedAddressListCsvS3Repository, ) from domain.addresses.unstandardised_address import AddressList +from domain.sal.column_classifier import ColumnClassifier +from domain.sal.property_type import PropertyType +from domain.sal.wall_type import WallType +from infrastructure.chatgpt.chatgpt import ChatGPT +from 
infrastructure.chatgpt.chatgpt_column_classifier import ( + ChatGptColumnClassifier, +) + +logger = logging.getLogger(__name__) def handler( @@ -28,32 +39,31 @@ def handler( csv_client, bucket ) + # One ChatGPT-backed classifier per landlord-CSV column, keyed by column name. + chat_gpt = ChatGPT() + classifiers: dict[str, ColumnClassifier[Any]] = { + "Property Type": ChatGptColumnClassifier( + chat_gpt, PropertyType, PropertyType.UNKNOWN + ), + "Walls": ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN), + } + sal = SALOrchestrator( unstandardised_address_repo=unstandardised_address_repo, + classifiers=classifiers, ) addressList: AddressList = sal.get_unstandardised_addresses(input_s3_uri=s3_uri) - column_mapping = { - # "Wall Description": "Walls", - "Property Type": "Property Type", - } + # Cap the batch to the first 20 while the ChatGPT path is under test. + addressList = AddressList(addressList[:20]) - col_to_desc_map = sal.get_col_to_description_mappings( - list_of_unstandardised_address=addressList - ) + classified = sal.classify_columns(addressList) + for column, mapping in classified.items(): + logger.info( + "Classified %d descriptions for column %r.", len(mapping), column + ) - """ - ---- - # TODO Property Type: - # 1) Make a small enum with all property types (5 enum) - # 2) Make an interface with ChatGPTAi to get wall field description and map it to enum - # 3) Stroe in landlord overrides - # TODO Wall Type: - # 1) Make a small enum with all property types (5 enum) - # 2) Make an interface with ChatGPTAi to get wall field description and map it to enum - # 3) Stroe in landlord overrides - --- - """ + # TODO: persist `classified` to landlord overrides. 
return {"hello": ["200"]} diff --git a/domain/sal/column_classifier.py b/domain/sal/column_classifier.py new file mode 100644 index 00000000..3324d79f --- /dev/null +++ b/domain/sal/column_classifier.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from enum import Enum +from typing import Generic, TypeVar + +E = TypeVar("E", bound=Enum) + + +class ClassificationError(Exception): + """Raised when classifying a column's descriptions fails wholesale. + + A whole-batch failure (the AI backend is unreachable, or returns a reply + that cannot be parsed) raises this. A single description that merely + cannot be resolved is not an error -- it maps to the enum's UNKNOWN member. + """ + + +class ColumnClassifier(ABC, Generic[E]): + """Port: resolves free-text descriptions into a category enum ``E``. + + One classifier handles one landlord-CSV column. Implementations decide + *how* the mapping is performed (an LLM, a lookup table, a rules engine); + ``SALOrchestrator`` depends only on this interface. + """ + + @abstractmethod + def classify(self, descriptions: set[str]) -> dict[str, E]: + """Classify each description into a category enum member. + + Every input description appears as a key in the result. A description + that cannot be resolved maps to the enum's UNKNOWN member. + + Raises: + ClassificationError: If the classification call fails wholesale + (e.g. the backend is unreachable or returns an unparseable + response). + """ + ... diff --git a/domain/sal/property_type.py b/domain/sal/property_type.py index 9659639a..3980c2f0 100644 --- a/domain/sal/property_type.py +++ b/domain/sal/property_type.py @@ -14,12 +14,3 @@ class PropertyType(Enum): MAISONETTE = "Maisonette" PARK_HOME = "Park home" UNKNOWN = "Unknown" - - -class PropertyTypeClassificationError(Exception): - """Raised when property-type classification fails wholesale. 
- - A whole-batch failure (the AI backend is unreachable, or returns a reply - that cannot be parsed) raises this. A single description that merely - cannot be resolved is not an error -- it maps to ``PropertyType.UNKNOWN``. - """ diff --git a/domain/sal/property_type_classifier.py b/domain/sal/property_type_classifier.py deleted file mode 100644 index af941e83..00000000 --- a/domain/sal/property_type_classifier.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod - -from domain.sal.property_type import PropertyType - - -class PropertyTypeClassifier(ABC): - """Port: resolves free-text descriptions into SAL ``PropertyType`` values. - - Implementations decide *how* (an LLM, a lookup table, a rules engine); - ``SALOrchestrator`` depends only on this interface. - """ - - @abstractmethod - def classify(self, descriptions: set[str]) -> dict[str, PropertyType]: - """Classify each description into a ``PropertyType``. - - Every input description appears as a key in the result. A description - that cannot be resolved maps to ``PropertyType.UNKNOWN``. - - Raises: - PropertyTypeClassificationError: If the classification call fails - wholesale (e.g. the backend is unreachable or returns an - unparseable response). - """ - ... diff --git a/domain/sal/wall_type.py b/domain/sal/wall_type.py new file mode 100644 index 00000000..05dc2ba9 --- /dev/null +++ b/domain/sal/wall_type.py @@ -0,0 +1,15 @@ +from enum import Enum + + +class WallType(Enum): + """A landlord-supplied wall construction type, as resolved by the SAL context. + + Mirrors the main RdSAP wall constructions. Like the SAL ``PropertyType``, + it carries an explicit ``UNKNOWN`` member for unresolvable CSV values. 
+ """ + + CAVITY = "Cavity" + SOLID_BRICK = "Solid Brick" + TIMBER_FRAME = "Timber frame" + SANDSTONE = "Sandstone" + UNKNOWN = "Unknown" diff --git a/infrastructure/chatgpt/chatgpt_column_classifier.py b/infrastructure/chatgpt/chatgpt_column_classifier.py new file mode 100644 index 00000000..8f564e6c --- /dev/null +++ b/infrastructure/chatgpt/chatgpt_column_classifier.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import json +from enum import Enum +from typing import Any, TypeVar + +from domain.sal.column_classifier import ClassificationError, ColumnClassifier +from infrastructure.chatgpt.chatgpt import ChatGPT +from infrastructure.chatgpt.exceptions import ChatGPTClientError + +E = TypeVar("E", bound=Enum) + + +class ChatGptColumnClassifier(ColumnClassifier[E]): + """ColumnClassifier backed by ChatGPT, parametrised by a category enum. + + The same classification path -- prompt, JSON parsing, UNKNOWN fallback -- + serves any category enum; only ``category_enum`` and its ``unknown`` + member differ between columns. + """ + + def __init__( + self, + chat_gpt: ChatGPT, + category_enum: type[E], + unknown: E, + ) -> None: + self._chat_gpt = chat_gpt + self._category_enum = category_enum + self._unknown = unknown + + def classify(self, descriptions: set[str]) -> dict[str, E]: + if not descriptions: + return {} + try: + reply = self._chat_gpt.generate( + prompt=json.dumps(sorted(descriptions)), + system_prompt=self._system_prompt(), + ) + except ChatGPTClientError as error: + raise ClassificationError( + f"ChatGPT classification failed for " + f"{self._category_enum.__name__}." 
+ ) from error + try: + raw: dict[str, Any] = json.loads(self._strip_code_fence(reply)) + except json.JSONDecodeError as error: + raise ClassificationError( + f"ChatGPT returned a reply that is not valid JSON: {reply!r}" + ) from error + return { + description: self._to_category(raw.get(description)) + for description in descriptions + } + + def _system_prompt(self) -> str: + categories = ", ".join( + member.value + for member in self._category_enum + if member is not self._unknown + ) + return ( + "Classify each free-text description into exactly one category. " + f"Categories: {categories}. " + "Reply with only a JSON object mapping each original description " + "to its category, and nothing else." + ) + + def _to_category(self, value: Any) -> E: + """Map a reply value to a category member, defaulting to UNKNOWN.""" + try: + return self._category_enum(value) + except ValueError: + return self._unknown + + @staticmethod + def _strip_code_fence(reply: str) -> str: + """Remove a surrounding markdown code fence, if ChatGPT added one.""" + text = reply.strip() + if not text.startswith("```"): + return text + lines = text.splitlines()[1:] + if lines and lines[-1].strip() == "```": + lines = lines[:-1] + return "\n".join(lines) diff --git a/infrastructure/chatgpt/chatgpt_property_type_classifier.py b/infrastructure/chatgpt/chatgpt_property_type_classifier.py deleted file mode 100644 index 75ec1556..00000000 --- a/infrastructure/chatgpt/chatgpt_property_type_classifier.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import annotations - -import json -from typing import Any - -from domain.sal.property_type import PropertyType -from domain.sal.property_type_classifier import PropertyTypeClassifier -from infrastructure.chatgpt.chatgpt import ChatGPT - - -class ChatGptPropertyTypeClassifier(PropertyTypeClassifier): - """PropertyTypeClassifier backed by the ChatGPT client.""" - - _CATEGORIES = ", ".join( - member.value - for member in PropertyType - if member is not 
PropertyType.UNKNOWN - ) - _SYSTEM_PROMPT = ( - "Classify each UK property description into exactly one category. " - f"Categories: {_CATEGORIES}. " - "Reply with only a JSON object mapping each original description " - "to its category, and nothing else." - ) - - def __init__(self, chat_gpt: ChatGPT) -> None: - self._chat_gpt = chat_gpt - - def classify(self, descriptions: set[str]) -> dict[str, PropertyType]: - reply = self._chat_gpt.generate( - prompt=json.dumps(sorted(descriptions)), - system_prompt=self._SYSTEM_PROMPT, - ) - raw: dict[str, Any] = json.loads(reply) - return { - description: self._to_property_type(raw[description]) - for description in descriptions - } - - @staticmethod - def _to_property_type(value: Any) -> PropertyType: - """Map a reply value to a PropertyType, defaulting to UNKNOWN.""" - try: - return PropertyType(value) - except ValueError: - return PropertyType.UNKNOWN diff --git a/orchestration/sal_orchestrator.py b/orchestration/sal_orchestrator.py index 8ad21388..6b451746 100644 --- a/orchestration/sal_orchestrator.py +++ b/orchestration/sal_orchestrator.py @@ -1,12 +1,22 @@ +from enum import Enum +from typing import Any + +from domain.addresses.unstandardised_address import AddressList +from domain.sal.column_classifier import ColumnClassifier from repositories.unstandardised_address.unstandardised_address_list_repository import ( UnstandardisedAddressListRepository, ) -from domain.addresses.unstandardised_address import AddressList class SALOrchestrator: - def __init__(self, unstandardised_address_repo: UnstandardisedAddressListRepository) -> None: + def __init__( + self, + unstandardised_address_repo: UnstandardisedAddressListRepository, + classifiers: dict[str, ColumnClassifier[Any]], + ) -> None: self._unstandardised_address_repo = unstandardised_address_repo + # Keyed by landlord-CSV column name. 
+ self._classifiers = classifiers def get_unstandardised_addresses( self, @@ -20,6 +30,27 @@ class SALOrchestrator: mappings: dict[str, set[str]] = {} for unstandardised_address in list_of_unstandardised_address: for key, value in unstandardised_address.additional_info.items(): - # Lower-case so case-only typos collapse to one variant. - mappings.setdefault(key, set()).add(value.lower()) + bucket = mappings.setdefault(key, set()) + # A comma-separated value is several descriptions in one cell; + # split it so each is its own entry. Lower-case so case-only + # typos collapse to one variant. + for variant in value.split(","): + variant = variant.strip().lower() + if variant: + bucket.add(variant) return mappings + + def classify_columns( + self, addresses: AddressList + ) -> dict[str, dict[str, Enum]]: + """Classify every registered column's descriptions. + + Returns a mapping of column name to ``{description: category}``. A + registered column absent from the addresses contributes an empty + inner mapping. 
+ """ + col_to_desc = self.get_col_to_description_mappings(addresses) + return { + column: classifier.classify(col_to_desc.get(column, set())) + for column, classifier in self._classifiers.items() + } diff --git a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py new file mode 100644 index 00000000..5ec854f1 --- /dev/null +++ b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from typing import Optional + +import pytest + +from domain.sal.column_classifier import ClassificationError +from domain.sal.property_type import PropertyType +from domain.sal.wall_type import WallType +from infrastructure.chatgpt.chatgpt import ChatGPT +from infrastructure.chatgpt.chatgpt_column_classifier import ( + ChatGptColumnClassifier, +) +from infrastructure.chatgpt.exceptions import ChatGPTClientError + + +class _FakeChatGPT(ChatGPT): + """Hand-written ChatGPT stand-in: returns a canned reply, records prompts.""" + + def __init__( + self, + reply: str = "{}", + error: Optional[Exception] = None, + ) -> None: + self.prompts: list[str] = [] + self._reply = reply + self._error = error + + def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str: + self.prompts.append(prompt) + if self._error is not None: + raise self._error + return self._reply + + +def _property_type_classifier( + chat_gpt: ChatGPT, +) -> ChatGptColumnClassifier[PropertyType]: + return ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN) + + +def test_classifies_description_into_its_category() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify({"semi-detached"}) + + # Assert + assert result == {"semi-detached": PropertyType.HOUSE} + + +def test_classifies_when_reply_is_wrapped_in_a_markdown_fence() -> None: + # 
Arrange: ChatGPT wraps the JSON in a ```json ... ``` code fence. + chat_gpt = _FakeChatGPT(reply='```json\n{"semi-detached": "House"}\n```') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify({"semi-detached"}) + + # Assert + assert result == {"semi-detached": PropertyType.HOUSE} + + +def test_unrecognised_category_maps_to_unknown() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply='{"garden shed": "Shed"}') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify({"garden shed"}) + + # Assert + assert result == {"garden shed": PropertyType.UNKNOWN} + + +def test_description_omitted_from_reply_maps_to_unknown() -> None: + # Arrange: the reply classifies one description but not the other. + chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify({"semi-detached", "TBC"}) + + # Assert + assert result == { + "semi-detached": PropertyType.HOUSE, + "TBC": PropertyType.UNKNOWN, + } + + +def test_chatgpt_failure_raises_classification_error() -> None: + # Arrange + chat_gpt = _FakeChatGPT(error=ChatGPTClientError("backend unreachable")) + classifier = _property_type_classifier(chat_gpt) + + # Act / Assert + with pytest.raises(ClassificationError): + classifier.classify({"semi-detached"}) + + +def test_non_json_reply_raises_classification_error_with_the_raw_reply() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply="sorry, I can't do that") + classifier = _property_type_classifier(chat_gpt) + + # Act / Assert: the error surfaces the offending reply for diagnosis. 
+ with pytest.raises(ClassificationError, match="sorry, I can't do that"): + classifier.classify({"semi-detached"}) + + +def test_empty_description_set_returns_empty_without_calling_chatgpt() -> None: + # Arrange + chat_gpt = _FakeChatGPT(reply='{"unused": "House"}') + classifier = _property_type_classifier(chat_gpt) + + # Act + result = classifier.classify(set()) + + # Assert + assert result == {} + assert chat_gpt.prompts == [] + + +def test_classifies_with_a_different_category_enum() -> None: + # Arrange: the same adapter classifies a WallType column. + chat_gpt = _FakeChatGPT(reply='{"solid brick wall": "Solid Brick"}') + classifier = ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN) + + # Act + result = classifier.classify({"solid brick wall"}) + + # Assert + assert result == {"solid brick wall": WallType.SOLID_BRICK} diff --git a/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py deleted file mode 100644 index d4801154..00000000 --- a/tests/infrastructure/chatgpt/test_chatgpt_property_type_classifier.py +++ /dev/null @@ -1,45 +0,0 @@ -from __future__ import annotations - -from typing import Optional - -from domain.sal.property_type import PropertyType -from infrastructure.chatgpt.chatgpt import ChatGPT -from infrastructure.chatgpt.chatgpt_property_type_classifier import ( - ChatGptPropertyTypeClassifier, -) - - -class _FakeChatGPT(ChatGPT): - """Hand-written ChatGPT stand-in: returns a canned reply, records prompts.""" - - def __init__(self, reply: str = "{}") -> None: - self.prompts: list[str] = [] - self._reply = reply - - def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str: - self.prompts.append(prompt) - return self._reply - - -def test_classifies_description_into_property_type() -> None: - # Arrange - chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}') - classifier = ChatGptPropertyTypeClassifier(chat_gpt) - - # Act - 
result = classifier.classify({"semi-detached"}) - - # Assert - assert result == {"semi-detached": PropertyType.HOUSE} - - -def test_unrecognised_category_maps_to_unknown() -> None: - # Arrange - chat_gpt = _FakeChatGPT(reply='{"garden shed": "Shed"}') - classifier = ChatGptPropertyTypeClassifier(chat_gpt) - - # Act - result = classifier.classify({"garden shed"}) - - # Assert - assert result == {"garden shed": PropertyType.UNKNOWN} diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index b3658014..62f1a329 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -1,7 +1,13 @@ from __future__ import annotations +from enum import Enum +from typing import Any, Optional + from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress from domain.postcode import Postcode +from domain.sal.column_classifier import ColumnClassifier +from domain.sal.property_type import PropertyType +from domain.sal.wall_type import WallType from orchestration.sal_orchestrator import ( SALOrchestrator, ) @@ -20,7 +26,21 @@ class _StubUnstandardisedAddressRepository(UnstandardisedAddressListRepository): raise NotImplementedError() -def _make_unstandardised_address(landlord_additional_info: dict[str, str]) -> UnstandardisedAddress: +class _StubColumnClassifier(ColumnClassifier[Enum]): + """Records the descriptions it received; returns a canned mapping.""" + + def __init__(self, result: dict[str, Enum]) -> None: + self.received: Optional[set[str]] = None + self._result = result + + def classify(self, descriptions: set[str]) -> dict[str, Enum]: + self.received = descriptions + return self._result + + +def _make_unstandardised_address( + landlord_additional_info: dict[str, str], +) -> UnstandardisedAddress: return UnstandardisedAddress( address="1 High St", 
postcode=Postcode("AA1 1AA"), @@ -28,8 +48,13 @@ def _make_unstandardised_address(landlord_additional_info: dict[str, str]) -> Un ) -def _orchestrator() -> SALOrchestrator: - return SALOrchestrator(unstandardised_address_repo=_StubUnstandardisedAddressRepository()) +def _orchestrator( + classifiers: Optional[dict[str, ColumnClassifier[Any]]] = None, +) -> SALOrchestrator: + return SALOrchestrator( + unstandardised_address_repo=_StubUnstandardisedAddressRepository(), + classifiers=classifiers or {}, + ) def test_collects_every_value_per_shared_key() -> None: @@ -86,6 +111,19 @@ def test_case_only_variants_collapse_to_one() -> None: assert mappings == {"description": {"cosy"}} +def test_comma_separated_value_splits_into_individual_entries() -> None: + # arrange: a single cell packs several descriptions, comma-separated. + addresses = AddressList( + [_make_unstandardised_address({"description": "cosy, bright, COSY"})] + ) + + # act + mappings = _orchestrator().get_col_to_description_mappings(addresses) + + # assert: each comma-separated part is its own trimmed, lower-cased entry. + assert mappings == {"description": {"cosy", "bright"}} + + def test_empty_address_list_yields_empty_mapping() -> None: # arrange / act mappings = _orchestrator().get_col_to_description_mappings(AddressList([])) @@ -103,3 +141,44 @@ def test_single_address_yields_single_value_per_key() -> None: # assert assert mappings == {"description": {"cosy"}} + + +def test_classify_columns_classifies_each_registered_column() -> None: + # arrange: addresses carry two classifiable columns. 
+ addresses = AddressList( + [ + _make_unstandardised_address( + {"Property Type": "semi-detached", "Walls": "solid brick"} + ), + ] + ) + property_types = _StubColumnClassifier( + result={"semi-detached": PropertyType.HOUSE} + ) + wall_types = _StubColumnClassifier(result={"solid brick": WallType.SOLID_BRICK}) + + # act + result = _orchestrator( + {"Property Type": property_types, "Walls": wall_types} + ).classify_columns(addresses) + + # assert: each registered column was classified independently. + assert result == { + "Property Type": {"semi-detached": PropertyType.HOUSE}, + "Walls": {"solid brick": WallType.SOLID_BRICK}, + } + + +def test_classify_columns_yields_empty_mapping_for_an_absent_column() -> None: + # arrange: a classifier is registered for a column the addresses lack. + addresses = AddressList([_make_unstandardised_address({"Walls": "cavity"})]) + property_types = _StubColumnClassifier(result={}) + + # act + result = _orchestrator( + {"Property Type": property_types} + ).classify_columns(addresses) + + # assert: the absent column classified an empty description set. + assert result == {"Property Type": {}} + assert property_types.received == set() From 96aeed4f2ee6550555ae34ddd0d3b6bba3ea6c13 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 22 May 2026 15:36:46 +0000 Subject: [PATCH 21/29] Remove EPC and asset_list changes unrelated to SAL handler This branch's objective is the SAL ingestion handler (applications/SAL/handler.py) and its dependency tree. 
Drop work that crept in but is unreferenced by it: - EPC feature: domain/epc, infrastructure/epc (gov_uk + historical clients), tests/infrastructure/epc - datatypes/epc edits (instantaneous_wwhrs Optional) reverted to main - asset_list/app.py local data-file/column tweak reverted to main Co-Authored-By: Claude Opus 4.7 (1M context) --- asset_list/app.py | 13 +- datatypes/epc/domain/epc_property_data.py | 22 +- datatypes/epc/schema/rdsap_schema_17_0.py | 2 +- datatypes/epc/schema/rdsap_schema_17_1.py | 2 +- datatypes/epc/schema/rdsap_schema_18_0.py | 3 +- datatypes/epc/schema/rdsap_schema_19_0.py | 2 +- datatypes/epc/schema/rdsap_schema_20_0_0.py | 3 +- datatypes/epc/schema/rdsap_schema_21_0_0.py | 4 +- datatypes/epc/schema/rdsap_schema_21_0_1.py | 4 +- domain/epc/__init__.py | 4 - domain/epc/epc_record.py | 21 -- domain/epc/property_type.py | 9 - infrastructure/epc/__init__.py | 13 -- infrastructure/epc/epc_client.py | 41 ---- infrastructure/epc/exceptions.py | 17 -- infrastructure/epc/gov_uk/__init__.py | 6 - infrastructure/epc/gov_uk/_retry.py | 34 --- .../epc/gov_uk/gov_uk_epc_client.py | 132 ----------- .../epc/gov_uk/gov_uk_property_type.py | 25 --- .../__init__.py | 5 - ...orical_open_data_communities_epc_client.py | 24 -- tests/infrastructure/epc/__init__.py | 0 tests/infrastructure/epc/gov_uk/__init__.py | 0 tests/infrastructure/epc/gov_uk/conftest.py | 49 ---- .../epc/gov_uk/test_gov_uk_epc_client.py | 211 ------------------ 25 files changed, 21 insertions(+), 625 deletions(-) delete mode 100644 domain/epc/__init__.py delete mode 100644 domain/epc/epc_record.py delete mode 100644 domain/epc/property_type.py delete mode 100644 infrastructure/epc/__init__.py delete mode 100644 infrastructure/epc/epc_client.py delete mode 100644 infrastructure/epc/exceptions.py delete mode 100644 infrastructure/epc/gov_uk/__init__.py delete mode 100644 infrastructure/epc/gov_uk/_retry.py delete mode 100644 infrastructure/epc/gov_uk/gov_uk_epc_client.py delete mode 100644 
infrastructure/epc/gov_uk/gov_uk_property_type.py delete mode 100644 infrastructure/epc/historical_open_data_communities/__init__.py delete mode 100644 infrastructure/epc/historical_open_data_communities/historical_open_data_communities_epc_client.py delete mode 100644 tests/infrastructure/epc/__init__.py delete mode 100644 tests/infrastructure/epc/gov_uk/__init__.py delete mode 100644 tests/infrastructure/epc/gov_uk/conftest.py delete mode 100644 tests/infrastructure/epc/gov_uk/test_gov_uk_epc_client.py diff --git a/asset_list/app.py b/asset_list/app.py index aef410e6..424f4df6 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -79,17 +79,17 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - data_filename = "asset_list (8).xlsx" - sheet_name = "Standardised Asset List" - postcode_column = "postcode" - address1_column = "domna_address_1" + data_filename = "hyde.xlsx" + sheet_name = "AddressProfilingResults" + postcode_column = "Postcode" + address1_column = "Address" address1_method = None - fulladdress_column = "domna_address_1" + fulladdress_column = "Postcode" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = "landlord_property_id" # Good to include if landlord gave + landlord_property_type = "Property Type" # Good to include if landlord gave landlord_built_form = None # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None @@ -468,3 +468,4 @@ def app(): asset_list.duplicated_addresses.to_excel( writer, sheet_name="Duplicate Properties", index=False ) + diff --git a/datatypes/epc/domain/epc_property_data.py b/datatypes/epc/domain/epc_property_data.py index 68a25205..8795b389 100644 --- a/datatypes/epc/domain/epc_property_data.py +++ b/datatypes/epc/domain/epc_property_data.py @@ -29,9 +29,7 @@ class MainHeatingDetail: boiler_flue_type: Optional[int] = None # TODO: make enum? 
boiler_ignition_type: Optional[int] = None # TODO: make enum? central_heating_pump_age: Optional[int] = None - central_heating_pump_age_str: Optional[str] = ( - None # str from site notes e.g. "Unknown", "Pre 2013" - ) + central_heating_pump_age_str: Optional[str] = None # str from site notes e.g. "Unknown", "Pre 2013" main_heating_index_number: Optional[int] = None sap_main_heating_code: Optional[int] = None # TODO: make enum? main_heating_number: Optional[int] = None @@ -56,7 +54,7 @@ class ShowerOutlets: @dataclass class SapHeating: - instantaneous_wwhrs: Optional[InstantaneousWwhrs] + instantaneous_wwhrs: InstantaneousWwhrs main_heating_details: List[MainHeatingDetail] has_fixed_air_conditioning: bool cylinder_size: Optional[Union[int, str]] = ( @@ -69,9 +67,7 @@ class SapHeating: cylinder_insulation_type: Optional[Union[int, str]] = None cylinder_thermostat: Optional[str] = None secondary_fuel_type: Optional[int] = None - secondary_heating_type: Optional[Union[int, str]] = ( - None # int from API; str from site notes - ) + secondary_heating_type: Optional[Union[int, str]] = None # int from API; str from site notes cylinder_insulation_thickness_mm: Optional[int] = None @@ -79,9 +75,7 @@ class SapHeating: class SapVentilation: ventilation_type: Optional[str] = None draught_lobby: Optional[bool] = None - pressure_test: Optional[str] = ( - None # str from site notes e.g. "No test"; int in API via mechanical_ventilation - ) + pressure_test: Optional[str] = None # str from site notes e.g. "No test"; int in API via mechanical_ventilation open_flues_count: Optional[int] = None closed_flues_count: Optional[int] = None boiler_flues_count: Optional[int] = None @@ -225,12 +219,8 @@ class SapBuildingPart: None # TODO: make enum/mapping? ) floor_type: Optional[str] = None # str from site notes e.g. 
"Ground Floor" - floor_construction_type: Optional[str] = ( - None # str from site notes; distinct from floor_construction: int in SapFloorDimension - ) - floor_insulation_type_str: Optional[str] = ( - None # str from site notes e.g. "As Built" - ) + floor_construction_type: Optional[str] = None # str from site notes; distinct from floor_construction: int in SapFloorDimension + floor_insulation_type_str: Optional[str] = None # str from site notes e.g. "As Built" floor_u_value_known: Optional[bool] = None roof_construction: Optional[int] = None diff --git a/datatypes/epc/schema/rdsap_schema_17_0.py b/datatypes/epc/schema/rdsap_schema_17_0.py index 9cbedf97..22aaded4 100644 --- a/datatypes/epc/schema/rdsap_schema_17_0.py +++ b/datatypes/epc/schema/rdsap_schema_17_0.py @@ -37,7 +37,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: Optional[InstantaneousWwhrs] + instantaneous_wwhrs: InstantaneousWwhrs main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] cylinder_insulation_type: int diff --git a/datatypes/epc/schema/rdsap_schema_17_1.py b/datatypes/epc/schema/rdsap_schema_17_1.py index b0af07e6..a4c007ed 100644 --- a/datatypes/epc/schema/rdsap_schema_17_1.py +++ b/datatypes/epc/schema/rdsap_schema_17_1.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: Optional[InstantaneousWwhrs] + instantaneous_wwhrs: InstantaneousWwhrs main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] cylinder_insulation_type: int diff --git a/datatypes/epc/schema/rdsap_schema_18_0.py b/datatypes/epc/schema/rdsap_schema_18_0.py index 4ce2f887..a038dc9b 100644 --- a/datatypes/epc/schema/rdsap_schema_18_0.py +++ b/datatypes/epc/schema/rdsap_schema_18_0.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: 
Optional[InstantaneousWwhrs] + instantaneous_wwhrs: InstantaneousWwhrs main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str @@ -86,7 +86,6 @@ class SapFloorDimension: @dataclass class SapRoomInRoof: """Room-in-roof details. floor_area is a Measurement object in schema 18.0.""" - floor_area: Measurement insulation: str roof_room_connected: str diff --git a/datatypes/epc/schema/rdsap_schema_19_0.py b/datatypes/epc/schema/rdsap_schema_19_0.py index b3c77ec4..b94d9bb3 100644 --- a/datatypes/epc/schema/rdsap_schema_19_0.py +++ b/datatypes/epc/schema/rdsap_schema_19_0.py @@ -41,7 +41,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: Optional[InstantaneousWwhrs] + instantaneous_wwhrs: InstantaneousWwhrs main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str diff --git a/datatypes/epc/schema/rdsap_schema_20_0_0.py b/datatypes/epc/schema/rdsap_schema_20_0_0.py index 9deb235e..8f3986a2 100644 --- a/datatypes/epc/schema/rdsap_schema_20_0_0.py +++ b/datatypes/epc/schema/rdsap_schema_20_0_0.py @@ -49,7 +49,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: Optional[InstantaneousWwhrs] + instantaneous_wwhrs: InstantaneousWwhrs main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str @@ -103,7 +103,6 @@ class SapFloorDimension: @dataclass class SapRoomInRoof: """Room-in-roof details. 
floor_area is a plain number in schema 20.0.0 (not a Measurement object).""" - floor_area: Union[int, float] insulation: str roof_room_connected: str diff --git a/datatypes/epc/schema/rdsap_schema_21_0_0.py b/datatypes/epc/schema/rdsap_schema_21_0_0.py index 8d19e5f9..eee00cb8 100644 --- a/datatypes/epc/schema/rdsap_schema_21_0_0.py +++ b/datatypes/epc/schema/rdsap_schema_21_0_0.py @@ -33,7 +33,6 @@ class ShowerOutlets: @dataclass class InstantaneousWwhrs: """Changed in 21.0.0: references WWHRS product index numbers instead of room counts.""" - wwhrs_index_number1: Optional[int] = None wwhrs_index_number2: Optional[int] = None @@ -62,7 +61,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: Optional[InstantaneousWwhrs] + instantaneous_wwhrs: InstantaneousWwhrs main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str @@ -155,7 +154,6 @@ class SapFloorDimension: @dataclass class SapRoomInRoof: """Room-in-roof details. 
insulation and roof_room_connected removed in schema 21.0.0.""" - floor_area: Union[int, float] construction_age_band: str diff --git a/datatypes/epc/schema/rdsap_schema_21_0_1.py b/datatypes/epc/schema/rdsap_schema_21_0_1.py index f6be7cc3..9b3dbd1d 100644 --- a/datatypes/epc/schema/rdsap_schema_21_0_1.py +++ b/datatypes/epc/schema/rdsap_schema_21_0_1.py @@ -50,7 +50,7 @@ class MainHeatingDetail: main_heating_fraction: int main_heating_data_source: int boiler_flue_type: Optional[int] = None - fan_flue_present: Optional[str] = None # TODO: make bool + fan_flue_present: Optional[str] = None # TODO: make bool boiler_ignition_type: Optional[int] = None central_heating_pump_age: Optional[int] = None main_heating_index_number: Optional[int] = None @@ -62,7 +62,7 @@ class SapHeating: cylinder_size: int water_heating_code: int water_heating_fuel: int - instantaneous_wwhrs: Optional[InstantaneousWwhrs] + instantaneous_wwhrs: InstantaneousWwhrs main_heating_details: List[MainHeatingDetail] immersion_heating_type: Union[int, str] has_fixed_air_conditioning: str diff --git a/domain/epc/__init__.py b/domain/epc/__init__.py deleted file mode 100644 index e49fea42..00000000 --- a/domain/epc/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from domain.epc.epc_record import EpcRecord -from domain.epc.property_type import PropertyType - -__all__ = ["EpcRecord", "PropertyType"] diff --git a/domain/epc/epc_record.py b/domain/epc/epc_record.py deleted file mode 100644 index 7194d1d6..00000000 --- a/domain/epc/epc_record.py +++ /dev/null @@ -1,21 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import Optional - -from domain.epc.property_type import PropertyType - - -@dataclass(frozen=True) -class EpcRecord: - """A streamlined record of EPC property data. - - A focused subset of the full ``EpcPropertyData``: a property's identity - plus its typed property type. Grow this with further fields as the - domain needs them. 
- """ - - address_line_1: str - postcode: str - uprn: Optional[int] - property_type: PropertyType diff --git a/domain/epc/property_type.py b/domain/epc/property_type.py deleted file mode 100644 index 707988aa..00000000 --- a/domain/epc/property_type.py +++ /dev/null @@ -1,9 +0,0 @@ -from enum import Enum - - -class PropertyType(Enum): - HOUSE = "House" - BUNGALOW = "Bungalow" - FLAT = "Flat" - MAISONETTE = "Maisonette" - PARK_HOME = "Park home" diff --git a/infrastructure/epc/__init__.py b/infrastructure/epc/__init__.py deleted file mode 100644 index f99a7cb3..00000000 --- a/infrastructure/epc/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from infrastructure.epc.epc_client import EpcClient -from infrastructure.epc.exceptions import ( - EpcApiError, - EpcNotFoundError, - EpcRateLimitError, -) - -__all__ = [ - "EpcApiError", - "EpcClient", - "EpcNotFoundError", - "EpcRateLimitError", -] diff --git a/infrastructure/epc/epc_client.py b/infrastructure/epc/epc_client.py deleted file mode 100644 index d1f8639c..00000000 --- a/infrastructure/epc/epc_client.py +++ /dev/null @@ -1,41 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import Optional - -from datatypes.epc.domain.epc_property_data import EpcPropertyData -from datatypes.epc.search import EpcSearchResult - - -class EpcClient(ABC): - """Interface for retrieving EPC (Energy Performance Certificate) data. - - Implementations fetch from a data source and return domain objects; - callers depend only on this interface, not on a concrete transport. - """ - - @abstractmethod - def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: - """Return the EPC certificates registered at ``postcode``. - - Returns an empty list when the postcode has no certificates. - """ - ... - - @abstractmethod - def get_by_certificate_number( - self, certificate_number: str - ) -> EpcPropertyData: - """Return the full EPC record for a certificate number. 
- - Raises EpcNotFoundError when no such certificate exists. - """ - ... - - @abstractmethod - def get_by_uprn(self, uprn: int) -> Optional[EpcPropertyData]: - """Return the most recent EPC record for ``uprn``. - - Returns None when the UPRN has no certificates. - """ - ... diff --git a/infrastructure/epc/exceptions.py b/infrastructure/epc/exceptions.py deleted file mode 100644 index 8e2e5165..00000000 --- a/infrastructure/epc/exceptions.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Optional - - -class EpcApiError(Exception): - """Base for all EPC client errors.""" - - -class EpcNotFoundError(EpcApiError): - """Raised when the API returns 404 for a resource that must exist.""" - - -class EpcRateLimitError(EpcApiError): - """Raised when the API returns 429 and all retries are exhausted.""" - - def __init__(self, message: str, retry_after: Optional[float] = None) -> None: - super().__init__(message) - self.retry_after = retry_after diff --git a/infrastructure/epc/gov_uk/__init__.py b/infrastructure/epc/gov_uk/__init__.py deleted file mode 100644 index d491a1ef..00000000 --- a/infrastructure/epc/gov_uk/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from infrastructure.epc.gov_uk.gov_uk_epc_client import GovUkEpcClient -from infrastructure.epc.gov_uk.gov_uk_property_type import ( - property_type_from_gov_uk_code, -) - -__all__ = ["GovUkEpcClient", "property_type_from_gov_uk_code"] diff --git a/infrastructure/epc/gov_uk/_retry.py b/infrastructure/epc/gov_uk/_retry.py deleted file mode 100644 index db92b131..00000000 --- a/infrastructure/epc/gov_uk/_retry.py +++ /dev/null @@ -1,34 +0,0 @@ -import time -from typing import Callable, Optional, TypeVar - -from infrastructure.epc.exceptions import EpcRateLimitError - -T = TypeVar("T") - - -def call_with_retry( - fn: Callable[[], T], - max_retries: int = 5, - backoff_base: float = 1.0, - backoff_multiplier: float = 2.0, - max_backoff: float = 60.0, -) -> T: - """Call ``fn``, retrying on EpcRateLimitError with exponential 
backoff. - - Honours the API's ``Retry-After`` header when present, otherwise backs off - ``backoff_base * backoff_multiplier ** attempt`` (capped at ``max_backoff``). - """ - last_exc: Optional[EpcRateLimitError] = None - for attempt in range(max_retries + 1): - try: - return fn() - except EpcRateLimitError as exc: - last_exc = exc - if attempt < max_retries: - if exc.retry_after is not None: - delay = exc.retry_after - else: - delay = backoff_base * (backoff_multiplier**attempt) - time.sleep(min(delay, max_backoff)) - assert last_exc is not None - raise last_exc diff --git a/infrastructure/epc/gov_uk/gov_uk_epc_client.py b/infrastructure/epc/gov_uk/gov_uk_epc_client.py deleted file mode 100644 index ac0db09f..00000000 --- a/infrastructure/epc/gov_uk/gov_uk_epc_client.py +++ /dev/null @@ -1,132 +0,0 @@ -# Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml -from __future__ import annotations - -from typing import Any, Optional - -import httpx - -from datatypes.epc.domain.epc_property_data import EpcPropertyData -from datatypes.epc.domain.mapper import EpcPropertyDataMapper -from datatypes.epc.search import EpcSearchResult -from infrastructure.epc.epc_client import EpcClient -from infrastructure.epc.exceptions import ( - EpcApiError, - EpcNotFoundError, - EpcRateLimitError, -) -from infrastructure.epc.gov_uk._retry import call_with_retry - - -class GovUkEpcClient(EpcClient): - """EpcClient backed by the live gov.uk EPC API. 
- - Endpoint: https://api.get-energy-performance-data.communities.gov.uk - """ - - BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" - REQUEST_TIMEOUT = 10.0 - - def __init__(self, auth_token: str) -> None: - self._headers = { - "Authorization": f"Bearer {auth_token}", - "Accept": "application/json", - } - - def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: - normalised = self._normalise_postcode(postcode) - return call_with_retry(lambda: self._search(postcode=normalised)) - - def get_by_certificate_number( - self, certificate_number: str - ) -> EpcPropertyData: - raw = call_with_retry(lambda: self._fetch_certificate(certificate_number)) - return EpcPropertyDataMapper.from_api_response(raw) - - def get_by_uprn(self, uprn: int) -> Optional[EpcPropertyData]: - results = call_with_retry(lambda: self._search(uprn=uprn)) - if not results: - return None - latest = max(results, key=lambda r: r.registration_date) - return self.get_by_certificate_number(latest.certificate_number) - - # ------------------------------------------------------------------ - # Private helpers - # ------------------------------------------------------------------ - - @staticmethod - def _normalise_postcode(postcode: str) -> str: - """Return the postcode with all spaces removed and uppercased.""" - return postcode.replace(" ", "").upper() - - @staticmethod - def _parse_retry_after(resp: httpx.Response) -> Optional[float]: - header = resp.headers.get("Retry-After") - if header is None: - return None - try: - return float(header) - except (TypeError, ValueError): - return None - - def _fetch_certificate(self, certificate_number: str) -> dict[str, Any]: - resp = httpx.get( - f"{self.BASE_URL}/api/certificate", - params={"certificate_number": certificate_number}, - headers=self._headers, - timeout=self.REQUEST_TIMEOUT, - ) - if resp.status_code == 404: - raise EpcNotFoundError(certificate_number) - if resp.status_code == 429: - raise EpcRateLimitError( - "Rate 
limited by EPC API", - retry_after=self._parse_retry_after(resp), - ) - if not resp.is_success: - raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") - return resp.json()["data"] - - def _search( - self, - postcode: Optional[str] = None, - uprn: Optional[int] = None, - ) -> list[EpcSearchResult]: - params: dict[str, str | int] = {} - if postcode: - params["postcode"] = postcode - if uprn is not None: - params["uprn"] = uprn - - resp = httpx.get( - f"{self.BASE_URL}/api/domestic/search", - params=params, - headers=self._headers, - timeout=self.REQUEST_TIMEOUT, - ) - if resp.status_code == 404: - return [] - if resp.status_code == 429: - raise EpcRateLimitError( - "Rate limited by EPC API", - retry_after=self._parse_retry_after(resp), - ) - if not resp.is_success: - raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") - - rows = resp.json().get("data", []) - return [self._parse_search_result(row) for row in rows] - - @staticmethod - def _parse_search_result(row: dict[str, Any]) -> EpcSearchResult: - return EpcSearchResult( - certificate_number=row["certificateNumber"], - address_line_1=row["addressLine1"], - address_line_2=row.get("addressLine2"), - address_line_3=row.get("addressLine3"), - address_line_4=row.get("addressLine4"), - postcode=row["postcode"], - post_town=row["postTown"], - uprn=row.get("uprn"), - current_energy_efficiency_band=row["currentEnergyEfficiencyBand"], - registration_date=row["registrationDate"], - ) diff --git a/infrastructure/epc/gov_uk/gov_uk_property_type.py b/infrastructure/epc/gov_uk/gov_uk_property_type.py deleted file mode 100644 index a0f4a7a3..00000000 --- a/infrastructure/epc/gov_uk/gov_uk_property_type.py +++ /dev/null @@ -1,25 +0,0 @@ -from domain.epc.property_type import PropertyType - -# GOV.UK EPC API ``property_type`` integer codes mapped to the domain type. 
-# This translation is GOV.UK-specific and lives in the infrastructure layer so -# the domain ``PropertyType`` stays free of any source encoding. -_PROPERTY_TYPE_BY_GOV_UK_CODE: dict[int, PropertyType] = { - 0: PropertyType.HOUSE, - 1: PropertyType.BUNGALOW, - 2: PropertyType.FLAT, - 3: PropertyType.MAISONETTE, - 4: PropertyType.PARK_HOME, -} - - -def property_type_from_gov_uk_code(code: int) -> PropertyType: - """Translate a GOV.UK EPC ``property_type`` code to the domain PropertyType. - - Raises ValueError for a code GOV.UK has not been mapped here yet. - """ - try: - return _PROPERTY_TYPE_BY_GOV_UK_CODE[code] - except KeyError: - raise ValueError( - f"Unknown GOV.UK EPC property type code: {code}" - ) from None diff --git a/infrastructure/epc/historical_open_data_communities/__init__.py b/infrastructure/epc/historical_open_data_communities/__init__.py deleted file mode 100644 index 88a69081..00000000 --- a/infrastructure/epc/historical_open_data_communities/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from infrastructure.epc.historical_open_data_communities.historical_open_data_communities_epc_client import ( - HistoricalOpenDataCommunitiesEpcClient, -) - -__all__ = ["HistoricalOpenDataCommunitiesEpcClient"] diff --git a/infrastructure/epc/historical_open_data_communities/historical_open_data_communities_epc_client.py b/infrastructure/epc/historical_open_data_communities/historical_open_data_communities_epc_client.py deleted file mode 100644 index d8c7f9ac..00000000 --- a/infrastructure/epc/historical_open_data_communities/historical_open_data_communities_epc_client.py +++ /dev/null @@ -1,24 +0,0 @@ -from __future__ import annotations - -from typing import Optional - -from domain.epc.epc_record import EpcRecord - - -class HistoricalOpenDataCommunitiesEpcClient: - """EPC client backed by Open Data Communities' historical EPC data. - - Stub — not yet implemented. Every method raises NotImplementedError for - now. 
Unlike GovUkEpcClient it returns the domain ``EpcRecord`` directly; - once the ``EpcClient`` port is migrated to return ``EpcRecord``, this - adapter should implement it. - """ - - def search_by_postcode(self, postcode: str) -> list[EpcRecord]: - raise NotImplementedError - - def get_by_certificate_number(self, certificate_number: str) -> EpcRecord: - raise NotImplementedError - - def get_by_uprn(self, uprn: int) -> Optional[EpcRecord]: - raise NotImplementedError diff --git a/tests/infrastructure/epc/__init__.py b/tests/infrastructure/epc/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/infrastructure/epc/gov_uk/__init__.py b/tests/infrastructure/epc/gov_uk/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/infrastructure/epc/gov_uk/conftest.py b/tests/infrastructure/epc/gov_uk/conftest.py deleted file mode 100644 index 8fbd3094..00000000 --- a/tests/infrastructure/epc/gov_uk/conftest.py +++ /dev/null @@ -1,49 +0,0 @@ -import json -import pathlib - -import pytest - -from infrastructure.epc.gov_uk.gov_uk_epc_client import GovUkEpcClient - -SAMPLES_DIR = pathlib.Path("backend/epc_api/json_samples") - - -@pytest.fixture -def rdsap_21_0_0_cert(): - return json.loads((SAMPLES_DIR / "RdSAP-Schema-21.0.0/epc.json").read_text()) - - -@pytest.fixture -def rdsap_21_0_1_cert(): - return json.loads((SAMPLES_DIR / "RdSAP-Schema-21.0.1/epc.json").read_text()) - - -@pytest.fixture -def epc_client(): - return GovUkEpcClient(auth_token="test-token") - - -def make_search_row( - cert_num="CERT-001", - address_line_1="1 Test Street", - postcode="SW1A 1AA", - post_town="London", - uprn=100023336956, - band="D", - registration_date="2024-01-01", - address_line_2=None, - address_line_3=None, - address_line_4=None, -): - return { - "certificateNumber": cert_num, - "addressLine1": address_line_1, - "addressLine2": address_line_2, - "addressLine3": address_line_3, - "addressLine4": address_line_4, - "postcode": postcode, - 
"postTown": post_town, - "uprn": uprn, - "currentEnergyEfficiencyBand": band, - "registrationDate": registration_date, - } diff --git a/tests/infrastructure/epc/gov_uk/test_gov_uk_epc_client.py b/tests/infrastructure/epc/gov_uk/test_gov_uk_epc_client.py deleted file mode 100644 index 46164a0e..00000000 --- a/tests/infrastructure/epc/gov_uk/test_gov_uk_epc_client.py +++ /dev/null @@ -1,211 +0,0 @@ -from unittest.mock import MagicMock, call, patch - -import pytest - -from datatypes.epc.domain.epc_property_data import EpcPropertyData -from datatypes.epc.search import EpcSearchResult -from infrastructure.epc.exceptions import EpcNotFoundError -from tests.infrastructure.epc.gov_uk.conftest import make_search_row - -_SLEEP = "infrastructure.epc.gov_uk._retry.time.sleep" - - -def _mock_response(status_code=200, json_data=None, headers=None): - resp = MagicMock() - resp.status_code = status_code - resp.is_success = 200 <= status_code < 300 - resp.json.return_value = json_data or {} - resp.text = str(json_data) - resp.headers = headers or {} - return resp - - -# --------------------------------------------------------------------------- -# Test 1: get_by_certificate_number happy path -# --------------------------------------------------------------------------- - - -def test_get_by_certificate_number_returns_epc_property_data( - epc_client, rdsap_21_0_1_cert -): - cert_response = {"data": rdsap_21_0_1_cert} - with patch("httpx.get", return_value=_mock_response(200, cert_response)): - result = epc_client.get_by_certificate_number("CERT-001") - - assert isinstance(result, EpcPropertyData) - - -# --------------------------------------------------------------------------- -# Test 2: get_by_certificate_number 404 -> EpcNotFoundError -# --------------------------------------------------------------------------- - - -def test_get_by_certificate_number_404_raises_not_found(epc_client): - with patch("httpx.get", return_value=_mock_response(404)): - with 
pytest.raises(EpcNotFoundError): - epc_client.get_by_certificate_number("BAD-CERT") - - -# --------------------------------------------------------------------------- -# Test 3: 429 retried, succeeds on 3rd attempt -# --------------------------------------------------------------------------- - - -def test_get_by_certificate_number_retries_on_429_and_succeeds( - epc_client, rdsap_21_0_1_cert -): - cert_response = {"data": rdsap_21_0_1_cert} - responses = [ - _mock_response(429), - _mock_response(429), - _mock_response(200, cert_response), - ] - with patch("httpx.get", side_effect=responses), patch(_SLEEP): - result = epc_client.get_by_certificate_number("CERT-001") - - assert isinstance(result, EpcPropertyData) - - -# --------------------------------------------------------------------------- -# Test 3b: 429 with Retry-After header -> sleeps for that value -# --------------------------------------------------------------------------- - - -def test_429_retry_after_header_drives_sleep_duration( - epc_client, rdsap_21_0_1_cert -): - cert_response = {"data": rdsap_21_0_1_cert} - responses = [ - _mock_response(429, headers={"Retry-After": "7"}), - _mock_response(200, cert_response), - ] - with patch("httpx.get", side_effect=responses), patch(_SLEEP) as mock_sleep: - epc_client.get_by_certificate_number("CERT-001") - - mock_sleep.assert_called_once_with(7.0) - - -# --------------------------------------------------------------------------- -# Test 3c: 429 without Retry-After -> falls back to exponential backoff -# --------------------------------------------------------------------------- - - -def test_429_without_retry_after_uses_exponential_backoff( - epc_client, rdsap_21_0_1_cert -): - cert_response = {"data": rdsap_21_0_1_cert} - responses = [ - _mock_response(429), - _mock_response(429), - _mock_response(200, cert_response), - ] - with patch("httpx.get", side_effect=responses), patch(_SLEEP) as mock_sleep: - epc_client.get_by_certificate_number("CERT-001") - - 
assert mock_sleep.call_args_list == [call(1.0), call(2.0)] - - -# --------------------------------------------------------------------------- -# Test 3d: malformed Retry-After header -> falls back to exponential backoff -# --------------------------------------------------------------------------- - - -def test_429_malformed_retry_after_falls_back_to_backoff( - epc_client, rdsap_21_0_1_cert -): - cert_response = {"data": rdsap_21_0_1_cert} - responses = [ - _mock_response(429, headers={"Retry-After": "Wed, 21 Oct 2026 07:28:00 GMT"}), - _mock_response(200, cert_response), - ] - with patch("httpx.get", side_effect=responses), patch(_SLEEP) as mock_sleep: - epc_client.get_by_certificate_number("CERT-001") - - mock_sleep.assert_called_once_with(1.0) - - -# --------------------------------------------------------------------------- -# Test 3e: Retry-After capped by max_backoff to avoid hostile/buggy values -# --------------------------------------------------------------------------- - - -def test_429_retry_after_capped_by_max_backoff(epc_client, rdsap_21_0_1_cert): - cert_response = {"data": rdsap_21_0_1_cert} - responses = [ - _mock_response(429, headers={"Retry-After": "9999"}), - _mock_response(200, cert_response), - ] - with patch("httpx.get", side_effect=responses), patch(_SLEEP) as mock_sleep: - epc_client.get_by_certificate_number("CERT-001") - - mock_sleep.assert_called_once_with(60.0) - - -# --------------------------------------------------------------------------- -# Test 4: get_by_uprn empty search -> None -# --------------------------------------------------------------------------- - - -def test_get_by_uprn_returns_none_when_no_results(epc_client): - with patch("httpx.get", return_value=_mock_response(200, {"data": []})): - result = epc_client.get_by_uprn(100023336956) - - assert result is None - - -# --------------------------------------------------------------------------- -# Test 5: get_by_uprn multiple results -> fetches latest by registration_date 
-# --------------------------------------------------------------------------- - - -def test_get_by_uprn_picks_most_recent_certificate(epc_client, rdsap_21_0_1_cert): - search_rows = [ - make_search_row(cert_num="CERT-OLD", registration_date="2022-01-01"), - make_search_row(cert_num="CERT-NEW", registration_date="2024-06-01"), - make_search_row(cert_num="CERT-MID", registration_date="2023-03-15"), - ] - cert_response = {"data": rdsap_21_0_1_cert} - - def fake_get(url, params=None, **kwargs): - if "search" in url: - return _mock_response(200, {"data": search_rows}) - return _mock_response(200, cert_response) - - with patch("httpx.get", side_effect=fake_get) as mock_get: - result = epc_client.get_by_uprn(100023336956) - - assert isinstance(result, EpcPropertyData) - # Second call must be for the most recent cert - cert_call = mock_get.call_args_list[1] - assert cert_call.kwargs["params"]["certificate_number"] == "CERT-NEW" - - -# --------------------------------------------------------------------------- -# Test 6: search_by_postcode returns list[EpcSearchResult] -# --------------------------------------------------------------------------- - - -def test_search_by_postcode_returns_results(epc_client): - rows = [ - make_search_row(cert_num="CERT-A", address_line_1="1 High Street"), - make_search_row(cert_num="CERT-B", address_line_1="2 High Street"), - ] - with patch("httpx.get", return_value=_mock_response(200, {"data": rows})): - results = epc_client.search_by_postcode("SW1A 1AA") - - assert len(results) == 2 - assert all(isinstance(r, EpcSearchResult) for r in results) - assert results[0].certificate_number == "CERT-A" - assert results[1].address_line_1 == "2 High Street" - - -# --------------------------------------------------------------------------- -# Test 7: search_by_postcode 404 -> empty list -# --------------------------------------------------------------------------- - - -def test_search_by_postcode_404_returns_empty_list(epc_client): - with 
patch("httpx.get", return_value=_mock_response(404)): - results = epc_client.search_by_postcode("ZZ9 9ZZ") - - assert results == [] From 8422041215ae713923ae81a19b8af40d7632337e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 26 May 2026 15:27:45 +0000 Subject: [PATCH 22/29] landlord overrid orchestration --- UBIQUITOUS_LANGUAGE.md | 2 +- applications/SAL/handler.py | 69 ------- .../Dockerfile | 0 .../landlord_description_overrides/handler.py | 128 +++++++++++++ ...lord_description_overrides_trigger_body.py | 15 ++ .../local_handler/.env.local.example | 0 .../local_handler/docker-compose.yml | 0 .../local_handler/invoke_local_lambda.py | 0 .../local_handler/run_local.sh | 0 .../requirements.txt | 0 ...thon-writes-landlord-overrides-directly.md | 77 ++++++++ .../__init__.py | 0 .../built_form_type.py | 20 ++ .../column_classifier.py | 2 +- .../property_type.py | 2 +- .../roof_type.py | 70 +++++++ .../wall_type.py | 70 +++++++ domain/sal/wall_type.py | 15 -- .../chatgpt/chatgpt_column_classifier.py | 5 +- infrastructure/postgres/engine.py | 24 +++ ..._form_type_override_postgres_repository.py | 82 ++++++++ ...landlord_built_form_type_override_table.py | 69 +++++++ .../postgres/landlord_override_enums.py | 35 ++++ ...perty_type_override_postgres_repository.py | 82 ++++++++ .../landlord_property_type_override_table.py | 67 +++++++ ..._wall_type_override_postgres_repository.py | 80 ++++++++ .../landlord_wall_type_override_table.py | 69 +++++++ orchestration/classifiable_column.py | 37 ++++ ...lord_description_overrides_orchestrator.py | 83 +++++++++ orchestration/sal_orchestrator.py | 56 ------ playground.py | 57 ++++++ repositories/landlord_overrides/__init__.py | 0 .../landlord_override_repository.py | 38 ++++ .../chatgpt/test_chatgpt_column_classifier.py | 6 +- ...lord_description_overrides_orchestrator.py | 175 ++++++++++++++++-- .../landlord_overrides/__init__.py | 0 .../landlord_overrides/postgres/__init__.py | 0 ...perty_type_override_postgres_repository.py 
| 147 +++++++++++++++ ..._wall_type_override_postgres_repository.py | 158 ++++++++++++++++ 39 files changed, 1576 insertions(+), 164 deletions(-) delete mode 100644 applications/SAL/handler.py rename applications/{SAL => landlord_description_overrides}/Dockerfile (100%) create mode 100644 applications/landlord_description_overrides/handler.py create mode 100644 applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py rename applications/{SAL => landlord_description_overrides}/local_handler/.env.local.example (100%) rename applications/{SAL => landlord_description_overrides}/local_handler/docker-compose.yml (100%) rename applications/{SAL => landlord_description_overrides}/local_handler/invoke_local_lambda.py (100%) rename applications/{SAL => landlord_description_overrides}/local_handler/run_local.sh (100%) rename applications/{SAL => landlord_description_overrides}/requirements.txt (100%) create mode 100644 docs/adr/0003-python-writes-landlord-overrides-directly.md rename domain/{sal => landlord_description_overrides}/__init__.py (100%) create mode 100644 domain/landlord_description_overrides/built_form_type.py rename domain/{sal => landlord_description_overrides}/column_classifier.py (94%) rename domain/{sal => landlord_description_overrides}/property_type.py (78%) create mode 100644 domain/landlord_description_overrides/roof_type.py create mode 100644 domain/landlord_description_overrides/wall_type.py delete mode 100644 domain/sal/wall_type.py create mode 100644 infrastructure/postgres/landlord_built_form_type_override_postgres_repository.py create mode 100644 infrastructure/postgres/landlord_built_form_type_override_table.py create mode 100644 infrastructure/postgres/landlord_override_enums.py create mode 100644 infrastructure/postgres/landlord_property_type_override_postgres_repository.py create mode 100644 infrastructure/postgres/landlord_property_type_override_table.py create mode 100644 
infrastructure/postgres/landlord_wall_type_override_postgres_repository.py create mode 100644 infrastructure/postgres/landlord_wall_type_override_table.py create mode 100644 orchestration/classifiable_column.py create mode 100644 orchestration/landlord_description_overrides_orchestrator.py delete mode 100644 orchestration/sal_orchestrator.py create mode 100644 playground.py create mode 100644 repositories/landlord_overrides/__init__.py create mode 100644 repositories/landlord_overrides/landlord_override_repository.py create mode 100644 tests/repositories/landlord_overrides/__init__.py create mode 100644 tests/repositories/landlord_overrides/postgres/__init__.py create mode 100644 tests/repositories/landlord_overrides/postgres/test_landlord_property_type_override_postgres_repository.py create mode 100644 tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md index 34dc3115..6426e1c1 100644 --- a/UBIQUITOUS_LANGUAGE.md +++ b/UBIQUITOUS_LANGUAGE.md @@ -25,7 +25,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve | **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" | | **Unstandardised Address** | A frozen dataclass (`domain.addresses.unstandardised_address.UnstandardisedAddress`) capturing a single address exactly as a customer supplied it, before any standardisation: a free-text `address` line (intentionally NOT normalised), a canonical `postcode` (a `Postcode` value object, sanitised on construction), an optional `org_reference` (the customer's own identifier for the property), and `additional_info` (the full source row — every column of the customer's upload, preserved verbatim). 
| "user address", "asset list", "raw address", "landlord address", "Hyde address" | | **Address List** | A nominal `NewType` over `list[UnstandardisedAddress]` (`domain.addresses.unstandardised_address.AddressList`) — a batch of unstandardised addresses, such as one customer's bulk-onboarding upload or a postcode-grouped sub-batch produced for downstream processing. Being nominal, it is constructed explicitly: `AddressList([...])`. It is the raw *input* to ingestion; the standardised *output* is a **Standardised Asset List**. | "asset list", "Hyde address list", "user addresses" | -| **Standardised Asset List (SAL)** | A customer's property portfolio after ingestion has cleaned and standardised it — each property carrying a canonical field set (UPRN, standardised address, postcode, property type, built form, …). It is the standardised *output* of the pipeline whose raw *input* is an **Address List** of **Unstandardised Addresses**; generated by the `SALOrchestrator`. (Legacy implementation: `asset_list.AssetList` via `load_standardised_asset_list`.) | "address list" (that is the raw input), "asset register", "portfolio list" | +| **Standardised Asset List (SAL)** | A customer's property portfolio after ingestion has cleaned and standardised it — each property carrying a canonical field set (UPRN, standardised address, postcode, property type, built form, …). It is the standardised *output* of the pipeline whose raw *input* is an **Address List** of **Unstandardised Addresses**. (Legacy implementation: `asset_list.AssetList` via `load_standardised_asset_list`.) | "address list" (that is the raw input), "asset register", "portfolio list" | | **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. 
| "property", "unit", "home" | ## Address Matching diff --git a/applications/SAL/handler.py b/applications/SAL/handler.py deleted file mode 100644 index c1d73827..00000000 --- a/applications/SAL/handler.py +++ /dev/null @@ -1,69 +0,0 @@ -import logging -from typing import Any - -import boto3 -from orchestration.sal_orchestrator import ( - SALOrchestrator, -) -from infrastructure.s3.csv_s3_client import CsvS3Client -from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( - UnstandardisedAddressListCsvS3Repository, -) -from domain.addresses.unstandardised_address import AddressList -from domain.sal.column_classifier import ColumnClassifier -from domain.sal.property_type import PropertyType -from domain.sal.wall_type import WallType -from infrastructure.chatgpt.chatgpt import ChatGPT -from infrastructure.chatgpt.chatgpt_column_classifier import ( - ChatGptColumnClassifier, -) - -logger = logging.getLogger(__name__) - - -def handler( - body: dict[str, Any], - context: Any, -) -> dict[str, list[str]]: - - s3_uri = "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2 (1).csv" - bucket = "retrofit-data-dev" - - # boto3.client is overloaded per-service in the installed stubs; cast - # to Any so the strict-mode checker treats it as opaque. - boto3_client: Any = boto3.client # noqa - boto_s3: Any = boto3_client("s3") - - csv_client = CsvS3Client(boto_s3, bucket) - unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository( - csv_client, bucket - ) - - # One ChatGPT-backed classifier per landlord-CSV column, keyed by column name. 
- chat_gpt = ChatGPT() - classifiers: dict[str, ColumnClassifier[Any]] = { - "Property Type": ChatGptColumnClassifier( - chat_gpt, PropertyType, PropertyType.UNKNOWN - ), - "Walls": ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN), - } - - sal = SALOrchestrator( - unstandardised_address_repo=unstandardised_address_repo, - classifiers=classifiers, - ) - - addressList: AddressList = sal.get_unstandardised_addresses(input_s3_uri=s3_uri) - - # Cap the batch to the first 20 while the ChatGPT path is under test. - addressList = AddressList(addressList[:20]) - - classified = sal.classify_columns(addressList) - for column, mapping in classified.items(): - logger.info( - "Classified %d descriptions for column %r.", len(mapping), column - ) - - # TODO: persist `classified` to landlord overrides. - - return {"hello": ["200"]} diff --git a/applications/SAL/Dockerfile b/applications/landlord_description_overrides/Dockerfile similarity index 100% rename from applications/SAL/Dockerfile rename to applications/landlord_description_overrides/Dockerfile diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py new file mode 100644 index 00000000..ff16925e --- /dev/null +++ b/applications/landlord_description_overrides/handler.py @@ -0,0 +1,128 @@ +import logging +import os +from typing import Any +from uuid import UUID + +import boto3 + +from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import ( + LandlordDescriptionOverridesTriggerBody, +) +from domain.addresses.unstandardised_address import AddressList +from domain.landlord_description_overrides.built_form_type import BuiltFormType +from domain.landlord_description_overrides.property_type import PropertyType +from domain.landlord_description_overrides.wall_type import WallType +from infrastructure.chatgpt.chatgpt import ChatGPT +from infrastructure.chatgpt.chatgpt_column_classifier import 
ChatGptColumnClassifier +from infrastructure.postgres.config import PostgresConfig +from infrastructure.postgres.engine import make_engine, transactional_session +from infrastructure.postgres.landlord_built_form_type_override_postgres_repository import ( + LandlordBuiltFormTypeOverridePostgresRepository, +) +from infrastructure.postgres.landlord_property_type_override_postgres_repository import ( + LandlordPropertyTypeOverridePostgresRepository, +) +from infrastructure.postgres.landlord_wall_type_override_postgres_repository import ( + LandlordWallTypeOverridePostgresRepository, +) +from infrastructure.s3.csv_s3_client import CsvS3Client +from orchestration.classifiable_column import ClassifiableColumn +from orchestration.landlord_description_overrides_orchestrator import ( + LandlordDescriptionOverridesOrchestrator, +) +from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( + UnstandardisedAddressListCsvS3Repository, +) + +logger = logging.getLogger(__name__) + + +def handler( + body: dict[str, Any], + context: Any, +) -> dict[str, list[str]]: + # TODO: replace with ``LandlordDescriptionOverridesTriggerBody.model_validate(body)`` + # once this lambda is wired into the parent task pipeline via the SQS + # subtask envelope. Until then the trigger fields are hard-coded so the + # local invoker can exercise the full path. See ADR-0003 §Out of scope. + trigger = LandlordDescriptionOverridesTriggerBody( + task_id=UUID("00000000-0000-0000-0000-000000000001"), + sub_task_id=UUID("00000000-0000-0000-0000-000000000002"), + s3_uri="s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2 (1).csv", + portfolio_id=730, + ) + + bucket = "retrofit-data-dev" + + # boto3.client is overloaded per-service in the installed stubs; cast + # to Any so the strict-mode checker treats it as opaque. 
+ boto3_client: Any = ( + boto3.client + ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + boto_s3: Any = boto3_client("s3") + + csv_client = CsvS3Client(boto_s3, bucket) + unstandardised_address_repo = UnstandardisedAddressListCsvS3Repository( + csv_client, bucket + ) + + # One transactional session per handler invocation: the context manager + # commits on clean exit and rolls back on exception, so the handler never + # invokes ``.commit()`` itself -- transaction semantics live in the + # infrastructure layer. + engine = make_engine(PostgresConfig.from_env(os.environ)) + with transactional_session(engine) as session: + chat_gpt = ChatGPT() + # The "Property Type" CSV column is read by two classifiers: the + # landlord's free-text (e.g. "semi-detached house") encodes both the + # dwelling kind (PropertyType) and how it joins to neighbours + # (BuiltFormType). Each classification lands in its own table. + columns: list[ClassifiableColumn[Any]] = [ + ClassifiableColumn( + name="property_type", + source_column="Property Type", + classifier=ChatGptColumnClassifier( + chat_gpt, PropertyType, PropertyType.UNKNOWN + ), + repo=LandlordPropertyTypeOverridePostgresRepository(session), + ), + ClassifiableColumn( + name="built_form_type", + source_column="Property Type", + classifier=ChatGptColumnClassifier( + chat_gpt, BuiltFormType, BuiltFormType.UNKNOWN + ), + repo=LandlordBuiltFormTypeOverridePostgresRepository(session), + ), + ClassifiableColumn( + name="wall_type", + source_column="Walls", + classifier=ChatGptColumnClassifier( + chat_gpt, WallType, WallType.UNKNOWN + ), + repo=LandlordWallTypeOverridePostgresRepository(session), + ), + ] + + orchestrator = LandlordDescriptionOverridesOrchestrator( + unstandardised_address_repo=unstandardised_address_repo, + columns=columns, + ) + + addressList: AddressList = orchestrator.get_unstandardised_addresses( + input_s3_uri=trigger.s3_uri + ) + + # Cap the batch to the first 20 while the ChatGPT path is 
under test. + # Remove before wiring into the production subtask pipeline. + addressList = AddressList(addressList[:20]) + + classified = orchestrator.classify_and_persist( + addressList, portfolio_id=trigger.portfolio_id + ) + for column, mapping in classified.items(): + logger.info( + "Classified %d descriptions for column %r.", len(mapping), column + ) + + return {"hello": ["200"]} diff --git a/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py b/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py new file mode 100644 index 00000000..9f78215e --- /dev/null +++ b/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py @@ -0,0 +1,15 @@ +from uuid import UUID + +from pydantic import BaseModel, ConfigDict + + +class LandlordDescriptionOverridesTriggerBody(BaseModel): + model_config = ConfigDict(extra="allow") + + task_id: UUID + sub_task_id: UUID + s3_uri: str + # ``portfolio_id`` is ``bigint`` in the ``landlord_*_overrides`` schema -- + # Python ``int`` is unbounded so the Pydantic side stays simple; the + # SQLModel row class pins the storage to ``BigInteger``. 
+ portfolio_id: int diff --git a/applications/SAL/local_handler/.env.local.example b/applications/landlord_description_overrides/local_handler/.env.local.example similarity index 100% rename from applications/SAL/local_handler/.env.local.example rename to applications/landlord_description_overrides/local_handler/.env.local.example diff --git a/applications/SAL/local_handler/docker-compose.yml b/applications/landlord_description_overrides/local_handler/docker-compose.yml similarity index 100% rename from applications/SAL/local_handler/docker-compose.yml rename to applications/landlord_description_overrides/local_handler/docker-compose.yml diff --git a/applications/SAL/local_handler/invoke_local_lambda.py b/applications/landlord_description_overrides/local_handler/invoke_local_lambda.py similarity index 100% rename from applications/SAL/local_handler/invoke_local_lambda.py rename to applications/landlord_description_overrides/local_handler/invoke_local_lambda.py diff --git a/applications/SAL/local_handler/run_local.sh b/applications/landlord_description_overrides/local_handler/run_local.sh similarity index 100% rename from applications/SAL/local_handler/run_local.sh rename to applications/landlord_description_overrides/local_handler/run_local.sh diff --git a/applications/SAL/requirements.txt b/applications/landlord_description_overrides/requirements.txt similarity index 100% rename from applications/SAL/requirements.txt rename to applications/landlord_description_overrides/requirements.txt diff --git a/docs/adr/0003-python-writes-landlord-overrides-directly.md b/docs/adr/0003-python-writes-landlord-overrides-directly.md new file mode 100644 index 00000000..ea0fda9b --- /dev/null +++ b/docs/adr/0003-python-writes-landlord-overrides-directly.md @@ -0,0 +1,77 @@ +# ADR-0003: Python writes landlord overrides directly to Postgres + +**Status:** Accepted +**Date:** 2026-05-26 +**Supersedes (in part):** 
[assessment-model/docs/adr/0002-landlord-override-vocabulary.md](https://github.com/.../assessment-model/blob/main/docs/adr/0002-landlord-override-vocabulary.md) — specifically the clause beginning *"Writes happen from Next.js …"*. + +## Context + +ADR-0002 (in the `assessment-model` TS repo) defined the `landlord_property_type_overrides` and `landlord_wall_type_overrides` tables and noted that the Model service would POST classification results to a Next.js route handler, with Next.js performing the upsert. Drizzle remained the schema source of truth. + +That extra hop has not been built and is now judged unnecessary for the present scope: + +- The classification result is internal — a Lambda computes it, the same Lambda persists it. No third party needs to participate in the write. +- Drizzle remains the schema's source of truth either way: the Python adapter mirrors the schema in a SQLModel row, but the migrations stay with Drizzle. Adding a Next.js route would not change which side owns schema definition. +- The Python lambda already lives next to a Postgres connection in the existing pipeline (`subtask`/`tasks` tables are written from Python today). Adding two more tables to that adapter surface is a small, well-understood change. Routing the same writes through Next.js would mean: lambda → JSON-over-HTTP → Next.js route → Drizzle → Postgres, instead of lambda → SQLAlchemy → Postgres. Three extra moving parts to ship, deploy, monitor, and authenticate for no behavioural gain. + +## Decision + +The Model service (specifically `applications/landlord_description_overrides/handler.py`) writes directly to `landlord_property_type_overrides` and `landlord_wall_type_overrides` via a SQLAlchemy-backed `LandlordOverrideRepository[E]` adapter. No Next.js route handler is required. + +Transaction boundaries live in `infrastructure/postgres/engine.transactional_session` — a context manager that commits on clean exit and rolls back on exception. 
The application layer (`handler.py`) never calls `.commit()` or `.rollback()` itself; it only opens the context. Orchestration and repository code likewise never commits — keeping transaction semantics confined to one infrastructure helper. + +The conflict policy lives in SQL and is identical for every adapter implementation: + +```sql +INSERT INTO landlord_property_type_overrides (portfolio_id, description, value, source) +VALUES … +ON CONFLICT (portfolio_id, description) +DO UPDATE SET value = EXCLUDED.value, + source = EXCLUDED.source, + updated_at = now() +WHERE landlord_property_type_overrides.source = 'classifier'; +``` + +The `WHERE existing.source = 'classifier'` guard is load-bearing: it lets the classifier refresh its own past output while leaving `source = 'user'` rows untouched. This is the contract ADR-0002's `source` column was added for. + +`UNKNOWN` values are persisted, not skipped — consistent with ADR-0002 §5. A future user override can upgrade them. + +## Consequences + +**Positive.** + +- One fewer service to deploy, monitor, and authenticate. +- The classifier and persistence live in the same process — failures surface against a single `sub_task` row, not split across two systems. +- The Postgres adapter mirrors the existing `subtask`/`tasks` repositories, so reviewers have a precedent to compare against. + +**Negative.** + +- The schema is now defined in two places — the source-of-truth Drizzle definition lives in the TS repo, and the Python `SQLModel` row class in this repo shadows it. They must stay in lockstep. Mitigations: the TS schema header comment (`landlord_overrides.ts:12`) already names the Python source-of-truth file; a future ADR may add a CI check that diffs the two. +- The boundary that ADR-0002 anticipated for pgEnum validation (a Next.js route validating incoming values before insert) is gone. Pydantic + the Python `Enum` type catch invalid values on the producing side, and Postgres's pgEnum will reject anything that slips through.
+ +## File layout + +This ADR also fixes a placement convention for Postgres adapters going forward. The codebase currently has the ChatGPT classifier split cleanly along DDD lines — port in `domain/`, adapter in `infrastructure/chatgpt/` — but the `tasks` Postgres adapter does not follow the same shape: its concrete class lives in `repositories/tasks/`, not `infrastructure/postgres/`. + +The convention going forward is: + +- **Port (protocol / abstract base):** `repositories/<context>/<entity>_repository.py` +- **Postgres adapter (concrete):** `infrastructure/postgres/<entity>_postgres_repository.py` +- **SQLModel row class:** `infrastructure/postgres/<entity>_table.py` + +The new `LandlordOverrideRepository` family follows this convention. + +**Existing outliers to relocate in a follow-up:** + +- `repositories/tasks/task_postgres_repository.py` → `infrastructure/postgres/task_postgres_repository.py` +- `repositories/tasks/subtask_postgres_repository.py` → `infrastructure/postgres/subtask_postgres_repository.py` + +Both moves are mechanical (import-path updates only). They are intentionally out of scope for the present PR. + +## Out of scope (deferred to follow-up work) + +- Relocating `task_postgres_repository.py` and `subtask_postgres_repository.py` into `infrastructure/postgres/` per the convention above. +- Extracting a shared upsert helper / base class once a third `landlord_*_overrides` column lands — until then the two adapters' 95%-identical bodies are kept side-by-side for direct comparison. +- Switching `applications/landlord_description_overrides/handler.py` to acquire its `Session` via a `@subtask_handler()`-style decorator instead of building its own engine. +- A cross-repo PR amending ADR-0002 to point at this ADR. +- A CI check (or codegen) that diffs the Drizzle pgEnum literals against the Python `Enum.value` strings.
diff --git a/domain/sal/__init__.py b/domain/landlord_description_overrides/__init__.py similarity index 100% rename from domain/sal/__init__.py rename to domain/landlord_description_overrides/__init__.py diff --git a/domain/landlord_description_overrides/built_form_type.py b/domain/landlord_description_overrides/built_form_type.py new file mode 100644 index 00000000..327ceebe --- /dev/null +++ b/domain/landlord_description_overrides/built_form_type.py @@ -0,0 +1,20 @@ +from enum import Enum + + +class BuiltFormType(Enum): + """A landlord-supplied built form, as resolved by the landlord-description-overrides context. + + Mirrors the EPC built-form values. ``NOT_RECORDED`` is the legitimate + EPC value for properties whose built form the surveyor did not capture; + ``UNKNOWN`` is the classifier fallback for landlord values that cannot be + resolved at all. + """ + + DETACHED = "Detached" + SEMI_DETACHED = "Semi-Detached" + MID_TERRACE = "Mid-Terrace" + END_TERRACE = "End-Terrace" + ENCLOSED_MID_TERRACE = "Enclosed Mid-Terrace" + ENCLOSED_END_TERRACE = "Enclosed End-Terrace" + NOT_RECORDED = "Not Recorded" + UNKNOWN = "Unknown" diff --git a/domain/sal/column_classifier.py b/domain/landlord_description_overrides/column_classifier.py similarity index 94% rename from domain/sal/column_classifier.py rename to domain/landlord_description_overrides/column_classifier.py index 3324d79f..adc88c6a 100644 --- a/domain/sal/column_classifier.py +++ b/domain/landlord_description_overrides/column_classifier.py @@ -21,7 +21,7 @@ class ColumnClassifier(ABC, Generic[E]): One classifier handles one landlord-CSV column. Implementations decide *how* the mapping is performed (an LLM, a lookup table, a rules engine); - ``SALOrchestrator`` depends only on this interface. + ``LandlordDescriptionOverridesOrchestrator`` depends only on this interface. 
""" @abstractmethod diff --git a/domain/sal/property_type.py b/domain/landlord_description_overrides/property_type.py similarity index 78% rename from domain/sal/property_type.py rename to domain/landlord_description_overrides/property_type.py index 3980c2f0..453c28c1 100644 --- a/domain/sal/property_type.py +++ b/domain/landlord_description_overrides/property_type.py @@ -2,7 +2,7 @@ from enum import Enum class PropertyType(Enum): - """A landlord-supplied property type, as resolved by the SAL context. + """A landlord-supplied property type, as resolved by the landlord-description-overrides context. Distinct from the EPC context's ``PropertyType``: a landlord CSV value may be unresolvable, so this enum carries an explicit ``UNKNOWN`` member. diff --git a/domain/landlord_description_overrides/roof_type.py b/domain/landlord_description_overrides/roof_type.py new file mode 100644 index 00000000..56ef9e8e --- /dev/null +++ b/domain/landlord_description_overrides/roof_type.py @@ -0,0 +1,70 @@ +from enum import Enum + + +class RoofType(Enum): + """A landlord-supplied roof description, as resolved by the landlord-description-overrides context. + + Each member is one full EPC roof-description string, combining shape + (flat, pitched, roof room(s), thatched) with insulation state and, for + pitched roofs, the loft-insulation depth in millimetres. Adjacency + markers like ``(another dwelling above)`` represent a unit whose top + boundary is another dwelling rather than a roof of its own; they are + kept as members because they appear in the same EPC column. + ``UNKNOWN`` covers values the classifier cannot resolve -- most + commonly raw ``Average thermal transmittance`` U-value strings that + carry no shape/insulation information. 
+ """ + + FLAT_INSULATED = "Flat, insulated" + FLAT_INSULATED_ASSUMED = "Flat, insulated (assumed)" + FLAT_LIMITED_INSULATION = "Flat, limited insulation" + FLAT_LIMITED_INSULATION_ASSUMED = "Flat, limited insulation (assumed)" + FLAT_NO_INSULATION = "Flat, no insulation" + FLAT_NO_INSULATION_ASSUMED = "Flat, no insulation (assumed)" + + PITCHED_INSULATED = "Pitched, insulated" + PITCHED_INSULATED_ASSUMED = "Pitched, insulated (assumed)" + PITCHED_INSULATED_AT_RAFTERS = "Pitched, insulated at rafters" + PITCHED_LIMITED_INSULATION = "Pitched, limited insulation" + PITCHED_LIMITED_INSULATION_ASSUMED = "Pitched, limited insulation (assumed)" + PITCHED_NO_INSULATION = "Pitched, no insulation" + PITCHED_NO_INSULATION_ASSUMED = "Pitched, no insulation (assumed)" + PITCHED_UNKNOWN_LOFT_INSULATION = "Pitched, Unknown loft insulation" + PITCHED_LOFT_0MM = "Pitched, 0 mm loft insulation" + PITCHED_LOFT_12MM = "Pitched, 12 mm loft insulation" + PITCHED_LOFT_25MM = "Pitched, 25 mm loft insulation" + PITCHED_LOFT_50MM = "Pitched, 50 mm loft insulation" + PITCHED_LOFT_75MM = "Pitched, 75 mm loft insulation" + PITCHED_LOFT_100MM = "Pitched, 100 mm loft insulation" + PITCHED_LOFT_125MM = "Pitched, 125 mm loft insulation" + PITCHED_LOFT_150MM = "Pitched, 150 mm loft insulation" + PITCHED_LOFT_175MM = "Pitched, 175 mm loft insulation" + PITCHED_LOFT_200MM = "Pitched, 200 mm loft insulation" + PITCHED_LOFT_225MM = "Pitched, 225 mm loft insulation" + PITCHED_LOFT_250MM = "Pitched, 250 mm loft insulation" + PITCHED_LOFT_270MM = "Pitched, 270 mm loft insulation" + PITCHED_LOFT_300MM = "Pitched, 300 mm loft insulation" + PITCHED_LOFT_350MM = "Pitched, 350 mm loft insulation" + PITCHED_LOFT_400MM = "Pitched, 400 mm loft insulation" + PITCHED_LOFT_400_PLUS_MM = "Pitched, 400+ mm loft insulation" + + ROOF_ROOM_INSULATED = "Roof room(s), insulated" + ROOF_ROOM_INSULATED_ASSUMED = "Roof room(s), insulated (assumed)" + ROOF_ROOM_LIMITED_INSULATION = "Roof room(s), limited insulation" + 
ROOF_ROOM_LIMITED_INSULATION_ASSUMED = "Roof room(s), limited insulation (assumed)" + ROOF_ROOM_NO_INSULATION = "Roof room(s), no insulation" + ROOF_ROOM_NO_INSULATION_ASSUMED = "Roof room(s), no insulation (assumed)" + ROOF_ROOM_CEILING_INSULATED = "Roof room(s), ceiling insulated" + ROOF_ROOM_THATCHED = "Roof room(s), thatched" + ROOF_ROOM_THATCHED_WITH_ADDITIONAL_INSULATION = "Roof room(s), thatched with additional insulation" + + THATCHED = "Thatched" + THATCHED_WITH_ADDITIONAL_INSULATION = "Thatched, with additional insulation" + + ADJACENT_ANOTHER_DWELLING_ABOVE = "(another dwelling above)" + ADJACENT_SAME_DWELLING_ABOVE = "(same dwelling above)" + ADJACENT_OTHER_PREMISES_ABOVE = "(other premises above)" + ADJACENT_ANOTHER_PREMISES_ABOVE = "(another premises above)" + ANOTHER_PREMISES_ABOVE = "Another Premises Above" + + UNKNOWN = "Unknown" diff --git a/domain/landlord_description_overrides/wall_type.py b/domain/landlord_description_overrides/wall_type.py new file mode 100644 index 00000000..42b90da6 --- /dev/null +++ b/domain/landlord_description_overrides/wall_type.py @@ -0,0 +1,70 @@ +from enum import Enum + + +class WallType(Enum): + """A landlord-supplied wall description, as resolved by the landlord-description-overrides context. + + Each member is one full EPC wall-description string, combining material + (cavity, solid brick, sandstone, …) with construction/insulation state + (as built, filled cavity, with internal insulation, …). ``UNKNOWN`` covers + values the classifier cannot resolve — most commonly raw + ``Average thermal transmittance`` U-value strings that carry no material + information. 
+ """ + + CAVITY_FILLED = "Cavity wall, filled cavity" + CAVITY_AS_BUILT_INSULATED_ASSUMED = "Cavity wall, as built, insulated (assumed)" + CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = "Cavity wall, as built, no insulation (assumed)" + CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Cavity wall, as built, partial insulation (assumed)" + CAVITY_WITH_INTERNAL_INSULATION = "Cavity wall, with internal insulation" + CAVITY_WITH_EXTERNAL_INSULATION = "Cavity wall, with external insulation" + CAVITY_FILLED_AND_INTERNAL_INSULATION = "Cavity wall, filled cavity and internal insulation" + CAVITY_FILLED_AND_EXTERNAL_INSULATION = "Cavity wall, filled cavity and external insulation" + + SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = "Solid brick, as built, no insulation (assumed)" + SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = "Solid brick, as built, insulated (assumed)" + SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Solid brick, as built, partial insulation (assumed)" + SOLID_BRICK_WITH_INTERNAL_INSULATION = "Solid brick, with internal insulation" + SOLID_BRICK_WITH_EXTERNAL_INSULATION = "Solid brick, with external insulation" + + TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = "Timber frame, as built, no insulation (assumed)" + TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = "Timber frame, as built, insulated (assumed)" + TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Timber frame, as built, partial insulation (assumed)" + TIMBER_FRAME_WITH_ADDITIONAL_INSULATION = "Timber frame, with additional insulation" + + SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = "Sandstone, as built, no insulation (assumed)" + SANDSTONE_AS_BUILT_INSULATED_ASSUMED = "Sandstone, as built, insulated (assumed)" + SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Sandstone, as built, partial insulation (assumed)" + SANDSTONE_WITH_INTERNAL_INSULATION = "Sandstone, with internal insulation" + SANDSTONE_WITH_EXTERNAL_INSULATION = "Sandstone, with external insulation" + + GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = 
"Granite or whin, as built, no insulation (assumed)" + GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = "Granite or whin, as built, insulated (assumed)" + GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Granite or whin, as built, partial insulation (assumed)" + GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = "Granite or whin, with internal insulation" + GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = "Granite or whin, with external insulation" + + SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = "System built, as built, no insulation (assumed)" + SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = "System built, as built, insulated (assumed)" + SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "System built, as built, partial insulation (assumed)" + SYSTEM_BUILT_WITH_INTERNAL_INSULATION = "System built, with internal insulation" + SYSTEM_BUILT_WITH_EXTERNAL_INSULATION = "System built, with external insulation" + + PARK_HOME_AS_BUILT = "Park home wall, as built" + PARK_HOME_WITH_INTERNAL_INSULATION = "Park home wall, with internal insulation" + PARK_HOME_WITH_EXTERNAL_INSULATION = "Park home wall, with external insulation" + + COB_AS_BUILT = "Cob, as built" + COB_WITH_INTERNAL_INSULATION = "Cob, with internal insulation" + COB_WITH_EXTERNAL_INSULATION = "Cob, with external insulation" + + CURTAIN_WALL = "Curtain wall" + CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = "Curtain Wall, as built, no insulation (assumed)" + CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = "Curtain Wall, as built, insulated (assumed)" + CURTAIN_WALL_FILLED = "Curtain Wall, filled cavity" + CURTAIN_WALL_WITH_INTERNAL_INSULATION = "Curtain Wall, with internal insulation" + + BASEMENT_WALL = "Basement wall" + BASEMENT_WALL_AS_BUILT = "Basement wall, as built" + + UNKNOWN = "Unknown" diff --git a/domain/sal/wall_type.py b/domain/sal/wall_type.py deleted file mode 100644 index 05dc2ba9..00000000 --- a/domain/sal/wall_type.py +++ /dev/null @@ -1,15 +0,0 @@ -from enum import Enum - - -class WallType(Enum): - """A 
landlord-supplied wall construction type, as resolved by the SAL context. - - Mirrors the main RdSAP wall constructions. Like the SAL ``PropertyType``, - it carries an explicit ``UNKNOWN`` member for unresolvable CSV values. - """ - - CAVITY = "Cavity" - SOLID_BRICK = "Solid Brick" - TIMBER_FRAME = "Timber frame" - SANDSTONE = "Sandstone" - UNKNOWN = "Unknown" diff --git a/infrastructure/chatgpt/chatgpt_column_classifier.py b/infrastructure/chatgpt/chatgpt_column_classifier.py index 8f564e6c..b23e7c2e 100644 --- a/infrastructure/chatgpt/chatgpt_column_classifier.py +++ b/infrastructure/chatgpt/chatgpt_column_classifier.py @@ -4,7 +4,10 @@ import json from enum import Enum from typing import Any, TypeVar -from domain.sal.column_classifier import ClassificationError, ColumnClassifier +from domain.landlord_description_overrides.column_classifier import ( + ClassificationError, + ColumnClassifier, +) from infrastructure.chatgpt.chatgpt import ChatGPT from infrastructure.chatgpt.exceptions import ChatGPTClientError diff --git a/infrastructure/postgres/engine.py b/infrastructure/postgres/engine.py index 0de9efcb..ea2b35ad 100644 --- a/infrastructure/postgres/engine.py +++ b/infrastructure/postgres/engine.py @@ -1,3 +1,6 @@ +from collections.abc import Iterator +from contextlib import contextmanager + from sqlalchemy.engine import Engine from sqlmodel import Session, create_engine @@ -16,3 +19,24 @@ def make_engine(config: PostgresConfig) -> Engine: def make_session(engine: Engine) -> Session: return Session(engine) + + +@contextmanager # pyright: ignore[reportDeprecated] +def transactional_session(engine: Engine) -> Iterator[Session]: + """Yield a session whose lifecycle owns the transaction. + + On clean exit the session commits; on any exception it rolls back and + re-raises. Either way the session is closed. 
Callers in the application + layer can do their work inside the ``with`` block without ever invoking + ``.commit()`` / ``.rollback()`` themselves -- transaction semantics stay + in the infrastructure layer. + """ + session = Session(engine) + try: + yield session + session.commit() + except Exception: + session.rollback() + raise + finally: + session.close() diff --git a/infrastructure/postgres/landlord_built_form_type_override_postgres_repository.py b/infrastructure/postgres/landlord_built_form_type_override_postgres_repository.py new file mode 100644 index 00000000..0f7d4959 --- /dev/null +++ b/infrastructure/postgres/landlord_built_form_type_override_postgres_repository.py @@ -0,0 +1,82 @@ +"""Postgres adapter for ``LandlordOverrideRepository[BuiltFormType]``. + +Writes to ``landlord_built_form_type_overrides`` (Drizzle-managed; mirrored by +``LandlordBuiltFormTypeOverrideRow``). The conflict policy lives in the SQL -- +see ADR-0003 §Decision. Shape mirrors +``LandlordPropertyTypeOverridePostgresRepository``; the duplication is +deliberate while there are only three columns -- if a fourth lands and the +duplication becomes painful, extract a shared upsert helper then. 
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import cast + +from sqlalchemy import Table +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlmodel import Session + +from domain.landlord_description_overrides.built_form_type import BuiltFormType +from infrastructure.postgres.landlord_built_form_type_override_table import ( + LandlordBuiltFormTypeOverrideRow, +) +from infrastructure.postgres.landlord_override_enums import OverrideSource +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, +) + + +class LandlordBuiltFormTypeOverridePostgresRepository( + LandlordOverrideRepository[BuiltFormType] +): + def __init__(self, session: Session) -> None: + self._session = session + + def upsert_all( + self, + portfolio_id: int, + descriptions_to_values: dict[str, BuiltFormType], + ) -> None: + if not descriptions_to_values: + return + + now = datetime.now(timezone.utc) + rows = [ + { + "portfolio_id": portfolio_id, + "description": description, + "value": value.value, + "source": OverrideSource.CLASSIFIER, + "created_at": now, + "updated_at": now, + } + for description, value in descriptions_to_values.items() + ] + + # SQLModel's class-level ``__table__`` is injected at runtime on + # ``table=True`` classes but isn't exposed by the stubs; pin it to + # ``Table`` via ``getattr`` so the dialect insert helper below + # carries through with strict types. + table: Table = cast( + Table, getattr(LandlordBuiltFormTypeOverrideRow, "__table__") + ) + stmt = pg_insert(table).values(rows) + + # The classifier may refresh its own past output, but must never + # overwrite a user correction -- the ``WHERE existing.source = + # 'classifier'`` guard enforces that. See ADR-0003 §Decision. 
+ stmt = stmt.on_conflict_do_update( + index_elements=["portfolio_id", "description"], + set_={ + "value": stmt.excluded.value, + "source": stmt.excluded.source, + "updated_at": stmt.excluded.updated_at, + }, + where=table.c.source == OverrideSource.CLASSIFIER, + ) + + # SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the + # overload signatures is marked deprecated in stubs, which fires + # here even though our INSERT path is the supported one. + self._session.execute(stmt) # pyright: ignore[reportDeprecated] diff --git a/infrastructure/postgres/landlord_built_form_type_override_table.py b/infrastructure/postgres/landlord_built_form_type_override_table.py new file mode 100644 index 00000000..a1f89c35 --- /dev/null +++ b/infrastructure/postgres/landlord_built_form_type_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_built_form_type_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only +differences are the table name, the ``built_form_type`` pgEnum on ``value``, +and the unique-constraint name. 
+""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.landlord_description_overrides.built_form_type import BuiltFormType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordBuiltFormTypeOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_built_form_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_built_form_type_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: BuiltFormType = Field( + sa_column=Column( + SAEnum( + BuiltFormType, + name="built_form_type", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. 
+ source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_override_enums.py b/infrastructure/postgres/landlord_override_enums.py new file mode 100644 index 00000000..ba2cee94 --- /dev/null +++ b/infrastructure/postgres/landlord_override_enums.py @@ -0,0 +1,35 @@ +"""Shared pgEnum definitions used by every ``landlord_*_overrides`` row class. + +The ``override_source`` pgEnum is referenced by both +``landlord_property_type_overrides`` and ``landlord_wall_type_overrides`` +(per the Drizzle schema -- see ``landlord_overrides.ts``). Defining it once +here and reusing the same SQLAlchemy ``Enum`` instance across both row +classes keeps SQLModel's metadata coherent: ``create_all`` emits exactly one +``CREATE TYPE override_source`` statement, not two parallel ones colliding +on the same pgEnum name. +""" + +from __future__ import annotations + +from sqlalchemy import Enum as SAEnum + + +class OverrideSource: + """Mirror of the ``override_source`` pgEnum. + + Drizzle defines this as ``('classifier', 'user')`` in + ``landlord_overrides.ts``. Modelled here as string constants so callers + don't sprinkle magic strings; the column is constrained by Postgres, + and the only Python-side producer (the classifier path) writes the + literal ``OverrideSource.CLASSIFIER``. 
+ """ + + CLASSIFIER = "classifier" + USER = "user" + + +override_source_sa_enum = SAEnum( + OverrideSource.CLASSIFIER, + OverrideSource.USER, + name="override_source", +) diff --git a/infrastructure/postgres/landlord_property_type_override_postgres_repository.py b/infrastructure/postgres/landlord_property_type_override_postgres_repository.py new file mode 100644 index 00000000..18592c5f --- /dev/null +++ b/infrastructure/postgres/landlord_property_type_override_postgres_repository.py @@ -0,0 +1,82 @@ +"""Postgres adapter for ``LandlordOverrideRepository[PropertyType]``. + +Writes to ``landlord_property_type_overrides`` (Drizzle-managed; mirrored by +``LandlordPropertyTypeOverrideRow``). The conflict policy lives in the SQL -- +see ADR-0003 §Decision. + +Per the convention this ADR fixes, Postgres adapters live in +``infrastructure/postgres/``. The existing ``task_postgres_repository.py`` / +``subtask_postgres_repository.py`` are outliers still under ``repositories/``; +relocating them is tracked as a follow-up in ADR-0003 §"File layout". 
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import cast + +from sqlalchemy import Table +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlmodel import Session + +from domain.landlord_description_overrides.property_type import PropertyType +from infrastructure.postgres.landlord_override_enums import OverrideSource +from infrastructure.postgres.landlord_property_type_override_table import ( + LandlordPropertyTypeOverrideRow, +) +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, +) + + +class LandlordPropertyTypeOverridePostgresRepository( + LandlordOverrideRepository[PropertyType] +): + def __init__(self, session: Session) -> None: + self._session = session + + def upsert_all( + self, + portfolio_id: int, + descriptions_to_values: dict[str, PropertyType], + ) -> None: + if not descriptions_to_values: + return + + now = datetime.now(timezone.utc) + rows = [ + { + "portfolio_id": portfolio_id, + "description": description, + "value": value.value, + "source": OverrideSource.CLASSIFIER, + "created_at": now, + "updated_at": now, + } + for description, value in descriptions_to_values.items() + ] + + # SQLModel's class-level ``__table__`` is injected at runtime on + # ``table=True`` classes but isn't exposed by the stubs; pin it to + # ``Table`` via ``getattr`` so the dialect insert helper below + # carries through with strict types. + table: Table = cast(Table, getattr(LandlordPropertyTypeOverrideRow, "__table__")) + stmt = pg_insert(table).values(rows) + + # The classifier may refresh its own past output, but must never + # overwrite a user correction -- the ``WHERE existing.source = + # 'classifier'`` guard enforces that. See ADR-0003 §Decision. 
+ stmt = stmt.on_conflict_do_update( + index_elements=["portfolio_id", "description"], + set_={ + "value": stmt.excluded.value, + "source": stmt.excluded.source, + "updated_at": stmt.excluded.updated_at, + }, + where=table.c.source == OverrideSource.CLASSIFIER, + ) + + # SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the + # overload signatures is marked deprecated in stubs, which fires + # here even though our INSERT path is the supported one. + self._session.execute(stmt) # pyright: ignore[reportDeprecated] diff --git a/infrastructure/postgres/landlord_property_type_override_table.py b/infrastructure/postgres/landlord_property_type_override_table.py new file mode 100644 index 00000000..b76d508e --- /dev/null +++ b/infrastructure/postgres/landlord_property_type_override_table.py @@ -0,0 +1,67 @@ +"""SQLModel mirror of the ``landlord_property_type_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. 
+""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.landlord_description_overrides.property_type import PropertyType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordPropertyTypeOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_property_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_property_type_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: PropertyType = Field( + sa_column=Column( + SAEnum( + PropertyType, + name="property_type", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. 
+ source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/infrastructure/postgres/landlord_wall_type_override_postgres_repository.py b/infrastructure/postgres/landlord_wall_type_override_postgres_repository.py new file mode 100644 index 00000000..21b73e98 --- /dev/null +++ b/infrastructure/postgres/landlord_wall_type_override_postgres_repository.py @@ -0,0 +1,80 @@ +"""Postgres adapter for ``LandlordOverrideRepository[WallType]``. + +Writes to ``landlord_wall_type_overrides`` (Drizzle-managed; mirrored by +``LandlordWallTypeOverrideRow``). The conflict policy lives in the SQL -- +see ADR-0003 §Decision. Shape mirrors +``LandlordPropertyTypeOverridePostgresRepository``; the duplication is +deliberate while there are only two columns -- if a third lands and the +duplication becomes painful, extract a shared upsert helper then. 
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import cast + +from sqlalchemy import Table +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlmodel import Session + +from domain.landlord_description_overrides.wall_type import WallType +from infrastructure.postgres.landlord_override_enums import OverrideSource +from infrastructure.postgres.landlord_wall_type_override_table import ( + LandlordWallTypeOverrideRow, +) +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, +) + + +class LandlordWallTypeOverridePostgresRepository( + LandlordOverrideRepository[WallType] +): + def __init__(self, session: Session) -> None: + self._session = session + + def upsert_all( + self, + portfolio_id: int, + descriptions_to_values: dict[str, WallType], + ) -> None: + if not descriptions_to_values: + return + + now = datetime.now(timezone.utc) + rows = [ + { + "portfolio_id": portfolio_id, + "description": description, + "value": value.value, + "source": OverrideSource.CLASSIFIER, + "created_at": now, + "updated_at": now, + } + for description, value in descriptions_to_values.items() + ] + + # SQLModel's class-level ``__table__`` is injected at runtime on + # ``table=True`` classes but isn't exposed by the stubs; pin it to + # ``Table`` via ``getattr`` so the dialect insert helper below + # carries through with strict types. + table: Table = cast(Table, getattr(LandlordWallTypeOverrideRow, "__table__")) + stmt = pg_insert(table).values(rows) + + # The classifier may refresh its own past output, but must never + # overwrite a user correction -- the ``WHERE existing.source = + # 'classifier'`` guard enforces that. See ADR-0003 §Decision. 
+ stmt = stmt.on_conflict_do_update( + index_elements=["portfolio_id", "description"], + set_={ + "value": stmt.excluded.value, + "source": stmt.excluded.source, + "updated_at": stmt.excluded.updated_at, + }, + where=table.c.source == OverrideSource.CLASSIFIER, + ) + + # SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the + # overload signatures is marked deprecated in stubs, which fires + # here even though our INSERT path is the supported one. + self._session.execute(stmt) # pyright: ignore[reportDeprecated] diff --git a/infrastructure/postgres/landlord_wall_type_override_table.py b/infrastructure/postgres/landlord_wall_type_override_table.py new file mode 100644 index 00000000..79bea46a --- /dev/null +++ b/infrastructure/postgres/landlord_wall_type_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_wall_type_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only +differences are the table name, the ``wall_type`` pgEnum on ``value``, and +the unique-constraint name. 
+""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.landlord_description_overrides.wall_type import WallType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordWallTypeOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_wall_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_wall_type_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: WallType = Field( + sa_column=Column( + SAEnum( + WallType, + name="wall_type", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. 
+ source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/orchestration/classifiable_column.py b/orchestration/classifiable_column.py new file mode 100644 index 00000000..fb1dab6e --- /dev/null +++ b/orchestration/classifiable_column.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import Generic, TypeVar + +from domain.landlord_description_overrides.column_classifier import ColumnClassifier +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, +) + +E = TypeVar("E", bound=Enum) + + +@dataclass(frozen=True) +class ClassifiableColumn(Generic[E]): + """Pairs a column's classifier with the repository that persists its results. + + The orchestrator registers one ``ClassifiableColumn`` per + (source column, target enum) pair. Bundling the classifier and the + repository together makes the "this enum lands in this table" invariant + structural -- the handler can no longer wire ``PropertyType`` + classifications to a ``WallType`` repo by keying two dicts with the same + string. + + ``source_column`` is the landlord-CSV header to read from; ``name`` is the + unique key the orchestrator uses to report this classification's results + (and the key the handler logs). Two ``ClassifiableColumn``s may share a + ``source_column`` -- e.g. the ``"Property Type"`` CSV column feeds both + ``PropertyType`` and ``BuiltFormType`` classifiers off the same free-text + description -- but each must have a unique ``name``. 
+ """ + + name: str + source_column: str + classifier: ColumnClassifier[E] + repo: LandlordOverrideRepository[E] diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py new file mode 100644 index 00000000..389d1afb --- /dev/null +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -0,0 +1,83 @@ +from enum import Enum +from typing import Any + +from domain.addresses.unstandardised_address import AddressList +from orchestration.classifiable_column import ClassifiableColumn +from repositories.unstandardised_address.unstandardised_address_list_repository import ( + UnstandardisedAddressListRepository, +) + + +class LandlordDescriptionOverridesOrchestrator: + def __init__( + self, + unstandardised_address_repo: UnstandardisedAddressListRepository, + columns: list[ClassifiableColumn[Any]], + ) -> None: + self._unstandardised_address_repo = unstandardised_address_repo + # Each entry is one (source CSV column, target enum) classification. + # Two entries may share ``source_column`` -- e.g. ``"Property Type"`` + # feeds both PropertyType and BuiltFormType classifiers -- so the + # registry is a list rather than a dict keyed by header. + self._columns = columns + + def get_unstandardised_addresses( + self, + input_s3_uri: str, + ) -> AddressList: + return self._unstandardised_address_repo.load_batch(input_s3_uri) + + def get_col_to_description_mappings( + self, list_of_unstandardised_address: AddressList + ) -> dict[str, set[str]]: + mappings: dict[str, set[str]] = {} + for unstandardised_address in list_of_unstandardised_address: + for key, value in unstandardised_address.additional_info.items(): + bucket = mappings.setdefault(key, set()) + # A comma-separated value is several descriptions in one cell; + # split it so each is its own entry. Lower-case so case-only + # typos collapse to one variant. 
+ for variant in value.split(","): + variant = variant.strip().lower() + if variant: + bucket.add(variant) + return mappings + + def classify_columns( + self, addresses: AddressList + ) -> dict[str, dict[str, Enum]]: + """Classify every registered column's descriptions. + + Returns a mapping of ``ClassifiableColumn.name`` to + ``{description: category}``. A registered column whose ``source_column`` + is absent from the addresses contributes an empty inner mapping. + """ + col_to_desc = self.get_col_to_description_mappings(addresses) + return { + column.name: column.classifier.classify( + col_to_desc.get(column.source_column, set()) + ) + for column in self._columns + } + + def classify_and_persist( + self, addresses: AddressList, portfolio_id: int + ) -> dict[str, dict[str, Enum]]: + """Classify every registered column and persist the results. + + Each non-empty mapping is written via the column's repository under + ``source = 'classifier'``. Empty mappings (a registered column whose + ``source_column`` is absent from this batch) skip the DB round-trip. + The orchestrator does not commit -- the caller owns the transaction + boundary. + + Returns the same shape as ``classify_columns`` so callers can log + per-column counts. 
+ """ + classified = self.classify_columns(addresses) + for column in self._columns: + mapping = classified[column.name] + if not mapping: + continue + column.repo.upsert_all(portfolio_id, mapping) + return classified diff --git a/orchestration/sal_orchestrator.py b/orchestration/sal_orchestrator.py deleted file mode 100644 index 6b451746..00000000 --- a/orchestration/sal_orchestrator.py +++ /dev/null @@ -1,56 +0,0 @@ -from enum import Enum -from typing import Any - -from domain.addresses.unstandardised_address import AddressList -from domain.sal.column_classifier import ColumnClassifier -from repositories.unstandardised_address.unstandardised_address_list_repository import ( - UnstandardisedAddressListRepository, -) - - -class SALOrchestrator: - def __init__( - self, - unstandardised_address_repo: UnstandardisedAddressListRepository, - classifiers: dict[str, ColumnClassifier[Any]], - ) -> None: - self._unstandardised_address_repo = unstandardised_address_repo - # Keyed by landlord-CSV column name. - self._classifiers = classifiers - - def get_unstandardised_addresses( - self, - input_s3_uri: str, - ) -> AddressList: - return self._unstandardised_address_repo.load_batch(input_s3_uri) - - def get_col_to_description_mappings( - self, list_of_unstandardised_address: AddressList - ) -> dict[str, set[str]]: - mappings: dict[str, set[str]] = {} - for unstandardised_address in list_of_unstandardised_address: - for key, value in unstandardised_address.additional_info.items(): - bucket = mappings.setdefault(key, set()) - # A comma-separated value is several descriptions in one cell; - # split it so each is its own entry. Lower-case so case-only - # typos collapse to one variant. - for variant in value.split(","): - variant = variant.strip().lower() - if variant: - bucket.add(variant) - return mappings - - def classify_columns( - self, addresses: AddressList - ) -> dict[str, dict[str, Enum]]: - """Classify every registered column's descriptions. 
- - Returns a mapping of column name to ``{description: category}``. A - registered column absent from the addresses contributes an empty - inner mapping. - """ - col_to_desc = self.get_col_to_description_mappings(addresses) - return { - column: classifier.classify(col_to_desc.get(column, set())) - for column, classifier in self._classifiers.items() - } diff --git a/playground.py b/playground.py new file mode 100644 index 00000000..d116dcf9 --- /dev/null +++ b/playground.py @@ -0,0 +1,57 @@ +"""Read a file and return unique values from a chosen column.""" + +from pathlib import Path +import argparse +import sys + +import pandas as pd + + +def read_file(path: str | Path) -> pd.DataFrame: + path = Path(path) + suffix = path.suffix.lower() + if suffix == ".csv": + return pd.read_csv(path) + if suffix == ".tsv": + return pd.read_csv(path, sep="\t") + if suffix in {".xlsx", ".xls"}: + return pd.read_excel(path) + if suffix == ".parquet": + return pd.read_parquet(path) + if suffix == ".json": + return pd.read_json(path) + raise ValueError(f"Unsupported file type: {suffix}") + + +def get_unique(path: str | Path, column: str, dropna: bool = True) -> list: + df = read_file(Path(path)) + if column not in df.columns: + raise KeyError(f"Column {column!r} not found. 
Available: {list(df.columns)}") + series = df[column].dropna() if dropna else df[column] + return series.unique().tolist() + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--path", default="/workspaces/model/certificates-2026.csv") + parser.add_argument("--column", nargs="walls_description") + parser.add_argument("--keep-na", action="store_true") + args, _ = parser.parse_known_args() + + df = read_file(args.path) + + if not args.column: + print("Available columns:") + for c in df.columns: + print(f" - {c}") + return 0 + + column = "roof_description" + series = df[column] if args.keep_na else df[column].dropna() + for value in series.unique(): + print(value) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/repositories/landlord_overrides/__init__.py b/repositories/landlord_overrides/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/repositories/landlord_overrides/landlord_override_repository.py b/repositories/landlord_overrides/landlord_override_repository.py new file mode 100644 index 00000000..47e873fe --- /dev/null +++ b/repositories/landlord_overrides/landlord_override_repository.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from enum import Enum +from typing import Generic, TypeVar + +E = TypeVar("E", bound=Enum) + + +class LandlordOverrideRepository(ABC, Generic[E]): + """Port: persists landlord (description -> category) overrides for a portfolio. + + One repository implementation targets one ``landlord_*_overrides`` + table. The category enum ``E`` (e.g. ``PropertyType``, ``WallType``) determines + which table the adapter writes to; the orchestrator depends only on this + interface and never names a concrete table. + + Concrete adapters live in ``infrastructure/`` (see ADR-0003): for example + ``infrastructure/postgres/landlord_property_type_override_postgres_repository.py``.
+ """ + + @abstractmethod + def upsert_all( + self, + portfolio_id: int, + descriptions_to_values: dict[str, E], + ) -> None: + """Upsert each ``(portfolio_id, description) -> value`` row with ``source='classifier'``. + + On conflict with an existing row whose ``source = 'classifier'``, the row + is updated (value, source, updated_at). On conflict with a row whose + ``source = 'user'``, the existing row is preserved -- the classifier + never overwrites a user correction. See ADR-0003 §Decision. + + An empty ``descriptions_to_values`` mapping is a no-op; callers may + skip this call entirely when they have nothing to write. + """ + ... diff --git a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py index 5ec854f1..8a07ecec 100644 --- a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py +++ b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py @@ -4,9 +4,9 @@ from typing import Optional import pytest -from domain.sal.column_classifier import ClassificationError -from domain.sal.property_type import PropertyType -from domain.sal.wall_type import WallType +from domain.landlord_description_overrides.column_classifier import ClassificationError +from domain.landlord_description_overrides.property_type import PropertyType +from domain.landlord_description_overrides.wall_type import WallType from infrastructure.chatgpt.chatgpt import ChatGPT from infrastructure.chatgpt.chatgpt_column_classifier import ( ChatGptColumnClassifier, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index 62f1a329..eee4a310 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -4,12 +4,17 @@ from enum import Enum from typing import Any, Optional from 
domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress +from domain.landlord_description_overrides.built_form_type import BuiltFormType +from domain.landlord_description_overrides.column_classifier import ColumnClassifier +from domain.landlord_description_overrides.property_type import PropertyType +from domain.landlord_description_overrides.wall_type import WallType from domain.postcode import Postcode -from domain.sal.column_classifier import ColumnClassifier -from domain.sal.property_type import PropertyType -from domain.sal.wall_type import WallType -from orchestration.sal_orchestrator import ( - SALOrchestrator, +from orchestration.classifiable_column import ClassifiableColumn +from orchestration.landlord_description_overrides_orchestrator import ( + LandlordDescriptionOverridesOrchestrator, +) +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, ) from repositories.unstandardised_address.unstandardised_address_list_repository import ( UnstandardisedAddressListRepository, @@ -38,6 +43,18 @@ class _StubColumnClassifier(ColumnClassifier[Enum]): return self._result +class _StubLandlordOverrideRepository(LandlordOverrideRepository[Enum]): + """Records every ``upsert_all`` call so tests can assert routing.""" + + def __init__(self) -> None: + self.calls: list[tuple[int, dict[str, Enum]]] = [] + + def upsert_all( + self, portfolio_id: int, descriptions_to_values: dict[str, Enum] + ) -> None: + self.calls.append((portfolio_id, dict(descriptions_to_values))) + + def _make_unstandardised_address( landlord_additional_info: dict[str, str], ) -> UnstandardisedAddress: @@ -49,11 +66,25 @@ def _make_unstandardised_address( def _orchestrator( - classifiers: Optional[dict[str, ColumnClassifier[Any]]] = None, -) -> SALOrchestrator: - return SALOrchestrator( + columns: Optional[list[ClassifiableColumn[Any]]] = None, +) -> LandlordDescriptionOverridesOrchestrator: + return 
LandlordDescriptionOverridesOrchestrator( unstandardised_address_repo=_StubUnstandardisedAddressRepository(), - classifiers=classifiers or {}, + columns=columns or [], + ) + + +def _column( + name: str, + source_column: str, + classifier: ColumnClassifier[Any], + repo: Optional[LandlordOverrideRepository[Any]] = None, +) -> ClassifiableColumn[Any]: + return ClassifiableColumn( + name=name, + source_column=source_column, + classifier=classifier, + repo=repo or _StubLandlordOverrideRepository(), ) @@ -155,30 +186,140 @@ def test_classify_columns_classifies_each_registered_column() -> None: property_types = _StubColumnClassifier( result={"semi-detached": PropertyType.HOUSE} ) - wall_types = _StubColumnClassifier(result={"solid brick": WallType.SOLID_BRICK}) + wall_types = _StubColumnClassifier(result={"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}) # act result = _orchestrator( - {"Property Type": property_types, "Walls": wall_types} + [ + _column("property_type", "Property Type", property_types), + _column("wall_type", "Walls", wall_types), + ] ).classify_columns(addresses) - # assert: each registered column was classified independently. + # assert: each registered column was classified independently, keyed by name. assert result == { - "Property Type": {"semi-detached": PropertyType.HOUSE}, - "Walls": {"solid brick": WallType.SOLID_BRICK}, + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, } def test_classify_columns_yields_empty_mapping_for_an_absent_column() -> None: - # arrange: a classifier is registered for a column the addresses lack. + # arrange: a classifier is registered for a source column the addresses lack. 
addresses = AddressList([_make_unstandardised_address({"Walls": "cavity"})]) property_types = _StubColumnClassifier(result={}) # act result = _orchestrator( - {"Property Type": property_types} + [_column("property_type", "Property Type", property_types)] ).classify_columns(addresses) # assert: the absent column classified an empty description set. - assert result == {"Property Type": {}} + assert result == {"property_type": {}} assert property_types.received == set() + + +def test_classify_columns_runs_two_classifiers_against_a_shared_source_column() -> None: + # arrange: the "Property Type" landlord column feeds two classifiers -- + # PropertyType (what kind of dwelling) and BuiltFormType (how it joins + # to neighbours). Both must run against the same description set; each + # result is keyed by its column's ``name``. + addresses = AddressList( + [_make_unstandardised_address({"Property Type": "semi-detached house"})] + ) + property_types = _StubColumnClassifier( + result={"semi-detached house": PropertyType.HOUSE} + ) + built_form_types = _StubColumnClassifier( + result={"semi-detached house": BuiltFormType.SEMI_DETACHED} + ) + + # act + result = _orchestrator( + [ + _column("property_type", "Property Type", property_types), + _column("built_form_type", "Property Type", built_form_types), + ] + ).classify_columns(addresses) + + # assert: both classifiers saw the same description set, and the two + # results live under their own ``name`` keys without colliding. 
+ assert property_types.received == {"semi-detached house"} + assert built_form_types.received == {"semi-detached house"} + assert result == { + "property_type": {"semi-detached house": PropertyType.HOUSE}, + "built_form_type": {"semi-detached house": BuiltFormType.SEMI_DETACHED}, + } + + +def test_classify_and_persist_writes_each_columns_mapping_to_its_own_repo() -> None: + # arrange: two columns with distinct repos -- the orchestrator must + # route each column's classifications to its own repo, not mix them. + addresses = AddressList( + [ + _make_unstandardised_address( + {"Property Type": "semi-detached", "Walls": "solid brick"} + ), + ] + ) + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column( + "property_type", + "Property Type", + _StubColumnClassifier({"semi-detached": PropertyType.HOUSE}), + property_type_repo, + ), + _column( + "wall_type", + "Walls", + _StubColumnClassifier({"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}), + wall_type_repo, + ), + ] + + # act + result = _orchestrator(columns).classify_and_persist(addresses, portfolio_id=42) + + # assert: each repo received exactly its own column's mapping, under the + # given portfolio_id, and the return value mirrors classify_columns. + assert property_type_repo.calls == [(42, {"semi-detached": PropertyType.HOUSE})] + assert wall_type_repo.calls == [ + (42, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}) + ] + assert result == { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + + +def test_classify_and_persist_skips_upsert_for_a_column_absent_from_the_batch() -> None: + # arrange: ``Walls`` is registered but the address has no ``Walls`` column. 
+ # The orchestrator should still classify (yielding an empty mapping) but + # must NOT call ``upsert_all`` -- an empty bulk insert is a noisy no-op. + addresses = AddressList( + [_make_unstandardised_address({"Property Type": "semi-detached"})] + ) + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column( + "property_type", + "Property Type", + _StubColumnClassifier({"semi-detached": PropertyType.HOUSE}), + property_type_repo, + ), + _column( + "wall_type", + "Walls", + _StubColumnClassifier({}), + wall_type_repo, + ), + ] + + # act + _orchestrator(columns).classify_and_persist(addresses, portfolio_id=7) + + # assert: Property Type wrote; Walls did not. + assert property_type_repo.calls == [(7, {"semi-detached": PropertyType.HOUSE})] + assert wall_type_repo.calls == [] diff --git a/tests/repositories/landlord_overrides/__init__.py b/tests/repositories/landlord_overrides/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/landlord_overrides/postgres/__init__.py b/tests/repositories/landlord_overrides/postgres/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/landlord_overrides/postgres/test_landlord_property_type_override_postgres_repository.py b/tests/repositories/landlord_overrides/postgres/test_landlord_property_type_override_postgres_repository.py new file mode 100644 index 00000000..9154b664 --- /dev/null +++ b/tests/repositories/landlord_overrides/postgres/test_landlord_property_type_override_postgres_repository.py @@ -0,0 +1,147 @@ +"""Integration tests for the source-aware upsert policy. + +The conflict policy lives entirely in SQL (``INSERT ... ON CONFLICT +... DO UPDATE ... WHERE existing.source = 'classifier'``). 
The only way to +verify it correctly distinguishes ``EXCLUDED.source`` from the qualified +``landlord_property_type_overrides.source`` is against a real Postgres -- +the ``db_engine`` fixture in ``tests/conftest.py`` spins one up per test. +""" + +from __future__ import annotations + +from collections.abc import Iterator + +import pytest +from sqlalchemy import Engine +from sqlmodel import Session, select + +from domain.landlord_description_overrides.property_type import PropertyType +from infrastructure.postgres.landlord_override_enums import OverrideSource +from infrastructure.postgres.landlord_property_type_override_postgres_repository import ( + LandlordPropertyTypeOverridePostgresRepository, +) +from infrastructure.postgres.landlord_property_type_override_table import ( + LandlordPropertyTypeOverrideRow, +) + + +@pytest.fixture +def session(db_engine: Engine) -> Iterator[Session]: + with Session(db_engine) as s: + yield s + + +def _select_row( + session: Session, portfolio_id: int, description: str +) -> LandlordPropertyTypeOverrideRow: + rows = session.exec( + select(LandlordPropertyTypeOverrideRow).where( + LandlordPropertyTypeOverrideRow.portfolio_id == portfolio_id, + LandlordPropertyTypeOverrideRow.description == description, + ) + ).all() + assert len(rows) == 1, f"expected exactly one row, got {len(rows)}" + return rows[0] + + +def test_inserts_a_fresh_row_with_source_classifier(session: Session) -> None: + # arrange + repo = LandlordPropertyTypeOverridePostgresRepository(session) + + # act + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": PropertyType.HOUSE}) + session.commit() + + # assert + row = _select_row(session, portfolio_id=1, description="cosy") + assert row.value is PropertyType.HOUSE + assert row.source == OverrideSource.CLASSIFIER + + +def test_reupsert_overwrites_a_classifier_row(session: Session) -> None: + # arrange: a stale classifier row exists. 
+ repo = LandlordPropertyTypeOverridePostgresRepository(session) + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": PropertyType.FLAT}) + session.commit() + + # act: re-classify with a different category. + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": PropertyType.HOUSE}) + session.commit() + + # assert: the new classification wins. + row = _select_row(session, portfolio_id=1, description="cosy") + assert row.value is PropertyType.HOUSE + assert row.source == OverrideSource.CLASSIFIER + + +def test_reupsert_does_not_overwrite_a_user_row(session: Session) -> None: + # arrange: a user has corrected the row to ``BUNGALOW``. The classifier + # path never produces ``source = 'user'``; we install the row directly + # to mimic the override frontend. + user_row = LandlordPropertyTypeOverrideRow( + portfolio_id=1, + description="cosy", + value=PropertyType.BUNGALOW, + source=OverrideSource.USER, + ) + session.add(user_row) + session.commit() + + # act: the classifier re-runs and tries to classify the same description + # as a ``HOUSE``. Under the source-aware conflict policy, this must be + # silently skipped -- user edits beat classifier reruns. + repo = LandlordPropertyTypeOverridePostgresRepository(session) + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": PropertyType.HOUSE}) + session.commit() + + # assert: the user row is unchanged. + row = _select_row(session, portfolio_id=1, description="cosy") + assert row.value is PropertyType.BUNGALOW + assert row.source == OverrideSource.USER + + +def test_upsert_keeps_other_portfolios_descriptions_independent( + session: Session, +) -> None: + # arrange: the unique key is ``(portfolio_id, description)``, so the same + # description for two different portfolios must coexist as two rows. 
+ repo = LandlordPropertyTypeOverridePostgresRepository(session) + + # act + repo.upsert_all(portfolio_id=1, descriptions_to_values={"cosy": PropertyType.HOUSE}) + repo.upsert_all(portfolio_id=2, descriptions_to_values={"cosy": PropertyType.FLAT}) + session.commit() + + # assert: both rows survive with their own values. + assert _select_row(session, 1, "cosy").value is PropertyType.HOUSE + assert _select_row(session, 2, "cosy").value is PropertyType.FLAT + + +def test_upsert_persists_unknown_so_a_user_can_resolve_it_later( + session: Session, +) -> None: + # arrange / act: a description the classifier couldn't resolve still + # lands -- per ADR-0002 §5 / ADR-0003 §Decision, so a future user + # override can upgrade it to a real value. + repo = LandlordPropertyTypeOverridePostgresRepository(session) + repo.upsert_all( + portfolio_id=1, + descriptions_to_values={"unparseable nonsense": PropertyType.UNKNOWN}, + ) + session.commit() + + # assert: the row exists with value=UNKNOWN, source=classifier. + row = _select_row(session, portfolio_id=1, description="unparseable nonsense") + assert row.value is PropertyType.UNKNOWN + assert row.source == OverrideSource.CLASSIFIER + + +def test_upsert_all_with_empty_mapping_is_a_no_op(session: Session) -> None: + # arrange / act + repo = LandlordPropertyTypeOverridePostgresRepository(session) + repo.upsert_all(portfolio_id=1, descriptions_to_values={}) + session.commit() + + # assert: nothing was inserted. 
+ rows = session.exec(select(LandlordPropertyTypeOverrideRow)).all() + assert rows == [] diff --git a/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py b/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py new file mode 100644 index 00000000..2aae83dd --- /dev/null +++ b/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py @@ -0,0 +1,158 @@ +"""Integration tests for the source-aware upsert policy on the WallType table. + +Mirror of ``test_landlord_property_type_override_postgres_repository.py`` -- +the SQL is structurally identical, but the conflict policy lives in two +separate concrete adapters and so warrants two parallel test suites until +(if) the adapters are factored through a shared upsert helper. +""" + +from __future__ import annotations + +from collections.abc import Iterator + +import pytest +from sqlalchemy import Engine +from sqlmodel import Session, select + +from domain.landlord_description_overrides.wall_type import WallType +from infrastructure.postgres.landlord_override_enums import OverrideSource +from infrastructure.postgres.landlord_wall_type_override_postgres_repository import ( + LandlordWallTypeOverridePostgresRepository, +) +from infrastructure.postgres.landlord_wall_type_override_table import ( + LandlordWallTypeOverrideRow, +) + + +@pytest.fixture +def session(db_engine: Engine) -> Iterator[Session]: + with Session(db_engine) as s: + yield s + + +def _select_row( + session: Session, portfolio_id: int, description: str +) -> LandlordWallTypeOverrideRow: + rows = session.exec( + select(LandlordWallTypeOverrideRow).where( + LandlordWallTypeOverrideRow.portfolio_id == portfolio_id, + LandlordWallTypeOverrideRow.description == description, + ) + ).all() + assert len(rows) == 1, f"expected exactly one row, got {len(rows)}" + return rows[0] + + +def 
test_inserts_a_fresh_row_with_source_classifier(session: Session) -> None: + # arrange + repo = LandlordWallTypeOverridePostgresRepository(session) + + # act + repo.upsert_all( + portfolio_id=1, descriptions_to_values={"cavity insulated": WallType.CAVITY} + ) + session.commit() + + # assert + row = _select_row(session, portfolio_id=1, description="cavity insulated") + assert row.value is WallType.CAVITY + assert row.source == OverrideSource.CLASSIFIER + + +def test_reupsert_overwrites_a_classifier_row(session: Session) -> None: + # arrange: a stale classifier row exists. + repo = LandlordWallTypeOverridePostgresRepository(session) + repo.upsert_all( + portfolio_id=1, descriptions_to_values={"old red brick": WallType.CAVITY} + ) + session.commit() + + # act: re-classify with a different category. + repo.upsert_all( + portfolio_id=1, descriptions_to_values={"old red brick": WallType.SOLID_BRICK} + ) + session.commit() + + # assert: the new classification wins. + row = _select_row(session, portfolio_id=1, description="old red brick") + assert row.value is WallType.SOLID_BRICK + assert row.source == OverrideSource.CLASSIFIER + + +def test_reupsert_does_not_overwrite_a_user_row(session: Session) -> None: + # arrange: a user has corrected the row to ``SANDSTONE``. The classifier + # path never produces ``source = 'user'``; we install the row directly + # to mimic the override frontend. + user_row = LandlordWallTypeOverrideRow( + portfolio_id=1, + description="old red brick", + value=WallType.SANDSTONE, + source=OverrideSource.USER, + ) + session.add(user_row) + session.commit() + + # act: the classifier re-runs and tries to classify the same description + # as ``SOLID_BRICK``. Under the source-aware conflict policy, this must + # be silently skipped -- user edits beat classifier reruns. 
+ repo = LandlordWallTypeOverridePostgresRepository(session) + repo.upsert_all( + portfolio_id=1, descriptions_to_values={"old red brick": WallType.SOLID_BRICK} + ) + session.commit() + + # assert: the user row is unchanged. + row = _select_row(session, portfolio_id=1, description="old red brick") + assert row.value is WallType.SANDSTONE + assert row.source == OverrideSource.USER + + +def test_upsert_keeps_other_portfolios_descriptions_independent( + session: Session, +) -> None: + # arrange / act: the unique key is ``(portfolio_id, description)``, so the + # same description for two different portfolios must coexist as two rows. + repo = LandlordWallTypeOverridePostgresRepository(session) + repo.upsert_all( + portfolio_id=1, descriptions_to_values={"old red brick": WallType.CAVITY} + ) + repo.upsert_all( + portfolio_id=2, descriptions_to_values={"old red brick": WallType.SOLID_BRICK} + ) + session.commit() + + # assert: both rows survive with their own values. + assert _select_row(session, 1, "old red brick").value is WallType.CAVITY + assert _select_row(session, 2, "old red brick").value is WallType.SOLID_BRICK + + +def test_upsert_persists_unknown_so_a_user_can_resolve_it_later( + session: Session, +) -> None: + # arrange / act: a description the classifier couldn't resolve still + # lands -- per ADR-0002 §5 / ADR-0003 §Decision, so a future user + # override can upgrade it to a real value. + repo = LandlordWallTypeOverridePostgresRepository(session) + repo.upsert_all( + portfolio_id=1, + descriptions_to_values={"unparseable wall description": WallType.UNKNOWN}, + ) + session.commit() + + # assert: the row exists with value=UNKNOWN, source=classifier. 
+ row = _select_row( + session, portfolio_id=1, description="unparseable wall description" + ) + assert row.value is WallType.UNKNOWN + assert row.source == OverrideSource.CLASSIFIER + + +def test_upsert_all_with_empty_mapping_is_a_no_op(session: Session) -> None: + # arrange / act + repo = LandlordWallTypeOverridePostgresRepository(session) + repo.upsert_all(portfolio_id=1, descriptions_to_values={}) + session.commit() + + # assert: nothing was inserted. + rows = session.exec(select(LandlordWallTypeOverrideRow)).all() + assert rows == [] From 36f4c32904a40f76e7c07a153cc96c41c925ebe6 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 26 May 2026 16:18:26 +0000 Subject: [PATCH 23/29] added roofs --- .../landlord_description_overrides/handler.py | 20 +++- .../wall_type.py | 93 ++++++++++++++----- .../wall_type_construction_dates.py | 72 ++++++++++++++ .../chatgpt/chatgpt_column_classifier.py | 19 +++- ..._roof_type_override_postgres_repository.py | 80 ++++++++++++++++ .../landlord_roof_type_override_table.py | 69 ++++++++++++++ playground.py | 2 +- .../chatgpt/test_chatgpt_column_classifier.py | 54 ++++++++++- 8 files changed, 378 insertions(+), 31 deletions(-) create mode 100644 domain/landlord_description_overrides/wall_type_construction_dates.py create mode 100644 infrastructure/postgres/landlord_roof_type_override_postgres_repository.py create mode 100644 infrastructure/postgres/landlord_roof_type_override_table.py diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index ff16925e..7b7b60af 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -11,7 +11,11 @@ from applications.landlord_description_overrides.landlord_description_overrides_ from domain.addresses.unstandardised_address import AddressList from domain.landlord_description_overrides.built_form_type import BuiltFormType from 
domain.landlord_description_overrides.property_type import PropertyType +from domain.landlord_description_overrides.roof_type import RoofType from domain.landlord_description_overrides.wall_type import WallType +from domain.landlord_description_overrides.wall_type_construction_dates import ( + wall_type_construction_date_prompt_hint, +) from infrastructure.chatgpt.chatgpt import ChatGPT from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier from infrastructure.postgres.config import PostgresConfig @@ -22,6 +26,9 @@ from infrastructure.postgres.landlord_built_form_type_override_postgres_reposito from infrastructure.postgres.landlord_property_type_override_postgres_repository import ( LandlordPropertyTypeOverridePostgresRepository, ) +from infrastructure.postgres.landlord_roof_type_override_postgres_repository import ( + LandlordRoofTypeOverridePostgresRepository, +) from infrastructure.postgres.landlord_wall_type_override_postgres_repository import ( LandlordWallTypeOverridePostgresRepository, ) @@ -98,10 +105,21 @@ def handler( name="wall_type", source_column="Walls", classifier=ChatGptColumnClassifier( - chat_gpt, WallType, WallType.UNKNOWN + chat_gpt, + WallType, + WallType.UNKNOWN, + extra_instructions=wall_type_construction_date_prompt_hint(), ), repo=LandlordWallTypeOverridePostgresRepository(session), ), + ClassifiableColumn( + name="roof_type", + source_column="Roofs", + classifier=ChatGptColumnClassifier( + chat_gpt, RoofType, RoofType.UNKNOWN + ), + repo=LandlordRoofTypeOverridePostgresRepository(session), + ), ] orchestrator = LandlordDescriptionOverridesOrchestrator( diff --git a/domain/landlord_description_overrides/wall_type.py b/domain/landlord_description_overrides/wall_type.py index 42b90da6..1466f82d 100644 --- a/domain/landlord_description_overrides/wall_type.py +++ b/domain/landlord_description_overrides/wall_type.py @@ -13,40 +13,83 @@ class WallType(Enum): """ CAVITY_FILLED = "Cavity wall, filled cavity" - 
CAVITY_AS_BUILT_INSULATED_ASSUMED = "Cavity wall, as built, insulated (assumed)" - CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = "Cavity wall, as built, no insulation (assumed)" - CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Cavity wall, as built, partial insulation (assumed)" + CAVITY_AS_BUILT_INSULATED_ASSUMED = ( + "Cavity wall, as built, insulated (assumed)" # 1983 - 1990 + ) + CAVITY_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Cavity wall, as built, no insulation (assumed)" # Pre-1975 + ) + + CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Cavity wall, as built, partial insulation (assumed)" # 1976 - 1982 + ) CAVITY_WITH_INTERNAL_INSULATION = "Cavity wall, with internal insulation" CAVITY_WITH_EXTERNAL_INSULATION = "Cavity wall, with external insulation" - CAVITY_FILLED_AND_INTERNAL_INSULATION = "Cavity wall, filled cavity and internal insulation" - CAVITY_FILLED_AND_EXTERNAL_INSULATION = "Cavity wall, filled cavity and external insulation" + CAVITY_FILLED_AND_INTERNAL_INSULATION = ( + "Cavity wall, filled cavity and internal insulation" + ) + CAVITY_FILLED_AND_EXTERNAL_INSULATION = ( + "Cavity wall, filled cavity and external insulation" + ) - SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = "Solid brick, as built, no insulation (assumed)" - SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = "Solid brick, as built, insulated (assumed)" - SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Solid brick, as built, partial insulation (assumed)" + SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Solid brick, as built, no insulation (assumed)" + ) + SOLID_BRICK_AS_BUILT_INSULATED_ASSUMED = ( + "Solid brick, as built, insulated (assumed)" + ) + SOLID_BRICK_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Solid brick, as built, partial insulation (assumed)" + ) SOLID_BRICK_WITH_INTERNAL_INSULATION = "Solid brick, with internal insulation" SOLID_BRICK_WITH_EXTERNAL_INSULATION = "Solid brick, with external insulation" - TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = "Timber frame, as built, no 
insulation (assumed)" - TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = "Timber frame, as built, insulated (assumed)" - TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Timber frame, as built, partial insulation (assumed)" + TIMBER_FRAME_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Timber frame, as built, no insulation (assumed)" + ) + TIMBER_FRAME_AS_BUILT_INSULATED_ASSUMED = ( + "Timber frame, as built, insulated (assumed)" + ) + TIMBER_FRAME_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Timber frame, as built, partial insulation (assumed)" + ) TIMBER_FRAME_WITH_ADDITIONAL_INSULATION = "Timber frame, with additional insulation" - SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = "Sandstone, as built, no insulation (assumed)" + SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Sandstone, as built, no insulation (assumed)" + ) SANDSTONE_AS_BUILT_INSULATED_ASSUMED = "Sandstone, as built, insulated (assumed)" - SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Sandstone, as built, partial insulation (assumed)" + SANDSTONE_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Sandstone, as built, partial insulation (assumed)" + ) SANDSTONE_WITH_INTERNAL_INSULATION = "Sandstone, with internal insulation" SANDSTONE_WITH_EXTERNAL_INSULATION = "Sandstone, with external insulation" - GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = "Granite or whin, as built, no insulation (assumed)" - GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = "Granite or whin, as built, insulated (assumed)" - GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "Granite or whin, as built, partial insulation (assumed)" - GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = "Granite or whin, with internal insulation" - GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = "Granite or whin, with external insulation" + GRANITE_OR_WHIN_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Granite or whin, as built, no insulation (assumed)" + ) + GRANITE_OR_WHIN_AS_BUILT_INSULATED_ASSUMED = ( + "Granite or whin, as built, insulated (assumed)" + ) + 
GRANITE_OR_WHIN_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "Granite or whin, as built, partial insulation (assumed)" + ) + GRANITE_OR_WHIN_WITH_INTERNAL_INSULATION = ( + "Granite or whin, with internal insulation" + ) + GRANITE_OR_WHIN_WITH_EXTERNAL_INSULATION = ( + "Granite or whin, with external insulation" + ) - SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = "System built, as built, no insulation (assumed)" - SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = "System built, as built, insulated (assumed)" - SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = "System built, as built, partial insulation (assumed)" + SYSTEM_BUILT_AS_BUILT_NO_INSULATION_ASSUMED = ( + "System built, as built, no insulation (assumed)" + ) + SYSTEM_BUILT_AS_BUILT_INSULATED_ASSUMED = ( + "System built, as built, insulated (assumed)" + ) + SYSTEM_BUILT_AS_BUILT_PARTIAL_INSULATION_ASSUMED = ( + "System built, as built, partial insulation (assumed)" + ) SYSTEM_BUILT_WITH_INTERNAL_INSULATION = "System built, with internal insulation" SYSTEM_BUILT_WITH_EXTERNAL_INSULATION = "System built, with external insulation" @@ -59,8 +102,12 @@ class WallType(Enum): COB_WITH_EXTERNAL_INSULATION = "Cob, with external insulation" CURTAIN_WALL = "Curtain wall" - CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = "Curtain Wall, as built, no insulation (assumed)" - CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = "Curtain Wall, as built, insulated (assumed)" + CURTAIN_WALL_AS_BUILT_NO_INSULATION_ASSUMED = ( + "Curtain Wall, as built, no insulation (assumed)" + ) + CURTAIN_WALL_AS_BUILT_INSULATED_ASSUMED = ( + "Curtain Wall, as built, insulated (assumed)" + ) CURTAIN_WALL_FILLED = "Curtain Wall, filled cavity" CURTAIN_WALL_WITH_INTERNAL_INSULATION = "Curtain Wall, with internal insulation" diff --git a/domain/landlord_description_overrides/wall_type_construction_dates.py b/domain/landlord_description_overrides/wall_type_construction_dates.py new file mode 100644 index 00000000..4cd869b3 --- /dev/null +++ 
b/domain/landlord_description_overrides/wall_type_construction_dates.py @@ -0,0 +1,72 @@ +"""Construction-date metadata for the "assumed" ``WallType`` variants. + +The ``(assumed)`` variants of ``WallType`` are what RdSAP picks when a +surveyor has no direct observation and falls back to the typical wall make-up +for a property's build era. The era boundaries reflect UK Building +Regulations milestones for cavity-wall insulation: + +* up to 1975 -- no cavity insulation requirement +* 1976-1982 -- partial-fill cavity (early insulation requirement) +* 1983-1990 -- full-fill cavity (insulation required) + +Captured here as a structured lookup so: + +* the LLM prompt builder can render the ranges as a hint, helping the + classifier resolve era-implying landlord descriptions to the right + ``(assumed)`` variant; +* future date-aware paths (a deterministic year-to-variant shortcut, a + date-keyed repo) can read from the same source instead of duplicating + the knowledge. + +Only the variants where we have a defensible era boundary appear here; the +remaining ``(assumed)`` members are left out rather than guessed. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Mapping, Optional + +from domain.landlord_description_overrides.wall_type import WallType + + +@dataclass(frozen=True) +class YearRange: + """An inclusive year range. 
``None`` on either end means "no bound".""" + + start: Optional[int] = None + end: Optional[int] = None + + def __str__(self) -> str: + if self.start is None and self.end is not None: + return f"pre-{self.end + 1}" + if self.start is not None and self.end is None: + return f"{self.start}+" + return f"{self.start}-{self.end}" + + +WALL_TYPE_CONSTRUCTION_YEARS: Mapping[WallType, YearRange] = { + WallType.CAVITY_AS_BUILT_NO_INSULATION_ASSUMED: YearRange(end=1975), + WallType.CAVITY_AS_BUILT_PARTIAL_INSULATION_ASSUMED: YearRange( + start=1976, end=1982 + ), + WallType.CAVITY_AS_BUILT_INSULATED_ASSUMED: YearRange(start=1983, end=1990), +} + + +def wall_type_construction_date_prompt_hint() -> str: + """Render the date metadata as a prompt fragment for the LLM classifier. + + The fragment lists each (variant, year range) pair so the model can + prefer the era-matching ``(assumed)`` variant when a landlord + description carries era information (e.g. "1970s semi", "built before + the war"). + """ + lines = [ + f"- {wall_type.value!r}: typically built {year_range}" + for wall_type, year_range in WALL_TYPE_CONSTRUCTION_YEARS.items() + ] + return ( + "When the description carries construction-era information, prefer " + "the category whose typical build year matches:\n" + "\n".join(lines) + ) diff --git a/infrastructure/chatgpt/chatgpt_column_classifier.py b/infrastructure/chatgpt/chatgpt_column_classifier.py index b23e7c2e..2ce66299 100644 --- a/infrastructure/chatgpt/chatgpt_column_classifier.py +++ b/infrastructure/chatgpt/chatgpt_column_classifier.py @@ -2,7 +2,7 @@ from __future__ import annotations import json from enum import Enum -from typing import Any, TypeVar +from typing import Any, Optional, TypeVar from domain.landlord_description_overrides.column_classifier import ( ClassificationError, @@ -27,10 +27,16 @@ class ChatGptColumnClassifier(ColumnClassifier[E]): chat_gpt: ChatGPT, category_enum: type[E], unknown: E, + extra_instructions: Optional[str] = None, ) -> 
None: self._chat_gpt = chat_gpt self._category_enum = category_enum self._unknown = unknown + # Free-form column-specific guidance appended to the system prompt + # ahead of the JSON-output instruction. Lets each column ship its + # own hints (e.g. wall-type construction-era ranges) without the + # generic classifier knowing what they are. + self._extra_instructions = extra_instructions def classify(self, descriptions: set[str]) -> dict[str, E]: if not descriptions: @@ -62,12 +68,17 @@ class ChatGptColumnClassifier(ColumnClassifier[E]): for member in self._category_enum if member is not self._unknown ) - return ( - "Classify each free-text description into exactly one category. " - f"Categories: {categories}. " + parts = [ + "Classify each free-text description into exactly one category. ", + f"Categories: {categories}. ", + ] + if self._extra_instructions: + parts.append(self._extra_instructions + " ") + parts.append( "Reply with only a JSON object mapping each original description " "to its category, and nothing else." ) + return "".join(parts) def _to_category(self, value: Any) -> E: """Map a reply value to a category member, defaulting to UNKNOWN.""" diff --git a/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py b/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py new file mode 100644 index 00000000..b5b570bc --- /dev/null +++ b/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py @@ -0,0 +1,80 @@ +"""Postgres adapter for ``LandlordOverrideRepository[RoofType]``. + +Writes to ``landlord_roof_type_overrides`` (Drizzle-managed; mirrored by +``LandlordRoofTypeOverrideRow``). The conflict policy lives in the SQL -- +see ADR-0003 §Decision. Shape mirrors +``LandlordPropertyTypeOverridePostgresRepository``; the duplication is +deliberate while there are only a handful of override columns -- if the +duplication becomes painful, extract a shared upsert helper then. 
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import cast + +from sqlalchemy import Table +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlmodel import Session + +from domain.landlord_description_overrides.roof_type import RoofType +from infrastructure.postgres.landlord_override_enums import OverrideSource +from infrastructure.postgres.landlord_roof_type_override_table import ( + LandlordRoofTypeOverrideRow, +) +from repositories.landlord_overrides.landlord_override_repository import ( + LandlordOverrideRepository, +) + + +class LandlordRoofTypeOverridePostgresRepository( + LandlordOverrideRepository[RoofType] +): + def __init__(self, session: Session) -> None: + self._session = session + + def upsert_all( + self, + portfolio_id: int, + descriptions_to_values: dict[str, RoofType], + ) -> None: + if not descriptions_to_values: + return + + now = datetime.now(timezone.utc) + rows = [ + { + "portfolio_id": portfolio_id, + "description": description, + "value": value.value, + "source": OverrideSource.CLASSIFIER, + "created_at": now, + "updated_at": now, + } + for description, value in descriptions_to_values.items() + ] + + # SQLModel's class-level ``__table__`` is injected at runtime on + # ``table=True`` classes but isn't exposed by the stubs; pin it to + # ``Table`` via ``getattr`` so the dialect insert helper below + # carries through with strict types. + table: Table = cast(Table, getattr(LandlordRoofTypeOverrideRow, "__table__")) + stmt = pg_insert(table).values(rows) + + # The classifier may refresh its own past output, but must never + # overwrite a user correction -- the ``WHERE existing.source = + # 'classifier'`` guard enforces that. See ADR-0003 §Decision. 
+ stmt = stmt.on_conflict_do_update( + index_elements=["portfolio_id", "description"], + set_={ + "value": stmt.excluded.value, + "source": stmt.excluded.source, + "updated_at": stmt.excluded.updated_at, + }, + where=table.c.source == OverrideSource.CLASSIFIER, + ) + + # SQLModel re-exports SQLAlchemy's ``Session.execute``; one of the + # overload signatures is marked deprecated in stubs, which fires + # here even though our INSERT path is the supported one. + self._session.execute(stmt) # pyright: ignore[reportDeprecated] diff --git a/infrastructure/postgres/landlord_roof_type_override_table.py b/infrastructure/postgres/landlord_roof_type_override_table.py new file mode 100644 index 00000000..f0cea945 --- /dev/null +++ b/infrastructure/postgres/landlord_roof_type_override_table.py @@ -0,0 +1,69 @@ +"""SQLModel mirror of the ``landlord_roof_type_overrides`` Drizzle table. + +The schema source of truth lives in the ``assessment-model`` TS repo +(`src/app/db/schema/landlord_overrides.ts`). The migrations are owned there; +this row class only mirrors the columns so the Python lambda can read/write. +See ADR-0003. Shape mirrors ``LandlordPropertyTypeOverrideRow`` -- the only +differences are the table name, the ``roof_type`` pgEnum on ``value``, and +the unique-constraint name. 
+""" + +from datetime import datetime, timezone +from typing import ClassVar +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, UniqueConstraint +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.landlord_description_overrides.roof_type import RoofType +from infrastructure.postgres.landlord_override_enums import override_source_sa_enum + + +class LandlordRoofTypeOverrideRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "landlord_roof_type_overrides" # pyright: ignore[reportIncompatibleVariableOverride] + __table_args__: ClassVar[tuple[UniqueConstraint, ...]] = ( # pyright: ignore[reportIncompatibleVariableOverride] + UniqueConstraint( + "portfolio_id", + "description", + name="landlord_roof_type_overrides_portfolio_description_unique", + ), + ) + + id: UUID = Field(default_factory=uuid4, primary_key=True) + + # bigint to match the Drizzle ``portfolio_id`` FK; SQLModel's default int + # mapping is 32-bit Integer and would overflow once portfolio IDs exceed + # 2^31. The FK to ``portfolio.id`` is enforced by the Drizzle migration, + # not declared here -- the ``portfolio`` table is not modelled in Python. + portfolio_id: int = Field( + sa_column=Column(BigInteger, nullable=False, index=True), + ) + + description: str = Field(nullable=False) + + value: RoofType = Field( + sa_column=Column( + SAEnum( + RoofType, + name="roof_type", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=False, + ), + ) + + # Shared SAEnum -- see ``landlord_override_enums`` for why this single + # instance is reused by every ``landlord_*_overrides`` row class. 
+ source: str = Field( + sa_column=Column(override_source_sa_enum, nullable=False), + ) + + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + nullable=False, + ) diff --git a/playground.py b/playground.py index d116dcf9..5e9001e1 100644 --- a/playground.py +++ b/playground.py @@ -46,7 +46,7 @@ def main() -> int: print(f" - {c}") return 0 - column = "roof_description" + column = "wall " series = df[column] if args.keep_na else df[column].dropna() for value in series.unique(): print(value) diff --git a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py index 8a07ecec..4cdf4dfe 100644 --- a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py +++ b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py @@ -23,11 +23,13 @@ class _FakeChatGPT(ChatGPT): error: Optional[Exception] = None, ) -> None: self.prompts: list[str] = [] + self.system_prompts: list[Optional[str]] = [] self._reply = reply self._error = error def generate(self, prompt: str, system_prompt: Optional[str] = None) -> str: self.prompts.append(prompt) + self.system_prompts.append(system_prompt) if self._error is not None: raise self._error return self._reply @@ -125,11 +127,59 @@ def test_empty_description_set_returns_empty_without_calling_chatgpt() -> None: def test_classifies_with_a_different_category_enum() -> None: # Arrange: the same adapter classifies a WallType column. 
- chat_gpt = _FakeChatGPT(reply='{"solid brick wall": "Solid Brick"}') + chat_gpt = _FakeChatGPT( + reply='{"solid brick wall": "Solid brick, as built, no insulation (assumed)"}' + ) classifier = ChatGptColumnClassifier(chat_gpt, WallType, WallType.UNKNOWN) # Act result = classifier.classify({"solid brick wall"}) # Assert - assert result == {"solid brick wall": WallType.SOLID_BRICK} + assert result == { + "solid brick wall": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED + } + + +def test_extra_instructions_are_appended_to_the_system_prompt() -> None: + # Arrange: column-specific guidance (e.g. wall-type build-era hints) + # should reach the model verbatim, in the system prompt ahead of the + # JSON-output instruction. + chat_gpt = _FakeChatGPT(reply='{"1970s semi": "House"}') + classifier = ChatGptColumnClassifier( + chat_gpt, + PropertyType, + PropertyType.UNKNOWN, + extra_instructions="If the description carries a build decade, prefer X.", + ) + + # Act + classifier.classify({"1970s semi"}) + + # Assert: the hint sits in the system prompt, before the JSON instruction. + system_prompt = chat_gpt.system_prompts[0] + assert system_prompt is not None + assert "If the description carries a build decade, prefer X." in system_prompt + hint_index = system_prompt.index("If the description carries a build decade") + json_index = system_prompt.index("Reply with only a JSON object") + assert hint_index < json_index + + +def test_omitting_extra_instructions_leaves_the_system_prompt_unchanged() -> None: + # Arrange: a classifier without per-column guidance must still produce + # the original system prompt -- no trailing whitespace, no orphan hint. 
+ chat_gpt = _FakeChatGPT(reply='{"semi-detached": "House"}') + classifier = ChatGptColumnClassifier(chat_gpt, PropertyType, PropertyType.UNKNOWN) + + # Act + classifier.classify({"semi-detached"}) + + # Assert + system_prompt = chat_gpt.system_prompts[0] + assert system_prompt is not None + assert system_prompt == ( + "Classify each free-text description into exactly one category. " + "Categories: House, Bungalow, Flat, Maisonette, Park home. " + "Reply with only a JSON object mapping each original description " + "to its category, and nothing else." + ) From 99614820b98ed35c9a3c4e2e4d7c3d1e1b6216b3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 29 May 2026 10:41:46 +0000 Subject: [PATCH 24/29] made landlord overrides sqs --- .../landlord_description_overrides/handler.py | 161 +++++++++--------- ...lord_description_overrides_trigger_body.py | 4 + backend/app/bulk_uploads/router.py | 21 +++ backend/app/bulk_uploads/schema.py | 9 + backend/app/config.py | 1 + ...lord_description_overrides_orchestrator.py | 36 ++++ 6 files changed, 152 insertions(+), 80 deletions(-) diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 7b7b60af..801d1f12 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -1,14 +1,12 @@ import logging import os from typing import Any -from uuid import UUID import boto3 from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import ( LandlordDescriptionOverridesTriggerBody, ) -from domain.addresses.unstandardised_address import AddressList from domain.landlord_description_overrides.built_form_type import BuiltFormType from domain.landlord_description_overrides.property_type import PropertyType from domain.landlord_description_overrides.roof_type import RoofType @@ -33,36 +31,90 @@ from 
infrastructure.postgres.landlord_wall_type_override_postgres_repository imp LandlordWallTypeOverridePostgresRepository, ) from infrastructure.s3.csv_s3_client import CsvS3Client +from infrastructure.s3.s3_uri import parse_s3_uri from orchestration.classifiable_column import ClassifiableColumn from orchestration.landlord_description_overrides_orchestrator import ( LandlordDescriptionOverridesOrchestrator, ) +from orchestration.task_orchestrator import TaskOrchestrator from repositories.unstandardised_address.unstandardised_address_list_csv_s3_repository import ( UnstandardisedAddressListCsvS3Repository, ) +from utilities.aws_lambda.subtask_handler import subtask_handler logger = logging.getLogger(__name__) +def _build_columns( + column_mapping: dict[str, str], chat_gpt: ChatGPT, session: Any +) -> list[ClassifiableColumn[Any]]: + """One ClassifiableColumn per mapped category. + + ``column_mapping`` is ``{category -> source CSV header}``. One header may + feed several categories -- e.g. ``"Property Type"`` -> property_type and + built_form_type -- which falls out naturally because each is a separate + entry. Unknown categories are skipped. 
+ """ + factories = { + "property_type": lambda src: ClassifiableColumn( + name="property_type", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, PropertyType, PropertyType.UNKNOWN + ), + repo=LandlordPropertyTypeOverridePostgresRepository(session), + ), + "built_form_type": lambda src: ClassifiableColumn( + name="built_form_type", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, BuiltFormType, BuiltFormType.UNKNOWN + ), + repo=LandlordBuiltFormTypeOverridePostgresRepository(session), + ), + "wall_type": lambda src: ClassifiableColumn( + name="wall_type", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, + WallType, + WallType.UNKNOWN, + extra_instructions=wall_type_construction_date_prompt_hint(), + ), + repo=LandlordWallTypeOverridePostgresRepository(session), + ), + "roof_type": lambda src: ClassifiableColumn( + name="roof_type", + source_column=src, + classifier=ChatGptColumnClassifier( + chat_gpt, RoofType, RoofType.UNKNOWN + ), + repo=LandlordRoofTypeOverridePostgresRepository(session), + ), + } + + columns: list[ClassifiableColumn[Any]] = [] + for category, source_column in column_mapping.items(): + factory = factories.get(category) + if factory is None: + logger.warning("Unknown classifier category %r; skipping.", category) + continue + columns.append(factory(source_column)) + return columns + + +@subtask_handler() def handler( - body: dict[str, Any], - context: Any, -) -> dict[str, list[str]]: - # TODO: replace with ``LandlordDescriptionOverridesTriggerBody.model_validate(body)`` - # once this lambda is wired into the parent task pipeline via the SQS - # subtask envelope. Until then the trigger fields are hard-coded so the - # local invoker can exercise the full path. See ADR-0003 §Out of scope. 
- trigger = LandlordDescriptionOverridesTriggerBody( - task_id=UUID("00000000-0000-0000-0000-000000000001"), - sub_task_id=UUID("00000000-0000-0000-0000-000000000002"), - s3_uri="s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2 (1).csv", - portfolio_id=730, - ) + body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator +) -> dict[str, int]: + trigger = LandlordDescriptionOverridesTriggerBody.model_validate(body) - bucket = "retrofit-data-dev" + # The classifier reads the ORIGINAL upload (raw landlord headers), so the S3 + # bucket comes from the trigger URI rather than a fixed env var. + bucket, _key = parse_s3_uri(trigger.s3_uri) - # boto3.client is overloaded per-service in the installed stubs; cast - # to Any so the strict-mode checker treats it as opaque. + # boto3.client is overloaded per-service in the installed stubs; cast to Any + # so the strict-mode checker treats it as opaque. boto3_client: Any = ( boto3.client ) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] @@ -73,74 +125,23 @@ def handler( csv_client, bucket ) - # One transactional session per handler invocation: the context manager - # commits on clean exit and rolls back on exception, so the handler never - # invokes ``.commit()`` itself -- transaction semantics live in the - # infrastructure layer. + # Raw rows, not load_batch: the original upload carries the description + # columns but not the canonical address/postcode columns load_batch requires. + rows = csv_client.read_rows(trigger.s3_uri) + engine = make_engine(PostgresConfig.from_env(os.environ)) with transactional_session(engine) as session: chat_gpt = ChatGPT() - # The "Property Type" CSV column is read by two classifiers: the - # landlord's free-text (e.g. "semi-detached house") encodes both the - # dwelling kind (PropertyType) and how it joins to neighbours - # (BuiltFormType). Each classification lands in its own table. 
- columns: list[ClassifiableColumn[Any]] = [ - ClassifiableColumn( - name="property_type", - source_column="Property Type", - classifier=ChatGptColumnClassifier( - chat_gpt, PropertyType, PropertyType.UNKNOWN - ), - repo=LandlordPropertyTypeOverridePostgresRepository(session), - ), - ClassifiableColumn( - name="built_form_type", - source_column="Property Type", - classifier=ChatGptColumnClassifier( - chat_gpt, BuiltFormType, BuiltFormType.UNKNOWN - ), - repo=LandlordBuiltFormTypeOverridePostgresRepository(session), - ), - ClassifiableColumn( - name="wall_type", - source_column="Walls", - classifier=ChatGptColumnClassifier( - chat_gpt, - WallType, - WallType.UNKNOWN, - extra_instructions=wall_type_construction_date_prompt_hint(), - ), - repo=LandlordWallTypeOverridePostgresRepository(session), - ), - ClassifiableColumn( - name="roof_type", - source_column="Roofs", - classifier=ChatGptColumnClassifier( - chat_gpt, RoofType, RoofType.UNKNOWN - ), - repo=LandlordRoofTypeOverridePostgresRepository(session), - ), - ] - + columns = _build_columns(trigger.column_mapping, chat_gpt, session) orchestrator = LandlordDescriptionOverridesOrchestrator( unstandardised_address_repo=unstandardised_address_repo, columns=columns, ) - - addressList: AddressList = orchestrator.get_unstandardised_addresses( - input_s3_uri=trigger.s3_uri + classified = orchestrator.classify_and_persist_from_rows( + rows, portfolio_id=trigger.portfolio_id ) - # Cap the batch to the first 20 while the ChatGPT path is under test. - # Remove before wiring into the production subtask pipeline. 
- addressList = AddressList(addressList[:20]) - - classified = orchestrator.classify_and_persist( - addressList, portfolio_id=trigger.portfolio_id - ) - for column, mapping in classified.items(): - logger.info( - "Classified %d descriptions for column %r.", len(mapping), column - ) - - return {"hello": ["200"]} + counts = {name: len(mapping) for name, mapping in classified.items()} + for name, n in counts.items(): + logger.info("Classified %d descriptions for column %r.", n, name) + return counts diff --git a/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py b/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py index 9f78215e..0ca80ec3 100644 --- a/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py +++ b/applications/landlord_description_overrides/landlord_description_overrides_trigger_body.py @@ -13,3 +13,7 @@ class LandlordDescriptionOverridesTriggerBody(BaseModel): # Python ``int`` is unbounded so the Pydantic side stays simple; the # SQLModel row class pins the storage to ``BigInteger``. portfolio_id: int + # category -> source CSV header (the classifier subset of the upload + # mapping). Defaulted so a malformed/empty message classifies nothing + # rather than failing validation. 
+ column_mapping: dict[str, str] = {} diff --git a/backend/app/bulk_uploads/router.py b/backend/app/bulk_uploads/router.py index 9928b456..c050b18c 100644 --- a/backend/app/bulk_uploads/router.py +++ b/backend/app/bulk_uploads/router.py @@ -13,6 +13,7 @@ from backend.app.bulk_uploads.schema import ( CombinedResultsResponse, CombinerTriggerRequest, FlagsSummary, + LandlordOverridesTriggerRequest, PostcodeSplitterTriggerRequest, ) from backend.app.bulk_uploads.scoring import score_bucket @@ -92,6 +93,26 @@ async def trigger_combiner(req: CombinerTriggerRequest): } +@router.post("/trigger-landlord-overrides", status_code=202) +async def trigger_landlord_overrides(req: LandlordOverridesTriggerRequest): + settings = get_settings() + + try: + sqs = boto3.client("sqs", settings.AWS_DEFAULT_REGION) + response = sqs.send_message( + QueueUrl=settings.LANDLORD_OVERRIDES_SQS_URL, + MessageBody=req.model_dump_json(), + ) + except Exception as e: + raise HTTPException(status_code=500, detail=f"SQS error: {e}") + + return { + "task_id": req.task_id, + "sub_task_id": req.sub_task_id, + "sqs_message_id": response.get("MessageId"), + } + + @router.get("/{task_id}/combined-results", response_model=CombinedResultsResponse) async def get_combined_results( task_id: UUID, diff --git a/backend/app/bulk_uploads/schema.py b/backend/app/bulk_uploads/schema.py index ca3b39ea..af797cac 100644 --- a/backend/app/bulk_uploads/schema.py +++ b/backend/app/bulk_uploads/schema.py @@ -14,6 +14,15 @@ class CombinerTriggerRequest(BaseModel): sub_task_id: str +class LandlordOverridesTriggerRequest(BaseModel): + task_id: str + sub_task_id: str + s3_uri: str + portfolio_id: int + # category -> source CSV header (the classifier subset of the upload mapping) + column_mapping: dict[str, str] + + class FlagsSummary(BaseModel): duplicates: int missing: int diff --git a/backend/app/config.py b/backend/app/config.py index fcfb6d5b..f969518d 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -42,6 
+42,7 @@ class Settings(BaseSettings): MAGICPLAN_SQS_URL: str = "changeme" POSTCODE_SPLITTER_SQS_URL: str = "changeme" COMBINER_SQS_URL: str = "changeme" + LANDLORD_OVERRIDES_SQS_URL: str = "changeme" # Third parties EPC_AUTH_TOKEN: str = "changeme" diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py index 389d1afb..6203b8d5 100644 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -81,3 +81,39 @@ class LandlordDescriptionOverridesOrchestrator: continue column.repo.upsert_all(portfolio_id, mapping) return classified + + def classify_and_persist_from_rows( + self, rows: list[dict[str, str]], portfolio_id: int + ) -> dict[str, dict[str, Enum]]: + """Classify + persist straight from raw CSV rows. + + Unlike ``classify_and_persist``, this does not build an ``AddressList``, + so it has no canonical address/postcode requirement -- the classifier + only needs the raw description cells. Used when reading the original + landlord upload (raw headers) rather than the address-matching CSV. 
+ """ + col_to_desc = self._descriptions_from_rows(rows) + classified = { + column.name: column.classifier.classify( + col_to_desc.get(column.source_column, set()) + ) + for column in self._columns + } + for column in self._columns: + mapping = classified[column.name] + if not mapping: + continue + column.repo.upsert_all(portfolio_id, mapping) + return classified + + @staticmethod + def _descriptions_from_rows(rows: list[dict[str, str]]) -> dict[str, set[str]]: + mappings: dict[str, set[str]] = {} + for row in rows: + for key, value in row.items(): + bucket = mappings.setdefault(key, set()) + for variant in (value or "").split(","): + variant = variant.strip().lower() + if variant: + bucket.add(variant) + return mappings From 47dfe34ec062bfd884a451bc9b22e92f62c5c9d7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 29 May 2026 12:12:54 +0000 Subject: [PATCH 25/29] added landlord description overrides --- .github/workflows/_deploy_lambda.yml | 5 ++ .github/workflows/deploy_terraform.yml | 41 +++++++++++++++ .github/workflows/lambda_smoke_tests.yml | 10 ++++ .../landlord_description_overrides/Dockerfile | 2 +- .../requirements.txt | 1 + .../landlordDescriptionOverrides/main.tf | 50 +++++++++++++++++++ .../landlordDescriptionOverrides/outputs.tf | 9 ++++ .../landlordDescriptionOverrides/provider.tf | 16 ++++++ .../landlordDescriptionOverrides/variables.tf | 33 ++++++++++++ deployment/terraform/shared/main.tf | 41 ++++++++++++--- 10 files changed, 201 insertions(+), 7 deletions(-) create mode 100644 deployment/terraform/lambda/landlordDescriptionOverrides/main.tf create mode 100644 deployment/terraform/lambda/landlordDescriptionOverrides/outputs.tf create mode 100644 deployment/terraform/lambda/landlordDescriptionOverrides/provider.tf create mode 100644 deployment/terraform/lambda/landlordDescriptionOverrides/variables.tf diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 0d702155..70f9eabe 100644 --- 
a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -92,6 +92,9 @@ on: TF_VAR_magicplan_api_key: required: false + + TF_VAR_openai_api_key: + required: false jobs: deploy: runs-on: ubuntu-latest @@ -163,6 +166,7 @@ jobs: TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }} TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }} TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }} + TF_VAR_openai_api_key: ${{ secrets.TF_VAR_openai_api_key }} run: | ECR_REPO_URL_VAR="" if [[ -n "${{ inputs.ecr_repo }}" ]]; then @@ -213,6 +217,7 @@ jobs: TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }} TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }} TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }} + TF_VAR_openai_api_key: ${{ secrets.TF_VAR_openai_api_key }} run: | EXTRA_VARS="" if [[ -n "${{ inputs.ecr_repo }}" ]]; then diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 7f2eb890..fc999bc0 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -203,6 +203,47 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + # ============================================================ + # Build Landlord Description Overrides image and Push + # ============================================================ + landlordDescriptionOverrides_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: landlord_description_overrides-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: applications/landlord_description_overrides/Dockerfile + build_context: . 
+ build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + + # ============================================================ + # Deploy Landlord Description Overrides Lambda + # ============================================================ + landlordDescriptionOverrides_lambda: + needs: [landlordDescriptionOverrides_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: landlordDescriptionOverrides + lambda_path: deployment/terraform/lambda/landlordDescriptionOverrides + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: landlord_description_overrides-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.landlordDescriptionOverrides_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + TF_VAR_openai_api_key: ${{ secrets.DEV_OPENAI_API_KEY }} + # ============================================================ # Build Bulk Address2UPRN Combiner image and Push # ============================================================ diff --git a/.github/workflows/lambda_smoke_tests.yml b/.github/workflows/lambda_smoke_tests.yml index b562f91e..44288821 100644 --- a/.github/workflows/lambda_smoke_tests.yml +++ b/.github/workflows/lambda_smoke_tests.yml @@ -43,6 +43,16 @@ jobs: build_context: . 
service_name: postcode-splitter-ddd + # ============================================================ + # Landlord Description Overrides + # ============================================================ + landlord_description_overrides_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: applications/landlord_description_overrides/Dockerfile + build_context: . + service_name: landlord-description-overrides + # ============================================================ # Bulk Address2UPRN Combiner # ============================================================ diff --git a/applications/landlord_description_overrides/Dockerfile b/applications/landlord_description_overrides/Dockerfile index e2456b81..c2d4faf7 100644 --- a/applications/landlord_description_overrides/Dockerfile +++ b/applications/landlord_description_overrides/Dockerfile @@ -15,7 +15,7 @@ ENV POSTGRES_DATABASE=${DEV_DB_NAME} WORKDIR /var/task -COPY applications/postcode_splitter/requirements.txt . +COPY applications/landlord_description_overrides/requirements.txt . RUN pip install --no-cache-dir -r requirements.txt # Copy the layered source the handler imports from. 
The new splitter pulls diff --git a/applications/landlord_description_overrides/requirements.txt b/applications/landlord_description_overrides/requirements.txt index 6a85a255..b2917847 100644 --- a/applications/landlord_description_overrides/requirements.txt +++ b/applications/landlord_description_overrides/requirements.txt @@ -2,3 +2,4 @@ boto3 pydantic sqlmodel psycopg2-binary +openai diff --git a/deployment/terraform/lambda/landlordDescriptionOverrides/main.tf b/deployment/terraform/lambda/landlordDescriptionOverrides/main.tf new file mode 100644 index 00000000..5a69de22 --- /dev/null +++ b/deployment/terraform/lambda/landlordDescriptionOverrides/main.tf @@ -0,0 +1,50 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + +module "lambda" { + source = "../../modules/lambda_with_sqs" + + name = "landlord-description-overrides" + stage = var.stage + + image_uri = local.image_uri + + # The classifier calls OpenAI once per distinct description per column, so it + # is latency-bound. 300s leaves headroom under the queue's 1000s visibility + # timeout. batch_size = 1 keeps one upload per invocation, so a single bad + # record cannot redrive its siblings. maximum_concurrency caps fan-out to + # respect OpenAI rate limits. 
+ timeout = 300 + batch_size = 1 + maximum_concurrency = 5 + + environment = merge( + { + STAGE = var.stage + LOG_LEVEL = "info" + POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username + POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password + OPENAI_API_KEY = var.openai_api_key + }, + ) +} + +# Attach S3 read policy so the handler can read the original upload CSV. +resource "aws_iam_role_policy_attachment" "landlord_overrides_s3_read" { + role = module.lambda.role_name + policy_arn = data.terraform_remote_state.shared.outputs.landlord_overrides_s3_read_arn +} diff --git a/deployment/terraform/lambda/landlordDescriptionOverrides/outputs.tf b/deployment/terraform/lambda/landlordDescriptionOverrides/outputs.tf new file mode 100644 index 00000000..7c6534db --- /dev/null +++ b/deployment/terraform/lambda/landlordDescriptionOverrides/outputs.tf @@ -0,0 +1,9 @@ +output "landlord_description_overrides_queue_url" { + value = module.lambda.queue_url + description = "URL of the Landlord Description Overrides SQS queue (wire into the FastAPI LANDLORD_OVERRIDES_SQS_URL)" +} + +output "landlord_description_overrides_queue_arn" { + value = module.lambda.queue_arn + description = "ARN of the Landlord Description Overrides SQS queue" +} diff --git a/deployment/terraform/lambda/landlordDescriptionOverrides/provider.tf b/deployment/terraform/lambda/landlordDescriptionOverrides/provider.tf new file mode 100644 index 00000000..ed2fa60e --- /dev/null +++ b/deployment/terraform/lambda/landlordDescriptionOverrides/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "landlord-description-overrides-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} diff --git a/deployment/terraform/lambda/landlordDescriptionOverrides/variables.tf 
b/deployment/terraform/lambda/landlordDescriptionOverrides/variables.tf new file mode 100644 index 00000000..63437a5a --- /dev/null +++ b/deployment/terraform/lambda/landlordDescriptionOverrides/variables.tf @@ -0,0 +1,33 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. landlordDescriptionOverrides)" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} + +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "openai_api_key" { + type = string + description = "OpenAI API key used by the ChatGPT column classifier" + sensitive = true +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/deployment/terraform/shared/main.tf b/deployment/terraform/shared/main.tf index 0a9e87f6..7d179203 100644 --- a/deployment/terraform/shared/main.tf +++ b/deployment/terraform/shared/main.tf @@ -268,11 +268,11 @@ output "retrofit_heat_baseline_predictions_bucket_name" { // We make this bucket presignable, because we want to generate download links for the frontend module "retrofit_energy_assessments" { - source = "../modules/s3_presignable_bucket" - bucketname = "retrofit-energy-assessments-${var.stage}" - allowed_origins = var.allowed_origins - environment = var.stage - enable_versioning = true + source = "../modules/s3_presignable_bucket" + bucketname = "retrofit-energy-assessments-${var.stage}" + allowed_origins = var.allowed_origins + environment = var.stage + enable_versioning = true } output "retrofit_energy_assessments_bucket_name" { @@ -494,6 +494,35 @@ output "postcode_splitter_s3_read_arn" { value = module.postcode_splitter_s3_read.policy_arn } +################################################ +# Landlord Description Overrides – Lambda 
+################################################ +module "landlord_description_overrides_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "landlord-description-overrides-terraform-state" +} + +module "landlord_description_overrides_registry" { + source = "../modules/container_registry" + name = "landlord_description_overrides" + stage = var.stage +} + +# S3 policy for the landlord classifier to read the original upload CSV. +module "landlord_overrides_s3_read" { + source = "../modules/s3_iam_policy" + + policy_name = "LandlordOverridesReadS3" + policy_description = "Allow landlord description overrides Lambda to read from retrofit-data bucket" + bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] + actions = ["s3:GetObject", "s3:ListBucket"] + resource_paths = ["/*"] +} + +output "landlord_overrides_s3_read_arn" { + value = module.landlord_overrides_s3_read.policy_arn +} + ################################################ # Bulk Address2UPRN Combiner – Lambda ECR ################################################ @@ -729,7 +758,7 @@ module "hubspot_etl_bucket" { module "hubspot_etl_registry" { source = "../modules/container_registry" name = "hubspot-etl" - stage = var.stage + stage = var.stage } From 3e30b4af4037c54c8fd2956503d8b9595eb6c74d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 29 May 2026 16:17:06 +0000 Subject: [PATCH 26/29] tests wrong environemnt --- .../landlord_description_overrides/handler.py | 20 ++- .../requirements.txt | 2 +- infrastructure/postgres/engine.py | 18 +++ ...lord_description_overrides_orchestrator.py | 69 ++++++--- test.requirements.txt | 3 +- ...lord_description_overrides_orchestrator.py | 144 ++++++++++++++++++ 6 files changed, 226 insertions(+), 30 deletions(-) diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 801d1f12..901a8297 100644 --- a/applications/landlord_description_overrides/handler.py +++ 
b/applications/landlord_description_overrides/handler.py @@ -17,7 +17,7 @@ from domain.landlord_description_overrides.wall_type_construction_dates import ( from infrastructure.chatgpt.chatgpt import ChatGPT from infrastructure.chatgpt.chatgpt_column_classifier import ChatGptColumnClassifier from infrastructure.postgres.config import PostgresConfig -from infrastructure.postgres.engine import make_engine, transactional_session +from infrastructure.postgres.engine import commit_scope, make_engine, make_session from infrastructure.postgres.landlord_built_form_type_override_postgres_repository import ( LandlordBuiltFormTypeOverridePostgresRepository, ) @@ -130,16 +130,26 @@ def handler( rows = csv_client.read_rows(trigger.s3_uri) engine = make_engine(PostgresConfig.from_env(os.environ)) - with transactional_session(engine) as session: + # The session is built up front (SQLModel sessions are lazy, so no + # connection is checked out yet) and owned by this handler. Classification + # runs first and calls ChatGPT, which is slow; we deliberately keep no + # transaction open across it. Only the persistence below -- inside + # ``commit_scope`` -- holds a connection. 
+ session = make_session(engine) + try: chat_gpt = ChatGPT() columns = _build_columns(trigger.column_mapping, chat_gpt, session) orchestrator = LandlordDescriptionOverridesOrchestrator( unstandardised_address_repo=unstandardised_address_repo, columns=columns, ) - classified = orchestrator.classify_and_persist_from_rows( - rows, portfolio_id=trigger.portfolio_id - ) + + classified = orchestrator.classify_from_rows(rows) + + with commit_scope(session): + orchestrator.persist(classified, portfolio_id=trigger.portfolio_id) + finally: + session.close() counts = {name: len(mapping) for name, mapping in classified.items()} for name, n in counts.items(): diff --git a/applications/landlord_description_overrides/requirements.txt b/applications/landlord_description_overrides/requirements.txt index b2917847..590c4667 100644 --- a/applications/landlord_description_overrides/requirements.txt +++ b/applications/landlord_description_overrides/requirements.txt @@ -2,4 +2,4 @@ boto3 pydantic sqlmodel psycopg2-binary -openai +openai==1.93.0 diff --git a/infrastructure/postgres/engine.py b/infrastructure/postgres/engine.py index ea2b35ad..2558532e 100644 --- a/infrastructure/postgres/engine.py +++ b/infrastructure/postgres/engine.py @@ -40,3 +40,21 @@ def transactional_session(engine: Engine) -> Iterator[Session]: raise finally: session.close() + + +@contextmanager # pyright: ignore[reportDeprecated] +def commit_scope(session: Session) -> Iterator[Session]: + """Commit a caller-owned session on clean exit; roll back on error. + + Like ``transactional_session`` but for a session the caller already holds + and will close itself. Use it to keep slow, non-DB work *outside* the + transaction: build the session, run the slow work, then enter + ``commit_scope`` only for the persistence -- so a connection is checked out + (SQLModel sessions are lazy) for the shortest possible window. 
+ """ + try: + yield session + session.commit() + except Exception: + session.rollback() + raise diff --git a/orchestration/landlord_description_overrides_orchestrator.py b/orchestration/landlord_description_overrides_orchestrator.py index 6203b8d5..e43992cf 100644 --- a/orchestration/landlord_description_overrides_orchestrator.py +++ b/orchestration/landlord_description_overrides_orchestrator.py @@ -60,50 +60,73 @@ class LandlordDescriptionOverridesOrchestrator: for column in self._columns } + def persist( + self, classified: dict[str, dict[str, Enum]], portfolio_id: int + ) -> None: + """Persist already-classified results via each column's repository. + + ``classified`` is keyed by ``ClassifiableColumn.name`` -- the shape + ``classify_columns`` and ``classify_from_rows`` return. Each non-empty + mapping is written through the column's own repo under + ``source = 'classifier'``; an empty mapping (a registered column absent + from this batch) skips the DB round-trip. + + The orchestrator does not commit -- the caller owns the transaction + boundary, and is expected to open it only around this call so the + slow classification never holds a connection. + """ + for column in self._columns: + mapping = classified.get(column.name) + if not mapping: + continue + column.repo.upsert_all(portfolio_id, mapping) + def classify_and_persist( self, addresses: AddressList, portfolio_id: int ) -> dict[str, dict[str, Enum]]: """Classify every registered column and persist the results. - Each non-empty mapping is written via the column's repository under - ``source = 'classifier'``. Empty mappings (a registered column whose - ``source_column`` is absent from this batch) skip the DB round-trip. - The orchestrator does not commit -- the caller owns the transaction - boundary. - Returns the same shape as ``classify_columns`` so callers can log per-column counts. 
""" classified = self.classify_columns(addresses) - for column in self._columns: - mapping = classified[column.name] - if not mapping: - continue - column.repo.upsert_all(portfolio_id, mapping) + self.persist(classified, portfolio_id) return classified - def classify_and_persist_from_rows( - self, rows: list[dict[str, str]], portfolio_id: int + def classify_from_rows( + self, rows: list[dict[str, str]] ) -> dict[str, dict[str, Enum]]: - """Classify + persist straight from raw CSV rows. + """Classify raw CSV rows without touching the database. - Unlike ``classify_and_persist``, this does not build an ``AddressList``, - so it has no canonical address/postcode requirement -- the classifier - only needs the raw description cells. Used when reading the original + The classification half of ``classify_and_persist_from_rows``, split + out so a caller can run the slow ChatGPT work *before* opening a + transaction and then write the finished results with ``persist`` inside + one short-lived connection. + + Unlike the ``AddressList`` path this builds no ``AddressList``, so it + has no canonical address/postcode requirement -- the classifier only + needs the raw description cells. Used when reading the original landlord upload (raw headers) rather than the address-matching CSV. """ col_to_desc = self._descriptions_from_rows(rows) - classified = { + return { column.name: column.classifier.classify( col_to_desc.get(column.source_column, set()) ) for column in self._columns } - for column in self._columns: - mapping = classified[column.name] - if not mapping: - continue - column.repo.upsert_all(portfolio_id, mapping) + + def classify_and_persist_from_rows( + self, rows: list[dict[str, str]], portfolio_id: int + ) -> dict[str, dict[str, Enum]]: + """Classify + persist straight from raw CSV rows in one call. + + A convenience composition of ``classify_from_rows`` + ``persist``. 
+ Prefer calling the two separately when classification is slow, so the + transaction opens only around ``persist`` (see the Lambda handler). + """ + classified = self.classify_from_rows(rows) + self.persist(classified, portfolio_id) return classified @staticmethod diff --git a/test.requirements.txt b/test.requirements.txt index 26125034..c5b71977 100644 --- a/test.requirements.txt +++ b/test.requirements.txt @@ -10,4 +10,5 @@ fuzzywuzzy pymupdf playwright==1.58.0 msal -moto[s3,sqs] \ No newline at end of file +moto[s3,sqs] +openai==1.93.0 \ No newline at end of file diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index eee4a310..d05b5911 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -323,3 +323,147 @@ def test_classify_and_persist_skips_upsert_for_a_column_absent_from_the_batch() # assert: Property Type wrote; Walls did not. assert property_type_repo.calls == [(7, {"semi-detached": PropertyType.HOUSE})] assert wall_type_repo.calls == [] + + +def test_classify_from_rows_classifies_each_column_without_persisting() -> None: + # arrange: raw CSV rows (not an AddressList) carry two classifiable columns. 
+ rows = [{"Property Type": "semi-detached", "Walls": "solid brick"}] + property_types = _StubColumnClassifier({"semi-detached": PropertyType.HOUSE}) + wall_types = _StubColumnClassifier( + {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED} + ) + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + + # act + result = _orchestrator( + [ + _column("property_type", "Property Type", property_types, property_type_repo), + _column("wall_type", "Walls", wall_types, wall_type_repo), + ] + ).classify_from_rows(rows) + + # assert: each classifier ran against its column's descriptions, keyed by + # name -- and NOT a single repo was touched (classification is DB-free, so + # the slow ChatGPT work can run before any transaction opens). + assert result == { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + assert property_type_repo.calls == [] + assert wall_type_repo.calls == [] + + +def test_classify_from_rows_splits_and_normalises_descriptions() -> None: + # arrange: one cell packs several descriptions with inconsistent casing, + # spread across rows. The rows path must fold them exactly like the + # AddressList path: comma-split, trimmed, lower-cased, de-duped. + rows = [ + {"Walls": "Solid Brick, cavity"}, + {"Walls": "SOLID BRICK"}, + ] + wall_types = _StubColumnClassifier({}) + + # act + _orchestrator( + [_column("wall_type", "Walls", wall_types)] + ).classify_from_rows(rows) + + # assert: the classifier saw one normalised entry per distinct variant. + assert wall_types.received == {"solid brick", "cavity"} + + +def test_classify_from_rows_yields_empty_mapping_for_an_absent_column() -> None: + # arrange: a column is registered for a header the rows lack. 
+ rows = [{"Walls": "cavity"}] + property_types = _StubColumnClassifier({}) + + # act + result = _orchestrator( + [_column("property_type", "Property Type", property_types)] + ).classify_from_rows(rows) + + # assert: the absent column classified an empty description set. + assert result == {"property_type": {}} + assert property_types.received == set() + + +def test_persist_routes_each_columns_mapping_to_its_own_repo() -> None: + # arrange: a finished ``classified`` mapping (as classify_* would return) + # and two columns with distinct repos. + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column("property_type", "Property Type", _StubColumnClassifier({}), property_type_repo), + _column("wall_type", "Walls", _StubColumnClassifier({}), wall_type_repo), + ] + classified: dict[str, dict[str, Enum]] = { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + + # act + _orchestrator(columns).persist(classified, portfolio_id=42) + + # assert: each repo received exactly its own column's mapping. + assert property_type_repo.calls == [(42, {"semi-detached": PropertyType.HOUSE})] + assert wall_type_repo.calls == [ + (42, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}) + ] + + +def test_persist_skips_empty_and_missing_mappings() -> None: + # arrange: ``property_type`` has an empty mapping; ``wall_type`` is absent + # from ``classified`` entirely. Neither should hit the DB -- and the + # missing key must not raise (``persist`` reads with ``.get``). 
+ property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column("property_type", "Property Type", _StubColumnClassifier({}), property_type_repo), + _column("wall_type", "Walls", _StubColumnClassifier({}), wall_type_repo), + ] + classified: dict[str, dict[str, Enum]] = {"property_type": {}} + + # act + _orchestrator(columns).persist(classified, portfolio_id=7) + + # assert: no upserts at all. + assert property_type_repo.calls == [] + assert wall_type_repo.calls == [] + + +def test_classify_and_persist_from_rows_composes_classify_then_persist() -> None: + # arrange: the one-shot rows path must classify AND route to repos, so the + # convenience composition stays equivalent to calling the two in sequence. + rows = [{"Property Type": "semi-detached", "Walls": "solid brick"}] + property_type_repo = _StubLandlordOverrideRepository() + wall_type_repo = _StubLandlordOverrideRepository() + columns: list[ClassifiableColumn[Any]] = [ + _column( + "property_type", + "Property Type", + _StubColumnClassifier({"semi-detached": PropertyType.HOUSE}), + property_type_repo, + ), + _column( + "wall_type", + "Walls", + _StubColumnClassifier( + {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED} + ), + wall_type_repo, + ), + ] + + # act + result = _orchestrator(columns).classify_and_persist_from_rows(rows, portfolio_id=99) + + # assert: same return shape as classify_from_rows, and each repo wrote once. 
+ assert result == { + "property_type": {"semi-detached": PropertyType.HOUSE}, + "wall_type": {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}, + } + assert property_type_repo.calls == [(99, {"semi-detached": PropertyType.HOUSE})] + assert wall_type_repo.calls == [ + (99, {"solid brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED}) + ] From 7f2f2b95a0b0e304f2003ea13063884ebe55fd40 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 1 Jun 2026 09:34:35 +0000 Subject: [PATCH 27/29] update tests to reflect wall types --- ..._wall_type_override_postgres_repository.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py b/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py index 2aae83dd..4cee6f5a 100644 --- a/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py +++ b/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py @@ -49,13 +49,13 @@ def test_inserts_a_fresh_row_with_source_classifier(session: Session) -> None: # act repo.upsert_all( - portfolio_id=1, descriptions_to_values={"cavity insulated": WallType.CAVITY} + portfolio_id=1, descriptions_to_values={"cavity insulated": WallType.CAVITY_FILLED} ) session.commit() # assert row = _select_row(session, portfolio_id=1, description="cavity insulated") - assert row.value is WallType.CAVITY + assert row.value is WallType.CAVITY_FILLED assert row.source == OverrideSource.CLASSIFIER @@ -63,19 +63,19 @@ def test_reupsert_overwrites_a_classifier_row(session: Session) -> None: # arrange: a stale classifier row exists. 
repo = LandlordWallTypeOverridePostgresRepository(session) repo.upsert_all( - portfolio_id=1, descriptions_to_values={"old red brick": WallType.CAVITY} + portfolio_id=1, descriptions_to_values={"old red brick": WallType.CAVITY_FILLED} ) session.commit() # act: re-classify with a different category. repo.upsert_all( - portfolio_id=1, descriptions_to_values={"old red brick": WallType.SOLID_BRICK} + portfolio_id=1, descriptions_to_values={"old red brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED} ) session.commit() # assert: the new classification wins. row = _select_row(session, portfolio_id=1, description="old red brick") - assert row.value is WallType.SOLID_BRICK + assert row.value is WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED assert row.source == OverrideSource.CLASSIFIER @@ -86,7 +86,7 @@ def test_reupsert_does_not_overwrite_a_user_row(session: Session) -> None: user_row = LandlordWallTypeOverrideRow( portfolio_id=1, description="old red brick", - value=WallType.SANDSTONE, + value=WallType.SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED, source=OverrideSource.USER, ) session.add(user_row) @@ -97,13 +97,13 @@ def test_reupsert_does_not_overwrite_a_user_row(session: Session) -> None: # be silently skipped -- user edits beat classifier reruns. repo = LandlordWallTypeOverridePostgresRepository(session) repo.upsert_all( - portfolio_id=1, descriptions_to_values={"old red brick": WallType.SOLID_BRICK} + portfolio_id=1, descriptions_to_values={"old red brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED} ) session.commit() # assert: the user row is unchanged. row = _select_row(session, portfolio_id=1, description="old red brick") - assert row.value is WallType.SANDSTONE + assert row.value is WallType.SANDSTONE_AS_BUILT_NO_INSULATION_ASSUMED assert row.source == OverrideSource.USER @@ -114,16 +114,16 @@ def test_upsert_keeps_other_portfolios_descriptions_independent( # same description for two different portfolios must coexist as two rows. 
repo = LandlordWallTypeOverridePostgresRepository(session) repo.upsert_all( - portfolio_id=1, descriptions_to_values={"old red brick": WallType.CAVITY} + portfolio_id=1, descriptions_to_values={"old red brick": WallType.CAVITY_FILLED} ) repo.upsert_all( - portfolio_id=2, descriptions_to_values={"old red brick": WallType.SOLID_BRICK} + portfolio_id=2, descriptions_to_values={"old red brick": WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED} ) session.commit() # assert: both rows survive with their own values. - assert _select_row(session, 1, "old red brick").value is WallType.CAVITY - assert _select_row(session, 2, "old red brick").value is WallType.SOLID_BRICK + assert _select_row(session, 1, "old red brick").value is WallType.CAVITY_FILLED + assert _select_row(session, 2, "old red brick").value is WallType.SOLID_BRICK_AS_BUILT_NO_INSULATION_ASSUMED def test_upsert_persists_unknown_so_a_user_can_resolve_it_later( From 9c1b6c76a9a9d91d688aec9f49dd06449570fb31 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 1 Jun 2026 12:08:29 +0000 Subject: [PATCH 28/29] delete playground --- playground.py | 57 --------------------------------------------------- 1 file changed, 57 deletions(-) delete mode 100644 playground.py diff --git a/playground.py b/playground.py deleted file mode 100644 index 5e9001e1..00000000 --- a/playground.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Read a file and return unique values from a chosen column.""" - -from pathlib import Path -import argparse -import sys - -import pandas as pd - - -def read_file(path: str | Path) -> pd.DataFrame: - path = Path(path) - suffix = path.suffix.lower() - if suffix == ".csv": - return pd.read_csv(path) - if suffix == ".tsv": - return pd.read_csv(path, sep="\t") - if suffix in {".xlsx", ".xls"}: - return pd.read_excel(path) - if suffix == ".parquet": - return pd.read_parquet(path) - if suffix == ".json": - return pd.read_json(path) - raise ValueError(f"Unsupported file type: {suffix}") - - -def get_unique(path: str 
| Path, column: str, dropna: bool = True) -> list: - df = read_file(Path(path)) - if column not in df.columns: - raise KeyError(f"Column {column!r} not found. Available: {list(df.columns)}") - series = df[column].dropna() if dropna else df[column] - return series.unique().tolist() - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--path", default="/workspaces/model/certificates-2026.csv") - parser.add_argument("--column", nargs="walls_description") - parser.add_argument("--keep-na", action="store_true") - args, _ = parser.parse_known_args() - - df = read_file(args.path) - - if not args.column: - print("Available columns:") - for c in df.columns: - print(f" - {c}") - return 0 - - column = "wall " - series = df[column] if args.keep_na else df[column].dropna() - for value in series.unique(): - print(value) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) From c9a9620527cce18062805b0d80812c5a3e76fbf4 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 1 Jun 2026 14:00:31 +0000 Subject: [PATCH 29/29] pr review, move domain and orhcestration --- .devcontainer/backend/requirements.txt | 1 + .../landlord_description_overrides/handler.py | 10 ++-- backend/tests/test_search_epc.py | 48 +------------------ .../__init__.py | 0 .../built_form_type.py | 0 .../property_type.py | 0 .../roof_type.py | 0 .../wall_type.py | 0 .../wall_type_construction_dates.py | 2 +- .../chatgpt/chatgpt_column_classifier.py | 2 +- .../column_classifier.py | 0 ..._form_type_override_postgres_repository.py | 2 +- ...landlord_built_form_type_override_table.py | 2 +- ...perty_type_override_postgres_repository.py | 2 +- .../landlord_property_type_override_table.py | 2 +- ..._roof_type_override_postgres_repository.py | 2 +- .../landlord_roof_type_override_table.py | 2 +- ..._wall_type_override_postgres_repository.py | 2 +- .../landlord_wall_type_override_table.py | 2 +- orchestration/classifiable_column.py | 2 +- 
.../chatgpt/test_chatgpt_column_classifier.py | 6 +-- ...lord_description_overrides_orchestrator.py | 8 ++-- ...perty_type_override_postgres_repository.py | 2 +- ..._wall_type_override_postgres_repository.py | 2 +- 24 files changed, 28 insertions(+), 71 deletions(-) rename domain/{landlord_description_overrides => epc}/__init__.py (100%) rename domain/{landlord_description_overrides => epc}/built_form_type.py (100%) rename domain/{landlord_description_overrides => epc}/property_type.py (100%) rename domain/{landlord_description_overrides => epc}/roof_type.py (100%) rename domain/{landlord_description_overrides => epc}/wall_type.py (100%) rename domain/{landlord_description_overrides => epc}/wall_type_construction_dates.py (97%) rename {domain/landlord_description_overrides => infrastructure}/column_classifier.py (100%) diff --git a/.devcontainer/backend/requirements.txt b/.devcontainer/backend/requirements.txt index 7a879773..2db3710a 100644 --- a/.devcontainer/backend/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -27,3 +27,4 @@ pytest-postgresql # Formatting black==26.1.0 boto3-stubs +openai diff --git a/applications/landlord_description_overrides/handler.py b/applications/landlord_description_overrides/handler.py index 901a8297..e2afb4bd 100644 --- a/applications/landlord_description_overrides/handler.py +++ b/applications/landlord_description_overrides/handler.py @@ -7,11 +7,11 @@ import boto3 from applications.landlord_description_overrides.landlord_description_overrides_trigger_body import ( LandlordDescriptionOverridesTriggerBody, ) -from domain.landlord_description_overrides.built_form_type import BuiltFormType -from domain.landlord_description_overrides.property_type import PropertyType -from domain.landlord_description_overrides.roof_type import RoofType -from domain.landlord_description_overrides.wall_type import WallType -from domain.landlord_description_overrides.wall_type_construction_dates import ( +from domain.epc.built_form_type 
import BuiltFormType +from domain.epc.property_type import PropertyType +from domain.epc.roof_type import RoofType +from domain.epc.wall_type import WallType +from domain.epc.wall_type_construction_dates import ( wall_type_construction_date_prompt_hint, ) from infrastructure.chatgpt.chatgpt import ChatGPT diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py index a0fef7e9..aaf5d680 100644 --- a/backend/tests/test_search_epc.py +++ b/backend/tests/test_search_epc.py @@ -14,55 +14,11 @@ class TestSearchEpcIntegration: def epc_auth_token(self): return os.getenv("EPC_AUTH_TOKEN") - @pytest.mark.parametrize( - "address, postcode, uprn, skip_os, lmk_key, n_old_epcs", - [ - # Test case 1: Valid address and postcode, skipping OS - # In this case, the property is an individual flat but the uprn associated to the - # EPC is for the building as a whole, possibly because there was a conversion of sorts - ("Garden Flat, 48 Bedminster Parade", "BS3 4HS", 308249, True, - "260907a5431fa073d193cc6bbec51fbf1ba9a61845ab2503f85aa19ce3ed6afd", 1), - - # Test case 2: Another valid address and postcode - # In this case, the newest EPC, does not have a uprn associated to it. If we did a search by - # uprn, we would get an old EPC - ("Flat 8, Hainton House", "DN32 9AQ", "", True, - "bd1149a20a73397184f07a9955f872424826e70f4870c058d71be887766ee1f8", 2), - # Test case 3: When we make a request to the API for this property, we get back results for - # flats 1, 2 and 3. We have some logic to handle the response so that we get back flat 1 - ("Flat 1, 1 Tottenham Street, London", "W1T 2AE", 5167411, True, - "3e6414d7f15f4cf7a69dc20c469bcf043d31a49239b183f1bd0c0e1aafa23c93", 0), - - ], - ) - def test_find_property(self, epc_auth_token, address, postcode, uprn, skip_os, lmk_key, n_old_epcs): - """ - Integration test for `find_property`, making actual API calls. 
- """ - # Provide your actual API keys or tokens here - os_api_key = "" - - # Initialize the SearchEpc instance - epc_searcher = SearchEpc( - address1=address, - postcode=postcode, - uprn=uprn, - auth_token=epc_auth_token, - os_api_key=os_api_key, - ) - - # Execute the method - epc_searcher.find_property(skip_os=skip_os) - - # We check that we have the correct epc - assert epc_searcher.newest_epc["lmk-key"] == lmk_key - assert len(epc_searcher.older_epcs) == n_old_epcs - def test_search_housenumber(self): - eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter' + eg1 = "Flat A11, Mortimer House, Grendon Road, Exeter" res1 = SearchEpc.get_house_number(eg1, None) assert res1 == "A11" - eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL' + eg2 = "Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL" res2 = SearchEpc.get_house_number(eg2, None) assert res2 == "A9" diff --git a/domain/landlord_description_overrides/__init__.py b/domain/epc/__init__.py similarity index 100% rename from domain/landlord_description_overrides/__init__.py rename to domain/epc/__init__.py diff --git a/domain/landlord_description_overrides/built_form_type.py b/domain/epc/built_form_type.py similarity index 100% rename from domain/landlord_description_overrides/built_form_type.py rename to domain/epc/built_form_type.py diff --git a/domain/landlord_description_overrides/property_type.py b/domain/epc/property_type.py similarity index 100% rename from domain/landlord_description_overrides/property_type.py rename to domain/epc/property_type.py diff --git a/domain/landlord_description_overrides/roof_type.py b/domain/epc/roof_type.py similarity index 100% rename from domain/landlord_description_overrides/roof_type.py rename to domain/epc/roof_type.py diff --git a/domain/landlord_description_overrides/wall_type.py b/domain/epc/wall_type.py similarity index 100% rename from domain/landlord_description_overrides/wall_type.py rename to domain/epc/wall_type.py diff --git 
a/domain/landlord_description_overrides/wall_type_construction_dates.py b/domain/epc/wall_type_construction_dates.py similarity index 97% rename from domain/landlord_description_overrides/wall_type_construction_dates.py rename to domain/epc/wall_type_construction_dates.py index 4cd869b3..0eccc44c 100644 --- a/domain/landlord_description_overrides/wall_type_construction_dates.py +++ b/domain/epc/wall_type_construction_dates.py @@ -27,7 +27,7 @@ from __future__ import annotations from dataclasses import dataclass from typing import Mapping, Optional -from domain.landlord_description_overrides.wall_type import WallType +from domain.epc.wall_type import WallType @dataclass(frozen=True) diff --git a/infrastructure/chatgpt/chatgpt_column_classifier.py b/infrastructure/chatgpt/chatgpt_column_classifier.py index 2ce66299..15389184 100644 --- a/infrastructure/chatgpt/chatgpt_column_classifier.py +++ b/infrastructure/chatgpt/chatgpt_column_classifier.py @@ -4,7 +4,7 @@ import json from enum import Enum from typing import Any, Optional, TypeVar -from domain.landlord_description_overrides.column_classifier import ( +from infrastructure.column_classifier import ( ClassificationError, ColumnClassifier, ) diff --git a/domain/landlord_description_overrides/column_classifier.py b/infrastructure/column_classifier.py similarity index 100% rename from domain/landlord_description_overrides/column_classifier.py rename to infrastructure/column_classifier.py diff --git a/infrastructure/postgres/landlord_built_form_type_override_postgres_repository.py b/infrastructure/postgres/landlord_built_form_type_override_postgres_repository.py index 0f7d4959..aec4ea4d 100644 --- a/infrastructure/postgres/landlord_built_form_type_override_postgres_repository.py +++ b/infrastructure/postgres/landlord_built_form_type_override_postgres_repository.py @@ -17,7 +17,7 @@ from sqlalchemy import Table from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlmodel import Session -from 
domain.landlord_description_overrides.built_form_type import BuiltFormType +from domain.epc.built_form_type import BuiltFormType from infrastructure.postgres.landlord_built_form_type_override_table import ( LandlordBuiltFormTypeOverrideRow, ) diff --git a/infrastructure/postgres/landlord_built_form_type_override_table.py b/infrastructure/postgres/landlord_built_form_type_override_table.py index a1f89c35..ec93ba27 100644 --- a/infrastructure/postgres/landlord_built_form_type_override_table.py +++ b/infrastructure/postgres/landlord_built_form_type_override_table.py @@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint from sqlalchemy import Enum as SAEnum from sqlmodel import Field, SQLModel -from domain.landlord_description_overrides.built_form_type import BuiltFormType +from domain.epc.built_form_type import BuiltFormType from infrastructure.postgres.landlord_override_enums import override_source_sa_enum diff --git a/infrastructure/postgres/landlord_property_type_override_postgres_repository.py b/infrastructure/postgres/landlord_property_type_override_postgres_repository.py index 18592c5f..3cd7dbb2 100644 --- a/infrastructure/postgres/landlord_property_type_override_postgres_repository.py +++ b/infrastructure/postgres/landlord_property_type_override_postgres_repository.py @@ -19,7 +19,7 @@ from sqlalchemy import Table from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlmodel import Session -from domain.landlord_description_overrides.property_type import PropertyType +from domain.epc.property_type import PropertyType from infrastructure.postgres.landlord_override_enums import OverrideSource from infrastructure.postgres.landlord_property_type_override_table import ( LandlordPropertyTypeOverrideRow, diff --git a/infrastructure/postgres/landlord_property_type_override_table.py b/infrastructure/postgres/landlord_property_type_override_table.py index b76d508e..ae9377cd 100644 --- 
a/infrastructure/postgres/landlord_property_type_override_table.py +++ b/infrastructure/postgres/landlord_property_type_override_table.py @@ -14,7 +14,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint from sqlalchemy import Enum as SAEnum from sqlmodel import Field, SQLModel -from domain.landlord_description_overrides.property_type import PropertyType +from domain.epc.property_type import PropertyType from infrastructure.postgres.landlord_override_enums import override_source_sa_enum diff --git a/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py b/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py index b5b570bc..c3f263a9 100644 --- a/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py +++ b/infrastructure/postgres/landlord_roof_type_override_postgres_repository.py @@ -17,7 +17,7 @@ from sqlalchemy import Table from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlmodel import Session -from domain.landlord_description_overrides.roof_type import RoofType +from domain.epc.roof_type import RoofType from infrastructure.postgres.landlord_override_enums import OverrideSource from infrastructure.postgres.landlord_roof_type_override_table import ( LandlordRoofTypeOverrideRow, diff --git a/infrastructure/postgres/landlord_roof_type_override_table.py b/infrastructure/postgres/landlord_roof_type_override_table.py index f0cea945..58bd61ff 100644 --- a/infrastructure/postgres/landlord_roof_type_override_table.py +++ b/infrastructure/postgres/landlord_roof_type_override_table.py @@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint from sqlalchemy import Enum as SAEnum from sqlmodel import Field, SQLModel -from domain.landlord_description_overrides.roof_type import RoofType +from domain.epc.roof_type import RoofType from infrastructure.postgres.landlord_override_enums import override_source_sa_enum diff --git 
a/infrastructure/postgres/landlord_wall_type_override_postgres_repository.py b/infrastructure/postgres/landlord_wall_type_override_postgres_repository.py index 21b73e98..711e5c30 100644 --- a/infrastructure/postgres/landlord_wall_type_override_postgres_repository.py +++ b/infrastructure/postgres/landlord_wall_type_override_postgres_repository.py @@ -17,7 +17,7 @@ from sqlalchemy import Table from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlmodel import Session -from domain.landlord_description_overrides.wall_type import WallType +from domain.epc.wall_type import WallType from infrastructure.postgres.landlord_override_enums import OverrideSource from infrastructure.postgres.landlord_wall_type_override_table import ( LandlordWallTypeOverrideRow, diff --git a/infrastructure/postgres/landlord_wall_type_override_table.py b/infrastructure/postgres/landlord_wall_type_override_table.py index 79bea46a..b5097164 100644 --- a/infrastructure/postgres/landlord_wall_type_override_table.py +++ b/infrastructure/postgres/landlord_wall_type_override_table.py @@ -16,7 +16,7 @@ from sqlalchemy import BigInteger, Column, UniqueConstraint from sqlalchemy import Enum as SAEnum from sqlmodel import Field, SQLModel -from domain.landlord_description_overrides.wall_type import WallType +from domain.epc.wall_type import WallType from infrastructure.postgres.landlord_override_enums import override_source_sa_enum diff --git a/orchestration/classifiable_column.py b/orchestration/classifiable_column.py index fb1dab6e..9b6fda10 100644 --- a/orchestration/classifiable_column.py +++ b/orchestration/classifiable_column.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from enum import Enum from typing import Generic, TypeVar -from domain.landlord_description_overrides.column_classifier import ColumnClassifier +from infrastructure.column_classifier import ColumnClassifier from repositories.landlord_overrides.landlord_override_repository import ( LandlordOverrideRepository, ) 
diff --git a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py index 4cdf4dfe..0462f3ce 100644 --- a/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py +++ b/tests/infrastructure/chatgpt/test_chatgpt_column_classifier.py @@ -4,9 +4,9 @@ from typing import Optional import pytest -from domain.landlord_description_overrides.column_classifier import ClassificationError -from domain.landlord_description_overrides.property_type import PropertyType -from domain.landlord_description_overrides.wall_type import WallType +from infrastructure.column_classifier import ClassificationError +from domain.epc.property_type import PropertyType +from domain.epc.wall_type import WallType from infrastructure.chatgpt.chatgpt import ChatGPT from infrastructure.chatgpt.chatgpt_column_classifier import ( ChatGptColumnClassifier, diff --git a/tests/orchestration/test_landlord_description_overrides_orchestrator.py b/tests/orchestration/test_landlord_description_overrides_orchestrator.py index d05b5911..18188941 100644 --- a/tests/orchestration/test_landlord_description_overrides_orchestrator.py +++ b/tests/orchestration/test_landlord_description_overrides_orchestrator.py @@ -4,10 +4,10 @@ from enum import Enum from typing import Any, Optional from domain.addresses.unstandardised_address import AddressList, UnstandardisedAddress -from domain.landlord_description_overrides.built_form_type import BuiltFormType -from domain.landlord_description_overrides.column_classifier import ColumnClassifier -from domain.landlord_description_overrides.property_type import PropertyType -from domain.landlord_description_overrides.wall_type import WallType +from domain.epc.built_form_type import BuiltFormType +from infrastructure.column_classifier import ColumnClassifier +from domain.epc.property_type import PropertyType +from domain.epc.wall_type import WallType from domain.postcode import Postcode from 
orchestration.classifiable_column import ClassifiableColumn from orchestration.landlord_description_overrides_orchestrator import ( diff --git a/tests/repositories/landlord_overrides/postgres/test_landlord_property_type_override_postgres_repository.py b/tests/repositories/landlord_overrides/postgres/test_landlord_property_type_override_postgres_repository.py index 9154b664..c2b81293 100644 --- a/tests/repositories/landlord_overrides/postgres/test_landlord_property_type_override_postgres_repository.py +++ b/tests/repositories/landlord_overrides/postgres/test_landlord_property_type_override_postgres_repository.py @@ -15,7 +15,7 @@ import pytest from sqlalchemy import Engine from sqlmodel import Session, select -from domain.landlord_description_overrides.property_type import PropertyType +from domain.epc.property_type import PropertyType from infrastructure.postgres.landlord_override_enums import OverrideSource from infrastructure.postgres.landlord_property_type_override_postgres_repository import ( LandlordPropertyTypeOverridePostgresRepository, diff --git a/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py b/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py index 4cee6f5a..9504a520 100644 --- a/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py +++ b/tests/repositories/landlord_overrides/postgres/test_landlord_wall_type_override_postgres_repository.py @@ -14,7 +14,7 @@ import pytest from sqlalchemy import Engine from sqlmodel import Session, select -from domain.landlord_description_overrides.wall_type import WallType +from domain.epc.wall_type import WallType from infrastructure.postgres.landlord_override_enums import OverrideSource from infrastructure.postgres.landlord_wall_type_override_postgres_repository import ( LandlordWallTypeOverridePostgresRepository,