From 265be9849b1eb8b7e5393a830c87624aa87e8f07 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 10:50:28 +0000 Subject: [PATCH 01/91] =?UTF-8?q?Store=20uploaded=5Ffile=5Fid=20on=20magic?= =?UTF-8?q?=5Fplan=5Fplan=20row=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/db/functions/magic_plan_functions.py | 8 +++-- .../tests/test_magic_plan_functions.py | 34 +++++++++++++++---- backend/app/db/models/magic_plan.py | 1 + backend/magic_plan/magic_plan_service.py | 3 +- 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/backend/app/db/functions/magic_plan_functions.py b/backend/app/db/functions/magic_plan_functions.py index 9400f36f..143e4172 100644 --- a/backend/app/db/functions/magic_plan_functions.py +++ b/backend/app/db/functions/magic_plan_functions.py @@ -14,15 +14,15 @@ from backend.app.db.models.magic_plan import ( ) -def save_plan(session: Session, plan: Plan) -> None: - plan_id: int = _upsert_plan(session, plan) +def save_plan(session: Session, plan: Plan, uploaded_file_id: int) -> None: + plan_id: int = _upsert_plan(session, plan, uploaded_file_id) _delete_children(session, plan_id) floor_ids: list[int] = _insert_floors(session, plan.floors, plan_id) room_ids: list[int] = _insert_rooms(session, plan.floors, floor_ids) _insert_windows_and_doors(session, plan.floors, room_ids) -def _upsert_plan(session: Session, plan: Plan) -> int: +def _upsert_plan(session: Session, plan: Plan, uploaded_file_id: int) -> int: stmt = ( pg_insert(MagicPlanPlanModel) .values( @@ -30,6 +30,7 @@ def _upsert_plan(session: Session, plan: Plan) -> int: name=plan.name, address=plan.address, postcode=plan.postcode, + uploaded_file_id=uploaded_file_id, ) .on_conflict_do_update( index_elements=["magic_plan_uid"], @@ -37,6 +38,7 @@ def _upsert_plan(session: Session, plan: Plan) -> int: "name": plan.name, "address": plan.address, "postcode": plan.postcode, + "uploaded_file_id": uploaded_file_id, 
}, ) .returning(col(MagicPlanPlanModel.id)) diff --git a/backend/app/db/functions/tests/test_magic_plan_functions.py b/backend/app/db/functions/tests/test_magic_plan_functions.py index e58d0528..0b93685c 100644 --- a/backend/app/db/functions/tests/test_magic_plan_functions.py +++ b/backend/app/db/functions/tests/test_magic_plan_functions.py @@ -36,7 +36,7 @@ def _count(session: Session, model: type[SQLModel]) -> int: def test_plan_row_present_after_save(db_session: Session, domain_plan: Plan) -> None: # Act - save_plan(db_session, domain_plan) + save_plan(db_session, domain_plan, 1) # Assert assert _count(db_session, MagicPlanPlanModel) == 1 @@ -45,7 +45,7 @@ def test_floor_count_matches_domain(db_session: Session, domain_plan: Plan) -> N # Arrange expected = len(domain_plan.floors) # Act - save_plan(db_session, domain_plan) + save_plan(db_session, domain_plan, 1) # Assert assert _count(db_session, MagicPlanFloorModel) == expected @@ -54,7 +54,7 @@ def test_room_count_matches_domain(db_session: Session, domain_plan: Plan) -> No # Arrange expected = sum(len(f.rooms) for f in domain_plan.floors) # Act - save_plan(db_session, domain_plan) + save_plan(db_session, domain_plan, 1) # Assert assert _count(db_session, MagicPlanRoomModel) == expected @@ -63,7 +63,7 @@ def test_window_count_matches_domain(db_session: Session, domain_plan: Plan) -> # Arrange expected = sum(len(r.windows) for f in domain_plan.floors for r in f.rooms) # Act - save_plan(db_session, domain_plan) + save_plan(db_session, domain_plan, 1) # Assert assert _count(db_session, MagicPlanWindowModel) == expected @@ -72,15 +72,15 @@ def test_door_count_matches_domain(db_session: Session, domain_plan: Plan) -> No # Arrange expected = sum(len(r.doors) for f in domain_plan.floors for r in f.rooms) # Act - save_plan(db_session, domain_plan) + save_plan(db_session, domain_plan, 1) # Assert assert _count(db_session, MagicPlanDoorModel) == expected def test_save_plan_idempotent(db_session: Session, domain_plan: 
Plan) -> None: # Act — call twice within the same session - save_plan(db_session, domain_plan) - save_plan(db_session, domain_plan) + save_plan(db_session, domain_plan, 1) + save_plan(db_session, domain_plan, 1) # Assert — same row counts as a single call assert _count(db_session, MagicPlanPlanModel) == 1 assert _count(db_session, MagicPlanFloorModel) == len(domain_plan.floors) @@ -93,3 +93,23 @@ def test_save_plan_idempotent(db_session: Session, domain_plan: Plan) -> None: assert _count(db_session, MagicPlanDoorModel) == sum( len(r.doors) for f in domain_plan.floors for r in f.rooms ) + + +def test_uploaded_file_id_stored_after_save(db_session: Session, domain_plan: Plan) -> None: + # Act + save_plan(db_session, domain_plan, 1) + # Assert + row = db_session.execute(select(MagicPlanPlanModel)).scalar_one() + assert row.uploaded_file_id == 1 + + +def test_save_plan_updates_uploaded_file_id_on_reingest( + db_session: Session, domain_plan: Plan +) -> None: + # Arrange + save_plan(db_session, domain_plan, 1) + # Act + save_plan(db_session, domain_plan, 2) + # Assert + row = db_session.execute(select(MagicPlanPlanModel)).scalar_one() + assert row.uploaded_file_id == 2 diff --git a/backend/app/db/models/magic_plan.py b/backend/app/db/models/magic_plan.py index 38e9de18..77ca52fd 100644 --- a/backend/app/db/models/magic_plan.py +++ b/backend/app/db/models/magic_plan.py @@ -11,6 +11,7 @@ class MagicPlanPlanModel(SQLModel, table=True): name: Optional[str] = None address: Optional[str] = None postcode: Optional[str] = None + uploaded_file_id: Optional[int] = Field(default=None) class MagicPlanFloorModel(SQLModel, table=True): diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 22e19ddf..2be3379d 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -55,8 +55,9 @@ class MagicPlanService: ) with db_session() as session: - save_plan(session, plan) session.add(uploaded_file) + 
session.flush() + save_plan(session, plan, uploaded_file.id) return plan From 509fbf2abfa3849a44782f5a9cf2f8d033157823 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 11:02:46 +0000 Subject: [PATCH 02/91] =?UTF-8?q?Store=20uploaded=5Ffile=5Fid=20on=20magic?= =?UTF-8?q?=5Fplan=5Fplan=20row=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 4 +-- .../tests/test_magic_plan_service.py | 35 +++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 2be3379d..8a75c716 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -1,7 +1,7 @@ import gzip import json from datetime import datetime, timezone -from typing import Optional +from typing import Optional, cast from datatypes.magicplan.api.response import MagicPlanPlan, PlanSummary from datatypes.magicplan.domain.mapper import map_plan @@ -57,7 +57,7 @@ class MagicPlanService: with db_session() as session: session.add(uploaded_file) session.flush() - save_plan(session, plan, uploaded_file.id) + save_plan(session, plan, cast(int, uploaded_file.id)) return plan diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 158cf4d6..a2302ab4 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -271,3 +271,38 @@ def test_run_creates_uploaded_file_record( assert uploaded_file.s3_upload_timestamp is not None assert uploaded_file.uprn == 100023336956 assert uploaded_file.hubspot_deal_id == "deal-789" + + +def test_run_passes_flushed_uploaded_file_id_to_save_plan( + mock_client: MagicMock, + plan_summary: PlanSummary, +) -> None: + # Arrange + mock_client.get_plans.return_value = [plan_summary] + service = 
_make_service(mock_client) + mock_session = MagicMock() + added_objects: list = [] + + mock_session.add.side_effect = added_objects.append + + def simulate_flush() -> None: + for obj in added_objects: + if isinstance(obj, UploadedFile): + obj.id = 42 + + mock_session.flush.side_effect = simulate_flush + + with patch( + "backend.magic_plan.magic_plan_service.find_matching_plan", + return_value=plan_summary, + ), patch("backend.magic_plan.magic_plan_service.save_plan") as mock_save, patch( + "backend.magic_plan.magic_plan_service.db_session" + ) as mock_db, patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" + ): + mock_db.return_value.__enter__.return_value = mock_session + # Act + service.run(_make_request()) + + # Assert + assert mock_save.call_args[0][2] == 42 From e3159665653557584edbe681a371c42b4a044a2f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 12:29:25 +0000 Subject: [PATCH 03/91] add coordination and design document types to enums --- backend/app/db/models/uploaded_file.py | 3 +++ backend/pashub_fetcher/core_files.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py index c629f574..f3cfee79 100644 --- a/backend/app/db/models/uploaded_file.py +++ b/backend/app/db/models/uploaded_file.py @@ -18,6 +18,9 @@ class FileTypeEnum(enum.Enum): ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note" ECMK_SURVEY_XML = "ecmk_survey_xml" MAGIC_PLAN_JSON = "magic_plan_json" + IMPROVEMENT_OPTION_EVALUATION = "improvement_option_evaluation" + MEDIUM_TERM_IMPROVEMENT_PLAN = "medium_term_improvement_plan" + RETROFIT_DESIGN_DOC = "retrofit_design_doc" class FileSourceEnum(enum.Enum): diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 4da10661..aa426475 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -14,6 +14,9 @@ class CoreFiles(Enum): PAR_PHOTOPACK = "PAR Photo Pack" 
PAS2023_PROPERTY = "PAS 2023 Property Assessment Report" PAS2023_OCCUPANCY = "PAS 2023 Occupancy Assessment Report" + IMPROVEMENT_OPTION_EVALUATION = "Improvement Option Evaluation" + MEDIUM_TERM_IMPROVEMENT_PLAN = "Medium Term Improvement Plan" + RETROFIT_DESIGN_DOC = "Retrofit Design Doc" CORE_TO_FILETYPE_MAP = { @@ -26,6 +29,9 @@ CORE_TO_FILETYPE_MAP = { CoreFiles.PAR_PHOTOPACK: FileTypeEnum.PAR_PHOTO_PACK.value, CoreFiles.PAS2023_PROPERTY: FileTypeEnum.PAS_2023_PROPERTY.value, CoreFiles.PAS2023_OCCUPANCY: FileTypeEnum.PAS_2023_OCCUPANCY.value, + CoreFiles.IMPROVEMENT_OPTION_EVALUATION: FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value, + CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN: FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value, + CoreFiles.RETROFIT_DESIGN_DOC: FileTypeEnum.RETROFIT_DESIGN_DOC.value, } From e3646162de686884b17da231a26eeeaa3c4cdc41 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 13:09:40 +0000 Subject: [PATCH 04/91] =?UTF-8?q?new=20files=20types=20inferred=20from=20f?= =?UTF-8?q?ile=20names=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pashub_fetcher/tests/test_core_files.py | 64 +++++++++++++++++++ pytest.ini | 23 ++++++- 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 backend/pashub_fetcher/tests/test_core_files.py diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py new file mode 100644 index 00000000..fca29b7e --- /dev/null +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -0,0 +1,64 @@ +import pytest + +from backend.pashub_fetcher.core_files import infer_file_type + + +# --- GREEN: pre-existing file types (startswith match) --- + + +def test_infer_photopack(): + assert infer_file_type("Photopack_123456_V1.pdf") == "photo_pack" + + +def test_infer_sitenote(): + assert infer_file_type("SiteNote_123456_V1.pdf") == "site_note" + + +def test_infer_rdsap_sitenote(): + assert 
infer_file_type("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note" + + +def test_infer_pas2023_ventilation(): + assert infer_file_type("PAS 2023 Ventilation Assessment Report_123456.pdf") == "pas_2023_ventilation" + + +def test_infer_pas2023_condition(): + assert infer_file_type("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition" + + +def test_infer_pas_significance(): + assert infer_file_type("PAS Significance_123456.pdf") == "pas_significance" + + +def test_infer_par_photopack(): + assert infer_file_type("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack" + + +def test_infer_pas2023_property(): + assert infer_file_type("PAS 2023 Property Assessment Report_123456.pdf") == "pas_2023_property" + + +def test_infer_pas2023_occupancy(): + assert infer_file_type("PAS 2023 Occupancy Assessment Report_123456.pdf") == "pas_2023_occupancy" + + +def test_infer_unknown_returns_none(): + assert infer_file_type("unknown_document_123.pdf") is None + + +# --- RED: new file types (suffix match not yet implemented) --- + + +def test_infer_improvement_option_evaluation(): + # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf" + assert infer_file_type("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") == "improvement_option_evaluation" + + +def test_infer_medium_term_improvement_plan(): + # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf" + assert infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") == "medium_term_improvement_plan" + + +@pytest.mark.skip(reason="Retrofit Design Doc filename pattern not yet known") +def test_infer_retrofit_design_doc(): + assert infer_file_type("2512-OSM-H56M900-XX-DR-N-A_Radford Road 408.pdf") == "retrofit_design_doc" diff --git a/pytest.ini b/pytest.ini index e2a4a25d..99cc8e1b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,27 @@ pythonpath = . 
log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/pashub_fetcher/tests backend/documents_parser/tests backend/magic_plan/tests datatypes/magicplan/api/tests datatypes/magicplan/domain/tests backend/app/db/functions/tests +testpaths = + recommendations/tests + backend/tests + backend/address2UPRN/tests + backend/app/db/functions/tests + backend/categorisation/tests + backend/condition/tests + backend/documents_parser/tests + backend/ecmk_fetcher/tests + backend/export/tests + backend/magic_plan/tests + backend/onboarders/tests + backend/pashub_fetcher/tests + datatypes/epc/domain/tests + datatypes/epc/schema/tests + datatypes/epc/surveys/tests + datatypes/magicplan/api/tests + datatypes/magicplan/domain/tests + etl/epc/tests + etl/epc_clean/tests + etl/hubspot/tests + etl/spatial/tests markers = integration: mark a test as an integration test From b3a68a264a08af77fc047f97f9adb7453b77f037 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 13:32:54 +0000 Subject: [PATCH 05/91] =?UTF-8?q?new=20files=20types=20inferred=20from=20f?= =?UTF-8?q?ile=20names=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 10 ++++++++++ backend/pashub_fetcher/tests/test_core_files.py | 6 ++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index aa426475..b5ce1073 100644 --- a/backend/pashub_fetcher/core_files.py +++ 
b/backend/pashub_fetcher/core_files.py @@ -39,4 +39,14 @@ def infer_file_type(filename: str) -> Optional[str]: for core_file, file_type in CORE_TO_FILETYPE_MAP.items(): if filename.startswith(core_file.value): return file_type + + if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename: + return CORE_TO_FILETYPE_MAP[CoreFiles.IMPROVEMENT_OPTION_EVALUATION] + + if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename: + return CORE_TO_FILETYPE_MAP[CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN] + + if "-OSM-" in filename and "DR-N-A" in filename: + return CORE_TO_FILETYPE_MAP[CoreFiles.RETROFIT_DESIGN_DOC] + return None diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index fca29b7e..f8e8b431 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -1,5 +1,3 @@ -import pytest - from backend.pashub_fetcher.core_files import infer_file_type @@ -59,6 +57,6 @@ def test_infer_medium_term_improvement_plan(): assert infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") == "medium_term_improvement_plan" -@pytest.mark.skip(reason="Retrofit Design Doc filename pattern not yet known") def test_infer_retrofit_design_doc(): - assert infer_file_type("2512-OSM-H56M900-XX-DR-N-A_Radford Road 408.pdf") == "retrofit_design_doc" + assert infer_file_type("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") == "retrofit_design_doc" + assert infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") == "retrofit_design_doc" From 39c5fd57693e6ceb5af2ce0bac7d1e53e7aca7e1 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 13:41:41 +0000 Subject: [PATCH 06/91] =?UTF-8?q?new=20files=20types=20inferred=20from=20f?= =?UTF-8?q?ile=20names=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 71 +++++++++++++++++----------- 1 file 
changed, 44 insertions(+), 27 deletions(-) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index b5ce1073..3e69bf9a 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Optional +from typing import Callable, Optional from backend.app.db.models.uploaded_file import FileTypeEnum @@ -19,34 +19,51 @@ class CoreFiles(Enum): RETROFIT_DESIGN_DOC = "Retrofit Design Doc" -CORE_TO_FILETYPE_MAP = { - CoreFiles.PHOTOPACK: FileTypeEnum.PHOTO_PACK.value, - CoreFiles.SITENOTE: FileTypeEnum.SITE_NOTE.value, - CoreFiles.RDSAP_SITENOTE: FileTypeEnum.RD_SAP_SITE_NOTE.value, - CoreFiles.PAS2023_VENTILATION: FileTypeEnum.PAS_2023_VENTILATION.value, - CoreFiles.PAS2023_CONDITION: FileTypeEnum.PAS_2023_CONDITION.value, - CoreFiles.PAS_SIGNIFICANCE: FileTypeEnum.PAS_SIGNIFICANCE.value, - CoreFiles.PAR_PHOTOPACK: FileTypeEnum.PAR_PHOTO_PACK.value, - CoreFiles.PAS2023_PROPERTY: FileTypeEnum.PAS_2023_PROPERTY.value, - CoreFiles.PAS2023_OCCUPANCY: FileTypeEnum.PAS_2023_OCCUPANCY.value, - CoreFiles.IMPROVEMENT_OPTION_EVALUATION: FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value, - CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN: FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value, - CoreFiles.RETROFIT_DESIGN_DOC: FileTypeEnum.RETROFIT_DESIGN_DOC.value, -} +_MATCHERS: list[tuple[Callable[[str], bool], str]] = [ + (lambda f: f.startswith(CoreFiles.PHOTOPACK.value), FileTypeEnum.PHOTO_PACK.value), + (lambda f: f.startswith(CoreFiles.SITENOTE.value), FileTypeEnum.SITE_NOTE.value), + ( + lambda f: f.startswith(CoreFiles.RDSAP_SITENOTE.value), + FileTypeEnum.RD_SAP_SITE_NOTE.value, + ), + ( + lambda f: f.startswith(CoreFiles.PAS2023_VENTILATION.value), + FileTypeEnum.PAS_2023_VENTILATION.value, + ), + ( + lambda f: f.startswith(CoreFiles.PAS2023_CONDITION.value), + FileTypeEnum.PAS_2023_CONDITION.value, + ), + ( + lambda f: f.startswith(CoreFiles.PAS_SIGNIFICANCE.value), + 
FileTypeEnum.PAS_SIGNIFICANCE.value, + ), + ( + lambda f: f.startswith(CoreFiles.PAR_PHOTOPACK.value), + FileTypeEnum.PAR_PHOTO_PACK.value, + ), + ( + lambda f: f.startswith(CoreFiles.PAS2023_PROPERTY.value), + FileTypeEnum.PAS_2023_PROPERTY.value, + ), + ( + lambda f: f.startswith(CoreFiles.PAS2023_OCCUPANCY.value), + FileTypeEnum.PAS_2023_OCCUPANCY.value, + ), + ( + lambda f: CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in f, + FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value, + ), + ( + lambda f: CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in f, + FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value, + ), + (lambda f: "-OSM-" in f and "DR-N-A" in f, FileTypeEnum.RETROFIT_DESIGN_DOC.value), +] def infer_file_type(filename: str) -> Optional[str]: - for core_file, file_type in CORE_TO_FILETYPE_MAP.items(): - if filename.startswith(core_file.value): + for matcher, file_type in _MATCHERS: + if matcher(filename): return file_type - - if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename: - return CORE_TO_FILETYPE_MAP[CoreFiles.IMPROVEMENT_OPTION_EVALUATION] - - if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename: - return CORE_TO_FILETYPE_MAP[CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN] - - if "-OSM-" in filename and "DR-N-A" in filename: - return CORE_TO_FILETYPE_MAP[CoreFiles.RETROFIT_DESIGN_DOC] - return None From 7635c800e6b88d65ae3ef7ddbbf7d199aaa7e64b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 13 May 2026 16:04:53 +0000 Subject: [PATCH 07/91] added 0.0.7 --- .devcontainer/backend/devcontainer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index 24949770..0a78dadf 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -5,7 +5,7 @@ "remoteUser": "vscode", "workspaceFolder": "/workspaces/model", "initializeCommand": "docker network create shared-dev 2>/dev/null || true; test -d \"$HOME/.config/gh\" 
|| test -n \"$GITHUB_TOKEN\" || { echo >&2 'error: no GitHub auth found. Run `gh auth login && gh auth setup-git` on the host, or export GITHUB_TOKEN, then retry.'; exit 1; }", - "postCreateCommand": "gh repo clone Hestia-Homes/agentic-toolkit /tmp/agentic-toolkit -- --branch 0.0.5 --depth 1 && bash /tmp/agentic-toolkit/setup.sh", + "postCreateCommand": "gh repo clone Hestia-Homes/agentic-toolkit /tmp/agentic-toolkit -- --branch 0.0.7 --depth 1 && bash /tmp/agentic-toolkit/setup.sh", "postStartCommand": "bash .devcontainer/backend/post-install.sh", "mounts": [ "source=${localEnv:HOME},target=/workspaces/home,type=bind", From df0f089d4f65d1107d69195820706205380d7e66 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:05:20 +0000 Subject: [PATCH 08/91] =?UTF-8?q?Retrofit=20design=20doc=20selected=20by?= =?UTF-8?q?=20evidence=5Fcategory=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_client.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 backend/pashub_fetcher/tests/test_pashub_client.py diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py new file mode 100644 index 00000000..4f5aef98 --- /dev/null +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -0,0 +1,44 @@ +from typing import Optional + +from backend.pashub_fetcher.core_files import CoreFiles +from backend.pashub_fetcher.evidence_file_data import EvidenceFileData +from backend.pashub_fetcher.pashub_client import PashubClient + + +def make_client() -> PashubClient: + return PashubClient(token="test-token") + + +def make_file( + file_name: str = "unknown.pdf", + evidence_category: Optional[str] = None, + created_utc: str = "2024-01-01T00:00:00", +) -> EvidenceFileData: + return EvidenceFileData( + file_id="id-1", + file_name=file_name, + created_utc=created_utc, + file_size=1024, + file_extension="pdf", + 
evidence_category=evidence_category, + ) + + +# --------------------------------------------------------------------------- +# _get_core_file_type +# --------------------------------------------------------------------------- + + +def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category() -> None: + # Arrange + client = make_client() + file = make_file( + file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", + evidence_category="retrofit design", + ) + + # Act + result = client._get_core_file_type(file) + + # Assert + assert result == CoreFiles.RETROFIT_DESIGN_DOC From f2bbb44207cc9971e8a04436dd8591d16846c2ef Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:10:56 +0000 Subject: [PATCH 09/91] =?UTF-8?q?Retrofit=20design=20doc=20selected=20by?= =?UTF-8?q?=20evidence=5Fcategory=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index 20b8590d..11195960 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -87,6 +87,9 @@ class PashubClient: return None def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]: + if file.evidence_category == "retrofit design": + return CoreFiles.RETROFIT_DESIGN_DOC + for core_file in CoreFiles: if file.file_name.startswith(core_file.value): return core_file From 157a36f0cd5801799d2df54cd836b12894b56284 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:14:07 +0000 Subject: [PATCH 10/91] =?UTF-8?q?Evidence=20category=20matching=20is=20cas?= =?UTF-8?q?e-insensitive=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pashub_fetcher/tests/test_pashub_client.py | 15 +++++++++++++++ 1 file changed, 15 
insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 4f5aef98..ccf32fa6 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -42,3 +42,18 @@ def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category() # Assert assert result == CoreFiles.RETROFIT_DESIGN_DOC + + +def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: + # Arrange + client = make_client() + file = make_file( + file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", + evidence_category="Retrofit Design", + ) + + # Act + result = client._get_core_file_type(file) + + # Assert + assert result == CoreFiles.RETROFIT_DESIGN_DOC From 6922ff3e06be9dd1f12b4914aeaa960e25ee08d9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:16:14 +0000 Subject: [PATCH 11/91] =?UTF-8?q?Evidence=20category=20matching=20is=20cas?= =?UTF-8?q?e-insensitive=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index 11195960..d7200a1f 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -87,7 +87,7 @@ class PashubClient: return None def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]: - if file.evidence_category == "retrofit design": + if file.evidence_category is not None and file.evidence_category.lower() == "retrofit design": return CoreFiles.RETROFIT_DESIGN_DOC for core_file in CoreFiles: From 5c652d94852476d469436064dbc940ae7c62f46a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:24:14 +0000 Subject: [PATCH 12/91] =?UTF-8?q?Retrofit=20Design=20Doc=20startswith=20ch?= 
=?UTF-8?q?eck=20removed=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_pashub_client.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index ccf32fa6..8654a137 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -44,6 +44,18 @@ def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category() assert result == CoreFiles.RETROFIT_DESIGN_DOC +def test_get_core_file_type_returns_improvement_option_evaluation_via_substring() -> None: + # Arrange + client = make_client() + file = make_file(file_name="6000802 - NG4 4HD - Improvement Option Evaluation.pdf") + + # Act + result = client._get_core_file_type(file) + + # Assert + assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION + + def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: # Arrange client = make_client() From a1f6ffd6b39f9b1b077cf98cf2346d2414c1c0c0 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:24:34 +0000 Subject: [PATCH 13/91] =?UTF-8?q?Improvement=20Option=20Evaluation=20selec?= =?UTF-8?q?ted=20via=20substring=20match=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index d7200a1f..ba0f0221 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -90,6 +90,9 @@ class PashubClient: if file.evidence_category is not None and file.evidence_category.lower() == "retrofit design": return CoreFiles.RETROFIT_DESIGN_DOC + if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in 
file.file_name: + return CoreFiles.IMPROVEMENT_OPTION_EVALUATION + for core_file in CoreFiles: if file.file_name.startswith(core_file.value): return core_file From d99d8a33479470156c671dd440e0e5e61269380f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:25:02 +0000 Subject: [PATCH 14/91] =?UTF-8?q?Medium=20Term=20Improvement=20Plan=20sele?= =?UTF-8?q?cted=20via=20substring=20match=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_pashub_client.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 8654a137..9b99cf5c 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -56,6 +56,18 @@ def test_get_core_file_type_returns_improvement_option_evaluation_via_substring( assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION +def test_get_core_file_type_returns_medium_term_improvement_plan_via_substring() -> None: + # Arrange + client = make_client() + file = make_file(file_name="60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") + + # Act + result = client._get_core_file_type(file) + + # Assert + assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN + + def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: # Arrange client = make_client() From 084c8218a6c5acbc532d8d41ced6cd2eb364e402 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:25:57 +0000 Subject: [PATCH 15/91] =?UTF-8?q?Medium=20Term=20Improvement=20Plan=20sele?= =?UTF-8?q?cted=20via=20substring=20match=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/pashub_fetcher/pashub_client.py 
b/backend/pashub_fetcher/pashub_client.py index ba0f0221..556884fe 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -93,6 +93,9 @@ class PashubClient: if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in file.file_name: return CoreFiles.IMPROVEMENT_OPTION_EVALUATION + if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in file.file_name: + return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN + for core_file in CoreFiles: if file.file_name.startswith(core_file.value): return core_file From a8e876d83d1e5b0bf7f204a8401e76b7fafe3170 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:26:34 +0000 Subject: [PATCH 16/91] =?UTF-8?q?Prefix=20and=20unknown=20file=20matching?= =?UTF-8?q?=20behaviour=20documented=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_client.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 9b99cf5c..036e50bc 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -68,6 +68,30 @@ def test_get_core_file_type_returns_medium_term_improvement_plan_via_substring() assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN +def test_get_core_file_type_returns_photopack_via_prefix() -> None: + # Arrange + client = make_client() + file = make_file(file_name="Photopack_123456_V1.pdf") + + # Act + result = client._get_core_file_type(file) + + # Assert + assert result == CoreFiles.PHOTOPACK + + +def test_get_core_file_type_returns_none_for_unknown_file() -> None: + # Arrange + client = make_client() + file = make_file(file_name="unknown_document_123.pdf") + + # Act + result = client._get_core_file_type(file) + + # Assert + assert result is None + + def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> 
None: # Arrange client = make_client() From 506dc92aa3ccc9ef4b3b7f6ab0351e0c76dd7ec8 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:27:42 +0000 Subject: [PATCH 17/91] =?UTF-8?q?=5Fselect=5Flatest=5Fcore=5Ffiles=20retur?= =?UTF-8?q?ns=20single=20retrofit=20design=20doc=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_client.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 036e50bc..334f2de0 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -1,3 +1,4 @@ +# pyright: reportPrivateUsage=false from typing import Optional from backend.pashub_fetcher.core_files import CoreFiles @@ -92,6 +93,29 @@ def test_get_core_file_type_returns_none_for_unknown_file() -> None: assert result is None +# --------------------------------------------------------------------------- +# _select_latest_core_files +# --------------------------------------------------------------------------- + + +def test_select_latest_core_files_returns_single_retrofit_design_doc() -> None: + # Arrange + client = make_client() + files = [ + make_file( + file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", + evidence_category="retrofit design", + created_utc="2024-06-01T00:00:00", + ) + ] + + # Act + result = client._select_latest_core_files(files) + + # Assert + assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" + + def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: # Arrange client = make_client() From b685008e5ee1816588dedc096866a63764fc9c2a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:28:19 +0000 Subject: [PATCH 18/91] 
=?UTF-8?q?OSM=20candidate=20wins=20over=20non-OSM?= =?UTF-8?q?=20retrofit=20design=20doc=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_client.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 334f2de0..646ff3bc 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -116,6 +116,29 @@ def test_select_latest_core_files_returns_single_retrofit_design_doc() -> None: assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" +def test_select_latest_core_files_osm_candidate_wins_over_non_osm() -> None: + # Arrange - the non-OSM file is newer but should lose to the OSM file + client = make_client() + files = [ + make_file( + file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", + evidence_category="retrofit design", + created_utc="2024-01-01T00:00:00", + ), + make_file( + file_name="Retrofit Design Doc non-osm variant.pdf", + evidence_category="retrofit design", + created_utc="2024-06-01T00:00:00", + ), + ] + + # Act + result = client._select_latest_core_files(files) + + # Assert + assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" + + def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: # Arrange client = make_client() From aff79d4151da0b8b0958a34b8090abac7a27260b Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:28:50 +0000 Subject: [PATCH 19/91] =?UTF-8?q?OSM=20candidate=20wins=20over=20non-OSM?= =?UTF-8?q?=20retrofit=20design=20doc=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_client.py | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index 556884fe..4435c278 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -116,6 +116,9 @@ class PashubClient: latest_files: Dict[CoreFiles, EvidenceFileData] = {} for core_type, group in grouped.items(): + if core_type == CoreFiles.RETROFIT_DESIGN_DOC and len(group) > 1: + osm_candidates = [f for f in group if "-OSM-" in f.file_name] + group = osm_candidates if osm_candidates else group latest = max(group, key=lambda f: datetime.fromisoformat(f.created_utc)) latest_files[core_type] = latest From 3fe85a635ca94aca2af08ee71b7ee59e9495b106 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:29:24 +0000 Subject: [PATCH 20/91] =?UTF-8?q?Latest=20wins=20when=20both=20retrofit=20?= =?UTF-8?q?design=20doc=20candidates=20have=20OSM=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_client.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 646ff3bc..7f0663db 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -139,6 +139,29 @@ def test_select_latest_core_files_osm_candidate_wins_over_non_osm() -> None: assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" +def test_select_latest_core_files_picks_latest_when_both_candidates_have_osm() -> None: + # Arrange + client = make_client() + files = [ + make_file( + file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", + evidence_category="retrofit design", + created_utc="2024-01-01T00:00:00", + ), + make_file( + file_name="2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf", + evidence_category="retrofit 
design", + created_utc="2024-06-01T00:00:00", + ), + ] + + # Act + result = client._select_latest_core_files(files) + + # Assert + assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf" + + def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: # Arrange client = make_client() From 9a04d89cae07671fbe182334df59e079a22f5e78 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:29:54 +0000 Subject: [PATCH 21/91] =?UTF-8?q?Latest=20wins=20as=20fallback=20when=20no?= =?UTF-8?q?=20OSM=20retrofit=20design=20doc=20candidates=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_client.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 7f0663db..9ee8948a 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -162,6 +162,29 @@ def test_select_latest_core_files_picks_latest_when_both_candidates_have_osm() - assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf" +def test_select_latest_core_files_falls_back_to_latest_when_no_osm_candidates() -> None: + # Arrange + client = make_client() + files = [ + make_file( + file_name="retrofit_design_v1.pdf", + evidence_category="retrofit design", + created_utc="2024-01-01T00:00:00", + ), + make_file( + file_name="retrofit_design_v2.pdf", + evidence_category="retrofit design", + created_utc="2024-06-01T00:00:00", + ), + ] + + # Act + result = client._select_latest_core_files(files) + + # Assert + assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "retrofit_design_v2.pdf" + + def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: # Arrange client = make_client() From 
16af543560f559c005f649a47b05c60cce2b2c94 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 13 May 2026 16:32:44 +0000 Subject: [PATCH 22/91] =?UTF-8?q?Consolidate=20three-tier=20matching=20and?= =?UTF-8?q?=20tidy=20test=20ordering=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_client.py | 13 ++++---- .../tests/test_pashub_client.py | 30 +++++++++---------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index 4435c278..25bf7b72 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -90,13 +90,16 @@ class PashubClient: if file.evidence_category is not None and file.evidence_category.lower() == "retrofit design": return CoreFiles.RETROFIT_DESIGN_DOC - if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in file.file_name: - return CoreFiles.IMPROVEMENT_OPTION_EVALUATION - - if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in file.file_name: - return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN + for core_file in ( + CoreFiles.IMPROVEMENT_OPTION_EVALUATION, + CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN, + ): + if core_file.value in file.file_name: + return core_file for core_file in CoreFiles: + if core_file is CoreFiles.RETROFIT_DESIGN_DOC: + continue if file.file_name.startswith(core_file.value): return core_file return None diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 9ee8948a..7fd10381 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -45,6 +45,21 @@ def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category() assert result == CoreFiles.RETROFIT_DESIGN_DOC +def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: + # Arrange + client = 
make_client() + file = make_file( + file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", + evidence_category="Retrofit Design", + ) + + # Act + result = client._get_core_file_type(file) + + # Assert + assert result == CoreFiles.RETROFIT_DESIGN_DOC + + def test_get_core_file_type_returns_improvement_option_evaluation_via_substring() -> None: # Arrange client = make_client() @@ -183,18 +198,3 @@ def test_select_latest_core_files_falls_back_to_latest_when_no_osm_candidates() # Assert assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "retrofit_design_v2.pdf" - - -def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: - # Arrange - client = make_client() - file = make_file( - file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", - evidence_category="Retrofit Design", - ) - - # Act - result = client._get_core_file_type(file) - - # Assert - assert result == CoreFiles.RETROFIT_DESIGN_DOC From 664c9b91fa9e280766dbadda11a065b6c044d0a9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 07:38:43 +0000 Subject: [PATCH 23/91] delete incorrect comment in test --- .../pashub_fetcher/tests/test_core_files.py | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index f8e8b431..8715f6ca 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -1,6 +1,5 @@ from backend.pashub_fetcher.core_files import infer_file_type - # --- GREEN: pre-existing file types (startswith match) --- @@ -13,15 +12,22 @@ def test_infer_sitenote(): def test_infer_rdsap_sitenote(): - assert infer_file_type("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note" + assert ( + infer_file_type("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note" + ) def test_infer_pas2023_ventilation(): - assert infer_file_type("PAS 2023 
Ventilation Assessment Report_123456.pdf") == "pas_2023_ventilation" + assert ( + infer_file_type("PAS 2023 Ventilation Assessment Report_123456.pdf") + == "pas_2023_ventilation" + ) def test_infer_pas2023_condition(): - assert infer_file_type("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition" + assert ( + infer_file_type("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition" + ) def test_infer_pas_significance(): @@ -29,34 +35,51 @@ def test_infer_pas_significance(): def test_infer_par_photopack(): - assert infer_file_type("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack" + assert ( + infer_file_type("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack" + ) def test_infer_pas2023_property(): - assert infer_file_type("PAS 2023 Property Assessment Report_123456.pdf") == "pas_2023_property" + assert ( + infer_file_type("PAS 2023 Property Assessment Report_123456.pdf") + == "pas_2023_property" + ) def test_infer_pas2023_occupancy(): - assert infer_file_type("PAS 2023 Occupancy Assessment Report_123456.pdf") == "pas_2023_occupancy" + assert ( + infer_file_type("PAS 2023 Occupancy Assessment Report_123456.pdf") + == "pas_2023_occupancy" + ) def test_infer_unknown_returns_none(): assert infer_file_type("unknown_document_123.pdf") is None -# --- RED: new file types (suffix match not yet implemented) --- - - def test_infer_improvement_option_evaluation(): # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf" - assert infer_file_type("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") == "improvement_option_evaluation" + assert ( + infer_file_type("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") + == "improvement_option_evaluation" + ) def test_infer_medium_term_improvement_plan(): # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf" - assert infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") == "medium_term_improvement_plan" + assert ( + 
infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") + == "medium_term_improvement_plan" + ) def test_infer_retrofit_design_doc(): - assert infer_file_type("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") == "retrofit_design_doc" - assert infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") == "retrofit_design_doc" + assert ( + infer_file_type("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") + == "retrofit_design_doc" + ) + assert ( + infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") + == "retrofit_design_doc" + ) From 75093fc8333b1cb2ff80cca61e4588e73a448f6a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 07:38:58 +0000 Subject: [PATCH 24/91] delete incorrect comment in test --- backend/pashub_fetcher/tests/test_core_files.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index 8715f6ca..8bd31f15 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -1,7 +1,5 @@ from backend.pashub_fetcher.core_files import infer_file_type -# --- GREEN: pre-existing file types (startswith match) --- - def test_infer_photopack(): assert infer_file_type("Photopack_123456_V1.pdf") == "photo_pack" From 1a789ec609c4b6ca6afe3ea83e9a753687f8a0a4 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:37:32 +0000 Subject: [PATCH 25/91] =?UTF-8?q?new=20core=5Ffile=5Ffor=20function=20iden?= =?UTF-8?q?tifies=20CoreFiles=20type=20from=20filename=20and=20evidence=20?= =?UTF-8?q?category=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 6 ++++++ .../pashub_fetcher/tests/test_core_files.py | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/core_files.py 
b/backend/pashub_fetcher/core_files.py index 3e69bf9a..050dde27 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -62,6 +62,12 @@ _MATCHERS: list[tuple[Callable[[str], bool], str]] = [ ] +def core_file_for( + filename: str, evidence_category: Optional[str] = None +) -> Optional[CoreFiles]: + raise NotImplementedError + + def infer_file_type(filename: str) -> Optional[str]: for matcher, file_type in _MATCHERS: if matcher(filename): diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index 8bd31f15..5ac6b4f7 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -1,4 +1,4 @@ -from backend.pashub_fetcher.core_files import infer_file_type +from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, infer_file_type def test_infer_photopack(): @@ -81,3 +81,19 @@ def test_infer_retrofit_design_doc(): infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") == "retrofit_design_doc" ) + + +# --------------------------------------------------------------------------- +# core_file_for +# --------------------------------------------------------------------------- + + +def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None: + # Arrange + filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" + + # Act + result = core_file_for(filename, evidence_category="retrofit design") + + # Assert + assert result == CoreFiles.RETROFIT_DESIGN_DOC From 9adb467a02e42d1d0a82285f1acafa4c344deb1d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:38:36 +0000 Subject: [PATCH 26/91] =?UTF-8?q?new=20core=5Ffile=5Ffor=20function=20iden?= =?UTF-8?q?tifies=20CoreFiles=20type=20from=20filename=20and=20evidence=20?= =?UTF-8?q?category=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
backend/pashub_fetcher/core_files.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 050dde27..07297653 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -65,6 +65,8 @@ _MATCHERS: list[tuple[Callable[[str], bool], str]] = [ def core_file_for( filename: str, evidence_category: Optional[str] = None ) -> Optional[CoreFiles]: + if evidence_category is not None and evidence_category.lower() == "retrofit design": + return CoreFiles.RETROFIT_DESIGN_DOC raise NotImplementedError From e312dd26146115b467437ee60f93de4cb76125ee Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:39:11 +0000 Subject: [PATCH 27/91] =?UTF-8?q?core=5Ffile=5Ffor=20evidence=5Fcategory?= =?UTF-8?q?=20match=20is=20case-insensitive=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index 5ac6b4f7..f968a976 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -88,6 +88,17 @@ def test_infer_retrofit_design_doc(): # --------------------------------------------------------------------------- +def test_core_file_for_evidence_category_match_is_case_insensitive() -> None: + # Arrange + filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" + + # Act + result = core_file_for(filename, evidence_category="Retrofit Design") + + # Assert + assert result == CoreFiles.RETROFIT_DESIGN_DOC + + def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None: # Arrange filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" From 9bbd5f1ff9fc0810383c73a2d7bc8863c4f2c258 Mon Sep 17 00:00:00 2001 From: Daniel 
Roth Date: Thu, 14 May 2026 08:39:58 +0000 Subject: [PATCH 28/91] =?UTF-8?q?core=5Ffile=5Ffor=20identifies=20IOE=20fi?= =?UTF-8?q?les=20via=20filename=20substring=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index f968a976..c6970def 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -108,3 +108,14 @@ def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None: # Assert assert result == CoreFiles.RETROFIT_DESIGN_DOC + + +def test_core_file_for_ioe_substring_returns_improvement_option_evaluation() -> None: + # Arrange + filename = "6000802 - NG4 4HD - Improvement Option Evaluation.pdf" + + # Act + result = core_file_for(filename) + + # Assert + assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION From 46355be3f1e24d10662583afa3b8b55f3a1d8cc6 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:40:21 +0000 Subject: [PATCH 29/91] =?UTF-8?q?core=5Ffile=5Ffor=20identifies=20IOE=20fi?= =?UTF-8?q?les=20via=20filename=20substring=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 07297653..72ef15f8 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -67,6 +67,8 @@ def core_file_for( ) -> Optional[CoreFiles]: if evidence_category is not None and evidence_category.lower() == "retrofit design": return CoreFiles.RETROFIT_DESIGN_DOC + if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename: + return 
CoreFiles.IMPROVEMENT_OPTION_EVALUATION raise NotImplementedError From 176239475a977943bf81e6bddb9d042bbbb5d014 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:40:49 +0000 Subject: [PATCH 30/91] =?UTF-8?q?core=5Ffile=5Ffor=20identifies=20MTIP=20f?= =?UTF-8?q?iles=20via=20filename=20substring=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index c6970def..85e7607e 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -119,3 +119,14 @@ def test_core_file_for_ioe_substring_returns_improvement_option_evaluation() -> # Assert assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION + + +def test_core_file_for_mtip_substring_returns_medium_term_improvement_plan() -> None: + # Arrange + filename = "60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf" + + # Act + result = core_file_for(filename) + + # Assert + assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN From 4d3d6dba05477bef466f64dde09f4d88956efad0 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:41:26 +0000 Subject: [PATCH 31/91] =?UTF-8?q?core=5Ffile=5Ffor=20identifies=20MTIP=20f?= =?UTF-8?q?iles=20via=20filename=20substring=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 72ef15f8..4b1023d2 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -69,6 +69,8 @@ def core_file_for( return CoreFiles.RETROFIT_DESIGN_DOC if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value 
in filename: return CoreFiles.IMPROVEMENT_OPTION_EVALUATION + if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename: + return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN raise NotImplementedError From e940e75a43f1a3aebe8a78dc7bd06d4c648997fb Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:41:52 +0000 Subject: [PATCH 32/91] =?UTF-8?q?core=5Ffile=5Ffor=20falls=20back=20to=20O?= =?UTF-8?q?SM=20filename=20pattern=20for=20Retrofit=20Design=20Doc=20?= =?UTF-8?q?=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index 85e7607e..7b991c23 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -130,3 +130,14 @@ def test_core_file_for_mtip_substring_returns_medium_term_improvement_plan() -> # Assert assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN + + +def test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_category() -> None: + # Arrange + filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" + + # Act + result = core_file_for(filename) + + # Assert + assert result == CoreFiles.RETROFIT_DESIGN_DOC From 3ef8a591223ea50ade12b36545dda1f92542abee Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:43:04 +0000 Subject: [PATCH 33/91] =?UTF-8?q?core=5Ffile=5Ffor=20falls=20back=20to=20O?= =?UTF-8?q?SM=20filename=20pattern=20for=20Retrofit=20Design=20Doc=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 4b1023d2..75981cb1 100644 --- 
a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -71,6 +71,8 @@ def core_file_for( return CoreFiles.IMPROVEMENT_OPTION_EVALUATION if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename: return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN + if evidence_category is None and "-OSM-" in filename and "DR-N-A" in filename: + return CoreFiles.RETROFIT_DESIGN_DOC raise NotImplementedError From a2dc945bf38005826a6bc713e3d93ca30b5a79e0 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:43:41 +0000 Subject: [PATCH 34/91] =?UTF-8?q?core=5Ffile=5Ffor=20matches=20remaining?= =?UTF-8?q?=20core=20file=20types=20via=20filename=20prefix=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index 7b991c23..f87d8679 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -141,3 +141,14 @@ def test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_ # Assert assert result == CoreFiles.RETROFIT_DESIGN_DOC + + +def test_core_file_for_prefix_returns_photopack() -> None: + # Arrange + filename = "Photopack_123456_V1.pdf" + + # Act + result = core_file_for(filename) + + # Assert + assert result == CoreFiles.PHOTOPACK From 605f2e3d1e1f5bdc3ceaa953fd7150f938ade72f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:45:18 +0000 Subject: [PATCH 35/91] =?UTF-8?q?core=5Ffile=5Ffor=20matches=20remaining?= =?UTF-8?q?=20core=20file=20types=20via=20filename=20prefix=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git 
a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 75981cb1..87a4044a 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -73,7 +73,17 @@ def core_file_for( return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN if evidence_category is None and "-OSM-" in filename and "DR-N-A" in filename: return CoreFiles.RETROFIT_DESIGN_DOC - raise NotImplementedError + _prefix_skip = { + CoreFiles.RETROFIT_DESIGN_DOC, + CoreFiles.IMPROVEMENT_OPTION_EVALUATION, + CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN, + } + for core_file in CoreFiles: + if core_file in _prefix_skip: + continue + if filename.startswith(core_file.value): + return core_file + return None def infer_file_type(filename: str) -> Optional[str]: From d4cc00b5e31d7b6653ccb0a7f1307b2638dc2a12 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:46:10 +0000 Subject: [PATCH 36/91] =?UTF-8?q?core=5Ffile=5Ffor=20returns=20None=20for?= =?UTF-8?q?=20unrecognised=20filenames=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index f87d8679..2b20803c 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -152,3 +152,14 @@ def test_core_file_for_prefix_returns_photopack() -> None: # Assert assert result == CoreFiles.PHOTOPACK + + +def test_core_file_for_unknown_filename_returns_none() -> None: + # Arrange + filename = "unknown_document_123.pdf" + + # Act + result = core_file_for(filename) + + # Assert + assert result is None From 541d5965b7619090b9d1a564761e424cba37d86e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:46:48 +0000 Subject: [PATCH 37/91] =?UTF-8?q?core=5Ffile=5Ffor=20OSM=20fallback=20is?= 
=?UTF-8?q?=20suppressed=20when=20evidence=5Fcategory=20is=20present=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index 2b20803c..e97df476 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -163,3 +163,14 @@ def test_core_file_for_unknown_filename_returns_none() -> None: # Assert assert result is None + + +def test_core_file_for_osm_fallback_does_not_fire_when_evidence_category_present() -> None: + # Arrange — OSM+DR-N-A filename but evidence_category is something other than retrofit design + filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" + + # Act + result = core_file_for(filename, evidence_category="some other category") + + # Assert + assert result is None From 5e31c0f3dadd4d4da36dc023612388ef66f5b4c9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:51:28 +0000 Subject: [PATCH 38/91] =?UTF-8?q?file=5Ftype=5Ffor=20delegates=20to=20core?= =?UTF-8?q?=5Ffile=5Ffor;=20=5FMATCHERS=20removed=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 67 ++++++------------- backend/pashub_fetcher/pashub_service.py | 4 +- .../pashub_fetcher/tests/test_core_files.py | 30 ++++----- 3 files changed, 37 insertions(+), 64 deletions(-) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 87a4044a..01ae189f 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Callable, Optional +from typing import Optional from backend.app.db.models.uploaded_file import FileTypeEnum 
@@ -19,47 +19,20 @@ class CoreFiles(Enum): RETROFIT_DESIGN_DOC = "Retrofit Design Doc" -_MATCHERS: list[tuple[Callable[[str], bool], str]] = [ - (lambda f: f.startswith(CoreFiles.PHOTOPACK.value), FileTypeEnum.PHOTO_PACK.value), - (lambda f: f.startswith(CoreFiles.SITENOTE.value), FileTypeEnum.SITE_NOTE.value), - ( - lambda f: f.startswith(CoreFiles.RDSAP_SITENOTE.value), - FileTypeEnum.RD_SAP_SITE_NOTE.value, - ), - ( - lambda f: f.startswith(CoreFiles.PAS2023_VENTILATION.value), - FileTypeEnum.PAS_2023_VENTILATION.value, - ), - ( - lambda f: f.startswith(CoreFiles.PAS2023_CONDITION.value), - FileTypeEnum.PAS_2023_CONDITION.value, - ), - ( - lambda f: f.startswith(CoreFiles.PAS_SIGNIFICANCE.value), - FileTypeEnum.PAS_SIGNIFICANCE.value, - ), - ( - lambda f: f.startswith(CoreFiles.PAR_PHOTOPACK.value), - FileTypeEnum.PAR_PHOTO_PACK.value, - ), - ( - lambda f: f.startswith(CoreFiles.PAS2023_PROPERTY.value), - FileTypeEnum.PAS_2023_PROPERTY.value, - ), - ( - lambda f: f.startswith(CoreFiles.PAS2023_OCCUPANCY.value), - FileTypeEnum.PAS_2023_OCCUPANCY.value, - ), - ( - lambda f: CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in f, - FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value, - ), - ( - lambda f: CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in f, - FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value, - ), - (lambda f: "-OSM-" in f and "DR-N-A" in f, FileTypeEnum.RETROFIT_DESIGN_DOC.value), -] +_CORE_FILE_TO_FILE_TYPE: dict[CoreFiles, str] = { + CoreFiles.PHOTOPACK: FileTypeEnum.PHOTO_PACK.value, + CoreFiles.SITENOTE: FileTypeEnum.SITE_NOTE.value, + CoreFiles.RDSAP_SITENOTE: FileTypeEnum.RD_SAP_SITE_NOTE.value, + CoreFiles.PAS2023_VENTILATION: FileTypeEnum.PAS_2023_VENTILATION.value, + CoreFiles.PAS2023_CONDITION: FileTypeEnum.PAS_2023_CONDITION.value, + CoreFiles.PAS_SIGNIFICANCE: FileTypeEnum.PAS_SIGNIFICANCE.value, + CoreFiles.PAR_PHOTOPACK: FileTypeEnum.PAR_PHOTO_PACK.value, + CoreFiles.PAS2023_PROPERTY: FileTypeEnum.PAS_2023_PROPERTY.value, + 
CoreFiles.PAS2023_OCCUPANCY: FileTypeEnum.PAS_2023_OCCUPANCY.value, + CoreFiles.IMPROVEMENT_OPTION_EVALUATION: FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value, + CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN: FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value, + CoreFiles.RETROFIT_DESIGN_DOC: FileTypeEnum.RETROFIT_DESIGN_DOC.value, +} def core_file_for( @@ -86,8 +59,8 @@ def core_file_for( return None -def infer_file_type(filename: str) -> Optional[str]: - for matcher, file_type in _MATCHERS: - if matcher(filename): - return file_type - return None +def file_type_for(filename: str) -> Optional[str]: + core_file = core_file_for(filename) + if core_file is None: + return None + return _CORE_FILE_TO_FILE_TYPE[core_file] diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index 316902f4..ec623f7a 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -10,7 +10,7 @@ from backend.app.db.models.uploaded_file import ( ) from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.parser import parse_site_notes_pdf -from backend.pashub_fetcher.core_files import infer_file_type +from backend.pashub_fetcher.core_files import file_type_for from backend.pashub_fetcher.pashub_client import PashubClient from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( PashubToAraTriggerRequest, @@ -109,7 +109,7 @@ class PashubService: uprn=int(uprn) if uprn else None, hubspot_deal_id=hubspot_deal_id, file_source=FileSourceEnum.PAS_HUB.value, - file_type=infer_file_type(filename), + file_type=file_type_for(filename), ) file_paths.append(file_path) uploaded_files.append(uploaded_file) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index e97df476..09fcdcb2 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -1,65 +1,65 @@ -from 
backend.pashub_fetcher.core_files import CoreFiles, core_file_for, infer_file_type +from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, file_type_for def test_infer_photopack(): - assert infer_file_type("Photopack_123456_V1.pdf") == "photo_pack" + assert file_type_for("Photopack_123456_V1.pdf") == "photo_pack" def test_infer_sitenote(): - assert infer_file_type("SiteNote_123456_V1.pdf") == "site_note" + assert file_type_for("SiteNote_123456_V1.pdf") == "site_note" def test_infer_rdsap_sitenote(): assert ( - infer_file_type("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note" + file_type_for("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note" ) def test_infer_pas2023_ventilation(): assert ( - infer_file_type("PAS 2023 Ventilation Assessment Report_123456.pdf") + file_type_for("PAS 2023 Ventilation Assessment Report_123456.pdf") == "pas_2023_ventilation" ) def test_infer_pas2023_condition(): assert ( - infer_file_type("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition" + file_type_for("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition" ) def test_infer_pas_significance(): - assert infer_file_type("PAS Significance_123456.pdf") == "pas_significance" + assert file_type_for("PAS Significance_123456.pdf") == "pas_significance" def test_infer_par_photopack(): assert ( - infer_file_type("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack" + file_type_for("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack" ) def test_infer_pas2023_property(): assert ( - infer_file_type("PAS 2023 Property Assessment Report_123456.pdf") + file_type_for("PAS 2023 Property Assessment Report_123456.pdf") == "pas_2023_property" ) def test_infer_pas2023_occupancy(): assert ( - infer_file_type("PAS 2023 Occupancy Assessment Report_123456.pdf") + file_type_for("PAS 2023 Occupancy Assessment Report_123456.pdf") == "pas_2023_occupancy" ) def test_infer_unknown_returns_none(): - assert 
infer_file_type("unknown_document_123.pdf") is None + assert file_type_for("unknown_document_123.pdf") is None def test_infer_improvement_option_evaluation(): # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf" assert ( - infer_file_type("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") + file_type_for("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") == "improvement_option_evaluation" ) @@ -67,18 +67,18 @@ def test_infer_improvement_option_evaluation(): def test_infer_medium_term_improvement_plan(): # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf" assert ( - infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") + file_type_for("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") == "medium_term_improvement_plan" ) def test_infer_retrofit_design_doc(): assert ( - infer_file_type("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") + file_type_for("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") == "retrofit_design_doc" ) assert ( - infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") + file_type_for("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") == "retrofit_design_doc" ) From fb9bdbc585940e4afe714c152cfc52b48559336d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 08:53:56 +0000 Subject: [PATCH 39/91] =?UTF-8?q?=5Fselect=5Flatest=5Fcore=5Ffiles=20deleg?= =?UTF-8?q?ates=20to=20core=5Ffile=5Ffor;=20=5Fget=5Fcore=5Ffile=5Ftype=20?= =?UTF-8?q?removed=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_client.py | 22 +---- .../tests/test_pashub_client.py | 83 ------------------- 2 files changed, 2 insertions(+), 103 deletions(-) diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index 25bf7b72..f851c410 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py 
@@ -5,7 +5,7 @@ from datetime import datetime import requests -from backend.pashub_fetcher.core_files import CoreFiles +from backend.pashub_fetcher.core_files import CoreFiles, core_file_for from backend.pashub_fetcher.evidence_file_data import EvidenceFileData from backend.pashub_fetcher.evidence_metadata import EvidenceMetadata from utils.logger import setup_logger @@ -86,24 +86,6 @@ class PashubClient: except Exception: return None - def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]: - if file.evidence_category is not None and file.evidence_category.lower() == "retrofit design": - return CoreFiles.RETROFIT_DESIGN_DOC - - for core_file in ( - CoreFiles.IMPROVEMENT_OPTION_EVALUATION, - CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN, - ): - if core_file.value in file.file_name: - return core_file - - for core_file in CoreFiles: - if core_file is CoreFiles.RETROFIT_DESIGN_DOC: - continue - if file.file_name.startswith(core_file.value): - return core_file - return None - def _select_latest_core_files( self, files: List[EvidenceFileData], @@ -111,7 +93,7 @@ class PashubClient: grouped: Dict[CoreFiles, List[EvidenceFileData]] = defaultdict(list) for file in files: - core_type = self._get_core_file_type(file) + core_type = core_file_for(file.file_name, file.evidence_category) if not core_type: continue grouped[core_type].append(file) diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index 7fd10381..34260c73 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -25,89 +25,6 @@ def make_file( ) -# --------------------------------------------------------------------------- -# _get_core_file_type -# --------------------------------------------------------------------------- - - -def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category() -> None: - # Arrange - client = make_client() - file = 
make_file( - file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", - evidence_category="retrofit design", - ) - - # Act - result = client._get_core_file_type(file) - - # Assert - assert result == CoreFiles.RETROFIT_DESIGN_DOC - - -def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None: - # Arrange - client = make_client() - file = make_file( - file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf", - evidence_category="Retrofit Design", - ) - - # Act - result = client._get_core_file_type(file) - - # Assert - assert result == CoreFiles.RETROFIT_DESIGN_DOC - - -def test_get_core_file_type_returns_improvement_option_evaluation_via_substring() -> None: - # Arrange - client = make_client() - file = make_file(file_name="6000802 - NG4 4HD - Improvement Option Evaluation.pdf") - - # Act - result = client._get_core_file_type(file) - - # Assert - assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION - - -def test_get_core_file_type_returns_medium_term_improvement_plan_via_substring() -> None: - # Arrange - client = make_client() - file = make_file(file_name="60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") - - # Act - result = client._get_core_file_type(file) - - # Assert - assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN - - -def test_get_core_file_type_returns_photopack_via_prefix() -> None: - # Arrange - client = make_client() - file = make_file(file_name="Photopack_123456_V1.pdf") - - # Act - result = client._get_core_file_type(file) - - # Assert - assert result == CoreFiles.PHOTOPACK - - -def test_get_core_file_type_returns_none_for_unknown_file() -> None: - # Arrange - client = make_client() - file = make_file(file_name="unknown_document_123.pdf") - - # Act - result = client._get_core_file_type(file) - - # Assert - assert result is None - - # --------------------------------------------------------------------------- # _select_latest_core_files # 
--------------------------------------------------------------------------- From e8b7cfdcec0c62389759ba4d7ce8642994df062e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 09:01:56 +0000 Subject: [PATCH 40/91] =?UTF-8?q?remove=20redundant=20unknown-file=20test;?= =?UTF-8?q?=20rename=20test=5Finfer=5F*=20to=20test=5Ffile=5Ftype=5Ffor=5F?= =?UTF-8?q?*=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/core_files.py | 9 ++++++ .../pashub_fetcher/tests/test_core_files.py | 28 ++++++++----------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 01ae189f..e668ba7f 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -38,24 +38,33 @@ _CORE_FILE_TO_FILE_TYPE: dict[CoreFiles, str] = { def core_file_for( filename: str, evidence_category: Optional[str] = None ) -> Optional[CoreFiles]: + # Identify retrofit design doc using evidence category as the name is possibly unreliable. 
+ # We might change to always use evidence category, but needs more investigation if evidence_category is not None and evidence_category.lower() == "retrofit design": return CoreFiles.RETROFIT_DESIGN_DOC + if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename: return CoreFiles.IMPROVEMENT_OPTION_EVALUATION + if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename: return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN + if evidence_category is None and "-OSM-" in filename and "DR-N-A" in filename: return CoreFiles.RETROFIT_DESIGN_DOC + _prefix_skip = { CoreFiles.RETROFIT_DESIGN_DOC, CoreFiles.IMPROVEMENT_OPTION_EVALUATION, CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN, } + for core_file in CoreFiles: if core_file in _prefix_skip: continue + if filename.startswith(core_file.value): return core_file + return None diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index 09fcdcb2..ee91298e 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -1,62 +1,58 @@ from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, file_type_for -def test_infer_photopack(): +def test_file_type_for_photopack(): assert file_type_for("Photopack_123456_V1.pdf") == "photo_pack" -def test_infer_sitenote(): +def test_file_type_for_sitenote(): assert file_type_for("SiteNote_123456_V1.pdf") == "site_note" -def test_infer_rdsap_sitenote(): +def test_file_type_for_rdsap_sitenote(): assert ( file_type_for("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note" ) -def test_infer_pas2023_ventilation(): +def test_file_type_for_pas2023_ventilation(): assert ( file_type_for("PAS 2023 Ventilation Assessment Report_123456.pdf") == "pas_2023_ventilation" ) -def test_infer_pas2023_condition(): +def test_file_type_for_pas2023_condition(): assert ( file_type_for("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition" ) -def test_infer_pas_significance(): +def 
test_file_type_for_pas_significance(): assert file_type_for("PAS Significance_123456.pdf") == "pas_significance" -def test_infer_par_photopack(): +def test_file_type_for_par_photopack(): assert ( file_type_for("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack" ) -def test_infer_pas2023_property(): +def test_file_type_for_pas2023_property(): assert ( file_type_for("PAS 2023 Property Assessment Report_123456.pdf") == "pas_2023_property" ) -def test_infer_pas2023_occupancy(): +def test_file_type_for_pas2023_occupancy(): assert ( file_type_for("PAS 2023 Occupancy Assessment Report_123456.pdf") == "pas_2023_occupancy" ) -def test_infer_unknown_returns_none(): - assert file_type_for("unknown_document_123.pdf") is None - - -def test_infer_improvement_option_evaluation(): +def test_file_type_for_improvement_option_evaluation(): # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf" assert ( file_type_for("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") @@ -64,7 +60,7 @@ def test_infer_improvement_option_evaluation(): ) -def test_infer_medium_term_improvement_plan(): +def test_file_type_for_medium_term_improvement_plan(): # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf" assert ( file_type_for("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") @@ -72,7 +68,7 @@ def test_infer_medium_term_improvement_plan(): ) -def test_infer_retrofit_design_doc(): +def test_file_type_for_retrofit_design_doc(): assert ( file_type_for("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") == "retrofit_design_doc" From faf698eb7162af4a2f08da1379d5ce3f1be41444 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 10:57:37 +0000 Subject: [PATCH 41/91] rename functions and include typehints --- backend/pashub_fetcher/core_files.py | 6 +- backend/pashub_fetcher/pashub_client.py | 7 ++- backend/pashub_fetcher/pashub_service.py | 4 +- .../pashub_fetcher/tests/test_core_files.py | 61 +++++++++++-------- 4 files 
changed, 46 insertions(+), 32 deletions(-) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index e668ba7f..30aa2ba8 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -35,7 +35,7 @@ _CORE_FILE_TO_FILE_TYPE: dict[CoreFiles, str] = { } -def core_file_for( +def get_core_file_type( filename: str, evidence_category: Optional[str] = None ) -> Optional[CoreFiles]: # Identify retrofit design doc using evidence category as the name is possibly unreliable. @@ -68,8 +68,8 @@ def core_file_for( return None -def file_type_for(filename: str) -> Optional[str]: - core_file = core_file_for(filename) +def get_file_type_string(filename: str) -> Optional[str]: + core_file = get_core_file_type(filename) if core_file is None: return None return _CORE_FILE_TO_FILE_TYPE[core_file] diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index f851c410..7896664d 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -5,12 +5,11 @@ from datetime import datetime import requests -from backend.pashub_fetcher.core_files import CoreFiles, core_file_for +from backend.pashub_fetcher.core_files import CoreFiles, get_core_file_type from backend.pashub_fetcher.evidence_file_data import EvidenceFileData from backend.pashub_fetcher.evidence_metadata import EvidenceMetadata from utils.logger import setup_logger - logger = setup_logger() @@ -93,7 +92,9 @@ class PashubClient: grouped: Dict[CoreFiles, List[EvidenceFileData]] = defaultdict(list) for file in files: - core_type = core_file_for(file.file_name, file.evidence_category) + core_type: Optional[CoreFiles] = get_core_file_type( + file.file_name, file.evidence_category + ) if not core_type: continue grouped[core_type].append(file) diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index ec623f7a..b3302fd9 100644 --- 
a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -10,7 +10,7 @@ from backend.app.db.models.uploaded_file import ( ) from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.parser import parse_site_notes_pdf -from backend.pashub_fetcher.core_files import file_type_for +from backend.pashub_fetcher.core_files import get_file_type_string from backend.pashub_fetcher.pashub_client import PashubClient from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( PashubToAraTriggerRequest, @@ -109,7 +109,7 @@ class PashubService: uprn=int(uprn) if uprn else None, hubspot_deal_id=hubspot_deal_id, file_source=FileSourceEnum.PAS_HUB.value, - file_type=file_type_for(filename), + file_type=get_file_type_string(filename), ) file_paths.append(file_path) uploaded_files.append(uploaded_file) diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py index ee91298e..3c1d11b8 100644 --- a/backend/pashub_fetcher/tests/test_core_files.py +++ b/backend/pashub_fetcher/tests/test_core_files.py @@ -1,53 +1,60 @@ -from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, file_type_for +from backend.pashub_fetcher.core_files import ( + CoreFiles, + get_core_file_type, + get_file_type_string, +) def test_file_type_for_photopack(): - assert file_type_for("Photopack_123456_V1.pdf") == "photo_pack" + assert get_file_type_string("Photopack_123456_V1.pdf") == "photo_pack" def test_file_type_for_sitenote(): - assert file_type_for("SiteNote_123456_V1.pdf") == "site_note" + assert get_file_type_string("SiteNote_123456_V1.pdf") == "site_note" def test_file_type_for_rdsap_sitenote(): assert ( - file_type_for("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note" + get_file_type_string("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") + == "rd_sap_site_note" ) def test_file_type_for_pas2023_ventilation(): assert ( - file_type_for("PAS 2023 
Ventilation Assessment Report_123456.pdf") + get_file_type_string("PAS 2023 Ventilation Assessment Report_123456.pdf") == "pas_2023_ventilation" ) def test_file_type_for_pas2023_condition(): assert ( - file_type_for("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition" + get_file_type_string("PAS 2023 Condition Report_123456.pdf") + == "pas_2023_condition" ) def test_file_type_for_pas_significance(): - assert file_type_for("PAS Significance_123456.pdf") == "pas_significance" + assert get_file_type_string("PAS Significance_123456.pdf") == "pas_significance" def test_file_type_for_par_photopack(): assert ( - file_type_for("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack" + get_file_type_string("PAR Photo Pack_95101890_V2_Assessment.pdf") + == "par_photo_pack" ) def test_file_type_for_pas2023_property(): assert ( - file_type_for("PAS 2023 Property Assessment Report_123456.pdf") + get_file_type_string("PAS 2023 Property Assessment Report_123456.pdf") == "pas_2023_property" ) def test_file_type_for_pas2023_occupancy(): assert ( - file_type_for("PAS 2023 Occupancy Assessment Report_123456.pdf") + get_file_type_string("PAS 2023 Occupancy Assessment Report_123456.pdf") == "pas_2023_occupancy" ) @@ -55,7 +62,7 @@ def test_file_type_for_pas2023_occupancy(): def test_file_type_for_improvement_option_evaluation(): # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf" assert ( - file_type_for("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") + get_file_type_string("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") == "improvement_option_evaluation" ) @@ -63,18 +70,20 @@ def test_file_type_for_improvement_option_evaluation(): def test_file_type_for_medium_term_improvement_plan(): # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf" assert ( - file_type_for("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") + get_file_type_string( + "60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf" + 
) == "medium_term_improvement_plan" ) def test_file_type_for_retrofit_design_doc(): assert ( - file_type_for("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") + get_file_type_string("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") == "retrofit_design_doc" ) assert ( - file_type_for("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") + get_file_type_string("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") == "retrofit_design_doc" ) @@ -89,7 +98,7 @@ def test_core_file_for_evidence_category_match_is_case_insensitive() -> None: filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" # Act - result = core_file_for(filename, evidence_category="Retrofit Design") + result = get_core_file_type(filename, evidence_category="Retrofit Design") # Assert assert result == CoreFiles.RETROFIT_DESIGN_DOC @@ -100,7 +109,7 @@ def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None: filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" # Act - result = core_file_for(filename, evidence_category="retrofit design") + result = get_core_file_type(filename, evidence_category="retrofit design") # Assert assert result == CoreFiles.RETROFIT_DESIGN_DOC @@ -111,7 +120,7 @@ def test_core_file_for_ioe_substring_returns_improvement_option_evaluation() -> filename = "6000802 - NG4 4HD - Improvement Option Evaluation.pdf" # Act - result = core_file_for(filename) + result = get_core_file_type(filename) # Assert assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION @@ -122,18 +131,20 @@ def test_core_file_for_mtip_substring_returns_medium_term_improvement_plan() -> filename = "60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf" # Act - result = core_file_for(filename) + result = get_core_file_type(filename) # Assert assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN -def test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_category() -> None: +def 
test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_category() -> ( + None +): # Arrange filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" # Act - result = core_file_for(filename) + result = get_core_file_type(filename) # Assert assert result == CoreFiles.RETROFIT_DESIGN_DOC @@ -144,7 +155,7 @@ def test_core_file_for_prefix_returns_photopack() -> None: filename = "Photopack_123456_V1.pdf" # Act - result = core_file_for(filename) + result = get_core_file_type(filename) # Assert assert result == CoreFiles.PHOTOPACK @@ -155,18 +166,20 @@ def test_core_file_for_unknown_filename_returns_none() -> None: filename = "unknown_document_123.pdf" # Act - result = core_file_for(filename) + result = get_core_file_type(filename) # Assert assert result is None -def test_core_file_for_osm_fallback_does_not_fire_when_evidence_category_present() -> None: +def test_core_file_for_osm_fallback_does_not_fire_when_evidence_category_present() -> ( + None +): # Arrange — OSM+DR-N-A filename but evidence_category is something other than retrofit design filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf" # Act - result = core_file_for(filename, evidence_category="some other category") + result = get_core_file_type(filename, evidence_category="some other category") # Assert assert result is None From 955db1c3eb8167bfbd1aa277624e2966eb16f6f8 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 10:58:38 +0000 Subject: [PATCH 42/91] additional typehint --- backend/pashub_fetcher/core_files.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py index 30aa2ba8..e63511eb 100644 --- a/backend/pashub_fetcher/core_files.py +++ b/backend/pashub_fetcher/core_files.py @@ -69,7 +69,9 @@ def get_core_file_type( def get_file_type_string(filename: str) -> Optional[str]: - core_file = get_core_file_type(filename) + core_file: Optional[CoreFiles] = 
get_core_file_type(filename) + if core_file is None: return None + return _CORE_FILE_TO_FILE_TYPE[core_file] From 03ae73f39adf9515d2d9010ab4a7df6d333652c3 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 13:37:08 +0000 Subject: [PATCH 43/91] trigger via sqs from local file --- .../trigger_pashub_sqs_from_file.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 backend/pashub_fetcher/trigger_pashub_sqs_from_file.py diff --git a/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py b/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py new file mode 100644 index 00000000..24a29781 --- /dev/null +++ b/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py @@ -0,0 +1,103 @@ +import json +import logging +import os +from typing import Any, Optional, cast + +import boto3 +from openpyxl import load_workbook + +from backend.app.config import get_settings +from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( + PashubToAraTriggerRequest, +) + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger: logging.Logger = logging.getLogger(__name__) + +DRY_RUN: bool = True + +EXCEL_PATH: str = os.path.join( + os.path.dirname(__file__), + "united-infrastructure-exports-all-deals-2026-05-14.xlsx", +) + + +def _build_requests(excel_path: str) -> list[PashubToAraTriggerRequest]: + wb = load_workbook(excel_path, data_only=True) + ws = wb.worksheets[0] + + headers: dict[str, int] = {} + for col in range(1, ws.max_column + 1): + header_val = ws.cell(row=1, column=col).value + if header_val is not None: + headers[str(header_val).strip()] = col + + pashub_col: int = headers["PasHub link"] + record_id_col: int = headers["Record ID"] + deal_name_col: int = headers["Deal Name"] + deal_stage_col: int = headers["Deal Stage"] + + requests: list[PashubToAraTriggerRequest] = [] + + for row in range(2, ws.max_row + 1): + pashub_link_raw = ws.cell(row=row, column=pashub_col).value + if not pashub_link_raw: + continue + + 
pashub_link: str = str(pashub_link_raw).strip() + + record_id_raw = ws.cell(row=row, column=record_id_col).value + deal_name_raw = ws.cell(row=row, column=deal_name_col).value + deal_stage_raw = ws.cell(row=row, column=deal_stage_col).value + + hubspot_deal_id: Optional[str] = ( + str(record_id_raw) if record_id_raw is not None else None + ) + address: Optional[str] = ( + str(deal_name_raw).strip() if deal_name_raw is not None else None + ) + deal_stage: Optional[str] = ( + str(deal_stage_raw).strip() if deal_stage_raw is not None else None + ) + + requests.append( + PashubToAraTriggerRequest( + pashub_link=pashub_link, + hubspot_deal_id=hubspot_deal_id, + address=address, + deal_stage=deal_stage, + ) + ) + + return requests + + +def main() -> None: + trigger_requests: list[PashubToAraTriggerRequest] = _build_requests(EXCEL_PATH) + + sqs: Any = cast(Any, boto3.client("sqs")) # type: ignore[reportUnknownMemberType] + queue_url: str = get_settings().PASHUB_TO_ARA_SQS_URL + + count: int = 0 + for request in trigger_requests: + action: str = "DRY RUN" if DRY_RUN else "SENDING" + logger.info( + f"[{action}] deal_id={request.hubspot_deal_id} pashub_link={request.pashub_link}" + ) + + if not DRY_RUN: + response: dict[str, Any] = sqs.send_message( + QueueUrl=queue_url, + MessageBody=json.dumps(request.model_dump()), + ) + message_id: str = response["MessageId"] + logger.info(f" MessageId: {message_id}") + + count += 1 + + label: str = "would send" if DRY_RUN else "sent" + print(f"{count} messages {label}") + + +if __name__ == "__main__": + main() From 0b358e6de66a04efefe19f83319f8854fdac52ae Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 13:37:14 +0000 Subject: [PATCH 44/91] =?UTF-8?q?pashub=5Fjob=5Fid=20extracts=20ID=20from?= =?UTF-8?q?=20/evidence/view=20links=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test_pashub_to_ara_trigger_request.py | 20 +++++++++++++++++++ 1 file changed, 
20 insertions(+) create mode 100644 backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py diff --git a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py new file mode 100644 index 00000000..b538fa7e --- /dev/null +++ b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py @@ -0,0 +1,20 @@ +import pytest + +from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( + PashubToAraTriggerRequest, +) + + +def make_request(pashub_link: str) -> PashubToAraTriggerRequest: + return PashubToAraTriggerRequest(pashub_link=pashub_link) + + +def test_pashub_job_id_extracts_id_from_evidence_view_link() -> None: + # Arrange + request = make_request("https://pashub.net/jobs/job-id-123/evidence/view") + + # Act + result = request.pashub_job_id + + # Assert + assert result == "job-id-123" From 567778991961189679c45f1ca0312c5ff089702e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 13:40:28 +0000 Subject: [PATCH 45/91] =?UTF-8?q?pashub=5Fjob=5Fid=20extracts=20ID=20from?= =?UTF-8?q?=20/evidence/view=20links=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pashub_to_ara_trigger_request.py | 8 ++++++-- .../test_pashub_to_ara_trigger_request.py | 20 +++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py index 518a8dc3..2e077c2e 100644 --- a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py +++ b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py @@ -1,10 +1,11 @@ +import re from typing import Optional from pydantic import BaseModel class PashubToAraTriggerRequest(BaseModel): pashub_link: ( - str # e.g. https://pashub.net/jobs/12345-abcd-1234-abcd-12345abcde/details + str # e.g. 
https://pashub.net/jobs/{id}/details, /jobs/{id}/evidence/view, /jobs/{id} ) address: Optional[str] = None @@ -17,4 +18,7 @@ class PashubToAraTriggerRequest(BaseModel): @property def pashub_job_id(self) -> str: - return self.pashub_link.split("/")[-2] + match = re.search(r"/jobs/([^/]+)", self.pashub_link) + if not match: + raise ValueError(f"No job ID found in PasHub link: {self.pashub_link}") + return match.group(1) diff --git a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py index b538fa7e..6eec1e14 100644 --- a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py +++ b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py @@ -9,6 +9,26 @@ def make_request(pashub_link: str) -> PashubToAraTriggerRequest: return PashubToAraTriggerRequest(pashub_link=pashub_link) +def test_pashub_job_id_raises_for_invalid_link() -> None: + # Arrange + request = make_request("https://pashub.net/rcs-dashboard") + + # Act / Assert + with pytest.raises(ValueError): + request.pashub_job_id + + +def test_pashub_job_id_extracts_id_from_bare_job_link() -> None: + # Arrange + request = make_request("https://pashub.net/jobs/job-id-123") + + # Act + result = request.pashub_job_id + + # Assert + assert result == "job-id-123" + + def test_pashub_job_id_extracts_id_from_evidence_view_link() -> None: # Arrange request = make_request("https://pashub.net/jobs/job-id-123/evidence/view") From ecd2676c5e9bc909f642855345c13e02ba52d4fc Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 14 May 2026 13:42:38 +0000 Subject: [PATCH 46/91] =?UTF-8?q?pashub=5Fjob=5Fid=20extracts=20job=20ID?= =?UTF-8?q?=20from=20all=20valid=20PasHub=20link=20shapes=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_to_ara_trigger_request.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py index 6eec1e14..56187350 100644 --- a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py +++ b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py @@ -9,6 +9,17 @@ def make_request(pashub_link: str) -> PashubToAraTriggerRequest: return PashubToAraTriggerRequest(pashub_link=pashub_link) +def test_pashub_job_id_extracts_id_from_details_link() -> None: + # Arrange + request = make_request("https://pashub.net/jobs/job-id-123/details") + + # Act + result = request.pashub_job_id + + # Assert + assert result == "job-id-123" + + def test_pashub_job_id_raises_for_invalid_link() -> None: # Arrange request = make_request("https://pashub.net/rcs-dashboard") From 572fcc1406d93ed7b0a8c32e1ab53b99183fd6e2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 14 May 2026 16:38:22 +0000 Subject: [PATCH 47/91] smoke tests --- .github/workflows/_smoke_test_lambda.yml | 63 +++++++++++++ .github/workflows/lambda_smoke_tests.yml | 107 +++++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 .github/workflows/_smoke_test_lambda.yml create mode 100644 .github/workflows/lambda_smoke_tests.yml diff --git a/.github/workflows/_smoke_test_lambda.yml b/.github/workflows/_smoke_test_lambda.yml new file mode 100644 index 00000000..63ec0af4 --- /dev/null +++ b/.github/workflows/_smoke_test_lambda.yml @@ -0,0 +1,63 @@ +name: Lambda smoke test + +on: + workflow_call: + inputs: + dockerfile_path: + required: true + type: string + build_context: + required: false + default: "." 
+ type: string + service_name: + required: true + type: string + +jobs: + smoke-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build Lambda image + run: | + docker build \ + --platform linux/amd64 \ + -f ${{ inputs.dockerfile_path }} \ + -t ${{ inputs.service_name }}-smoke-test:latest \ + ${{ inputs.build_context }} + + - name: Start Lambda container + run: | + docker run -d --name ${{ inputs.service_name }}-smoke-test \ + -p 9000:8080 \ + ${{ inputs.service_name }}-smoke-test:latest + + - name: Invoke Lambda and check for import errors + run: | + sleep 2 + response=$(curl -s -X POST \ + http://localhost:9000/2015-03-31/functions/function/invocations \ + -H "Content-Type: application/json" \ + -d '{"Records":[{"body":"{}"}]}') + + echo "Response: $response" + + if [ -z "$response" ]; then + echo "No response from Lambda RIE" + exit 1 + fi + + if echo "$response" | grep -qE 'ImportModuleError|ModuleNotFoundError|ImportError'; then + echo "Import error detected in handler" + exit 1 + fi + + - name: Dump container logs + if: always() + run: docker logs ${{ inputs.service_name }}-smoke-test + + - name: Tear down container + if: always() + run: docker rm -f ${{ inputs.service_name }}-smoke-test diff --git a/.github/workflows/lambda_smoke_tests.yml b/.github/workflows/lambda_smoke_tests.yml new file mode 100644 index 00000000..5ff5420a --- /dev/null +++ b/.github/workflows/lambda_smoke_tests.yml @@ -0,0 +1,107 @@ +name: Lambda Smoke Tests + +on: + pull_request: + branches: + - main + +jobs: + # ============================================================ + # Ara Engine + # ============================================================ + ara_engine_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/docker/engine.Dockerfile + build_context: . 
+ service_name: ara-engine + + # ============================================================ + # Address 2 UPRN + # ============================================================ + address2uprn_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/address2UPRN/handler/Dockerfile + build_context: . + service_name: address2uprn + + # ============================================================ + # Postcode Splitter + # ============================================================ + postcode_splitter_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/postcode_splitter/handler/Dockerfile + build_context: . + service_name: postcode-splitter + + # ============================================================ + # Bulk Address2UPRN Combiner + # ============================================================ + bulk_address2uprn_combiner_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/bulk_address2uprn_combiner/handler/Dockerfile + build_context: . + service_name: bulk-address2uprn-combiner + + # ============================================================ + # Condition ETL + # ============================================================ + condition_etl_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/condition/handler/Dockerfile + build_context: . + service_name: condition-etl + + # ============================================================ + # Categorisation + # ============================================================ + categorisation_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/categorisation/handler/Dockerfile + build_context: . 
+ service_name: categorisation + + # ============================================================ + # Ordnance Survey + # ============================================================ + ordnance_survey_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/ordnanceSurvey/handler/Dockerfile + build_context: . + service_name: ordnance-survey + + # ============================================================ + # Pas Hub Fetcher + # ============================================================ + pashub_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/pashub_fetcher/handler/Dockerfile + build_context: . + service_name: pashub + + # ============================================================ + # MagicPlan + # ============================================================ + magic_plan_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: backend/magic_plan/handler/Dockerfile + build_context: . + service_name: magic-plan + + # ============================================================ + # HubSpot Scraper + # ============================================================ + hubspot_scraper_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: etl/hubspot/scripts/scraper/handler/Dockerfile + build_context: . 
+ service_name: hubspot-scraper From 16e60001800fca1db834d90adf843fdd15b419ce Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 14 May 2026 16:44:18 +0000 Subject: [PATCH 48/91] smoke tests --- .github/workflows/_smoke_test_lambda.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_smoke_test_lambda.yml b/.github/workflows/_smoke_test_lambda.yml index 63ec0af4..9b564f73 100644 --- a/.github/workflows/_smoke_test_lambda.yml +++ b/.github/workflows/_smoke_test_lambda.yml @@ -36,8 +36,8 @@ jobs: - name: Invoke Lambda and check for import errors run: | - sleep 2 - response=$(curl -s -X POST \ + response=$(curl -s --retry-connrefused --retry 15 --retry-delay 1 \ + -X POST \ http://localhost:9000/2015-03-31/functions/function/invocations \ -H "Content-Type: application/json" \ -d '{"Records":[{"body":"{}"}]}') From 0c3a31ed81a094d0907b321ab2d7ff3ad061e523 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 14 May 2026 16:49:45 +0000 Subject: [PATCH 49/91] smoke tests --- .github/workflows/_smoke_test_lambda.yml | 28 +++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_smoke_test_lambda.yml b/.github/workflows/_smoke_test_lambda.yml index 9b564f73..3fcf0de4 100644 --- a/.github/workflows/_smoke_test_lambda.yml +++ b/.github/workflows/_smoke_test_lambda.yml @@ -20,6 +20,13 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Download AWS Lambda RIE + run: | + mkdir -p ~/.aws-lambda-rie + curl -fsSL -o ~/.aws-lambda-rie/aws-lambda-rie \ + https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie + chmod +x ~/.aws-lambda-rie/aws-lambda-rie + - name: Build Lambda image run: | docker build \ @@ -30,9 +37,24 @@ jobs: - name: Start Lambda container run: | - docker run -d --name ${{ inputs.service_name }}-smoke-test \ - -p 9000:8080 \ - ${{ inputs.service_name }}-smoke-test:latest + IMG=${{ inputs.service_name }}-smoke-test:latest + 
ENTRY=$(docker inspect --format='{{range .Config.Entrypoint}}{{.}} {{end}}' "$IMG") + CMD_ARGS=$(docker inspect --format='{{range .Config.Cmd}}{{.}} {{end}}' "$IMG") + + if echo "$ENTRY" | grep -q "lambda-entrypoint.sh"; then + # AWS base image — RIE is bundled + docker run -d --name ${{ inputs.service_name }}-smoke-test \ + -p 9000:8080 \ + "$IMG" + else + # Custom base — mount RIE from runner and re-wire entrypoint + docker run -d --name ${{ inputs.service_name }}-smoke-test \ + -v "$HOME/.aws-lambda-rie:/aws-lambda-rie" \ + -p 9000:8080 \ + --entrypoint /aws-lambda-rie/aws-lambda-rie \ + "$IMG" \ + $ENTRY $CMD_ARGS + fi - name: Invoke Lambda and check for import errors run: | From 6c8080ef6203694db127edb9aa9b7824bbc76898 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 14 May 2026 16:57:31 +0000 Subject: [PATCH 50/91] smoke tests --- backend/condition/handler/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/condition/handler/Dockerfile b/backend/condition/handler/Dockerfile index 71556895..fa130573 100644 --- a/backend/condition/handler/Dockerfile +++ b/backend/condition/handler/Dockerfile @@ -32,6 +32,7 @@ COPY utils/ utils/ COPY backend/condition/ backend/condition/ COPY backend/app/db/models/condition.py backend/app/db/models/condition.py +COPY backend/app/db/base.py backend/app/db/base.py COPY backend/app/db/connection.py backend/app/db/connection.py COPY backend/app/config.py backend/app/config.py From eeb2f9eb20a4c65b639da09ed818c27ebe2501fd Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 15 May 2026 10:58:42 +0000 Subject: [PATCH 51/91] tweaks before PR --- backend/pashub_fetcher/pashub_client.py | 7 +++- .../pashub_to_ara_trigger_request.py | 4 +-- .../trigger_pashub_sqs_from_file.py | 36 ++++++++++++++++++- 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index 7896664d..27342c25 100644 --- 
a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -74,6 +74,10 @@ class PashubClient: logger.info(f"Getting UPRN for job ID {job_id}") url = f"{self.base}/jobs/{job_id}" + logger.debug( + f"About to make API request with session headers: { self.session.headers}" + ) + r = self.session.get(url) if r.status_code == 401: raise UnauthorizedError("Token expired or invalid") @@ -82,7 +86,8 @@ class PashubClient: try: return r.json()["uprn"] - except Exception: + except Exception as e: + logger.warning(f"Failed to get UPRN for Job ID {job_id}", e) return None def _select_latest_core_files( diff --git a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py index 2e077c2e..715a09f8 100644 --- a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py +++ b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py @@ -4,9 +4,7 @@ from pydantic import BaseModel class PashubToAraTriggerRequest(BaseModel): - pashub_link: ( - str # e.g. https://pashub.net/jobs/{id}/details, /jobs/{id}/evidence/view, /jobs/{id} - ) + pashub_link: str # e.g. 
https://pashub.net/jobs/{id}/details, /jobs/{id}/evidence/view, /jobs/{id} address: Optional[str] = None sharepoint_link: Optional[str] = None diff --git a/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py b/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py index 24a29781..f4c03afc 100644 --- a/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py +++ b/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py @@ -14,7 +14,36 @@ from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( logging.basicConfig(level=logging.INFO, format="%(message)s") logger: logging.Logger = logging.getLogger(__name__) -DRY_RUN: bool = True +DRY_RUN: bool = False + +DEAL_ID_FILTER: frozenset[str] = frozenset( + { + "379452094688", + "379466504437", + "379660170452", + "380016925932", + "379848065216", + "379466504434", + "379452094690", + "379965924567", + "380016925923", + "379792072898", + "379654754502", + "379560262861", + "379969670369", + "379248717001", + "379971468493", + "379999888607", + "379606372580", + "379969603797", + "379967743213", + "379263155434", + "379855267025", + "379889899719", + "379071064307", + "379867925741", + } +) EXCEL_PATH: str = os.path.join( os.path.dirname(__file__), @@ -75,6 +104,11 @@ def _build_requests(excel_path: str) -> list[PashubToAraTriggerRequest]: def main() -> None: trigger_requests: list[PashubToAraTriggerRequest] = _build_requests(EXCEL_PATH) + if DEAL_ID_FILTER: + trigger_requests = [ + r for r in trigger_requests if r.hubspot_deal_id in DEAL_ID_FILTER + ] + sqs: Any = cast(Any, boto3.client("sqs")) # type: ignore[reportUnknownMemberType] queue_url: str = get_settings().PASHUB_TO_ARA_SQS_URL From ad49bf9d85c95e480f9949f33eff3693d601e668 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 15 May 2026 11:00:58 +0000 Subject: [PATCH 52/91] tweak logs --- backend/pashub_fetcher/pashub_client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/pashub_fetcher/pashub_client.py 
b/backend/pashub_fetcher/pashub_client.py index 27342c25..79d81838 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -75,7 +75,7 @@ class PashubClient: url = f"{self.base}/jobs/{job_id}" logger.debug( - f"About to make API request with session headers: { self.session.headers}" + f"About to make API request with session headers: {self.session.headers}" ) r = self.session.get(url) @@ -87,7 +87,9 @@ class PashubClient: try: return r.json()["uprn"] except Exception as e: - logger.warning(f"Failed to get UPRN for Job ID {job_id}", e) + logger.warning( + f"Failed to get UPRN for Job ID {job_id} with exception: {e}" + ) return None def _select_latest_core_files( From 6afd07600598a0a92883319345e4918aecd46cc1 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 15 May 2026 11:28:04 +0000 Subject: [PATCH 53/91] added 5 second rest every 100 tests --- backend/address2UPRN/tests/test_csv.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/backend/address2UPRN/tests/test_csv.py b/backend/address2UPRN/tests/test_csv.py index 73d94388..5c97e691 100644 --- a/backend/address2UPRN/tests/test_csv.py +++ b/backend/address2UPRN/tests/test_csv.py @@ -12,12 +12,21 @@ FIXTURE_PATH = Path(__file__).parent / "test_data.csv" # Each parametrized case fires at least one EPC request; without throttling, # GitHub-hosted runners burst fast enough to hit 429s. 
EPC_THROTTLE_SECONDS = 1.0 +EPC_LONG_PAUSE_EVERY = 100 +EPC_LONG_PAUSE_SECONDS = 5.0 + +_epc_request_count = 0 @pytest.fixture(autouse=True) def _throttle_epc_requests(): + global _epc_request_count yield - time.sleep(EPC_THROTTLE_SECONDS) + _epc_request_count += 1 + if _epc_request_count % EPC_LONG_PAUSE_EVERY == 0: + time.sleep(EPC_LONG_PAUSE_SECONDS) + else: + time.sleep(EPC_THROTTLE_SECONDS) def load_test_cases(): From fce1e1008ab166f0637f926ffef0bbbf1d8a18f8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 15 May 2026 16:00:02 +0000 Subject: [PATCH 54/91] added more test cases --- backend/address2UPRN/tests/test_data.csv | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/address2UPRN/tests/test_data.csv b/backend/address2UPRN/tests/test_data.csv index 408edc29..1c1ce58a 100644 --- a/backend/address2UPRN/tests/test_data.csv +++ b/backend/address2UPRN/tests/test_data.csv @@ -364,4 +364,7 @@ FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974 164a Victoria Square,M4 5FA,77211315 165a Victoria Square,M4 5FA,77211316 166a Victoria Square,M4 5FA,None -"FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY",CR2 7DL,None \ No newline at end of file +"FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY",CR2 7DL,None +71A Stoneleigh Avenue,NE12 8NP,None +71B Stoneleigh Avenue,NE12 8NP,None +71 Stoneleigh Avenue,NE12 8NP,47086009 \ No newline at end of file From a99972457864962e0a4be8b0f35ab5fb33eebeaa Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 09:05:54 +0000 Subject: [PATCH 55/91] =?UTF-8?q?PAS=20falls=20back=20to=20coordination=20?= =?UTF-8?q?client=20when=20UPRN=20lookup=20returns=20401=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_service.py | 7 ++-- .../tests/test_pashub_service.py | 36 +++++++++++++++++-- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/backend/pashub_fetcher/pashub_service.py 
b/backend/pashub_fetcher/pashub_service.py index b3302fd9..2b8f0926 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -1,6 +1,6 @@ import os from datetime import datetime, timezone -from typing import List, NamedTuple, Optional, cast +from typing import Callable, List, NamedTuple, Optional, cast from backend.app.db.connection import db_session from backend.app.db.models.uploaded_file import ( @@ -11,7 +11,7 @@ from backend.app.db.models.uploaded_file import ( from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.parser import parse_site_notes_pdf from backend.pashub_fetcher.core_files import get_file_type_string -from backend.pashub_fetcher.pashub_client import PashubClient +from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( PashubToAraTriggerRequest, ) @@ -36,10 +36,13 @@ class PashubService: pashub_client: PashubClient, sharepoint_client: DomnaSharepointClient, s3_bucket: str, + coordination_client_factory: Optional[Callable[[], PashubClient]] = None, ) -> None: self._pashub_client = pashub_client self._sharepoint_client = sharepoint_client self._s3_bucket = s3_bucket + self._coordination_client_factory = coordination_client_factory + self._coordination_client: Optional[PashubClient] = None def run(self, request: PashubToAraTriggerRequest) -> List[str]: job_id = request.pashub_job_id diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py index 2aff416b..44c6af1a 100644 --- a/backend/pashub_fetcher/tests/test_pashub_service.py +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -1,8 +1,8 @@ -from typing import Optional +from typing import Callable, Optional from unittest.mock import MagicMock, call, patch -from backend.pashub_fetcher.pashub_client import PashubClient +from 
backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.pashub_service import PashubService from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( PashubToAraTriggerRequest, @@ -31,11 +31,13 @@ def make_service( pashub_client: Optional[PashubClient] = None, sharepoint_client: Optional[DomnaSharepointClient] = None, s3_bucket: str = "test-bucket", + coordination_client_factory: Optional[Callable[[], PashubClient]] = None, ) -> PashubService: return PashubService( pashub_client=pashub_client or MagicMock(spec=PashubClient), sharepoint_client=sharepoint_client or MagicMock(spec=DomnaSharepointClient), s3_bucket=s3_bucket, + coordination_client_factory=coordination_client_factory, ) @@ -225,6 +227,36 @@ def test_run_parses_and_saves_site_notes_for_rd_sap_site_note_file() -> None: # --------------------------------------------------------------------------- +# --------------------------------------------------------------------------- +# run(): coordination fallback +# --------------------------------------------------------------------------- + + +def test_run_uses_coordination_client_when_pas_401_on_uprn_lookup() -> None: + pas_client = MagicMock(spec=PashubClient) + pas_client.get_uprn_by_job_id.side_effect = UnauthorizedError() + + coord_client = MagicMock(spec=PashubClient) + coord_client.get_uprn_by_job_id.return_value = "99999" + coord_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"] + + factory = MagicMock(return_value=coord_client) + + service = make_service(pashub_client=pas_client, coordination_client_factory=factory) + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"), + patch("backend.pashub_fetcher.pashub_service.db_session"), + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + result = service.run(make_request()) + + assert result == ["/tmp/a.pdf"] + coord_client.get_uprn_by_job_id.assert_called_once() + 
coord_client.get_core_evidence_files_by_job_id.assert_called_once() + assert factory.call_count == 1 + + def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None: mock_client = MagicMock(spec=PashubClient) mock_client.get_uprn_by_job_id.return_value = None From e0446381925964872e91607bbc5135c60177d969 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 09:06:46 +0000 Subject: [PATCH 56/91] =?UTF-8?q?PAS=20falls=20back=20to=20coordination=20?= =?UTF-8?q?client=20when=20UPRN=20lookup=20returns=20401=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_service.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index 2b8f0926..0a5fb535 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -44,12 +44,26 @@ class PashubService: self._coordination_client_factory = coordination_client_factory self._coordination_client: Optional[PashubClient] = None + def _get_coordination_client(self) -> PashubClient: + if self._coordination_client_factory is None: + raise UnauthorizedError("No coordination client factory configured") + if self._coordination_client is None: + self._coordination_client = self._coordination_client_factory() + return self._coordination_client + def run(self, request: PashubToAraTriggerRequest) -> List[str]: job_id = request.pashub_job_id + active_client = self._pashub_client + + if request.uprn: + uprn: Optional[str] = request.uprn + else: + try: + uprn = active_client.get_uprn_by_job_id(job_id) + except UnauthorizedError: + active_client = self._get_coordination_client() + uprn = active_client.get_uprn_by_job_id(job_id) - uprn: Optional[str] = request.uprn or self._pashub_client.get_uprn_by_job_id( - job_id - ) hubspot_deal_id: Optional[str] = request.hubspot_deal_id if 
uprn: @@ -57,7 +71,7 @@ class PashubService: else: logger.info(f"No UPRN found for job {job_id}") - job_files: List[str] = self._pashub_client.get_core_evidence_files_by_job_id( + job_files: List[str] = active_client.get_core_evidence_files_by_job_id( job_id ) From d49bd3620e2040af82ab737bf7bac3f58daf134c Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 09:08:47 +0000 Subject: [PATCH 57/91] =?UTF-8?q?PAS=20falls=20back=20to=20coordination=20?= =?UTF-8?q?client=20when=20file=20listing=20returns=20401=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_service.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py index 44c6af1a..dd8ad0a8 100644 --- a/backend/pashub_fetcher/tests/test_pashub_service.py +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -257,6 +257,29 @@ def test_run_uses_coordination_client_when_pas_401_on_uprn_lookup() -> None: assert factory.call_count == 1 +def test_run_uses_coordination_client_when_pas_401_on_file_listing() -> None: + pas_client = MagicMock(spec=PashubClient) + pas_client.get_core_evidence_files_by_job_id.side_effect = UnauthorizedError() + + coord_client = MagicMock(spec=PashubClient) + coord_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"] + + factory = MagicMock(return_value=coord_client) + + service = make_service(pashub_client=pas_client, coordination_client_factory=factory) + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"), + patch("backend.pashub_fetcher.pashub_service.db_session"), + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + result = service.run(make_request(uprn="12345")) + + assert result == ["/tmp/a.pdf"] + coord_client.get_core_evidence_files_by_job_id.assert_called_once() + 
pas_client.get_uprn_by_job_id.assert_not_called() + + def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None: mock_client = MagicMock(spec=PashubClient) mock_client.get_uprn_by_job_id.return_value = None From 0c1ecabf2f88ed0d2a519fc1e3b474ceb0b5a6f7 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 09:09:18 +0000 Subject: [PATCH 58/91] =?UTF-8?q?PAS=20falls=20back=20to=20coordination=20?= =?UTF-8?q?client=20when=20file=20listing=20returns=20401=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_service.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index 0a5fb535..b33b9dcf 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -71,9 +71,15 @@ class PashubService: else: logger.info(f"No UPRN found for job {job_id}") - job_files: List[str] = active_client.get_core_evidence_files_by_job_id( - job_id - ) + try: + job_files: List[str] = active_client.get_core_evidence_files_by_job_id( + job_id + ) + except UnauthorizedError: + if active_client is not self._pashub_client: + raise + active_client = self._get_coordination_client() + job_files = active_client.get_core_evidence_files_by_job_id(job_id) if uprn or hubspot_deal_id: logger.info("Uploading files to s3") From 5a29866245fefae3ac5b4aee6ddba1d09ce7eb1d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 09:12:19 +0000 Subject: [PATCH 59/91] =?UTF-8?q?PAS=20raises=20UnauthorizedError=20when?= =?UTF-8?q?=20401=20received=20with=20no=20coordination=20factory=20config?= =?UTF-8?q?ured=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_pashub_service.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py index dd8ad0a8..ff4a8977 100644 --- a/backend/pashub_fetcher/tests/test_pashub_service.py +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -1,3 +1,4 @@ +import pytest from typing import Callable, Optional from unittest.mock import MagicMock, call, patch @@ -280,6 +281,16 @@ def test_run_uses_coordination_client_when_pas_401_on_file_listing() -> None: pas_client.get_uprn_by_job_id.assert_not_called() +def test_run_raises_unauthorized_when_pas_401_and_no_factory() -> None: + pas_client = MagicMock(spec=PashubClient) + pas_client.get_uprn_by_job_id.side_effect = UnauthorizedError() + + service = make_service(pashub_client=pas_client) + + with pytest.raises(UnauthorizedError): + service.run(make_request()) + + def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None: mock_client = MagicMock(spec=PashubClient) mock_client.get_uprn_by_job_id.return_value = None From dcff529219103ed2bfb0faf5a58e0be814683d8d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 09:13:51 +0000 Subject: [PATCH 60/91] =?UTF-8?q?UnauthorizedError=20propagates=20when=20b?= =?UTF-8?q?oth=20PAS=20and=20coordination=20clients=20return=20401=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pashub_fetcher/tests/test_pashub_service.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py index ff4a8977..991d2a46 100644 --- a/backend/pashub_fetcher/tests/test_pashub_service.py +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -291,6 +291,21 @@ def test_run_raises_unauthorized_when_pas_401_and_no_factory() -> None: service.run(make_request()) +def test_run_raises_unauthorized_when_both_clients_401() -> None: + pas_client = MagicMock(spec=PashubClient) 
+ pas_client.get_uprn_by_job_id.side_effect = UnauthorizedError() + + coord_client = MagicMock(spec=PashubClient) + coord_client.get_uprn_by_job_id.side_effect = UnauthorizedError() + + factory = MagicMock(return_value=coord_client) + + service = make_service(pashub_client=pas_client, coordination_client_factory=factory) + + with pytest.raises(UnauthorizedError): + service.run(make_request()) + + def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None: mock_client = MagicMock(spec=PashubClient) mock_client.get_uprn_by_job_id.return_value = None From 4cd59768c38e2f2a5ae90cb6bde000c40b6646d3 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 09:22:32 +0000 Subject: [PATCH 61/91] =?UTF-8?q?Wire=20coordination=20account=20fallback?= =?UTF-8?q?=20into=20config=20and=20handler,=20remove=20token-refresh=20re?= =?UTF-8?q?try=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/config.py | 2 ++ backend/pashub_fetcher/handler/handler.py | 43 +++++++++++++---------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/backend/app/config.py b/backend/app/config.py index bdfc9ace..fcfb6d5b 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -86,6 +86,8 @@ class Settings(BaseSettings): # Pas Hub PASHUB_EMAIL: Optional[str] = None PASHUB_PASSWORD: Optional[str] = None + PASHUB_COORDINATION_EMAIL: Optional[str] = None + PASHUB_COORDINATION_PASSWORD: Optional[str] = None # Optional AWS creds (only required in local) AWS_ACCESS_KEY_ID: Optional[str] = None diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index cd0c8113..626ce59d 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,9 +1,11 @@ -from typing import Any, Dict, List +from typing import Any, Callable, Dict, List, Optional from backend.app.config import get_settings -from 
backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError +from backend.pashub_fetcher.pashub_client import PashubClient from backend.pashub_fetcher.pashub_service import PashubService -from backend.pashub_fetcher.pashub_to_ara_trigger_request import PashubToAraTriggerRequest +from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( + PashubToAraTriggerRequest, +) from backend.pashub_fetcher.token_getter import get_token_from_local_storage from backend.app.db.models.tasks import SourceEnum from backend.utils.subtasks import task_handler @@ -28,38 +30,41 @@ def handler(body: Dict[str, Any], context: Any) -> List[str]: settings = get_settings() - pas_hub_email = settings.PASHUB_EMAIL - pas_hub_password = settings.PASHUB_PASSWORD + pashub_email = settings.PASHUB_EMAIL + pashub_password = settings.PASHUB_PASSWORD - if (not pas_hub_email) or (not pas_hub_password): + coordination_hub_email = settings.PASHUB_COORDINATION_EMAIL + coordination_hub_password = settings.PASHUB_COORDINATION_PASSWORD + coordination_client_factory: Optional[Callable[[], PashubClient]] = None + + if (not pashub_email) or (not pashub_password): raise ValueError("Pas Hub credentials not provided") sharepoint_client = DomnaSharepointClient( sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3 ) + if coordination_hub_email and coordination_hub_password: + _coord_email, _coord_password = ( + coordination_hub_email, + coordination_hub_password, + ) + coordination_client_factory = lambda: get_pashub_client( + _coord_email, _coord_password + ) + logger.debug("Validating request body") payload = PashubToAraTriggerRequest.model_validate(body) logger.debug("Successfully validated request body") service = PashubService( - pashub_client=get_pashub_client(pas_hub_email, pas_hub_password), + pashub_client=get_pashub_client(pashub_email, pashub_password), sharepoint_client=sharepoint_client, s3_bucket=S3_BUCKET, + coordination_client_factory=coordination_client_factory, ) - try: - 
files: List[str] = service.run(payload) - except UnauthorizedError: - logger.warning("Token expired - refreshing") - - service = PashubService( - pashub_client=get_pashub_client(pas_hub_email, pas_hub_password), - sharepoint_client=sharepoint_client, - s3_bucket=S3_BUCKET, - ) - - files = service.run(payload) + files: List[str] = service.run(payload) logger.info(f"Saved {len(files)} files") From 3a7a00051d159d7672c29357e664ebc9a2f165a2 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 09:34:34 +0000 Subject: [PATCH 62/91] add new variables to deployment pipeline --- .github/workflows/_deploy_lambda.yml | 8 ++++++++ .github/workflows/deploy_terraform.yml | 2 ++ .../terraform/lambda/pashub_to_ara/main.tf | 2 ++ .../terraform/lambda/pashub_to_ara/variables.tf | 12 ++++++++++++ 4 files changed, 24 insertions(+) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 1cc7d462..0d702155 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -80,6 +80,10 @@ on: required: false TF_VAR_pashub_password: required: false + TF_VAR_pashub_coordination_email: + required: false + TF_VAR_pashub_coordination_password: + required: false TF_VAR_hubspot_api_key: required: false @@ -154,6 +158,8 @@ jobs: TF_VAR_social_housing_wave_3_sharepoint_id: ${{ secrets.TF_VAR_social_housing_wave_3_sharepoint_id }} TF_VAR_pashub_email: ${{ secrets.TF_VAR_pashub_email }} TF_VAR_pashub_password: ${{ secrets.TF_VAR_pashub_password }} + TF_VAR_pashub_coordination_email: ${{ secrets.TF_VAR_pashub_coordination_email }} + TF_VAR_pashub_coordination_password: ${{ secrets.TF_VAR_pashub_coordination_password }} TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }} TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }} TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }} @@ -202,6 +208,8 @@ jobs: TF_VAR_social_housing_wave_3_sharepoint_id: ${{ 
secrets.TF_VAR_social_housing_wave_3_sharepoint_id }} TF_VAR_pashub_email: ${{ secrets.TF_VAR_pashub_email }} TF_VAR_pashub_password: ${{ secrets.TF_VAR_pashub_password }} + TF_VAR_pashub_coordination_email: ${{ secrets.TF_VAR_pashub_coordination_email }} + TF_VAR_pashub_coordination_password: ${{ secrets.TF_VAR_pashub_coordination_password }} TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }} TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }} TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }} diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index e0343974..bd014e3d 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -407,6 +407,8 @@ jobs: TF_VAR_social_housing_wave_3_sharepoint_id: ${{ secrets.SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID }} TF_VAR_pashub_email: ${{ secrets.PASHUB_EMAIL }} TF_VAR_pashub_password: ${{ secrets.PASHUB_PASSWORD }} + TF_VAR_pashub_coordination_email: ${{ secrets.PASHUB_COORDINATION_EMAIL }} + TF_VAR_pashub_coordination_password: ${{ secrets.PASHUB_COORDINATION_PASSWORD }} # ============================================================ diff --git a/infrastructure/terraform/lambda/pashub_to_ara/main.tf b/infrastructure/terraform/lambda/pashub_to_ara/main.tf index 902d7845..eba9c874 100644 --- a/infrastructure/terraform/lambda/pashub_to_ara/main.tf +++ b/infrastructure/terraform/lambda/pashub_to_ara/main.tf @@ -49,6 +49,8 @@ module "lambda" { SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID = var.social_housing_wave_3_sharepoint_id PASHUB_EMAIL = var.pashub_email PASHUB_PASSWORD = var.pashub_password + PASHUB_COORDINATION_EMAIL = var.pashub_coordination_email + PASHUB_COORDINATION_PASSWORD = var.pashub_coordination_password } } diff --git a/infrastructure/terraform/lambda/pashub_to_ara/variables.tf b/infrastructure/terraform/lambda/pashub_to_ara/variables.tf index 0e99d378..cdeff256 100644 --- 
a/infrastructure/terraform/lambda/pashub_to_ara/variables.tf +++ b/infrastructure/terraform/lambda/pashub_to_ara/variables.tf @@ -100,4 +100,16 @@ variable "pashub_email" { variable "pashub_password" { type = string sensitive = true +} + +variable "pashub_coordination_email" { + type = string + sensitive = true + default = null +} + +variable "pashub_coordination_password" { + type = string + sensitive = true + default = null } \ No newline at end of file From 770493ff9ec751073a3d3b798e51021252e2f10f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 18 May 2026 11:51:48 +0000 Subject: [PATCH 63/91] add logging --- backend/pashub_fetcher/pashub_service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index b33b9dcf..13498a32 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -60,7 +60,9 @@ class PashubService: else: try: uprn = active_client.get_uprn_by_job_id(job_id) + logger.info(f"Failed to access job {job_id} with PasHub credentials") except UnauthorizedError: + logger.info(f"Trying CoordinationHub credentials for job {job_id}") active_client = self._get_coordination_client() uprn = active_client.get_uprn_by_job_id(job_id) From dc3543ac5f655c7f8ec9a76dad12cf014bf94621 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 19 May 2026 11:07:41 +0000 Subject: [PATCH 64/91] =?UTF-8?q?Coordination=20Hub=20fallback=20stores=20?= =?UTF-8?q?correct=20file=5Fsource=20in=20DB=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/db/models/uploaded_file.py | 1 + .../tests/test_pashub_service.py | 29 ++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py index f3cfee79..b6a73d5d 100644 --- a/backend/app/db/models/uploaded_file.py +++ 
b/backend/app/db/models/uploaded_file.py @@ -25,6 +25,7 @@ class FileTypeEnum(enum.Enum): class FileSourceEnum(enum.Enum): PAS_HUB = "pas hub" + COORDINATION_HUB = "coordination_hub" SHAREPOINT = "sharepoint" HUBSPOT = "hubspot" ECMK = "ecmk" diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py index 991d2a46..1d6d167f 100644 --- a/backend/pashub_fetcher/tests/test_pashub_service.py +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -1,8 +1,9 @@ import pytest -from typing import Callable, Optional +from typing import Any, Callable, Optional from unittest.mock import MagicMock, call, patch +from backend.app.db.models.uploaded_file import FileSourceEnum from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.pashub_service import PashubService from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( @@ -306,6 +307,32 @@ def test_run_raises_unauthorized_when_both_clients_401() -> None: service.run(make_request()) +def test_run_persists_coordination_hub_file_source_when_pas_401_on_uprn_lookup() -> None: + pas_client = MagicMock(spec=PashubClient) + pas_client.get_uprn_by_job_id.side_effect = UnauthorizedError() + + coord_client = MagicMock(spec=PashubClient) + coord_client.get_uprn_by_job_id.return_value = "99999" + coord_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"] + + factory = MagicMock(return_value=coord_client) + fake_session = MagicMock() + + service = make_service(pashub_client=pas_client, coordination_client_factory=factory) + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"), + patch("backend.pashub_fetcher.pashub_service.db_session") as mock_db, + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + mock_db.return_value.__enter__.return_value = fake_session + service.run(make_request()) + + fake_session.add_all.assert_called_once() + added: list[Any] = 
fake_session.add_all.call_args[0][0] + assert added[0].file_source == FileSourceEnum.COORDINATION_HUB.value + + def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None: mock_client = MagicMock(spec=PashubClient) mock_client.get_uprn_by_job_id.return_value = None From 1e115ba3dee7a63f9b2ebe4e56fb2f4a22da03f7 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 19 May 2026 11:09:01 +0000 Subject: [PATCH 65/91] =?UTF-8?q?Coordination=20Hub=20fallback=20stores=20?= =?UTF-8?q?correct=20file=5Fsource=20in=20DB=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_service.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index 13498a32..f7f6ccd9 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -60,9 +60,10 @@ class PashubService: else: try: uprn = active_client.get_uprn_by_job_id(job_id) - logger.info(f"Failed to access job {job_id} with PasHub credentials") except UnauthorizedError: - logger.info(f"Trying CoordinationHub credentials for job {job_id}") + logger.info( + f"PasHub credentials unauthorized for job {job_id}; retrying with CoordinationHub credentials" + ) active_client = self._get_coordination_client() uprn = active_client.get_uprn_by_job_id(job_id) @@ -85,8 +86,13 @@ class PashubService: if uprn or hubspot_deal_id: logger.info("Uploading files to s3") + file_source = ( + FileSourceEnum.PAS_HUB + if active_client is self._pashub_client + else FileSourceEnum.COORDINATION_HUB + ) upload_records = self._upload_to_s3_and_update_db( - job_files, uprn, hubspot_deal_id + job_files, uprn, hubspot_deal_id, file_source ) self._save_site_notes(upload_records) @@ -108,6 +114,7 @@ class PashubService: job_files: List[str], uprn: Optional[str], hubspot_deal_id: Optional[str], + file_source: 
FileSourceEnum, ) -> List[_FileUploadRecord]: if not uprn and not hubspot_deal_id: return [] @@ -133,7 +140,7 @@ class PashubService: s3_upload_timestamp=datetime.now(timezone.utc), uprn=int(uprn) if uprn else None, hubspot_deal_id=hubspot_deal_id, - file_source=FileSourceEnum.PAS_HUB.value, + file_source=file_source.value, file_type=get_file_type_string(filename), ) file_paths.append(file_path) From a4ad1ca11c90f8ff5e2080977b0567ab2ff8e269 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 19 May 2026 11:10:18 +0000 Subject: [PATCH 66/91] =?UTF-8?q?Coordination=20Hub=20file=20listing=20fal?= =?UTF-8?q?lback=20stores=20correct=20file=5Fsource=20in=20DB=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_pashub_service.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py index 1d6d167f..cf1c489a 100644 --- a/backend/pashub_fetcher/tests/test_pashub_service.py +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -333,6 +333,31 @@ def test_run_persists_coordination_hub_file_source_when_pas_401_on_uprn_lookup() assert added[0].file_source == FileSourceEnum.COORDINATION_HUB.value +def test_run_persists_coordination_hub_file_source_when_pas_401_on_file_listing() -> None: + pas_client = MagicMock(spec=PashubClient) + pas_client.get_core_evidence_files_by_job_id.side_effect = UnauthorizedError() + + coord_client = MagicMock(spec=PashubClient) + coord_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"] + + factory = MagicMock(return_value=coord_client) + fake_session = MagicMock() + + service = make_service(pashub_client=pas_client, coordination_client_factory=factory) + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"), + patch("backend.pashub_fetcher.pashub_service.db_session") as mock_db, + 
patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + mock_db.return_value.__enter__.return_value = fake_session + service.run(make_request(uprn="12345")) + + fake_session.add_all.assert_called_once() + added: list[Any] = fake_session.add_all.call_args[0][0] + assert added[0].file_source == FileSourceEnum.COORDINATION_HUB.value + + def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None: mock_client = MagicMock(spec=PashubClient) mock_client.get_uprn_by_job_id.return_value = None From 20ad0616bcdc32eb24abee7bb05f4f707475e00b Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 19 May 2026 11:10:45 +0000 Subject: [PATCH 67/91] =?UTF-8?q?PAS=20Hub=20happy=20path=20asserts=20file?= =?UTF-8?q?=5Fsource=20"pas=20hub"=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/tests/test_pashub_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py index cf1c489a..1f750117 100644 --- a/backend/pashub_fetcher/tests/test_pashub_service.py +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -148,10 +148,11 @@ def test_run_persists_uploaded_file_records_to_db() -> None: service.run(make_request(uprn="12345")) fake_session.add_all.assert_called_once() - added: list = fake_session.add_all.call_args[0][0] + added: list[Any] = fake_session.add_all.call_args[0][0] assert len(added) == 1 assert added[0].s3_file_bucket == "test-bucket" assert added[0].uprn == 12345 + assert added[0].file_source == FileSourceEnum.PAS_HUB.value # --------------------------------------------------------------------------- From bc8ca3ead36e13b71272845baf80d01adee63e41 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 19 May 2026 12:55:30 +0000 Subject: [PATCH 68/91] deployment from infrastructure --- .dockerignore | 2 +- .github/workflows/deploy_terraform.yml | 
46 +++++++++---------- Dockerfile.test.dockerignore | 2 +- .../terraform/lambda/_template/README.md | 2 +- .../lambda_with_api_gateway/variables.tf | 2 +- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.dockerignore b/.dockerignore index 0c7d7749..90436ffc 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,7 +6,7 @@ backend/.idea/* backend/.env recommendations/tests/* model_data/tests/* -infrastructure/* +deployment/* data_collection/* node_modules/* conservation_areas/* diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index bd014e3d..923fc0a9 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -62,20 +62,20 @@ jobs: - uses: hashicorp/setup-terraform@v3 - name: Terraform Init - working-directory: infrastructure/terraform/shared + working-directory: deployment/terraform/shared run: terraform init -reconfigure - name: Terraform Workspace - working-directory: infrastructure/terraform/shared + working-directory: deployment/terraform/shared run: terraform workspace select ${STAGE} || terraform workspace new ${STAGE} - name: Terraform Plan - working-directory: infrastructure/terraform/shared + working-directory: deployment/terraform/shared run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - name: Terraform Apply if: env.TERRAFORM_APPLY == 'true' - working-directory: infrastructure/terraform/shared + working-directory: deployment/terraform/shared run: terraform apply -auto-approve tfplan # ============================================================ @@ -101,7 +101,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: ara_engine - lambda_path: infrastructure/terraform/lambda/engine + lambda_path: deployment/terraform/lambda/engine stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: engine-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.ara_engine_image.outputs.image_digest }} @@ -150,7 +150,7 @@ jobs: uses: 
./.github/workflows/_deploy_lambda.yml with: lambda_name: address2uprn - lambda_path: infrastructure/terraform/lambda/address2UPRN + lambda_path: deployment/terraform/lambda/address2UPRN stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.address2uprn_image.outputs.image_digest }} @@ -191,7 +191,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: postcodeSplitter - lambda_path: infrastructure/terraform/lambda/postcodeSplitter + lambda_path: deployment/terraform/lambda/postcodeSplitter stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }} @@ -231,7 +231,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: bulk_address2uprn_combiner - lambda_path: infrastructure/terraform/lambda/bulk_address2uprn_combiner + lambda_path: deployment/terraform/lambda/bulk_address2uprn_combiner stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: bulk_address2uprn_combiner-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.bulk_address2uprn_combiner_image.outputs.image_digest }} @@ -271,7 +271,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: condition-etl - lambda_path: infrastructure/terraform/lambda/condition-etl + lambda_path: deployment/terraform/lambda/condition-etl stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.condition_etl_image.outputs.image_digest }} @@ -311,7 +311,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: categorisation - lambda_path: infrastructure/terraform/lambda/categorisation + lambda_path: deployment/terraform/lambda/categorisation stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: categorisation-${{ 
needs.determine_stage.outputs.stage }} image_digest: ${{ needs.categorisation_image.outputs.image_digest }} @@ -351,7 +351,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: ordnanceSurvey - lambda_path: infrastructure/terraform/lambda/ordnanceSurvey + lambda_path: deployment/terraform/lambda/ordnanceSurvey stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.ordnanceSurvey_image.outputs.image_digest }} @@ -386,7 +386,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: pashub_to_ara - lambda_path: infrastructure/terraform/lambda/pashub_to_ara + lambda_path: deployment/terraform/lambda/pashub_to_ara stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: pashub_to_ara-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.pashub_to_ara_image.outputs.image_digest }} @@ -419,7 +419,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: ara_fast_api - lambda_path: infrastructure/terraform/lambda/fast-api + lambda_path: deployment/terraform/lambda/fast-api stage: ${{ needs.determine_stage.outputs.stage }} terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} secrets: @@ -458,17 +458,17 @@ jobs: - uses: hashicorp/setup-terraform@v3 - name: Terraform Init - working-directory: infrastructure/terraform/cdn_certificate + working-directory: deployment/terraform/cdn_certificate run: terraform init -reconfigure - name: Terraform Workspace - working-directory: infrastructure/terraform/cdn_certificate + working-directory: deployment/terraform/cdn_certificate run: | terraform workspace select $STAGE \ || terraform workspace new $STAGE - name: Terraform Plan - working-directory: infrastructure/terraform/cdn_certificate + working-directory: deployment/terraform/cdn_certificate run: | terraform plan \ -var="stage=${STAGE}" \ @@ -476,7 +476,7 @@ jobs: - name: Terraform Apply if: env.TERRAFORM_APPLY == 
'true' - working-directory: infrastructure/terraform/cdn_certificate + working-directory: deployment/terraform/cdn_certificate run: terraform apply -auto-approve tfplan @@ -503,17 +503,17 @@ jobs: - uses: hashicorp/setup-terraform@v3 - name: Terraform Init - working-directory: infrastructure/terraform/cdn + working-directory: deployment/terraform/cdn run: terraform init -reconfigure - name: Terraform Workspace - working-directory: infrastructure/terraform/cdn + working-directory: deployment/terraform/cdn run: | terraform workspace select $STAGE \ || terraform workspace new $STAGE - name: Terraform Plan - working-directory: infrastructure/terraform/cdn + working-directory: deployment/terraform/cdn run: | terraform plan \ -var="stage=${STAGE}" \ @@ -521,7 +521,7 @@ jobs: - name: Terraform Apply if: env.TERRAFORM_APPLY == 'true' - working-directory: infrastructure/terraform/cdn + working-directory: deployment/terraform/cdn run: terraform apply -auto-approve tfplan # ============================================================ @@ -562,7 +562,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: magic_plan - lambda_path: infrastructure/terraform/lambda/magic_plan + lambda_path: deployment/terraform/lambda/magic_plan stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: magic-plan-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.magic_plan_image.outputs.image_digest }} @@ -585,7 +585,7 @@ jobs: uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: hubspot-etl-to-ara - lambda_path: infrastructure/terraform/lambda/hubspot_deal_etl + lambda_path: deployment/terraform/lambda/hubspot_deal_etl stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: hubspot-etl-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.hubspot_etl_image.outputs.image_digest }} diff --git a/Dockerfile.test.dockerignore b/Dockerfile.test.dockerignore index 4f79c6ee..ed05c399 100644 --- a/Dockerfile.test.dockerignore +++ 
b/Dockerfile.test.dockerignore @@ -4,7 +4,7 @@ model_data/local_data/ backend/node_modules/ backend/.idea/ backend/.env -infrastructure/ +deployment/ data_collection/ node_modules/ conservation_areas/ diff --git a/infrastructure/terraform/lambda/_template/README.md b/infrastructure/terraform/lambda/_template/README.md index 5bb10627..f2a8638a 100644 --- a/infrastructure/terraform/lambda/_template/README.md +++ b/infrastructure/terraform/lambda/_template/README.md @@ -10,7 +10,7 @@ ### 2. Add infrastructure prerequisites (shared stack) - Add a new ECR repository in: - infrastructure/terraform/shared/main.tf + deployment/terraform/shared/main.tf - Create a PR to deploy this to main then dev in order to deploy the shared stack diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf b/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf index 95e5acd9..b5d0515a 100644 --- a/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf +++ b/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf @@ -11,7 +11,7 @@ variable "zip_excludes" { "**/*.pyc", "**/.pytest_cache/**", "**/tests/**", - "**/infrastructure/**" + "**/deployment/**" ] } From 54a674b5c88bd7907e77ac83db755c47ff4d8028 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 19 May 2026 16:35:09 +0000 Subject: [PATCH 69/91] added postcode splitter rewrite to ddd --- AGENTS.md | 29 ---- CLAUDE.md | 29 ---- asset_list/app.py | 24 ++- .../terraform/README.md | 0 .../terraform/cdn/main.tf | 0 .../terraform/cdn/provider.tf | 0 .../terraform/cdn/variables.tf | 0 .../terraform/cdn_certificate/main.tf | 0 .../terraform/cdn_certificate/outputs.tf | 0 .../terraform/cdn_certificate/provider.tf | 0 .../terraform/cdn_certificate/variables.tf | 0 .../terraform/lambda/_template/README.md | 0 .../terraform/lambda/_template/main.tf | 0 .../terraform/lambda/_template/provider.tf | 0 .../terraform/lambda/_template/variables.tf | 0 
.../terraform/lambda/address2UPRN/main.tf | 0 .../terraform/lambda/address2UPRN/outputs.tf | 0 .../terraform/lambda/address2UPRN/provider.tf | 0 .../lambda/address2UPRN/variables.tf | 0 .../lambda/bulk_address2uprn_combiner/main.tf | 0 .../bulk_address2uprn_combiner/outputs.tf | 0 .../bulk_address2uprn_combiner/provider.tf | 0 .../bulk_address2uprn_combiner/variables.tf | 0 .../terraform/lambda/categorisation/main.tf | 0 .../lambda/categorisation/outputs.tf | 0 .../lambda/categorisation/provider.tf | 0 .../lambda/categorisation/variables.tf | 0 .../terraform/lambda/condition-etl/main.tf | 0 .../lambda/condition-etl/provider.tf | 0 .../lambda/condition-etl/variables.tf | 0 .../terraform/lambda/ecmk_to_ara/main.tf | 0 .../terraform/lambda/ecmk_to_ara/provider.tf | 0 .../terraform/lambda/ecmk_to_ara/variables.tf | 0 .../terraform/lambda/engine/main.tf | 0 .../terraform/lambda/engine/outputs.tf | 0 .../terraform/lambda/engine/provider.tf | 0 .../terraform/lambda/engine/variables.tf | 0 .../terraform/lambda/fast-api/main.tf | 0 .../terraform/lambda/fast-api/outputs.tf | 0 .../terraform/lambda/fast-api/provider.tf | 0 .../terraform/lambda/fast-api/variables.tf | 0 .../terraform/lambda/hubspot_deal_etl/main.tf | 0 .../lambda/hubspot_deal_etl/provider.tf | 0 .../lambda/hubspot_deal_etl/variables.tf | 0 .../terraform/lambda/magic_plan/main.tf | 0 .../terraform/lambda/magic_plan/outputs.tf | 0 .../terraform/lambda/magic_plan/provider.tf | 0 .../terraform/lambda/magic_plan/variables.tf | 0 .../terraform/lambda/ordnanceSurvey/main.tf | 0 .../lambda/ordnanceSurvey/provider.tf | 0 .../lambda/ordnanceSurvey/variables.tf | 0 .../terraform/lambda/pashub_to_ara/main.tf | 0 .../terraform/lambda/pashub_to_ara/outputs.tf | 0 .../lambda/pashub_to_ara/provider.tf | 0 .../lambda/pashub_to_ara/variables.tf | 0 .../terraform/lambda/postcodeSplitter/main.tf | 0 .../lambda/postcodeSplitter/outputs.tf | 0 .../lambda/postcodeSplitter/provider.tf | 0 .../lambda/postcodeSplitter/variables.tf | 0 
.../terraform/modules/acm_certificate/main.tf | 0 .../modules/acm_certificate/outputs.tf | 0 .../modules/acm_certificate/variables.tf | 0 .../terraform/modules/cloudfront/main.tf | 0 .../terraform/modules/cloudfront/variables.tf | 0 .../modules/container_registry/main.tf | 0 .../modules/container_registry/outputs.tf | 0 .../modules/container_registry/variables.tf | 0 .../terraform/modules/ecr/main.tf | 0 .../terraform/modules/ecr/outputs.tf | 0 .../terraform/modules/ecr/variables.tf | 0 .../modules/general_iam_policy/main.tf | 0 .../modules/general_iam_policy/outputs.tf | 0 .../modules/general_iam_policy/variables.tf | 0 .../modules/lambda_execution_role/main.tf | 0 .../modules/lambda_execution_role/outputs.tf | 0 .../lambda_execution_role/variables.tf | 0 .../terraform/modules/lambda_service/main.tf | 0 .../modules/lambda_service/outputs.tf | 0 .../modules/lambda_service/variables.tf | 0 .../modules/lambda_service_zip/main.tf | 0 .../modules/lambda_service_zip/variables.tf | 0 .../modules/lambda_sqs_trigger/main.tf | 0 .../modules/lambda_sqs_trigger/variables.tf | 0 .../modules/lambda_with_api_gateway/main.tf | 0 .../lambda_with_api_gateway/outputs.tf | 0 .../lambda_with_api_gateway/variables.tf | 0 .../terraform/modules/lambda_with_sqs/main.tf | 0 .../modules/lambda_with_sqs/outputs.tf | 0 .../modules/lambda_with_sqs/variables.tf | 0 .../terraform/modules/route53/main.tf | 0 .../terraform/modules/route53/variables.tf | 0 .../terraform/modules/s3/main.tf | 0 .../terraform/modules/s3/outputs.tf | 0 .../terraform/modules/s3/variables.tf | 0 .../terraform/modules/s3_iam_policy/main.tf | 0 .../modules/s3_iam_policy/outputs.tf | 0 .../modules/s3_iam_policy/variables.tf | 0 .../modules/s3_presignable_bucket/main.tf | 0 .../modules/s3_presignable_bucket/outputs.tf | 0 .../s3_presignable_bucket/variables.tf | 0 .../terraform/modules/ses/main.tf | 0 .../terraform/modules/ses/outputs.tf | 0 .../terraform/modules/ses/variables.tf | 0 .../terraform/modules/sqs_queue/main.tf | 
0 .../terraform/modules/sqs_queue/outputs.tf | 0 .../terraform/modules/sqs_queue/variables.tf | 0 .../terraform/modules/tf_state_bucket/main.tf | 0 .../modules/tf_state_bucket/outputs.tf | 0 .../modules/tf_state_bucket/variables.tf | 0 .../terraform/shared/dev.tfvars | 0 .../terraform/shared/main.tf | 0 .../terraform/shared/secrets.tf | 0 .../terraform/shared/variables.tf | 0 domain/__init__.py | 0 domain/tasks/__init__.py | 0 domain/tasks/subtasks.py | 55 +++++++ domain/tasks/tasks.py | 94 +++++++++++ infrastructure/__init__.py | 0 infrastructure/postgres/__init__.py | 0 infrastructure/postgres/config.py | 33 ++++ infrastructure/postgres/engine.py | 18 +++ infrastructure/postgres/subtask_table.py | 21 +++ infrastructure/postgres/task_table.py | 36 +++++ orchestration/__init__.py | 0 orchestration/task_orchestrator.py | 96 +++++++++++ repositories/__init__.py | 0 repositories/tasks/__init__.py | 0 .../tasks/subtask_postgres_repository.py | 89 +++++++++++ repositories/tasks/subtask_repository.py | 18 +++ .../tasks/task_postgres_repository.py | 77 +++++++++ repositories/tasks/task_repository.py | 15 ++ run_backlog.sh | 2 - tests/__init__.py | 0 tests/domain/__init__.py | 0 tests/domain/tasks/__init__.py | 0 tests/domain/tasks/test_subtasks.py | 75 +++++++++ tests/domain/tasks/test_tasks.py | 104 ++++++++++++ tests/orchestration/__init__.py | 0 tests/orchestration/test_task_orchestrator.py | 151 ++++++++++++++++++ tests/repositories/__init__.py | 0 tests/repositories/tasks/__init__.py | 0 tests/repositories/tasks/postgres/__init__.py | 0 .../test_subtask_postgres_repository.py | 81 ++++++++++ .../postgres/test_task_postgres_repository.py | 68 ++++++++ utilities/__init__.py | 0 utilities/aws_lambda/__init__.py | 0 utilities/aws_lambda/default_orchestrator.py | 26 +++ utilities/aws_lambda/subtask_handler.py | 67 ++++++++ utilities/aws_lambda/subtask_trigger_body.py | 17 ++ utilities/aws_lambda/task_handler.py | 98 ++++++++++++ utilities/private.py | 33 ++++ 151 files 
changed, 1281 insertions(+), 75 deletions(-) delete mode 100644 AGENTS.md rename {infrastructure => deployment}/terraform/README.md (100%) rename {infrastructure => deployment}/terraform/cdn/main.tf (100%) rename {infrastructure => deployment}/terraform/cdn/provider.tf (100%) rename {infrastructure => deployment}/terraform/cdn/variables.tf (100%) rename {infrastructure => deployment}/terraform/cdn_certificate/main.tf (100%) rename {infrastructure => deployment}/terraform/cdn_certificate/outputs.tf (100%) rename {infrastructure => deployment}/terraform/cdn_certificate/provider.tf (100%) rename {infrastructure => deployment}/terraform/cdn_certificate/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/_template/README.md (100%) rename {infrastructure => deployment}/terraform/lambda/_template/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/_template/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/_template/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/address2UPRN/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/address2UPRN/outputs.tf (100%) rename {infrastructure => deployment}/terraform/lambda/address2UPRN/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/address2UPRN/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/bulk_address2uprn_combiner/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/bulk_address2uprn_combiner/outputs.tf (100%) rename {infrastructure => deployment}/terraform/lambda/bulk_address2uprn_combiner/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/bulk_address2uprn_combiner/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/categorisation/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/categorisation/outputs.tf (100%) rename {infrastructure => 
deployment}/terraform/lambda/categorisation/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/categorisation/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/condition-etl/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/condition-etl/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/condition-etl/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/ecmk_to_ara/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/ecmk_to_ara/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/ecmk_to_ara/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/engine/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/engine/outputs.tf (100%) rename {infrastructure => deployment}/terraform/lambda/engine/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/engine/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/fast-api/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/fast-api/outputs.tf (100%) rename {infrastructure => deployment}/terraform/lambda/fast-api/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/fast-api/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/hubspot_deal_etl/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/hubspot_deal_etl/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/hubspot_deal_etl/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/magic_plan/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/magic_plan/outputs.tf (100%) rename {infrastructure => deployment}/terraform/lambda/magic_plan/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/magic_plan/variables.tf (100%) rename {infrastructure => 
deployment}/terraform/lambda/ordnanceSurvey/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/ordnanceSurvey/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/ordnanceSurvey/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/pashub_to_ara/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/pashub_to_ara/outputs.tf (100%) rename {infrastructure => deployment}/terraform/lambda/pashub_to_ara/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/pashub_to_ara/variables.tf (100%) rename {infrastructure => deployment}/terraform/lambda/postcodeSplitter/main.tf (100%) rename {infrastructure => deployment}/terraform/lambda/postcodeSplitter/outputs.tf (100%) rename {infrastructure => deployment}/terraform/lambda/postcodeSplitter/provider.tf (100%) rename {infrastructure => deployment}/terraform/lambda/postcodeSplitter/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/acm_certificate/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/acm_certificate/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/acm_certificate/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/cloudfront/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/cloudfront/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/container_registry/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/container_registry/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/container_registry/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/ecr/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/ecr/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/ecr/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/general_iam_policy/main.tf (100%) rename 
{infrastructure => deployment}/terraform/modules/general_iam_policy/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/general_iam_policy/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_execution_role/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_execution_role/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_execution_role/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_service/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_service/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_service/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_service_zip/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_service_zip/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_sqs_trigger/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_sqs_trigger/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_with_api_gateway/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_with_api_gateway/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_with_api_gateway/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_with_sqs/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_with_sqs/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/lambda_with_sqs/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/route53/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/route53/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/s3/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/s3/outputs.tf (100%) rename 
{infrastructure => deployment}/terraform/modules/s3/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/s3_iam_policy/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/s3_iam_policy/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/s3_iam_policy/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/s3_presignable_bucket/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/s3_presignable_bucket/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/s3_presignable_bucket/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/ses/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/ses/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/ses/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/sqs_queue/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/sqs_queue/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/sqs_queue/variables.tf (100%) rename {infrastructure => deployment}/terraform/modules/tf_state_bucket/main.tf (100%) rename {infrastructure => deployment}/terraform/modules/tf_state_bucket/outputs.tf (100%) rename {infrastructure => deployment}/terraform/modules/tf_state_bucket/variables.tf (100%) rename {infrastructure => deployment}/terraform/shared/dev.tfvars (100%) rename {infrastructure => deployment}/terraform/shared/main.tf (100%) rename {infrastructure => deployment}/terraform/shared/secrets.tf (100%) rename {infrastructure => deployment}/terraform/shared/variables.tf (100%) create mode 100644 domain/__init__.py create mode 100644 domain/tasks/__init__.py create mode 100644 domain/tasks/subtasks.py create mode 100644 domain/tasks/tasks.py create mode 100644 infrastructure/__init__.py create mode 100644 infrastructure/postgres/__init__.py create mode 100644 infrastructure/postgres/config.py create mode 
100644 infrastructure/postgres/engine.py create mode 100644 infrastructure/postgres/subtask_table.py create mode 100644 infrastructure/postgres/task_table.py create mode 100644 orchestration/__init__.py create mode 100644 orchestration/task_orchestrator.py create mode 100644 repositories/__init__.py create mode 100644 repositories/tasks/__init__.py create mode 100644 repositories/tasks/subtask_postgres_repository.py create mode 100644 repositories/tasks/subtask_repository.py create mode 100644 repositories/tasks/task_postgres_repository.py create mode 100644 repositories/tasks/task_repository.py delete mode 100644 run_backlog.sh create mode 100644 tests/__init__.py create mode 100644 tests/domain/__init__.py create mode 100644 tests/domain/tasks/__init__.py create mode 100644 tests/domain/tasks/test_subtasks.py create mode 100644 tests/domain/tasks/test_tasks.py create mode 100644 tests/orchestration/__init__.py create mode 100644 tests/orchestration/test_task_orchestrator.py create mode 100644 tests/repositories/__init__.py create mode 100644 tests/repositories/tasks/__init__.py create mode 100644 tests/repositories/tasks/postgres/__init__.py create mode 100644 tests/repositories/tasks/postgres/test_subtask_postgres_repository.py create mode 100644 tests/repositories/tasks/postgres/test_task_postgres_repository.py create mode 100644 utilities/__init__.py create mode 100644 utilities/aws_lambda/__init__.py create mode 100644 utilities/aws_lambda/default_orchestrator.py create mode 100644 utilities/aws_lambda/subtask_handler.py create mode 100644 utilities/aws_lambda/subtask_trigger_body.py create mode 100644 utilities/aws_lambda/task_handler.py create mode 100644 utilities/private.py diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index aa0426a0..00000000 --- a/AGENTS.md +++ /dev/null @@ -1,29 +0,0 @@ - - - - - -## BACKLOG WORKFLOW INSTRUCTIONS - -This project uses Backlog.md MCP for all task and project management activities. 
- -**CRITICAL GUIDANCE** - -- If your client supports MCP resources, read `backlog://workflow/overview` to understand when and how to use Backlog for this project. -- If your client only supports tools or the above request fails, call `backlog.get_backlog_instructions()` to load the tool-oriented overview. Use the `instruction` selector when you need `task-creation`, `task-execution`, or `task-finalization`. - -- **First time working here?** Read the overview resource IMMEDIATELY to learn the workflow -- **Already familiar?** You should have the overview cached ("## Backlog.md Overview (MCP)") -- **When to read it**: BEFORE creating tasks, or when you're unsure whether to track work - -These guides cover: -- Decision framework for when to create tasks -- Search-first workflow to avoid duplicates -- Links to detailed guides for task creation, execution, and finalization -- MCP tools reference - -You MUST read the overview resource to understand the complete workflow. The information is NOT summarized here. - - - - diff --git a/CLAUDE.md b/CLAUDE.md index f88a59d5..2dabf532 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,33 +1,4 @@ - - - - -## BACKLOG WORKFLOW INSTRUCTIONS - -This project uses Backlog.md MCP for all task and project management activities. - -**CRITICAL GUIDANCE** - -- If your client supports MCP resources, read `backlog://workflow/overview` to understand when and how to use Backlog for this project. -- If your client only supports tools or the above request fails, call `backlog.get_backlog_instructions()` to load the tool-oriented overview. Use the `instruction` selector when you need `task-creation`, `task-execution`, or `task-finalization`. 
- -- **First time working here?** Read the overview resource IMMEDIATELY to learn the workflow -- **Already familiar?** You should have the overview cached ("## Backlog.md Overview (MCP)") -- **When to read it**: BEFORE creating tasks, or when you're unsure whether to track work - -These guides cover: -- Decision framework for when to create tasks -- Search-first workflow to avoid duplicates -- Links to detailed guides for task creation, execution, and finalization -- MCP tools reference - -You MUST read the overview resource to understand the complete workflow. The information is NOT summarized here. - - - - - ## Available Skills Five Claude Code skills are installed in this repo's dev container. Each maps to a phase of the feature lifecycle. diff --git a/asset_list/app.py b/asset_list/app.py index 7413c7cb..9b10d7f3 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -79,23 +79,23 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - data_filename = "input.xlsx" - sheet_name = "Handovers" - postcode_column = "POSTCODE" - address1_column = "Full Addres" + data_filename = "lincs_address_list.xlsx" + sheet_name = "Sheet1" + postcode_column = "Postcode" + address1_column = "Deal Name" address1_method = None - fulladdress_column = "Full Addres" + fulladdress_column = "Deal Name" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "domna_found_uprn" - landlord_property_type = "PROPERTY TYPE" # Good to include if landlord gave - landlord_built_form = "Type Description" # Good to include if landlord gave + landlord_os_uprn = None + landlord_property_type = None # Good to include if landlord gave + landlord_built_form = None # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "PROP REF" + landlord_property_id = "landlord_id" landlord_sap = None outcomes_filename = 
None outcomes_sheetname = None @@ -468,9 +468,3 @@ def app(): asset_list.duplicated_addresses.to_excel( writer, sheet_name="Duplicate Properties", index=False ) - - - - -for key,value in dict.items(): - lsakjfldsa \ No newline at end of file diff --git a/infrastructure/terraform/README.md b/deployment/terraform/README.md similarity index 100% rename from infrastructure/terraform/README.md rename to deployment/terraform/README.md diff --git a/infrastructure/terraform/cdn/main.tf b/deployment/terraform/cdn/main.tf similarity index 100% rename from infrastructure/terraform/cdn/main.tf rename to deployment/terraform/cdn/main.tf diff --git a/infrastructure/terraform/cdn/provider.tf b/deployment/terraform/cdn/provider.tf similarity index 100% rename from infrastructure/terraform/cdn/provider.tf rename to deployment/terraform/cdn/provider.tf diff --git a/infrastructure/terraform/cdn/variables.tf b/deployment/terraform/cdn/variables.tf similarity index 100% rename from infrastructure/terraform/cdn/variables.tf rename to deployment/terraform/cdn/variables.tf diff --git a/infrastructure/terraform/cdn_certificate/main.tf b/deployment/terraform/cdn_certificate/main.tf similarity index 100% rename from infrastructure/terraform/cdn_certificate/main.tf rename to deployment/terraform/cdn_certificate/main.tf diff --git a/infrastructure/terraform/cdn_certificate/outputs.tf b/deployment/terraform/cdn_certificate/outputs.tf similarity index 100% rename from infrastructure/terraform/cdn_certificate/outputs.tf rename to deployment/terraform/cdn_certificate/outputs.tf diff --git a/infrastructure/terraform/cdn_certificate/provider.tf b/deployment/terraform/cdn_certificate/provider.tf similarity index 100% rename from infrastructure/terraform/cdn_certificate/provider.tf rename to deployment/terraform/cdn_certificate/provider.tf diff --git a/infrastructure/terraform/cdn_certificate/variables.tf b/deployment/terraform/cdn_certificate/variables.tf similarity index 100% rename from 
infrastructure/terraform/cdn_certificate/variables.tf rename to deployment/terraform/cdn_certificate/variables.tf diff --git a/infrastructure/terraform/lambda/_template/README.md b/deployment/terraform/lambda/_template/README.md similarity index 100% rename from infrastructure/terraform/lambda/_template/README.md rename to deployment/terraform/lambda/_template/README.md diff --git a/infrastructure/terraform/lambda/_template/main.tf b/deployment/terraform/lambda/_template/main.tf similarity index 100% rename from infrastructure/terraform/lambda/_template/main.tf rename to deployment/terraform/lambda/_template/main.tf diff --git a/infrastructure/terraform/lambda/_template/provider.tf b/deployment/terraform/lambda/_template/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/_template/provider.tf rename to deployment/terraform/lambda/_template/provider.tf diff --git a/infrastructure/terraform/lambda/_template/variables.tf b/deployment/terraform/lambda/_template/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/_template/variables.tf rename to deployment/terraform/lambda/_template/variables.tf diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/deployment/terraform/lambda/address2UPRN/main.tf similarity index 100% rename from infrastructure/terraform/lambda/address2UPRN/main.tf rename to deployment/terraform/lambda/address2UPRN/main.tf diff --git a/infrastructure/terraform/lambda/address2UPRN/outputs.tf b/deployment/terraform/lambda/address2UPRN/outputs.tf similarity index 100% rename from infrastructure/terraform/lambda/address2UPRN/outputs.tf rename to deployment/terraform/lambda/address2UPRN/outputs.tf diff --git a/infrastructure/terraform/lambda/address2UPRN/provider.tf b/deployment/terraform/lambda/address2UPRN/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/address2UPRN/provider.tf rename to deployment/terraform/lambda/address2UPRN/provider.tf diff --git 
a/infrastructure/terraform/lambda/address2UPRN/variables.tf b/deployment/terraform/lambda/address2UPRN/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/address2UPRN/variables.tf rename to deployment/terraform/lambda/address2UPRN/variables.tf diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf b/deployment/terraform/lambda/bulk_address2uprn_combiner/main.tf similarity index 100% rename from infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf rename to deployment/terraform/lambda/bulk_address2uprn_combiner/main.tf diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf b/deployment/terraform/lambda/bulk_address2uprn_combiner/outputs.tf similarity index 100% rename from infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf rename to deployment/terraform/lambda/bulk_address2uprn_combiner/outputs.tf diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf b/deployment/terraform/lambda/bulk_address2uprn_combiner/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf rename to deployment/terraform/lambda/bulk_address2uprn_combiner/provider.tf diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf b/deployment/terraform/lambda/bulk_address2uprn_combiner/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf rename to deployment/terraform/lambda/bulk_address2uprn_combiner/variables.tf diff --git a/infrastructure/terraform/lambda/categorisation/main.tf b/deployment/terraform/lambda/categorisation/main.tf similarity index 100% rename from infrastructure/terraform/lambda/categorisation/main.tf rename to deployment/terraform/lambda/categorisation/main.tf diff --git a/infrastructure/terraform/lambda/categorisation/outputs.tf b/deployment/terraform/lambda/categorisation/outputs.tf 
similarity index 100% rename from infrastructure/terraform/lambda/categorisation/outputs.tf rename to deployment/terraform/lambda/categorisation/outputs.tf diff --git a/infrastructure/terraform/lambda/categorisation/provider.tf b/deployment/terraform/lambda/categorisation/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/categorisation/provider.tf rename to deployment/terraform/lambda/categorisation/provider.tf diff --git a/infrastructure/terraform/lambda/categorisation/variables.tf b/deployment/terraform/lambda/categorisation/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/categorisation/variables.tf rename to deployment/terraform/lambda/categorisation/variables.tf diff --git a/infrastructure/terraform/lambda/condition-etl/main.tf b/deployment/terraform/lambda/condition-etl/main.tf similarity index 100% rename from infrastructure/terraform/lambda/condition-etl/main.tf rename to deployment/terraform/lambda/condition-etl/main.tf diff --git a/infrastructure/terraform/lambda/condition-etl/provider.tf b/deployment/terraform/lambda/condition-etl/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/condition-etl/provider.tf rename to deployment/terraform/lambda/condition-etl/provider.tf diff --git a/infrastructure/terraform/lambda/condition-etl/variables.tf b/deployment/terraform/lambda/condition-etl/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/condition-etl/variables.tf rename to deployment/terraform/lambda/condition-etl/variables.tf diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/main.tf b/deployment/terraform/lambda/ecmk_to_ara/main.tf similarity index 100% rename from infrastructure/terraform/lambda/ecmk_to_ara/main.tf rename to deployment/terraform/lambda/ecmk_to_ara/main.tf diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/provider.tf b/deployment/terraform/lambda/ecmk_to_ara/provider.tf similarity index 100% rename from 
infrastructure/terraform/lambda/ecmk_to_ara/provider.tf rename to deployment/terraform/lambda/ecmk_to_ara/provider.tf diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/variables.tf b/deployment/terraform/lambda/ecmk_to_ara/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/ecmk_to_ara/variables.tf rename to deployment/terraform/lambda/ecmk_to_ara/variables.tf diff --git a/infrastructure/terraform/lambda/engine/main.tf b/deployment/terraform/lambda/engine/main.tf similarity index 100% rename from infrastructure/terraform/lambda/engine/main.tf rename to deployment/terraform/lambda/engine/main.tf diff --git a/infrastructure/terraform/lambda/engine/outputs.tf b/deployment/terraform/lambda/engine/outputs.tf similarity index 100% rename from infrastructure/terraform/lambda/engine/outputs.tf rename to deployment/terraform/lambda/engine/outputs.tf diff --git a/infrastructure/terraform/lambda/engine/provider.tf b/deployment/terraform/lambda/engine/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/engine/provider.tf rename to deployment/terraform/lambda/engine/provider.tf diff --git a/infrastructure/terraform/lambda/engine/variables.tf b/deployment/terraform/lambda/engine/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/engine/variables.tf rename to deployment/terraform/lambda/engine/variables.tf diff --git a/infrastructure/terraform/lambda/fast-api/main.tf b/deployment/terraform/lambda/fast-api/main.tf similarity index 100% rename from infrastructure/terraform/lambda/fast-api/main.tf rename to deployment/terraform/lambda/fast-api/main.tf diff --git a/infrastructure/terraform/lambda/fast-api/outputs.tf b/deployment/terraform/lambda/fast-api/outputs.tf similarity index 100% rename from infrastructure/terraform/lambda/fast-api/outputs.tf rename to deployment/terraform/lambda/fast-api/outputs.tf diff --git a/infrastructure/terraform/lambda/fast-api/provider.tf 
b/deployment/terraform/lambda/fast-api/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/fast-api/provider.tf rename to deployment/terraform/lambda/fast-api/provider.tf diff --git a/infrastructure/terraform/lambda/fast-api/variables.tf b/deployment/terraform/lambda/fast-api/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/fast-api/variables.tf rename to deployment/terraform/lambda/fast-api/variables.tf diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/deployment/terraform/lambda/hubspot_deal_etl/main.tf similarity index 100% rename from infrastructure/terraform/lambda/hubspot_deal_etl/main.tf rename to deployment/terraform/lambda/hubspot_deal_etl/main.tf diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf b/deployment/terraform/lambda/hubspot_deal_etl/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf rename to deployment/terraform/lambda/hubspot_deal_etl/provider.tf diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf b/deployment/terraform/lambda/hubspot_deal_etl/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf rename to deployment/terraform/lambda/hubspot_deal_etl/variables.tf diff --git a/infrastructure/terraform/lambda/magic_plan/main.tf b/deployment/terraform/lambda/magic_plan/main.tf similarity index 100% rename from infrastructure/terraform/lambda/magic_plan/main.tf rename to deployment/terraform/lambda/magic_plan/main.tf diff --git a/infrastructure/terraform/lambda/magic_plan/outputs.tf b/deployment/terraform/lambda/magic_plan/outputs.tf similarity index 100% rename from infrastructure/terraform/lambda/magic_plan/outputs.tf rename to deployment/terraform/lambda/magic_plan/outputs.tf diff --git a/infrastructure/terraform/lambda/magic_plan/provider.tf b/deployment/terraform/lambda/magic_plan/provider.tf similarity index 
100% rename from infrastructure/terraform/lambda/magic_plan/provider.tf rename to deployment/terraform/lambda/magic_plan/provider.tf diff --git a/infrastructure/terraform/lambda/magic_plan/variables.tf b/deployment/terraform/lambda/magic_plan/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/magic_plan/variables.tf rename to deployment/terraform/lambda/magic_plan/variables.tf diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/main.tf b/deployment/terraform/lambda/ordnanceSurvey/main.tf similarity index 100% rename from infrastructure/terraform/lambda/ordnanceSurvey/main.tf rename to deployment/terraform/lambda/ordnanceSurvey/main.tf diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf b/deployment/terraform/lambda/ordnanceSurvey/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/ordnanceSurvey/provider.tf rename to deployment/terraform/lambda/ordnanceSurvey/provider.tf diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf b/deployment/terraform/lambda/ordnanceSurvey/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/ordnanceSurvey/variables.tf rename to deployment/terraform/lambda/ordnanceSurvey/variables.tf diff --git a/infrastructure/terraform/lambda/pashub_to_ara/main.tf b/deployment/terraform/lambda/pashub_to_ara/main.tf similarity index 100% rename from infrastructure/terraform/lambda/pashub_to_ara/main.tf rename to deployment/terraform/lambda/pashub_to_ara/main.tf diff --git a/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf b/deployment/terraform/lambda/pashub_to_ara/outputs.tf similarity index 100% rename from infrastructure/terraform/lambda/pashub_to_ara/outputs.tf rename to deployment/terraform/lambda/pashub_to_ara/outputs.tf diff --git a/infrastructure/terraform/lambda/pashub_to_ara/provider.tf b/deployment/terraform/lambda/pashub_to_ara/provider.tf similarity index 100% rename from 
infrastructure/terraform/lambda/pashub_to_ara/provider.tf rename to deployment/terraform/lambda/pashub_to_ara/provider.tf diff --git a/infrastructure/terraform/lambda/pashub_to_ara/variables.tf b/deployment/terraform/lambda/pashub_to_ara/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/pashub_to_ara/variables.tf rename to deployment/terraform/lambda/pashub_to_ara/variables.tf diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf similarity index 100% rename from infrastructure/terraform/lambda/postcodeSplitter/main.tf rename to deployment/terraform/lambda/postcodeSplitter/main.tf diff --git a/infrastructure/terraform/lambda/postcodeSplitter/outputs.tf b/deployment/terraform/lambda/postcodeSplitter/outputs.tf similarity index 100% rename from infrastructure/terraform/lambda/postcodeSplitter/outputs.tf rename to deployment/terraform/lambda/postcodeSplitter/outputs.tf diff --git a/infrastructure/terraform/lambda/postcodeSplitter/provider.tf b/deployment/terraform/lambda/postcodeSplitter/provider.tf similarity index 100% rename from infrastructure/terraform/lambda/postcodeSplitter/provider.tf rename to deployment/terraform/lambda/postcodeSplitter/provider.tf diff --git a/infrastructure/terraform/lambda/postcodeSplitter/variables.tf b/deployment/terraform/lambda/postcodeSplitter/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/postcodeSplitter/variables.tf rename to deployment/terraform/lambda/postcodeSplitter/variables.tf diff --git a/infrastructure/terraform/modules/acm_certificate/main.tf b/deployment/terraform/modules/acm_certificate/main.tf similarity index 100% rename from infrastructure/terraform/modules/acm_certificate/main.tf rename to deployment/terraform/modules/acm_certificate/main.tf diff --git a/infrastructure/terraform/modules/acm_certificate/outputs.tf b/deployment/terraform/modules/acm_certificate/outputs.tf similarity index 
100% rename from infrastructure/terraform/modules/acm_certificate/outputs.tf rename to deployment/terraform/modules/acm_certificate/outputs.tf diff --git a/infrastructure/terraform/modules/acm_certificate/variables.tf b/deployment/terraform/modules/acm_certificate/variables.tf similarity index 100% rename from infrastructure/terraform/modules/acm_certificate/variables.tf rename to deployment/terraform/modules/acm_certificate/variables.tf diff --git a/infrastructure/terraform/modules/cloudfront/main.tf b/deployment/terraform/modules/cloudfront/main.tf similarity index 100% rename from infrastructure/terraform/modules/cloudfront/main.tf rename to deployment/terraform/modules/cloudfront/main.tf diff --git a/infrastructure/terraform/modules/cloudfront/variables.tf b/deployment/terraform/modules/cloudfront/variables.tf similarity index 100% rename from infrastructure/terraform/modules/cloudfront/variables.tf rename to deployment/terraform/modules/cloudfront/variables.tf diff --git a/infrastructure/terraform/modules/container_registry/main.tf b/deployment/terraform/modules/container_registry/main.tf similarity index 100% rename from infrastructure/terraform/modules/container_registry/main.tf rename to deployment/terraform/modules/container_registry/main.tf diff --git a/infrastructure/terraform/modules/container_registry/outputs.tf b/deployment/terraform/modules/container_registry/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/container_registry/outputs.tf rename to deployment/terraform/modules/container_registry/outputs.tf diff --git a/infrastructure/terraform/modules/container_registry/variables.tf b/deployment/terraform/modules/container_registry/variables.tf similarity index 100% rename from infrastructure/terraform/modules/container_registry/variables.tf rename to deployment/terraform/modules/container_registry/variables.tf diff --git a/infrastructure/terraform/modules/ecr/main.tf b/deployment/terraform/modules/ecr/main.tf similarity 
index 100% rename from infrastructure/terraform/modules/ecr/main.tf rename to deployment/terraform/modules/ecr/main.tf diff --git a/infrastructure/terraform/modules/ecr/outputs.tf b/deployment/terraform/modules/ecr/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/ecr/outputs.tf rename to deployment/terraform/modules/ecr/outputs.tf diff --git a/infrastructure/terraform/modules/ecr/variables.tf b/deployment/terraform/modules/ecr/variables.tf similarity index 100% rename from infrastructure/terraform/modules/ecr/variables.tf rename to deployment/terraform/modules/ecr/variables.tf diff --git a/infrastructure/terraform/modules/general_iam_policy/main.tf b/deployment/terraform/modules/general_iam_policy/main.tf similarity index 100% rename from infrastructure/terraform/modules/general_iam_policy/main.tf rename to deployment/terraform/modules/general_iam_policy/main.tf diff --git a/infrastructure/terraform/modules/general_iam_policy/outputs.tf b/deployment/terraform/modules/general_iam_policy/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/general_iam_policy/outputs.tf rename to deployment/terraform/modules/general_iam_policy/outputs.tf diff --git a/infrastructure/terraform/modules/general_iam_policy/variables.tf b/deployment/terraform/modules/general_iam_policy/variables.tf similarity index 100% rename from infrastructure/terraform/modules/general_iam_policy/variables.tf rename to deployment/terraform/modules/general_iam_policy/variables.tf diff --git a/infrastructure/terraform/modules/lambda_execution_role/main.tf b/deployment/terraform/modules/lambda_execution_role/main.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_execution_role/main.tf rename to deployment/terraform/modules/lambda_execution_role/main.tf diff --git a/infrastructure/terraform/modules/lambda_execution_role/outputs.tf b/deployment/terraform/modules/lambda_execution_role/outputs.tf similarity index 100% rename from 
infrastructure/terraform/modules/lambda_execution_role/outputs.tf rename to deployment/terraform/modules/lambda_execution_role/outputs.tf diff --git a/infrastructure/terraform/modules/lambda_execution_role/variables.tf b/deployment/terraform/modules/lambda_execution_role/variables.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_execution_role/variables.tf rename to deployment/terraform/modules/lambda_execution_role/variables.tf diff --git a/infrastructure/terraform/modules/lambda_service/main.tf b/deployment/terraform/modules/lambda_service/main.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_service/main.tf rename to deployment/terraform/modules/lambda_service/main.tf diff --git a/infrastructure/terraform/modules/lambda_service/outputs.tf b/deployment/terraform/modules/lambda_service/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_service/outputs.tf rename to deployment/terraform/modules/lambda_service/outputs.tf diff --git a/infrastructure/terraform/modules/lambda_service/variables.tf b/deployment/terraform/modules/lambda_service/variables.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_service/variables.tf rename to deployment/terraform/modules/lambda_service/variables.tf diff --git a/infrastructure/terraform/modules/lambda_service_zip/main.tf b/deployment/terraform/modules/lambda_service_zip/main.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_service_zip/main.tf rename to deployment/terraform/modules/lambda_service_zip/main.tf diff --git a/infrastructure/terraform/modules/lambda_service_zip/variables.tf b/deployment/terraform/modules/lambda_service_zip/variables.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_service_zip/variables.tf rename to deployment/terraform/modules/lambda_service_zip/variables.tf diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf 
b/deployment/terraform/modules/lambda_sqs_trigger/main.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_sqs_trigger/main.tf rename to deployment/terraform/modules/lambda_sqs_trigger/main.tf diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf b/deployment/terraform/modules/lambda_sqs_trigger/variables.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf rename to deployment/terraform/modules/lambda_sqs_trigger/variables.tf diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/main.tf b/deployment/terraform/modules/lambda_with_api_gateway/main.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_with_api_gateway/main.tf rename to deployment/terraform/modules/lambda_with_api_gateway/main.tf diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/outputs.tf b/deployment/terraform/modules/lambda_with_api_gateway/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_with_api_gateway/outputs.tf rename to deployment/terraform/modules/lambda_with_api_gateway/outputs.tf diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf b/deployment/terraform/modules/lambda_with_api_gateway/variables.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf rename to deployment/terraform/modules/lambda_with_api_gateway/variables.tf diff --git a/infrastructure/terraform/modules/lambda_with_sqs/main.tf b/deployment/terraform/modules/lambda_with_sqs/main.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_with_sqs/main.tf rename to deployment/terraform/modules/lambda_with_sqs/main.tf diff --git a/infrastructure/terraform/modules/lambda_with_sqs/outputs.tf b/deployment/terraform/modules/lambda_with_sqs/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_with_sqs/outputs.tf 
rename to deployment/terraform/modules/lambda_with_sqs/outputs.tf diff --git a/infrastructure/terraform/modules/lambda_with_sqs/variables.tf b/deployment/terraform/modules/lambda_with_sqs/variables.tf similarity index 100% rename from infrastructure/terraform/modules/lambda_with_sqs/variables.tf rename to deployment/terraform/modules/lambda_with_sqs/variables.tf diff --git a/infrastructure/terraform/modules/route53/main.tf b/deployment/terraform/modules/route53/main.tf similarity index 100% rename from infrastructure/terraform/modules/route53/main.tf rename to deployment/terraform/modules/route53/main.tf diff --git a/infrastructure/terraform/modules/route53/variables.tf b/deployment/terraform/modules/route53/variables.tf similarity index 100% rename from infrastructure/terraform/modules/route53/variables.tf rename to deployment/terraform/modules/route53/variables.tf diff --git a/infrastructure/terraform/modules/s3/main.tf b/deployment/terraform/modules/s3/main.tf similarity index 100% rename from infrastructure/terraform/modules/s3/main.tf rename to deployment/terraform/modules/s3/main.tf diff --git a/infrastructure/terraform/modules/s3/outputs.tf b/deployment/terraform/modules/s3/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/s3/outputs.tf rename to deployment/terraform/modules/s3/outputs.tf diff --git a/infrastructure/terraform/modules/s3/variables.tf b/deployment/terraform/modules/s3/variables.tf similarity index 100% rename from infrastructure/terraform/modules/s3/variables.tf rename to deployment/terraform/modules/s3/variables.tf diff --git a/infrastructure/terraform/modules/s3_iam_policy/main.tf b/deployment/terraform/modules/s3_iam_policy/main.tf similarity index 100% rename from infrastructure/terraform/modules/s3_iam_policy/main.tf rename to deployment/terraform/modules/s3_iam_policy/main.tf diff --git a/infrastructure/terraform/modules/s3_iam_policy/outputs.tf b/deployment/terraform/modules/s3_iam_policy/outputs.tf 
similarity index 100% rename from infrastructure/terraform/modules/s3_iam_policy/outputs.tf rename to deployment/terraform/modules/s3_iam_policy/outputs.tf diff --git a/infrastructure/terraform/modules/s3_iam_policy/variables.tf b/deployment/terraform/modules/s3_iam_policy/variables.tf similarity index 100% rename from infrastructure/terraform/modules/s3_iam_policy/variables.tf rename to deployment/terraform/modules/s3_iam_policy/variables.tf diff --git a/infrastructure/terraform/modules/s3_presignable_bucket/main.tf b/deployment/terraform/modules/s3_presignable_bucket/main.tf similarity index 100% rename from infrastructure/terraform/modules/s3_presignable_bucket/main.tf rename to deployment/terraform/modules/s3_presignable_bucket/main.tf diff --git a/infrastructure/terraform/modules/s3_presignable_bucket/outputs.tf b/deployment/terraform/modules/s3_presignable_bucket/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/s3_presignable_bucket/outputs.tf rename to deployment/terraform/modules/s3_presignable_bucket/outputs.tf diff --git a/infrastructure/terraform/modules/s3_presignable_bucket/variables.tf b/deployment/terraform/modules/s3_presignable_bucket/variables.tf similarity index 100% rename from infrastructure/terraform/modules/s3_presignable_bucket/variables.tf rename to deployment/terraform/modules/s3_presignable_bucket/variables.tf diff --git a/infrastructure/terraform/modules/ses/main.tf b/deployment/terraform/modules/ses/main.tf similarity index 100% rename from infrastructure/terraform/modules/ses/main.tf rename to deployment/terraform/modules/ses/main.tf diff --git a/infrastructure/terraform/modules/ses/outputs.tf b/deployment/terraform/modules/ses/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/ses/outputs.tf rename to deployment/terraform/modules/ses/outputs.tf diff --git a/infrastructure/terraform/modules/ses/variables.tf b/deployment/terraform/modules/ses/variables.tf similarity index 100% 
rename from infrastructure/terraform/modules/ses/variables.tf rename to deployment/terraform/modules/ses/variables.tf diff --git a/infrastructure/terraform/modules/sqs_queue/main.tf b/deployment/terraform/modules/sqs_queue/main.tf similarity index 100% rename from infrastructure/terraform/modules/sqs_queue/main.tf rename to deployment/terraform/modules/sqs_queue/main.tf diff --git a/infrastructure/terraform/modules/sqs_queue/outputs.tf b/deployment/terraform/modules/sqs_queue/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/sqs_queue/outputs.tf rename to deployment/terraform/modules/sqs_queue/outputs.tf diff --git a/infrastructure/terraform/modules/sqs_queue/variables.tf b/deployment/terraform/modules/sqs_queue/variables.tf similarity index 100% rename from infrastructure/terraform/modules/sqs_queue/variables.tf rename to deployment/terraform/modules/sqs_queue/variables.tf diff --git a/infrastructure/terraform/modules/tf_state_bucket/main.tf b/deployment/terraform/modules/tf_state_bucket/main.tf similarity index 100% rename from infrastructure/terraform/modules/tf_state_bucket/main.tf rename to deployment/terraform/modules/tf_state_bucket/main.tf diff --git a/infrastructure/terraform/modules/tf_state_bucket/outputs.tf b/deployment/terraform/modules/tf_state_bucket/outputs.tf similarity index 100% rename from infrastructure/terraform/modules/tf_state_bucket/outputs.tf rename to deployment/terraform/modules/tf_state_bucket/outputs.tf diff --git a/infrastructure/terraform/modules/tf_state_bucket/variables.tf b/deployment/terraform/modules/tf_state_bucket/variables.tf similarity index 100% rename from infrastructure/terraform/modules/tf_state_bucket/variables.tf rename to deployment/terraform/modules/tf_state_bucket/variables.tf diff --git a/infrastructure/terraform/shared/dev.tfvars b/deployment/terraform/shared/dev.tfvars similarity index 100% rename from infrastructure/terraform/shared/dev.tfvars rename to 
deployment/terraform/shared/dev.tfvars diff --git a/infrastructure/terraform/shared/main.tf b/deployment/terraform/shared/main.tf similarity index 100% rename from infrastructure/terraform/shared/main.tf rename to deployment/terraform/shared/main.tf diff --git a/infrastructure/terraform/shared/secrets.tf b/deployment/terraform/shared/secrets.tf similarity index 100% rename from infrastructure/terraform/shared/secrets.tf rename to deployment/terraform/shared/secrets.tf diff --git a/infrastructure/terraform/shared/variables.tf b/deployment/terraform/shared/variables.tf similarity index 100% rename from infrastructure/terraform/shared/variables.tf rename to deployment/terraform/shared/variables.tf diff --git a/domain/__init__.py b/domain/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/domain/tasks/__init__.py b/domain/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/domain/tasks/subtasks.py b/domain/tasks/subtasks.py new file mode 100644 index 00000000..bd49a6ec --- /dev/null +++ b/domain/tasks/subtasks.py @@ -0,0 +1,55 @@ +from dataclasses import dataclass +from datetime import datetime, timezone +from enum import Enum +from typing import Any, Optional +from uuid import UUID, uuid4 + + +class SubTaskStatus(str, Enum): + WAITING = "waiting" + IN_PROGRESS = "in progress" + COMPLETE = "complete" + FAILED = "failed" + + +@dataclass +class SubTask: + id: UUID + task_id: UUID + status: SubTaskStatus = SubTaskStatus.WAITING + inputs: Optional[dict[str, Any]] = None + outputs: Optional[dict[str, Any]] = None + cloud_logs_url: Optional[str] = None + job_started: Optional[datetime] = None + job_completed: Optional[datetime] = None + + @classmethod + def create( + cls, *, task_id: UUID, inputs: Optional[dict[str, Any]] = None + ) -> "SubTask": + return cls( + id=uuid4(), + task_id=task_id, + status=SubTaskStatus.WAITING, + inputs=inputs, + ) + + def start(self, cloud_logs_url: Optional[str] = None) -> None: + if self.status not 
in (SubTaskStatus.WAITING, SubTaskStatus.IN_PROGRESS): + raise ValueError(f"cannot start subtask in status {self.status}") + if self.job_started is None: + self.job_started = datetime.now(timezone.utc) + self.status = SubTaskStatus.IN_PROGRESS + if cloud_logs_url is not None: + self.cloud_logs_url = cloud_logs_url + + def complete(self, result: Any = None) -> None: + self.status = SubTaskStatus.COMPLETE + self.job_completed = datetime.now(timezone.utc) + if result is not None: + self.outputs = {"result": result} + + def fail(self, error: BaseException) -> None: + self.status = SubTaskStatus.FAILED + self.job_completed = datetime.now(timezone.utc) + self.outputs = {"error": str(error)} diff --git a/domain/tasks/tasks.py b/domain/tasks/tasks.py new file mode 100644 index 00000000..177258d6 --- /dev/null +++ b/domain/tasks/tasks.py @@ -0,0 +1,94 @@ +from dataclasses import dataclass +from datetime import datetime, timezone +from enum import Enum +from typing import Optional +from uuid import UUID, uuid4 + +from domain.tasks.subtasks import SubTaskStatus + + +class TaskStatus(str, Enum): + WAITING = "waiting" + IN_PROGRESS = "in progress" + COMPLETE = "complete" + FAILED = "failed" + + +class Source(str, Enum): + PORTFOLIO = "portfolio_id" + HUBSPOT_DEAL = "hubspot_deal_id" + + +@dataclass +class Task: + id: UUID + task_source: str + status: TaskStatus = TaskStatus.WAITING + service: Optional[str] = None + source: Optional[Source] = None + source_id: Optional[str] = None + job_started: Optional[datetime] = None + job_completed: Optional[datetime] = None + + @classmethod + def create( + cls, + *, + task_source: str, + service: Optional[str] = None, + source: Optional[Source] = None, + source_id: Optional[str] = None, + ) -> "Task": + if not task_source.strip(): + raise ValueError("task_source must be non-empty") + return cls( + id=uuid4(), + task_source=task_source, + service=service, + source=source, + source_id=source_id, + status=TaskStatus.WAITING, + 
job_started=datetime.now(timezone.utc), + ) + + def start(self) -> None: + if self.status not in (TaskStatus.WAITING, TaskStatus.IN_PROGRESS): + raise ValueError(f"cannot start task in status {self.status}") + if self.job_started is None: + self.job_started = datetime.now(timezone.utc) + self.status = TaskStatus.IN_PROGRESS + + def complete(self) -> None: + self.status = TaskStatus.COMPLETE + self.job_completed = datetime.now(timezone.utc) + + def fail(self) -> None: + self.status = TaskStatus.FAILED + self.job_completed = datetime.now(timezone.utc) + + def recalculate_from_subtasks(self, statuses: list[SubTaskStatus]) -> None: + """Recompute Task.status from its SubTasks' statuses. + + Rule (preserved from legacy _update_task_progress): + - any FAILED → FAILED + - all COMPLETE → COMPLETE + - any IN_PROGRESS → IN_PROGRESS + - otherwise → WAITING + + Empty list is a no-op (newly-created task with no subtasks). + """ + if not statuses: + return + now = datetime.now(timezone.utc) + if SubTaskStatus.FAILED in statuses: + self.status = TaskStatus.FAILED + self.job_completed = now + elif all(s is SubTaskStatus.COMPLETE for s in statuses): + self.status = TaskStatus.COMPLETE + self.job_completed = now + elif SubTaskStatus.IN_PROGRESS in statuses: + self.status = TaskStatus.IN_PROGRESS + self.job_completed = None + else: + self.status = TaskStatus.WAITING + self.job_completed = None diff --git a/infrastructure/__init__.py b/infrastructure/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/postgres/__init__.py b/infrastructure/postgres/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/postgres/config.py b/infrastructure/postgres/config.py new file mode 100644 index 00000000..c39c6f30 --- /dev/null +++ b/infrastructure/postgres/config.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass +from typing import Mapping + + +@dataclass(frozen=True) +class PostgresConfig: + host: str + port: int + username: 
str + password: str + database: str + driver: str = "psycopg2" + pool_size: int = 3 + max_overflow: int = 5 + pool_pre_ping: bool = True + pool_recycle: int = 300 + + def url(self) -> str: + return ( + f"postgresql+{self.driver}://" + f"{self.username}:{self.password}@{self.host}:{self.port}/{self.database}" + ) + + @classmethod + def from_env(cls, env: Mapping[str, str]) -> "PostgresConfig": + return cls( + host=env["POSTGRES_HOST"], + port=int(env["POSTGRES_PORT"]), + username=env["POSTGRES_USERNAME"], + password=env["POSTGRES_PASSWORD"], + database=env["POSTGRES_DATABASE"], + driver=env.get("POSTGRES_DRIVER", "psycopg2"), + ) diff --git a/infrastructure/postgres/engine.py b/infrastructure/postgres/engine.py new file mode 100644 index 00000000..0de9efcb --- /dev/null +++ b/infrastructure/postgres/engine.py @@ -0,0 +1,18 @@ +from sqlalchemy.engine import Engine +from sqlmodel import Session, create_engine + +from infrastructure.postgres.config import PostgresConfig + + +def make_engine(config: PostgresConfig) -> Engine: + return create_engine( + config.url(), + pool_size=config.pool_size, + max_overflow=config.max_overflow, + pool_pre_ping=config.pool_pre_ping, + pool_recycle=config.pool_recycle, + ) + + +def make_session(engine: Engine) -> Session: + return Session(engine) diff --git a/infrastructure/postgres/subtask_table.py b/infrastructure/postgres/subtask_table.py new file mode 100644 index 00000000..dec34fbf --- /dev/null +++ b/infrastructure/postgres/subtask_table.py @@ -0,0 +1,21 @@ +from datetime import datetime, timezone +from typing import ClassVar, Optional +from uuid import UUID, uuid4 + +from sqlmodel import Field, SQLModel + + +class SubTaskRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "sub_task" # pyright: ignore[reportIncompatibleVariableOverride] + + id: UUID = Field(default_factory=uuid4, primary_key=True, index=True) + task_id: UUID = Field(foreign_key="tasks.id") + job_started: Optional[datetime] = None + job_completed: 
Optional[datetime] = None + status: str = Field(default="waiting") + inputs: Optional[str] = None + outputs: Optional[str] = None + cloud_logs_url: Optional[str] = None + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc) + ) diff --git a/infrastructure/postgres/task_table.py b/infrastructure/postgres/task_table.py new file mode 100644 index 00000000..32e5450b --- /dev/null +++ b/infrastructure/postgres/task_table.py @@ -0,0 +1,36 @@ +from datetime import datetime, timezone +from typing import ClassVar, Optional +from uuid import UUID, uuid4 + +from sqlalchemy import Column +from sqlalchemy import Enum as SAEnum +from sqlmodel import Field, SQLModel + +from domain.tasks.tasks import Source + + +class TaskRow(SQLModel, table=True): + __tablename__: ClassVar[str] = "tasks" # pyright: ignore[reportIncompatibleVariableOverride] + + id: UUID = Field(default_factory=uuid4, primary_key=True, index=True) + task_source: str + job_started: Optional[datetime] = None + job_completed: Optional[datetime] = None + status: str = Field(default="waiting") + service: Optional[str] = None + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc) + ) + + source: Optional[Source] = Field( + default=None, + sa_column=Column( + SAEnum( + Source, + name="source", + values_callable=lambda cls: [m.value for m in cls], # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType] + ), + nullable=True, + ), + ) + source_id: Optional[str] = None diff --git a/orchestration/__init__.py b/orchestration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/orchestration/task_orchestrator.py b/orchestration/task_orchestrator.py new file mode 100644 index 00000000..6c67d1ce --- /dev/null +++ b/orchestration/task_orchestrator.py @@ -0,0 +1,96 @@ +from typing import Any, Callable, Optional +from uuid import UUID + +from domain.tasks.subtasks import SubTask +from domain.tasks.tasks import Source, 
Task +from repositories.tasks.subtask_repository import SubTaskRepository +from repositories.tasks.task_repository import TaskRepository +from utilities.private import private + + +class TaskOrchestrator: + """Coordinates Task + SubTask lifecycle. + + Exposes primitives (start/complete/fail_subtask) for handlers that want + fine-grained control, and a high-level run_subtask wrapper that owns the + try/except so it can replace the body of the legacy subtask_handler + decorator in backend/utils/subtasks.py. + + Each primitive saves the SubTask, then recomputes the parent Task's + status from all its children. + """ + + def __init__( + self, + task_repo: TaskRepository, + subtask_repo: SubTaskRepository, + ) -> None: + self._tasks = task_repo + self._subtasks = subtask_repo + + def create_task_with_subtask( + self, + *, + task_source: str, + inputs: Optional[dict[str, Any]] = None, + service: Optional[str] = None, + source: Optional[Source] = None, + source_id: Optional[str] = None, + ) -> tuple[Task, SubTask]: + task = Task.create( + task_source=task_source, + service=service, + source=source, + source_id=source_id, + ) + self._tasks.create(task) + subtask = SubTask.create(task_id=task.id, inputs=inputs) + self._subtasks.create(subtask) + return task, subtask + + def start_subtask( + self, subtask_id: UUID, cloud_logs_url: Optional[str] = None + ) -> SubTask: + subtask = self._subtasks.get(subtask_id) + subtask.start(cloud_logs_url) + self._subtasks.save(subtask) + self._cascade(subtask.task_id) + return subtask + + def complete_subtask( + self, subtask_id: UUID, result: Any = None + ) -> SubTask: + subtask = self._subtasks.get(subtask_id) + subtask.complete(result) + self._subtasks.save(subtask) + self._cascade(subtask.task_id) + return subtask + + def fail_subtask(self, subtask_id: UUID, error: BaseException) -> SubTask: + subtask = self._subtasks.get(subtask_id) + subtask.fail(error) + self._subtasks.save(subtask) + self._cascade(subtask.task_id) + return subtask 
+ + def run_subtask( + self, + subtask_id: UUID, + work: Callable[[], Any], + cloud_logs_url: Optional[str] = None, + ) -> Any: + self.start_subtask(subtask_id, cloud_logs_url) + try: + result = work() + except Exception as e: + self.fail_subtask(subtask_id, e) + raise + self.complete_subtask(subtask_id, result) + return result + + @private + def _cascade(self, task_id: UUID) -> None: + statuses = [s.status for s in self._subtasks.list_by_task(task_id)] + task = self._tasks.get(task_id) + task.recalculate_from_subtasks(statuses) + self._tasks.save(task) diff --git a/repositories/__init__.py b/repositories/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/repositories/tasks/__init__.py b/repositories/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/repositories/tasks/subtask_postgres_repository.py b/repositories/tasks/subtask_postgres_repository.py new file mode 100644 index 00000000..affc280e --- /dev/null +++ b/repositories/tasks/subtask_postgres_repository.py @@ -0,0 +1,89 @@ +import json +from datetime import datetime, timezone +from typing import Any, Optional +from uuid import UUID + +from sqlmodel import Session, select + +from domain.tasks.subtasks import SubTask, SubTaskStatus +from infrastructure.postgres.subtask_table import SubTaskRow +from repositories.tasks.subtask_repository import SubTaskRepository +from utilities.private import private + + +class SubTaskPostgresRepository(SubTaskRepository): + def __init__(self, session: Session) -> None: + self._session = session + + def create(self, subtask: SubTask) -> SubTask: + row = self._to_row(subtask) + self._session.add(row) + self._session.commit() + self._session.refresh(row) + return self._to_domain(row) + + def get(self, subtask_id: UUID) -> SubTask: + row = self._session.get(SubTaskRow, subtask_id) + if row is None: + raise ValueError(f"SubTask {subtask_id} not found") + return self._to_domain(row) + + def save(self, subtask: SubTask) -> None: + row = 
self._session.get(SubTaskRow, subtask.id) + if row is None: + raise ValueError(f"SubTask {subtask.id} not found") + row.status = subtask.status.value + row.job_started = subtask.job_started + row.job_completed = subtask.job_completed + row.inputs = ( + json.dumps(subtask.inputs) if subtask.inputs is not None else None + ) + row.outputs = ( + json.dumps(subtask.outputs) if subtask.outputs is not None else None + ) + row.cloud_logs_url = subtask.cloud_logs_url + row.updated_at = datetime.now(timezone.utc) + self._session.add(row) + self._session.commit() + + def list_by_task(self, task_id: UUID) -> list[SubTask]: + rows = self._session.exec( + select(SubTaskRow).where(SubTaskRow.task_id == task_id) + ).all() + return [self._to_domain(r) for r in rows] + + @private + def _to_row(self, subtask: SubTask) -> SubTaskRow: + return SubTaskRow( + id=subtask.id, + task_id=subtask.task_id, + status=subtask.status.value, + inputs=( + json.dumps(subtask.inputs) if subtask.inputs is not None else None + ), + outputs=( + json.dumps(subtask.outputs) + if subtask.outputs is not None + else None + ), + cloud_logs_url=subtask.cloud_logs_url, + job_started=subtask.job_started, + job_completed=subtask.job_completed, + ) + + @private + def _to_domain(self, row: SubTaskRow) -> SubTask: + return SubTask( + id=row.id, + task_id=row.task_id, + status=SubTaskStatus(row.status.lower()), + inputs=_loads_or_none(row.inputs), + outputs=_loads_or_none(row.outputs), + cloud_logs_url=row.cloud_logs_url, + job_started=row.job_started, + job_completed=row.job_completed, + ) + + +def _loads_or_none(s: Optional[str]) -> Optional[dict[str, Any]]: + return json.loads(s) if s else None diff --git a/repositories/tasks/subtask_repository.py b/repositories/tasks/subtask_repository.py new file mode 100644 index 00000000..adb36f99 --- /dev/null +++ b/repositories/tasks/subtask_repository.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod +from uuid import UUID + +from domain.tasks.subtasks import SubTask 
+ + +class SubTaskRepository(ABC): + @abstractmethod + def create(self, subtask: SubTask) -> SubTask: ... + + @abstractmethod + def get(self, subtask_id: UUID) -> SubTask: ... + + @abstractmethod + def save(self, subtask: SubTask) -> None: ... + + @abstractmethod + def list_by_task(self, task_id: UUID) -> list[SubTask]: ... diff --git a/repositories/tasks/task_postgres_repository.py b/repositories/tasks/task_postgres_repository.py new file mode 100644 index 00000000..d23fe91c --- /dev/null +++ b/repositories/tasks/task_postgres_repository.py @@ -0,0 +1,77 @@ +""" +Postgres implementation of TaskRepository. + +NOTE: this repository owns only the `tasks` table. Unlike the legacy +backend.app.db.functions.tasks.Tasks.TasksInterface.create_task, it does NOT +auto-create a child SubTask. Do not rewire existing Lambda callers to this +repo until the SubTask aggregate + TaskOrchestrator slice lands — they would +silently lose their initial SubTask row. +""" + +from datetime import datetime, timezone +from uuid import UUID + +from sqlmodel import Session + +from domain.tasks.tasks import Task, TaskStatus +from infrastructure.postgres.task_table import TaskRow +from repositories.tasks.task_repository import TaskRepository +from utilities.private import private + + +class TaskPostgresRepository(TaskRepository): + def __init__(self, session: Session) -> None: + self._session = session + + def create(self, task: Task) -> Task: + row = self._to_row(task) + self._session.add(row) + self._session.commit() + self._session.refresh(row) + return self._to_domain(row) + + def get(self, task_id: UUID) -> Task: + row = self._session.get(TaskRow, task_id) + if row is None: + raise ValueError(f"Task {task_id} not found") + return self._to_domain(row) + + def save(self, task: Task) -> None: + row = self._session.get(TaskRow, task.id) + if row is None: + raise ValueError(f"Task {task.id} not found") + row.status = task.status.value + row.job_started = task.job_started + row.job_completed = 
task.job_completed + row.service = task.service + row.source = task.source + row.source_id = task.source_id + row.updated_at = datetime.now(timezone.utc) + self._session.add(row) + self._session.commit() + + @private + def _to_row(self, task: Task) -> TaskRow: + return TaskRow( + id=task.id, + task_source=task.task_source, + status=task.status.value, + service=task.service, + source=task.source, + source_id=task.source_id, + job_started=task.job_started, + job_completed=task.job_completed, + ) + + @private + def _to_domain(self, row: TaskRow) -> Task: + return Task( + id=row.id, + task_source=row.task_source, + status=TaskStatus(row.status.lower()), + service=row.service, + source=row.source, + source_id=row.source_id, + job_started=row.job_started, + job_completed=row.job_completed, + ) diff --git a/repositories/tasks/task_repository.py b/repositories/tasks/task_repository.py new file mode 100644 index 00000000..8bdce0cc --- /dev/null +++ b/repositories/tasks/task_repository.py @@ -0,0 +1,15 @@ +from abc import ABC, abstractmethod +from uuid import UUID + +from domain.tasks.tasks import Task + + +class TaskRepository(ABC): + @abstractmethod + def create(self, task: Task) -> Task: ... + + @abstractmethod + def get(self, task_id: UUID) -> Task: ... + + @abstractmethod + def save(self, task: Task) -> None: ... 
diff --git a/run_backlog.sh b/run_backlog.sh deleted file mode 100644 index 398e921c..00000000 --- a/run_backlog.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -backlog browser --port 6421 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/domain/__init__.py b/tests/domain/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/domain/tasks/__init__.py b/tests/domain/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/domain/tasks/test_subtasks.py b/tests/domain/tasks/test_subtasks.py new file mode 100644 index 00000000..2721d38f --- /dev/null +++ b/tests/domain/tasks/test_subtasks.py @@ -0,0 +1,75 @@ +from uuid import uuid4 + +import pytest + +from domain.tasks.subtasks import SubTask, SubTaskStatus + + +def test_create_subtask_starts_waiting() -> None: + task_id = uuid4() + + st = SubTask.create(task_id=task_id, inputs={"foo": "bar"}) + + assert st.task_id == task_id + assert st.status is SubTaskStatus.WAITING + assert st.inputs == {"foo": "bar"} + assert st.outputs is None + assert st.job_started is None + assert st.job_completed is None + + +def test_start_transitions_to_in_progress_and_sets_cloud_logs_url() -> None: + st = SubTask.create(task_id=uuid4()) + + st.start(cloud_logs_url="https://example/log") + + assert st.status is SubTaskStatus.IN_PROGRESS + assert st.cloud_logs_url == "https://example/log" + assert st.job_started is not None + + +def test_start_is_idempotent_from_in_progress() -> None: + st = SubTask.create(task_id=uuid4()) + st.start() + first_start = st.job_started + + st.start(cloud_logs_url="https://other") + + assert st.status is SubTaskStatus.IN_PROGRESS + assert st.job_started == first_start # not overwritten + assert st.cloud_logs_url == "https://other" + + +def test_start_rejects_from_terminal_status() -> None: + st = SubTask.create(task_id=uuid4()) + st.complete() + with pytest.raises(ValueError): + st.start() + + +def 
test_complete_marks_outputs_and_job_completed() -> None: + st = SubTask.create(task_id=uuid4()) + st.start() + + st.complete({"uprn": "123"}) + + assert st.status is SubTaskStatus.COMPLETE + assert st.outputs == {"result": {"uprn": "123"}} + assert st.job_completed is not None + + +def test_complete_without_result_leaves_outputs_unset() -> None: + st = SubTask.create(task_id=uuid4()) + st.complete() + assert st.outputs is None + + +def test_fail_records_error_in_outputs() -> None: + st = SubTask.create(task_id=uuid4()) + err = RuntimeError("boom") + + st.fail(err) + + assert st.status is SubTaskStatus.FAILED + assert st.outputs == {"error": "boom"} + assert st.job_completed is not None diff --git a/tests/domain/tasks/test_tasks.py b/tests/domain/tasks/test_tasks.py new file mode 100644 index 00000000..f30c0aa1 --- /dev/null +++ b/tests/domain/tasks/test_tasks.py @@ -0,0 +1,104 @@ +import pytest + +from domain.tasks.subtasks import SubTaskStatus +from domain.tasks.tasks import Source, Task, TaskStatus + + +def test_create_task_starts_waiting() -> None: + # Arrange / Act + t = Task.create( + task_source="manual:test", source=Source.PORTFOLIO, source_id="abc-123" + ) + + # Assert + assert t.status is TaskStatus.WAITING + assert t.source is Source.PORTFOLIO + assert t.source_id == "abc-123" + assert t.job_started is not None + assert t.job_completed is None + + +def test_create_task_rejects_blank_task_source() -> None: + with pytest.raises(ValueError, match="task_source"): + Task.create(task_source=" ") + + +def test_start_transitions_to_in_progress() -> None: + t = Task.create(task_source="manual:test") + t.start() + assert t.status is TaskStatus.IN_PROGRESS + + +def test_complete_marks_job_completed() -> None: + t = Task.create(task_source="manual:test") + t.start() + t.complete() + assert t.status is TaskStatus.COMPLETE + assert t.job_completed is not None + + +def test_fail_marks_job_completed() -> None: + t = Task.create(task_source="manual:test") + t.fail() + 
assert t.status is TaskStatus.FAILED + assert t.job_completed is not None + + +def test_start_rejects_from_terminal_status() -> None: + t = Task.create(task_source="manual:test") + t.complete() + with pytest.raises(ValueError): + t.start() + + +def test_recalculate_with_empty_statuses_is_noop() -> None: + t = Task.create(task_source="manual:test") + original_status = t.status + original_completed = t.job_completed + + t.recalculate_from_subtasks([]) + + assert t.status is original_status + assert t.job_completed is original_completed + + +def test_recalculate_all_waiting_keeps_waiting() -> None: + t = Task.create(task_source="manual:test") + t.start() # task moved to IN_PROGRESS earlier + t.complete() # then COMPLETE, with job_completed set + + t.recalculate_from_subtasks([SubTaskStatus.WAITING, SubTaskStatus.WAITING]) + + assert t.status is TaskStatus.WAITING + assert t.job_completed is None + + +def test_recalculate_any_in_progress_marks_in_progress() -> None: + t = Task.create(task_source="manual:test") + + t.recalculate_from_subtasks( + [SubTaskStatus.WAITING, SubTaskStatus.IN_PROGRESS, SubTaskStatus.COMPLETE] + ) + + assert t.status is TaskStatus.IN_PROGRESS + assert t.job_completed is None + + +def test_recalculate_all_complete_marks_complete() -> None: + t = Task.create(task_source="manual:test") + + t.recalculate_from_subtasks([SubTaskStatus.COMPLETE, SubTaskStatus.COMPLETE]) + + assert t.status is TaskStatus.COMPLETE + assert t.job_completed is not None + + +def test_recalculate_any_failed_marks_failed_even_with_others() -> None: + t = Task.create(task_source="manual:test") + + t.recalculate_from_subtasks( + [SubTaskStatus.IN_PROGRESS, SubTaskStatus.COMPLETE, SubTaskStatus.FAILED] + ) + + assert t.status is TaskStatus.FAILED + assert t.job_completed is not None diff --git a/tests/orchestration/__init__.py b/tests/orchestration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/orchestration/test_task_orchestrator.py 
b/tests/orchestration/test_task_orchestrator.py new file mode 100644 index 00000000..1a48127f --- /dev/null +++ b/tests/orchestration/test_task_orchestrator.py @@ -0,0 +1,151 @@ +from collections.abc import Iterator +from dataclasses import dataclass + +import pytest +from sqlmodel import Session, SQLModel, create_engine + +from domain.tasks.subtasks import SubTask, SubTaskStatus +from domain.tasks.tasks import Source, TaskStatus +from orchestration.task_orchestrator import TaskOrchestrator +from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository +from repositories.tasks.task_postgres_repository import TaskPostgresRepository + + +@dataclass +class Harness: + orchestrator: TaskOrchestrator + tasks: TaskPostgresRepository + subtasks: SubTaskPostgresRepository + + +@pytest.fixture +def harness() -> Iterator[Harness]: + engine = create_engine("sqlite://") + SQLModel.metadata.create_all(engine) + with Session(engine) as session: + tasks = TaskPostgresRepository(session=session) + subtasks = SubTaskPostgresRepository(session=session) + yield Harness( + orchestrator=TaskOrchestrator(task_repo=tasks, subtask_repo=subtasks), + tasks=tasks, + subtasks=subtasks, + ) + + +def test_create_task_with_subtask_creates_both_in_waiting( + harness: Harness, +) -> None: + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test", + inputs={"foo": "bar"}, + source=Source.PORTFOLIO, + source_id="abc", + ) + + assert task.status is TaskStatus.WAITING + assert subtask.status is SubTaskStatus.WAITING + assert subtask.task_id == task.id + assert subtask.inputs == {"foo": "bar"} + + +def test_start_subtask_cascades_to_in_progress(harness: Harness) -> None: + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + started = harness.orchestrator.start_subtask( + subtask.id, cloud_logs_url="https://example/log" + ) + + assert started.status is SubTaskStatus.IN_PROGRESS + assert 
started.cloud_logs_url == "https://example/log" + assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS + + +def test_complete_subtask_cascades_to_complete(harness: Harness) -> None: + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + harness.orchestrator.start_subtask(subtask.id) + + harness.orchestrator.complete_subtask(subtask.id, {"value": 42}) + + done_subtask = harness.subtasks.get(subtask.id) + done_task = harness.tasks.get(task.id) + assert done_subtask.outputs == {"result": {"value": 42}} + assert done_task.status is TaskStatus.COMPLETE + assert done_task.job_completed is not None + + +def test_fail_subtask_cascades_to_failed(harness: Harness) -> None: + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + harness.orchestrator.fail_subtask(subtask.id, RuntimeError("boom")) + + failed_subtask = harness.subtasks.get(subtask.id) + failed_task = harness.tasks.get(task.id) + assert failed_subtask.outputs == {"error": "boom"} + assert failed_task.status is TaskStatus.FAILED + + +def test_failed_subtask_locks_task_failed_even_with_others_complete( + harness: Harness, +) -> None: + task, first = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + second = SubTask.create(task_id=task.id) + harness.subtasks.create(second) + + harness.orchestrator.complete_subtask(first.id) + harness.orchestrator.fail_subtask(second.id, RuntimeError("nope")) + + assert harness.tasks.get(task.id).status is TaskStatus.FAILED + + +def test_mixed_complete_and_in_progress_keeps_task_in_progress( + harness: Harness, +) -> None: + task, first = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + second = SubTask.create(task_id=task.id) + harness.subtasks.create(second) + + harness.orchestrator.complete_subtask(first.id) + harness.orchestrator.start_subtask(second.id) + + assert harness.tasks.get(task.id).status is 
TaskStatus.IN_PROGRESS + + +def test_run_subtask_happy_path_returns_result_and_cascades_complete( + harness: Harness, +) -> None: + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + result = harness.orchestrator.run_subtask(subtask.id, work=lambda: {"answer": 42}) + + assert result == {"answer": 42} + assert harness.subtasks.get(subtask.id).status is SubTaskStatus.COMPLETE + assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE + + +def test_run_subtask_failing_work_marks_failed_and_reraises( + harness: Harness, +) -> None: + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + def boom() -> None: + raise RuntimeError("boom") + + with pytest.raises(RuntimeError, match="boom"): + harness.orchestrator.run_subtask(subtask.id, work=boom) + + assert harness.subtasks.get(subtask.id).status is SubTaskStatus.FAILED + assert harness.tasks.get(task.id).status is TaskStatus.FAILED diff --git a/tests/repositories/__init__.py b/tests/repositories/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/tasks/__init__.py b/tests/repositories/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/tasks/postgres/__init__.py b/tests/repositories/tasks/postgres/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py b/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py new file mode 100644 index 00000000..ac39e089 --- /dev/null +++ b/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py @@ -0,0 +1,81 @@ +from collections.abc import Iterator +from uuid import uuid4 + +import pytest +from sqlmodel import Session, SQLModel, create_engine + +# Importing the SQLModel row modules registers their tables in +# SQLModel.metadata so create_all builds both. Imports look unused; they aren't. 
+import infrastructure.postgres.subtask_table # noqa: F401 # pyright: ignore[reportUnusedImport] +import infrastructure.postgres.task_table # noqa: F401 # pyright: ignore[reportUnusedImport] +from domain.tasks.subtasks import SubTask, SubTaskStatus +from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository + + +@pytest.fixture +def session() -> Iterator[Session]: + engine = create_engine("sqlite://") + SQLModel.metadata.create_all(engine) + with Session(engine) as s: + yield s + + +def test_create_and_get_round_trip_preserves_inputs(session: Session) -> None: + repo = SubTaskPostgresRepository(session=session) + task_id = uuid4() + st = SubTask.create(task_id=task_id, inputs={"address": "68 Glendon Way"}) + + repo.create(st) + fetched = repo.get(st.id) + + assert fetched.id == st.id + assert fetched.task_id == task_id + assert fetched.status is SubTaskStatus.WAITING + assert fetched.inputs == {"address": "68 Glendon Way"} + assert fetched.outputs is None + + +def test_save_persists_status_and_outputs(session: Session) -> None: + repo = SubTaskPostgresRepository(session=session) + st = SubTask.create(task_id=uuid4()) + repo.create(st) + + st.start(cloud_logs_url="https://example/log") + repo.save(st) + assert repo.get(st.id).status is SubTaskStatus.IN_PROGRESS + + st.complete({"uprn": "123"}) + repo.save(st) + done = repo.get(st.id) + assert done.status is SubTaskStatus.COMPLETE + assert done.outputs == {"result": {"uprn": "123"}} + assert done.cloud_logs_url == "https://example/log" + assert done.job_completed is not None + + +def test_list_by_task_filters_by_task_id(session: Session) -> None: + repo = SubTaskPostgresRepository(session=session) + task_a = uuid4() + task_b = uuid4() + repo.create(SubTask.create(task_id=task_a)) + repo.create(SubTask.create(task_id=task_a)) + repo.create(SubTask.create(task_id=task_b)) + + a_results = repo.list_by_task(task_a) + b_results = repo.list_by_task(task_b) + + assert len(a_results) == 2 + assert 
len(b_results) == 1 + assert all(s.task_id == task_a for s in a_results) + assert all(s.task_id == task_b for s in b_results) + + +def test_list_by_task_returns_empty_for_unknown_task(session: Session) -> None: + repo = SubTaskPostgresRepository(session=session) + assert repo.list_by_task(uuid4()) == [] + + +def test_get_missing_raises(session: Session) -> None: + repo = SubTaskPostgresRepository(session=session) + with pytest.raises(ValueError, match="not found"): + repo.get(uuid4()) diff --git a/tests/repositories/tasks/postgres/test_task_postgres_repository.py b/tests/repositories/tasks/postgres/test_task_postgres_repository.py new file mode 100644 index 00000000..3e1aa226 --- /dev/null +++ b/tests/repositories/tasks/postgres/test_task_postgres_repository.py @@ -0,0 +1,68 @@ +from collections.abc import Iterator +from uuid import uuid4 + +import pytest +from sqlmodel import Session, SQLModel, create_engine + +from domain.tasks.tasks import Source, Task, TaskStatus +from infrastructure.postgres.task_table import TaskRow +from repositories.tasks.task_postgres_repository import TaskPostgresRepository + + +@pytest.fixture +def session() -> Iterator[Session]: + engine = create_engine("sqlite://") + SQLModel.metadata.create_all(engine) + with Session(engine) as s: + yield s + + +def test_create_and_get_round_trip(session: Session) -> None: + # Arrange + repo = TaskPostgresRepository(session=session) + t = Task.create( + task_source="manual:test", source=Source.PORTFOLIO, source_id="abc-123" + ) + + # Act + repo.create(t) + fetched = repo.get(t.id) + + # Assert + assert fetched.id == t.id + assert fetched.status is TaskStatus.WAITING + assert fetched.source is Source.PORTFOLIO + assert fetched.source_id == "abc-123" + + +def test_save_persists_status_transition(session: Session) -> None: + repo = TaskPostgresRepository(session=session) + t = Task.create(task_source="manual:test") + repo.create(t) + + t.start() + repo.save(t) + assert repo.get(t.id).status is 
TaskStatus.IN_PROGRESS + + t.complete() + repo.save(t) + done = repo.get(t.id) + assert done.status is TaskStatus.COMPLETE + assert done.job_completed is not None + + +def test_get_missing_raises(session: Session) -> None: + repo = TaskPostgresRepository(session=session) + with pytest.raises(ValueError, match="not found"): + repo.get(uuid4()) + + +def test_get_normalises_legacy_capitalised_status(session: Session) -> None: + # Existing rows written by backend code use "In Progress" (capitalised). + repo = TaskPostgresRepository(session=session) + row = TaskRow(task_source="manual:test", status="In Progress") + session.add(row) + session.commit() + + fetched = repo.get(row.id) + assert fetched.status is TaskStatus.IN_PROGRESS diff --git a/utilities/__init__.py b/utilities/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utilities/aws_lambda/__init__.py b/utilities/aws_lambda/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utilities/aws_lambda/default_orchestrator.py b/utilities/aws_lambda/default_orchestrator.py new file mode 100644 index 00000000..f78886b9 --- /dev/null +++ b/utilities/aws_lambda/default_orchestrator.py @@ -0,0 +1,26 @@ +import os +from collections.abc import Generator +from contextlib import contextmanager + +from sqlmodel import Session + +from infrastructure.postgres.config import PostgresConfig +from infrastructure.postgres.engine import make_engine +from orchestration.task_orchestrator import TaskOrchestrator +from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository +from repositories.tasks.task_postgres_repository import TaskPostgresRepository + + +@contextmanager +def default_orchestrator() -> Generator[TaskOrchestrator, None, None]: + """Yield a TaskOrchestrator wired to a fresh Postgres session. + + Connection params come from os.environ via PostgresConfig.from_env. Each + handler invocation gets its own session, cleaned up on context exit. 
+ """ + engine = make_engine(PostgresConfig.from_env(dict(os.environ))) + with Session(engine) as session: + yield TaskOrchestrator( + task_repo=TaskPostgresRepository(session=session), + subtask_repo=SubTaskPostgresRepository(session=session), + ) diff --git a/utilities/aws_lambda/subtask_handler.py b/utilities/aws_lambda/subtask_handler.py new file mode 100644 index 00000000..64c1daa6 --- /dev/null +++ b/utilities/aws_lambda/subtask_handler.py @@ -0,0 +1,67 @@ +"""@subtask_handler decorator for Lambdas that operate on existing SubTasks. + +Translates an AWS Lambda invocation (SQS-shaped or direct) into +TaskOrchestrator.run_subtask(...) calls. +""" + +import json +from contextlib import AbstractContextManager +from functools import wraps +from typing import Any, Callable, Optional, cast + +from utilities.aws_lambda.default_orchestrator import default_orchestrator +from utilities.aws_lambda.subtask_trigger_body import SubtaskTriggerBody +from orchestration.task_orchestrator import TaskOrchestrator + +OrchestratorCM = Callable[[], AbstractContextManager[TaskOrchestrator]] + + +def subtask_handler( + *, + orchestrator_cm: Optional[OrchestratorCM] = None, +) -> Callable[[Callable[..., Any]], Callable[..., Any]]: + """Run the wrapped function as the body of an existing SubTask. + + For each record, validates the body via SubtaskTriggerBody (must contain + task_id and sub_task_id), then runs the function inside + orchestrator.run_subtask(...). The orchestrator owns the start/complete/ + fail lifecycle and cascades status into the parent Task. On failure the + underlying exception propagates after the SubTask is marked FAILED. 
+ """ + factory = orchestrator_cm or default_orchestrator + + def decorator(func: Callable[..., Any]) -> Callable[..., Any]: + @wraps(func) + def wrapper(event: dict[str, Any], context: Any) -> None: + with factory() as orchestrator: + for record in _records(event): + body = _parse_body(record) + trigger = SubtaskTriggerBody.model_validate(body) + orchestrator.run_subtask( + trigger.sub_task_id, + work=lambda body=body: func(body, context), + ) + + return wrapper + + return decorator + + +def _parse_body(record: dict[str, Any]) -> dict[str, Any]: + raw = record.get("body", record) + if isinstance(raw, str): + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return {} + return cast(dict[str, Any], parsed) if isinstance(parsed, dict) else {} + if isinstance(raw, dict): + return cast(dict[str, Any], raw) + return {} + + +def _records(event: dict[str, Any]) -> list[dict[str, Any]]: + raw_records = event.get("Records") + if isinstance(raw_records, list): + return [r for r in cast(list[Any], raw_records) if isinstance(r, dict)] + return [event] diff --git a/utilities/aws_lambda/subtask_trigger_body.py b/utilities/aws_lambda/subtask_trigger_body.py new file mode 100644 index 00000000..a6b539e5 --- /dev/null +++ b/utilities/aws_lambda/subtask_trigger_body.py @@ -0,0 +1,17 @@ +from uuid import UUID + +from pydantic import BaseModel, ConfigDict + + +class SubtaskTriggerBody(BaseModel): + """The minimum the subtask_handler needs to dispatch lifecycle calls. + + `extra="allow"` so the rest of the work payload passes through to the + decorated function untouched — handlers do their own model_validate on + the full body for fields specific to their use case. 
+ """ + + model_config = ConfigDict(extra="allow") + + task_id: UUID + sub_task_id: UUID diff --git a/utilities/aws_lambda/task_handler.py b/utilities/aws_lambda/task_handler.py new file mode 100644 index 00000000..82c7198e --- /dev/null +++ b/utilities/aws_lambda/task_handler.py @@ -0,0 +1,98 @@ +"""@task_handler decorator for Lambdas that own the entire pipeline. + +Translates an AWS Lambda invocation (SQS-shaped or direct) into +TaskOrchestrator.create_task_with_subtask(...) + run_subtask(...). +""" + +import json +from contextlib import AbstractContextManager +from functools import wraps +from typing import Any, Callable, Optional, cast + +from utilities.aws_lambda.default_orchestrator import default_orchestrator +from domain.tasks.tasks import Source +from orchestration.task_orchestrator import TaskOrchestrator + +OrchestratorCM = Callable[[], AbstractContextManager[TaskOrchestrator]] + + +def task_handler( + *, + task_source: str, + source: Source, + orchestrator_cm: Optional[OrchestratorCM] = None, +) -> Callable[[Callable[..., Any]], Callable[..., Any]]: + """Run the wrapped function as the body of a freshly-created Task + SubTask. + + For each record, creates a new Task + initial SubTask, then runs the + wrapped function inside orchestrator.run_subtask(...). `source_id` is + read from body[source.value] (silent None if absent — preserved from + legacy ADR-0001). + + Records-style events use SQS partial-batch-failure semantics: individual + failures are reported via {"batchItemFailures": [...]} rather than + propagating. Direct invocations re-raise. 
+ """ + factory = orchestrator_cm or default_orchestrator + + def decorator(func: Callable[..., Any]) -> Callable[..., Any]: + @wraps(func) + def wrapper(event: dict[str, Any], context: Any) -> Any: + with factory() as orchestrator: + results: list[Any] = [] + failures: list[dict[str, Any]] = [] + + for record in _records(event): + body = _parse_body(record) + raw_source_id = body.get(source.value) + source_id = ( + str(raw_source_id) if raw_source_id is not None else None + ) + + _, subtask = orchestrator.create_task_with_subtask( + task_source=task_source, + inputs=body, + source=source, + source_id=source_id, + ) + + try: + result = orchestrator.run_subtask( + subtask.id, + work=lambda body=body: func(body, context), + ) + results.append(result) + except Exception: + if "Records" in event: + message_id = record.get("messageId", "") + failures.append({"itemIdentifier": message_id}) + else: + raise + + if "Records" in event: + return {"batchItemFailures": failures} + return results + + return wrapper + + return decorator + + +def _parse_body(record: dict[str, Any]) -> dict[str, Any]: + raw = record.get("body", record) + if isinstance(raw, str): + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return {} + return cast(dict[str, Any], parsed) if isinstance(parsed, dict) else {} + if isinstance(raw, dict): + return cast(dict[str, Any], raw) + return {} + + +def _records(event: dict[str, Any]) -> list[dict[str, Any]]: + raw_records = event.get("Records") + if isinstance(raw_records, list): + return [r for r in cast(list[Any], raw_records) if isinstance(r, dict)] + return [event] diff --git a/utilities/private.py b/utilities/private.py new file mode 100644 index 00000000..77a70578 --- /dev/null +++ b/utilities/private.py @@ -0,0 +1,33 @@ +import inspect +from typing import Any, Callable + + +class private: + """Decorator that raises if a _-prefixed method is called from outside its class.""" + + func: Callable[..., Any] + name: str + owner: type + + 
def __init__(self, func: Callable[..., Any]) -> None: + self.func = func + self.name = getattr(func, "__name__", "") + + def __set_name__(self, owner: type, name: str) -> None: + self.owner = owner + + def __get__(self, instance: Any, owner: type) -> Callable[..., Any]: + # Walk up one frame to see who's calling + frame = inspect.currentframe() + if frame is None or frame.f_back is None: + raise RuntimeError("cannot inspect caller frame") + caller_frame = frame.f_back + caller_self = caller_frame.f_locals.get("self") + + if not isinstance(caller_self, self.owner): + raise RuntimeError( + f"{self.owner.__name__}.{self.name} is private; " + f"called from {caller_frame.f_code.co_name}" + ) + + return getattr(self.func, "__get__")(instance, owner) From 6198d7a46db83ecf2b74e2b260fd0b0923010b39 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 19 May 2026 16:45:47 +0000 Subject: [PATCH 70/91] postcode_splitter: pure domain (UserAddress, sanitise_postcode, postcode_batching) Slice 1/6 of the postcode_splitter refactor (Hestia-Homes/Model#1100). Introduces the pure-domain foundation under domain/, with no AWS, Postgres, or pandas. UserAddress is a frozen dataclass that sanitises its postcode in __post_init__ via the canonical sanitise_postcode helper, and iter_postcode_grouped_batches preserves the legacy splitter's batching invariants (group-by-postcode in insertion order, never split a group, oversize single-postcode groups dispatched whole, final flush). Updates UBIQUITOUS_LANGUAGE.md so the User Address term covers both the dataclass sense (preferred in domain code) and the raw upstream-string sense. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- UBIQUITOUS_LANGUAGE.md | 4 +- domain/addresses/__init__.py | 0 domain/addresses/postcode_batching.py | 87 +++++++++++++++++ domain/addresses/user_address.py | 36 +++++++ domain/postcodes/__init__.py | 0 domain/postcodes/sanitise.py | 23 +++++ tests/domain/addresses/__init__.py | 0 .../addresses/test_postcode_batching.py | 93 +++++++++++++++++++ tests/domain/addresses/test_user_address.py | 45 +++++++++ tests/domain/postcodes/__init__.py | 0 tests/domain/postcodes/test_sanitise.py | 28 ++++++ 11 files changed, 314 insertions(+), 2 deletions(-) create mode 100644 domain/addresses/__init__.py create mode 100644 domain/addresses/postcode_batching.py create mode 100644 domain/addresses/user_address.py create mode 100644 domain/postcodes/__init__.py create mode 100644 domain/postcodes/sanitise.py create mode 100644 tests/domain/addresses/__init__.py create mode 100644 tests/domain/addresses/test_postcode_batching.py create mode 100644 tests/domain/addresses/test_user_address.py create mode 100644 tests/domain/postcodes/__init__.py create mode 100644 tests/domain/postcodes/test_sanitise.py diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md index 1765cbc8..c3074c02 100644 --- a/UBIQUITOUS_LANGUAGE.md +++ b/UBIQUITOUS_LANGUAGE.md @@ -23,7 +23,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve |------|------------|------------------| | **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" | | **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" | -| **User Address** | A free-text address string provided by a user or imported from a customer dataset, before any normalisation or matching. 
| "user input", "raw address", "user_inputed_address" | +| **User Address** | A structured dataclass (`domain.addresses.user_address.UserAddress`) capturing a customer-supplied address: a free-text `user_address` line, a canonical `postcode` (sanitised on construction), and an optional `internal_reference`. The bare string sense -- the raw free-text address line as it arrives from upstream ingestion, before being wrapped -- remains valid when discussing CSV columns, API payloads, or other upstream contexts; in domain code, prefer the dataclass. | "user input", "raw address", "user_inputed_address" | | **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" | ## Address Matching @@ -72,7 +72,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve ## Flagged ambiguities -- **"address"** appears as both the raw **User Address** (free-text from customer data) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". +- **"address"** appears as both the raw **User Address** (free-text from customer data, or the structured `UserAddress` dataclass that wraps it) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". Within `domain/`, **User Address** specifically means the `UserAddress` dataclass; in upstream ingestion contexts (CSV columns, SQS payloads) it can still mean the raw string sense. - **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments. - **"user_inputed_address"** in `backend/address2UPRN/main.py` is a misspelling and a synonym for **User Address** — the canonical term. New code should use `user_address`. 
- **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter. diff --git a/domain/addresses/__init__.py b/domain/addresses/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py new file mode 100644 index 00000000..209e0784 --- /dev/null +++ b/domain/addresses/postcode_batching.py @@ -0,0 +1,87 @@ +"""Pure-Python postcode-grouped batching. + +This module preserves the batching invariants from the legacy postcode +splitter (``backend/postcode_splitter/main.py``) without touching pandas, +S3, or SQS: + + * Addresses are grouped by **Postcode** in *insertion order* -- the first + Postcode seen produces the first group. + * A Postcode group is never split across two batches. + * If a single Postcode group is larger than ``max_batch_size``, it is + flushed as its own oversize batch (any buffered groups go out first, + untouched). + * Adding a group that would push the buffer past ``max_batch_size`` first + flushes the existing buffer, then starts a new buffer with the group. + * Whatever remains in the buffer after the input is exhausted is flushed + as the final batch. + * Empty input yields no batches. +""" + +from __future__ import annotations + +from collections.abc import Iterable, Iterator + +from domain.addresses.user_address import UserAddress + + +def iter_postcode_grouped_batches( + addresses: Iterable[UserAddress], + *, + max_batch_size: int = 500, +) -> Iterator[list[UserAddress]]: + """Yield batches of ``UserAddress`` grouped by Postcode. + + Args: + addresses: An iterable of :class:`UserAddress`. Order is preserved + within each Postcode group, and groups are yielded in the order + their first member was seen. + max_batch_size: The soft upper bound on batch size, in number of + addresses. 
A single Postcode group larger than this cap is + dispatched whole (the cap is never used to split a group). + + Yields: + Lists of ``UserAddress``. Each list is non-empty. + """ + if max_batch_size < 1: + raise ValueError("max_batch_size must be >= 1") + + groups = _group_by_postcode_in_order(addresses) + + buffer: list[UserAddress] = [] + for group in groups.values(): + group_len = len(group) + + # Oversize single-Postcode group: flush buffer first, then dispatch + # the group as its own batch. Mirrors the legacy + # ``if group_len >= batch_size`` branch. + if group_len >= max_batch_size: + if buffer: + yield buffer + buffer = [] + yield group + continue + + # Adding this group would overflow: flush buffer before appending. + if len(buffer) + group_len > max_batch_size: + yield buffer + buffer = [] + + buffer.extend(group) + + # Final flush. + if buffer: + yield buffer + + +def _group_by_postcode_in_order( + addresses: Iterable[UserAddress], +) -> dict[str, list[UserAddress]]: + """Group addresses by ``postcode`` preserving first-seen order. + + Python dicts retain insertion order since 3.7, so a plain dict suffices + for the same effect as pandas ``groupby(..., sort=False)``. + """ + groups: dict[str, list[UserAddress]] = {} + for address in addresses: + groups.setdefault(address.postcode, []).append(address) + return groups diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py new file mode 100644 index 00000000..e48dfdec --- /dev/null +++ b/domain/addresses/user_address.py @@ -0,0 +1,36 @@ +"""The :class:`UserAddress` value object. + +A frozen dataclass capturing the splitter's domain entity: the raw input +address line, a sanitised postcode, and an optional internal reference from +the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no +caller can construct an instance with an un-normalised postcode. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + +from domain.postcodes.sanitise import sanitise_postcode + + +@dataclass(frozen=True) +class UserAddress: + """A user-supplied address paired with its canonical postcode. + + Attributes: + user_address: The free-text address string as supplied upstream. + postcode: The postcode; always stored in canonical form + (uppercased, whitespace stripped). Sanitisation is enforced by + :meth:`__post_init__`. + internal_reference: Optional customer-side identifier preserved for + traceability through the matching pipeline. + """ + + user_address: str + postcode: str + internal_reference: Optional[str] = None + + def __post_init__(self) -> None: + # Frozen dataclass: bypass the frozen __setattr__ via object.__setattr__. + object.__setattr__(self, "postcode", sanitise_postcode(self.postcode)) diff --git a/domain/postcodes/__init__.py b/domain/postcodes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/domain/postcodes/sanitise.py b/domain/postcodes/sanitise.py new file mode 100644 index 00000000..94b0dcf7 --- /dev/null +++ b/domain/postcodes/sanitise.py @@ -0,0 +1,23 @@ +"""Canonical postcode sanitisation for the domain layer. + +The legacy postcode_splitter normalises postcodes inline with +``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes +that operation to a pure, reusable function so the same canonical form is +applied wherever a postcode crosses a domain boundary -- including +:class:`domain.addresses.user_address.UserAddress` construction and future +migrations. +""" + +from __future__ import annotations + + +def sanitise_postcode(s: str) -> str: + """Return the canonical form of a postcode. + + The canonical form is uppercase with all whitespace removed. This matches + the legacy splitter's ``str.upper().str.replace(" ", "")`` for the + overwhelmingly common case of space-separated postcodes (e.g. 
``"sw1a 1aa"`` + becomes ``"SW1A1AA"``) while also tolerating tabs/newlines that can creep + in from CSV ingestion. + """ + return "".join(s.split()).upper() diff --git a/tests/domain/addresses/__init__.py b/tests/domain/addresses/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py new file mode 100644 index 00000000..2dac46cc --- /dev/null +++ b/tests/domain/addresses/test_postcode_batching.py @@ -0,0 +1,93 @@ +import pytest + +from domain.addresses.postcode_batching import iter_postcode_grouped_batches +from domain.addresses.user_address import UserAddress + + +def _addrs(postcode: str, n: int) -> list[UserAddress]: + """Build ``n`` addresses sharing a postcode, with distinct address lines.""" + return [ + UserAddress(user_address=f"{i} {postcode} Street", postcode=postcode) + for i in range(n) + ] + + +def test_empty_input_yields_no_batches() -> None: + assert list(iter_postcode_grouped_batches([])) == [] + + +def test_single_batch_under_cap() -> None: + addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 2) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500)) + assert len(batches) == 1 + assert batches[0] == addrs + + +def test_multiple_postcodes_packed_into_one_batch_up_to_cap() -> None: + # Two groups whose total exactly equals the cap pack into a single + # batch -- no premature flush. + addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 2) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + assert len(batches) == 1 + assert len(batches[0]) == 5 + + +def test_flush_on_overflow_before_adding_next_postcode() -> None: + # Cap is 5. First group fills 3 slots; second group of 3 would overflow, + # so the buffer is flushed first and the next group starts a fresh batch. 
+ addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + assert len(batches) == 2 + assert [a.postcode for a in batches[0]] == ["AA11AA"] * 3 + assert [a.postcode for a in batches[1]] == ["BB22BB"] * 3 + + +def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None: + # An oversize single-postcode group goes out as one batch larger than + # the cap -- the cap never splits a postcode. + addrs = _addrs("AA1 1AA", 7) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + assert len(batches) == 1 + assert len(batches[0]) == 7 + + +def test_oversize_group_flushes_existing_buffer_first() -> None: + # Mirrors the legacy ``if buffer: flush`` branch when an oversize group + # is encountered: buffered work must not be lost or interleaved. + small = _addrs("AA1 1AA", 2) + big = _addrs("BB2 2BB", 7) + tail = _addrs("CC3 3CC", 1) + batches = list( + iter_postcode_grouped_batches(small + big + tail, max_batch_size=5) + ) + assert len(batches) == 3 + assert [a.postcode for a in batches[0]] == ["AA11AA", "AA11AA"] + assert [a.postcode for a in batches[1]] == ["BB22BB"] * 7 + assert [a.postcode for a in batches[2]] == ["CC33CC"] + + +def test_final_flush_yields_remaining_buffer() -> None: + # No overflow ever happens, but the trailing buffer must still come out. + addrs = _addrs("AA1 1AA", 2) + _addrs("BB2 2BB", 2) + batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500)) + assert batches == [addrs] + + +def test_postcode_grouping_preserves_first_seen_order() -> None: + # Interleaved input must still group by postcode and emit in first-seen + # order -- never alphabetical. 
+ a1, a2 = _addrs("ZZ9 9ZZ", 2) + b1, b2 = _addrs("AA1 1AA", 2) + batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2])) + assert len(batches) == 1 + assert [a.postcode for a in batches[0]] == [ + "ZZ99ZZ", + "ZZ99ZZ", + "AA11AA", + "AA11AA", + ] + + +def test_invalid_max_batch_size_raises() -> None: + with pytest.raises(ValueError, match="max_batch_size"): + list(iter_postcode_grouped_batches([], max_batch_size=0)) diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py new file mode 100644 index 00000000..e722077d --- /dev/null +++ b/tests/domain/addresses/test_user_address.py @@ -0,0 +1,45 @@ +import dataclasses + +import pytest + +from domain.addresses.user_address import UserAddress + + +def test_user_address_sanitises_postcode_on_construction() -> None: + addr = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") + assert addr.postcode == "SW1A1AA" + + +def test_user_address_preserves_user_address_verbatim() -> None: + # The free-text user_address string is intentionally NOT normalised -- + # only the postcode is canonicalised at the boundary. 
+ addr = UserAddress(user_address=" 1 The Street ", postcode="sw1a 1aa") + assert addr.user_address == " 1 The Street " + + +def test_user_address_internal_reference_defaults_to_none() -> None: + addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + assert addr.internal_reference is None + + +def test_user_address_internal_reference_accepted() -> None: + addr = UserAddress( + user_address="1 The Street", + postcode="SW1A1AA", + internal_reference="cust-42", + ) + assert addr.internal_reference == "cust-42" + + +def test_user_address_is_frozen() -> None: + addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + with pytest.raises(dataclasses.FrozenInstanceError): + addr.postcode = "OTHER" # type: ignore[misc] + + +def test_user_address_equality_uses_sanitised_postcode() -> None: + # Two instances constructed with different surface forms of the same + # postcode must compare equal because sanitisation runs eagerly. + a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") + b = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + assert a == b diff --git a/tests/domain/postcodes/__init__.py b/tests/domain/postcodes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/domain/postcodes/test_sanitise.py b/tests/domain/postcodes/test_sanitise.py new file mode 100644 index 00000000..edd1679c --- /dev/null +++ b/tests/domain/postcodes/test_sanitise.py @@ -0,0 +1,28 @@ +from domain.postcodes.sanitise import sanitise_postcode + + +def test_sanitise_uppercases() -> None: + assert sanitise_postcode("sw1a1aa") == "SW1A1AA" + + +def test_sanitise_strips_internal_spaces() -> None: + assert sanitise_postcode("sw1a 1aa") == "SW1A1AA" + + +def test_sanitise_strips_leading_and_trailing_whitespace() -> None: + assert sanitise_postcode(" sw1a 1aa ") == "SW1A1AA" + + +def test_sanitise_strips_tabs_and_newlines() -> None: + # CSV ingestion occasionally introduces stray whitespace characters; the + # canonical form 
must absorb them just like literal spaces. + assert sanitise_postcode("sw1a\t1aa\n") == "SW1A1AA" + + +def test_sanitise_already_canonical_is_idempotent() -> None: + assert sanitise_postcode("SW1A1AA") == "SW1A1AA" + assert sanitise_postcode(sanitise_postcode("sw1a 1aa")) == "SW1A1AA" + + +def test_sanitise_empty_string() -> None: + assert sanitise_postcode("") == "" From 7b00a33cd242e9959ac47e4e207d67477d53b8a2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 19 May 2026 17:12:21 +0000 Subject: [PATCH 71/91] infrastructure: typed S3/SQS clients (S3Client, CsvS3Client, SqsClient, Address2UprnQueueClient) Slice 3/6 of the postcode_splitter refactor (Hestia-Homes/Model#1101). Introduces a thin typed infrastructure layer wrapping boto3 for the AWS side of the splitter. S3Client/SqsClient are bucket-/queue-bound byte adapters; CsvS3Client subclasses S3Client to round-trip CSV row dicts via the existing parse_s3_uri helper in utils/s3.py; Address2UprnQueueClient subclasses SqsClient to publish the typed {task_id, sub_task_id, s3_uri} fan-out body the downstream consumer expects. moto[s3,sqs] is pulled into test.requirements.txt and the new tests/infrastructure/ suite exercises each client against the moto backend (S3 round-trip, CSV round-trip, SQS send + body inspection, typed publish + body inspection). pyright --strict is clean on the new modules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- infrastructure/address2uprn_queue_client.py | 27 ++++++++ infrastructure/csv_s3_client.py | 46 +++++++++++++ infrastructure/s3_client.py | 31 +++++++++ infrastructure/sqs_client.py | 28 ++++++++ test.requirements.txt | 3 +- tests/infrastructure/__init__.py | 17 +++++ tests/infrastructure/conftest.py | 32 +++++++++ .../test_address2uprn_queue_client.py | 65 +++++++++++++++++++ tests/infrastructure/test_csv_s3_client.py | 43 ++++++++++++ tests/infrastructure/test_s3_client.py | 31 +++++++++ tests/infrastructure/test_sqs_client.py | 38 +++++++++++ 11 files changed, 360 insertions(+), 1 deletion(-) create mode 100644 infrastructure/address2uprn_queue_client.py create mode 100644 infrastructure/csv_s3_client.py create mode 100644 infrastructure/s3_client.py create mode 100644 infrastructure/sqs_client.py create mode 100644 tests/infrastructure/__init__.py create mode 100644 tests/infrastructure/conftest.py create mode 100644 tests/infrastructure/test_address2uprn_queue_client.py create mode 100644 tests/infrastructure/test_csv_s3_client.py create mode 100644 tests/infrastructure/test_s3_client.py create mode 100644 tests/infrastructure/test_sqs_client.py diff --git a/infrastructure/address2uprn_queue_client.py b/infrastructure/address2uprn_queue_client.py new file mode 100644 index 00000000..d81e2dd1 --- /dev/null +++ b/infrastructure/address2uprn_queue_client.py @@ -0,0 +1,27 @@ +from uuid import UUID + +from infrastructure.sqs_client import SqsClient + + +class Address2UprnQueueClient(SqsClient): + """SQS client that publishes Address-to-UPRN fan-out messages. + + The body shape is fixed by the downstream consumer: + ``{"task_id": str, "sub_task_id": str, "s3_uri": str}`` + """ + + def publish( + self, + *, + parent_task_id: UUID, + child_subtask_id: UUID, + s3_uri: str, + ) -> str: + """Send a typed Address-to-UPRN message. 
Returns the SQS ``MessageId``.""" + return self.send( + { + "task_id": str(parent_task_id), + "sub_task_id": str(child_subtask_id), + "s3_uri": s3_uri, + } + ) diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py new file mode 100644 index 00000000..5163705b --- /dev/null +++ b/infrastructure/csv_s3_client.py @@ -0,0 +1,46 @@ +import csv +from io import StringIO + +from infrastructure.s3_client import S3Client +from utils.s3 import parse_s3_uri + + +class CsvS3Client(S3Client): + """:class:`S3Client` subclass that round-trips CSV row dictionaries. + + Rows are represented as ``list[dict[str, str]]`` — the same shape used by + :func:`csv.DictReader`/``DictWriter`` — which keeps the API trivially + compatible with existing CSV helpers in ``utils/s3.py``. + """ + + def read_rows(self, s3_uri: str) -> list[dict[str, str]]: + """Fetch the object at ``s3_uri`` and decode it as a CSV. + + The bucket portion of the URI is validated against this client's + configured bucket so cross-bucket reads fail loudly rather than + silently fetching from the wrong place. + """ + bucket, key = parse_s3_uri(s3_uri) + if bucket != self.bucket: + raise ValueError( + f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}" + ) + raw = self.get_object(key) + text = raw.decode("utf-8-sig") + reader = csv.DictReader(StringIO(text)) + return [dict(row) for row in reader] + + def save_rows(self, rows: list[dict[str, str]], key: str) -> str: + """Serialise ``rows`` to CSV under ``key`` and return the ``s3://`` URI. + + An empty ``rows`` list is rejected because we cannot otherwise infer + a header row. 
+ """ + if not rows: + raise ValueError("Cannot save an empty rows list: header is unknown") + buffer = StringIO() + fieldnames = list(rows[0].keys()) + writer = csv.DictWriter(buffer, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + return self.put_object(key, buffer.getvalue().encode("utf-8")) diff --git a/infrastructure/s3_client.py b/infrastructure/s3_client.py new file mode 100644 index 00000000..9e772881 --- /dev/null +++ b/infrastructure/s3_client.py @@ -0,0 +1,31 @@ +from typing import Any + + +class S3Client: + """Thin typed wrapper around a boto3 S3 client bound to a single bucket. + + The class is deliberately small: it exposes only the byte-level + operations needed by the wider infrastructure layer. Serialisation + (CSV, JSON, etc.) lives in subclasses such as :class:`CsvS3Client`. + """ + + def __init__(self, boto_s3_client: Any, bucket: str) -> None: + self._client = boto_s3_client + self._bucket = bucket + + @property + def bucket(self) -> str: + return self._bucket + + def get_object(self, key: str) -> bytes: + """Return the raw bytes stored at ``key`` in this client's bucket.""" + response: dict[str, Any] = self._client.get_object( + Bucket=self._bucket, Key=key + ) + body: bytes = response["Body"].read() + return body + + def put_object(self, key: str, body: bytes) -> str: + """Write ``body`` to ``key`` and return the canonical ``s3://`` URI.""" + self._client.put_object(Bucket=self._bucket, Key=key, Body=body) + return f"s3://{self._bucket}/{key}" diff --git a/infrastructure/sqs_client.py b/infrastructure/sqs_client.py new file mode 100644 index 00000000..fb053680 --- /dev/null +++ b/infrastructure/sqs_client.py @@ -0,0 +1,28 @@ +import json +from typing import Any + + +class SqsClient: + """Thin typed wrapper around a boto3 SQS client bound to one queue URL. + + The body is JSON-serialised here so callers can pass plain dictionaries + instead of constructing message strings themselves. Typed publish + helpers (e.g. 
:class:`Address2UprnQueueClient`) build on this contract. + """ + + def __init__(self, boto_sqs_client: Any, queue_url: str) -> None: + self._client = boto_sqs_client + self._queue_url = queue_url + + @property + def queue_url(self) -> str: + return self._queue_url + + def send(self, body: dict[str, Any]) -> str: + """JSON-serialise ``body`` and send it. Returns the SQS ``MessageId``.""" + response: dict[str, Any] = self._client.send_message( + QueueUrl=self._queue_url, + MessageBody=json.dumps(body), + ) + message_id: str = response["MessageId"] + return message_id diff --git a/test.requirements.txt b/test.requirements.txt index 7fdd7dc4..26125034 100644 --- a/test.requirements.txt +++ b/test.requirements.txt @@ -9,4 +9,5 @@ hubspot-api-client fuzzywuzzy pymupdf playwright==1.58.0 -msal \ No newline at end of file +msal +moto[s3,sqs] \ No newline at end of file diff --git a/tests/infrastructure/__init__.py b/tests/infrastructure/__init__.py new file mode 100644 index 00000000..3478bda9 --- /dev/null +++ b/tests/infrastructure/__init__.py @@ -0,0 +1,17 @@ +from typing import Any + +import boto3 + +REGION = "us-east-1" + + +def make_boto_client(service_name: str) -> Any: + """Construct a boto3 client typed as ``Any``. + + boto3's overloaded ``client`` signature uses ``Literal[...]`` per service + in the installed stubs, which forces every call site to satisfy + ``reportArgumentType`` and ``reportUnknownMemberType`` under strict + pyright. Centralising the cast keeps each test file clean. 
+ """ + factory: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + return factory(service_name, region_name=REGION) diff --git a/tests/infrastructure/conftest.py b/tests/infrastructure/conftest.py new file mode 100644 index 00000000..7ed2fdd6 --- /dev/null +++ b/tests/infrastructure/conftest.py @@ -0,0 +1,32 @@ +import os +from collections.abc import Iterator +from typing import Optional + +import pytest + + +@pytest.fixture(autouse=True) +def _aws_creds() -> Iterator[None]: # pyright: ignore[reportUnusedFunction] + """Stub AWS creds so botocore doesn't probe the host environment. + + Applied automatically to every test in ``tests/infrastructure/``. + """ + keys = ( + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", + "AWS_DEFAULT_REGION", + ) + prev: dict[str, Optional[str]] = {k: os.environ.get(k) for k in keys} + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + try: + yield + finally: + for k, v in prev.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v diff --git a/tests/infrastructure/test_address2uprn_queue_client.py b/tests/infrastructure/test_address2uprn_queue_client.py new file mode 100644 index 00000000..b4114742 --- /dev/null +++ b/tests/infrastructure/test_address2uprn_queue_client.py @@ -0,0 +1,65 @@ +import json +from collections.abc import Iterator +from typing import Any, cast +from uuid import uuid4 + +import pytest +from moto import mock_aws + +from infrastructure.address2uprn_queue_client import Address2UprnQueueClient +from tests.infrastructure import make_boto_client + + +@pytest.fixture +def queue_setup() -> Iterator[tuple[Address2UprnQueueClient, Any, str]]: + with mock_aws(): + boto_client = make_boto_client("sqs") + queue: dict[str, Any] = boto_client.create_queue( + QueueName="address2uprn-queue" + ) + 
queue_url = cast(str, queue["QueueUrl"]) + yield ( + Address2UprnQueueClient(boto_client, queue_url), + boto_client, + queue_url, + ) + + +def test_publish_returns_message_id( + queue_setup: tuple[Address2UprnQueueClient, Any, str], +) -> None: + client, _boto, _url = queue_setup + message_id = client.publish( + parent_task_id=uuid4(), + child_subtask_id=uuid4(), + s3_uri="s3://my-bucket/path/to/chunk.csv", + ) + assert isinstance(message_id, str) + assert message_id + + +def test_publish_body_uses_typed_shape( + queue_setup: tuple[Address2UprnQueueClient, Any, str], +) -> None: + client, boto_client, queue_url = queue_setup + parent_id = uuid4() + child_id = uuid4() + s3_uri = "s3://my-bucket/path/to/chunk.csv" + + client.publish( + parent_task_id=parent_id, + child_subtask_id=child_id, + s3_uri=s3_uri, + ) + + received: dict[str, Any] = boto_client.receive_message( + QueueUrl=queue_url, MaxNumberOfMessages=1 + ) + messages: list[dict[str, Any]] = received["Messages"] + assert len(messages) == 1 + body = json.loads(messages[0]["Body"]) + assert body == { + "task_id": str(parent_id), + "sub_task_id": str(child_id), + "s3_uri": s3_uri, + } diff --git a/tests/infrastructure/test_csv_s3_client.py b/tests/infrastructure/test_csv_s3_client.py new file mode 100644 index 00000000..4b9fc199 --- /dev/null +++ b/tests/infrastructure/test_csv_s3_client.py @@ -0,0 +1,43 @@ +from collections.abc import Iterator + +import pytest +from moto import mock_aws + +from infrastructure.csv_s3_client import CsvS3Client +from tests.infrastructure import make_boto_client + +BUCKET = "csv-bucket" + + +@pytest.fixture +def csv_client() -> Iterator[CsvS3Client]: + with mock_aws(): + boto_client = make_boto_client("s3") + boto_client.create_bucket(Bucket=BUCKET) + yield CsvS3Client(boto_client, BUCKET) + + +def test_save_rows_returns_s3_uri(csv_client: CsvS3Client) -> None: + rows = [{"address": "1 High St", "postcode": "AB1 2CD"}] + uri = csv_client.save_rows(rows, "uploads/addresses.csv") + 
assert uri == f"s3://{BUCKET}/uploads/addresses.csv" + + +def test_round_trip_preserves_rows(csv_client: CsvS3Client) -> None: + rows = [ + {"address": "1 High St", "postcode": "AB1 2CD"}, + {"address": "2 Low St", "postcode": "XY9 8ZW"}, + ] + uri = csv_client.save_rows(rows, "uploads/addresses.csv") + fetched = csv_client.read_rows(uri) + assert fetched == rows + + +def test_save_rows_rejects_empty_list(csv_client: CsvS3Client) -> None: + with pytest.raises(ValueError, match="empty"): + csv_client.save_rows([], "uploads/empty.csv") + + +def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None: + with pytest.raises(ValueError, match="does not match client bucket"): + csv_client.read_rows("s3://other-bucket/uploads/addresses.csv") diff --git a/tests/infrastructure/test_s3_client.py b/tests/infrastructure/test_s3_client.py new file mode 100644 index 00000000..7ed4c30b --- /dev/null +++ b/tests/infrastructure/test_s3_client.py @@ -0,0 +1,31 @@ +from collections.abc import Iterator + +import pytest +from moto import mock_aws + +from infrastructure.s3_client import S3Client +from tests.infrastructure import make_boto_client + +BUCKET = "test-bucket" + + +@pytest.fixture +def s3_client() -> Iterator[S3Client]: + with mock_aws(): + boto_client = make_boto_client("s3") + boto_client.create_bucket(Bucket=BUCKET) + yield S3Client(boto_client, BUCKET) + + +def test_put_object_returns_s3_uri(s3_client: S3Client) -> None: + uri = s3_client.put_object("folder/data.bin", b"payload") + assert uri == f"s3://{BUCKET}/folder/data.bin" + + +def test_get_object_returns_bytes_written_by_put_object(s3_client: S3Client) -> None: + s3_client.put_object("round/trip.bin", b"hello world") + assert s3_client.get_object("round/trip.bin") == b"hello world" + + +def test_bucket_property_exposes_configured_bucket(s3_client: S3Client) -> None: + assert s3_client.bucket == BUCKET diff --git a/tests/infrastructure/test_sqs_client.py b/tests/infrastructure/test_sqs_client.py new file 
mode 100644 index 00000000..7f1e8f78 --- /dev/null +++ b/tests/infrastructure/test_sqs_client.py @@ -0,0 +1,38 @@ +import json +from collections.abc import Iterator +from typing import Any, cast + +import pytest +from moto import mock_aws + +from infrastructure.sqs_client import SqsClient +from tests.infrastructure import make_boto_client + + +@pytest.fixture +def sqs_setup() -> Iterator[tuple[SqsClient, Any, str]]: + with mock_aws(): + boto_client = make_boto_client("sqs") + queue: dict[str, Any] = boto_client.create_queue(QueueName="test-queue") + queue_url = cast(str, queue["QueueUrl"]) + yield SqsClient(boto_client, queue_url), boto_client, queue_url + + +def test_send_returns_message_id(sqs_setup: tuple[SqsClient, Any, str]) -> None: + client, _boto, _url = sqs_setup + message_id = client.send({"hello": "world"}) + assert isinstance(message_id, str) + assert message_id + + +def test_send_json_serialises_body(sqs_setup: tuple[SqsClient, Any, str]) -> None: + client, boto_client, queue_url = sqs_setup + body = {"hello": "world", "count": 3} + client.send(body) + + received: dict[str, Any] = boto_client.receive_message( + QueueUrl=queue_url, MaxNumberOfMessages=1 + ) + messages: list[dict[str, Any]] = received["Messages"] + assert len(messages) == 1 + assert json.loads(messages[0]["Body"]) == body From d7f14033ba76b355543ded5fb3ced93e0411b2ae Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 19 May 2026 17:19:41 +0000 Subject: [PATCH 72/91] orchestration: add TaskOrchestrator.create_child_subtask primitive Adds a primitive for creating a new WAITING SubTask under an existing parent Task, routing all SubTask creation through the orchestrator (replacing the legacy SubTaskInterface path used by the splitter). Skips _cascade because a new WAITING child against an IN_PROGRESS parent is a no-op under Task.recalculate_from_subtasks. 
--- orchestration/task_orchestrator.py | 16 ++++++++++++++ tests/orchestration/test_task_orchestrator.py | 22 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/orchestration/task_orchestrator.py b/orchestration/task_orchestrator.py index 6c67d1ce..82d95db1 100644 --- a/orchestration/task_orchestrator.py +++ b/orchestration/task_orchestrator.py @@ -48,6 +48,22 @@ class TaskOrchestrator: self._subtasks.create(subtask) return task, subtask + def create_child_subtask( + self, + parent_task_id: UUID, + *, + inputs: Optional[dict[str, Any]] = None, + ) -> SubTask: + """Add a new WAITING SubTask under an existing parent Task. + + Skips `_cascade`: a new WAITING child against an IN_PROGRESS parent + leaves the parent's status unchanged per `Task.recalculate_from_subtasks`, + so calling it here would be a no-op. + """ + subtask = SubTask.create(task_id=parent_task_id, inputs=inputs) + self._subtasks.create(subtask) + return subtask + def start_subtask( self, subtask_id: UUID, cloud_logs_url: Optional[str] = None ) -> SubTask: diff --git a/tests/orchestration/test_task_orchestrator.py b/tests/orchestration/test_task_orchestrator.py index 1a48127f..c0816d2d 100644 --- a/tests/orchestration/test_task_orchestrator.py +++ b/tests/orchestration/test_task_orchestrator.py @@ -134,6 +134,28 @@ def test_run_subtask_happy_path_returns_result_and_cascades_complete( assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE +def test_create_child_subtask_adds_waiting_child_without_changing_parent_status( + harness: Harness, +) -> None: + task, first = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + harness.orchestrator.start_subtask(first.id) + assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS + + child = harness.orchestrator.create_child_subtask( + task.id, inputs={"split": "a"} + ) + + persisted_child = harness.subtasks.get(child.id) + assert persisted_child.task_id == task.id + assert persisted_child.status is 
SubTaskStatus.WAITING + assert persisted_child.inputs == {"split": "a"} + assert persisted_child.id != first.id + # Cascade is a no-op: parent stays IN_PROGRESS. + assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS + + def test_run_subtask_failing_work_marks_failed_and_reraises( harness: Harness, ) -> None: From d70e8a9e53706cd09f2cc82b85ce499fed97bda2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 19 May 2026 17:31:27 +0000 Subject: [PATCH 73/91] utilities/aws_lambda: @subtask_handler injects TaskOrchestrator as third positional arg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wrapped function now receives the decorator-owned TaskOrchestrator as a third positional argument so handlers can compose their own use-case orchestrator that shares the session, instead of opening a second Postgres connection per invocation. Both existing callers (backend/ordnanceSurvey/main.py and backend/bulk_address2uprn_combiner/main.py) have their signatures extended to accept the new positional argument (typed Optional[TaskOrchestrator] so the legacy backend.utils.subtasks.subtask_handler — which only passes two args — keeps working until the migration to the new decorator lands). @task_handler is intentionally unchanged in this slice; symmetry is deferred per issue #1103. 
--- backend/bulk_address2uprn_combiner/main.py | 14 +- backend/ordnanceSurvey/main.py | 12 +- tests/utilities/__init__.py | 0 tests/utilities/aws_lambda/__init__.py | 0 .../aws_lambda/test_subtask_handler.py | 144 ++++++++++++++++++ utilities/aws_lambda/subtask_handler.py | 2 +- 6 files changed, 168 insertions(+), 4 deletions(-) create mode 100644 tests/utilities/__init__.py create mode 100644 tests/utilities/aws_lambda/__init__.py create mode 100644 tests/utilities/aws_lambda/test_subtask_handler.py diff --git a/backend/bulk_address2uprn_combiner/main.py b/backend/bulk_address2uprn_combiner/main.py index 44f0b3f9..37136e52 100644 --- a/backend/bulk_address2uprn_combiner/main.py +++ b/backend/bulk_address2uprn_combiner/main.py @@ -2,7 +2,7 @@ import os import boto3 import pandas as pd from io import BytesIO -from typing import Any +from typing import Any, Optional from uuid import UUID from datetime import datetime, timezone @@ -12,6 +12,7 @@ from backend.app.db.functions.bulk_address_uploads_functions import ( set_combined_output_s3_uri, set_combining_status, ) +from orchestration.task_orchestrator import TaskOrchestrator logger = setup_logger() @@ -35,7 +36,16 @@ def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame: @subtask_handler() -def handler(body: dict[str, Any], context: Any) -> str: +def handler( + body: dict[str, Any], + context: Any, + orchestrator: Optional[TaskOrchestrator] = None, +) -> str: + # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler + # decorator; unused here but accepted so the contract is uniform across + # callers (see issue #1103). 
+ del orchestrator + task_id_str: str = body.get("task_id", "") if not task_id_str: diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index 6e82b468..18c4e2f2 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -16,6 +16,7 @@ from backend.ordnanceSurvey.helpers import ( os_places_results_to_dataframe, ) from backend.app.config import get_settings +from orchestration.task_orchestrator import TaskOrchestrator from sqlalchemy import select from datetime import datetime import uuid @@ -105,7 +106,16 @@ def save_results_to_s3( @subtask_handler() # This assumes task_id and subtask_id is defined in event.Records.body -def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: +def handler( + body: dict[str, Any], + context: Any, + orchestrator: Optional[TaskOrchestrator] = None, + local: bool = False, +) -> None: + # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler + # decorator; unused here but accepted so the contract is uniform across + # callers (see issue #1103). + del orchestrator # delete this line after test # local = True diff --git a/tests/utilities/__init__.py b/tests/utilities/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/utilities/aws_lambda/__init__.py b/tests/utilities/aws_lambda/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/utilities/aws_lambda/test_subtask_handler.py b/tests/utilities/aws_lambda/test_subtask_handler.py new file mode 100644 index 00000000..426b250f --- /dev/null +++ b/tests/utilities/aws_lambda/test_subtask_handler.py @@ -0,0 +1,144 @@ +"""Tests for the @subtask_handler decorator. + +Covers the contract that the decorator owns the parent SubTask lifecycle and +injects the decorator-owned TaskOrchestrator as a third positional argument +to the wrapped function — so the handler can compose its own use-case +orchestrator that shares the session. 
+""" + +from collections.abc import Generator, Iterator +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Any +from uuid import UUID + +import pytest +from sqlmodel import Session, SQLModel, create_engine + +from domain.tasks.subtasks import SubTaskStatus +from domain.tasks.tasks import TaskStatus +from orchestration.task_orchestrator import TaskOrchestrator +from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository +from repositories.tasks.task_postgres_repository import TaskPostgresRepository +from utilities.aws_lambda.subtask_handler import subtask_handler + + +@dataclass +class Harness: + orchestrator: TaskOrchestrator + tasks: TaskPostgresRepository + subtasks: SubTaskPostgresRepository + + @contextmanager + def factory(self) -> Generator[TaskOrchestrator, None, None]: + yield self.orchestrator + + +@pytest.fixture +def harness() -> Iterator[Harness]: + engine = create_engine("sqlite://") + SQLModel.metadata.create_all(engine) + with Session(engine) as session: + tasks = TaskPostgresRepository(session=session) + subtasks = SubTaskPostgresRepository(session=session) + yield Harness( + orchestrator=TaskOrchestrator(task_repo=tasks, subtask_repo=subtasks), + tasks=tasks, + subtasks=subtasks, + ) + + +def _direct_event(task_id: UUID, subtask_id: UUID) -> dict[str, Any]: + return {"task_id": str(task_id), "sub_task_id": str(subtask_id)} + + +def test_subtask_handler_injects_orchestrator_as_third_positional_argument( + harness: Harness, +) -> None: + """The wrapped function receives the decorator-owned TaskOrchestrator + so it can share the session with its own use-case orchestrator.""" + _, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + received: dict[str, Any] = {} + + @subtask_handler(orchestrator_cm=harness.factory) + def handler( + body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator + ) -> None: + received["body"] = body + 
received["context"] = context + received["orchestrator"] = orchestrator + + handler(_direct_event(subtask.task_id, subtask.id), context="ctx-sentinel") + + assert received["orchestrator"] is harness.orchestrator + assert received["context"] == "ctx-sentinel" + assert received["body"]["sub_task_id"] == str(subtask.id) + + +def test_subtask_handler_completes_parent_subtask_on_success( + harness: Harness, +) -> None: + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + @subtask_handler(orchestrator_cm=harness.factory) + def handler( + body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator + ) -> None: + return None + + handler(_direct_event(task.id, subtask.id), context=None) + + assert harness.subtasks.get(subtask.id).status is SubTaskStatus.COMPLETE + assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE + + +def test_subtask_handler_marks_parent_failed_and_reraises_on_error( + harness: Harness, +) -> None: + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + @subtask_handler(orchestrator_cm=harness.factory) + def handler( + body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator + ) -> None: + raise RuntimeError("boom") + + with pytest.raises(RuntimeError, match="boom"): + handler(_direct_event(task.id, subtask.id), context=None) + + assert harness.subtasks.get(subtask.id).status is SubTaskStatus.FAILED + assert harness.tasks.get(task.id).status is TaskStatus.FAILED + + +def test_subtask_handler_injected_orchestrator_can_create_child_subtask( + harness: Harness, +) -> None: + """Smoke check the share-the-session promise: the injected orchestrator + is the same one the decorator owns, so a handler can use it to create + child SubTasks under the same session.""" + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + child_ids: list[UUID] = [] + + @subtask_handler(orchestrator_cm=harness.factory) 
+ def handler( + body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator + ) -> None: + child = orchestrator.create_child_subtask(task.id, inputs={"split": 1}) + child_ids.append(child.id) + + handler(_direct_event(task.id, subtask.id), context=None) + + assert len(child_ids) == 1 + persisted_child = harness.subtasks.get(child_ids[0]) + assert persisted_child.task_id == task.id + assert persisted_child.status is SubTaskStatus.WAITING diff --git a/utilities/aws_lambda/subtask_handler.py b/utilities/aws_lambda/subtask_handler.py index 64c1daa6..5ad5f6e1 100644 --- a/utilities/aws_lambda/subtask_handler.py +++ b/utilities/aws_lambda/subtask_handler.py @@ -39,7 +39,7 @@ def subtask_handler( trigger = SubtaskTriggerBody.model_validate(body) orchestrator.run_subtask( trigger.sub_task_id, - work=lambda body=body: func(body, context), + work=lambda body=body, o=orchestrator: func(body, context, o), ) return wrapper From 708f1b5d189222793cd206cf883c3c427ca63917 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 19 May 2026 17:37:02 +0000 Subject: [PATCH 74/91] repositories: UserAddressRepository + UserAddressCsvS3Repository (CSV-on-S3 adapter) Adds the persistence layer for UserAddress batches: - Abstract UserAddressRepository with load_batch / save_batch. - Concrete UserAddressCsvS3Repository over CsvS3Client: - load_batch reads canonical upload columns (Address 1/2/3, Postcode, Internal Reference), comma-joins non-empty address parts, and passes Internal Reference through (None when missing/empty). - save_batch writes a 3-column CSV (user_address,postcode, internal_reference) to {path_prefix}/{ISO datetime}_{uuid8}.csv and returns the s3://bucket/key URI. - Postcode sanitisation flows through UserAddress.__post_init__; the repo never calls sanitise_postcode directly. Tests (moto-backed) cover: three-line address load, Address-1-only load, missing Internal Reference, save->reload round trip, and unique-filename-per-save. pyright --strict clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- repositories/user_address/__init__.py | 0 .../user_address_csv_s3_repository.py | 87 +++++++++ .../user_address/user_address_repository.py | 30 +++ tests/repositories/user_address/__init__.py | 0 tests/repositories/user_address/conftest.py | 32 ++++ .../test_user_address_csv_s3_repository.py | 175 ++++++++++++++++++ 6 files changed, 324 insertions(+) create mode 100644 repositories/user_address/__init__.py create mode 100644 repositories/user_address/user_address_csv_s3_repository.py create mode 100644 repositories/user_address/user_address_repository.py create mode 100644 tests/repositories/user_address/__init__.py create mode 100644 tests/repositories/user_address/conftest.py create mode 100644 tests/repositories/user_address/test_user_address_csv_s3_repository.py diff --git a/repositories/user_address/__init__.py b/repositories/user_address/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py new file mode 100644 index 00000000..be2baa13 --- /dev/null +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -0,0 +1,87 @@ +"""CSV-on-S3 adapter for :class:`UserAddressRepository`. + +Reads canonical upload CSVs (``Address 1``, ``Address 2``, ``Address 3``, +``Postcode``, ``Internal Reference``) and writes the splitter's compact +3-column form (``user_address``, ``postcode``, ``internal_reference``). + +The frontend pre-applies the user's column mapping at upload time, so this +adapter does NOT consult any ``BulkAddressUpload.column_mapping``: it always +expects the canonical column names listed above. 
+""" + +from __future__ import annotations + +import uuid +from datetime import datetime, timezone +from typing import Optional + +from domain.addresses.user_address import UserAddress +from infrastructure.csv_s3_client import CsvS3Client +from repositories.user_address.user_address_repository import UserAddressRepository + +_ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") +_POSTCODE_COLUMN: str = "Postcode" +_INTERNAL_REFERENCE_COLUMN: str = "Internal Reference" + + +class UserAddressCsvS3Repository(UserAddressRepository): + """Persist :class:`UserAddress` batches as CSV objects in S3. + + The repo owns the unique-filename-within-prefix convention + (``{ISO datetime}_{8-char uuid}.csv``); callers own the directory + hierarchy supplied as ``path_prefix``. + """ + + def __init__(self, csv_client: CsvS3Client, bucket: str) -> None: + self._csv_client = csv_client + self._bucket = bucket + + def load_batch(self, s3_uri: str) -> list[UserAddress]: + """Load canonical upload CSV rows into :class:`UserAddress` objects. + + Concatenates ``Address 1``/``Address 2``/``Address 3`` with ``", "``, + skipping missing or empty parts, into ``user_address``. Falls back to + just ``Address 1`` when 2 and 3 are absent. Passes ``Internal Reference`` + through to :attr:`UserAddress.internal_reference` (``None`` when the + column is missing or empty). 
+ """ + rows = self._csv_client.read_rows(s3_uri) + addresses: list[UserAddress] = [] + for row in rows: + parts = [ + row[col].strip() + for col in _ADDRESS_COLUMNS + if col in row and row[col].strip() + ] + user_address = ", ".join(parts) + postcode = row.get(_POSTCODE_COLUMN, "") + raw_ref = row.get(_INTERNAL_REFERENCE_COLUMN, "").strip() + internal_reference: Optional[str] = raw_ref or None + addresses.append( + UserAddress( + user_address=user_address, + postcode=postcode, + internal_reference=internal_reference, + ) + ) + return addresses + + def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: + """Write a 3-column CSV under a unique key beneath ``path_prefix``. + + The key is ``{path_prefix}/{ISO-8601 datetime}_{8-char uuid}.csv``. + Returns the full ``s3://bucket/key`` URI. + """ + rows: list[dict[str, str]] = [ + { + "user_address": addr.user_address, + "postcode": addr.postcode, + "internal_reference": addr.internal_reference or "", + } + for addr in addresses + ] + filename = ( + f"{datetime.now(timezone.utc).isoformat()}_{uuid.uuid4().hex[:8]}.csv" + ) + key = f"{path_prefix.rstrip('/')}/{filename}" + return self._csv_client.save_rows(rows, key) diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py new file mode 100644 index 00000000..d8c12855 --- /dev/null +++ b/repositories/user_address/user_address_repository.py @@ -0,0 +1,30 @@ +"""Abstract repository for :class:`UserAddress` batches. + +Persistence-agnostic interface for loading and saving batches of +:class:`domain.addresses.user_address.UserAddress`. Concrete adapters -- +e.g. :class:`UserAddressCsvS3Repository` -- live alongside this module. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod + +from domain.addresses.user_address import UserAddress + + +class UserAddressRepository(ABC): + """Load and persist batches of :class:`UserAddress`. 
+ + Implementations choose the underlying storage (S3 CSV, Postgres, + in-memory, ...) but must preserve the canonical column semantics: + the address text, postcode (sanitised by ``UserAddress.__post_init__``), + and an optional internal reference. + """ + + @abstractmethod + def load_batch(self, s3_uri: str) -> list[UserAddress]: + """Read a batch of addresses from ``s3_uri`` and return domain objects.""" + + @abstractmethod + def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: + """Persist ``addresses`` under ``path_prefix`` and return the URI written.""" diff --git a/tests/repositories/user_address/__init__.py b/tests/repositories/user_address/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/repositories/user_address/conftest.py b/tests/repositories/user_address/conftest.py new file mode 100644 index 00000000..1859ff0a --- /dev/null +++ b/tests/repositories/user_address/conftest.py @@ -0,0 +1,32 @@ +import os +from collections.abc import Iterator +from typing import Optional + +import pytest + + +@pytest.fixture(autouse=True) +def _aws_creds() -> Iterator[None]: # pyright: ignore[reportUnusedFunction] + """Stub AWS creds so botocore doesn't probe the host environment. + + Applied automatically to every test in ``tests/repositories/user_address/``. 
+ """ + keys = ( + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", + "AWS_DEFAULT_REGION", + ) + prev: dict[str, Optional[str]] = {k: os.environ.get(k) for k in keys} + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + try: + yield + finally: + for k, v in prev.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py new file mode 100644 index 00000000..ca9e8a57 --- /dev/null +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -0,0 +1,175 @@ +from collections.abc import Iterator + +import pytest +from moto import mock_aws + +from infrastructure.csv_s3_client import CsvS3Client +from repositories.user_address.user_address_csv_s3_repository import ( + UserAddressCsvS3Repository, +) +from tests.infrastructure import make_boto_client + +BUCKET = "user-address-bucket" + + +@pytest.fixture +def repo() -> Iterator[UserAddressCsvS3Repository]: + with mock_aws(): + boto_client = make_boto_client("s3") + boto_client.create_bucket(Bucket=BUCKET) + csv_client = CsvS3Client(boto_client, BUCKET) + yield UserAddressCsvS3Repository(csv_client, BUCKET) + + +def _upload_csv( + repo: UserAddressCsvS3Repository, rows: list[dict[str, str]], key: str +) -> str: + return repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage] + + +def test_load_batch_concatenates_three_address_lines( + repo: UserAddressCsvS3Repository, +) -> None: + rows = [ + { + "Address 1": "1 High Street", + "Address 2": "Flat 2", + "Address 3": "Townville", + "Postcode": "sw1a 1aa", + "Internal Reference": "REF-001", + } + ] + uri = _upload_csv(repo, rows, "uploads/full.csv") + + addresses = repo.load_batch(uri) + + assert 
len(addresses) == 1 + address = addresses[0] + assert address.user_address == "1 High Street, Flat 2, Townville" + assert address.postcode == "SW1A1AA" + assert address.internal_reference == "REF-001" + + +def test_load_batch_uses_only_address_1_when_others_missing( + repo: UserAddressCsvS3Repository, +) -> None: + rows = [ + { + "Address 1": "10 Cardiff Road", + "Address 2": "", + "Address 3": "", + "Postcode": "CF10 1AA", + "Internal Reference": "REF-002", + } + ] + uri = _upload_csv(repo, rows, "uploads/address1-only.csv") + + addresses = repo.load_batch(uri) + + assert len(addresses) == 1 + assert addresses[0].user_address == "10 Cardiff Road" + assert addresses[0].postcode == "CF101AA" + assert addresses[0].internal_reference == "REF-002" + + +def test_load_batch_handles_missing_internal_reference( + repo: UserAddressCsvS3Repository, +) -> None: + rows = [ + { + "Address 1": "5 Park Lane", + "Address 2": "", + "Address 3": "", + "Postcode": "M1 1AA", + "Internal Reference": "", + } + ] + uri = _upload_csv(repo, rows, "uploads/no-ref.csv") + + addresses = repo.load_batch(uri) + + assert len(addresses) == 1 + assert addresses[0].user_address == "5 Park Lane" + assert addresses[0].postcode == "M11AA" + assert addresses[0].internal_reference is None + + +def test_save_batch_returns_uri_under_path_prefix( + repo: UserAddressCsvS3Repository, +) -> None: + from domain.addresses.user_address import UserAddress + + addresses = [ + UserAddress( + user_address="1 High Street, Flat 2, Townville", + postcode="SW1A 1AA", + internal_reference="REF-001", + ), + ] + + uri = repo.save_batch(addresses, "tasks/abc/batches") + + assert uri.startswith(f"s3://{BUCKET}/tasks/abc/batches/") + assert uri.endswith(".csv") + + +def test_save_then_reload_round_trip_preserves_values( + repo: UserAddressCsvS3Repository, +) -> None: + from domain.addresses.user_address import UserAddress + + # save_batch writes the splitter's compact schema + # (user_address/postcode/internal_reference); 
load_batch reads the + # canonical upload schema. To round-trip through the repo we re-upload + # the saved CSV under the upload schema's column names. + original = [ + UserAddress( + user_address="1 High Street", + postcode="SW1A 1AA", + internal_reference="REF-001", + ), + UserAddress( + user_address="2 Low Street", + postcode="XY9 8ZW", + internal_reference=None, + ), + ] + + saved_uri = repo.save_batch(original, "tasks/round-trip") + + # Re-shape the saved CSV into the canonical upload schema for reload. + saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] + upload_rows: list[dict[str, str]] = [ + { + "Address 1": row["user_address"], + "Address 2": "", + "Address 3": "", + "Postcode": row["postcode"], + "Internal Reference": row["internal_reference"], + } + for row in saved_rows + ] + upload_uri = _upload_csv(repo, upload_rows, "uploads/round-trip.csv") + + reloaded = repo.load_batch(upload_uri) + + assert reloaded == original + + +def test_save_batch_uses_unique_filename_per_call( + repo: UserAddressCsvS3Repository, +) -> None: + from domain.addresses.user_address import UserAddress + + addresses = [ + UserAddress( + user_address="1 High Street", + postcode="SW1A 1AA", + internal_reference="REF-001", + ), + ] + + uri_1 = repo.save_batch(addresses, "tasks/uniqueness") + uri_2 = repo.save_batch(addresses, "tasks/uniqueness") + + assert uri_1 != uri_2 From 0a0444821756543f57625832681e28a09d264056 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 19 May 2026 17:46:12 +0000 Subject: [PATCH 75/91] applications/postcode_splitter: PostcodeSplitterOrchestrator + Lambda entrypoint slice Wires slice 1-5 primitives into a deployable splitter: - orchestration/postcode_splitter_orchestrator.py: PostcodeSplitterOrchestrator loads addresses via UserAddressRepository, groups by postcode via iter_postcode_grouped_batches, persists each batch under ara_postcode_splitter_batches/{task_id}/{subtask_id}/, creates a WAITING child SubTask, 
and publishes an address2UPRN SQS message per batch. - applications/postcode_splitter/: Lambda entrypoint. handler.py is decorated with @subtask_handler() so the parent SubTask lifecycle is decorator-owned; PostcodeSplitterTriggerBody validates the body. Dockerfile is the python:3.11 Lambda base with the DDD-shaped source layers and no pandas. - tests/orchestration/test_postcode_splitter_orchestrator.py: integration test using moto S3 + moto SQS + in-memory SQLite that exercises the full wiring against a fixture CSV spanning three postcode groups (one oversize) and asserts child count, persisted inputs, queue bodies, and dispatch order. backend/postcode_splitter/ and .github/workflows/deploy_terraform.yml are intentionally unchanged: the dockerfile_path flip is deferred until the companion backend/address2UPRN/ migration is also ready. --- applications/__init__.py | 0 applications/postcode_splitter/Dockerfile | 21 ++ applications/postcode_splitter/__init__.py | 0 applications/postcode_splitter/handler.py | 70 ++++ .../postcode_splitter_trigger_body.py | 32 ++ .../postcode_splitter/requirements.txt | 4 + .../postcode_splitter_orchestrator.py | 89 ++++++ .../test_postcode_splitter_orchestrator.py | 298 ++++++++++++++++++ 8 files changed, 514 insertions(+) create mode 100644 applications/__init__.py create mode 100644 applications/postcode_splitter/Dockerfile create mode 100644 applications/postcode_splitter/__init__.py create mode 100644 applications/postcode_splitter/handler.py create mode 100644 applications/postcode_splitter/postcode_splitter_trigger_body.py create mode 100644 applications/postcode_splitter/requirements.txt create mode 100644 orchestration/postcode_splitter_orchestrator.py create mode 100644 tests/orchestration/test_postcode_splitter_orchestrator.py diff --git a/applications/__init__.py b/applications/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/applications/postcode_splitter/Dockerfile 
b/applications/postcode_splitter/Dockerfile new file mode 100644 index 00000000..578ee7a7 --- /dev/null +++ b/applications/postcode_splitter/Dockerfile @@ -0,0 +1,21 @@ +FROM public.ecr.aws/lambda/python:3.11 + +WORKDIR /var/task + +COPY applications/postcode_splitter/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the layered source the handler imports from. The new splitter pulls +# only DDD-shaped packages — no pandas, no legacy backend/. +COPY domain/ domain/ +COPY infrastructure/ infrastructure/ +COPY orchestration/ orchestration/ +COPY repositories/ repositories/ +COPY utilities/ utilities/ +COPY applications/ applications/ + +# Place the handler at the Lambda task root so the runtime can resolve +# ``main.handler`` without an extra package prefix. +COPY applications/postcode_splitter/handler.py /var/task/main.py + +CMD ["main.handler"] diff --git a/applications/postcode_splitter/__init__.py b/applications/postcode_splitter/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py new file mode 100644 index 00000000..005227a9 --- /dev/null +++ b/applications/postcode_splitter/handler.py @@ -0,0 +1,70 @@ +"""Lambda entrypoint for the postcode splitter slice. + +The :func:`handler` function is decorated with ``@subtask_handler()`` so the +decorator owns the parent ``SubTask`` lifecycle (start/complete/fail) and +injects the decorator-owned :class:`TaskOrchestrator` as the third positional +argument. The handler itself does only two things: + +1. Build a :class:`PostcodeSplitterOrchestrator` from env-driven config. +2. Delegate to ``split_and_dispatch`` and return its result so it lands in + ``SubTask.outputs["result"]``. 
+""" + +from __future__ import annotations + +import os +from typing import Any + +import boto3 + +from applications.postcode_splitter.postcode_splitter_trigger_body import ( + PostcodeSplitterTriggerBody, +) +from infrastructure.address2uprn_queue_client import Address2UprnQueueClient +from infrastructure.csv_s3_client import CsvS3Client +from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator +from orchestration.task_orchestrator import TaskOrchestrator +from repositories.user_address.user_address_csv_s3_repository import ( + UserAddressCsvS3Repository, +) +from utilities.aws_lambda.subtask_handler import subtask_handler + + +@subtask_handler() +def handler( + body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator +) -> dict[str, list[str]]: + """Validate the trigger body, build the splitter, dispatch children. + + Reads ``S3_BUCKET_NAME`` and ``ADDRESS2UPRN_QUEUE_URL`` from the + environment to construct the typed S3/SQS clients. The return value + lands in ``SubTask.outputs["result"]`` via the decorator. + """ + trigger = PostcodeSplitterTriggerBody.model_validate(body) + + bucket = os.environ["S3_BUCKET_NAME"] + queue_url = os.environ["ADDRESS2UPRN_QUEUE_URL"] + + # boto3.client is overloaded per-service in the installed stubs; cast + # to Any so the strict-mode checker treats it as opaque. 
+ boto3_client: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + boto_s3: Any = boto3_client("s3") + boto_sqs: Any = boto3_client("sqs") + + csv_client = CsvS3Client(boto_s3, bucket) + user_address_repo = UserAddressCsvS3Repository(csv_client, bucket) + queue_client = Address2UprnQueueClient(boto_sqs, queue_url) + + splitter = PostcodeSplitterOrchestrator( + task_orchestrator=task_orchestrator, + user_address_repo=user_address_repo, + queue_client=queue_client, + ) + + child_ids = splitter.split_and_dispatch( + parent_task_id=trigger.task_id, + parent_subtask_id=trigger.sub_task_id, + input_s3_uri=trigger.s3_uri, + ) + + return {"child_subtask_ids": [str(cid) for cid in child_ids]} diff --git a/applications/postcode_splitter/postcode_splitter_trigger_body.py b/applications/postcode_splitter/postcode_splitter_trigger_body.py new file mode 100644 index 00000000..bc983abc --- /dev/null +++ b/applications/postcode_splitter/postcode_splitter_trigger_body.py @@ -0,0 +1,32 @@ +"""Trigger payload model for the postcode splitter Lambda. + +The decorator (``@subtask_handler``) already validates ``task_id`` and +``sub_task_id`` via :class:`SubtaskTriggerBody`; this model layers on the +splitter-specific ``s3_uri`` field while keeping ``extra="allow"`` so any +upstream-passthrough keys (e.g. ``portfolio_id``) survive untouched. +""" + +from uuid import UUID + +from pydantic import BaseModel, ConfigDict + + +class PostcodeSplitterTriggerBody(BaseModel): + """Validated body for the postcode splitter Lambda. + + Attributes: + task_id: Parent ``Task`` id; used as the ``task_id`` input on each + child ``SubTask`` and as the ``parent_task_id`` on the fan-out + SQS messages. + sub_task_id: The splitter's own ``SubTask`` id; used as the path + segment under ``ara_postcode_splitter_batches/{task_id}/{...}`` + so per-invocation outputs cannot collide. + s3_uri: ``s3://bucket/key`` URI of the uploaded address CSV the + splitter must read. 
+ """ + + model_config = ConfigDict(extra="allow") + + task_id: UUID + sub_task_id: UUID + s3_uri: str diff --git a/applications/postcode_splitter/requirements.txt b/applications/postcode_splitter/requirements.txt new file mode 100644 index 00000000..6a85a255 --- /dev/null +++ b/applications/postcode_splitter/requirements.txt @@ -0,0 +1,4 @@ +boto3 +pydantic +sqlmodel +psycopg2-binary diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py new file mode 100644 index 00000000..6afa2538 --- /dev/null +++ b/orchestration/postcode_splitter_orchestrator.py @@ -0,0 +1,89 @@ +"""Use-case orchestrator for the postcode splitter Lambda. + +Wires the slice-1 domain (``iter_postcode_grouped_batches``), the slice-3 +``UserAddressRepository``, the slice-2 ``Address2UprnQueueClient``, and the +slice-4 ``TaskOrchestrator.create_child_subtask`` primitive together. + +``split_and_dispatch`` loads the input batch, groups it into per-postcode +chunks, writes each chunk back to S3 under a deterministic prefix, creates a +WAITING child ``SubTask`` for it, and publishes the address-to-UPRN fan-out +message that downstream consumers pick up. +""" + +from __future__ import annotations + +from uuid import UUID + +from infrastructure.address2uprn_queue_client import Address2UprnQueueClient +from orchestration.task_orchestrator import TaskOrchestrator +from domain.addresses.postcode_batching import iter_postcode_grouped_batches +from repositories.user_address.user_address_repository import UserAddressRepository + + +class PostcodeSplitterOrchestrator: + """Split an uploaded address batch into postcode-grouped child SubTasks. + + The orchestrator owns the algorithm; the IO collaborators + (:class:`UserAddressRepository`, :class:`Address2UprnQueueClient`) and + the :class:`TaskOrchestrator` lifecycle primitive are injected so the + same wiring can be exercised against moto/SQLite in tests and against + real AWS in the Lambda entrypoint. 
+ """ + + def __init__( + self, + task_orchestrator: TaskOrchestrator, + user_address_repo: UserAddressRepository, + queue_client: Address2UprnQueueClient, + max_batch_size: int = 500, + ) -> None: + self._task_orchestrator = task_orchestrator + self._user_address_repo = user_address_repo + self._queue_client = queue_client + self._max_batch_size = max_batch_size + + def split_and_dispatch( + self, + *, + parent_task_id: UUID, + parent_subtask_id: UUID, + input_s3_uri: str, + ) -> list[UUID]: + """Split ``input_s3_uri`` into postcode batches and dispatch each. + + For each yielded batch: + + 1. Persist it under + ``ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}``. + 2. Create a WAITING child ``SubTask`` with + ``inputs={"task_id": str(parent_task_id), "s3_uri": batch_uri}``. + 3. Publish an ``address2UPRN`` SQS message referencing the new child. + + Returns: + The list of child ``SubTask`` ids, in dispatch order. + """ + addresses = self._user_address_repo.load_batch(input_s3_uri) + path_prefix = ( + f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}" + ) + + child_ids: list[UUID] = [] + for batch in iter_postcode_grouped_batches( + addresses, max_batch_size=self._max_batch_size + ): + batch_uri = self._user_address_repo.save_batch(batch, path_prefix) + child = self._task_orchestrator.create_child_subtask( + parent_task_id, + inputs={ + "task_id": str(parent_task_id), + "s3_uri": batch_uri, + }, + ) + self._queue_client.publish( + parent_task_id=parent_task_id, + child_subtask_id=child.id, + s3_uri=batch_uri, + ) + child_ids.append(child.id) + + return child_ids diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py new file mode 100644 index 00000000..57bd2133 --- /dev/null +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -0,0 +1,298 @@ +"""Integration test: PostcodeSplitterOrchestrator wired end-to-end. 
+ +Combines moto S3 + moto SQS + an in-memory SQLite session for the +``TaskOrchestrator`` so the full slice-6 wiring is exercised through real +infrastructure adapters (not mocks). The fixture CSV spans three postcodes +with one oversize group, which forces both the buffer-flush-then-oversize +branch and the final-flush branch of +``iter_postcode_grouped_batches`` — three batches in total. +""" + +from __future__ import annotations + +import json +import os +from collections.abc import Iterator +from dataclasses import dataclass +from typing import Any, cast + +import boto3 +import pytest +from moto import mock_aws +from sqlmodel import Session, SQLModel, create_engine + +from infrastructure.address2uprn_queue_client import Address2UprnQueueClient +from infrastructure.csv_s3_client import CsvS3Client +from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator +from orchestration.task_orchestrator import TaskOrchestrator +from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository +from repositories.tasks.task_postgres_repository import TaskPostgresRepository +from repositories.user_address.user_address_csv_s3_repository import ( + UserAddressCsvS3Repository, +) + +BUCKET = "splitter-bucket" +REGION = "us-east-1" + + +def _make_boto_client(service_name: str) -> Any: + factory: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + return factory(service_name, region_name=REGION) + + +@pytest.fixture(autouse=True) +def _aws_creds() -> Iterator[None]: # pyright: ignore[reportUnusedFunction] + keys = ( + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", + "AWS_DEFAULT_REGION", + ) + prev: dict[str, Any] = {k: os.environ.get(k) for k in keys} + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = REGION + try: + yield + finally: + for k, v 
in prev.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v + + +@dataclass +class Harness: + splitter: PostcodeSplitterOrchestrator + task_orchestrator: TaskOrchestrator + subtasks: SubTaskPostgresRepository + csv_client: CsvS3Client + boto_sqs: Any + queue_url: str + repo: UserAddressCsvS3Repository + + +@pytest.fixture +def harness() -> Iterator[Harness]: + with mock_aws(): + # Infra: S3 + SQS + boto_s3 = _make_boto_client("s3") + boto_s3.create_bucket(Bucket=BUCKET) + boto_sqs = _make_boto_client("sqs") + queue: dict[str, Any] = boto_sqs.create_queue(QueueName="address2uprn-queue") + queue_url = cast(str, queue["QueueUrl"]) + + csv_client = CsvS3Client(boto_s3, BUCKET) + repo = UserAddressCsvS3Repository(csv_client, BUCKET) + queue_client = Address2UprnQueueClient(boto_sqs, queue_url) + + # DB: in-memory SQLite TaskOrchestrator + engine = create_engine("sqlite://") + SQLModel.metadata.create_all(engine) + with Session(engine) as session: + task_repo = TaskPostgresRepository(session=session) + subtask_repo = SubTaskPostgresRepository(session=session) + task_orchestrator = TaskOrchestrator( + task_repo=task_repo, subtask_repo=subtask_repo + ) + + splitter = PostcodeSplitterOrchestrator( + task_orchestrator=task_orchestrator, + user_address_repo=repo, + queue_client=queue_client, + max_batch_size=3, + ) + + yield Harness( + splitter=splitter, + task_orchestrator=task_orchestrator, + subtasks=subtask_repo, + csv_client=csv_client, + boto_sqs=boto_sqs, + queue_url=queue_url, + repo=repo, + ) + + +def _upload_fixture_csv(csv_client: CsvS3Client) -> str: + # Three postcode groups: + # AA1 1AA × 2 (within cap) + # BB2 2BB × 4 (oversize: > max_batch_size=3) + # CC3 3CC × 1 (final flush) + # Expected batching with cap=3 and the algorithm in + # ``iter_postcode_grouped_batches``: + # batch 1: [AA1 1AA × 2] (flushed because oversize follows) + # batch 2: [BB2 2BB × 4] (oversize own batch) + # batch 3: [CC3 3CC × 1] (final flush) + rows: 
list[dict[str, str]] = [] + rows.extend( + { + "Address 1": f"{i} High St", + "Address 2": "", + "Address 3": "", + "Postcode": "AA1 1AA", + "Internal Reference": f"AA-{i}", + } + for i in range(1, 3) + ) + rows.extend( + { + "Address 1": f"{i} Long Road", + "Address 2": "", + "Address 3": "", + "Postcode": "BB2 2BB", + "Internal Reference": f"BB-{i}", + } + for i in range(1, 5) + ) + rows.append( + { + "Address 1": "1 Final Way", + "Address 2": "", + "Address 3": "", + "Postcode": "CC3 3CC", + "Internal Reference": "CC-1", + } + ) + return csv_client.save_rows(rows, "uploads/input.csv") + + +def _drain_queue(boto_sqs: Any, queue_url: str) -> list[dict[str, Any]]: + bodies: list[dict[str, Any]] = [] + while True: + received: dict[str, Any] = boto_sqs.receive_message( + QueueUrl=queue_url, MaxNumberOfMessages=10, WaitTimeSeconds=0 + ) + messages = cast(list[dict[str, Any]], received.get("Messages", [])) + if not messages: + break + for message in messages: + bodies.append(cast(dict[str, Any], json.loads(message["Body"]))) + boto_sqs.delete_message( + QueueUrl=queue_url, ReceiptHandle=message["ReceiptHandle"] + ) + return bodies + + +def test_split_and_dispatch_creates_three_children_for_fixture( + harness: Harness, +) -> None: + parent_task, parent_subtask = ( + harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" + ) + ) + input_uri = _upload_fixture_csv(harness.csv_client) + + child_ids = harness.splitter.split_and_dispatch( + parent_task_id=parent_task.id, + parent_subtask_id=parent_subtask.id, + input_s3_uri=input_uri, + ) + + assert len(child_ids) == 3 + # All child ids are unique and persisted as WAITING children of the + # parent task. 
+ assert len(set(child_ids)) == 3 + for cid in child_ids: + child = harness.subtasks.get(cid) + assert child.task_id == parent_task.id + + +def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri( + harness: Harness, +) -> None: + parent_task, parent_subtask = ( + harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" + ) + ) + input_uri = _upload_fixture_csv(harness.csv_client) + + child_ids = harness.splitter.split_and_dispatch( + parent_task_id=parent_task.id, + parent_subtask_id=parent_subtask.id, + input_s3_uri=input_uri, + ) + + for cid in child_ids: + child = harness.subtasks.get(cid) + assert child.inputs is not None + assert child.inputs["task_id"] == str(parent_task.id) + batch_uri = child.inputs["s3_uri"] + assert isinstance(batch_uri, str) + prefix = ( + f"s3://{BUCKET}/ara_postcode_splitter_batches/" + f"{parent_task.id}/{parent_subtask.id}/" + ) + assert batch_uri.startswith(prefix) + assert batch_uri.endswith(".csv") + + +def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids( + harness: Harness, +) -> None: + parent_task, parent_subtask = ( + harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" + ) + ) + input_uri = _upload_fixture_csv(harness.csv_client) + + child_ids = harness.splitter.split_and_dispatch( + parent_task_id=parent_task.id, + parent_subtask_id=parent_subtask.id, + input_s3_uri=input_uri, + ) + + bodies = _drain_queue(harness.boto_sqs, harness.queue_url) + assert len(bodies) == len(child_ids) + + # Match queue messages against persisted child inputs by child_subtask_id; + # the message body's task_id/s3_uri must agree with the SubTask inputs. 
+ bodies_by_child = {body["sub_task_id"]: body for body in bodies} + assert set(bodies_by_child.keys()) == {str(cid) for cid in child_ids} + for cid in child_ids: + child = harness.subtasks.get(cid) + body = bodies_by_child[str(cid)] + assert child.inputs is not None + assert body == { + "task_id": str(parent_task.id), + "sub_task_id": str(cid), + "s3_uri": child.inputs["s3_uri"], + } + + +def test_split_and_dispatch_returns_child_ids_in_dispatch_order( + harness: Harness, +) -> None: + parent_task, parent_subtask = ( + harness.task_orchestrator.create_task_with_subtask( + task_source="manual:postcode-splitter-int" + ) + ) + input_uri = _upload_fixture_csv(harness.csv_client) + + child_ids = harness.splitter.split_and_dispatch( + parent_task_id=parent_task.id, + parent_subtask_id=parent_subtask.id, + input_s3_uri=input_uri, + ) + + # Re-load each child's saved batch and inspect the postcode column to + # confirm the dispatch order matches the postcode-batching algorithm: + # AA-batch first, BB oversize batch second, CC final-flush third. 
+ postcodes_per_batch: list[set[str]] = [] + for cid in child_ids: + child = harness.subtasks.get(cid) + assert child.inputs is not None + rows = harness.csv_client.read_rows(child.inputs["s3_uri"]) + postcodes_per_batch.append({row["postcode"] for row in rows}) + + assert postcodes_per_batch == [ + {"AA11AA"}, + {"BB22BB"}, + {"CC33CC"}, + ] From 914a8ed51e13bdd1cb644de4972d0d4951269adc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 11:07:40 +0000 Subject: [PATCH 76/91] postcode splliter working e2e --- .gitignore | 1 + applications/postcode_splitter/Dockerfile | 13 ++ .../local_handler/.env.local.example | 34 +++++ .../local_handler/docker-compose.yml | 9 ++ .../local_handler/invoke_local_lambda.py | 37 +++++ .../local_handler/run_local.sh | 12 ++ backend/address2UPRN/handler/requirements.txt | 3 +- .../terraform/lambda/postcodeSplitter/main.tf | 14 -- domain/addresses/user_address.py | 16 ++- infrastructure/csv_s3_client.py | 2 +- infrastructure/s3_uri.py | 43 ++++++ .../user_address_csv_s3_repository.py | 57 +++++--- tests/domain/addresses/test_user_address.py | 26 ++++ tests/infrastructure/test_s3_uri.py | 32 +++++ .../test_postcode_splitter_orchestrator.py | 12 +- .../test_user_address_csv_s3_repository.py | 127 +++++++++++------- .../aws_lambda/test_subtask_handler.py | 111 +++++++++++++++ utilities/aws_lambda/subtask_handler.py | 67 ++++++++- 18 files changed, 523 insertions(+), 93 deletions(-) create mode 100644 applications/postcode_splitter/local_handler/.env.local.example create mode 100644 applications/postcode_splitter/local_handler/docker-compose.yml create mode 100755 applications/postcode_splitter/local_handler/invoke_local_lambda.py create mode 100755 applications/postcode_splitter/local_handler/run_local.sh create mode 100644 infrastructure/s3_uri.py create mode 100644 tests/infrastructure/test_s3_uri.py diff --git a/.gitignore b/.gitignore index 888d527a..9e5df0c7 100644 --- a/.gitignore +++ b/.gitignore @@ -121,6 +121,7 @@ 
celerybeat.pid # Environments .env +.env.local .venv env/ venv/ diff --git a/applications/postcode_splitter/Dockerfile b/applications/postcode_splitter/Dockerfile index 578ee7a7..aea1f914 100644 --- a/applications/postcode_splitter/Dockerfile +++ b/applications/postcode_splitter/Dockerfile @@ -1,5 +1,18 @@ FROM public.ecr.aws/lambda/python:3.11 +# Postgres host/port/database are baked into the image at build time from +# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets), +# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the +# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT +# baked in -- Terraform injects those as Lambda env vars from Secrets Manager. +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV POSTGRES_HOST=${DEV_DB_HOST} +ENV POSTGRES_PORT=${DEV_DB_PORT} +ENV POSTGRES_DATABASE=${DEV_DB_NAME} + WORKDIR /var/task COPY applications/postcode_splitter/requirements.txt . diff --git a/applications/postcode_splitter/local_handler/.env.local.example b/applications/postcode_splitter/local_handler/.env.local.example new file mode 100644 index 00000000..28fa8390 --- /dev/null +++ b/applications/postcode_splitter/local_handler/.env.local.example @@ -0,0 +1,34 @@ +# Local-test environment for the postcode_splitter Lambda. +# +# cp .env.local.example .env.local then fill in the values below. +# +# .env.local is gitignored. The container hits REAL AWS and a REAL Postgres, +# so every value here points at infrastructure that actually exists. +# +# NOTE: the new DDD code uses different env var names than the repo root +# .env. The mapping (root .env name -> var here) is given per section. +# Keep comments on their own lines — docker-compose's env_file parser folds a +# trailing "# ..." into the value. 
+ +# --- Postgres (orchestration/default_orchestrator -> PostgresConfig.from_env) --- +# POSTGRES_HOST <- DB_HOST, PORT <- DB_PORT, USERNAME <- DB_USERNAME, +# PASSWORD <- DB_PASSWORD, DATABASE <- DB_NAME. +POSTGRES_HOST= +POSTGRES_PORT=5432 +POSTGRES_USERNAME= +POSTGRES_PASSWORD= +POSTGRES_DATABASE= +# POSTGRES_DRIVER=psycopg2 (optional; defaults to psycopg2) + +# --- Handler config (applications/postcode_splitter/handler.py) --- +# S3_BUCKET_NAME: bucket holding the input address CSV (root .env: DATA_BUCKET). +# ADDRESS2UPRN_QUEUE_URL: SQS queue the splitter fans batches out to; not in +# the root .env (Terraform sets it in prod). +S3_BUCKET_NAME= +ADDRESS2UPRN_QUEUE_URL= + +# --- AWS credentials for boto3 (S3 + SQS clients) --- +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= +AWS_DEFAULT_REGION=eu-west-2 +# AWS_SESSION_TOKEN= (only if using temporary/SSO credentials) diff --git a/applications/postcode_splitter/local_handler/docker-compose.yml b/applications/postcode_splitter/local_handler/docker-compose.yml new file mode 100644 index 00000000..68af1c40 --- /dev/null +++ b/applications/postcode_splitter/local_handler/docker-compose.yml @@ -0,0 +1,9 @@ +services: + postcode-splitter: + build: + context: ../../../ + dockerfile: applications/postcode_splitter/Dockerfile + ports: + - "9001:8080" + env_file: + - .env.local diff --git a/applications/postcode_splitter/local_handler/invoke_local_lambda.py b/applications/postcode_splitter/local_handler/invoke_local_lambda.py new file mode 100755 index 00000000..c0ca89ec --- /dev/null +++ b/applications/postcode_splitter/local_handler/invoke_local_lambda.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +"""POST a single SQS-shaped event at the locally-running splitter Lambda. + +The container built by docker-compose runs the AWS Lambda Runtime Interface +Emulator, which accepts invocations on the URL below. 
Replace the three +placeholder values with a real parent Task id, the splitter's own SubTask id +(both must already exist in the Postgres pointed at by .env.local), and the +s3://... URI of an uploaded address CSV. +""" + +import json +import requests + +HOST = "localhost" +PORT = "9001" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = { + "Records": [ + { + "body": json.dumps( + { + "task_id": "f4b3332f-c0cc-481f-96a5-d39860a647cf", + "sub_task_id": "14c042de-40c4-473b-8cd8-72c983a94a8d", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv", + } + ) + } + ] +} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) diff --git a/applications/postcode_splitter/local_handler/run_local.sh b/applications/postcode_splitter/local_handler/run_local.sh new file mode 100755 index 00000000..345b60ee --- /dev/null +++ b/applications/postcode_splitter/local_handler/run_local.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail +cd "$(dirname "$0")" + +if [ ! -f .env.local ]; then + cp .env.local.example .env.local + echo "Created .env.local from the template — fill it in, then re-run." 
>&2 + exit 1 +fi + +docker compose build --no-cache +docker compose up --force-recreate diff --git a/backend/address2UPRN/handler/requirements.txt b/backend/address2UPRN/handler/requirements.txt index 6ef41b2d..02aaefba 100644 --- a/backend/address2UPRN/handler/requirements.txt +++ b/backend/address2UPRN/handler/requirements.txt @@ -8,4 +8,5 @@ boto3==1.35.44 sqlmodel sqlalchemy==2.0.36 psycopg2-binary==2.9.10 -pydantic-settings==2.6.0 \ No newline at end of file +pydantic-settings==2.6.0 +httpx \ No newline at end of file diff --git a/deployment/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf index 94c5cd4e..325f7dc7 100644 --- a/deployment/terraform/lambda/postcodeSplitter/main.tf +++ b/deployment/terraform/lambda/postcodeSplitter/main.tf @@ -40,20 +40,6 @@ module "lambda" { LOG_LEVEL = "info" DB_USERNAME = local.db_credentials.db_assessment_model_username DB_PASSWORD = local.db_credentials.db_assessment_model_password - GOOGLE_SOLAR_API_KEY = "test" - SAP_PREDICTIONS_BUCKET = "test" - CARBON_PREDICTIONS_BUCKET = "test" - HEAT_PREDICTIONS_BUCKET = "test" - HEATING_KWH_PREDICTIONS_BUCKET = "test" - HOTWATER_KWH_PREDICTIONS_BUCKET = "test" - API_KEY = "test" - ENVIRONMENT = "test" - SECRET_KEY = "test" - PLAN_TRIGGER_BUCKET = "test" - DATA_BUCKET = "test" - EPC_AUTH_TOKEN = "test" - ENGINE_SQS_URL = "test" - ENERGY_ASSESSMENTS_BUCKET = "test" ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name }, diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py index e48dfdec..120a3659 100644 --- a/domain/addresses/user_address.py +++ b/domain/addresses/user_address.py @@ -8,12 +8,17 @@ caller can construct an instance with an un-normalised postcode. 
from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from domain.postcodes.sanitise import sanitise_postcode +def _empty_source_row() -> dict[str, str]: + """Typed default factory for :attr:`UserAddress.source_row`.""" + return {} + + @dataclass(frozen=True) class UserAddress: """A user-supplied address paired with its canonical postcode. @@ -25,11 +30,20 @@ class UserAddress: :meth:`__post_init__`. internal_reference: Optional customer-side identifier preserved for traceability through the matching pipeline. + source_row: The complete original CSV row this address was parsed + from, column name -> cell value. The splitter is a pass-through + router: it groups rows by postcode but must not drop the other + columns the downstream address2uprn stage relies on, so the raw + row travels alongside the parsed fields. Excluded from equality + and hashing -- identity stays defined by the parsed fields above. """ user_address: str postcode: str internal_reference: Optional[str] = None + source_row: dict[str, str] = field( + default_factory=_empty_source_row, compare=False + ) def __post_init__(self) -> None: # Frozen dataclass: bypass the descriptor with object.__setattr__. diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py index 5163705b..0a576b81 100644 --- a/infrastructure/csv_s3_client.py +++ b/infrastructure/csv_s3_client.py @@ -2,7 +2,7 @@ import csv from io import StringIO from infrastructure.s3_client import S3Client -from utils.s3 import parse_s3_uri +from infrastructure.s3_uri import parse_s3_uri class CsvS3Client(S3Client): diff --git a/infrastructure/s3_uri.py b/infrastructure/s3_uri.py new file mode 100644 index 00000000..bf97100e --- /dev/null +++ b/infrastructure/s3_uri.py @@ -0,0 +1,43 @@ +"""Parse S3 URIs into ``(bucket, key)`` pairs. + +A pure-stdlib helper for the infrastructure layer. 
It deliberately pulls in +neither pandas, boto3, nor the legacy ``utils`` package, so slim Lambda images +that only need URI parsing do not drag the wider data stack along. + +Two input shapes are supported: + +* canonical S3 URIs --- ``s3://bucket/key`` +* AWS S3 console URLs --- ``https://.../s3/object/bucket?prefix=key`` +""" + +from urllib.parse import unquote + + +def parse_s3_uri(s3_uri: str) -> tuple[str, str]: + """Return the ``(bucket, key)`` pair addressed by ``s3_uri``. + + Raises: + ValueError: if ``s3_uri`` is neither a well-formed ``s3://`` URI nor + an AWS console URL carrying a ``prefix`` query parameter. + """ + if s3_uri.startswith("s3://"): + parts = s3_uri[len("s3://") :].split("/", 1) + if len(parts) < 2 or not parts[0] or not parts[1]: + raise ValueError("S3 URI must include both a bucket and a key") + return parts[0], parts[1] + + if "?" not in s3_uri: + raise ValueError(f"Not an s3:// URI and has no query string: {s3_uri!r}") + base, query = s3_uri.split("?", 1) + + if "/s3/object/" not in base: + raise ValueError(f"Console URL has no '/s3/object/' segment: {s3_uri!r}") + bucket = base.split("/s3/object/", 1)[1] + + params: dict[str, str] = {} + for item in query.split("&"): + if "=" in item: + name, value = item.split("=", 1) + params[name] = value + key = unquote(params.get("prefix", "")) + return bucket, key diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index be2baa13..7cd10bac 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -1,12 +1,16 @@ """CSV-on-S3 adapter for :class:`UserAddressRepository`. -Reads canonical upload CSVs (``Address 1``, ``Address 2``, ``Address 3``, -``Postcode``, ``Internal Reference``) and writes the splitter's compact -3-column form (``user_address``, ``postcode``, ``internal_reference``). 
+Reads upload CSVs that carry a ``postcode`` column (plus optional +``Address 1``/``Address 2``/``Address 3`` and ``Internal Reference``), and +writes batch CSVs that pass *every* original column through unchanged with +one column appended -- ``postcode_clean`` (uppercase, whitespace-stripped) -- +which the downstream address2uprn stage groups on. -The frontend pre-applies the user's column mapping at upload time, so this -adapter does NOT consult any ``BulkAddressUpload.column_mapping``: it always -expects the canonical column names listed above. +The splitter is a pass-through router: it must not reshape or drop columns, +because address2uprn has not been migrated and still consumes the legacy +splitter's full-row output. The frontend pre-applies the user's column +mapping at upload time, so this adapter does NOT consult any +``BulkAddressUpload.column_mapping``. """ from __future__ import annotations @@ -20,8 +24,9 @@ from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_repository import UserAddressRepository _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3") -_POSTCODE_COLUMN: str = "Postcode" +_POSTCODE_COLUMN: str = "postcode" _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference" +_POSTCODE_CLEAN_COLUMN: str = "postcode_clean" class UserAddressCsvS3Repository(UserAddressRepository): @@ -37,15 +42,27 @@ class UserAddressCsvS3Repository(UserAddressRepository): self._bucket = bucket def load_batch(self, s3_uri: str) -> list[UserAddress]: - """Load canonical upload CSV rows into :class:`UserAddress` objects. + """Load upload CSV rows into :class:`UserAddress` objects. - Concatenates ``Address 1``/``Address 2``/``Address 3`` with ``", "``, - skipping missing or empty parts, into ``user_address``. Falls back to - just ``Address 1`` when 2 and 3 are absent. Passes ``Internal Reference`` - through to :attr:`UserAddress.internal_reference` (``None`` when the - column is missing or empty). 
+ Each row's complete column set is preserved on + :attr:`UserAddress.source_row` so :meth:`save_batch` can pass it + through untouched. The parsed convenience fields are also populated: + ``Address 1``/``Address 2``/``Address 3`` are concatenated with + ``", "`` (skipping missing/empty parts) into ``user_address``, and + ``Internal Reference`` is threaded to + :attr:`UserAddress.internal_reference` (``None`` when missing/empty). + + Raises: + ValueError: if the CSV has rows but no ``postcode`` column -- + without it the splitter cannot group, and silently emitting + empty postcodes would corrupt every downstream batch. """ rows = self._csv_client.read_rows(s3_uri) + if rows and _POSTCODE_COLUMN not in rows[0]: + raise ValueError( + f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; " + f"columns present: {sorted(rows[0])}" + ) addresses: list[UserAddress] = [] for row in rows: parts = [ @@ -62,22 +79,24 @@ class UserAddressCsvS3Repository(UserAddressRepository): user_address=user_address, postcode=postcode, internal_reference=internal_reference, + source_row=row, ) ) return addresses def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: - """Write a 3-column CSV under a unique key beneath ``path_prefix``. + """Write a pass-through batch CSV under a unique key. + + Each output row is the address's original ``source_row`` with a + ``postcode_clean`` column appended (the canonical postcode the + downstream address2uprn stage groups on). No original column is + dropped or reshaped. The key is ``{path_prefix}/{ISO-8601 datetime}_{8-char uuid}.csv``. Returns the full ``s3://bucket/key`` URI. 
""" rows: list[dict[str, str]] = [ - { - "user_address": addr.user_address, - "postcode": addr.postcode, - "internal_reference": addr.internal_reference or "", - } + {**addr.source_row, _POSTCODE_CLEAN_COLUMN: addr.postcode} for addr in addresses ] filename = ( diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py index e722077d..4d8322da 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_user_address.py @@ -43,3 +43,29 @@ def test_user_address_equality_uses_sanitised_postcode() -> None: a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") b = UserAddress(user_address="1 The Street", postcode="SW1A1AA") assert a == b + + +def test_user_address_source_row_defaults_to_empty_dict() -> None: + addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + assert addr.source_row == {} + + +def test_user_address_carries_source_row() -> None: + row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} + addr = UserAddress( + user_address="1 The Street", postcode="SW1A 1AA", source_row=row + ) + assert addr.source_row == row + + +def test_user_address_equality_ignores_source_row() -> None: + # source_row is excluded from equality (and hashing): identity stays + # defined by the parsed fields, so two addresses parsed from rows with + # different incidental columns still compare equal. 
+ a = UserAddress( + user_address="1 The Street", postcode="SW1A1AA", source_row={"x": "1"} + ) + b = UserAddress( + user_address="1 The Street", postcode="SW1A1AA", source_row={"y": "2"} + ) + assert a == b diff --git a/tests/infrastructure/test_s3_uri.py b/tests/infrastructure/test_s3_uri.py new file mode 100644 index 00000000..896c5959 --- /dev/null +++ b/tests/infrastructure/test_s3_uri.py @@ -0,0 +1,32 @@ +import pytest + +from infrastructure.s3_uri import parse_s3_uri + + +def test_parses_simple_s3_uri() -> None: + assert parse_s3_uri("s3://my-bucket/file.csv") == ("my-bucket", "file.csv") + + +def test_parses_s3_uri_with_nested_key() -> None: + bucket, key = parse_s3_uri("s3://my-bucket/nested/path/to/file.csv") + assert (bucket, key) == ("my-bucket", "nested/path/to/file.csv") + + +def test_rejects_s3_uri_without_key() -> None: + with pytest.raises(ValueError, match="bucket and a key"): + parse_s3_uri("s3://my-bucket") + + +def test_rejects_s3_uri_with_empty_key() -> None: + with pytest.raises(ValueError, match="bucket and a key"): + parse_s3_uri("s3://my-bucket/") + + +def test_parses_console_url_prefix() -> None: + url = "https://eu-west-2.console.aws.amazon.com/s3/object/my-bucket?prefix=nested%2Ffile.csv" + assert parse_s3_uri(url) == ("my-bucket", "nested/file.csv") + + +def test_rejects_unparseable_string() -> None: + with pytest.raises(ValueError): + parse_s3_uri("not-a-uri-at-all") diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index 57bd2133..79c60974 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -132,7 +132,7 @@ def _upload_fixture_csv(csv_client: CsvS3Client) -> str: "Address 1": f"{i} High St", "Address 2": "", "Address 3": "", - "Postcode": "AA1 1AA", + "postcode": "AA1 1AA", "Internal Reference": f"AA-{i}", } for i in range(1, 3) @@ -142,7 +142,7 @@ def 
_upload_fixture_csv(csv_client: CsvS3Client) -> str: "Address 1": f"{i} Long Road", "Address 2": "", "Address 3": "", - "Postcode": "BB2 2BB", + "postcode": "BB2 2BB", "Internal Reference": f"BB-{i}", } for i in range(1, 5) @@ -152,7 +152,7 @@ def _upload_fixture_csv(csv_client: CsvS3Client) -> str: "Address 1": "1 Final Way", "Address 2": "", "Address 3": "", - "Postcode": "CC3 3CC", + "postcode": "CC3 3CC", "Internal Reference": "CC-1", } ) @@ -281,15 +281,15 @@ def test_split_and_dispatch_returns_child_ids_in_dispatch_order( input_s3_uri=input_uri, ) - # Re-load each child's saved batch and inspect the postcode column to - # confirm the dispatch order matches the postcode-batching algorithm: + # Re-load each child's saved batch and inspect the postcode_clean column + # to confirm the dispatch order matches the postcode-batching algorithm: # AA-batch first, BB oversize batch second, CC final-flush third. postcodes_per_batch: list[set[str]] = [] for cid in child_ids: child = harness.subtasks.get(cid) assert child.inputs is not None rows = harness.csv_client.read_rows(child.inputs["s3_uri"]) - postcodes_per_batch.append({row["postcode"] for row in rows}) + postcodes_per_batch.append({row["postcode_clean"] for row in rows}) assert postcodes_per_batch == [ {"AA11AA"}, diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py index ca9e8a57..48733b55 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -3,6 +3,7 @@ from collections.abc import Iterator import pytest from moto import mock_aws +from domain.addresses.user_address import UserAddress from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( UserAddressCsvS3Repository, @@ -27,7 +28,7 @@ def _upload_csv( return 
repo._csv_client.save_rows(rows, key) # pyright: ignore[reportPrivateUsage] -def test_load_batch_concatenates_three_address_lines( +def test_load_batch_parses_address_postcode_and_reference( repo: UserAddressCsvS3Repository, ) -> None: rows = [ @@ -35,7 +36,7 @@ def test_load_batch_concatenates_three_address_lines( "Address 1": "1 High Street", "Address 2": "Flat 2", "Address 3": "Townville", - "Postcode": "sw1a 1aa", + "postcode": "sw1a 1aa", "Internal Reference": "REF-001", } ] @@ -58,7 +59,7 @@ def test_load_batch_uses_only_address_1_when_others_missing( "Address 1": "10 Cardiff Road", "Address 2": "", "Address 3": "", - "Postcode": "CF10 1AA", + "postcode": "CF10 1AA", "Internal Reference": "REF-002", } ] @@ -80,7 +81,7 @@ def test_load_batch_handles_missing_internal_reference( "Address 1": "5 Park Lane", "Address 2": "", "Address 3": "", - "Postcode": "M1 1AA", + "postcode": "M1 1AA", "Internal Reference": "", } ] @@ -94,16 +95,67 @@ def test_load_batch_handles_missing_internal_reference( assert addresses[0].internal_reference is None +def test_load_batch_captures_full_source_row( + repo: UserAddressCsvS3Repository, +) -> None: + # A raw EPC-export-shaped row: the splitter must preserve every column, + # not just the ones it parses into UserAddress fields. 
+ row = { + "Asset Reference": "511", + "Address 1": "9 Abingdon Road Padiham Lancashire BB12 7BX", + "postcode": "BB12 7BX", + "Property Type": "House: End Terrace", + "SAP Score": "69", + } + uri = _upload_csv(repo, [row], "uploads/epc.csv") + + addresses = repo.load_batch(uri) + + assert addresses[0].source_row == row + + +def test_load_batch_raises_when_postcode_column_absent( + repo: UserAddressCsvS3Repository, +) -> None: + rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}] + uri = _upload_csv(repo, rows, "uploads/no-postcode.csv") + + with pytest.raises(ValueError, match="no 'postcode' column"): + repo.load_batch(uri) + + +def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( + repo: UserAddressCsvS3Repository, +) -> None: + row = { + "Asset Reference": "511", + "Address 1": "9 Abingdon Road Padiham Lancashire BB12 7BX", + "postcode": " BB12 7BX", + "Property Type": "House: End Terrace", + } + uri = _upload_csv(repo, [row], "uploads/epc.csv") + addresses = repo.load_batch(uri) + + saved_uri = repo.save_batch(addresses, "tasks/passthrough") + saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] + + assert len(saved_rows) == 1 + saved = saved_rows[0] + # Every original column survives, byte-for-byte. + for column, value in row.items(): + assert saved[column] == value + # Plus the one appended column the downstream address2uprn stage groups on. 
+ assert saved["postcode_clean"] == "BB127BX" + + def test_save_batch_returns_uri_under_path_prefix( repo: UserAddressCsvS3Repository, ) -> None: - from domain.addresses.user_address import UserAddress - addresses = [ UserAddress( - user_address="1 High Street, Flat 2, Townville", + user_address="1 High Street", postcode="SW1A 1AA", - internal_reference="REF-001", + source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, ), ] @@ -113,59 +165,42 @@ def test_save_batch_returns_uri_under_path_prefix( assert uri.endswith(".csv") -def test_save_then_reload_round_trip_preserves_values( +def test_save_then_reload_round_trip_preserves_columns( repo: UserAddressCsvS3Repository, ) -> None: - from domain.addresses.user_address import UserAddress - - # save_batch writes the splitter's compact schema - # (user_address/postcode/internal_reference); load_batch reads the - # canonical upload schema. To round-trip through the repo we re-upload - # the saved CSV under the upload schema's column names. - original = [ - UserAddress( - user_address="1 High Street", - postcode="SW1A 1AA", - internal_reference="REF-001", - ), - UserAddress( - user_address="2 Low Street", - postcode="XY9 8ZW", - internal_reference=None, - ), - ] - - saved_uri = repo.save_batch(original, "tasks/round-trip") - - # Re-shape the saved CSV into the canonical upload schema for reload. 
- saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] - upload_rows: list[dict[str, str]] = [ + rows = [ { - "Address 1": row["user_address"], - "Address 2": "", - "Address 3": "", - "Postcode": row["postcode"], - "Internal Reference": row["internal_reference"], - } - for row in saved_rows + "Address 1": "1 High Street", + "postcode": "SW1A 1AA", + "Internal Reference": "REF-001", + }, + { + "Address 1": "2 Low Street", + "postcode": "XY9 8ZW", + "Internal Reference": "", + }, ] - upload_uri = _upload_csv(repo, upload_rows, "uploads/round-trip.csv") + uri = _upload_csv(repo, rows, "uploads/round-trip.csv") + addresses = repo.load_batch(uri) - reloaded = repo.load_batch(upload_uri) + saved_uri = repo.save_batch(addresses, "tasks/round-trip") + saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] - assert reloaded == original + # Original columns come back verbatim; postcode_clean is the only addition. + assert [ + {k: v for k, v in r.items() if k != "postcode_clean"} for r in saved_rows + ] == rows + assert [r["postcode_clean"] for r in saved_rows] == ["SW1A1AA", "XY98ZW"] def test_save_batch_uses_unique_filename_per_call( repo: UserAddressCsvS3Repository, ) -> None: - from domain.addresses.user_address import UserAddress - addresses = [ UserAddress( user_address="1 High Street", postcode="SW1A 1AA", - internal_reference="REF-001", + source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, ), ] diff --git a/tests/utilities/aws_lambda/test_subtask_handler.py b/tests/utilities/aws_lambda/test_subtask_handler.py index 426b250f..771a49f8 100644 --- a/tests/utilities/aws_lambda/test_subtask_handler.py +++ b/tests/utilities/aws_lambda/test_subtask_handler.py @@ -6,6 +6,7 @@ to the wrapped function — so the handler can compose its own use-case orchestrator that shares the session. 
""" +import logging from collections.abc import Generator, Iterator from contextlib import contextmanager from dataclasses import dataclass @@ -13,6 +14,8 @@ from typing import Any from uuid import UUID import pytest + +_LOGGER_NAME = "utilities.aws_lambda.subtask_handler" from sqlmodel import Session, SQLModel, create_engine from domain.tasks.subtasks import SubTaskStatus @@ -142,3 +145,111 @@ def test_subtask_handler_injected_orchestrator_can_create_child_subtask( persisted_child = harness.subtasks.get(child_ids[0]) assert persisted_child.task_id == task.id assert persisted_child.status is SubTaskStatus.WAITING + + +def test_subtask_handler_logs_subtask_lifecycle_on_success( + harness: Harness, caplog: pytest.LogCaptureFixture +) -> None: + """Start and completion are logged at INFO so a successful invocation + leaves a CloudWatch breadcrumb (not just the Lambda runtime lines).""" + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + @subtask_handler(orchestrator_cm=harness.factory) + def handler( + body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator + ) -> None: + return None + + with caplog.at_level(logging.INFO, logger=_LOGGER_NAME): + handler(_direct_event(task.id, subtask.id), context=None) + + assert f"Running subtask {subtask.id}" in caplog.text + assert f"Subtask {subtask.id} completed" in caplog.text + + +def test_subtask_handler_logs_exception_on_failure( + harness: Harness, caplog: pytest.LogCaptureFixture +) -> None: + """A failing subtask is logged at ERROR with the traceback attached, + before the exception propagates for the Lambda runtime to surface.""" + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + @subtask_handler(orchestrator_cm=harness.factory) + def handler( + body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator + ) -> None: + raise RuntimeError("boom") + + with caplog.at_level(logging.INFO, logger=_LOGGER_NAME): + 
with pytest.raises(RuntimeError, match="boom"): + handler(_direct_event(task.id, subtask.id), context=None) + + failures = [r for r in caplog.records if r.levelno == logging.ERROR] + assert any( + f"Subtask {subtask.id} failed" in r.getMessage() for r in failures + ) + assert any(r.exc_info is not None for r in failures) + + +def test_subtask_handler_records_cloudwatch_url_on_subtask( + harness: Harness, monkeypatch: pytest.MonkeyPatch +) -> None: + """With the AWS Lambda runtime's log env vars present, a CloudWatch deep + link is built and persisted on the SubTask.""" + monkeypatch.setenv("AWS_REGION", "eu-west-2") + monkeypatch.setenv( + "AWS_LAMBDA_LOG_GROUP_NAME", "/aws/lambda/postcode-splitter" + ) + monkeypatch.setenv( + "AWS_LAMBDA_LOG_STREAM_NAME", "2026/05/20/[$LATEST]abc123" + ) + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + @subtask_handler(orchestrator_cm=harness.factory) + def handler( + body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator + ) -> None: + return None + + handler(_direct_event(task.id, subtask.id), context=None) + + saved_url = harness.subtasks.get(subtask.id).cloud_logs_url + assert saved_url is not None + assert saved_url.startswith( + "https://eu-west-2.console.aws.amazon.com/cloudwatch/home" + ) + # Log group / stream are console-encoded ("/" -> "$252F"). + assert "$252Faws$252Flambda$252Fpostcode-splitter" in saved_url + assert "$255B$2524LATEST$255D" in saved_url + + +def test_subtask_handler_leaves_cloudwatch_url_unset_outside_lambda( + harness: Harness, monkeypatch: pytest.MonkeyPatch +) -> None: + """Outside a real Lambda (e.g. 
the local RIE) the runtime log env vars + are absent, so cloud_logs_url is left unset rather than storing junk.""" + for var in ( + "AWS_REGION", + "AWS_LAMBDA_LOG_GROUP_NAME", + "AWS_LAMBDA_LOG_STREAM_NAME", + ): + monkeypatch.delenv(var, raising=False) + task, subtask = harness.orchestrator.create_task_with_subtask( + task_source="manual:test" + ) + + @subtask_handler(orchestrator_cm=harness.factory) + def handler( + body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator + ) -> None: + return None + + handler(_direct_event(task.id, subtask.id), context=None) + + assert harness.subtasks.get(subtask.id).cloud_logs_url is None diff --git a/utilities/aws_lambda/subtask_handler.py b/utilities/aws_lambda/subtask_handler.py index 5ad5f6e1..40f116ad 100644 --- a/utilities/aws_lambda/subtask_handler.py +++ b/utilities/aws_lambda/subtask_handler.py @@ -1,18 +1,32 @@ """@subtask_handler decorator for Lambdas that operate on existing SubTasks. Translates an AWS Lambda invocation (SQS-shaped or direct) into -TaskOrchestrator.run_subtask(...) calls. +TaskOrchestrator.run_subtask(...) calls, emitting an INFO log line for each +subtask's start and completion and a logged exception on failure. Those lines +land in CloudWatch via the Lambda runtime's stdout/stderr capture. + +Each subtask also records ``cloud_logs_url`` -- a deep link to this +invocation's CloudWatch log stream -- so an operator can jump from a SubTask +row straight to its logs. It is built from the environment variables the AWS +Lambda runtime sets, so it is populated only on real Lambda invocations and +left unset under the local RIE (which does not export them). 
""" import json +import logging +import os from contextlib import AbstractContextManager from functools import wraps from typing import Any, Callable, Optional, cast +from urllib.parse import quote from utilities.aws_lambda.default_orchestrator import default_orchestrator from utilities.aws_lambda.subtask_trigger_body import SubtaskTriggerBody from orchestration.task_orchestrator import TaskOrchestrator +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + OrchestratorCM = Callable[[], AbstractContextManager[TaskOrchestrator]] @@ -33,14 +47,26 @@ def subtask_handler( def decorator(func: Callable[..., Any]) -> Callable[..., Any]: @wraps(func) def wrapper(event: dict[str, Any], context: Any) -> None: + cloud_logs_url = _cloudwatch_url() with factory() as orchestrator: for record in _records(event): body = _parse_body(record) trigger = SubtaskTriggerBody.model_validate(body) - orchestrator.run_subtask( - trigger.sub_task_id, - work=lambda body=body, o=orchestrator: func(body, context, o), - ) + logger.info("Running subtask %s", trigger.sub_task_id) + try: + orchestrator.run_subtask( + trigger.sub_task_id, + work=lambda body=body, o=orchestrator: func( + body, context, o + ), + cloud_logs_url=cloud_logs_url, + ) + except Exception: + logger.exception( + "Subtask %s failed", trigger.sub_task_id + ) + raise + logger.info("Subtask %s completed", trigger.sub_task_id) return wrapper @@ -65,3 +91,34 @@ def _records(event: dict[str, Any]) -> list[dict[str, Any]]: if isinstance(raw_records, list): return [r for r in cast(list[Any], raw_records) if isinstance(r, dict)] return [event] + + +def _console_encode(value: str) -> str: + """Encode a value for a CloudWatch console deep link. + + The console expects URL-encoding with the percent signs themselves + re-encoded as ``$25`` -- e.g. ``/`` becomes ``%2F`` becomes ``$252F``. 
+ """ + return quote(value, safe="").replace("%", "$25") + + +def _cloudwatch_url() -> Optional[str]: + """Build a CloudWatch console URL for this invocation's log stream. + + Sourced entirely from the environment variables the AWS Lambda runtime + sets -- ``AWS_REGION``, ``AWS_LAMBDA_LOG_GROUP_NAME`` and + ``AWS_LAMBDA_LOG_STREAM_NAME``. Returns None when any is absent, which is + the case outside a real Lambda (the local RIE does not export them) -- so + ``SubTask.cloud_logs_url`` is left unset rather than storing a link that + points nowhere. + """ + region = os.environ.get("AWS_REGION") + log_group = os.environ.get("AWS_LAMBDA_LOG_GROUP_NAME") + log_stream = os.environ.get("AWS_LAMBDA_LOG_STREAM_NAME") + if not (region and log_group and log_stream): + return None + return ( + f"https://{region}.console.aws.amazon.com/cloudwatch/home" + f"?region={region}#logsV2:log-groups/log-group/" + f"{_console_encode(log_group)}/log-events/{_console_encode(log_stream)}" + ) From 8bb90a5aa5beb495de799c481c2faa7899e6c5de Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 12:57:03 +0000 Subject: [PATCH 77/91] sanitisation of postcode --- backend/bulk_address2uprn_combiner/main.py | 14 +----- backend/ordnanceSurvey/main.py | 12 +---- domain/addresses/postcode_batching.py | 8 ++-- domain/addresses/user_address.py | 20 ++++---- domain/postcode.py | 40 ++++++++++++++++ domain/postcodes/__init__.py | 0 domain/postcodes/sanitise.py | 23 --------- .../user_address_csv_s3_repository.py | 5 +- .../user_address/user_address_repository.py | 4 +- .../addresses/test_postcode_batching.py | 17 ++++--- tests/domain/addresses/test_user_address.py | 48 +++++++++++-------- tests/domain/postcodes/__init__.py | 0 tests/domain/postcodes/test_sanitise.py | 28 ----------- tests/domain/test_postcode.py | 48 +++++++++++++++++++ .../test_user_address_csv_s3_repository.py | 11 +++-- 15 files changed, 153 insertions(+), 125 deletions(-) create mode 100644 domain/postcode.py delete mode 
100644 domain/postcodes/__init__.py delete mode 100644 domain/postcodes/sanitise.py delete mode 100644 tests/domain/postcodes/__init__.py delete mode 100644 tests/domain/postcodes/test_sanitise.py create mode 100644 tests/domain/test_postcode.py diff --git a/backend/bulk_address2uprn_combiner/main.py b/backend/bulk_address2uprn_combiner/main.py index 37136e52..44f0b3f9 100644 --- a/backend/bulk_address2uprn_combiner/main.py +++ b/backend/bulk_address2uprn_combiner/main.py @@ -2,7 +2,7 @@ import os import boto3 import pandas as pd from io import BytesIO -from typing import Any, Optional +from typing import Any from uuid import UUID from datetime import datetime, timezone @@ -12,7 +12,6 @@ from backend.app.db.functions.bulk_address_uploads_functions import ( set_combined_output_s3_uri, set_combining_status, ) -from orchestration.task_orchestrator import TaskOrchestrator logger = setup_logger() @@ -36,16 +35,7 @@ def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame: @subtask_handler() -def handler( - body: dict[str, Any], - context: Any, - orchestrator: Optional[TaskOrchestrator] = None, -) -> str: - # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler - # decorator; unused here but accepted so the contract is uniform across - # callers (see issue #1103). 
- del orchestrator - +def handler(body: dict[str, Any], context: Any) -> str: task_id_str: str = body.get("task_id", "") if not task_id_str: diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index 18c4e2f2..6e82b468 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -16,7 +16,6 @@ from backend.ordnanceSurvey.helpers import ( os_places_results_to_dataframe, ) from backend.app.config import get_settings -from orchestration.task_orchestrator import TaskOrchestrator from sqlalchemy import select from datetime import datetime import uuid @@ -106,16 +105,7 @@ def save_results_to_s3( @subtask_handler() # This assumes task_id and subtask_id is defined in event.Records.body -def handler( - body: dict[str, Any], - context: Any, - orchestrator: Optional[TaskOrchestrator] = None, - local: bool = False, -) -> None: - # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler - # decorator; unused here but accepted so the contract is uniform across - # callers (see issue #1103). - del orchestrator +def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # delete this line after test # local = True diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index 209e0784..b73dc1bb 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -22,6 +22,7 @@ from __future__ import annotations from collections.abc import Iterable, Iterator from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode def iter_postcode_grouped_batches( @@ -75,13 +76,14 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( addresses: Iterable[UserAddress], -) -> dict[str, list[UserAddress]]: +) -> dict[Postcode, list[UserAddress]]: """Group addresses by ``postcode`` preserving first-seen order. 
Python dicts retain insertion order since 3.7, so a plain dict suffices - for the same effect as pandas ``groupby(..., sort=False)``. + for the same effect as pandas ``groupby(..., sort=False)``. ``Postcode`` + is a frozen value object, hence hashable and usable as the dict key. """ - groups: dict[str, list[UserAddress]] = {} + groups: dict[Postcode, list[UserAddress]] = {} for address in addresses: groups.setdefault(address.postcode, []).append(address) return groups diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py index 120a3659..672b2c54 100644 --- a/domain/addresses/user_address.py +++ b/domain/addresses/user_address.py @@ -1,9 +1,9 @@ """The :class:`UserAddress` value object. A frozen dataclass capturing the splitter's domain entity: the raw input -address line, a sanitised postcode, and an optional internal reference from -the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no -caller can construct an instance with an un-normalised postcode. +address line, a :class:`~domain.postcode.Postcode`, and an optional internal +reference from the customer dataset. The postcode is a value object that is +canonical by construction, so no caller can hold an un-normalised postcode. """ from __future__ import annotations @@ -11,7 +11,7 @@ from __future__ import annotations from dataclasses import dataclass, field from typing import Optional -from domain.postcodes.sanitise import sanitise_postcode +from domain.postcode import Postcode def _empty_source_row() -> dict[str, str]: @@ -25,9 +25,9 @@ class UserAddress: Attributes: user_address: The free-text address string as supplied upstream. - postcode: The postcode; always stored in canonical form - (uppercased, whitespace stripped). Sanitisation is enforced by - :meth:`__post_init__`. + postcode: The postcode as a :class:`~domain.postcode.Postcode` value + object -- canonical (uppercased, whitespace stripped) by + construction. 
internal_reference: Optional customer-side identifier preserved for traceability through the matching pipeline. source_row: The complete original CSV row this address was parsed @@ -39,12 +39,8 @@ class UserAddress: """ user_address: str - postcode: str + postcode: Postcode internal_reference: Optional[str] = None source_row: dict[str, str] = field( default_factory=_empty_source_row, compare=False ) - - def __post_init__(self) -> None: - # Frozen dataclass: bypass the descriptor with object.__setattr__. - object.__setattr__(self, "postcode", sanitise_postcode(self.postcode)) diff --git a/domain/postcode.py b/domain/postcode.py new file mode 100644 index 00000000..514e1a39 --- /dev/null +++ b/domain/postcode.py @@ -0,0 +1,40 @@ +"""The :class:`Postcode` value object. + +A frozen value object that owns postcode sanitisation. Constructing a +``Postcode`` always yields the canonical form -- uppercase with all +whitespace removed -- so no part of the domain can hold an un-normalised +postcode. This matches the legacy splitter's +``df["postcode"].str.upper().str.replace(" ", "")``. + +``Postcode`` is the single sanitisation point: anywhere a postcode crosses a +domain boundary it should be wrapped in one, and ``str(postcode)`` gives the +canonical string back for serialisation. +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Postcode: + """A postcode held in canonical form. + + The ``value`` passed to the constructor is sanitised eagerly in + :meth:`__post_init__` -- uppercased, with all whitespace (spaces, tabs, + newlines) removed -- so every ``Postcode`` instance is canonical by + construction. Two postcodes that differ only in surface whitespace or + case therefore compare equal. + + Attributes: + value: The canonical postcode string (e.g. ``"SW1A1AA"``). + """ + + value: str + + def __post_init__(self) -> None: + # Frozen dataclass: bypass the descriptor with object.__setattr__. 
+ object.__setattr__(self, "value", "".join(self.value.split()).upper()) + + def __str__(self) -> str: + return self.value diff --git a/domain/postcodes/__init__.py b/domain/postcodes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/domain/postcodes/sanitise.py b/domain/postcodes/sanitise.py deleted file mode 100644 index 94b0dcf7..00000000 --- a/domain/postcodes/sanitise.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Canonical postcode sanitisation for the domain layer. - -The legacy postcode_splitter normalises postcodes inline with -``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes -that operation to a pure, reusable function so the same canonical form is -applied wherever a postcode crosses a domain boundary -- including -:class:`domain.addresses.user_address.UserAddress` construction and future -migrations. -""" - -from __future__ import annotations - - -def sanitise_postcode(s: str) -> str: - """Return the canonical form of a postcode. - - The canonical form is uppercase with all whitespace removed. This matches - the legacy splitter's ``str.upper().str.replace(" ", "")`` for the - overwhelmingly common case of space-separated postcodes (e.g. ``"sw1a 1aa"`` - becomes ``"SW1A1AA"``) while also tolerating tabs/newlines that can creep - in from CSV ingestion. 
- """ - return "".join(s.split()).upper() diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index 7cd10bac..2432d8e9 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -20,6 +20,7 @@ from datetime import datetime, timezone from typing import Optional from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_repository import UserAddressRepository @@ -77,7 +78,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): addresses.append( UserAddress( user_address=user_address, - postcode=postcode, + postcode=Postcode(postcode), internal_reference=internal_reference, source_row=row, ) @@ -96,7 +97,7 @@ class UserAddressCsvS3Repository(UserAddressRepository): Returns the full ``s3://bucket/key`` URI. """ rows: list[dict[str, str]] = [ - {**addr.source_row, _POSTCODE_CLEAN_COLUMN: addr.postcode} + {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)} for addr in addresses ] filename = ( diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py index d8c12855..ab9b6671 100644 --- a/repositories/user_address/user_address_repository.py +++ b/repositories/user_address/user_address_repository.py @@ -17,8 +17,8 @@ class UserAddressRepository(ABC): Implementations choose the underlying storage (S3 CSV, Postgres, in-memory, ...) but must preserve the canonical column semantics: - the address text, postcode (sanitised by ``UserAddress.__post_init__``), - and an optional internal reference. + the address text, postcode (a :class:`~domain.postcode.Postcode` value + object), and an optional internal reference. 
""" @abstractmethod diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 2dac46cc..6e52b581 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -2,12 +2,15 @@ import pytest from domain.addresses.postcode_batching import iter_postcode_grouped_batches from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode def _addrs(postcode: str, n: int) -> list[UserAddress]: """Build ``n`` addresses sharing a postcode, with distinct address lines.""" return [ - UserAddress(user_address=f"{i} {postcode} Street", postcode=postcode) + UserAddress( + user_address=f"{i} {postcode} Street", postcode=Postcode(postcode) + ) for i in range(n) ] @@ -38,8 +41,8 @@ def test_flush_on_overflow_before_adding_next_postcode() -> None: addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3) batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) assert len(batches) == 2 - assert [a.postcode for a in batches[0]] == ["AA11AA"] * 3 - assert [a.postcode for a in batches[1]] == ["BB22BB"] * 3 + assert [str(a.postcode) for a in batches[0]] == ["AA11AA"] * 3 + assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 3 def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None: @@ -61,9 +64,9 @@ def test_oversize_group_flushes_existing_buffer_first() -> None: iter_postcode_grouped_batches(small + big + tail, max_batch_size=5) ) assert len(batches) == 3 - assert [a.postcode for a in batches[0]] == ["AA11AA", "AA11AA"] - assert [a.postcode for a in batches[1]] == ["BB22BB"] * 7 - assert [a.postcode for a in batches[2]] == ["CC33CC"] + assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"] + assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 7 + assert [str(a.postcode) for a in batches[2]] == ["CC33CC"] def test_final_flush_yields_remaining_buffer() -> None: @@ -80,7 +83,7 @@ def 
test_postcode_grouping_preserves_first_seen_order() -> None: b1, b2 = _addrs("AA1 1AA", 2) batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2])) assert len(batches) == 1 - assert [a.postcode for a in batches[0]] == [ + assert [str(a.postcode) for a in batches[0]] == [ "ZZ99ZZ", "ZZ99ZZ", "AA11AA", diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py index 4d8322da..fa44ad61 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_user_address.py @@ -3,69 +3,77 @@ import dataclasses import pytest from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode -def test_user_address_sanitises_postcode_on_construction() -> None: - addr = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") - assert addr.postcode == "SW1A1AA" +def test_user_address_holds_postcode_value_object() -> None: + addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + assert addr.postcode == Postcode("SW1A1AA") def test_user_address_preserves_user_address_verbatim() -> None: # The free-text user_address string is intentionally NOT normalised -- - # only the postcode is canonicalised at the boundary. - addr = UserAddress(user_address=" 1 The Street ", postcode="sw1a 1aa") + # only the postcode is canonicalised, and that happens inside Postcode. 
+ addr = UserAddress( + user_address=" 1 The Street ", postcode=Postcode("SW1A1AA") + ) assert addr.user_address == " 1 The Street " def test_user_address_internal_reference_defaults_to_none() -> None: - addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) assert addr.internal_reference is None def test_user_address_internal_reference_accepted() -> None: addr = UserAddress( user_address="1 The Street", - postcode="SW1A1AA", + postcode=Postcode("SW1A1AA"), internal_reference="cust-42", ) assert addr.internal_reference == "cust-42" def test_user_address_is_frozen() -> None: - addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) with pytest.raises(dataclasses.FrozenInstanceError): - addr.postcode = "OTHER" # type: ignore[misc] + addr.postcode = Postcode("OTHER") # type: ignore[misc] -def test_user_address_equality_uses_sanitised_postcode() -> None: - # Two instances constructed with different surface forms of the same - # postcode must compare equal because sanitisation runs eagerly. - a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa") - b = UserAddress(user_address="1 The Street", postcode="SW1A1AA") +def test_user_address_equality_uses_canonical_postcode() -> None: + # Postcode sanitises eagerly, so addresses built from different surface + # forms of the same postcode compare equal. 
+ a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) assert a == b def test_user_address_source_row_defaults_to_empty_dict() -> None: - addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA") + addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) assert addr.source_row == {} def test_user_address_carries_source_row() -> None: row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} addr = UserAddress( - user_address="1 The Street", postcode="SW1A 1AA", source_row=row + user_address="1 The Street", + postcode=Postcode("SW1A 1AA"), + source_row=row, ) assert addr.source_row == row def test_user_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays - # defined by the parsed fields, so two addresses parsed from rows with - # different incidental columns still compare equal. + # defined by the parsed fields. 
a = UserAddress( - user_address="1 The Street", postcode="SW1A1AA", source_row={"x": "1"} + user_address="1 The Street", + postcode=Postcode("SW1A1AA"), + source_row={"x": "1"}, ) b = UserAddress( - user_address="1 The Street", postcode="SW1A1AA", source_row={"y": "2"} + user_address="1 The Street", + postcode=Postcode("SW1A1AA"), + source_row={"y": "2"}, ) assert a == b diff --git a/tests/domain/postcodes/__init__.py b/tests/domain/postcodes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/domain/postcodes/test_sanitise.py b/tests/domain/postcodes/test_sanitise.py deleted file mode 100644 index edd1679c..00000000 --- a/tests/domain/postcodes/test_sanitise.py +++ /dev/null @@ -1,28 +0,0 @@ -from domain.postcodes.sanitise import sanitise_postcode - - -def test_sanitise_uppercases() -> None: - assert sanitise_postcode("sw1a1aa") == "SW1A1AA" - - -def test_sanitise_strips_internal_spaces() -> None: - assert sanitise_postcode("sw1a 1aa") == "SW1A1AA" - - -def test_sanitise_strips_leading_and_trailing_whitespace() -> None: - assert sanitise_postcode(" sw1a 1aa ") == "SW1A1AA" - - -def test_sanitise_strips_tabs_and_newlines() -> None: - # CSV ingestion occasionally introduces stray whitespace characters; the - # canonical form must absorb them just like literal spaces. 
- assert sanitise_postcode("sw1a\t1aa\n") == "SW1A1AA" - - -def test_sanitise_already_canonical_is_idempotent() -> None: - assert sanitise_postcode("SW1A1AA") == "SW1A1AA" - assert sanitise_postcode(sanitise_postcode("sw1a 1aa")) == "SW1A1AA" - - -def test_sanitise_empty_string() -> None: - assert sanitise_postcode("") == "" diff --git a/tests/domain/test_postcode.py b/tests/domain/test_postcode.py new file mode 100644 index 00000000..89d5cdc8 --- /dev/null +++ b/tests/domain/test_postcode.py @@ -0,0 +1,48 @@ +import dataclasses + +import pytest + +from domain.postcode import Postcode + + +def test_postcode_uppercases() -> None: + assert Postcode("sw1a1aa").value == "SW1A1AA" + + +def test_postcode_strips_internal_spaces() -> None: + assert Postcode("sw1a 1aa").value == "SW1A1AA" + + +def test_postcode_strips_leading_and_trailing_whitespace() -> None: + assert Postcode(" sw1a 1aa ").value == "SW1A1AA" + + +def test_postcode_strips_tabs_and_newlines() -> None: + # CSV ingestion occasionally introduces stray whitespace characters; the + # canonical form must absorb them just like literal spaces. + assert Postcode("sw1a\t1aa\n").value == "SW1A1AA" + + +def test_postcode_construction_is_idempotent() -> None: + once = Postcode("sw1a 1aa") + assert Postcode(once.value).value == "SW1A1AA" + + +def test_postcode_empty_string() -> None: + assert Postcode("").value == "" + + +def test_postcode_str_returns_canonical_value() -> None: + assert str(Postcode("sw1a 1aa")) == "SW1A1AA" + + +def test_postcode_equality_ignores_surface_form() -> None: + # Differing case / whitespace sanitise to the same canonical value, so + # the value objects compare equal. 
+ assert Postcode("sw1a 1aa") == Postcode("SW1A1AA") + + +def test_postcode_is_frozen() -> None: + postcode = Postcode("SW1A1AA") + with pytest.raises(dataclasses.FrozenInstanceError): + postcode.value = "OTHER" # type: ignore[misc] diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py index 48733b55..c1acee32 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -4,6 +4,7 @@ import pytest from moto import mock_aws from domain.addresses.user_address import UserAddress +from domain.postcode import Postcode from infrastructure.csv_s3_client import CsvS3Client from repositories.user_address.user_address_csv_s3_repository import ( UserAddressCsvS3Repository, @@ -47,7 +48,7 @@ def test_load_batch_parses_address_postcode_and_reference( assert len(addresses) == 1 address = addresses[0] assert address.user_address == "1 High Street, Flat 2, Townville" - assert address.postcode == "SW1A1AA" + assert address.postcode == Postcode("SW1A1AA") assert address.internal_reference == "REF-001" @@ -69,7 +70,7 @@ def test_load_batch_uses_only_address_1_when_others_missing( assert len(addresses) == 1 assert addresses[0].user_address == "10 Cardiff Road" - assert addresses[0].postcode == "CF101AA" + assert addresses[0].postcode == Postcode("CF101AA") assert addresses[0].internal_reference == "REF-002" @@ -91,7 +92,7 @@ def test_load_batch_handles_missing_internal_reference( assert len(addresses) == 1 assert addresses[0].user_address == "5 Park Lane" - assert addresses[0].postcode == "M11AA" + assert addresses[0].postcode == Postcode("M11AA") assert addresses[0].internal_reference is None @@ -154,7 +155,7 @@ def test_save_batch_returns_uri_under_path_prefix( addresses = [ UserAddress( user_address="1 High Street", - postcode="SW1A 1AA", + postcode=Postcode("SW1A 1AA"), 
source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, ), ] @@ -199,7 +200,7 @@ def test_save_batch_uses_unique_filename_per_call( addresses = [ UserAddress( user_address="1 High Street", - postcode="SW1A 1AA", + postcode=Postcode("SW1A 1AA"), source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"}, ), ] From d0cf3d14ad5116d0b2926aceb23d642408ca71bc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 13:21:11 +0000 Subject: [PATCH 78/91] get rid of comments --- applications/postcode_splitter/handler.py | 18 ------- .../local_handler/invoke_local_lambda.py | 9 ---- .../postcode_splitter_trigger_body.py | 21 --------- domain/addresses/postcode_batching.py | 38 --------------- domain/addresses/user_address.py | 30 +----------- domain/postcode.py | 25 ---------- infrastructure/address2uprn_queue_client.py | 7 --- infrastructure/csv_s3_client.py | 18 ------- infrastructure/s3_client.py | 9 ---- infrastructure/s3_uri.py | 18 ------- infrastructure/sqs_client.py | 8 ---- .../postcode_splitter_orchestrator.py | 34 -------------- orchestration/task_orchestrator.py | 6 --- .../user_address_csv_s3_repository.py | 47 ------------------- .../user_address/user_address_repository.py | 19 +------- .../addresses/test_postcode_batching.py | 1 - tests/infrastructure/__init__.py | 7 --- tests/infrastructure/conftest.py | 4 -- .../test_postcode_splitter_orchestrator.py | 10 ---- tests/repositories/user_address/conftest.py | 4 -- .../aws_lambda/test_subtask_handler.py | 25 +--------- utilities/aws_lambda/subtask_handler.py | 24 +--------- 22 files changed, 6 insertions(+), 376 deletions(-) diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py index 005227a9..9fb3ca6a 100644 --- a/applications/postcode_splitter/handler.py +++ b/applications/postcode_splitter/handler.py @@ -1,15 +1,3 @@ -"""Lambda entrypoint for the postcode splitter slice. 
- -The :func:`handler` function is decorated with ``@subtask_handler()`` so the -decorator owns the parent ``SubTask`` lifecycle (start/complete/fail) and -injects the decorator-owned :class:`TaskOrchestrator` as the third positional -argument. The handler itself does only two things: - -1. Build a :class:`PostcodeSplitterOrchestrator` from env-driven config. -2. Delegate to ``split_and_dispatch`` and return its result so it lands in - ``SubTask.outputs["result"]``. -""" - from __future__ import annotations import os @@ -34,12 +22,6 @@ from utilities.aws_lambda.subtask_handler import subtask_handler def handler( body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator ) -> dict[str, list[str]]: - """Validate the trigger body, build the splitter, dispatch children. - - Reads ``S3_BUCKET_NAME`` and ``ADDRESS2UPRN_QUEUE_URL`` from the - environment to construct the typed S3/SQS clients. The return value - lands in ``SubTask.outputs["result"]`` via the decorator. - """ trigger = PostcodeSplitterTriggerBody.model_validate(body) bucket = os.environ["S3_BUCKET_NAME"] diff --git a/applications/postcode_splitter/local_handler/invoke_local_lambda.py b/applications/postcode_splitter/local_handler/invoke_local_lambda.py index c0ca89ec..21fa9b9e 100755 --- a/applications/postcode_splitter/local_handler/invoke_local_lambda.py +++ b/applications/postcode_splitter/local_handler/invoke_local_lambda.py @@ -1,13 +1,4 @@ #!/usr/bin/env python3 -"""POST a single SQS-shaped event at the locally-running splitter Lambda. - -The container built by docker-compose runs the AWS Lambda Runtime Interface -Emulator, which accepts invocations on the URL below. Replace the three -placeholder values with a real parent Task id, the splitter's own SubTask id -(both must already exist in the Postgres pointed at by .env.local), and the -s3://... URI of an uploaded address CSV. 
-""" - import json import requests diff --git a/applications/postcode_splitter/postcode_splitter_trigger_body.py b/applications/postcode_splitter/postcode_splitter_trigger_body.py index bc983abc..4c33f4a4 100644 --- a/applications/postcode_splitter/postcode_splitter_trigger_body.py +++ b/applications/postcode_splitter/postcode_splitter_trigger_body.py @@ -1,30 +1,9 @@ -"""Trigger payload model for the postcode splitter Lambda. - -The decorator (``@subtask_handler``) already validates ``task_id`` and -``sub_task_id`` via :class:`SubtaskTriggerBody`; this model layers on the -splitter-specific ``s3_uri`` field while keeping ``extra="allow"`` so any -upstream-passthrough keys (e.g. ``portfolio_id``) survive untouched. -""" - from uuid import UUID from pydantic import BaseModel, ConfigDict class PostcodeSplitterTriggerBody(BaseModel): - """Validated body for the postcode splitter Lambda. - - Attributes: - task_id: Parent ``Task`` id; used as the ``task_id`` input on each - child ``SubTask`` and as the ``parent_task_id`` on the fan-out - SQS messages. - sub_task_id: The splitter's own ``SubTask`` id; used as the path - segment under ``ara_postcode_splitter_batches/{task_id}/{...}`` - so per-invocation outputs cannot collide. - s3_uri: ``s3://bucket/key`` URI of the uploaded address CSV the - splitter must read. - """ - model_config = ConfigDict(extra="allow") task_id: UUID diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py index b73dc1bb..44e4d967 100644 --- a/domain/addresses/postcode_batching.py +++ b/domain/addresses/postcode_batching.py @@ -1,22 +1,3 @@ -"""Pure-Python postcode-grouped batching. - -This module preserves the batching invariants from the legacy postcode -splitter (``backend/postcode_splitter/main.py``) without touching pandas, -S3, or SQS: - - * Addresses are grouped by **Postcode** in *insertion order* -- the first - Postcode seen produces the first group. - * A Postcode group is never split across two batches. 
- * If a single Postcode group is larger than ``max_batch_size``, it is - flushed as its own oversize batch (any buffered groups go out first, - untouched). - * Adding a group that would push the buffer past ``max_batch_size`` first - flushes the existing buffer, then starts a new buffer with the group. - * Whatever remains in the buffer after the input is exhausted is flushed - as the final batch. - * Empty input yields no batches. -""" - from __future__ import annotations from collections.abc import Iterable, Iterator @@ -30,19 +11,6 @@ def iter_postcode_grouped_batches( *, max_batch_size: int = 500, ) -> Iterator[list[UserAddress]]: - """Yield batches of ``UserAddress`` grouped by Postcode. - - Args: - addresses: An iterable of :class:`UserAddress`. Order is preserved - within each Postcode group, and groups are yielded in the order - their first member was seen. - max_batch_size: The soft upper bound on batch size, in number of - addresses. A single Postcode group larger than this cap is - dispatched whole (the cap is never used to split a group). - - Yields: - Lists of ``UserAddress``. Each list is non-empty. - """ if max_batch_size < 1: raise ValueError("max_batch_size must be >= 1") @@ -77,12 +45,6 @@ def iter_postcode_grouped_batches( def _group_by_postcode_in_order( addresses: Iterable[UserAddress], ) -> dict[Postcode, list[UserAddress]]: - """Group addresses by ``postcode`` preserving first-seen order. - - Python dicts retain insertion order since 3.7, so a plain dict suffices - for the same effect as pandas ``groupby(..., sort=False)``. ``Postcode`` - is a frozen value object, hence hashable and usable as the dict key. 
- """ groups: dict[Postcode, list[UserAddress]] = {} for address in addresses: groups.setdefault(address.postcode, []).append(address) diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py index 672b2c54..9a28751b 100644 --- a/domain/addresses/user_address.py +++ b/domain/addresses/user_address.py @@ -1,11 +1,3 @@ -"""The :class:`UserAddress` value object. - -A frozen dataclass capturing the splitter's domain entity: the raw input -address line, a :class:`~domain.postcode.Postcode`, and an optional internal -reference from the customer dataset. The postcode is a value object that is -canonical by construction, so no caller can hold an un-normalised postcode. -""" - from __future__ import annotations from dataclasses import dataclass, field @@ -15,32 +7,12 @@ from domain.postcode import Postcode def _empty_source_row() -> dict[str, str]: - """Typed default factory for :attr:`UserAddress.source_row`.""" return {} @dataclass(frozen=True) class UserAddress: - """A user-supplied address paired with its canonical postcode. - - Attributes: - user_address: The free-text address string as supplied upstream. - postcode: The postcode as a :class:`~domain.postcode.Postcode` value - object -- canonical (uppercased, whitespace stripped) by - construction. - internal_reference: Optional customer-side identifier preserved for - traceability through the matching pipeline. - source_row: The complete original CSV row this address was parsed - from, column name -> cell value. The splitter is a pass-through - router: it groups rows by postcode but must not drop the other - columns the downstream address2uprn stage relies on, so the raw - row travels alongside the parsed fields. Excluded from equality - and hashing -- identity stays defined by the parsed fields above. 
- """ - user_address: str postcode: Postcode internal_reference: Optional[str] = None - source_row: dict[str, str] = field( - default_factory=_empty_source_row, compare=False - ) + source_row: dict[str, str] = field(default_factory=_empty_source_row, compare=False) diff --git a/domain/postcode.py b/domain/postcode.py index 514e1a39..8e4e7c79 100644 --- a/domain/postcode.py +++ b/domain/postcode.py @@ -1,16 +1,3 @@ -"""The :class:`Postcode` value object. - -A frozen value object that owns postcode sanitisation. Constructing a -``Postcode`` always yields the canonical form -- uppercase with all -whitespace removed -- so no part of the domain can hold an un-normalised -postcode. This matches the legacy splitter's -``df["postcode"].str.upper().str.replace(" ", "")``. - -``Postcode`` is the single sanitisation point: anywhere a postcode crosses a -domain boundary it should be wrapped in one, and ``str(postcode)`` gives the -canonical string back for serialisation. -""" - from __future__ import annotations from dataclasses import dataclass @@ -18,18 +5,6 @@ from dataclasses import dataclass @dataclass(frozen=True) class Postcode: - """A postcode held in canonical form. - - The ``value`` passed to the constructor is sanitised eagerly in - :meth:`__post_init__` -- uppercased, with all whitespace (spaces, tabs, - newlines) removed -- so every ``Postcode`` instance is canonical by - construction. Two postcodes that differ only in surface whitespace or - case therefore compare equal. - - Attributes: - value: The canonical postcode string (e.g. ``"SW1A1AA"``). 
- """ - value: str def __post_init__(self) -> None: diff --git a/infrastructure/address2uprn_queue_client.py b/infrastructure/address2uprn_queue_client.py index d81e2dd1..314e981f 100644 --- a/infrastructure/address2uprn_queue_client.py +++ b/infrastructure/address2uprn_queue_client.py @@ -4,12 +4,6 @@ from infrastructure.sqs_client import SqsClient class Address2UprnQueueClient(SqsClient): - """SQS client that publishes Address-to-UPRN fan-out messages. - - The body shape is fixed by the downstream consumer: - ``{"task_id": str, "sub_task_id": str, "s3_uri": str}`` - """ - def publish( self, *, @@ -17,7 +11,6 @@ class Address2UprnQueueClient(SqsClient): child_subtask_id: UUID, s3_uri: str, ) -> str: - """Send a typed Address-to-UPRN message. Returns the SQS ``MessageId``.""" return self.send( { "task_id": str(parent_task_id), diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py index 0a576b81..055d1ce3 100644 --- a/infrastructure/csv_s3_client.py +++ b/infrastructure/csv_s3_client.py @@ -6,20 +6,7 @@ from infrastructure.s3_uri import parse_s3_uri class CsvS3Client(S3Client): - """:class:`S3Client` subclass that round-trips CSV row dictionaries. - - Rows are represented as ``list[dict[str, str]]`` — the same shape used by - :func:`csv.DictReader`/``DictWriter`` — which keeps the API trivially - compatible with existing CSV helpers in ``utils/s3.py``. - """ - def read_rows(self, s3_uri: str) -> list[dict[str, str]]: - """Fetch the object at ``s3_uri`` and decode it as a CSV. - - The bucket portion of the URI is validated against this client's - configured bucket so cross-bucket reads fail loudly rather than - silently fetching from the wrong place. 
- """ bucket, key = parse_s3_uri(s3_uri) if bucket != self.bucket: raise ValueError( @@ -31,11 +18,6 @@ class CsvS3Client(S3Client): return [dict(row) for row in reader] def save_rows(self, rows: list[dict[str, str]], key: str) -> str: - """Serialise ``rows`` to CSV under ``key`` and return the ``s3://`` URI. - - An empty ``rows`` list is rejected because we cannot otherwise infer - a header row. - """ if not rows: raise ValueError("Cannot save an empty rows list: header is unknown") buffer = StringIO() diff --git a/infrastructure/s3_client.py b/infrastructure/s3_client.py index 9e772881..a789fcc2 100644 --- a/infrastructure/s3_client.py +++ b/infrastructure/s3_client.py @@ -2,13 +2,6 @@ from typing import Any class S3Client: - """Thin typed wrapper around a boto3 S3 client bound to a single bucket. - - The class is deliberately small: it exposes only the byte-level - operations needed by the wider infrastructure layer. Serialisation - (CSV, JSON, etc.) lives in subclasses such as :class:`CsvS3Client`. - """ - def __init__(self, boto_s3_client: Any, bucket: str) -> None: self._client = boto_s3_client self._bucket = bucket @@ -18,7 +11,6 @@ class S3Client: return self._bucket def get_object(self, key: str) -> bytes: - """Return the raw bytes stored at ``key`` in this client's bucket.""" response: dict[str, Any] = self._client.get_object( Bucket=self._bucket, Key=key ) @@ -26,6 +18,5 @@ class S3Client: return body def put_object(self, key: str, body: bytes) -> str: - """Write ``body`` to ``key`` and return the canonical ``s3://`` URI.""" self._client.put_object(Bucket=self._bucket, Key=key, Body=body) return f"s3://{self._bucket}/{key}" diff --git a/infrastructure/s3_uri.py b/infrastructure/s3_uri.py index bf97100e..1dd5d967 100644 --- a/infrastructure/s3_uri.py +++ b/infrastructure/s3_uri.py @@ -1,25 +1,7 @@ -"""Parse S3 URIs into ``(bucket, key)`` pairs. - -A pure-stdlib helper for the infrastructure layer. 
It deliberately pulls in -neither pandas, boto3, nor the legacy ``utils`` package, so slim Lambda images -that only need URI parsing do not drag the wider data stack along. - -Two input shapes are supported: - -* canonical S3 URIs --- ``s3://bucket/key`` -* AWS S3 console URLs --- ``https://.../s3/object/bucket?prefix=key`` -""" - from urllib.parse import unquote def parse_s3_uri(s3_uri: str) -> tuple[str, str]: - """Return the ``(bucket, key)`` pair addressed by ``s3_uri``. - - Raises: - ValueError: if ``s3_uri`` is neither a well-formed ``s3://`` URI nor - an AWS console URL carrying a ``prefix`` query parameter. - """ if s3_uri.startswith("s3://"): parts = s3_uri[len("s3://") :].split("/", 1) if len(parts) < 2 or not parts[0] or not parts[1]: diff --git a/infrastructure/sqs_client.py b/infrastructure/sqs_client.py index fb053680..6fe8dd2e 100644 --- a/infrastructure/sqs_client.py +++ b/infrastructure/sqs_client.py @@ -3,13 +3,6 @@ from typing import Any class SqsClient: - """Thin typed wrapper around a boto3 SQS client bound to one queue URL. - - The body is JSON-serialised here so callers can pass plain dictionaries - instead of constructing message strings themselves. Typed publish - helpers (e.g. :class:`Address2UprnQueueClient`) build on this contract. - """ - def __init__(self, boto_sqs_client: Any, queue_url: str) -> None: self._client = boto_sqs_client self._queue_url = queue_url @@ -19,7 +12,6 @@ class SqsClient: return self._queue_url def send(self, body: dict[str, Any]) -> str: - """JSON-serialise ``body`` and send it. 
Returns the SQS ``MessageId``.""" response: dict[str, Any] = self._client.send_message( QueueUrl=self._queue_url, MessageBody=json.dumps(body), diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py index 6afa2538..36f4b515 100644 --- a/orchestration/postcode_splitter_orchestrator.py +++ b/orchestration/postcode_splitter_orchestrator.py @@ -1,15 +1,3 @@ -"""Use-case orchestrator for the postcode splitter Lambda. - -Wires the slice-1 domain (``iter_postcode_grouped_batches``), the slice-3 -``UserAddressRepository``, the slice-2 ``Address2UprnQueueClient``, and the -slice-4 ``TaskOrchestrator.create_child_subtask`` primitive together. - -``split_and_dispatch`` loads the input batch, groups it into per-postcode -chunks, writes each chunk back to S3 under a deterministic prefix, creates a -WAITING child ``SubTask`` for it, and publishes the address-to-UPRN fan-out -message that downstream consumers pick up. -""" - from __future__ import annotations from uuid import UUID @@ -21,15 +9,6 @@ from repositories.user_address.user_address_repository import UserAddressReposit class PostcodeSplitterOrchestrator: - """Split an uploaded address batch into postcode-grouped child SubTasks. - - The orchestrator owns the algorithm; the IO collaborators - (:class:`UserAddressRepository`, :class:`Address2UprnQueueClient`) and - the :class:`TaskOrchestrator` lifecycle primitive are injected so the - same wiring can be exercised against moto/SQLite in tests and against - real AWS in the Lambda entrypoint. - """ - def __init__( self, task_orchestrator: TaskOrchestrator, @@ -49,19 +28,6 @@ class PostcodeSplitterOrchestrator: parent_subtask_id: UUID, input_s3_uri: str, ) -> list[UUID]: - """Split ``input_s3_uri`` into postcode batches and dispatch each. - - For each yielded batch: - - 1. Persist it under - ``ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}``. - 2. 
Create a WAITING child ``SubTask`` with - ``inputs={"task_id": str(parent_task_id), "s3_uri": batch_uri}``. - 3. Publish an ``address2UPRN`` SQS message referencing the new child. - - Returns: - The list of child ``SubTask`` ids, in dispatch order. - """ addresses = self._user_address_repo.load_batch(input_s3_uri) path_prefix = ( f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}" diff --git a/orchestration/task_orchestrator.py b/orchestration/task_orchestrator.py index 82d95db1..ebb71a32 100644 --- a/orchestration/task_orchestrator.py +++ b/orchestration/task_orchestrator.py @@ -54,12 +54,6 @@ class TaskOrchestrator: *, inputs: Optional[dict[str, Any]] = None, ) -> SubTask: - """Add a new WAITING SubTask under an existing parent Task. - - Skips `_cascade`: a new WAITING child against an IN_PROGRESS parent - leaves the parent's status unchanged per `Task.recalculate_from_subtasks`, - so calling it here would be a no-op. - """ subtask = SubTask.create(task_id=parent_task_id, inputs=inputs) self._subtasks.create(subtask) return subtask diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index 2432d8e9..9b93b638 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -1,18 +1,3 @@ -"""CSV-on-S3 adapter for :class:`UserAddressRepository`. - -Reads upload CSVs that carry a ``postcode`` column (plus optional -``Address 1``/``Address 2``/``Address 3`` and ``Internal Reference``), and -writes batch CSVs that pass *every* original column through unchanged with -one column appended -- ``postcode_clean`` (uppercase, whitespace-stripped) -- -which the downstream address2uprn stage groups on. - -The splitter is a pass-through router: it must not reshape or drop columns, -because address2uprn has not been migrated and still consumes the legacy -splitter's full-row output. 
The frontend pre-applies the user's column -mapping at upload time, so this adapter does NOT consult any -``BulkAddressUpload.column_mapping``. -""" - from __future__ import annotations import uuid @@ -31,33 +16,11 @@ _POSTCODE_CLEAN_COLUMN: str = "postcode_clean" class UserAddressCsvS3Repository(UserAddressRepository): - """Persist :class:`UserAddress` batches as CSV objects in S3. - - The repo owns the unique-filename-within-prefix convention - (``{ISO datetime}_{8-char uuid}.csv``); callers own the directory - hierarchy supplied as ``path_prefix``. - """ - def __init__(self, csv_client: CsvS3Client, bucket: str) -> None: self._csv_client = csv_client self._bucket = bucket def load_batch(self, s3_uri: str) -> list[UserAddress]: - """Load upload CSV rows into :class:`UserAddress` objects. - - Each row's complete column set is preserved on - :attr:`UserAddress.source_row` so :meth:`save_batch` can pass it - through untouched. The parsed convenience fields are also populated: - ``Address 1``/``Address 2``/``Address 3`` are concatenated with - ``", "`` (skipping missing/empty parts) into ``user_address``, and - ``Internal Reference`` is threaded to - :attr:`UserAddress.internal_reference` (``None`` when missing/empty). - - Raises: - ValueError: if the CSV has rows but no ``postcode`` column -- - without it the splitter cannot group, and silently emitting - empty postcodes would corrupt every downstream batch. - """ rows = self._csv_client.read_rows(s3_uri) if rows and _POSTCODE_COLUMN not in rows[0]: raise ValueError( @@ -86,16 +49,6 @@ class UserAddressCsvS3Repository(UserAddressRepository): return addresses def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: - """Write a pass-through batch CSV under a unique key. - - Each output row is the address's original ``source_row`` with a - ``postcode_clean`` column appended (the canonical postcode the - downstream address2uprn stage groups on). No original column is - dropped or reshaped. 
- - The key is ``{path_prefix}/{ISO-8601 datetime}_{8-char uuid}.csv``. - Returns the full ``s3://bucket/key`` URI. - """ rows: list[dict[str, str]] = [ {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)} for addr in addresses diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py index ab9b6671..170f34dd 100644 --- a/repositories/user_address/user_address_repository.py +++ b/repositories/user_address/user_address_repository.py @@ -1,10 +1,3 @@ -"""Abstract repository for :class:`UserAddress` batches. - -Persistence-agnostic interface for loading and saving batches of -:class:`domain.addresses.user_address.UserAddress`. Concrete adapters -- -e.g. :class:`UserAddressCsvS3Repository` -- live alongside this module. -""" - from __future__ import annotations from abc import ABC, abstractmethod @@ -13,18 +6,10 @@ from domain.addresses.user_address import UserAddress class UserAddressRepository(ABC): - """Load and persist batches of :class:`UserAddress`. - - Implementations choose the underlying storage (S3 CSV, Postgres, - in-memory, ...) but must preserve the canonical column semantics: - the address text, postcode (a :class:`~domain.postcode.Postcode` value - object), and an optional internal reference. - """ - @abstractmethod def load_batch(self, s3_uri: str) -> list[UserAddress]: - """Read a batch of addresses from ``s3_uri`` and return domain objects.""" + ... @abstractmethod def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: - """Persist ``addresses`` under ``path_prefix`` and return the URI written.""" + ... 
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index 6e52b581..c69722ba 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -6,7 +6,6 @@ from domain.postcode import Postcode def _addrs(postcode: str, n: int) -> list[UserAddress]: - """Build ``n`` addresses sharing a postcode, with distinct address lines.""" return [ UserAddress( user_address=f"{i} {postcode} Street", postcode=Postcode(postcode) diff --git a/tests/infrastructure/__init__.py b/tests/infrastructure/__init__.py index 3478bda9..f5ad62d0 100644 --- a/tests/infrastructure/__init__.py +++ b/tests/infrastructure/__init__.py @@ -6,12 +6,5 @@ REGION = "us-east-1" def make_boto_client(service_name: str) -> Any: - """Construct a boto3 client typed as ``Any``. - - boto3's overloaded ``client`` signature uses ``Literal[...]`` per service - in the installed stubs, which forces every call site to satisfy - ``reportArgumentType`` and ``reportUnknownMemberType`` under strict - pyright. Centralising the cast keeps each test file clean. - """ factory: Any = boto3.client # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] return factory(service_name, region_name=REGION) diff --git a/tests/infrastructure/conftest.py b/tests/infrastructure/conftest.py index 7ed2fdd6..25c1ac3b 100644 --- a/tests/infrastructure/conftest.py +++ b/tests/infrastructure/conftest.py @@ -7,10 +7,6 @@ import pytest @pytest.fixture(autouse=True) def _aws_creds() -> Iterator[None]: # pyright: ignore[reportUnusedFunction] - """Stub AWS creds so botocore doesn't probe the host environment. - - Applied automatically to every test in ``tests/infrastructure/``. 
- """ keys = ( "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index 79c60974..4ee2315e 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -1,13 +1,3 @@ -"""Integration test: PostcodeSplitterOrchestrator wired end-to-end. - -Combines moto S3 + moto SQS + an in-memory SQLite session for the -``TaskOrchestrator`` so the full slice-6 wiring is exercised through real -infrastructure adapters (not mocks). The fixture CSV spans three postcodes -with one oversize group, which forces both the buffer-flush-then-oversize -branch and the final-flush branch of -``iter_postcode_grouped_batches`` — three batches in total. -""" - from __future__ import annotations import json diff --git a/tests/repositories/user_address/conftest.py b/tests/repositories/user_address/conftest.py index 1859ff0a..25c1ac3b 100644 --- a/tests/repositories/user_address/conftest.py +++ b/tests/repositories/user_address/conftest.py @@ -7,10 +7,6 @@ import pytest @pytest.fixture(autouse=True) def _aws_creds() -> Iterator[None]: # pyright: ignore[reportUnusedFunction] - """Stub AWS creds so botocore doesn't probe the host environment. - - Applied automatically to every test in ``tests/repositories/user_address/``. - """ keys = ( "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", diff --git a/tests/utilities/aws_lambda/test_subtask_handler.py b/tests/utilities/aws_lambda/test_subtask_handler.py index 771a49f8..9cf68f28 100644 --- a/tests/utilities/aws_lambda/test_subtask_handler.py +++ b/tests/utilities/aws_lambda/test_subtask_handler.py @@ -1,11 +1,3 @@ -"""Tests for the @subtask_handler decorator. 
- -Covers the contract that the decorator owns the parent SubTask lifecycle and -injects the decorator-owned TaskOrchestrator as a third positional argument -to the wrapped function — so the handler can compose its own use-case -orchestrator that shares the session. -""" - import logging from collections.abc import Generator, Iterator from contextlib import contextmanager @@ -14,8 +6,6 @@ from typing import Any from uuid import UUID import pytest - -_LOGGER_NAME = "utilities.aws_lambda.subtask_handler" from sqlmodel import Session, SQLModel, create_engine from domain.tasks.subtasks import SubTaskStatus @@ -25,6 +15,8 @@ from repositories.tasks.subtask_postgres_repository import SubTaskPostgresReposi from repositories.tasks.task_postgres_repository import TaskPostgresRepository from utilities.aws_lambda.subtask_handler import subtask_handler +_LOGGER_NAME = "utilities.aws_lambda.subtask_handler" + @dataclass class Harness: @@ -58,8 +50,6 @@ def _direct_event(task_id: UUID, subtask_id: UUID) -> dict[str, Any]: def test_subtask_handler_injects_orchestrator_as_third_positional_argument( harness: Harness, ) -> None: - """The wrapped function receives the decorator-owned TaskOrchestrator - so it can share the session with its own use-case orchestrator.""" _, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -123,9 +113,6 @@ def test_subtask_handler_marks_parent_failed_and_reraises_on_error( def test_subtask_handler_injected_orchestrator_can_create_child_subtask( harness: Harness, ) -> None: - """Smoke check the share-the-session promise: the injected orchestrator - is the same one the decorator owns, so a handler can use it to create - child SubTasks under the same session.""" task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -150,8 +137,6 @@ def test_subtask_handler_injected_orchestrator_can_create_child_subtask( def test_subtask_handler_logs_subtask_lifecycle_on_success( harness: Harness, 
caplog: pytest.LogCaptureFixture ) -> None: - """Start and completion are logged at INFO so a successful invocation - leaves a CloudWatch breadcrumb (not just the Lambda runtime lines).""" task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -172,8 +157,6 @@ def test_subtask_handler_logs_subtask_lifecycle_on_success( def test_subtask_handler_logs_exception_on_failure( harness: Harness, caplog: pytest.LogCaptureFixture ) -> None: - """A failing subtask is logged at ERROR with the traceback attached, - before the exception propagates for the Lambda runtime to surface.""" task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -198,8 +181,6 @@ def test_subtask_handler_logs_exception_on_failure( def test_subtask_handler_records_cloudwatch_url_on_subtask( harness: Harness, monkeypatch: pytest.MonkeyPatch ) -> None: - """With the AWS Lambda runtime's log env vars present, a CloudWatch deep - link is built and persisted on the SubTask.""" monkeypatch.setenv("AWS_REGION", "eu-west-2") monkeypatch.setenv( "AWS_LAMBDA_LOG_GROUP_NAME", "/aws/lambda/postcode-splitter" @@ -232,8 +213,6 @@ def test_subtask_handler_records_cloudwatch_url_on_subtask( def test_subtask_handler_leaves_cloudwatch_url_unset_outside_lambda( harness: Harness, monkeypatch: pytest.MonkeyPatch ) -> None: - """Outside a real Lambda (e.g. the local RIE) the runtime log env vars - are absent, so cloud_logs_url is left unset rather than storing junk.""" for var in ( "AWS_REGION", "AWS_LAMBDA_LOG_GROUP_NAME", diff --git a/utilities/aws_lambda/subtask_handler.py b/utilities/aws_lambda/subtask_handler.py index 40f116ad..592ffebf 100644 --- a/utilities/aws_lambda/subtask_handler.py +++ b/utilities/aws_lambda/subtask_handler.py @@ -1,15 +1,7 @@ """@subtask_handler decorator for Lambdas that operate on existing SubTasks. Translates an AWS Lambda invocation (SQS-shaped or direct) into -TaskOrchestrator.run_subtask(...) 
calls, emitting an INFO log line for each -subtask's start and completion and a logged exception on failure. Those lines -land in CloudWatch via the Lambda runtime's stdout/stderr capture. - -Each subtask also records ``cloud_logs_url`` -- a deep link to this -invocation's CloudWatch log stream -- so an operator can jump from a SubTask -row straight to its logs. It is built from the environment variables the AWS -Lambda runtime sets, so it is populated only on real Lambda invocations and -left unset under the local RIE (which does not export them). +TaskOrchestrator.run_subtask(...) calls. """ import json @@ -94,24 +86,10 @@ def _records(event: dict[str, Any]) -> list[dict[str, Any]]: def _console_encode(value: str) -> str: - """Encode a value for a CloudWatch console deep link. - - The console expects URL-encoding with the percent signs themselves - re-encoded as ``$25`` -- e.g. ``/`` becomes ``%2F`` becomes ``$252F``. - """ return quote(value, safe="").replace("%", "$25") def _cloudwatch_url() -> Optional[str]: - """Build a CloudWatch console URL for this invocation's log stream. - - Sourced entirely from the environment variables the AWS Lambda runtime - sets -- ``AWS_REGION``, ``AWS_LAMBDA_LOG_GROUP_NAME`` and - ``AWS_LAMBDA_LOG_STREAM_NAME``. Returns None when any is absent, which is - the case outside a real Lambda (the local RIE does not export them) -- so - ``SubTask.cloud_logs_url`` is left unset rather than storing a link that - points nowhere. 
- """ region = os.environ.get("AWS_REGION") log_group = os.environ.get("AWS_LAMBDA_LOG_GROUP_NAME") log_stream = os.environ.get("AWS_LAMBDA_LOG_STREAM_NAME") From dc159e0b457d8e72e0e64dc931d21a9ae9dfed39 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 14:00:19 +0000 Subject: [PATCH 79/91] tests framework completed --- .../user_address_csv_s3_repository.py | 3 ++ .../user_address/user_address_repository.py | 6 +-- tests/conftest.py | 48 +++++++++++++++++++ .../addresses/test_postcode_batching.py | 23 +++++++++ tests/domain/addresses/test_user_address.py | 19 ++++++++ tests/domain/tasks/test_subtasks.py | 20 ++++++++ tests/domain/tasks/test_tasks.py | 31 +++++++++++- tests/domain/test_postcode.py | 11 +++++ .../test_address2uprn_queue_client.py | 6 +++ tests/infrastructure/test_csv_s3_client.py | 8 ++++ tests/infrastructure/test_s3_client.py | 5 ++ tests/infrastructure/test_s3_uri.py | 8 ++++ tests/infrastructure/test_sqs_client.py | 6 +++ .../test_postcode_splitter_orchestrator.py | 23 ++++++--- tests/orchestration/test_task_orchestrator.py | 34 +++++++++++-- .../test_subtask_postgres_repository.py | 47 ++++++++++++------ .../postgres/test_task_postgres_repository.py | 25 ++++++---- .../test_user_address_csv_s3_repository.py | 26 ++++++++++ .../aws_lambda/test_subtask_handler.py | 31 ++++++++++-- 19 files changed, 336 insertions(+), 44 deletions(-) create mode 100644 tests/conftest.py diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py index 9b93b638..058fd5a5 100644 --- a/repositories/user_address/user_address_csv_s3_repository.py +++ b/repositories/user_address/user_address_csv_s3_repository.py @@ -53,6 +53,9 @@ class UserAddressCsvS3Repository(UserAddressRepository): {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)} for addr in addresses ] + + # TODO: [New Starter Task] file_name generation can be standardised + # and also easier to read, test for 
future implementation. Buiild that! filename = ( f"{datetime.now(timezone.utc).isoformat()}_{uuid.uuid4().hex[:8]}.csv" ) diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py index 170f34dd..b2c0f866 100644 --- a/repositories/user_address/user_address_repository.py +++ b/repositories/user_address/user_address_repository.py @@ -7,9 +7,7 @@ from domain.addresses.user_address import UserAddress class UserAddressRepository(ABC): @abstractmethod - def load_batch(self, s3_uri: str) -> list[UserAddress]: - ... + def load_batch(self, s3_uri: str) -> list[UserAddress]: ... @abstractmethod - def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: - ... + def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: ... diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..0a246372 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,48 @@ +"""Shared pytest fixtures for the ``tests/`` tree. + +Provides an ephemeral PostgreSQL engine for tests that exercise SQLModel +repositories. PostgreSQL has no true in-memory mode; ``pytest-postgresql`` +starts a real, throwaway server in a temp directory (the process is started +once per session and a fresh database is created/dropped per test). That is +the closest equivalent to "in-memory" and matches production behaviour far +better than SQLite (enums, JSONB, constraint semantics, etc.). +""" + +from __future__ import annotations + +import glob +from collections.abc import Iterator +from typing import Any + +import pytest +from psycopg import Connection +from pytest_postgresql import factories +from sqlalchemy import Engine +from sqlmodel import SQLModel, create_engine + +# Importing the SQLModel row modules registers their tables on +# SQLModel.metadata so ``create_all`` builds the full schema. Imports look +# unused; they aren't. 
+ + +# pg_ctl ships under a versioned path and is not on PATH in the dev container. +_PG_CTL = next(iter(sorted(glob.glob("/usr/lib/postgresql/*/bin/pg_ctl"))), "pg_ctl") + +postgresql_proc = factories.postgresql_proc( + executable=_PG_CTL +) # pyright: ignore[reportUnknownMemberType] +postgresql = factories.postgresql("postgresql_proc") + + +@pytest.fixture +def db_engine(postgresql: Connection[Any]) -> Iterator[Engine]: + """A SQLModel engine bound to a fresh, ephemeral PostgreSQL database.""" + info = postgresql.info + url = f"postgresql+psycopg://{info.user}:@{info.host}:{info.port}/{info.dbname}" + engine = create_engine(url) + SQLModel.metadata.create_all(engine) + try: + yield engine + finally: + SQLModel.metadata.drop_all(engine) + engine.dispose() diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py index c69722ba..8ffcf1b5 100644 --- a/tests/domain/addresses/test_postcode_batching.py +++ b/tests/domain/addresses/test_postcode_batching.py @@ -15,12 +15,16 @@ def _addrs(postcode: str, n: int) -> list[UserAddress]: def test_empty_input_yields_no_batches() -> None: + # act / assert assert list(iter_postcode_grouped_batches([])) == [] def test_single_batch_under_cap() -> None: + # arrange addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 2) + # act batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500)) + # assert assert len(batches) == 1 assert batches[0] == addrs @@ -28,8 +32,11 @@ def test_single_batch_under_cap() -> None: def test_multiple_postcodes_packed_into_one_batch_up_to_cap() -> None: # Two groups whose total exactly equals the cap pack into a single # batch -- no premature flush. 
+ # arrange addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 2) + # act batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + # assert assert len(batches) == 1 assert len(batches[0]) == 5 @@ -37,8 +44,11 @@ def test_multiple_postcodes_packed_into_one_batch_up_to_cap() -> None: def test_flush_on_overflow_before_adding_next_postcode() -> None: # Cap is 5. First group fills 3 slots; second group of 3 would overflow, # so the buffer is flushed first and the next group starts a fresh batch. + # arrange addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3) + # act batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + # assert assert len(batches) == 2 assert [str(a.postcode) for a in batches[0]] == ["AA11AA"] * 3 assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 3 @@ -47,8 +57,11 @@ def test_flush_on_overflow_before_adding_next_postcode() -> None: def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None: # An oversize single-postcode group goes out as one batch larger than # the cap -- the cap never splits a postcode. + # arrange addrs = _addrs("AA1 1AA", 7) + # act batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5)) + # assert assert len(batches) == 1 assert len(batches[0]) == 7 @@ -56,12 +69,15 @@ def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None: def test_oversize_group_flushes_existing_buffer_first() -> None: # Mirrors the legacy ``if buffer: flush`` branch when an oversize group # is encountered: buffered work must not be lost or interleaved. 
+ # arrange small = _addrs("AA1 1AA", 2) big = _addrs("BB2 2BB", 7) tail = _addrs("CC3 3CC", 1) + # act batches = list( iter_postcode_grouped_batches(small + big + tail, max_batch_size=5) ) + # assert assert len(batches) == 3 assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"] assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 7 @@ -70,17 +86,23 @@ def test_oversize_group_flushes_existing_buffer_first() -> None: def test_final_flush_yields_remaining_buffer() -> None: # No overflow ever happens, but the trailing buffer must still come out. + # arrange addrs = _addrs("AA1 1AA", 2) + _addrs("BB2 2BB", 2) + # act batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500)) + # assert assert batches == [addrs] def test_postcode_grouping_preserves_first_seen_order() -> None: # Interleaved input must still group by postcode and emit in first-seen # order -- never alphabetical. + # arrange a1, a2 = _addrs("ZZ9 9ZZ", 2) b1, b2 = _addrs("AA1 1AA", 2) + # act batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2])) + # assert assert len(batches) == 1 assert [str(a.postcode) for a in batches[0]] == [ "ZZ99ZZ", @@ -91,5 +113,6 @@ def test_postcode_grouping_preserves_first_seen_order() -> None: def test_invalid_max_batch_size_raises() -> None: + # act / assert with pytest.raises(ValueError, match="max_batch_size"): list(iter_postcode_grouped_batches([], max_batch_size=0)) diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py index fa44ad61..8d092df3 100644 --- a/tests/domain/addresses/test_user_address.py +++ b/tests/domain/addresses/test_user_address.py @@ -7,35 +7,45 @@ from domain.postcode import Postcode def test_user_address_holds_postcode_value_object() -> None: + # act addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) + # assert assert addr.postcode == Postcode("SW1A1AA") def test_user_address_preserves_user_address_verbatim() -> None: # The 
free-text user_address string is intentionally NOT normalised -- # only the postcode is canonicalised, and that happens inside Postcode. + # act addr = UserAddress( user_address=" 1 The Street ", postcode=Postcode("SW1A1AA") ) + # assert assert addr.user_address == " 1 The Street " def test_user_address_internal_reference_defaults_to_none() -> None: + # act addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + # assert assert addr.internal_reference is None def test_user_address_internal_reference_accepted() -> None: + # act addr = UserAddress( user_address="1 The Street", postcode=Postcode("SW1A1AA"), internal_reference="cust-42", ) + # assert assert addr.internal_reference == "cust-42" def test_user_address_is_frozen() -> None: + # arrange addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + # act / assert with pytest.raises(dataclasses.FrozenInstanceError): addr.postcode = Postcode("OTHER") # type: ignore[misc] @@ -43,29 +53,37 @@ def test_user_address_is_frozen() -> None: def test_user_address_equality_uses_canonical_postcode() -> None: # Postcode sanitises eagerly, so addresses built from different surface # forms of the same postcode compare equal. 
+ # arrange a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa")) b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + # act / assert assert a == b def test_user_address_source_row_defaults_to_empty_dict() -> None: + # act addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA")) + # assert assert addr.source_row == {} def test_user_address_carries_source_row() -> None: + # arrange row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"} + # act addr = UserAddress( user_address="1 The Street", postcode=Postcode("SW1A 1AA"), source_row=row, ) + # assert assert addr.source_row == row def test_user_address_equality_ignores_source_row() -> None: # source_row is excluded from equality (and hashing): identity stays # defined by the parsed fields. + # arrange a = UserAddress( user_address="1 The Street", postcode=Postcode("SW1A1AA"), @@ -76,4 +94,5 @@ def test_user_address_equality_ignores_source_row() -> None: postcode=Postcode("SW1A1AA"), source_row={"y": "2"}, ) + # act / assert assert a == b diff --git a/tests/domain/tasks/test_subtasks.py b/tests/domain/tasks/test_subtasks.py index 2721d38f..8cee4496 100644 --- a/tests/domain/tasks/test_subtasks.py +++ b/tests/domain/tasks/test_subtasks.py @@ -6,10 +6,13 @@ from domain.tasks.subtasks import SubTask, SubTaskStatus def test_create_subtask_starts_waiting() -> None: + # arrange task_id = uuid4() + # act st = SubTask.create(task_id=task_id, inputs={"foo": "bar"}) + # assert assert st.task_id == task_id assert st.status is SubTaskStatus.WAITING assert st.inputs == {"foo": "bar"} @@ -19,57 +22,74 @@ def test_create_subtask_starts_waiting() -> None: def test_start_transitions_to_in_progress_and_sets_cloud_logs_url() -> None: + # arrange st = SubTask.create(task_id=uuid4()) + # act st.start(cloud_logs_url="https://example/log") + # assert assert st.status is SubTaskStatus.IN_PROGRESS assert st.cloud_logs_url == "https://example/log" assert 
st.job_started is not None def test_start_is_idempotent_from_in_progress() -> None: + # arrange st = SubTask.create(task_id=uuid4()) st.start() first_start = st.job_started + # act st.start(cloud_logs_url="https://other") + # assert assert st.status is SubTaskStatus.IN_PROGRESS assert st.job_started == first_start # not overwritten assert st.cloud_logs_url == "https://other" def test_start_rejects_from_terminal_status() -> None: + # arrange st = SubTask.create(task_id=uuid4()) st.complete() + # act / assert with pytest.raises(ValueError): st.start() def test_complete_marks_outputs_and_job_completed() -> None: + # arrange st = SubTask.create(task_id=uuid4()) st.start() + # act st.complete({"uprn": "123"}) + # assert assert st.status is SubTaskStatus.COMPLETE assert st.outputs == {"result": {"uprn": "123"}} assert st.job_completed is not None def test_complete_without_result_leaves_outputs_unset() -> None: + # arrange st = SubTask.create(task_id=uuid4()) + # act st.complete() + # assert assert st.outputs is None def test_fail_records_error_in_outputs() -> None: + # arrange st = SubTask.create(task_id=uuid4()) err = RuntimeError("boom") + # act st.fail(err) + # assert assert st.status is SubTaskStatus.FAILED assert st.outputs == {"error": "boom"} assert st.job_completed is not None diff --git a/tests/domain/tasks/test_tasks.py b/tests/domain/tasks/test_tasks.py index f30c0aa1..ba82412b 100644 --- a/tests/domain/tasks/test_tasks.py +++ b/tests/domain/tasks/test_tasks.py @@ -5,12 +5,12 @@ from domain.tasks.tasks import Source, Task, TaskStatus def test_create_task_starts_waiting() -> None: - # Arrange / Act + # arrange / act t = Task.create( task_source="manual:test", source=Source.PORTFOLIO, source_id="abc-123" ) - # Assert + # assert assert t.status is TaskStatus.WAITING assert t.source is Source.PORTFOLIO assert t.source_id == "abc-123" @@ -19,86 +19,113 @@ def test_create_task_starts_waiting() -> None: def test_create_task_rejects_blank_task_source() -> None: + # 
act / assert with pytest.raises(ValueError, match="task_source"): Task.create(task_source=" ") def test_start_transitions_to_in_progress() -> None: + # arrange t = Task.create(task_source="manual:test") + # act t.start() + # assert assert t.status is TaskStatus.IN_PROGRESS def test_complete_marks_job_completed() -> None: + # arrange t = Task.create(task_source="manual:test") t.start() + # act t.complete() + # assert assert t.status is TaskStatus.COMPLETE assert t.job_completed is not None def test_fail_marks_job_completed() -> None: + # arrange t = Task.create(task_source="manual:test") + # act t.fail() + # assert assert t.status is TaskStatus.FAILED assert t.job_completed is not None def test_start_rejects_from_terminal_status() -> None: + # arrange t = Task.create(task_source="manual:test") t.complete() + # act / assert with pytest.raises(ValueError): t.start() def test_recalculate_with_empty_statuses_is_noop() -> None: + # arrange t = Task.create(task_source="manual:test") original_status = t.status original_completed = t.job_completed + # act t.recalculate_from_subtasks([]) + # assert assert t.status is original_status assert t.job_completed is original_completed def test_recalculate_all_waiting_keeps_waiting() -> None: + # arrange t = Task.create(task_source="manual:test") t.start() # task moved to IN_PROGRESS earlier t.complete() # then COMPLETE, with job_completed set + # act t.recalculate_from_subtasks([SubTaskStatus.WAITING, SubTaskStatus.WAITING]) + # assert assert t.status is TaskStatus.WAITING assert t.job_completed is None def test_recalculate_any_in_progress_marks_in_progress() -> None: + # arrange t = Task.create(task_source="manual:test") + # act t.recalculate_from_subtasks( [SubTaskStatus.WAITING, SubTaskStatus.IN_PROGRESS, SubTaskStatus.COMPLETE] ) + # assert assert t.status is TaskStatus.IN_PROGRESS assert t.job_completed is None def test_recalculate_all_complete_marks_complete() -> None: + # arrange t = Task.create(task_source="manual:test") + # 
act t.recalculate_from_subtasks([SubTaskStatus.COMPLETE, SubTaskStatus.COMPLETE]) + # assert assert t.status is TaskStatus.COMPLETE assert t.job_completed is not None def test_recalculate_any_failed_marks_failed_even_with_others() -> None: + # arrange t = Task.create(task_source="manual:test") + # act t.recalculate_from_subtasks( [SubTaskStatus.IN_PROGRESS, SubTaskStatus.COMPLETE, SubTaskStatus.FAILED] ) + # assert assert t.status is TaskStatus.FAILED assert t.job_completed is not None diff --git a/tests/domain/test_postcode.py b/tests/domain/test_postcode.py index 89d5cdc8..f7ce9015 100644 --- a/tests/domain/test_postcode.py +++ b/tests/domain/test_postcode.py @@ -6,43 +6,54 @@ from domain.postcode import Postcode def test_postcode_uppercases() -> None: + # act / assert assert Postcode("sw1a1aa").value == "SW1A1AA" def test_postcode_strips_internal_spaces() -> None: + # act / assert assert Postcode("sw1a 1aa").value == "SW1A1AA" def test_postcode_strips_leading_and_trailing_whitespace() -> None: + # act / assert assert Postcode(" sw1a 1aa ").value == "SW1A1AA" def test_postcode_strips_tabs_and_newlines() -> None: # CSV ingestion occasionally introduces stray whitespace characters; the # canonical form must absorb them just like literal spaces. + # act / assert assert Postcode("sw1a\t1aa\n").value == "SW1A1AA" def test_postcode_construction_is_idempotent() -> None: + # arrange once = Postcode("sw1a 1aa") + # act / assert assert Postcode(once.value).value == "SW1A1AA" def test_postcode_empty_string() -> None: + # act / assert assert Postcode("").value == "" def test_postcode_str_returns_canonical_value() -> None: + # act / assert assert str(Postcode("sw1a 1aa")) == "SW1A1AA" def test_postcode_equality_ignores_surface_form() -> None: # Differing case / whitespace sanitise to the same canonical value, so # the value objects compare equal. 
+ # act / assert assert Postcode("sw1a 1aa") == Postcode("SW1A1AA") def test_postcode_is_frozen() -> None: + # arrange postcode = Postcode("SW1A1AA") + # act / assert with pytest.raises(dataclasses.FrozenInstanceError): postcode.value = "OTHER" # type: ignore[misc] diff --git a/tests/infrastructure/test_address2uprn_queue_client.py b/tests/infrastructure/test_address2uprn_queue_client.py index b4114742..c8e89ece 100644 --- a/tests/infrastructure/test_address2uprn_queue_client.py +++ b/tests/infrastructure/test_address2uprn_queue_client.py @@ -28,12 +28,15 @@ def queue_setup() -> Iterator[tuple[Address2UprnQueueClient, Any, str]]: def test_publish_returns_message_id( queue_setup: tuple[Address2UprnQueueClient, Any, str], ) -> None: + # arrange client, _boto, _url = queue_setup + # act message_id = client.publish( parent_task_id=uuid4(), child_subtask_id=uuid4(), s3_uri="s3://my-bucket/path/to/chunk.csv", ) + # assert assert isinstance(message_id, str) assert message_id @@ -41,17 +44,20 @@ def test_publish_returns_message_id( def test_publish_body_uses_typed_shape( queue_setup: tuple[Address2UprnQueueClient, Any, str], ) -> None: + # arrange client, boto_client, queue_url = queue_setup parent_id = uuid4() child_id = uuid4() s3_uri = "s3://my-bucket/path/to/chunk.csv" + # act client.publish( parent_task_id=parent_id, child_subtask_id=child_id, s3_uri=s3_uri, ) + # assert received: dict[str, Any] = boto_client.receive_message( QueueUrl=queue_url, MaxNumberOfMessages=1 ) diff --git a/tests/infrastructure/test_csv_s3_client.py b/tests/infrastructure/test_csv_s3_client.py index 4b9fc199..30e27164 100644 --- a/tests/infrastructure/test_csv_s3_client.py +++ b/tests/infrastructure/test_csv_s3_client.py @@ -18,26 +18,34 @@ def csv_client() -> Iterator[CsvS3Client]: def test_save_rows_returns_s3_uri(csv_client: CsvS3Client) -> None: + # arrange rows = [{"address": "1 High St", "postcode": "AB1 2CD"}] + # act uri = csv_client.save_rows(rows, "uploads/addresses.csv") + # assert 
assert uri == f"s3://{BUCKET}/uploads/addresses.csv" def test_round_trip_preserves_rows(csv_client: CsvS3Client) -> None: + # arrange rows = [ {"address": "1 High St", "postcode": "AB1 2CD"}, {"address": "2 Low St", "postcode": "XY9 8ZW"}, ] + # act uri = csv_client.save_rows(rows, "uploads/addresses.csv") fetched = csv_client.read_rows(uri) + # assert assert fetched == rows def test_save_rows_rejects_empty_list(csv_client: CsvS3Client) -> None: + # act / assert with pytest.raises(ValueError, match="empty"): csv_client.save_rows([], "uploads/empty.csv") def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None: + # act / assert with pytest.raises(ValueError, match="does not match client bucket"): csv_client.read_rows("s3://other-bucket/uploads/addresses.csv") diff --git a/tests/infrastructure/test_s3_client.py b/tests/infrastructure/test_s3_client.py index 7ed4c30b..67db4f58 100644 --- a/tests/infrastructure/test_s3_client.py +++ b/tests/infrastructure/test_s3_client.py @@ -18,14 +18,19 @@ def s3_client() -> Iterator[S3Client]: def test_put_object_returns_s3_uri(s3_client: S3Client) -> None: + # act uri = s3_client.put_object("folder/data.bin", b"payload") + # assert assert uri == f"s3://{BUCKET}/folder/data.bin" def test_get_object_returns_bytes_written_by_put_object(s3_client: S3Client) -> None: + # arrange s3_client.put_object("round/trip.bin", b"hello world") + # act / assert assert s3_client.get_object("round/trip.bin") == b"hello world" def test_bucket_property_exposes_configured_bucket(s3_client: S3Client) -> None: + # act / assert assert s3_client.bucket == BUCKET diff --git a/tests/infrastructure/test_s3_uri.py b/tests/infrastructure/test_s3_uri.py index 896c5959..32fd710f 100644 --- a/tests/infrastructure/test_s3_uri.py +++ b/tests/infrastructure/test_s3_uri.py @@ -4,29 +4,37 @@ from infrastructure.s3_uri import parse_s3_uri def test_parses_simple_s3_uri() -> None: + # act / assert assert parse_s3_uri("s3://my-bucket/file.csv") == 
("my-bucket", "file.csv") def test_parses_s3_uri_with_nested_key() -> None: + # act bucket, key = parse_s3_uri("s3://my-bucket/nested/path/to/file.csv") + # assert assert (bucket, key) == ("my-bucket", "nested/path/to/file.csv") def test_rejects_s3_uri_without_key() -> None: + # act / assert with pytest.raises(ValueError, match="bucket and a key"): parse_s3_uri("s3://my-bucket") def test_rejects_s3_uri_with_empty_key() -> None: + # act / assert with pytest.raises(ValueError, match="bucket and a key"): parse_s3_uri("s3://my-bucket/") def test_parses_console_url_prefix() -> None: + # arrange url = "https://eu-west-2.console.aws.amazon.com/s3/object/my-bucket?prefix=nested%2Ffile.csv" + # act / assert assert parse_s3_uri(url) == ("my-bucket", "nested/file.csv") def test_rejects_unparseable_string() -> None: + # act / assert with pytest.raises(ValueError): parse_s3_uri("not-a-uri-at-all") diff --git a/tests/infrastructure/test_sqs_client.py b/tests/infrastructure/test_sqs_client.py index 7f1e8f78..44186bbb 100644 --- a/tests/infrastructure/test_sqs_client.py +++ b/tests/infrastructure/test_sqs_client.py @@ -19,17 +19,23 @@ def sqs_setup() -> Iterator[tuple[SqsClient, Any, str]]: def test_send_returns_message_id(sqs_setup: tuple[SqsClient, Any, str]) -> None: + # arrange client, _boto, _url = sqs_setup + # act message_id = client.send({"hello": "world"}) + # assert assert isinstance(message_id, str) assert message_id def test_send_json_serialises_body(sqs_setup: tuple[SqsClient, Any, str]) -> None: + # arrange client, boto_client, queue_url = sqs_setup body = {"hello": "world", "count": 3} + # act client.send(body) + # assert received: dict[str, Any] = boto_client.receive_message( QueueUrl=queue_url, MaxNumberOfMessages=1 ) diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py index 4ee2315e..a718ffbc 100644 --- a/tests/orchestration/test_postcode_splitter_orchestrator.py +++ 
b/tests/orchestration/test_postcode_splitter_orchestrator.py @@ -9,7 +9,8 @@ from typing import Any, cast import boto3 import pytest from moto import mock_aws -from sqlmodel import Session, SQLModel, create_engine +from sqlalchemy import Engine +from sqlmodel import Session from infrastructure.address2uprn_queue_client import Address2UprnQueueClient from infrastructure.csv_s3_client import CsvS3Client @@ -65,7 +66,7 @@ class Harness: @pytest.fixture -def harness() -> Iterator[Harness]: +def harness(db_engine: Engine) -> Iterator[Harness]: with mock_aws(): # Infra: S3 + SQS boto_s3 = _make_boto_client("s3") @@ -78,10 +79,8 @@ def harness() -> Iterator[Harness]: repo = UserAddressCsvS3Repository(csv_client, BUCKET) queue_client = Address2UprnQueueClient(boto_sqs, queue_url) - # DB: in-memory SQLite TaskOrchestrator - engine = create_engine("sqlite://") - SQLModel.metadata.create_all(engine) - with Session(engine) as session: + # DB: ephemeral PostgreSQL TaskOrchestrator + with Session(db_engine) as session: task_repo = TaskPostgresRepository(session=session) subtask_repo = SubTaskPostgresRepository(session=session) task_orchestrator = TaskOrchestrator( @@ -169,6 +168,7 @@ def _drain_queue(boto_sqs: Any, queue_url: str) -> list[dict[str, Any]]: def test_split_and_dispatch_creates_three_children_for_fixture( harness: Harness, ) -> None: + # arrange parent_task, parent_subtask = ( harness.task_orchestrator.create_task_with_subtask( task_source="manual:postcode-splitter-int" @@ -176,12 +176,14 @@ def test_split_and_dispatch_creates_three_children_for_fixture( ) input_uri = _upload_fixture_csv(harness.csv_client) + # act child_ids = harness.splitter.split_and_dispatch( parent_task_id=parent_task.id, parent_subtask_id=parent_subtask.id, input_s3_uri=input_uri, ) + # assert assert len(child_ids) == 3 # All child ids are unique and persisted as WAITING children of the # parent task. 
@@ -194,6 +196,7 @@ def test_split_and_dispatch_creates_three_children_for_fixture( def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri( harness: Harness, ) -> None: + # arrange parent_task, parent_subtask = ( harness.task_orchestrator.create_task_with_subtask( task_source="manual:postcode-splitter-int" @@ -201,12 +204,14 @@ def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri( ) input_uri = _upload_fixture_csv(harness.csv_client) + # act child_ids = harness.splitter.split_and_dispatch( parent_task_id=parent_task.id, parent_subtask_id=parent_subtask.id, input_s3_uri=input_uri, ) + # assert for cid in child_ids: child = harness.subtasks.get(cid) assert child.inputs is not None @@ -224,6 +229,7 @@ def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri( def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids( harness: Harness, ) -> None: + # arrange parent_task, parent_subtask = ( harness.task_orchestrator.create_task_with_subtask( task_source="manual:postcode-splitter-int" @@ -231,12 +237,14 @@ def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids( ) input_uri = _upload_fixture_csv(harness.csv_client) + # act child_ids = harness.splitter.split_and_dispatch( parent_task_id=parent_task.id, parent_subtask_id=parent_subtask.id, input_s3_uri=input_uri, ) + # assert bodies = _drain_queue(harness.boto_sqs, harness.queue_url) assert len(bodies) == len(child_ids) @@ -258,6 +266,7 @@ def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids( def test_split_and_dispatch_returns_child_ids_in_dispatch_order( harness: Harness, ) -> None: + # arrange parent_task, parent_subtask = ( harness.task_orchestrator.create_task_with_subtask( task_source="manual:postcode-splitter-int" @@ -265,12 +274,14 @@ def test_split_and_dispatch_returns_child_ids_in_dispatch_order( ) input_uri = _upload_fixture_csv(harness.csv_client) + # act child_ids = 
harness.splitter.split_and_dispatch( parent_task_id=parent_task.id, parent_subtask_id=parent_subtask.id, input_s3_uri=input_uri, ) + # assert # Re-load each child's saved batch and inspect the postcode_clean column # to confirm the dispatch order matches the postcode-batching algorithm: # AA-batch first, BB oversize batch second, CC final-flush third. diff --git a/tests/orchestration/test_task_orchestrator.py b/tests/orchestration/test_task_orchestrator.py index c0816d2d..ae89991d 100644 --- a/tests/orchestration/test_task_orchestrator.py +++ b/tests/orchestration/test_task_orchestrator.py @@ -2,7 +2,8 @@ from collections.abc import Iterator from dataclasses import dataclass import pytest -from sqlmodel import Session, SQLModel, create_engine +from sqlalchemy import Engine +from sqlmodel import Session from domain.tasks.subtasks import SubTask, SubTaskStatus from domain.tasks.tasks import Source, TaskStatus @@ -19,10 +20,8 @@ class Harness: @pytest.fixture -def harness() -> Iterator[Harness]: - engine = create_engine("sqlite://") - SQLModel.metadata.create_all(engine) - with Session(engine) as session: +def harness(db_engine: Engine) -> Iterator[Harness]: + with Session(db_engine) as session: tasks = TaskPostgresRepository(session=session) subtasks = SubTaskPostgresRepository(session=session) yield Harness( @@ -35,6 +34,7 @@ def harness() -> Iterator[Harness]: def test_create_task_with_subtask_creates_both_in_waiting( harness: Harness, ) -> None: + # act task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test", inputs={"foo": "bar"}, @@ -42,6 +42,7 @@ def test_create_task_with_subtask_creates_both_in_waiting( source_id="abc", ) + # assert assert task.status is TaskStatus.WAITING assert subtask.status is SubTaskStatus.WAITING assert subtask.task_id == task.id @@ -49,27 +50,33 @@ def test_create_task_with_subtask_creates_both_in_waiting( def test_start_subtask_cascades_to_in_progress(harness: Harness) -> None: + # arrange task, subtask 
= harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) + # act started = harness.orchestrator.start_subtask( subtask.id, cloud_logs_url="https://example/log" ) + # assert assert started.status is SubTaskStatus.IN_PROGRESS assert started.cloud_logs_url == "https://example/log" assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS def test_complete_subtask_cascades_to_complete(harness: Harness) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) harness.orchestrator.start_subtask(subtask.id) + # act harness.orchestrator.complete_subtask(subtask.id, {"value": 42}) + # assert done_subtask = harness.subtasks.get(subtask.id) done_task = harness.tasks.get(task.id) assert done_subtask.outputs == {"result": {"value": 42}} @@ -78,12 +85,15 @@ def test_complete_subtask_cascades_to_complete(harness: Harness) -> None: def test_fail_subtask_cascades_to_failed(harness: Harness) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) + # act harness.orchestrator.fail_subtask(subtask.id, RuntimeError("boom")) + # assert failed_subtask = harness.subtasks.get(subtask.id) failed_task = harness.tasks.get(task.id) assert failed_subtask.outputs == {"error": "boom"} @@ -93,42 +103,51 @@ def test_fail_subtask_cascades_to_failed(harness: Harness) -> None: def test_failed_subtask_locks_task_failed_even_with_others_complete( harness: Harness, ) -> None: + # arrange task, first = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) second = SubTask.create(task_id=task.id) harness.subtasks.create(second) + # act harness.orchestrator.complete_subtask(first.id) harness.orchestrator.fail_subtask(second.id, RuntimeError("nope")) + # assert assert harness.tasks.get(task.id).status is TaskStatus.FAILED def test_mixed_complete_and_in_progress_keeps_task_in_progress( harness: Harness, ) -> None: + # arrange task, first = 
harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) second = SubTask.create(task_id=task.id) harness.subtasks.create(second) + # act harness.orchestrator.complete_subtask(first.id) harness.orchestrator.start_subtask(second.id) + # assert assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS def test_run_subtask_happy_path_returns_result_and_cascades_complete( harness: Harness, ) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) + # act result = harness.orchestrator.run_subtask(subtask.id, work=lambda: {"answer": 42}) + # assert assert result == {"answer": 42} assert harness.subtasks.get(subtask.id).status is SubTaskStatus.COMPLETE assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE @@ -137,16 +156,19 @@ def test_run_subtask_happy_path_returns_result_and_cascades_complete( def test_create_child_subtask_adds_waiting_child_without_changing_parent_status( harness: Harness, ) -> None: + # arrange task, first = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) harness.orchestrator.start_subtask(first.id) assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS + # act child = harness.orchestrator.create_child_subtask( task.id, inputs={"split": "a"} ) + # assert persisted_child = harness.subtasks.get(child.id) assert persisted_child.task_id == task.id assert persisted_child.status is SubTaskStatus.WAITING @@ -159,6 +181,7 @@ def test_create_child_subtask_adds_waiting_child_without_changing_parent_status( def test_run_subtask_failing_work_marks_failed_and_reraises( harness: Harness, ) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -166,6 +189,7 @@ def test_run_subtask_failing_work_marks_failed_and_reraises( def boom() -> None: raise RuntimeError("boom") + # act / assert with pytest.raises(RuntimeError, match="boom"): harness.orchestrator.run_subtask(subtask.id, 
work=boom) diff --git a/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py b/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py index ac39e089..9cec52ea 100644 --- a/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py +++ b/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py @@ -1,33 +1,40 @@ from collections.abc import Iterator -from uuid import uuid4 +from uuid import UUID, uuid4 import pytest -from sqlmodel import Session, SQLModel, create_engine +from sqlalchemy import Engine +from sqlmodel import Session -# Importing the SQLModel row modules registers their tables in -# SQLModel.metadata so create_all builds both. Imports look unused; they aren't. -import infrastructure.postgres.subtask_table # noqa: F401 # pyright: ignore[reportUnusedImport] -import infrastructure.postgres.task_table # noqa: F401 # pyright: ignore[reportUnusedImport] from domain.tasks.subtasks import SubTask, SubTaskStatus +from domain.tasks.tasks import Task from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository +from repositories.tasks.task_postgres_repository import TaskPostgresRepository @pytest.fixture -def session() -> Iterator[Session]: - engine = create_engine("sqlite://") - SQLModel.metadata.create_all(engine) - with Session(engine) as s: +def session(db_engine: Engine) -> Iterator[Session]: + with Session(db_engine) as s: yield s +def _persisted_task_id(session: Session) -> UUID: + """Create a parent Task row so SubTask FK constraints are satisfied.""" + task = Task.create(task_source="manual:test") + TaskPostgresRepository(session=session).create(task) + return task.id + + def test_create_and_get_round_trip_preserves_inputs(session: Session) -> None: + # arrange repo = SubTaskPostgresRepository(session=session) - task_id = uuid4() + task_id = _persisted_task_id(session) st = SubTask.create(task_id=task_id, inputs={"address": "68 Glendon Way"}) + # act repo.create(st) fetched = 
repo.get(st.id) + # assert assert fetched.id == st.id assert fetched.task_id == task_id assert fetched.status is SubTaskStatus.WAITING @@ -36,16 +43,21 @@ def test_create_and_get_round_trip_preserves_inputs(session: Session) -> None: def test_save_persists_status_and_outputs(session: Session) -> None: + # arrange repo = SubTaskPostgresRepository(session=session) - st = SubTask.create(task_id=uuid4()) + st = SubTask.create(task_id=_persisted_task_id(session)) repo.create(st) + # act st.start(cloud_logs_url="https://example/log") repo.save(st) + # assert assert repo.get(st.id).status is SubTaskStatus.IN_PROGRESS + # act st.complete({"uprn": "123"}) repo.save(st) + # assert done = repo.get(st.id) assert done.status is SubTaskStatus.COMPLETE assert done.outputs == {"result": {"uprn": "123"}} @@ -54,16 +66,19 @@ def test_save_persists_status_and_outputs(session: Session) -> None: def test_list_by_task_filters_by_task_id(session: Session) -> None: + # arrange repo = SubTaskPostgresRepository(session=session) - task_a = uuid4() - task_b = uuid4() + task_a = _persisted_task_id(session) + task_b = _persisted_task_id(session) repo.create(SubTask.create(task_id=task_a)) repo.create(SubTask.create(task_id=task_a)) repo.create(SubTask.create(task_id=task_b)) + # act a_results = repo.list_by_task(task_a) b_results = repo.list_by_task(task_b) + # assert assert len(a_results) == 2 assert len(b_results) == 1 assert all(s.task_id == task_a for s in a_results) @@ -71,11 +86,15 @@ def test_list_by_task_filters_by_task_id(session: Session) -> None: def test_list_by_task_returns_empty_for_unknown_task(session: Session) -> None: + # arrange repo = SubTaskPostgresRepository(session=session) + # act / assert assert repo.list_by_task(uuid4()) == [] def test_get_missing_raises(session: Session) -> None: + # arrange repo = SubTaskPostgresRepository(session=session) + # act / assert with pytest.raises(ValueError, match="not found"): repo.get(uuid4()) diff --git 
a/tests/repositories/tasks/postgres/test_task_postgres_repository.py b/tests/repositories/tasks/postgres/test_task_postgres_repository.py index 3e1aa226..8a49a861 100644 --- a/tests/repositories/tasks/postgres/test_task_postgres_repository.py +++ b/tests/repositories/tasks/postgres/test_task_postgres_repository.py @@ -2,7 +2,8 @@ from collections.abc import Iterator from uuid import uuid4 import pytest -from sqlmodel import Session, SQLModel, create_engine +from sqlalchemy import Engine +from sqlmodel import Session from domain.tasks.tasks import Source, Task, TaskStatus from infrastructure.postgres.task_table import TaskRow @@ -10,25 +11,23 @@ from repositories.tasks.task_postgres_repository import TaskPostgresRepository @pytest.fixture -def session() -> Iterator[Session]: - engine = create_engine("sqlite://") - SQLModel.metadata.create_all(engine) - with Session(engine) as s: +def session(db_engine: Engine) -> Iterator[Session]: + with Session(db_engine) as s: yield s def test_create_and_get_round_trip(session: Session) -> None: - # Arrange + # arrange repo = TaskPostgresRepository(session=session) t = Task.create( task_source="manual:test", source=Source.PORTFOLIO, source_id="abc-123" ) - # Act + # act repo.create(t) fetched = repo.get(t.id) - # Assert + # assert assert fetched.id == t.id assert fetched.status is TaskStatus.WAITING assert fetched.source is Source.PORTFOLIO @@ -36,33 +35,43 @@ def test_create_and_get_round_trip(session: Session) -> None: def test_save_persists_status_transition(session: Session) -> None: + # arrange repo = TaskPostgresRepository(session=session) t = Task.create(task_source="manual:test") repo.create(t) + # act t.start() repo.save(t) + # assert assert repo.get(t.id).status is TaskStatus.IN_PROGRESS + # act t.complete() repo.save(t) + # assert done = repo.get(t.id) assert done.status is TaskStatus.COMPLETE assert done.job_completed is not None def test_get_missing_raises(session: Session) -> None: + # arrange repo = 
TaskPostgresRepository(session=session) + # act / assert with pytest.raises(ValueError, match="not found"): repo.get(uuid4()) def test_get_normalises_legacy_capitalised_status(session: Session) -> None: # Existing rows written by backend code use "In Progress" (capitalised). + # arrange repo = TaskPostgresRepository(session=session) row = TaskRow(task_source="manual:test", status="In Progress") session.add(row) session.commit() + # act fetched = repo.get(row.id) + # assert assert fetched.status is TaskStatus.IN_PROGRESS diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py index c1acee32..9ffb250a 100644 --- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py +++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py @@ -32,6 +32,7 @@ def _upload_csv( def test_load_batch_parses_address_postcode_and_reference( repo: UserAddressCsvS3Repository, ) -> None: + # arrange rows = [ { "Address 1": "1 High Street", @@ -43,8 +44,10 @@ def test_load_batch_parses_address_postcode_and_reference( ] uri = _upload_csv(repo, rows, "uploads/full.csv") + # act addresses = repo.load_batch(uri) + # assert assert len(addresses) == 1 address = addresses[0] assert address.user_address == "1 High Street, Flat 2, Townville" @@ -55,6 +58,7 @@ def test_load_batch_parses_address_postcode_and_reference( def test_load_batch_uses_only_address_1_when_others_missing( repo: UserAddressCsvS3Repository, ) -> None: + # arrange rows = [ { "Address 1": "10 Cardiff Road", @@ -66,8 +70,10 @@ def test_load_batch_uses_only_address_1_when_others_missing( ] uri = _upload_csv(repo, rows, "uploads/address1-only.csv") + # act addresses = repo.load_batch(uri) + # assert assert len(addresses) == 1 assert addresses[0].user_address == "10 Cardiff Road" assert addresses[0].postcode == Postcode("CF101AA") @@ -77,6 +83,7 @@ def 
test_load_batch_uses_only_address_1_when_others_missing( def test_load_batch_handles_missing_internal_reference( repo: UserAddressCsvS3Repository, ) -> None: + # arrange rows = [ { "Address 1": "5 Park Lane", @@ -88,8 +95,10 @@ def test_load_batch_handles_missing_internal_reference( ] uri = _upload_csv(repo, rows, "uploads/no-ref.csv") + # act addresses = repo.load_batch(uri) + # assert assert len(addresses) == 1 assert addresses[0].user_address == "5 Park Lane" assert addresses[0].postcode == Postcode("M11AA") @@ -101,6 +110,7 @@ def test_load_batch_captures_full_source_row( ) -> None: # A raw EPC-export-shaped row: the splitter must preserve every column, # not just the ones it parses into UserAddress fields. + # arrange row = { "Asset Reference": "511", "Address 1": "9 Abingdon Road Padiham Lancashire BB12 7BX", @@ -110,17 +120,21 @@ def test_load_batch_captures_full_source_row( } uri = _upload_csv(repo, [row], "uploads/epc.csv") + # act addresses = repo.load_batch(uri) + # assert assert addresses[0].source_row == row def test_load_batch_raises_when_postcode_column_absent( repo: UserAddressCsvS3Repository, ) -> None: + # arrange rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}] uri = _upload_csv(repo, rows, "uploads/no-postcode.csv") + # act / assert with pytest.raises(ValueError, match="no 'postcode' column"): repo.load_batch(uri) @@ -128,6 +142,7 @@ def test_load_batch_raises_when_postcode_column_absent( def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( repo: UserAddressCsvS3Repository, ) -> None: + # arrange row = { "Asset Reference": "511", "Address 1": "9 Abingdon Road Padiham Lancashire BB12 7BX", @@ -137,9 +152,11 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( uri = _upload_csv(repo, [row], "uploads/epc.csv") addresses = repo.load_batch(uri) + # act saved_uri = repo.save_batch(addresses, "tasks/passthrough") saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: 
ignore[reportPrivateUsage] + # assert assert len(saved_rows) == 1 saved = saved_rows[0] # Every original column survives, byte-for-byte. @@ -152,6 +169,7 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean( def test_save_batch_returns_uri_under_path_prefix( repo: UserAddressCsvS3Repository, ) -> None: + # arrange addresses = [ UserAddress( user_address="1 High Street", @@ -160,8 +178,10 @@ def test_save_batch_returns_uri_under_path_prefix( ), ] + # act uri = repo.save_batch(addresses, "tasks/abc/batches") + # assert assert uri.startswith(f"s3://{BUCKET}/tasks/abc/batches/") assert uri.endswith(".csv") @@ -169,6 +189,7 @@ def test_save_batch_returns_uri_under_path_prefix( def test_save_then_reload_round_trip_preserves_columns( repo: UserAddressCsvS3Repository, ) -> None: + # arrange rows = [ { "Address 1": "1 High Street", @@ -184,9 +205,11 @@ def test_save_then_reload_round_trip_preserves_columns( uri = _upload_csv(repo, rows, "uploads/round-trip.csv") addresses = repo.load_batch(uri) + # act saved_uri = repo.save_batch(addresses, "tasks/round-trip") saved_rows = repo._csv_client.read_rows(saved_uri) # pyright: ignore[reportPrivateUsage] + # assert # Original columns come back verbatim; postcode_clean is the only addition. 
assert [ {k: v for k, v in r.items() if k != "postcode_clean"} for r in saved_rows @@ -197,6 +220,7 @@ def test_save_then_reload_round_trip_preserves_columns( def test_save_batch_uses_unique_filename_per_call( repo: UserAddressCsvS3Repository, ) -> None: + # arrange addresses = [ UserAddress( user_address="1 High Street", @@ -205,7 +229,9 @@ def test_save_batch_uses_unique_filename_per_call( ), ] + # act uri_1 = repo.save_batch(addresses, "tasks/uniqueness") uri_2 = repo.save_batch(addresses, "tasks/uniqueness") + # assert assert uri_1 != uri_2 diff --git a/tests/utilities/aws_lambda/test_subtask_handler.py b/tests/utilities/aws_lambda/test_subtask_handler.py index 9cf68f28..d671adc4 100644 --- a/tests/utilities/aws_lambda/test_subtask_handler.py +++ b/tests/utilities/aws_lambda/test_subtask_handler.py @@ -6,7 +6,8 @@ from typing import Any from uuid import UUID import pytest -from sqlmodel import Session, SQLModel, create_engine +from sqlalchemy import Engine +from sqlmodel import Session from domain.tasks.subtasks import SubTaskStatus from domain.tasks.tasks import TaskStatus @@ -30,10 +31,8 @@ class Harness: @pytest.fixture -def harness() -> Iterator[Harness]: - engine = create_engine("sqlite://") - SQLModel.metadata.create_all(engine) - with Session(engine) as session: +def harness(db_engine: Engine) -> Iterator[Harness]: + with Session(db_engine) as session: tasks = TaskPostgresRepository(session=session) subtasks = SubTaskPostgresRepository(session=session) yield Harness( @@ -50,6 +49,7 @@ def _direct_event(task_id: UUID, subtask_id: UUID) -> dict[str, Any]: def test_subtask_handler_injects_orchestrator_as_third_positional_argument( harness: Harness, ) -> None: + # arrange _, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -64,8 +64,10 @@ def test_subtask_handler_injects_orchestrator_as_third_positional_argument( received["context"] = context received["orchestrator"] = orchestrator + # act 
handler(_direct_event(subtask.task_id, subtask.id), context="ctx-sentinel") + # assert assert received["orchestrator"] is harness.orchestrator assert received["context"] == "ctx-sentinel" assert received["body"]["sub_task_id"] == str(subtask.id) @@ -74,6 +76,7 @@ def test_subtask_handler_injects_orchestrator_as_third_positional_argument( def test_subtask_handler_completes_parent_subtask_on_success( harness: Harness, ) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -84,8 +87,10 @@ def test_subtask_handler_completes_parent_subtask_on_success( ) -> None: return None + # act handler(_direct_event(task.id, subtask.id), context=None) + # assert assert harness.subtasks.get(subtask.id).status is SubTaskStatus.COMPLETE assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE @@ -93,6 +98,7 @@ def test_subtask_handler_completes_parent_subtask_on_success( def test_subtask_handler_marks_parent_failed_and_reraises_on_error( harness: Harness, ) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -103,6 +109,7 @@ def test_subtask_handler_marks_parent_failed_and_reraises_on_error( ) -> None: raise RuntimeError("boom") + # act / assert with pytest.raises(RuntimeError, match="boom"): handler(_direct_event(task.id, subtask.id), context=None) @@ -113,6 +120,7 @@ def test_subtask_handler_marks_parent_failed_and_reraises_on_error( def test_subtask_handler_injected_orchestrator_can_create_child_subtask( harness: Harness, ) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -126,8 +134,10 @@ def test_subtask_handler_injected_orchestrator_can_create_child_subtask( child = orchestrator.create_child_subtask(task.id, inputs={"split": 1}) child_ids.append(child.id) + # act handler(_direct_event(task.id, subtask.id), context=None) + # assert assert len(child_ids) == 1 persisted_child = 
harness.subtasks.get(child_ids[0]) assert persisted_child.task_id == task.id @@ -137,6 +147,7 @@ def test_subtask_handler_injected_orchestrator_can_create_child_subtask( def test_subtask_handler_logs_subtask_lifecycle_on_success( harness: Harness, caplog: pytest.LogCaptureFixture ) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -147,9 +158,11 @@ def test_subtask_handler_logs_subtask_lifecycle_on_success( ) -> None: return None + # act with caplog.at_level(logging.INFO, logger=_LOGGER_NAME): handler(_direct_event(task.id, subtask.id), context=None) + # assert assert f"Running subtask {subtask.id}" in caplog.text assert f"Subtask {subtask.id} completed" in caplog.text @@ -157,6 +170,7 @@ def test_subtask_handler_logs_subtask_lifecycle_on_success( def test_subtask_handler_logs_exception_on_failure( harness: Harness, caplog: pytest.LogCaptureFixture ) -> None: + # arrange task, subtask = harness.orchestrator.create_task_with_subtask( task_source="manual:test" ) @@ -167,6 +181,7 @@ def test_subtask_handler_logs_exception_on_failure( ) -> None: raise RuntimeError("boom") + # act / assert with caplog.at_level(logging.INFO, logger=_LOGGER_NAME): with pytest.raises(RuntimeError, match="boom"): handler(_direct_event(task.id, subtask.id), context=None) @@ -181,6 +196,7 @@ def test_subtask_handler_logs_exception_on_failure( def test_subtask_handler_records_cloudwatch_url_on_subtask( harness: Harness, monkeypatch: pytest.MonkeyPatch ) -> None: + # arrange monkeypatch.setenv("AWS_REGION", "eu-west-2") monkeypatch.setenv( "AWS_LAMBDA_LOG_GROUP_NAME", "/aws/lambda/postcode-splitter" @@ -198,8 +214,10 @@ def test_subtask_handler_records_cloudwatch_url_on_subtask( ) -> None: return None + # act handler(_direct_event(task.id, subtask.id), context=None) + # assert saved_url = harness.subtasks.get(subtask.id).cloud_logs_url assert saved_url is not None assert saved_url.startswith( @@ -213,6 +231,7 @@ def 
test_subtask_handler_records_cloudwatch_url_on_subtask( def test_subtask_handler_leaves_cloudwatch_url_unset_outside_lambda( harness: Harness, monkeypatch: pytest.MonkeyPatch ) -> None: + # arrange for var in ( "AWS_REGION", "AWS_LAMBDA_LOG_GROUP_NAME", @@ -229,6 +248,8 @@ def test_subtask_handler_leaves_cloudwatch_url_unset_outside_lambda( ) -> None: return None + # act handler(_direct_event(task.id, subtask.id), context=None) + # assert assert harness.subtasks.get(subtask.id).cloud_logs_url is None From f10947699eca992b3cbc5ef9b69b744acaf73226 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 14:13:04 +0000 Subject: [PATCH 80/91] pytest.ini --- pytest.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/pytest.ini b/pytest.ini index 99cc8e1b..5044465b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,6 +4,7 @@ log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial testpaths = + tests recommendations/tests backend/tests backend/address2UPRN/tests From 154b820b29f7b6ba2c24c34a3a60a98435a79df8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 14:26:46 +0000 Subject: [PATCH 81/91] pytest.ini --- .github/workflows/unit_tests.yml | 12 ++++++++++++ pytest.ini | 1 - 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index fa4fdf2a..15d4cfe9 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -60,3 +60,15 @@ jobs: -e DB_PASSWORD=test \ -e DB_PORT=5432 \ model-test pytest -vv -m 'not integration' + + # The DDD rewrite (tests/) defines SQLModel table classes that map to the + # same physical tables as the legacy backend models. Both sets share the + # one global SQLModel.metadata, so they cannot be imported into the same + # pytest process. It runs as a separate invocation until the legacy + # models are retired. 
Its DB is spawned in-process by pytest-postgresql, + # so no DB service or env is required. + - name: Run DDD tests + run: | + docker run --rm \ + --network host \ + model-test pytest -vv tests/ diff --git a/pytest.ini b/pytest.ini index 5044465b..99cc8e1b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,7 +4,6 @@ log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial testpaths = - tests recommendations/tests backend/tests backend/address2UPRN/tests From 8610a0c87518c3dd7c2625b839218aa4593b9e4c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 15:17:55 +0000 Subject: [PATCH 82/91] actually deploy postcode splitter --- .github/workflows/deploy_terraform.yml | 2 +- deployment/terraform/lambda/postcodeSplitter/main.tf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 923fc0a9..8ba473ca 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -169,7 +169,7 @@ jobs: uses: ./.github/workflows/_build_image.yml with: ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} - dockerfile_path: backend/postcode_splitter/handler/Dockerfile + dockerfile_path: applications/postcode_splitter/Dockerfile build_context: . 
build_args: | DEV_DB_HOST=$DEV_DB_HOST diff --git a/deployment/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf index 325f7dc7..721cb2ea 100644 --- a/deployment/terraform/lambda/postcodeSplitter/main.tf +++ b/deployment/terraform/lambda/postcodeSplitter/main.tf @@ -38,8 +38,8 @@ module "lambda" { { STAGE = var.stage LOG_LEVEL = "info" - DB_USERNAME = local.db_credentials.db_assessment_model_username - DB_PASSWORD = local.db_credentials.db_assessment_model_password + POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username + POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name }, From 78c1d150fa2552ad4386cf113a0ee61523d8aa9a Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 15:25:42 +0000 Subject: [PATCH 83/91] added smoke test --- .github/workflows/lambda_smoke_tests.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/lambda_smoke_tests.yml b/.github/workflows/lambda_smoke_tests.yml index 5ff5420a..b562f91e 100644 --- a/.github/workflows/lambda_smoke_tests.yml +++ b/.github/workflows/lambda_smoke_tests.yml @@ -36,6 +36,13 @@ jobs: build_context: . service_name: postcode-splitter + postcode_splitter_ddd_smoke_test: + uses: ./.github/workflows/_smoke_test_lambda.yml + with: + dockerfile_path: applications/postcode_splitter/Dockerfile + build_context: . 
+ service_name: postcode-splitter-ddd + # ============================================================ # Bulk Address2UPRN Combiner # ============================================================ From 53b211e951c1b2eb71ac0fce20aefeab6cd9ddc5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 15:43:41 +0000 Subject: [PATCH 84/91] epc token added --- .github/workflows/_build_image.yml | 3 +++ .github/workflows/deploy_terraform.yml | 2 ++ backend/address2UPRN/handler/Dockerfile | 2 ++ 3 files changed, 7 insertions(+) diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml index 3435c92d..e7ad9424 100644 --- a/.github/workflows/_build_image.yml +++ b/.github/workflows/_build_image.yml @@ -40,6 +40,8 @@ on: required: false EPC_AUTH_TOKEN: required: false + OPEN_EPC_API_TOKEN: + required: false jobs: build: @@ -50,6 +52,7 @@ jobs: DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} EPC_AUTH_TOKEN: ${{ secrets.EPC_AUTH_TOKEN }} + OPEN_EPC_API_TOKEN: ${{ secrets.OPEN_EPC_API_TOKEN }} outputs: image_digest: ${{ steps.digest.outputs.image_digest }} diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 8ba473ca..7f2eb890 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -133,6 +133,7 @@ jobs: DEV_DB_PORT=$DEV_DB_PORT DEV_DB_NAME=$DEV_DB_NAME EPC_AUTH_TOKEN=$EPC_AUTH_TOKEN + OPEN_EPC_API_TOKEN=$OPEN_EPC_API_TOKEN secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} @@ -141,6 +142,7 @@ jobs: DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} + OPEN_EPC_API_TOKEN: ${{ secrets.DEV_OPEN_EPC_API_TOKEN }} # ============================================================ # Deploy Address 2 UPRN Lambda diff --git a/backend/address2UPRN/handler/Dockerfile 
b/backend/address2UPRN/handler/Dockerfile index 07159357..7d174152 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -6,11 +6,13 @@ ARG DEV_DB_HOST ARG DEV_DB_PORT ARG DEV_DB_NAME ARG EPC_AUTH_TOKEN +ARG OPEN_EPC_API_TOKEN ENV DB_HOST=${DEV_DB_HOST} ENV DB_PORT=${DEV_DB_PORT} ENV DB_NAME=${DEV_DB_NAME} ENV EPC_AUTH_TOKEN=${EPC_AUTH_TOKEN} +ENV OPEN_EPC_API_TOKEN=${OPEN_EPC_API_TOKEN} # Set working directory (Lambda task root) From 4e21dda328dc4a06ab1eb69e5f44857c1a6cf03f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 20 May 2026 16:26:07 +0000 Subject: [PATCH 85/91] rename files in sharepoint to desired structure --- scripts/rename_sharepoint_files.py | 128 ++++++++++++++++++++ utils/sharepoint/domna_sharepoint_client.py | 9 ++ utils/sharepoint/sharepoint_client.py | 11 ++ 3 files changed, 148 insertions(+) create mode 100644 scripts/rename_sharepoint_files.py diff --git a/scripts/rename_sharepoint_files.py b/scripts/rename_sharepoint_files.py new file mode 100644 index 00000000..881b96ef --- /dev/null +++ b/scripts/rename_sharepoint_files.py @@ -0,0 +1,128 @@ +""" +Rename files in SharePoint property folders to the canonical format: + {UPRN}_{Street} {Postcode}_{Document Name}.ext + +Set DRY_RUN = False when ready to commit. Run from repo root. +Required env vars: SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET, + SHAREPOINT_TENANT_ID, SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID +""" + +import csv +import os +from typing import Optional + +from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders +from utils.logger import setup_logger +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient +from utils.sharepoint.domna_sites import DomnaSites + +DRY_RUN: bool = True +CSV_PATH: str = "scripts/sero_address_list.csv" + +BASE_PATH = ( + "Osmosis-ACD Projects/Sero-Clarion Housing/" + "Sero Project Documents/Property Folders" +) +ASSESSMENT_SUBFOLDER = "A. 
Assessment" + +logger = setup_logger() + + +def build_canonical_filename( + uprn: str, address: str, postcode: str, original_name: str +) -> Optional[str]: + """ + Returns the canonical filename, or None if the file is already renamed. + + Already-renamed: name starts with "{uprn}_". + Strips any existing address prefix (address+postcode first, then address alone) + before inserting the canonical prefix. + """ + if original_name.startswith(f"{uprn}_"): + return None + + stem, ext = os.path.splitext(original_name) + stem_lower = stem.lower() + + street = address.split(",")[0].strip() + prefixes = [ + f"{address} {postcode}", + address, + f"{street} {postcode}", + street, + ] + + doc_name = stem + for prefix in prefixes: + if stem_lower.startswith(prefix.lower()): + doc_name = stem[len(prefix) :] + break + + if doc_name.startswith(" - "): + doc_name = doc_name[3:] + elif doc_name.startswith(" _ "): + doc_name = doc_name[3:] + doc_name = doc_name.strip() + + street_post = f"{street} {postcode}" + if doc_name: + return f"{uprn}_{street_post}_{doc_name}{ext}" + return f"{uprn}_{street_post}{ext}" + + +def main() -> None: + sp_client = DomnaSharepointClient(DomnaSites.SOCIAL_HOUSING_WAVE_3) + + with open(CSV_PATH, newline="", encoding="utf-8-sig") as f: + reader = csv.DictReader(f) + required = {"UPRN", "Address", "Postcode"} + if not reader.fieldnames or not required.issubset(set(reader.fieldnames)): + raise ValueError( + f"CSV missing required columns. 
Expected {required}, got {reader.fieldnames}" + ) + + for row in reader: + uprn = row["UPRN"].strip() + address = row["Address"].strip() + postcode = row["Postcode"].strip() + folder_path = ( + f"{BASE_PATH}/{address}, {postcode}" + f"/{SharepointSubfolders.ASSESSMENT.value}/{ASSESSMENT_SUBFOLDER}" + ) + + try: + contents = sp_client.get_folders_in_path(folder_path) + except ValueError: + logger.warning(f"Missing folder for UPRN {uprn}: {folder_path}") + continue + + for item in contents.get("value", []): + if "file" not in item: + continue + + original_name: str = item["name"] + new_name = build_canonical_filename( + uprn, address, postcode, original_name + ) + + if new_name is None: + continue + + if DRY_RUN: + logger.info( + f'[DRY RUN] Renaming: "{original_name}" → "{new_name}" (UPRN: {uprn})' + ) + else: + try: + sp_client.rename_file(item["id"], new_name) + logger.info( + f'Renamed: "{original_name}" → "{new_name}" (UPRN: {uprn})' + ) + except Exception as e: + logger.error( + f'Failed to rename "{original_name}" → "{new_name}" (UPRN: {uprn}): {e}' + ) + + +if __name__ == "__main__": + main() diff --git a/utils/sharepoint/domna_sharepoint_client.py b/utils/sharepoint/domna_sharepoint_client.py index 5e0255ac..3e9168ba 100644 --- a/utils/sharepoint/domna_sharepoint_client.py +++ b/utils/sharepoint/domna_sharepoint_client.py @@ -125,6 +125,15 @@ class DomnaSharepointClient: self.logger.debug(f"Downloaded SharePoint file to: {local_path}") return True + def rename_file(self, item_id: str, new_name: str) -> None: + sharepoint_client = SharePointClient( + tenant_id=self.sharepoint_tenant_id, + client_id=self.sharepoint_client_id, + client_secret=self.sharepoint_client_secret, + site_id=self.sharepoint_drive.value, + ) + sharepoint_client.rename_file(item_id, new_name) + def create_temp_file(self, content: BytesIO, path: str): # Ensure the path is under /tmp/ new_path = os.path.join("/tmp/sharepoint", path) diff --git a/utils/sharepoint/sharepoint_client.py 
b/utils/sharepoint/sharepoint_client.py index 5807c3bd..38107dbf 100644 --- a/utils/sharepoint/sharepoint_client.py +++ b/utils/sharepoint/sharepoint_client.py @@ -335,6 +335,17 @@ class SharePointClient: if retry == "retry": return self.upload_file(file_name, sharepoint_parent_id, file_stream) + @api_call_decorator + def rename_file(self, item_id: str, new_name: str) -> None: + """ + PATCH /drives/{drive_id}/items/{item_id} + + Renames a file in-place. Caller should discard the return value. + """ + url = f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/items/{item_id}" + data: Dict[str, Any] = {"name": new_name} + return "PATCH", url, data # type: ignore[return-value] + @staticmethod def download_sharepoint_file(download_url: str) -> BytesIO: """ From e5583aac1f693fa58ed1d1f5501751d97b38bd01 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 17:36:20 +0000 Subject: [PATCH 86/91] some excel files are formatted differently --- .../postcode_splitter/local_handler/invoke_local_lambda.py | 2 +- infrastructure/csv_s3_client.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/applications/postcode_splitter/local_handler/invoke_local_lambda.py b/applications/postcode_splitter/local_handler/invoke_local_lambda.py index 21fa9b9e..17d7e345 100755 --- a/applications/postcode_splitter/local_handler/invoke_local_lambda.py +++ b/applications/postcode_splitter/local_handler/invoke_local_lambda.py @@ -14,7 +14,7 @@ payload = { { "task_id": "f4b3332f-c0cc-481f-96a5-d39860a647cf", "sub_task_id": "14c042de-40c4-473b-8cd8-72c983a94a8d", - "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv", + "s3_uri": "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2.csv", } ) } diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py index 055d1ce3..8af8de73 100644 --- a/infrastructure/csv_s3_client.py +++ b/infrastructure/csv_s3_client.py @@ -13,7 +13,12 
@@ class CsvS3Client(S3Client): f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}" ) raw = self.get_object(key) - text = raw.decode("utf-8-sig") + try: + text = raw.decode("utf-8-sig") + except UnicodeDecodeError: + # Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8. + text = raw.decode("cp1252") + reader = csv.DictReader(StringIO(text)) return [dict(row) for row in reader] From 714478a99a7a221e26367bb2a762d1a31f938ac0 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 20 May 2026 17:51:45 +0000 Subject: [PATCH 87/91] clean up sanitise postcode --- backend/epc_client/epc_client_service.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/backend/epc_client/epc_client_service.py b/backend/epc_client/epc_client_service.py index 86caeea3..72dbf142 100644 --- a/backend/epc_client/epc_client_service.py +++ b/backend/epc_client/epc_client_service.py @@ -47,8 +47,14 @@ class EpcClientService: latest = max(results, key=lambda r: r.registration_date) return self.get_by_certificate_number(latest.certificate_number) + @staticmethod + def _normalise_postcode(postcode: str) -> str: + """Return the postcode with all spaces removed and uppercased.""" + return postcode.replace(" ", "").upper() + def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: - return call_with_retry(lambda: self._search(postcode=postcode)) + normalised = self._normalise_postcode(postcode) + return call_with_retry(lambda: self._search(postcode=normalised)) # ------------------------------------------------------------------ # Private helpers From c5ab795f851402145bc7ed65e3b17a10cd8cd494 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 09:46:47 +0000 Subject: [PATCH 88/91] redeploy old postcode splitter --- .github/workflows/deploy_terraform.yml | 4 +++- asset_list/app.py | 13 ++++++----- .../terraform/lambda/postcodeSplitter/main.tf | 22 +++++++++++++++++-- 3 files changed, 30 insertions(+), 9
deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 8ba473ca..1af90291 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -169,7 +169,9 @@ jobs: uses: ./.github/workflows/_build_image.yml with: ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} - dockerfile_path: applications/postcode_splitter/Dockerfile + # dockerfile_path: applications/postcode_splitter/Dockerfile + # Switch back to the old postcode_splitter due to hyde priority - interface for the new one isn't working atm + dockerfile_path: backend/postcode_splitter/handler/Dockerfile build_context: . build_args: | DEV_DB_HOST=$DEV_DB_HOST diff --git a/asset_list/app.py b/asset_list/app.py index 9b10d7f3..424f4df6 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -79,23 +79,23 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - data_filename = "lincs_address_list.xlsx" - sheet_name = "Sheet1" + data_filename = "hyde.xlsx" + sheet_name = "AddressProfilingResults" postcode_column = "Postcode" - address1_column = "Deal Name" + address1_column = "Address" address1_method = None - fulladdress_column = "Deal Name" + fulladdress_column = "Postcode" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = None # Good to include if landlord gave + landlord_property_type = "Property Type" # Good to include if landlord gave landlord_built_form = None # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "landlord_id" + landlord_property_id = "Organisation Reference" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -468,3 +468,4 @@ def app(): asset_list.duplicated_addresses.to_excel( writer, sheet_name="Duplicate Properties", index=False ) + 
diff --git a/deployment/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf index 721cb2ea..e04ae00f 100644 --- a/deployment/terraform/lambda/postcodeSplitter/main.tf +++ b/deployment/terraform/lambda/postcodeSplitter/main.tf @@ -38,8 +38,26 @@ module "lambda" { { STAGE = var.stage LOG_LEVEL = "info" - POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username - POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password + # POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username + # POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password + # Switch back to the old postcode_splitter due to hyde priority - interface for the new one isn't working atm + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + # Placeholder values so backend/app/config.py Settings doesn't fall back to "changeme" + GOOGLE_SOLAR_API_KEY = "test" + SAP_PREDICTIONS_BUCKET = "test" + CARBON_PREDICTIONS_BUCKET = "test" + HEAT_PREDICTIONS_BUCKET = "test" + HEATING_KWH_PREDICTIONS_BUCKET = "test" + HOTWATER_KWH_PREDICTIONS_BUCKET = "test" + API_KEY = "test" + ENVIRONMENT = "test" + SECRET_KEY = "test" + PLAN_TRIGGER_BUCKET = "test" + DATA_BUCKET = "test" + EPC_AUTH_TOKEN = "test" + ENGINE_SQS_URL = "test" + ENERGY_ASSESSMENTS_BUCKET = "test" ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name }, From 856ea6eb9358f10e89e6b574a3a4367b0e92a874 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 10:12:08 +0000 Subject: [PATCH 89/91] undo postcodesplitter changes --- .github/workflows/deploy_terraform.yml | 4 +--- .../terraform/lambda/postcodeSplitter/main.tf | 22 ++----------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml 
b/.github/workflows/deploy_terraform.yml index 1af90291..8ba473ca 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -169,9 +169,7 @@ jobs: uses: ./.github/workflows/_build_image.yml with: ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} - # dockerfile_path: applications/postcode_splitter/Dockerfile - # Switch back to the old postcode_splitter due to hyde priority - interface for the new one isn't working atm - dockerfile_path: backend/postcode_splitter/handler/Dockerfile + dockerfile_path: applications/postcode_splitter/Dockerfile build_context: . build_args: | DEV_DB_HOST=$DEV_DB_HOST diff --git a/deployment/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf index e04ae00f..721cb2ea 100644 --- a/deployment/terraform/lambda/postcodeSplitter/main.tf +++ b/deployment/terraform/lambda/postcodeSplitter/main.tf @@ -38,26 +38,8 @@ module "lambda" { { STAGE = var.stage LOG_LEVEL = "info" - # POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username - # POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password - # Switch back to the old postcode_splitter due to hyde priority - interface for the new one isn't working atm - DB_USERNAME = local.db_credentials.db_assessment_model_username - DB_PASSWORD = local.db_credentials.db_assessment_model_password - # Placeholder values so backend/app/config.py Settings doesn't fall back to "changeme" - GOOGLE_SOLAR_API_KEY = "test" - SAP_PREDICTIONS_BUCKET = "test" - CARBON_PREDICTIONS_BUCKET = "test" - HEAT_PREDICTIONS_BUCKET = "test" - HEATING_KWH_PREDICTIONS_BUCKET = "test" - HOTWATER_KWH_PREDICTIONS_BUCKET = "test" - API_KEY = "test" - ENVIRONMENT = "test" - SECRET_KEY = "test" - PLAN_TRIGGER_BUCKET = "test" - DATA_BUCKET = "test" - EPC_AUTH_TOKEN = "test" - ENGINE_SQS_URL = "test" - ENERGY_ASSESSMENTS_BUCKET = "test" + POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username + 
POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name }, From dbd03de842933fa189de077d48e5c13ecf9729f4 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 21 May 2026 10:37:13 +0000 Subject: [PATCH 90/91] local run changes --- .../postcode_splitter/local_handler/invoke_local_lambda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/postcode_splitter/local_handler/invoke_local_lambda.py b/applications/postcode_splitter/local_handler/invoke_local_lambda.py index 17d7e345..5f4b1d36 100755 --- a/applications/postcode_splitter/local_handler/invoke_local_lambda.py +++ b/applications/postcode_splitter/local_handler/invoke_local_lambda.py @@ -12,9 +12,9 @@ payload = { { "body": json.dumps( { - "task_id": "f4b3332f-c0cc-481f-96a5-d39860a647cf", - "sub_task_id": "14c042de-40c4-473b-8cd8-72c983a94a8d", - "s3_uri": "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2.csv", + "task_id": "e295d89b-a7c5-4a9a-8b4e-b405fab1f298", + "sub_task_id": "f4a9944f-41f0-4a33-8669-5016ec574068", + "s3_uri": "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2 (1).csv", } ) } From 9f7c16ccbd35e00d081701d5b46393ba3736278d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 21 May 2026 15:30:03 +0000 Subject: [PATCH 91/91] add address list --- scripts/sero_address_list.csv | 51 +++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 scripts/sero_address_list.csv diff --git a/scripts/sero_address_list.csv b/scripts/sero_address_list.csv new file mode 100644 index 00000000..8c9401c9 --- /dev/null +++ b/scripts/sero_address_list.csv @@ -0,0 +1,51 @@ +UPRN,Address,Postcode +U1035052,"1 Sudbury Crescent, Bromley",BR1 4PY +U1027449,"11 Station Road, Bromley",BR1 3LP +U1021310,"126 Faringdon Avenue, Bromley",BR2 8BU +U1010811,"13 
Gilbert Road, Bromley",BR1 3QP +U1024017,"13 Manor Way, Bromley",BR2 8ES +U1042232,"154 Southover, Bromley",BR1 4RZ +U1009369,"17 Minster Road, Bromley",BR1 4DY +U1022305,"18a Lansdowne Road, Bromley",BR1 3LZ +U1033165,"2 Laburnum Way, Bromley",BR2 8BZ +U1035326,"2 Whitebeam Avenue, Bromley",BR2 8DL +U1037872,"20 Sudbury Crescent, Bromley",BR1 4PZ +U1007432,"21 Detling Road, Bromley",BR1 4SH +U1005123,"24 Bonville Road, Bromley",BR1 4QA +U1034810,"24 Newbury Road, Bromley",BR2 0QW +U1020351,"27 Laburnum Way, Bromley",BR2 8BY +U1009511,"27 Newbury Road, Bromley",BR2 0QN +U1034985,"272 Southborough Lane, Bromley",BR2 8AS +U1037954,"28 Treewall Gardens, Bromley",BR1 5BT +U1038103,"29 Whitebeam Avenue, Bromley",BR2 8DJ +U1013358,"3 Bird In Hand Lane, Bromley",BR1 2NA +U1024709,"3 Parkfield Way, Bromley",BR2 8AE +U1031058,"303 Keedonwood Road, Bromley",BR1 4QR +U1014077,"32 Aylesbury Road, Bromley",BR2 0QP +U1019564,"32 Brook Lane, Bromley",BR1 4PU +U1020237,"33 Hornbeam Way, Bromley",BR2 8DB +U1027493,"35 Sudbury Crescent, Bromley",BR1 4PY +U1042298,"39 Sudbury Crescent, Bromley",BR1 4PY +U1024698,"4 Palace View, Bromley",BR1 3EL +U1052186,"4 Ravensleigh Gardens, Bromley",BR1 5SN +U1042153,"4 Scotts Road, Bromley",BR1 3QD +U1037814,"42 Stanley Road, Bromley",BR2 9JH +U1014078,"43 Aylesbury Road, Bromley",BR2 0QR +U1007701,"46 Harwood Avenue, Bromley",BR1 3DU +U1036758,"46 Newbury Road, Bromley",BR2 0QW +U1025820,"46 Princes Plain, Bromley",BR2 8LE +U1022991,"5 Link Way, Bromley",BR2 8JH +U1024484,"55 Mounthurst Road, Bromley",BR2 7PG +U1014793,"59 Headcorn Road, Bromley",BR1 4SQ +U1037465,"6 Princes Plain, Bromley",BR2 8LE +U1009202,"63 Mead Way, Bromley",BR2 9ER +U1021353,"66 George Lane, Bromley",BR2 7LQ +U1042733,"68 Whitebeam Avenue, Bromley",BR2 8DL +U1030962,"7 Ravensleigh Gardens, Bromley",BR1 5SN +U1031294,"70 London Lane, Bromley",BR1 4HE +U1037450,"70 Pontefract Road, Bromley",BR1 4RB +U1014589,"71 Empress Drive, Chislehurst",BR7 5BQ +U1052429,"76 Southover, 
Bromley",BR1 4RY +U1020199,"78 Hillside Road, Bromley",BR2 0ST +U1024511,"81 Nightingale Lane, Bromley",BR1 2SA +U1009194,"84 Mays Hill Road, Bromley",BR2 0HT