From 265be9849b1eb8b7e5393a830c87624aa87e8f07 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 10:50:28 +0000
Subject: [PATCH 01/91] =?UTF-8?q?Store=20uploaded=5Ffile=5Fid=20on=20magic?=
 =?UTF-8?q?=5Fplan=5Fplan=20row=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../app/db/functions/magic_plan_functions.py  |  8 +++--
 .../tests/test_magic_plan_functions.py        | 34 +++++++++++++++----
 backend/app/db/models/magic_plan.py           |  1 +
 backend/magic_plan/magic_plan_service.py      |  3 +-
 4 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/backend/app/db/functions/magic_plan_functions.py b/backend/app/db/functions/magic_plan_functions.py
index 9400f36f..143e4172 100644
--- a/backend/app/db/functions/magic_plan_functions.py
+++ b/backend/app/db/functions/magic_plan_functions.py
@@ -14,15 +14,15 @@ from backend.app.db.models.magic_plan import (
 )
 
 
-def save_plan(session: Session, plan: Plan) -> None:
-    plan_id: int = _upsert_plan(session, plan)
+def save_plan(session: Session, plan: Plan, uploaded_file_id: int) -> None:
+    plan_id: int = _upsert_plan(session, plan, uploaded_file_id)
     _delete_children(session, plan_id)
     floor_ids: list[int] = _insert_floors(session, plan.floors, plan_id)
     room_ids: list[int] = _insert_rooms(session, plan.floors, floor_ids)
     _insert_windows_and_doors(session, plan.floors, room_ids)
 
 
-def _upsert_plan(session: Session, plan: Plan) -> int:
+def _upsert_plan(session: Session, plan: Plan, uploaded_file_id: int) -> int:
     stmt = (
         pg_insert(MagicPlanPlanModel)
         .values(
@@ -30,6 +30,7 @@ def _upsert_plan(session: Session, plan: Plan) -> int:
             name=plan.name,
             address=plan.address,
             postcode=plan.postcode,
+            uploaded_file_id=uploaded_file_id,
         )
         .on_conflict_do_update(
             index_elements=["magic_plan_uid"],
@@ -37,6 +38,7 @@ def _upsert_plan(session: Session, plan: Plan) -> int:
                 "name": plan.name,
                 "address": plan.address,
                 "postcode": plan.postcode,
+                "uploaded_file_id": uploaded_file_id,
             },
         )
         .returning(col(MagicPlanPlanModel.id))
diff --git a/backend/app/db/functions/tests/test_magic_plan_functions.py b/backend/app/db/functions/tests/test_magic_plan_functions.py
index e58d0528..0b93685c 100644
--- a/backend/app/db/functions/tests/test_magic_plan_functions.py
+++ b/backend/app/db/functions/tests/test_magic_plan_functions.py
@@ -36,7 +36,7 @@ def _count(session: Session, model: type[SQLModel]) -> int:
 
 def test_plan_row_present_after_save(db_session: Session, domain_plan: Plan) -> None:
     # Act
-    save_plan(db_session, domain_plan)
+    save_plan(db_session, domain_plan, 1)
     # Assert
     assert _count(db_session, MagicPlanPlanModel) == 1
 
@@ -45,7 +45,7 @@ def test_floor_count_matches_domain(db_session: Session, domain_plan: Plan) -> N
     # Arrange
     expected = len(domain_plan.floors)
     # Act
-    save_plan(db_session, domain_plan)
+    save_plan(db_session, domain_plan, 1)
     # Assert
     assert _count(db_session, MagicPlanFloorModel) == expected
 
@@ -54,7 +54,7 @@ def test_room_count_matches_domain(db_session: Session, domain_plan: Plan) -> No
     # Arrange
     expected = sum(len(f.rooms) for f in domain_plan.floors)
     # Act
-    save_plan(db_session, domain_plan)
+    save_plan(db_session, domain_plan, 1)
     # Assert
     assert _count(db_session, MagicPlanRoomModel) == expected
 
@@ -63,7 +63,7 @@ def test_window_count_matches_domain(db_session: Session, domain_plan: Plan) ->
     # Arrange
     expected = sum(len(r.windows) for f in domain_plan.floors for r in f.rooms)
     # Act
-    save_plan(db_session, domain_plan)
+    save_plan(db_session, domain_plan, 1)
     # Assert
     assert _count(db_session, MagicPlanWindowModel) == expected
 
@@ -72,15 +72,15 @@ def test_door_count_matches_domain(db_session: Session, domain_plan: Plan) -> No
     # Arrange
     expected = sum(len(r.doors) for f in domain_plan.floors for r in f.rooms)
     # Act
-    save_plan(db_session, domain_plan)
+    save_plan(db_session, domain_plan, 1)
     # Assert
     assert _count(db_session, MagicPlanDoorModel) == expected
 
 
 def test_save_plan_idempotent(db_session: Session, domain_plan: Plan) -> None:
     # Act — call twice within the same session
-    save_plan(db_session, domain_plan)
-    save_plan(db_session, domain_plan)
+    save_plan(db_session, domain_plan, 1)
+    save_plan(db_session, domain_plan, 1)
     # Assert — same row counts as a single call
     assert _count(db_session, MagicPlanPlanModel) == 1
     assert _count(db_session, MagicPlanFloorModel) == len(domain_plan.floors)
@@ -93,3 +93,23 @@ def test_save_plan_idempotent(db_session: Session, domain_plan: Plan) -> None:
     assert _count(db_session, MagicPlanDoorModel) == sum(
         len(r.doors) for f in domain_plan.floors for r in f.rooms
     )
+
+
+def test_uploaded_file_id_stored_after_save(db_session: Session, domain_plan: Plan) -> None:
+    # Act
+    save_plan(db_session, domain_plan, 1)
+    # Assert
+    row = db_session.execute(select(MagicPlanPlanModel)).scalar_one()
+    assert row.uploaded_file_id == 1
+
+
+def test_save_plan_updates_uploaded_file_id_on_reingest(
+    db_session: Session, domain_plan: Plan
+) -> None:
+    # Arrange
+    save_plan(db_session, domain_plan, 1)
+    # Act
+    save_plan(db_session, domain_plan, 2)
+    # Assert
+    row = db_session.execute(select(MagicPlanPlanModel)).scalar_one()
+    assert row.uploaded_file_id == 2
diff --git a/backend/app/db/models/magic_plan.py b/backend/app/db/models/magic_plan.py
index 38e9de18..77ca52fd 100644
--- a/backend/app/db/models/magic_plan.py
+++ b/backend/app/db/models/magic_plan.py
@@ -11,6 +11,7 @@ class MagicPlanPlanModel(SQLModel, table=True):
     name: Optional[str] = None
     address: Optional[str] = None
     postcode: Optional[str] = None
+    uploaded_file_id: Optional[int] = Field(default=None)
 
 
 class MagicPlanFloorModel(SQLModel, table=True):
diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py
index 22e19ddf..2be3379d 100644
--- a/backend/magic_plan/magic_plan_service.py
+++ b/backend/magic_plan/magic_plan_service.py
@@ -55,8 +55,9 @@ class MagicPlanService:
         )
 
         with db_session() as session:
-            save_plan(session, plan)
             session.add(uploaded_file)
+            session.flush()
+            save_plan(session, plan, uploaded_file.id)
 
         return plan
 

From 509fbf2abfa3849a44782f5a9cf2f8d033157823 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 11:02:46 +0000
Subject: [PATCH 02/91] =?UTF-8?q?Store=20uploaded=5Ffile=5Fid=20on=20magic?=
 =?UTF-8?q?=5Fplan=5Fplan=20row=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/magic_plan/magic_plan_service.py      |  4 +--
 .../tests/test_magic_plan_service.py          | 35 +++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py
index 2be3379d..8a75c716 100644
--- a/backend/magic_plan/magic_plan_service.py
+++ b/backend/magic_plan/magic_plan_service.py
@@ -1,7 +1,7 @@
 import gzip
 import json
 from datetime import datetime, timezone
-from typing import Optional
+from typing import Optional, cast
 
 from datatypes.magicplan.api.response import MagicPlanPlan, PlanSummary
 from datatypes.magicplan.domain.mapper import map_plan
@@ -57,7 +57,7 @@ class MagicPlanService:
         with db_session() as session:
             session.add(uploaded_file)
             session.flush()
-            save_plan(session, plan, uploaded_file.id)
+            save_plan(session, plan, cast(int, uploaded_file.id))
 
         return plan
 
diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py
index 158cf4d6..a2302ab4 100644
--- a/backend/magic_plan/tests/test_magic_plan_service.py
+++ b/backend/magic_plan/tests/test_magic_plan_service.py
@@ -271,3 +271,38 @@ def test_run_creates_uploaded_file_record(
     assert uploaded_file.s3_upload_timestamp is not None
     assert uploaded_file.uprn == 100023336956
     assert uploaded_file.hubspot_deal_id == "deal-789"
+
+
+def test_run_passes_flushed_uploaded_file_id_to_save_plan(
+    mock_client: MagicMock,
+    plan_summary: PlanSummary,
+) -> None:
+    # Arrange
+    mock_client.get_plans.return_value = [plan_summary]
+    service = _make_service(mock_client)
+    mock_session = MagicMock()
+    added_objects: list = []
+
+    mock_session.add.side_effect = added_objects.append
+
+    def simulate_flush() -> None:
+        for obj in added_objects:
+            if isinstance(obj, UploadedFile):
+                obj.id = 42
+
+    mock_session.flush.side_effect = simulate_flush
+
+    with patch(
+        "backend.magic_plan.magic_plan_service.find_matching_plan",
+        return_value=plan_summary,
+    ), patch("backend.magic_plan.magic_plan_service.save_plan") as mock_save, patch(
+        "backend.magic_plan.magic_plan_service.db_session"
+    ) as mock_db, patch(
+        "backend.magic_plan.magic_plan_service.save_data_to_s3"
+    ):
+        mock_db.return_value.__enter__.return_value = mock_session
+        # Act
+        service.run(_make_request())
+
+    # Assert
+    assert mock_save.call_args[0][2] == 42

From e3159665653557584edbe681a371c42b4a044a2f Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 12:29:25 +0000
Subject: [PATCH 03/91] add coordination and design document types to enums

---
 backend/app/db/models/uploaded_file.py | 3 +++
 backend/pashub_fetcher/core_files.py   | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py
index c629f574..f3cfee79 100644
--- a/backend/app/db/models/uploaded_file.py
+++ b/backend/app/db/models/uploaded_file.py
@@ -18,6 +18,9 @@ class FileTypeEnum(enum.Enum):
     ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note"
     ECMK_SURVEY_XML = "ecmk_survey_xml"
     MAGIC_PLAN_JSON = "magic_plan_json"
+    IMPROVEMENT_OPTION_EVALUATION = "improvement_option_evaluation"
+    MEDIUM_TERM_IMPROVEMENT_PLAN = "medium_term_improvement_plan"
+    RETROFIT_DESIGN_DOC = "retrofit_design_doc"
 
 
 class FileSourceEnum(enum.Enum):
diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 4da10661..aa426475 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -14,6 +14,9 @@ class CoreFiles(Enum):
     PAR_PHOTOPACK = "PAR Photo Pack"
     PAS2023_PROPERTY = "PAS 2023 Property Assessment Report"
     PAS2023_OCCUPANCY = "PAS 2023 Occupancy Assessment Report"
+    IMPROVEMENT_OPTION_EVALUATION = "Improvement Option Evaluation"
+    MEDIUM_TERM_IMPROVEMENT_PLAN = "Medium Term Improvement Plan"
+    RETROFIT_DESIGN_DOC = "Retrofit Design Doc"
 
 
 CORE_TO_FILETYPE_MAP = {
@@ -26,6 +29,9 @@ CORE_TO_FILETYPE_MAP = {
     CoreFiles.PAR_PHOTOPACK: FileTypeEnum.PAR_PHOTO_PACK.value,
     CoreFiles.PAS2023_PROPERTY: FileTypeEnum.PAS_2023_PROPERTY.value,
     CoreFiles.PAS2023_OCCUPANCY: FileTypeEnum.PAS_2023_OCCUPANCY.value,
+    CoreFiles.IMPROVEMENT_OPTION_EVALUATION: FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value,
+    CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN: FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value,
+    CoreFiles.RETROFIT_DESIGN_DOC: FileTypeEnum.RETROFIT_DESIGN_DOC.value,
 }
 
 

From e3646162de686884b17da231a26eeeaa3c4cdc41 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 13:09:40 +0000
Subject: [PATCH 04/91] =?UTF-8?q?new=20file=20types=20inferred=20from=20f?=
 =?UTF-8?q?ile=20names=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../pashub_fetcher/tests/test_core_files.py   | 64 +++++++++++++++++++
 pytest.ini                                    | 23 ++++++-
 2 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 backend/pashub_fetcher/tests/test_core_files.py

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
new file mode 100644
index 00000000..fca29b7e
--- /dev/null
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -0,0 +1,64 @@
+import pytest
+
+from backend.pashub_fetcher.core_files import infer_file_type
+
+
+# --- GREEN: pre-existing file types (startswith match) ---
+
+
+def test_infer_photopack():
+    assert infer_file_type("Photopack_123456_V1.pdf") == "photo_pack"
+
+
+def test_infer_sitenote():
+    assert infer_file_type("SiteNote_123456_V1.pdf") == "site_note"
+
+
+def test_infer_rdsap_sitenote():
+    assert infer_file_type("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note"
+
+
+def test_infer_pas2023_ventilation():
+    assert infer_file_type("PAS 2023 Ventilation Assessment Report_123456.pdf") == "pas_2023_ventilation"
+
+
+def test_infer_pas2023_condition():
+    assert infer_file_type("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition"
+
+
+def test_infer_pas_significance():
+    assert infer_file_type("PAS Significance_123456.pdf") == "pas_significance"
+
+
+def test_infer_par_photopack():
+    assert infer_file_type("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack"
+
+
+def test_infer_pas2023_property():
+    assert infer_file_type("PAS 2023 Property Assessment Report_123456.pdf") == "pas_2023_property"
+
+
+def test_infer_pas2023_occupancy():
+    assert infer_file_type("PAS 2023 Occupancy Assessment Report_123456.pdf") == "pas_2023_occupancy"
+
+
+def test_infer_unknown_returns_none():
+    assert infer_file_type("unknown_document_123.pdf") is None
+
+
+# --- RED: new file types (suffix match not yet implemented) ---
+
+
+def test_infer_improvement_option_evaluation():
+    # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf"
+    assert infer_file_type("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") == "improvement_option_evaluation"
+
+
+def test_infer_medium_term_improvement_plan():
+    # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf"
+    assert infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") == "medium_term_improvement_plan"
+
+
+@pytest.mark.skip(reason="Retrofit Design Doc filename pattern not yet known")
+def test_infer_retrofit_design_doc():
+    assert infer_file_type("2512-OSM-H56M900-XX-DR-N-A_Radford Road 408.pdf") == "retrofit_design_doc"
diff --git a/pytest.ini b/pytest.ini
index e2a4a25d..99cc8e1b 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -3,6 +3,27 @@ pythonpath = .
 log_cli = true
 log_cli_level = INFO
 addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
-testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/pashub_fetcher/tests backend/documents_parser/tests backend/magic_plan/tests datatypes/magicplan/api/tests datatypes/magicplan/domain/tests backend/app/db/functions/tests
+testpaths =
+    recommendations/tests
+    backend/tests
+    backend/address2UPRN/tests
+    backend/app/db/functions/tests
+    backend/categorisation/tests
+    backend/condition/tests
+    backend/documents_parser/tests
+    backend/ecmk_fetcher/tests
+    backend/export/tests
+    backend/magic_plan/tests
+    backend/onboarders/tests
+    backend/pashub_fetcher/tests
+    datatypes/epc/domain/tests
+    datatypes/epc/schema/tests
+    datatypes/epc/surveys/tests
+    datatypes/magicplan/api/tests
+    datatypes/magicplan/domain/tests
+    etl/epc/tests
+    etl/epc_clean/tests
+    etl/hubspot/tests
+    etl/spatial/tests
 markers =
     integration: mark a test as an integration test

From b3a68a264a08af77fc047f97f9adb7453b77f037 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 13:32:54 +0000
Subject: [PATCH 05/91] =?UTF-8?q?new=20file=20types=20inferred=20from=20f?=
 =?UTF-8?q?ile=20names=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py            | 10 ++++++++++
 backend/pashub_fetcher/tests/test_core_files.py |  6 ++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index aa426475..b5ce1073 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -39,4 +39,14 @@ def infer_file_type(filename: str) -> Optional[str]:
     for core_file, file_type in CORE_TO_FILETYPE_MAP.items():
         if filename.startswith(core_file.value):
             return file_type
+
+    if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename:
+        return CORE_TO_FILETYPE_MAP[CoreFiles.IMPROVEMENT_OPTION_EVALUATION]
+
+    if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename:
+        return CORE_TO_FILETYPE_MAP[CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN]
+
+    if "-OSM-" in filename and "DR-N-A" in filename:
+        return CORE_TO_FILETYPE_MAP[CoreFiles.RETROFIT_DESIGN_DOC]
+
     return None
diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index fca29b7e..f8e8b431 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -1,5 +1,3 @@
-import pytest
-
 from backend.pashub_fetcher.core_files import infer_file_type
 
 
@@ -59,6 +57,6 @@ def test_infer_medium_term_improvement_plan():
     assert infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") == "medium_term_improvement_plan"
 
 
-@pytest.mark.skip(reason="Retrofit Design Doc filename pattern not yet known")
 def test_infer_retrofit_design_doc():
-    assert infer_file_type("2512-OSM-H56M900-XX-DR-N-A_Radford Road 408.pdf") == "retrofit_design_doc"
+    assert infer_file_type("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") == "retrofit_design_doc"
+    assert infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") == "retrofit_design_doc"

From 39c5fd57693e6ceb5af2ce0bac7d1e53e7aca7e1 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 13:41:41 +0000
Subject: [PATCH 06/91] =?UTF-8?q?new=20file=20types=20inferred=20from=20f?=
 =?UTF-8?q?ile=20names=20=F0=9F=9F=AA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py | 71 +++++++++++++++++-----------
 1 file changed, 44 insertions(+), 27 deletions(-)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index b5ce1073..3e69bf9a 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Optional
+from typing import Callable, Optional
 
 from backend.app.db.models.uploaded_file import FileTypeEnum
 
@@ -19,34 +19,51 @@ class CoreFiles(Enum):
     RETROFIT_DESIGN_DOC = "Retrofit Design Doc"
 
 
-CORE_TO_FILETYPE_MAP = {
-    CoreFiles.PHOTOPACK: FileTypeEnum.PHOTO_PACK.value,
-    CoreFiles.SITENOTE: FileTypeEnum.SITE_NOTE.value,
-    CoreFiles.RDSAP_SITENOTE: FileTypeEnum.RD_SAP_SITE_NOTE.value,
-    CoreFiles.PAS2023_VENTILATION: FileTypeEnum.PAS_2023_VENTILATION.value,
-    CoreFiles.PAS2023_CONDITION: FileTypeEnum.PAS_2023_CONDITION.value,
-    CoreFiles.PAS_SIGNIFICANCE: FileTypeEnum.PAS_SIGNIFICANCE.value,
-    CoreFiles.PAR_PHOTOPACK: FileTypeEnum.PAR_PHOTO_PACK.value,
-    CoreFiles.PAS2023_PROPERTY: FileTypeEnum.PAS_2023_PROPERTY.value,
-    CoreFiles.PAS2023_OCCUPANCY: FileTypeEnum.PAS_2023_OCCUPANCY.value,
-    CoreFiles.IMPROVEMENT_OPTION_EVALUATION: FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value,
-    CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN: FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value,
-    CoreFiles.RETROFIT_DESIGN_DOC: FileTypeEnum.RETROFIT_DESIGN_DOC.value,
-}
+_MATCHERS: list[tuple[Callable[[str], bool], str]] = [
+    (lambda f: f.startswith(CoreFiles.PHOTOPACK.value), FileTypeEnum.PHOTO_PACK.value),
+    (lambda f: f.startswith(CoreFiles.SITENOTE.value), FileTypeEnum.SITE_NOTE.value),
+    (
+        lambda f: f.startswith(CoreFiles.RDSAP_SITENOTE.value),
+        FileTypeEnum.RD_SAP_SITE_NOTE.value,
+    ),
+    (
+        lambda f: f.startswith(CoreFiles.PAS2023_VENTILATION.value),
+        FileTypeEnum.PAS_2023_VENTILATION.value,
+    ),
+    (
+        lambda f: f.startswith(CoreFiles.PAS2023_CONDITION.value),
+        FileTypeEnum.PAS_2023_CONDITION.value,
+    ),
+    (
+        lambda f: f.startswith(CoreFiles.PAS_SIGNIFICANCE.value),
+        FileTypeEnum.PAS_SIGNIFICANCE.value,
+    ),
+    (
+        lambda f: f.startswith(CoreFiles.PAR_PHOTOPACK.value),
+        FileTypeEnum.PAR_PHOTO_PACK.value,
+    ),
+    (
+        lambda f: f.startswith(CoreFiles.PAS2023_PROPERTY.value),
+        FileTypeEnum.PAS_2023_PROPERTY.value,
+    ),
+    (
+        lambda f: f.startswith(CoreFiles.PAS2023_OCCUPANCY.value),
+        FileTypeEnum.PAS_2023_OCCUPANCY.value,
+    ),
+    (
+        lambda f: CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in f,
+        FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value,
+    ),
+    (
+        lambda f: CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in f,
+        FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value,
+    ),
+    (lambda f: "-OSM-" in f and "DR-N-A" in f, FileTypeEnum.RETROFIT_DESIGN_DOC.value),
+]
 
 
 def infer_file_type(filename: str) -> Optional[str]:
-    for core_file, file_type in CORE_TO_FILETYPE_MAP.items():
-        if filename.startswith(core_file.value):
+    for matcher, file_type in _MATCHERS:
+        if matcher(filename):
             return file_type
-
-    if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename:
-        return CORE_TO_FILETYPE_MAP[CoreFiles.IMPROVEMENT_OPTION_EVALUATION]
-
-    if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename:
-        return CORE_TO_FILETYPE_MAP[CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN]
-
-    if "-OSM-" in filename and "DR-N-A" in filename:
-        return CORE_TO_FILETYPE_MAP[CoreFiles.RETROFIT_DESIGN_DOC]
-
     return None

From 7635c800e6b88d65ae3ef7ddbbf7d199aaa7e64b Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 13 May 2026 16:04:53 +0000
Subject: [PATCH 07/91] Bump agentic-toolkit to 0.0.7 in backend devcontainer

---
 .devcontainer/backend/devcontainer.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json
index 24949770..0a78dadf 100644
--- a/.devcontainer/backend/devcontainer.json
+++ b/.devcontainer/backend/devcontainer.json
@@ -5,7 +5,7 @@
   "remoteUser": "vscode",
   "workspaceFolder": "/workspaces/model",
   "initializeCommand": "docker network create shared-dev 2>/dev/null || true; test -d \"$HOME/.config/gh\" || test -n \"$GITHUB_TOKEN\" || { echo >&2 'error: no GitHub auth found. Run `gh auth login && gh auth setup-git` on the host, or export GITHUB_TOKEN, then retry.'; exit 1; }",
-  "postCreateCommand": "gh repo clone Hestia-Homes/agentic-toolkit /tmp/agentic-toolkit -- --branch 0.0.5 --depth 1 && bash /tmp/agentic-toolkit/setup.sh",
+  "postCreateCommand": "gh repo clone Hestia-Homes/agentic-toolkit /tmp/agentic-toolkit -- --branch 0.0.7 --depth 1 && bash /tmp/agentic-toolkit/setup.sh",
   "postStartCommand": "bash .devcontainer/backend/post-install.sh",
   "mounts": [
     "source=${localEnv:HOME},target=/workspaces/home,type=bind",

From df0f089d4f65d1107d69195820706205380d7e66 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:05:20 +0000
Subject: [PATCH 08/91] =?UTF-8?q?Retrofit=20design=20doc=20selected=20by?=
 =?UTF-8?q?=20evidence=5Fcategory=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_client.py               | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 backend/pashub_fetcher/tests/test_pashub_client.py

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
new file mode 100644
index 00000000..4f5aef98
--- /dev/null
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -0,0 +1,44 @@
+from typing import Optional
+
+from backend.pashub_fetcher.core_files import CoreFiles
+from backend.pashub_fetcher.evidence_file_data import EvidenceFileData
+from backend.pashub_fetcher.pashub_client import PashubClient
+
+
+def make_client() -> PashubClient:
+    return PashubClient(token="test-token")
+
+
+def make_file(
+    file_name: str = "unknown.pdf",
+    evidence_category: Optional[str] = None,
+    created_utc: str = "2024-01-01T00:00:00",
+) -> EvidenceFileData:
+    return EvidenceFileData(
+        file_id="id-1",
+        file_name=file_name,
+        created_utc=created_utc,
+        file_size=1024,
+        file_extension="pdf",
+        evidence_category=evidence_category,
+    )
+
+
+# ---------------------------------------------------------------------------
+# _get_core_file_type
+# ---------------------------------------------------------------------------
+
+
+def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category() -> None:
+    # Arrange
+    client = make_client()
+    file = make_file(
+        file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
+        evidence_category="retrofit design",
+    )
+
+    # Act
+    result = client._get_core_file_type(file)
+
+    # Assert
+    assert result == CoreFiles.RETROFIT_DESIGN_DOC

From f2bbb44207cc9971e8a04436dd8591d16846c2ef Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:10:56 +0000
Subject: [PATCH 09/91] =?UTF-8?q?Retrofit=20design=20doc=20selected=20by?=
 =?UTF-8?q?=20evidence=5Fcategory=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_client.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index 20b8590d..11195960 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -87,6 +87,9 @@ class PashubClient:
             return None
 
     def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]:
+        if file.evidence_category == "retrofit design":
+            return CoreFiles.RETROFIT_DESIGN_DOC
+
         for core_file in CoreFiles:
             if file.file_name.startswith(core_file.value):
                 return core_file

From 157a36f0cd5801799d2df54cd836b12894b56284 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:14:07 +0000
Subject: [PATCH 10/91] =?UTF-8?q?Evidence=20category=20matching=20is=20cas?=
 =?UTF-8?q?e-insensitive=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../pashub_fetcher/tests/test_pashub_client.py    | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 4f5aef98..ccf32fa6 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -42,3 +42,18 @@ def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category()
 
     # Assert
     assert result == CoreFiles.RETROFIT_DESIGN_DOC
+
+
+def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
+    # Arrange
+    client = make_client()
+    file = make_file(
+        file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
+        evidence_category="Retrofit Design",
+    )
+
+    # Act
+    result = client._get_core_file_type(file)
+
+    # Assert
+    assert result == CoreFiles.RETROFIT_DESIGN_DOC

From 6922ff3e06be9dd1f12b4914aeaa960e25ee08d9 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:16:14 +0000
Subject: [PATCH 11/91] =?UTF-8?q?Evidence=20category=20matching=20is=20cas?=
 =?UTF-8?q?e-insensitive=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index 11195960..d7200a1f 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -87,7 +87,7 @@ class PashubClient:
             return None
 
     def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]:
-        if file.evidence_category == "retrofit design":
+        if file.evidence_category is not None and file.evidence_category.lower() == "retrofit design":
             return CoreFiles.RETROFIT_DESIGN_DOC
 
         for core_file in CoreFiles:

From 5c652d94852476d469436064dbc940ae7c62f46a Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:24:14 +0000
Subject: [PATCH 12/91] =?UTF-8?q?Improvement=20Option=20Evaluation=20selec?=
 =?UTF-8?q?ted=20via=20substring=20match=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_pashub_client.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index ccf32fa6..8654a137 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -44,6 +44,18 @@ def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category()
     assert result == CoreFiles.RETROFIT_DESIGN_DOC
 
 
+def test_get_core_file_type_returns_improvement_option_evaluation_via_substring() -> None:
+    # Arrange
+    client = make_client()
+    file = make_file(file_name="6000802 - NG4 4HD - Improvement Option Evaluation.pdf")
+
+    # Act
+    result = client._get_core_file_type(file)
+
+    # Assert
+    assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION
+
+
 def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
     # Arrange
     client = make_client()

From a1f6ffd6b39f9b1b077cf98cf2346d2414c1c0c0 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:24:34 +0000
Subject: [PATCH 13/91] =?UTF-8?q?Improvement=20Option=20Evaluation=20selec?=
 =?UTF-8?q?ted=20via=20substring=20match=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_client.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index d7200a1f..ba0f0221 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -90,6 +90,9 @@ class PashubClient:
         if file.evidence_category is not None and file.evidence_category.lower() == "retrofit design":
             return CoreFiles.RETROFIT_DESIGN_DOC
 
+        if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in file.file_name:
+            return CoreFiles.IMPROVEMENT_OPTION_EVALUATION
+
         for core_file in CoreFiles:
             if file.file_name.startswith(core_file.value):
                 return core_file

From d99d8a33479470156c671dd440e0e5e61269380f Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:25:02 +0000
Subject: [PATCH 14/91] =?UTF-8?q?Medium=20Term=20Improvement=20Plan=20sele?=
 =?UTF-8?q?cted=20via=20substring=20match=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_pashub_client.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 8654a137..9b99cf5c 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -56,6 +56,18 @@ def test_get_core_file_type_returns_improvement_option_evaluation_via_substring(
     assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION
 
 
+def test_get_core_file_type_returns_medium_term_improvement_plan_via_substring() -> None:
+    # Arrange
+    client = make_client()
+    file = make_file(file_name="60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf")
+
+    # Act
+    result = client._get_core_file_type(file)
+
+    # Assert
+    assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
+
+
 def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
     # Arrange
     client = make_client()

From 084c8218a6c5acbc532d8d41ced6cd2eb364e402 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:25:57 +0000
Subject: [PATCH 15/91] =?UTF-8?q?Medium=20Term=20Improvement=20Plan=20sele?=
 =?UTF-8?q?cted=20via=20substring=20match=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_client.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index ba0f0221..556884fe 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -93,6 +93,9 @@ class PashubClient:
         if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in file.file_name:
             return CoreFiles.IMPROVEMENT_OPTION_EVALUATION
 
+        if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in file.file_name:
+            return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
+
         for core_file in CoreFiles:
             if file.file_name.startswith(core_file.value):
                 return core_file

From a8e876d83d1e5b0bf7f204a8401e76b7fafe3170 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:26:34 +0000
Subject: [PATCH 16/91] =?UTF-8?q?Prefix=20and=20unknown=20file=20matching?=
 =?UTF-8?q?=20behaviour=20documented=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_client.py               | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 9b99cf5c..036e50bc 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -68,6 +68,30 @@ def test_get_core_file_type_returns_medium_term_improvement_plan_via_substring()
     assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
 
 
+def test_get_core_file_type_returns_photopack_via_prefix() -> None:
+    # Arrange
+    client = make_client()
+    file = make_file(file_name="Photopack_123456_V1.pdf")
+
+    # Act
+    result = client._get_core_file_type(file)
+
+    # Assert
+    assert result == CoreFiles.PHOTOPACK
+
+
+def test_get_core_file_type_returns_none_for_unknown_file() -> None:
+    # Arrange
+    client = make_client()
+    file = make_file(file_name="unknown_document_123.pdf")
+
+    # Act
+    result = client._get_core_file_type(file)
+
+    # Assert
+    assert result is None
+
+
 def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
     # Arrange
     client = make_client()

From 506dc92aa3ccc9ef4b3b7f6ab0351e0c76dd7ec8 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:27:42 +0000
Subject: [PATCH 17/91] =?UTF-8?q?=5Fselect=5Flatest=5Fcore=5Ffiles=20retur?=
 =?UTF-8?q?ns=20single=20retrofit=20design=20doc=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_client.py               | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 036e50bc..334f2de0 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -1,3 +1,4 @@
+# pyright: reportPrivateUsage=false
 from typing import Optional
 
 from backend.pashub_fetcher.core_files import CoreFiles
@@ -92,6 +93,29 @@ def test_get_core_file_type_returns_none_for_unknown_file() -> None:
     assert result is None
 
 
+# ---------------------------------------------------------------------------
+# _select_latest_core_files
+# ---------------------------------------------------------------------------
+
+
+def test_select_latest_core_files_returns_single_retrofit_design_doc() -> None:
+    # Arrange
+    client = make_client()
+    files = [
+        make_file(
+            file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
+            evidence_category="retrofit design",
+            created_utc="2024-06-01T00:00:00",
+        )
+    ]
+
+    # Act
+    result = client._select_latest_core_files(files)
+
+    # Assert
+    assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
+
+
 def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
     # Arrange
     client = make_client()

From b685008e5ee1816588dedc096866a63764fc9c2a Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:28:19 +0000
Subject: [PATCH 18/91] =?UTF-8?q?OSM=20candidate=20wins=20over=20non-OSM?=
 =?UTF-8?q?=20retrofit=20design=20doc=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_client.py               | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 334f2de0..646ff3bc 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -116,6 +116,29 @@ def test_select_latest_core_files_returns_single_retrofit_design_doc() -> None:
     assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
 
 
+def test_select_latest_core_files_osm_candidate_wins_over_non_osm() -> None:
+    # Arrange - the non-OSM file is newer but should lose to the OSM file
+    client = make_client()
+    files = [
+        make_file(
+            file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
+            evidence_category="retrofit design",
+            created_utc="2024-01-01T00:00:00",
+        ),
+        make_file(
+            file_name="Retrofit Design Doc non-osm variant.pdf",
+            evidence_category="retrofit design",
+            created_utc="2024-06-01T00:00:00",
+        ),
+    ]
+
+    # Act
+    result = client._select_latest_core_files(files)
+
+    # Assert
+    assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
+
+
 def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
     # Arrange
     client = make_client()

From aff79d4151da0b8b0958a34b8090abac7a27260b Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:28:50 +0000
Subject: [PATCH 19/91] =?UTF-8?q?OSM=20candidate=20wins=20over=20non-OSM?=
 =?UTF-8?q?=20retrofit=20design=20doc=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_client.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index 556884fe..4435c278 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -116,6 +116,9 @@ class PashubClient:
         latest_files: Dict[CoreFiles, EvidenceFileData] = {}
 
         for core_type, group in grouped.items():
+            if core_type == CoreFiles.RETROFIT_DESIGN_DOC and len(group) > 1:
+                osm_candidates = [f for f in group if "-OSM-" in f.file_name]
+                group = osm_candidates if osm_candidates else group
             latest = max(group, key=lambda f: datetime.fromisoformat(f.created_utc))
             latest_files[core_type] = latest
 

From 3fe85a635ca94aca2af08ee71b7ee59e9495b106 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:29:24 +0000
Subject: [PATCH 20/91] =?UTF-8?q?Latest=20wins=20when=20both=20retrofit=20?=
 =?UTF-8?q?design=20doc=20candidates=20have=20OSM=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_client.py               | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 646ff3bc..7f0663db 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -139,6 +139,29 @@ def test_select_latest_core_files_osm_candidate_wins_over_non_osm() -> None:
     assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
 
 
+def test_select_latest_core_files_picks_latest_when_both_candidates_have_osm() -> None:
+    # Arrange
+    client = make_client()
+    files = [
+        make_file(
+            file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
+            evidence_category="retrofit design",
+            created_utc="2024-01-01T00:00:00",
+        ),
+        make_file(
+            file_name="2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf",
+            evidence_category="retrofit design",
+            created_utc="2024-06-01T00:00:00",
+        ),
+    ]
+
+    # Act
+    result = client._select_latest_core_files(files)
+
+    # Assert
+    assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf"
+
+
 def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
     # Arrange
     client = make_client()

From 9a04d89cae07671fbe182334df59e079a22f5e78 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:29:54 +0000
Subject: [PATCH 21/91] =?UTF-8?q?Latest=20wins=20as=20fallback=20when=20no?=
 =?UTF-8?q?=20OSM=20retrofit=20design=20doc=20candidates=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_client.py               | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 7f0663db..9ee8948a 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -162,6 +162,29 @@ def test_select_latest_core_files_picks_latest_when_both_candidates_have_osm() -
     assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf"
 
 
+def test_select_latest_core_files_falls_back_to_latest_when_no_osm_candidates() -> None:
+    # Arrange
+    client = make_client()
+    files = [
+        make_file(
+            file_name="retrofit_design_v1.pdf",
+            evidence_category="retrofit design",
+            created_utc="2024-01-01T00:00:00",
+        ),
+        make_file(
+            file_name="retrofit_design_v2.pdf",
+            evidence_category="retrofit design",
+            created_utc="2024-06-01T00:00:00",
+        ),
+    ]
+
+    # Act
+    result = client._select_latest_core_files(files)
+
+    # Assert
+    assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "retrofit_design_v2.pdf"
+
+
 def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
     # Arrange
     client = make_client()

From 16af543560f559c005f649a47b05c60cce2b2c94 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 13 May 2026 16:32:44 +0000
Subject: [PATCH 22/91] =?UTF-8?q?Consolidate=20three-tier=20matching=20and?=
 =?UTF-8?q?=20tidy=20test=20ordering=20=F0=9F=9F=AA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_client.py       | 13 ++++----
 .../tests/test_pashub_client.py               | 30 +++++++++----------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index 4435c278..25bf7b72 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -90,13 +90,16 @@ class PashubClient:
         if file.evidence_category is not None and file.evidence_category.lower() == "retrofit design":
             return CoreFiles.RETROFIT_DESIGN_DOC
 
-        if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in file.file_name:
-            return CoreFiles.IMPROVEMENT_OPTION_EVALUATION
-
-        if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in file.file_name:
-            return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
+        for core_file in (
+            CoreFiles.IMPROVEMENT_OPTION_EVALUATION,
+            CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN,
+        ):
+            if core_file.value in file.file_name:
+                return core_file
 
         for core_file in CoreFiles:
+            if core_file is CoreFiles.RETROFIT_DESIGN_DOC:
+                continue
             if file.file_name.startswith(core_file.value):
                 return core_file
         return None
diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 9ee8948a..7fd10381 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -45,6 +45,21 @@ def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category()
     assert result == CoreFiles.RETROFIT_DESIGN_DOC
 
 
+def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
+    # Arrange
+    client = make_client()
+    file = make_file(
+        file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
+        evidence_category="Retrofit Design",
+    )
+
+    # Act
+    result = client._get_core_file_type(file)
+
+    # Assert
+    assert result == CoreFiles.RETROFIT_DESIGN_DOC
+
+
 def test_get_core_file_type_returns_improvement_option_evaluation_via_substring() -> None:
     # Arrange
     client = make_client()
@@ -183,18 +198,3 @@ def test_select_latest_core_files_falls_back_to_latest_when_no_osm_candidates()
 
     # Assert
     assert result[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "retrofit_design_v2.pdf"
-
-
-def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
-    # Arrange
-    client = make_client()
-    file = make_file(
-        file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
-        evidence_category="Retrofit Design",
-    )
-
-    # Act
-    result = client._get_core_file_type(file)
-
-    # Assert
-    assert result == CoreFiles.RETROFIT_DESIGN_DOC

From 664c9b91fa9e280766dbadda11a065b6c044d0a9 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 07:38:43 +0000
Subject: [PATCH 23/91] delete incorrect comment in test

---
 .../pashub_fetcher/tests/test_core_files.py   | 51 ++++++++++++++-----
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index f8e8b431..8715f6ca 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -1,6 +1,5 @@
 from backend.pashub_fetcher.core_files import infer_file_type
 
-
 # --- GREEN: pre-existing file types (startswith match) ---
 
 
@@ -13,15 +12,22 @@ def test_infer_sitenote():
 
 
 def test_infer_rdsap_sitenote():
-    assert infer_file_type("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note"
+    assert (
+        infer_file_type("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note"
+    )
 
 
 def test_infer_pas2023_ventilation():
-    assert infer_file_type("PAS 2023 Ventilation Assessment Report_123456.pdf") == "pas_2023_ventilation"
+    assert (
+        infer_file_type("PAS 2023 Ventilation Assessment Report_123456.pdf")
+        == "pas_2023_ventilation"
+    )
 
 
 def test_infer_pas2023_condition():
-    assert infer_file_type("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition"
+    assert (
+        infer_file_type("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition"
+    )
 
 
 def test_infer_pas_significance():
@@ -29,34 +35,51 @@ def test_infer_pas_significance():
 
 
 def test_infer_par_photopack():
-    assert infer_file_type("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack"
+    assert (
+        infer_file_type("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack"
+    )
 
 
 def test_infer_pas2023_property():
-    assert infer_file_type("PAS 2023 Property Assessment Report_123456.pdf") == "pas_2023_property"
+    assert (
+        infer_file_type("PAS 2023 Property Assessment Report_123456.pdf")
+        == "pas_2023_property"
+    )
 
 
 def test_infer_pas2023_occupancy():
-    assert infer_file_type("PAS 2023 Occupancy Assessment Report_123456.pdf") == "pas_2023_occupancy"
+    assert (
+        infer_file_type("PAS 2023 Occupancy Assessment Report_123456.pdf")
+        == "pas_2023_occupancy"
+    )
 
 
 def test_infer_unknown_returns_none():
     assert infer_file_type("unknown_document_123.pdf") is None
 
 
-# --- RED: new file types (suffix match not yet implemented) ---
-
-
 def test_infer_improvement_option_evaluation():
     # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf"
-    assert infer_file_type("6000802 - NG4 4HD - Improvement Option Evaluation.pdf") == "improvement_option_evaluation"
+    assert (
+        infer_file_type("6000802 - NG4 4HD - Improvement Option Evaluation.pdf")
+        == "improvement_option_evaluation"
+    )
 
 
 def test_infer_medium_term_improvement_plan():
     # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf"
-    assert infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf") == "medium_term_improvement_plan"
+    assert (
+        infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf")
+        == "medium_term_improvement_plan"
+    )
 
 
 def test_infer_retrofit_design_doc():
-    assert infer_file_type("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf") == "retrofit_design_doc"
-    assert infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf") == "retrofit_design_doc"
+    assert (
+        infer_file_type("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf")
+        == "retrofit_design_doc"
+    )
+    assert (
+        infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf")
+        == "retrofit_design_doc"
+    )

From 75093fc8333b1cb2ff80cca61e4588e73a448f6a Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 07:38:58 +0000
Subject: [PATCH 24/91] delete incorrect comment in test

---
 backend/pashub_fetcher/tests/test_core_files.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index 8715f6ca..8bd31f15 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -1,7 +1,5 @@
 from backend.pashub_fetcher.core_files import infer_file_type
 
-# --- GREEN: pre-existing file types (startswith match) ---
-
 
 def test_infer_photopack():
     assert infer_file_type("Photopack_123456_V1.pdf") == "photo_pack"

From 1a789ec609c4b6ca6afe3ea83e9a753687f8a0a4 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:37:32 +0000
Subject: [PATCH 25/91] =?UTF-8?q?new=20core=5Ffile=5Ffor=20function=20iden?=
 =?UTF-8?q?tifies=20CoreFiles=20type=20from=20filename=20and=20evidence=20?=
 =?UTF-8?q?category=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py           |  6 ++++++
 .../pashub_fetcher/tests/test_core_files.py    | 18 +++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 3e69bf9a..050dde27 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -62,6 +62,12 @@ _MATCHERS: list[tuple[Callable[[str], bool], str]] = [
 ]
 
 
+def core_file_for(
+    filename: str, evidence_category: Optional[str] = None
+) -> Optional[CoreFiles]:
+    raise NotImplementedError
+
+
 def infer_file_type(filename: str) -> Optional[str]:
     for matcher, file_type in _MATCHERS:
         if matcher(filename):
diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index 8bd31f15..5ac6b4f7 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -1,4 +1,4 @@
-from backend.pashub_fetcher.core_files import infer_file_type
+from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, infer_file_type
 
 
 def test_infer_photopack():
@@ -81,3 +81,19 @@ def test_infer_retrofit_design_doc():
         infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf")
         == "retrofit_design_doc"
     )
+
+
+# ---------------------------------------------------------------------------
+# core_file_for
+# ---------------------------------------------------------------------------
+
+
+def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None:
+    # Arrange
+    filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
+
+    # Act
+    result = core_file_for(filename, evidence_category="retrofit design")
+
+    # Assert
+    assert result == CoreFiles.RETROFIT_DESIGN_DOC

From 9adb467a02e42d1d0a82285f1acafa4c344deb1d Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:38:36 +0000
Subject: [PATCH 26/91] =?UTF-8?q?new=20core=5Ffile=5Ffor=20function=20iden?=
 =?UTF-8?q?tifies=20CoreFiles=20type=20from=20filename=20and=20evidence=20?=
 =?UTF-8?q?category=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 050dde27..07297653 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -65,6 +65,8 @@ _MATCHERS: list[tuple[Callable[[str], bool], str]] = [
 def core_file_for(
     filename: str, evidence_category: Optional[str] = None
 ) -> Optional[CoreFiles]:
+    if evidence_category is not None and evidence_category.lower() == "retrofit design":
+        return CoreFiles.RETROFIT_DESIGN_DOC
     raise NotImplementedError
 
 

From e312dd26146115b467437ee60f93de4cb76125ee Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:39:11 +0000
Subject: [PATCH 27/91] =?UTF-8?q?core=5Ffile=5Ffor=20evidence=5Fcategory?=
 =?UTF-8?q?=20match=20is=20case-insensitive=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index 5ac6b4f7..f968a976 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -88,6 +88,17 @@ def test_infer_retrofit_design_doc():
 # ---------------------------------------------------------------------------
 
 
+def test_core_file_for_evidence_category_match_is_case_insensitive() -> None:
+    # Arrange
+    filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
+
+    # Act
+    result = core_file_for(filename, evidence_category="Retrofit Design")
+
+    # Assert
+    assert result == CoreFiles.RETROFIT_DESIGN_DOC
+
+
 def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None:
     # Arrange
     filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"

From 9bbd5f1ff9fc0810383c73a2d7bc8863c4f2c258 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:39:58 +0000
Subject: [PATCH 28/91] =?UTF-8?q?core=5Ffile=5Ffor=20identifies=20IOE=20fi?=
 =?UTF-8?q?les=20via=20filename=20substring=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index f968a976..c6970def 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -108,3 +108,14 @@ def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None:
 
     # Assert
     assert result == CoreFiles.RETROFIT_DESIGN_DOC
+
+
+def test_core_file_for_ioe_substring_returns_improvement_option_evaluation() -> None:
+    # Arrange
+    filename = "6000802 - NG4 4HD - Improvement Option Evaluation.pdf"
+
+    # Act
+    result = core_file_for(filename)
+
+    # Assert
+    assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION

From 46355be3f1e24d10662583afa3b8b55f3a1d8cc6 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:40:21 +0000
Subject: [PATCH 29/91] =?UTF-8?q?core=5Ffile=5Ffor=20identifies=20IOE=20fi?=
 =?UTF-8?q?les=20via=20filename=20substring=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 07297653..72ef15f8 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -67,6 +67,8 @@ def core_file_for(
 ) -> Optional[CoreFiles]:
     if evidence_category is not None and evidence_category.lower() == "retrofit design":
         return CoreFiles.RETROFIT_DESIGN_DOC
+    if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename:
+        return CoreFiles.IMPROVEMENT_OPTION_EVALUATION
     raise NotImplementedError
 
 

From 176239475a977943bf81e6bddb9d042bbbb5d014 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:40:49 +0000
Subject: [PATCH 30/91] =?UTF-8?q?core=5Ffile=5Ffor=20identifies=20MTIP=20f?=
 =?UTF-8?q?iles=20via=20filename=20substring=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index c6970def..85e7607e 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -119,3 +119,14 @@ def test_core_file_for_ioe_substring_returns_improvement_option_evaluation() ->
 
     # Assert
     assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION
+
+
+def test_core_file_for_mtip_substring_returns_medium_term_improvement_plan() -> None:
+    # Arrange
+    filename = "60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf"
+
+    # Act
+    result = core_file_for(filename)
+
+    # Assert
+    assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN

From 4d3d6dba05477bef466f64dde09f4d88956efad0 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:41:26 +0000
Subject: [PATCH 31/91] =?UTF-8?q?core=5Ffile=5Ffor=20identifies=20MTIP=20f?=
 =?UTF-8?q?iles=20via=20filename=20substring=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 72ef15f8..4b1023d2 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -69,6 +69,8 @@ def core_file_for(
         return CoreFiles.RETROFIT_DESIGN_DOC
     if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename:
         return CoreFiles.IMPROVEMENT_OPTION_EVALUATION
+    if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename:
+        return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
     raise NotImplementedError
 
 

From e940e75a43f1a3aebe8a78dc7bd06d4c648997fb Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:41:52 +0000
Subject: [PATCH 32/91] =?UTF-8?q?core=5Ffile=5Ffor=20falls=20back=20to=20O?=
 =?UTF-8?q?SM=20filename=20pattern=20for=20Retrofit=20Design=20Doc=20?=
 =?UTF-8?q?=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index 85e7607e..7b991c23 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -130,3 +130,14 @@ def test_core_file_for_mtip_substring_returns_medium_term_improvement_plan() ->
 
     # Assert
     assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
+
+
+def test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_category() -> None:
+    # Arrange
+    filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
+
+    # Act
+    result = core_file_for(filename)
+
+    # Assert
+    assert result == CoreFiles.RETROFIT_DESIGN_DOC

From 3ef8a591223ea50ade12b36545dda1f92542abee Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:43:04 +0000
Subject: [PATCH 33/91] =?UTF-8?q?core=5Ffile=5Ffor=20falls=20back=20to=20O?=
 =?UTF-8?q?SM=20filename=20pattern=20for=20Retrofit=20Design=20Doc=20?=
 =?UTF-8?q?=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 4b1023d2..75981cb1 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -71,6 +71,8 @@ def core_file_for(
         return CoreFiles.IMPROVEMENT_OPTION_EVALUATION
     if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename:
         return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
+    if evidence_category is None and "-OSM-" in filename and "DR-N-A" in filename:
+        return CoreFiles.RETROFIT_DESIGN_DOC
     raise NotImplementedError
 
 

From a2dc945bf38005826a6bc713e3d93ca30b5a79e0 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:43:41 +0000
Subject: [PATCH 34/91] =?UTF-8?q?core=5Ffile=5Ffor=20matches=20remaining?=
 =?UTF-8?q?=20core=20file=20types=20via=20filename=20prefix=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index 7b991c23..f87d8679 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -141,3 +141,14 @@ def test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_
 
     # Assert
     assert result == CoreFiles.RETROFIT_DESIGN_DOC
+
+
+def test_core_file_for_prefix_returns_photopack() -> None:
+    # Arrange
+    filename = "Photopack_123456_V1.pdf"
+
+    # Act
+    result = core_file_for(filename)
+
+    # Assert
+    assert result == CoreFiles.PHOTOPACK

From 605f2e3d1e1f5bdc3ceaa953fd7150f938ade72f Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:45:18 +0000
Subject: [PATCH 35/91] =?UTF-8?q?core=5Ffile=5Ffor=20matches=20remaining?=
 =?UTF-8?q?=20core=20file=20types=20via=20filename=20prefix=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 75981cb1..87a4044a 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -73,7 +73,17 @@ def core_file_for(
         return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
     if evidence_category is None and "-OSM-" in filename and "DR-N-A" in filename:
         return CoreFiles.RETROFIT_DESIGN_DOC
-    raise NotImplementedError
+    _prefix_skip = {
+        CoreFiles.RETROFIT_DESIGN_DOC,
+        CoreFiles.IMPROVEMENT_OPTION_EVALUATION,
+        CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN,
+    }
+    for core_file in CoreFiles:
+        if core_file in _prefix_skip:
+            continue
+        if filename.startswith(core_file.value):
+            return core_file
+    return None
 
 
 def infer_file_type(filename: str) -> Optional[str]:

From d4cc00b5e31d7b6653ccb0a7f1307b2638dc2a12 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:46:10 +0000
Subject: [PATCH 36/91] =?UTF-8?q?core=5Ffile=5Ffor=20returns=20None=20for?=
 =?UTF-8?q?=20unrecognised=20filenames=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index f87d8679..2b20803c 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -152,3 +152,14 @@ def test_core_file_for_prefix_returns_photopack() -> None:
 
     # Assert
     assert result == CoreFiles.PHOTOPACK
+
+
+def test_core_file_for_unknown_filename_returns_none() -> None:
+    # Arrange
+    filename = "unknown_document_123.pdf"
+
+    # Act
+    result = core_file_for(filename)
+
+    # Assert
+    assert result is None

From 541d5965b7619090b9d1a564761e424cba37d86e Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:46:48 +0000
Subject: [PATCH 37/91] =?UTF-8?q?core=5Ffile=5Ffor=20OSM=20fallback=20is?=
 =?UTF-8?q?=20suppressed=20when=20evidence=5Fcategory=20is=20present=20?=
 =?UTF-8?q?=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_core_files.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index 2b20803c..e97df476 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -163,3 +163,14 @@ def test_core_file_for_unknown_filename_returns_none() -> None:
 
     # Assert
     assert result is None
+
+
+def test_core_file_for_osm_fallback_does_not_fire_when_evidence_category_present() -> None:
+    # Arrange — OSM+DR-N-A filename but evidence_category is something other than retrofit design
+    filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
+
+    # Act
+    result = core_file_for(filename, evidence_category="some other category")
+
+    # Assert
+    assert result is None

From 5e31c0f3dadd4d4da36dc023612388ef66f5b4c9 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:51:28 +0000
Subject: [PATCH 38/91] =?UTF-8?q?file=5Ftype=5Ffor=20delegates=20to=20core?=
 =?UTF-8?q?=5Ffile=5Ffor;=20=5FMATCHERS=20removed=20=F0=9F=9F=AA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py          | 67 ++++++-------------
 backend/pashub_fetcher/pashub_service.py      |  4 +-
 .../pashub_fetcher/tests/test_core_files.py   | 30 ++++-----
 3 files changed, 37 insertions(+), 64 deletions(-)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 87a4044a..01ae189f 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Callable, Optional
+from typing import Optional
 
 from backend.app.db.models.uploaded_file import FileTypeEnum
 
@@ -19,47 +19,20 @@ class CoreFiles(Enum):
     RETROFIT_DESIGN_DOC = "Retrofit Design Doc"
 
 
-_MATCHERS: list[tuple[Callable[[str], bool], str]] = [
-    (lambda f: f.startswith(CoreFiles.PHOTOPACK.value), FileTypeEnum.PHOTO_PACK.value),
-    (lambda f: f.startswith(CoreFiles.SITENOTE.value), FileTypeEnum.SITE_NOTE.value),
-    (
-        lambda f: f.startswith(CoreFiles.RDSAP_SITENOTE.value),
-        FileTypeEnum.RD_SAP_SITE_NOTE.value,
-    ),
-    (
-        lambda f: f.startswith(CoreFiles.PAS2023_VENTILATION.value),
-        FileTypeEnum.PAS_2023_VENTILATION.value,
-    ),
-    (
-        lambda f: f.startswith(CoreFiles.PAS2023_CONDITION.value),
-        FileTypeEnum.PAS_2023_CONDITION.value,
-    ),
-    (
-        lambda f: f.startswith(CoreFiles.PAS_SIGNIFICANCE.value),
-        FileTypeEnum.PAS_SIGNIFICANCE.value,
-    ),
-    (
-        lambda f: f.startswith(CoreFiles.PAR_PHOTOPACK.value),
-        FileTypeEnum.PAR_PHOTO_PACK.value,
-    ),
-    (
-        lambda f: f.startswith(CoreFiles.PAS2023_PROPERTY.value),
-        FileTypeEnum.PAS_2023_PROPERTY.value,
-    ),
-    (
-        lambda f: f.startswith(CoreFiles.PAS2023_OCCUPANCY.value),
-        FileTypeEnum.PAS_2023_OCCUPANCY.value,
-    ),
-    (
-        lambda f: CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in f,
-        FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value,
-    ),
-    (
-        lambda f: CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in f,
-        FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value,
-    ),
-    (lambda f: "-OSM-" in f and "DR-N-A" in f, FileTypeEnum.RETROFIT_DESIGN_DOC.value),
-]
+_CORE_FILE_TO_FILE_TYPE: dict[CoreFiles, str] = {
+    CoreFiles.PHOTOPACK: FileTypeEnum.PHOTO_PACK.value,
+    CoreFiles.SITENOTE: FileTypeEnum.SITE_NOTE.value,
+    CoreFiles.RDSAP_SITENOTE: FileTypeEnum.RD_SAP_SITE_NOTE.value,
+    CoreFiles.PAS2023_VENTILATION: FileTypeEnum.PAS_2023_VENTILATION.value,
+    CoreFiles.PAS2023_CONDITION: FileTypeEnum.PAS_2023_CONDITION.value,
+    CoreFiles.PAS_SIGNIFICANCE: FileTypeEnum.PAS_SIGNIFICANCE.value,
+    CoreFiles.PAR_PHOTOPACK: FileTypeEnum.PAR_PHOTO_PACK.value,
+    CoreFiles.PAS2023_PROPERTY: FileTypeEnum.PAS_2023_PROPERTY.value,
+    CoreFiles.PAS2023_OCCUPANCY: FileTypeEnum.PAS_2023_OCCUPANCY.value,
+    CoreFiles.IMPROVEMENT_OPTION_EVALUATION: FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value,
+    CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN: FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value,
+    CoreFiles.RETROFIT_DESIGN_DOC: FileTypeEnum.RETROFIT_DESIGN_DOC.value,
+}
 
 
 def core_file_for(
@@ -86,8 +59,8 @@ def core_file_for(
     return None
 
 
-def infer_file_type(filename: str) -> Optional[str]:
-    for matcher, file_type in _MATCHERS:
-        if matcher(filename):
-            return file_type
-    return None
+def file_type_for(filename: str) -> Optional[str]:
+    core_file = core_file_for(filename)
+    if core_file is None:
+        return None
+    return _CORE_FILE_TO_FILE_TYPE[core_file]
diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py
index 316902f4..ec623f7a 100644
--- a/backend/pashub_fetcher/pashub_service.py
+++ b/backend/pashub_fetcher/pashub_service.py
@@ -10,7 +10,7 @@ from backend.app.db.models.uploaded_file import (
 )
 from backend.documents_parser.db_writer import save_epc_property_data
 from backend.documents_parser.parser import parse_site_notes_pdf
-from backend.pashub_fetcher.core_files import infer_file_type
+from backend.pashub_fetcher.core_files import file_type_for
 from backend.pashub_fetcher.pashub_client import PashubClient
 from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
     PashubToAraTriggerRequest,
@@ -109,7 +109,7 @@ class PashubService:
                 uprn=int(uprn) if uprn else None,
                 hubspot_deal_id=hubspot_deal_id,
                 file_source=FileSourceEnum.PAS_HUB.value,
-                file_type=infer_file_type(filename),
+                file_type=file_type_for(filename),
             )
             file_paths.append(file_path)
             uploaded_files.append(uploaded_file)
diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index e97df476..09fcdcb2 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -1,65 +1,65 @@
-from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, infer_file_type
+from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, file_type_for
 
 
 def test_infer_photopack():
-    assert infer_file_type("Photopack_123456_V1.pdf") == "photo_pack"
+    assert file_type_for("Photopack_123456_V1.pdf") == "photo_pack"
 
 
 def test_infer_sitenote():
-    assert infer_file_type("SiteNote_123456_V1.pdf") == "site_note"
+    assert file_type_for("SiteNote_123456_V1.pdf") == "site_note"
 
 
 def test_infer_rdsap_sitenote():
     assert (
-        infer_file_type("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note"
+        file_type_for("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note"
     )
 
 
 def test_infer_pas2023_ventilation():
     assert (
-        infer_file_type("PAS 2023 Ventilation Assessment Report_123456.pdf")
+        file_type_for("PAS 2023 Ventilation Assessment Report_123456.pdf")
         == "pas_2023_ventilation"
     )
 
 
 def test_infer_pas2023_condition():
     assert (
-        infer_file_type("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition"
+        file_type_for("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition"
     )
 
 
 def test_infer_pas_significance():
-    assert infer_file_type("PAS Significance_123456.pdf") == "pas_significance"
+    assert file_type_for("PAS Significance_123456.pdf") == "pas_significance"
 
 
 def test_infer_par_photopack():
     assert (
-        infer_file_type("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack"
+        file_type_for("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack"
     )
 
 
 def test_infer_pas2023_property():
     assert (
-        infer_file_type("PAS 2023 Property Assessment Report_123456.pdf")
+        file_type_for("PAS 2023 Property Assessment Report_123456.pdf")
         == "pas_2023_property"
     )
 
 
 def test_infer_pas2023_occupancy():
     assert (
-        infer_file_type("PAS 2023 Occupancy Assessment Report_123456.pdf")
+        file_type_for("PAS 2023 Occupancy Assessment Report_123456.pdf")
         == "pas_2023_occupancy"
     )
 
 
 def test_infer_unknown_returns_none():
-    assert infer_file_type("unknown_document_123.pdf") is None
+    assert file_type_for("unknown_document_123.pdf") is None
 
 
 def test_infer_improvement_option_evaluation():
     # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf"
     assert (
-        infer_file_type("6000802 - NG4 4HD - Improvement Option Evaluation.pdf")
+        file_type_for("6000802 - NG4 4HD - Improvement Option Evaluation.pdf")
         == "improvement_option_evaluation"
     )
 
@@ -67,18 +67,18 @@ def test_infer_improvement_option_evaluation():
 def test_infer_medium_term_improvement_plan():
     # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf"
     assert (
-        infer_file_type("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf")
+        file_type_for("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf")
         == "medium_term_improvement_plan"
     )
 
 
 def test_infer_retrofit_design_doc():
     assert (
-        infer_file_type("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf")
+        file_type_for("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf")
         == "retrofit_design_doc"
     )
     assert (
-        infer_file_type("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf")
+        file_type_for("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf")
         == "retrofit_design_doc"
     )
 

From fb9bdbc585940e4afe714c152cfc52b48559336d Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 08:53:56 +0000
Subject: [PATCH 39/91] =?UTF-8?q?=5Fselect=5Flatest=5Fcore=5Ffiles=20deleg?=
 =?UTF-8?q?ates=20to=20core=5Ffile=5Ffor;=20=5Fget=5Fcore=5Ffile=5Ftype=20?=
 =?UTF-8?q?removed=20=F0=9F=9F=AA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_client.py       | 22 +----
 .../tests/test_pashub_client.py               | 83 -------------------
 2 files changed, 2 insertions(+), 103 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index 25bf7b72..f851c410 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -5,7 +5,7 @@ from datetime import datetime
 
 import requests
 
-from backend.pashub_fetcher.core_files import CoreFiles
+from backend.pashub_fetcher.core_files import CoreFiles, core_file_for
 from backend.pashub_fetcher.evidence_file_data import EvidenceFileData
 from backend.pashub_fetcher.evidence_metadata import EvidenceMetadata
 from utils.logger import setup_logger
@@ -86,24 +86,6 @@ class PashubClient:
         except Exception:
             return None
 
-    def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]:
-        if file.evidence_category is not None and file.evidence_category.lower() == "retrofit design":
-            return CoreFiles.RETROFIT_DESIGN_DOC
-
-        for core_file in (
-            CoreFiles.IMPROVEMENT_OPTION_EVALUATION,
-            CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN,
-        ):
-            if core_file.value in file.file_name:
-                return core_file
-
-        for core_file in CoreFiles:
-            if core_file is CoreFiles.RETROFIT_DESIGN_DOC:
-                continue
-            if file.file_name.startswith(core_file.value):
-                return core_file
-        return None
-
     def _select_latest_core_files(
         self,
         files: List[EvidenceFileData],
@@ -111,7 +93,7 @@ class PashubClient:
         grouped: Dict[CoreFiles, List[EvidenceFileData]] = defaultdict(list)
 
         for file in files:
-            core_type = self._get_core_file_type(file)
+            core_type = core_file_for(file.file_name, file.evidence_category)
             if not core_type:
                 continue
             grouped[core_type].append(file)
diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py
index 7fd10381..34260c73 100644
--- a/backend/pashub_fetcher/tests/test_pashub_client.py
+++ b/backend/pashub_fetcher/tests/test_pashub_client.py
@@ -25,89 +25,6 @@ def make_file(
     )
 
 
-# ---------------------------------------------------------------------------
-# _get_core_file_type
-# ---------------------------------------------------------------------------
-
-
-def test_get_core_file_type_returns_retrofit_design_doc_for_evidence_category() -> None:
-    # Arrange
-    client = make_client()
-    file = make_file(
-        file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
-        evidence_category="retrofit design",
-    )
-
-    # Act
-    result = client._get_core_file_type(file)
-
-    # Assert
-    assert result == CoreFiles.RETROFIT_DESIGN_DOC
-
-
-def test_get_core_file_type_evidence_category_match_is_case_insensitive() -> None:
-    # Arrange
-    client = make_client()
-    file = make_file(
-        file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
-        evidence_category="Retrofit Design",
-    )
-
-    # Act
-    result = client._get_core_file_type(file)
-
-    # Assert
-    assert result == CoreFiles.RETROFIT_DESIGN_DOC
-
-
-def test_get_core_file_type_returns_improvement_option_evaluation_via_substring() -> None:
-    # Arrange
-    client = make_client()
-    file = make_file(file_name="6000802 - NG4 4HD - Improvement Option Evaluation.pdf")
-
-    # Act
-    result = client._get_core_file_type(file)
-
-    # Assert
-    assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION
-
-
-def test_get_core_file_type_returns_medium_term_improvement_plan_via_substring() -> None:
-    # Arrange
-    client = make_client()
-    file = make_file(file_name="60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf")
-
-    # Act
-    result = client._get_core_file_type(file)
-
-    # Assert
-    assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
-
-
-def test_get_core_file_type_returns_photopack_via_prefix() -> None:
-    # Arrange
-    client = make_client()
-    file = make_file(file_name="Photopack_123456_V1.pdf")
-
-    # Act
-    result = client._get_core_file_type(file)
-
-    # Assert
-    assert result == CoreFiles.PHOTOPACK
-
-
-def test_get_core_file_type_returns_none_for_unknown_file() -> None:
-    # Arrange
-    client = make_client()
-    file = make_file(file_name="unknown_document_123.pdf")
-
-    # Act
-    result = client._get_core_file_type(file)
-
-    # Assert
-    assert result is None
-
-
 # ---------------------------------------------------------------------------
 # _select_latest_core_files
 # ---------------------------------------------------------------------------

From e8b7cfdcec0c62389759ba4d7ce8642994df062e Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 09:01:56 +0000
Subject: [PATCH 40/91] =?UTF-8?q?remove=20redundant=20unknown-file=20test;?=
 =?UTF-8?q?=20rename=20test=5Finfer=5F*=20to=20test=5Ffile=5Ftype=5Ffor=5F?=
 =?UTF-8?q?*=20=F0=9F=9F=AA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/core_files.py          |  9 ++++++
 .../pashub_fetcher/tests/test_core_files.py   | 28 ++++++++-----------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 01ae189f..e668ba7f 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -38,24 +38,33 @@ _CORE_FILE_TO_FILE_TYPE: dict[CoreFiles, str] = {
 def core_file_for(
     filename: str, evidence_category: Optional[str] = None
 ) -> Optional[CoreFiles]:
+    # Identify retrofit design doc using evidence category as the name is possibly unreliable.
+    # We might change to always use evidence category, but needs more investigation
     if evidence_category is not None and evidence_category.lower() == "retrofit design":
         return CoreFiles.RETROFIT_DESIGN_DOC
+
     if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename:
         return CoreFiles.IMPROVEMENT_OPTION_EVALUATION
+
     if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename:
         return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
+
     if evidence_category is None and "-OSM-" in filename and "DR-N-A" in filename:
         return CoreFiles.RETROFIT_DESIGN_DOC
+
     _prefix_skip = {
         CoreFiles.RETROFIT_DESIGN_DOC,
         CoreFiles.IMPROVEMENT_OPTION_EVALUATION,
         CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN,
     }
+
     for core_file in CoreFiles:
         if core_file in _prefix_skip:
             continue
+
         if filename.startswith(core_file.value):
             return core_file
+
     return None
 
 
diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index 09fcdcb2..ee91298e 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -1,62 +1,58 @@
 from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, file_type_for
 
 
-def test_infer_photopack():
+def test_file_type_for_photopack():
     assert file_type_for("Photopack_123456_V1.pdf") == "photo_pack"
 
 
-def test_infer_sitenote():
+def test_file_type_for_sitenote():
     assert file_type_for("SiteNote_123456_V1.pdf") == "site_note"
 
 
-def test_infer_rdsap_sitenote():
+def test_file_type_for_rdsap_sitenote():
     assert (
         file_type_for("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note"
     )
 
 
-def test_infer_pas2023_ventilation():
+def test_file_type_for_pas2023_ventilation():
     assert (
         file_type_for("PAS 2023 Ventilation Assessment Report_123456.pdf")
         == "pas_2023_ventilation"
     )
 
 
-def test_infer_pas2023_condition():
+def test_file_type_for_pas2023_condition():
     assert (
         file_type_for("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition"
     )
 
 
-def test_infer_pas_significance():
+def test_file_type_for_pas_significance():
     assert file_type_for("PAS Significance_123456.pdf") == "pas_significance"
 
 
-def test_infer_par_photopack():
+def test_file_type_for_par_photopack():
     assert (
         file_type_for("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack"
     )
 
 
-def test_infer_pas2023_property():
+def test_file_type_for_pas2023_property():
     assert (
         file_type_for("PAS 2023 Property Assessment Report_123456.pdf")
         == "pas_2023_property"
     )
 
 
-def test_infer_pas2023_occupancy():
+def test_file_type_for_pas2023_occupancy():
     assert (
         file_type_for("PAS 2023 Occupancy Assessment Report_123456.pdf")
         == "pas_2023_occupancy"
     )
 
 
-def test_infer_unknown_returns_none():
-    assert file_type_for("unknown_document_123.pdf") is None
-
-
-def test_infer_improvement_option_evaluation():
+def test_file_type_for_improvement_option_evaluation():
     # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf"
     assert (
         file_type_for("6000802 - NG4 4HD - Improvement Option Evaluation.pdf")
@@ -64,7 +60,7 @@ def test_infer_improvement_option_evaluation():
     )
 
 
-def test_infer_medium_term_improvement_plan():
+def test_file_type_for_medium_term_improvement_plan():
     # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf"
     assert (
         file_type_for("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf")
@@ -72,7 +68,7 @@ def test_infer_medium_term_improvement_plan():
     )
 
 
-def test_infer_retrofit_design_doc():
+def test_file_type_for_retrofit_design_doc():
     assert (
         file_type_for("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf")
         == "retrofit_design_doc"

From faf698eb7162af4a2f08da1379d5ce3f1be41444 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 10:57:37 +0000
Subject: [PATCH 41/91] rename functions and include typehints

---
 backend/pashub_fetcher/core_files.py          |  6 +-
 backend/pashub_fetcher/pashub_client.py       |  7 ++-
 backend/pashub_fetcher/pashub_service.py      |  4 +-
 .../pashub_fetcher/tests/test_core_files.py   | 61 +++++++++++--------
 4 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index e668ba7f..30aa2ba8 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -35,7 +35,7 @@ _CORE_FILE_TO_FILE_TYPE: dict[CoreFiles, str] = {
 }
 
 
-def core_file_for(
+def get_core_file_type(
     filename: str, evidence_category: Optional[str] = None
 ) -> Optional[CoreFiles]:
     # Identify retrofit design doc using evidence category as the name is possibly unreliable.
@@ -68,8 +68,8 @@ def core_file_for(
     return None
 
 
-def file_type_for(filename: str) -> Optional[str]:
-    core_file = core_file_for(filename)
+def get_file_type_string(filename: str) -> Optional[str]:
+    core_file = get_core_file_type(filename)
     if core_file is None:
         return None
     return _CORE_FILE_TO_FILE_TYPE[core_file]
diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index f851c410..7896664d 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -5,12 +5,11 @@ from datetime import datetime
 
 import requests
 
-from backend.pashub_fetcher.core_files import CoreFiles, core_file_for
+from backend.pashub_fetcher.core_files import CoreFiles, get_core_file_type
 from backend.pashub_fetcher.evidence_file_data import EvidenceFileData
 from backend.pashub_fetcher.evidence_metadata import EvidenceMetadata
 from utils.logger import setup_logger
 
-
 logger = setup_logger()
 
 
@@ -93,7 +92,9 @@ class PashubClient:
         grouped: Dict[CoreFiles, List[EvidenceFileData]] = defaultdict(list)
 
         for file in files:
-            core_type = core_file_for(file.file_name, file.evidence_category)
+            core_type: Optional[CoreFiles] = get_core_file_type(
+                file.file_name, file.evidence_category
+            )
             if not core_type:
                 continue
             grouped[core_type].append(file)
diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py
index ec623f7a..b3302fd9 100644
--- a/backend/pashub_fetcher/pashub_service.py
+++ b/backend/pashub_fetcher/pashub_service.py
@@ -10,7 +10,7 @@ from backend.app.db.models.uploaded_file import (
 )
 from backend.documents_parser.db_writer import save_epc_property_data
 from backend.documents_parser.parser import parse_site_notes_pdf
-from backend.pashub_fetcher.core_files import file_type_for
+from backend.pashub_fetcher.core_files import get_file_type_string
 from backend.pashub_fetcher.pashub_client import PashubClient
 from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
     PashubToAraTriggerRequest,
@@ -109,7 +109,7 @@ class PashubService:
                 uprn=int(uprn) if uprn else None,
                 hubspot_deal_id=hubspot_deal_id,
                 file_source=FileSourceEnum.PAS_HUB.value,
-                file_type=file_type_for(filename),
+                file_type=get_file_type_string(filename),
             )
             file_paths.append(file_path)
             uploaded_files.append(uploaded_file)
diff --git a/backend/pashub_fetcher/tests/test_core_files.py b/backend/pashub_fetcher/tests/test_core_files.py
index ee91298e..3c1d11b8 100644
--- a/backend/pashub_fetcher/tests/test_core_files.py
+++ b/backend/pashub_fetcher/tests/test_core_files.py
@@ -1,53 +1,60 @@
-from backend.pashub_fetcher.core_files import CoreFiles, core_file_for, file_type_for
+from backend.pashub_fetcher.core_files import (
+    CoreFiles,
+    get_core_file_type,
+    get_file_type_string,
+)
 
 
 def test_file_type_for_photopack():
-    assert file_type_for("Photopack_123456_V1.pdf") == "photo_pack"
+    assert get_file_type_string("Photopack_123456_V1.pdf") == "photo_pack"
 
 
 def test_file_type_for_sitenote():
-    assert file_type_for("SiteNote_123456_V1.pdf") == "site_note"
+    assert get_file_type_string("SiteNote_123456_V1.pdf") == "site_note"
 
 
 def test_file_type_for_rdsap_sitenote():
     assert (
-        file_type_for("RdSAP_SiteNote_9510890_V1_Assessmet.pdf") == "rd_sap_site_note"
+        get_file_type_string("RdSAP_SiteNote_9510890_V1_Assessmet.pdf")
+        == "rd_sap_site_note"
     )
 
 
 def test_file_type_for_pas2023_ventilation():
     assert (
-        file_type_for("PAS 2023 Ventilation Assessment Report_123456.pdf")
+        get_file_type_string("PAS 2023 Ventilation Assessment Report_123456.pdf")
         == "pas_2023_ventilation"
     )
 
 
 def test_file_type_for_pas2023_condition():
     assert (
-        file_type_for("PAS 2023 Condition Report_123456.pdf") == "pas_2023_condition"
+        get_file_type_string("PAS 2023 Condition Report_123456.pdf")
+        == "pas_2023_condition"
     )
 
 
 def test_file_type_for_pas_significance():
-    assert file_type_for("PAS Significance_123456.pdf") == "pas_significance"
+    assert get_file_type_string("PAS Significance_123456.pdf") == "pas_significance"
 
 
 def test_file_type_for_par_photopack():
     assert (
-        file_type_for("PAR Photo Pack_95101890_V2_Assessment.pdf") == "par_photo_pack"
+        get_file_type_string("PAR Photo Pack_95101890_V2_Assessment.pdf")
+        == "par_photo_pack"
     )
 
 
 def test_file_type_for_pas2023_property():
     assert (
-        file_type_for("PAS 2023 Property Assessment Report_123456.pdf")
+        get_file_type_string("PAS 2023 Property Assessment Report_123456.pdf")
         == "pas_2023_property"
     )
 
 
 def test_file_type_for_pas2023_occupancy():
     assert (
-        file_type_for("PAS 2023 Occupancy Assessment Report_123456.pdf")
+        get_file_type_string("PAS 2023 Occupancy Assessment Report_123456.pdf")
         == "pas_2023_occupancy"
     )
 
@@ -55,7 +62,7 @@ def test_file_type_for_pas2023_occupancy():
 def test_file_type_for_improvement_option_evaluation():
     # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf"
     assert (
-        file_type_for("6000802 - NG4 4HD - Improvement Option Evaluation.pdf")
+        get_file_type_string("6000802 - NG4 4HD - Improvement Option Evaluation.pdf")
         == "improvement_option_evaluation"
     )
 
@@ -63,18 +70,20 @@ def test_file_type_for_improvement_option_evaluation():
 def test_file_type_for_medium_term_improvement_plan():
     # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf"
     assert (
-        file_type_for("60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf")
+        get_file_type_string(
+            "60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf"
+        )
         == "medium_term_improvement_plan"
     )
 
 
 def test_file_type_for_retrofit_design_doc():
     assert (
-        file_type_for("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf")
+        get_file_type_string("2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf")
         == "retrofit_design_doc"
     )
     assert (
-        file_type_for("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf")
+        get_file_type_string("2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf")
         == "retrofit_design_doc"
     )
 
@@ -89,7 +98,7 @@ def test_core_file_for_evidence_category_match_is_case_insensitive() -> None:
     filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
 
     # Act
-    result = core_file_for(filename, evidence_category="Retrofit Design")
+    result = get_core_file_type(filename, evidence_category="Retrofit Design")
 
     # Assert
     assert result == CoreFiles.RETROFIT_DESIGN_DOC
@@ -100,7 +109,7 @@ def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None:
     filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
 
     # Act
-    result = core_file_for(filename, evidence_category="retrofit design")
+    result = get_core_file_type(filename, evidence_category="retrofit design")
 
     # Assert
     assert result == CoreFiles.RETROFIT_DESIGN_DOC
@@ -111,7 +120,7 @@ def test_core_file_for_ioe_substring_returns_improvement_option_evaluation() ->
     filename = "6000802 - NG4 4HD - Improvement Option Evaluation.pdf"
 
     # Act
-    result = core_file_for(filename)
+    result = get_core_file_type(filename)
 
     # Assert
     assert result == CoreFiles.IMPROVEMENT_OPTION_EVALUATION
@@ -122,18 +131,20 @@ def test_core_file_for_mtip_substring_returns_medium_term_improvement_plan() ->
     filename = "60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf"
 
     # Act
-    result = core_file_for(filename)
+    result = get_core_file_type(filename)
 
     # Assert
     assert result == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
 
 
-def test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_category() -> None:
+def test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_category() -> (
+    None
+):
     # Arrange
     filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
 
     # Act
-    result = core_file_for(filename)
+    result = get_core_file_type(filename)
 
     # Assert
     assert result == CoreFiles.RETROFIT_DESIGN_DOC
@@ -144,7 +155,7 @@ def test_core_file_for_prefix_returns_photopack() -> None:
     filename = "Photopack_123456_V1.pdf"
 
     # Act
-    result = core_file_for(filename)
+    result = get_core_file_type(filename)
 
     # Assert
     assert result == CoreFiles.PHOTOPACK
@@ -155,18 +166,20 @@ def test_core_file_for_unknown_filename_returns_none() -> None:
     filename = "unknown_document_123.pdf"
 
     # Act
-    result = core_file_for(filename)
+    result = get_core_file_type(filename)
 
     # Assert
     assert result is None
 
 
-def test_core_file_for_osm_fallback_does_not_fire_when_evidence_category_present() -> None:
+def test_core_file_for_osm_fallback_does_not_fire_when_evidence_category_present() -> (
+    None
+):
     # Arrange — OSM+DR-N-A filename but evidence_category is something other than retrofit design
     filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
 
     # Act
-    result = core_file_for(filename, evidence_category="some other category")
+    result = get_core_file_type(filename, evidence_category="some other category")
 
     # Assert
     assert result is None

From 955db1c3eb8167bfbd1aa277624e2966eb16f6f8 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 10:58:38 +0000
Subject: [PATCH 42/91] additional typehint

---
 backend/pashub_fetcher/core_files.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py
index 30aa2ba8..e63511eb 100644
--- a/backend/pashub_fetcher/core_files.py
+++ b/backend/pashub_fetcher/core_files.py
@@ -69,7 +69,9 @@ def get_core_file_type(
 
 
 def get_file_type_string(filename: str) -> Optional[str]:
-    core_file = get_core_file_type(filename)
+    core_file: Optional[CoreFiles] = get_core_file_type(filename)
+
     if core_file is None:
         return None
+
     return _CORE_FILE_TO_FILE_TYPE[core_file]

From 03ae73f39adf9515d2d9010ab4a7df6d333652c3 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 13:37:08 +0000
Subject: [PATCH 43/91] trigger via sqs from local file

---
 .../trigger_pashub_sqs_from_file.py           | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 backend/pashub_fetcher/trigger_pashub_sqs_from_file.py

diff --git a/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py b/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py
new file mode 100644
index 00000000..24a29781
--- /dev/null
+++ b/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py
@@ -0,0 +1,103 @@
+import json
+import logging
+import os
+from typing import Any, Optional, cast
+
+import boto3
+from openpyxl import load_workbook
+
+from backend.app.config import get_settings
+from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
+    PashubToAraTriggerRequest,
+)
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger: logging.Logger = logging.getLogger(__name__)
+
+DRY_RUN: bool = True
+
+EXCEL_PATH: str = os.path.join(
+    os.path.dirname(__file__),
+    "united-infrastructure-exports-all-deals-2026-05-14.xlsx",
+)
+
+
+def _build_requests(excel_path: str) -> list[PashubToAraTriggerRequest]:
+    wb = load_workbook(excel_path, data_only=True)
+    ws = wb.worksheets[0]
+
+    headers: dict[str, int] = {}
+    for col in range(1, ws.max_column + 1):
+        header_val = ws.cell(row=1, column=col).value
+        if header_val is not None:
+            headers[str(header_val).strip()] = col
+
+    pashub_col: int = headers["PasHub link"]
+    record_id_col: int = headers["Record ID"]
+    deal_name_col: int = headers["Deal Name"]
+    deal_stage_col: int = headers["Deal Stage"]
+
+    requests: list[PashubToAraTriggerRequest] = []
+
+    for row in range(2, ws.max_row + 1):
+        pashub_link_raw = ws.cell(row=row, column=pashub_col).value
+        if not pashub_link_raw:
+            continue
+
+        pashub_link: str = str(pashub_link_raw).strip()
+
+        record_id_raw = ws.cell(row=row, column=record_id_col).value
+        deal_name_raw = ws.cell(row=row, column=deal_name_col).value
+        deal_stage_raw = ws.cell(row=row, column=deal_stage_col).value
+
+        hubspot_deal_id: Optional[str] = (
+            str(record_id_raw) if record_id_raw is not None else None
+        )
+        address: Optional[str] = (
+            str(deal_name_raw).strip() if deal_name_raw is not None else None
+        )
+        deal_stage: Optional[str] = (
+            str(deal_stage_raw).strip() if deal_stage_raw is not None else None
+        )
+
+        requests.append(
+            PashubToAraTriggerRequest(
+                pashub_link=pashub_link,
+                hubspot_deal_id=hubspot_deal_id,
+                address=address,
+                deal_stage=deal_stage,
+            )
+        )
+
+    return requests
+
+
+def main() -> None:
+    trigger_requests: list[PashubToAraTriggerRequest] = _build_requests(EXCEL_PATH)
+
+    sqs: Any = cast(Any, boto3.client("sqs"))  # type: ignore[reportUnknownMemberType]
+    queue_url: str = get_settings().PASHUB_TO_ARA_SQS_URL
+
+    count: int = 0
+    for request in trigger_requests:
+        action: str = "DRY RUN" if DRY_RUN else "SENDING"
+        logger.info(
+            f"[{action}] deal_id={request.hubspot_deal_id} pashub_link={request.pashub_link}"
+        )
+
+        if not DRY_RUN:
+            response: dict[str, Any] = sqs.send_message(
+                QueueUrl=queue_url,
+                MessageBody=json.dumps(request.model_dump()),
+            )
+            message_id: str = response["MessageId"]
+            logger.info(f"  MessageId: {message_id}")
+
+        count += 1
+
+    label: str = "would send" if DRY_RUN else "sent"
+    print(f"{count} messages {label}")
+
+
+if __name__ == "__main__":
+    main()

From 0b358e6de66a04efefe19f83319f8854fdac52ae Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 13:37:14 +0000
Subject: [PATCH 44/91] =?UTF-8?q?pashub=5Fjob=5Fid=20extracts=20ID=20from?=
 =?UTF-8?q?=20/evidence/view=20links=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../test_pashub_to_ara_trigger_request.py     | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py

diff --git a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py
new file mode 100644
index 00000000..b538fa7e
--- /dev/null
+++ b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py
@@ -0,0 +1,20 @@
+import pytest
+
+from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
+    PashubToAraTriggerRequest,
+)
+
+
+def make_request(pashub_link: str) -> PashubToAraTriggerRequest:
+    return PashubToAraTriggerRequest(pashub_link=pashub_link)
+
+
+def test_pashub_job_id_extracts_id_from_evidence_view_link() -> None:
+    # Arrange
+    request = make_request("https://pashub.net/jobs/job-id-123/evidence/view")
+
+    # Act
+    result = request.pashub_job_id
+
+    # Assert
+    assert result == "job-id-123"

From 567778991961189679c45f1ca0312c5ff089702e Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 13:40:28 +0000
Subject: [PATCH 45/91] =?UTF-8?q?pashub=5Fjob=5Fid=20extracts=20ID=20from?=
 =?UTF-8?q?=20/evidence/view=20links=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../pashub_to_ara_trigger_request.py          |  8 ++++++--
 .../test_pashub_to_ara_trigger_request.py     | 20 +++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
index 518a8dc3..2e077c2e 100644
--- a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
+++ b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
@@ -1,10 +1,11 @@
+import re
 from typing import Optional
 from pydantic import BaseModel
 
 
 class PashubToAraTriggerRequest(BaseModel):
     pashub_link: (
-        str  # e.g. https://pashub.net/jobs/12345-abcd-1234-abcd-12345abcde/details
+        str  # e.g. https://pashub.net/jobs/{id}/details, /jobs/{id}/evidence/view, /jobs/{id}
     )
 
     address: Optional[str] = None
@@ -17,4 +18,7 @@ class PashubToAraTriggerRequest(BaseModel):
 
     @property
     def pashub_job_id(self) -> str:
-        return self.pashub_link.split("/")[-2]
+        match = re.search(r"/jobs/([^/]+)", self.pashub_link)
+        if not match:
+            raise ValueError(f"No job ID found in PasHub link: {self.pashub_link}")
+        return match.group(1)
diff --git a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py
index b538fa7e..6eec1e14 100644
--- a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py
+++ b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py
@@ -9,6 +9,26 @@ def make_request(pashub_link: str) -> PashubToAraTriggerRequest:
     return PashubToAraTriggerRequest(pashub_link=pashub_link)
 
 
+def test_pashub_job_id_raises_for_invalid_link() -> None:
+    # Arrange
+    request = make_request("https://pashub.net/rcs-dashboard")
+
+    # Act / Assert
+    with pytest.raises(ValueError):
+        request.pashub_job_id
+
+
+def test_pashub_job_id_extracts_id_from_bare_job_link() -> None:
+    # Arrange
+    request = make_request("https://pashub.net/jobs/job-id-123")
+
+    # Act
+    result = request.pashub_job_id
+
+    # Assert
+    assert result == "job-id-123"
+
+
 def test_pashub_job_id_extracts_id_from_evidence_view_link() -> None:
     # Arrange
     request = make_request("https://pashub.net/jobs/job-id-123/evidence/view")

From ecd2676c5e9bc909f642855345c13e02ba52d4fc Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 14 May 2026 13:42:38 +0000
Subject: [PATCH 46/91] =?UTF-8?q?pashub=5Fjob=5Fid=20extracts=20job=20ID?=
 =?UTF-8?q?=20from=20all=20valid=20PasHub=20link=20shapes=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_to_ara_trigger_request.py       | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py
index 6eec1e14..56187350 100644
--- a/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py
+++ b/backend/pashub_fetcher/tests/test_pashub_to_ara_trigger_request.py
@@ -9,6 +9,17 @@ def make_request(pashub_link: str) -> PashubToAraTriggerRequest:
     return PashubToAraTriggerRequest(pashub_link=pashub_link)
 
 
+def test_pashub_job_id_extracts_id_from_details_link() -> None:
+    # Arrange
+    request = make_request("https://pashub.net/jobs/job-id-123/details")
+
+    # Act
+    result = request.pashub_job_id
+
+    # Assert
+    assert result == "job-id-123"
+
+
 def test_pashub_job_id_raises_for_invalid_link() -> None:
     # Arrange
     request = make_request("https://pashub.net/rcs-dashboard")

From 572fcc1406d93ed7b0a8c32e1ab53b99183fd6e2 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 14 May 2026 16:38:22 +0000
Subject: [PATCH 47/91] smoke tests: add reusable Lambda smoke-test workflow and per-service PR jobs

---
 .github/workflows/_smoke_test_lambda.yml |  63 +++++++++++++
 .github/workflows/lambda_smoke_tests.yml | 107 +++++++++++++++++++++++
 2 files changed, 170 insertions(+)
 create mode 100644 .github/workflows/_smoke_test_lambda.yml
 create mode 100644 .github/workflows/lambda_smoke_tests.yml

diff --git a/.github/workflows/_smoke_test_lambda.yml b/.github/workflows/_smoke_test_lambda.yml
new file mode 100644
index 00000000..63ec0af4
--- /dev/null
+++ b/.github/workflows/_smoke_test_lambda.yml
@@ -0,0 +1,63 @@
+name: Lambda smoke test
+
+on:
+  workflow_call:
+    inputs:
+      dockerfile_path:
+        required: true
+        type: string
+      build_context:
+        required: false
+        default: "."
+        type: string
+      service_name:
+        required: true
+        type: string
+
+jobs:
+  smoke-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build Lambda image
+        run: |
+          docker build \
+            --platform linux/amd64 \
+            -f ${{ inputs.dockerfile_path }} \
+            -t ${{ inputs.service_name }}-smoke-test:latest \
+            ${{ inputs.build_context }}
+
+      - name: Start Lambda container
+        run: |
+          docker run -d --name ${{ inputs.service_name }}-smoke-test \
+            -p 9000:8080 \
+            ${{ inputs.service_name }}-smoke-test:latest
+
+      - name: Invoke Lambda and check for import errors
+        run: |
+          sleep 2
+          response=$(curl -s -X POST \
+            http://localhost:9000/2015-03-31/functions/function/invocations \
+            -H "Content-Type: application/json" \
+            -d '{"Records":[{"body":"{}"}]}')
+
+          echo "Response: $response"
+
+          if [ -z "$response" ]; then
+            echo "No response from Lambda RIE"
+            exit 1
+          fi
+
+          if echo "$response" | grep -qE 'ImportModuleError|ModuleNotFoundError|ImportError'; then
+            echo "Import error detected in handler"
+            exit 1
+          fi
+
+      - name: Dump container logs
+        if: always()
+        run: docker logs ${{ inputs.service_name }}-smoke-test
+
+      - name: Tear down container
+        if: always()
+        run: docker rm -f ${{ inputs.service_name }}-smoke-test
diff --git a/.github/workflows/lambda_smoke_tests.yml b/.github/workflows/lambda_smoke_tests.yml
new file mode 100644
index 00000000..5ff5420a
--- /dev/null
+++ b/.github/workflows/lambda_smoke_tests.yml
@@ -0,0 +1,107 @@
+name: Lambda Smoke Tests
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  # ============================================================
+  # Ara Engine
+  # ============================================================
+  ara_engine_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/docker/engine.Dockerfile
+      build_context: .
+      service_name: ara-engine
+
+  # ============================================================
+  # Address 2 UPRN
+  # ============================================================
+  address2uprn_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/address2UPRN/handler/Dockerfile
+      build_context: .
+      service_name: address2uprn
+
+  # ============================================================
+  # Postcode Splitter
+  # ============================================================
+  postcode_splitter_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/postcode_splitter/handler/Dockerfile
+      build_context: .
+      service_name: postcode-splitter
+
+  # ============================================================
+  # Bulk Address2UPRN Combiner
+  # ============================================================
+  bulk_address2uprn_combiner_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/bulk_address2uprn_combiner/handler/Dockerfile
+      build_context: .
+      service_name: bulk-address2uprn-combiner
+
+  # ============================================================
+  # Condition ETL
+  # ============================================================
+  condition_etl_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/condition/handler/Dockerfile
+      build_context: .
+      service_name: condition-etl
+
+  # ============================================================
+  # Categorisation
+  # ============================================================
+  categorisation_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/categorisation/handler/Dockerfile
+      build_context: .
+      service_name: categorisation
+
+  # ============================================================
+  # Ordnance Survey
+  # ============================================================
+  ordnance_survey_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/ordnanceSurvey/handler/Dockerfile
+      build_context: .
+      service_name: ordnance-survey
+
+  # ============================================================
+  # Pas Hub Fetcher
+  # ============================================================
+  pashub_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/pashub_fetcher/handler/Dockerfile
+      build_context: .
+      service_name: pashub
+
+  # ============================================================
+  # MagicPlan
+  # ============================================================
+  magic_plan_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: backend/magic_plan/handler/Dockerfile
+      build_context: .
+      service_name: magic-plan
+
+  # ============================================================
+  # HubSpot Scraper
+  # ============================================================
+  hubspot_scraper_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: etl/hubspot/scripts/scraper/handler/Dockerfile
+      build_context: .
+      service_name: hubspot-scraper

From 16e60001800fca1db834d90adf843fdd15b419ce Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 14 May 2026 16:44:18 +0000
Subject: [PATCH 48/91] smoke tests: retry curl on connection refused instead of sleeping

---
 .github/workflows/_smoke_test_lambda.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_smoke_test_lambda.yml b/.github/workflows/_smoke_test_lambda.yml
index 63ec0af4..9b564f73 100644
--- a/.github/workflows/_smoke_test_lambda.yml
+++ b/.github/workflows/_smoke_test_lambda.yml
@@ -36,8 +36,8 @@ jobs:
 
       - name: Invoke Lambda and check for import errors
         run: |
-          sleep 2
-          response=$(curl -s -X POST \
+          response=$(curl -s --retry-connrefused --retry 15 --retry-delay 1 \
+            -X POST \
             http://localhost:9000/2015-03-31/functions/function/invocations \
             -H "Content-Type: application/json" \
             -d '{"Records":[{"body":"{}"}]}')

From 0c3a31ed81a094d0907b321ab2d7ff3ad061e523 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 14 May 2026 16:49:45 +0000
Subject: [PATCH 49/91] smoke tests: mount Lambda RIE for images without lambda-entrypoint.sh

---
 .github/workflows/_smoke_test_lambda.yml | 28 +++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_smoke_test_lambda.yml b/.github/workflows/_smoke_test_lambda.yml
index 9b564f73..3fcf0de4 100644
--- a/.github/workflows/_smoke_test_lambda.yml
+++ b/.github/workflows/_smoke_test_lambda.yml
@@ -20,6 +20,13 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
+      - name: Download AWS Lambda RIE
+        run: |
+          mkdir -p ~/.aws-lambda-rie
+          curl -fsSL -o ~/.aws-lambda-rie/aws-lambda-rie \
+            https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie
+          chmod +x ~/.aws-lambda-rie/aws-lambda-rie
+
       - name: Build Lambda image
         run: |
           docker build \
@@ -30,9 +37,24 @@ jobs:
 
       - name: Start Lambda container
         run: |
-          docker run -d --name ${{ inputs.service_name }}-smoke-test \
-            -p 9000:8080 \
-            ${{ inputs.service_name }}-smoke-test:latest
+          IMG=${{ inputs.service_name }}-smoke-test:latest
+          ENTRY=$(docker inspect --format='{{range .Config.Entrypoint}}{{.}} {{end}}' "$IMG")
+          CMD_ARGS=$(docker inspect --format='{{range .Config.Cmd}}{{.}} {{end}}' "$IMG")
+
+          if echo "$ENTRY" | grep -q "lambda-entrypoint.sh"; then
+            # AWS base image — RIE is bundled
+            docker run -d --name ${{ inputs.service_name }}-smoke-test \
+              -p 9000:8080 \
+              "$IMG"
+          else
+            # Custom base — mount RIE from runner and re-wire entrypoint
+            docker run -d --name ${{ inputs.service_name }}-smoke-test \
+              -v "$HOME/.aws-lambda-rie:/aws-lambda-rie" \
+              -p 9000:8080 \
+              --entrypoint /aws-lambda-rie/aws-lambda-rie \
+              "$IMG" \
+              $ENTRY $CMD_ARGS
+          fi
 
       - name: Invoke Lambda and check for import errors
         run: |

From 6c8080ef6203694db127edb9aa9b7824bbc76898 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 14 May 2026 16:57:31 +0000
Subject: [PATCH 50/91] smoke tests: copy backend/app/db/base.py into condition handler image

---
 backend/condition/handler/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/condition/handler/Dockerfile b/backend/condition/handler/Dockerfile
index 71556895..fa130573 100644
--- a/backend/condition/handler/Dockerfile
+++ b/backend/condition/handler/Dockerfile
@@ -32,6 +32,7 @@ COPY utils/ utils/
 COPY backend/condition/ backend/condition/
 
 COPY backend/app/db/models/condition.py backend/app/db/models/condition.py
+COPY backend/app/db/base.py backend/app/db/base.py
 COPY backend/app/db/connection.py backend/app/db/connection.py
 COPY backend/app/config.py backend/app/config.py
 

From eeb2f9eb20a4c65b639da09ed818c27ebe2501fd Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Fri, 15 May 2026 10:58:42 +0000
Subject: [PATCH 51/91] tweaks before PR: UPRN lookup logging, deal ID filter and DRY_RUN off for SQS trigger

---
 backend/pashub_fetcher/pashub_client.py       |  7 +++-
 .../pashub_to_ara_trigger_request.py          |  4 +--
 .../trigger_pashub_sqs_from_file.py           | 36 ++++++++++++++++++-
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index 7896664d..27342c25 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -74,6 +74,10 @@ class PashubClient:
         logger.info(f"Getting UPRN for job ID {job_id}")
         url = f"{self.base}/jobs/{job_id}"
 
+        logger.debug(
+            f"About to make API request with session headers: { self.session.headers}"
+        )
+
         r = self.session.get(url)
         if r.status_code == 401:
             raise UnauthorizedError("Token expired or invalid")
@@ -82,7 +86,8 @@ class PashubClient:
 
         try:
             return r.json()["uprn"]
-        except Exception:
+        except Exception as e:
+            logger.warning(f"Failed to get UPRN for Job ID {job_id}", e)
             return None
 
     def _select_latest_core_files(
diff --git a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
index 2e077c2e..715a09f8 100644
--- a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
+++ b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
@@ -4,9 +4,7 @@ from pydantic import BaseModel
 
 
 class PashubToAraTriggerRequest(BaseModel):
-    pashub_link: (
-        str  # e.g. https://pashub.net/jobs/{id}/details, /jobs/{id}/evidence/view, /jobs/{id}
-    )
+    pashub_link: str  # e.g. https://pashub.net/jobs/{id}/details, /jobs/{id}/evidence/view, /jobs/{id}
 
     address: Optional[str] = None
     sharepoint_link: Optional[str] = None
diff --git a/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py b/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py
index 24a29781..f4c03afc 100644
--- a/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py
+++ b/backend/pashub_fetcher/trigger_pashub_sqs_from_file.py
@@ -14,7 +14,36 @@ from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger: logging.Logger = logging.getLogger(__name__)
 
-DRY_RUN: bool = True
+DRY_RUN: bool = False
+
+DEAL_ID_FILTER: frozenset[str] = frozenset(
+    {
+        "379452094688",
+        "379466504437",
+        "379660170452",
+        "380016925932",
+        "379848065216",
+        "379466504434",
+        "379452094690",
+        "379965924567",
+        "380016925923",
+        "379792072898",
+        "379654754502",
+        "379560262861",
+        "379969670369",
+        "379248717001",
+        "379971468493",
+        "379999888607",
+        "379606372580",
+        "379969603797",
+        "379967743213",
+        "379263155434",
+        "379855267025",
+        "379889899719",
+        "379071064307",
+        "379867925741",
+    }
+)
 
 EXCEL_PATH: str = os.path.join(
     os.path.dirname(__file__),
@@ -75,6 +104,11 @@ def _build_requests(excel_path: str) -> list[PashubToAraTriggerRequest]:
 def main() -> None:
     trigger_requests: list[PashubToAraTriggerRequest] = _build_requests(EXCEL_PATH)
 
+    if DEAL_ID_FILTER:
+        trigger_requests = [
+            r for r in trigger_requests if r.hubspot_deal_id in DEAL_ID_FILTER
+        ]
+
     sqs: Any = cast(Any, boto3.client("sqs"))  # type: ignore[reportUnknownMemberType]
     queue_url: str = get_settings().PASHUB_TO_ARA_SQS_URL
 

From ad49bf9d85c95e480f9949f33eff3693d601e668 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Fri, 15 May 2026 11:00:58 +0000
Subject: [PATCH 52/91] tweak logs

---
 backend/pashub_fetcher/pashub_client.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py
index 27342c25..79d81838 100644
--- a/backend/pashub_fetcher/pashub_client.py
+++ b/backend/pashub_fetcher/pashub_client.py
@@ -75,7 +75,7 @@ class PashubClient:
         url = f"{self.base}/jobs/{job_id}"
 
         logger.debug(
-            f"About to make API request with session headers: { self.session.headers}"
+            f"About to make API request with session headers: {self.session.headers}"
         )
 
         r = self.session.get(url)
@@ -87,7 +87,9 @@ class PashubClient:
         try:
             return r.json()["uprn"]
         except Exception as e:
-            logger.warning(f"Failed to get UPRN for Job ID {job_id}", e)
+            logger.warning(
+                f"Failed to get UPRN for Job ID {job_id} with exception: {e}"
+            )
             return None
 
     def _select_latest_core_files(

From 6afd07600598a0a92883319345e4918aecd46cc1 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 15 May 2026 11:28:04 +0000
Subject: [PATCH 53/91] added 5 second rest every 100 tests

---
 backend/address2UPRN/tests/test_csv.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/backend/address2UPRN/tests/test_csv.py b/backend/address2UPRN/tests/test_csv.py
index 73d94388..5c97e691 100644
--- a/backend/address2UPRN/tests/test_csv.py
+++ b/backend/address2UPRN/tests/test_csv.py
@@ -12,12 +12,21 @@ FIXTURE_PATH = Path(__file__).parent / "test_data.csv"
 # Each parametrized case fires at least one EPC request; without throttling,
 # GitHub-hosted runners burst fast enough to hit 429s.
 EPC_THROTTLE_SECONDS = 1.0
+EPC_LONG_PAUSE_EVERY = 100
+EPC_LONG_PAUSE_SECONDS = 5.0
+
+_epc_request_count = 0
 
 
 @pytest.fixture(autouse=True)
 def _throttle_epc_requests():
+    global _epc_request_count
     yield
-    time.sleep(EPC_THROTTLE_SECONDS)
+    _epc_request_count += 1
+    if _epc_request_count % EPC_LONG_PAUSE_EVERY == 0:
+        time.sleep(EPC_LONG_PAUSE_SECONDS)
+    else:
+        time.sleep(EPC_THROTTLE_SECONDS)
 
 
 def load_test_cases():

From fce1e1008ab166f0637f926ffef0bbbf1d8a18f8 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 15 May 2026 16:00:02 +0000
Subject: [PATCH 54/91] added more test cases

---
 backend/address2UPRN/tests/test_data.csv | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/backend/address2UPRN/tests/test_data.csv b/backend/address2UPRN/tests/test_data.csv
index 408edc29..1c1ce58a 100644
--- a/backend/address2UPRN/tests/test_data.csv
+++ b/backend/address2UPRN/tests/test_data.csv
@@ -364,4 +364,7 @@ FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974
 164a Victoria Square,M4 5FA,77211315
 165a Victoria Square,M4 5FA,77211316
 166a Victoria Square,M4 5FA,None
-"FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY",CR2 7DL,None
\ No newline at end of file
+"FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY",CR2 7DL,None
+71A  Stoneleigh Avenue,NE12 8NP,None
+71B  Stoneleigh Avenue,NE12 8NP,None
+71  Stoneleigh Avenue,NE12 8NP,47086009
\ No newline at end of file

From a99972457864962e0a4be8b0f35ab5fb33eebeaa Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 09:05:54 +0000
Subject: [PATCH 55/91] =?UTF-8?q?PAS=20falls=20back=20to=20coordination=20?=
 =?UTF-8?q?client=20when=20UPRN=20lookup=20returns=20401=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_service.py      |  7 ++--
 .../tests/test_pashub_service.py              | 36 +++++++++++++++++--
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py
index b3302fd9..2b8f0926 100644
--- a/backend/pashub_fetcher/pashub_service.py
+++ b/backend/pashub_fetcher/pashub_service.py
@@ -1,6 +1,6 @@
 import os
 from datetime import datetime, timezone
-from typing import List, NamedTuple, Optional, cast
+from typing import Callable, List, NamedTuple, Optional, cast
 
 from backend.app.db.connection import db_session
 from backend.app.db.models.uploaded_file import (
@@ -11,7 +11,7 @@ from backend.app.db.models.uploaded_file import (
 from backend.documents_parser.db_writer import save_epc_property_data
 from backend.documents_parser.parser import parse_site_notes_pdf
 from backend.pashub_fetcher.core_files import get_file_type_string
-from backend.pashub_fetcher.pashub_client import PashubClient
+from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
 from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
     PashubToAraTriggerRequest,
 )
@@ -36,10 +36,13 @@ class PashubService:
         pashub_client: PashubClient,
         sharepoint_client: DomnaSharepointClient,
         s3_bucket: str,
+        coordination_client_factory: Optional[Callable[[], PashubClient]] = None,
     ) -> None:
         self._pashub_client = pashub_client
         self._sharepoint_client = sharepoint_client
         self._s3_bucket = s3_bucket
+        self._coordination_client_factory = coordination_client_factory
+        self._coordination_client: Optional[PashubClient] = None
 
     def run(self, request: PashubToAraTriggerRequest) -> List[str]:
         job_id = request.pashub_job_id
diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py
index 2aff416b..44c6af1a 100644
--- a/backend/pashub_fetcher/tests/test_pashub_service.py
+++ b/backend/pashub_fetcher/tests/test_pashub_service.py
@@ -1,8 +1,8 @@
-from typing import Optional
+from typing import Callable, Optional
 from unittest.mock import MagicMock, call, patch
 
 
-from backend.pashub_fetcher.pashub_client import PashubClient
+from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
 from backend.pashub_fetcher.pashub_service import PashubService
 from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
     PashubToAraTriggerRequest,
@@ -31,11 +31,13 @@ def make_service(
     pashub_client: Optional[PashubClient] = None,
     sharepoint_client: Optional[DomnaSharepointClient] = None,
     s3_bucket: str = "test-bucket",
+    coordination_client_factory: Optional[Callable[[], PashubClient]] = None,
 ) -> PashubService:
     return PashubService(
         pashub_client=pashub_client or MagicMock(spec=PashubClient),
         sharepoint_client=sharepoint_client or MagicMock(spec=DomnaSharepointClient),
         s3_bucket=s3_bucket,
+        coordination_client_factory=coordination_client_factory,
     )
 
 
@@ -225,6 +227,36 @@ def test_run_parses_and_saves_site_notes_for_rd_sap_site_note_file() -> None:
 # ---------------------------------------------------------------------------
 
 
+# ---------------------------------------------------------------------------
+# run(): coordination fallback
+# ---------------------------------------------------------------------------
+
+
+def test_run_uses_coordination_client_when_pas_401_on_uprn_lookup() -> None:
+    pas_client = MagicMock(spec=PashubClient)
+    pas_client.get_uprn_by_job_id.side_effect = UnauthorizedError()
+
+    coord_client = MagicMock(spec=PashubClient)
+    coord_client.get_uprn_by_job_id.return_value = "99999"
+    coord_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"]
+
+    factory = MagicMock(return_value=coord_client)
+
+    service = make_service(pashub_client=pas_client, coordination_client_factory=factory)
+
+    with (
+        patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"),
+        patch("backend.pashub_fetcher.pashub_service.db_session"),
+        patch("backend.pashub_fetcher.pashub_service.os.remove"),
+    ):
+        result = service.run(make_request())
+
+    assert result == ["/tmp/a.pdf"]
+    coord_client.get_uprn_by_job_id.assert_called_once()
+    coord_client.get_core_evidence_files_by_job_id.assert_called_once()
+    assert factory.call_count == 1
+
+
 def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None:
     mock_client = MagicMock(spec=PashubClient)
     mock_client.get_uprn_by_job_id.return_value = None

From e0446381925964872e91607bbc5135c60177d969 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 09:06:46 +0000
Subject: [PATCH 56/91] =?UTF-8?q?PAS=20falls=20back=20to=20coordination=20?=
 =?UTF-8?q?client=20when=20UPRN=20lookup=20returns=20401=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_service.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py
index 2b8f0926..0a5fb535 100644
--- a/backend/pashub_fetcher/pashub_service.py
+++ b/backend/pashub_fetcher/pashub_service.py
@@ -44,12 +44,26 @@ class PashubService:
         self._coordination_client_factory = coordination_client_factory
         self._coordination_client: Optional[PashubClient] = None
 
+    def _get_coordination_client(self) -> PashubClient:
+        if self._coordination_client_factory is None:
+            raise UnauthorizedError("No coordination client factory configured")
+        if self._coordination_client is None:
+            self._coordination_client = self._coordination_client_factory()
+        return self._coordination_client
+
     def run(self, request: PashubToAraTriggerRequest) -> List[str]:
         job_id = request.pashub_job_id
+        active_client = self._pashub_client
+
+        if request.uprn:
+            uprn: Optional[str] = request.uprn
+        else:
+            try:
+                uprn = active_client.get_uprn_by_job_id(job_id)
+            except UnauthorizedError:
+                active_client = self._get_coordination_client()
+                uprn = active_client.get_uprn_by_job_id(job_id)
 
-        uprn: Optional[str] = request.uprn or self._pashub_client.get_uprn_by_job_id(
-            job_id
-        )
         hubspot_deal_id: Optional[str] = request.hubspot_deal_id
 
         if uprn:
@@ -57,7 +71,7 @@ class PashubService:
         else:
             logger.info(f"No UPRN found for job {job_id}")
 
-        job_files: List[str] = self._pashub_client.get_core_evidence_files_by_job_id(
+        job_files: List[str] = active_client.get_core_evidence_files_by_job_id(
             job_id
         )
 

From d49bd3620e2040af82ab737bf7bac3f58daf134c Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 09:08:47 +0000
Subject: [PATCH 57/91] =?UTF-8?q?PAS=20falls=20back=20to=20coordination=20?=
 =?UTF-8?q?client=20when=20file=20listing=20returns=20401=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_service.py              | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py
index 44c6af1a..dd8ad0a8 100644
--- a/backend/pashub_fetcher/tests/test_pashub_service.py
+++ b/backend/pashub_fetcher/tests/test_pashub_service.py
@@ -257,6 +257,29 @@ def test_run_uses_coordination_client_when_pas_401_on_uprn_lookup() -> None:
     assert factory.call_count == 1
 
 
+def test_run_uses_coordination_client_when_pas_401_on_file_listing() -> None:
+    pas_client = MagicMock(spec=PashubClient)
+    pas_client.get_core_evidence_files_by_job_id.side_effect = UnauthorizedError()
+
+    coord_client = MagicMock(spec=PashubClient)
+    coord_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"]
+
+    factory = MagicMock(return_value=coord_client)
+
+    service = make_service(pashub_client=pas_client, coordination_client_factory=factory)
+
+    with (
+        patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"),
+        patch("backend.pashub_fetcher.pashub_service.db_session"),
+        patch("backend.pashub_fetcher.pashub_service.os.remove"),
+    ):
+        result = service.run(make_request(uprn="12345"))
+
+    assert result == ["/tmp/a.pdf"]
+    coord_client.get_core_evidence_files_by_job_id.assert_called_once()
+    pas_client.get_uprn_by_job_id.assert_not_called()
+
+
 def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None:
     mock_client = MagicMock(spec=PashubClient)
     mock_client.get_uprn_by_job_id.return_value = None

From 0c1ecabf2f88ed0d2a519fc1e3b474ceb0b5a6f7 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 09:09:18 +0000
Subject: [PATCH 58/91] =?UTF-8?q?PAS=20falls=20back=20to=20coordination=20?=
 =?UTF-8?q?client=20when=20file=20listing=20returns=20401=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_service.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py
index 0a5fb535..b33b9dcf 100644
--- a/backend/pashub_fetcher/pashub_service.py
+++ b/backend/pashub_fetcher/pashub_service.py
@@ -71,9 +71,15 @@ class PashubService:
         else:
             logger.info(f"No UPRN found for job {job_id}")
 
-        job_files: List[str] = active_client.get_core_evidence_files_by_job_id(
-            job_id
-        )
+        try:
+            job_files: List[str] = active_client.get_core_evidence_files_by_job_id(
+                job_id
+            )
+        except UnauthorizedError:
+            if active_client is not self._pashub_client:
+                raise
+            active_client = self._get_coordination_client()
+            job_files = active_client.get_core_evidence_files_by_job_id(job_id)
 
         if uprn or hubspot_deal_id:
             logger.info("Uploading files to s3")

From 5a29866245fefae3ac5b4aee6ddba1d09ce7eb1d Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 09:12:19 +0000
Subject: [PATCH 59/91] =?UTF-8?q?PAS=20raises=20UnauthorizedError=20when?=
 =?UTF-8?q?=20401=20received=20with=20no=20coordination=20factory=20config?=
 =?UTF-8?q?ured=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_pashub_service.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py
index dd8ad0a8..ff4a8977 100644
--- a/backend/pashub_fetcher/tests/test_pashub_service.py
+++ b/backend/pashub_fetcher/tests/test_pashub_service.py
@@ -1,3 +1,4 @@
+import pytest
 from typing import Callable, Optional
 from unittest.mock import MagicMock, call, patch
 
@@ -280,6 +281,16 @@ def test_run_uses_coordination_client_when_pas_401_on_file_listing() -> None:
     pas_client.get_uprn_by_job_id.assert_not_called()
 
 
+def test_run_raises_unauthorized_when_pas_401_and_no_factory() -> None:
+    pas_client = MagicMock(spec=PashubClient)
+    pas_client.get_uprn_by_job_id.side_effect = UnauthorizedError()
+
+    service = make_service(pashub_client=pas_client)
+
+    with pytest.raises(UnauthorizedError):
+        service.run(make_request())
+
+
 def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None:
     mock_client = MagicMock(spec=PashubClient)
     mock_client.get_uprn_by_job_id.return_value = None

From dcff529219103ed2bfb0faf5a58e0be814683d8d Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 09:13:51 +0000
Subject: [PATCH 60/91] =?UTF-8?q?UnauthorizedError=20propagates=20when=20b?=
 =?UTF-8?q?oth=20PAS=20and=20coordination=20clients=20return=20401=20?=
 =?UTF-8?q?=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../pashub_fetcher/tests/test_pashub_service.py   | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py
index ff4a8977..991d2a46 100644
--- a/backend/pashub_fetcher/tests/test_pashub_service.py
+++ b/backend/pashub_fetcher/tests/test_pashub_service.py
@@ -291,6 +291,21 @@ def test_run_raises_unauthorized_when_pas_401_and_no_factory() -> None:
         service.run(make_request())
 
 
+def test_run_raises_unauthorized_when_both_clients_401() -> None:
+    pas_client = MagicMock(spec=PashubClient)
+    pas_client.get_uprn_by_job_id.side_effect = UnauthorizedError()
+
+    coord_client = MagicMock(spec=PashubClient)
+    coord_client.get_uprn_by_job_id.side_effect = UnauthorizedError()
+
+    factory = MagicMock(return_value=coord_client)
+
+    service = make_service(pashub_client=pas_client, coordination_client_factory=factory)
+
+    with pytest.raises(UnauthorizedError):
+        service.run(make_request())
+
+
 def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None:
     mock_client = MagicMock(spec=PashubClient)
     mock_client.get_uprn_by_job_id.return_value = None

From 4cd59768c38e2f2a5ae90cb6bde000c40b6646d3 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 09:22:32 +0000
Subject: [PATCH 61/91] =?UTF-8?q?Wire=20coordination=20account=20fallback?=
 =?UTF-8?q?=20into=20config=20and=20handler,=20remove=20token-refresh=20re?=
 =?UTF-8?q?try=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/app/config.py                     |  2 ++
 backend/pashub_fetcher/handler/handler.py | 43 +++++++++++++----------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/backend/app/config.py b/backend/app/config.py
index bdfc9ace..fcfb6d5b 100644
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -86,6 +86,8 @@ class Settings(BaseSettings):
     # Pas Hub
     PASHUB_EMAIL: Optional[str] = None
     PASHUB_PASSWORD: Optional[str] = None
+    PASHUB_COORDINATION_EMAIL: Optional[str] = None
+    PASHUB_COORDINATION_PASSWORD: Optional[str] = None
 
     # Optional AWS creds (only required in local)
     AWS_ACCESS_KEY_ID: Optional[str] = None
diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py
index cd0c8113..626ce59d 100644
--- a/backend/pashub_fetcher/handler/handler.py
+++ b/backend/pashub_fetcher/handler/handler.py
@@ -1,9 +1,11 @@
-from typing import Any, Dict, List
+from typing import Any, Callable, Dict, List, Optional
 
 from backend.app.config import get_settings
-from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
+from backend.pashub_fetcher.pashub_client import PashubClient
 from backend.pashub_fetcher.pashub_service import PashubService
-from backend.pashub_fetcher.pashub_to_ara_trigger_request import PashubToAraTriggerRequest
+from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
+    PashubToAraTriggerRequest,
+)
 from backend.pashub_fetcher.token_getter import get_token_from_local_storage
 from backend.app.db.models.tasks import SourceEnum
 from backend.utils.subtasks import task_handler
@@ -28,38 +30,41 @@ def handler(body: Dict[str, Any], context: Any) -> List[str]:
 
     settings = get_settings()
 
-    pas_hub_email = settings.PASHUB_EMAIL
-    pas_hub_password = settings.PASHUB_PASSWORD
+    pashub_email = settings.PASHUB_EMAIL
+    pashub_password = settings.PASHUB_PASSWORD
 
-    if (not pas_hub_email) or (not pas_hub_password):
+    coordination_hub_email = settings.PASHUB_COORDINATION_EMAIL
+    coordination_hub_password = settings.PASHUB_COORDINATION_PASSWORD
+    coordination_client_factory: Optional[Callable[[], PashubClient]] = None
+
+    if (not pashub_email) or (not pashub_password):
         raise ValueError("Pas Hub credentials not provided")
 
     sharepoint_client = DomnaSharepointClient(
         sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3
     )
 
+    if coordination_hub_email and coordination_hub_password:
+        _coord_email, _coord_password = (
+            coordination_hub_email,
+            coordination_hub_password,
+        )
+        coordination_client_factory = lambda: get_pashub_client(
+            _coord_email, _coord_password
+        )
+
     logger.debug("Validating request body")
     payload = PashubToAraTriggerRequest.model_validate(body)
     logger.debug("Successfully validated request body")
 
     service = PashubService(
-        pashub_client=get_pashub_client(pas_hub_email, pas_hub_password),
+        pashub_client=get_pashub_client(pashub_email, pashub_password),
         sharepoint_client=sharepoint_client,
         s3_bucket=S3_BUCKET,
+        coordination_client_factory=coordination_client_factory,
     )
 
-    try:
-        files: List[str] = service.run(payload)
-    except UnauthorizedError:
-        logger.warning("Token expired - refreshing")
-
-        service = PashubService(
-            pashub_client=get_pashub_client(pas_hub_email, pas_hub_password),
-            sharepoint_client=sharepoint_client,
-            s3_bucket=S3_BUCKET,
-        )
-
-        files = service.run(payload)
+    files: List[str] = service.run(payload)
 
     logger.info(f"Saved {len(files)} files")
 

From 3a7a00051d159d7672c29357e664ebc9a2f165a2 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 09:34:34 +0000
Subject: [PATCH 62/91] Add PasHub coordination credentials to deployment pipeline

---
 .github/workflows/_deploy_lambda.yml                 |  8 ++++++++
 .github/workflows/deploy_terraform.yml               |  2 ++
 .../terraform/lambda/pashub_to_ara/main.tf           |  2 ++
 .../terraform/lambda/pashub_to_ara/variables.tf      | 12 ++++++++++++
 4 files changed, 24 insertions(+)

diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml
index 1cc7d462..0d702155 100644
--- a/.github/workflows/_deploy_lambda.yml
+++ b/.github/workflows/_deploy_lambda.yml
@@ -80,6 +80,10 @@ on:
         required: false
       TF_VAR_pashub_password:
         required: false
+      TF_VAR_pashub_coordination_email:
+        required: false
+      TF_VAR_pashub_coordination_password:
+        required: false
       TF_VAR_hubspot_api_key:
         required: false
 
@@ -154,6 +158,8 @@ jobs:
           TF_VAR_social_housing_wave_3_sharepoint_id: ${{ secrets.TF_VAR_social_housing_wave_3_sharepoint_id }}
           TF_VAR_pashub_email: ${{ secrets.TF_VAR_pashub_email }}
           TF_VAR_pashub_password: ${{ secrets.TF_VAR_pashub_password }}
+          TF_VAR_pashub_coordination_email: ${{ secrets.TF_VAR_pashub_coordination_email }}
+          TF_VAR_pashub_coordination_password: ${{ secrets.TF_VAR_pashub_coordination_password }}
           TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }}
           TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }}
           TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }}
@@ -202,6 +208,8 @@ jobs:
           TF_VAR_social_housing_wave_3_sharepoint_id: ${{ secrets.TF_VAR_social_housing_wave_3_sharepoint_id }}
           TF_VAR_pashub_email: ${{ secrets.TF_VAR_pashub_email }}
           TF_VAR_pashub_password: ${{ secrets.TF_VAR_pashub_password }}
+          TF_VAR_pashub_coordination_email: ${{ secrets.TF_VAR_pashub_coordination_email }}
+          TF_VAR_pashub_coordination_password: ${{ secrets.TF_VAR_pashub_coordination_password }}
           TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }}
           TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }}
           TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }}
diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index e0343974..bd014e3d 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -407,6 +407,8 @@ jobs:
       TF_VAR_social_housing_wave_3_sharepoint_id: ${{ secrets.SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID }}
       TF_VAR_pashub_email: ${{ secrets.PASHUB_EMAIL }}
       TF_VAR_pashub_password: ${{ secrets.PASHUB_PASSWORD }}
+      TF_VAR_pashub_coordination_email: ${{ secrets.PASHUB_COORDINATION_EMAIL }}
+      TF_VAR_pashub_coordination_password: ${{ secrets.PASHUB_COORDINATION_PASSWORD }}
 
 
   # ============================================================
diff --git a/infrastructure/terraform/lambda/pashub_to_ara/main.tf b/infrastructure/terraform/lambda/pashub_to_ara/main.tf
index 902d7845..eba9c874 100644
--- a/infrastructure/terraform/lambda/pashub_to_ara/main.tf
+++ b/infrastructure/terraform/lambda/pashub_to_ara/main.tf
@@ -49,6 +49,8 @@ module "lambda" {
     SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID = var.social_housing_wave_3_sharepoint_id
     PASHUB_EMAIL                        = var.pashub_email
     PASHUB_PASSWORD                     = var.pashub_password
+    PASHUB_COORDINATION_EMAIL           = var.pashub_coordination_email
+    PASHUB_COORDINATION_PASSWORD        = var.pashub_coordination_password
   }
 }
 
diff --git a/infrastructure/terraform/lambda/pashub_to_ara/variables.tf b/infrastructure/terraform/lambda/pashub_to_ara/variables.tf
index 0e99d378..cdeff256 100644
--- a/infrastructure/terraform/lambda/pashub_to_ara/variables.tf
+++ b/infrastructure/terraform/lambda/pashub_to_ara/variables.tf
@@ -100,4 +100,16 @@ variable "pashub_email" {
 variable "pashub_password" {
   type      = string
   sensitive = true
+}
+
+variable "pashub_coordination_email" {
+  type      = string
+  sensitive = true
+  default   = null
+}
+
+variable "pashub_coordination_password" {
+  type      = string
+  sensitive = true
+  default   = null
 }
\ No newline at end of file

From 770493ff9ec751073a3d3b798e51021252e2f10f Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Mon, 18 May 2026 11:51:48 +0000
Subject: [PATCH 63/91] Add logging around CoordinationHub credential fallback

---
 backend/pashub_fetcher/pashub_service.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py
index b33b9dcf..13498a32 100644
--- a/backend/pashub_fetcher/pashub_service.py
+++ b/backend/pashub_fetcher/pashub_service.py
@@ -60,7 +60,9 @@ class PashubService:
         else:
             try:
                 uprn = active_client.get_uprn_by_job_id(job_id)
+                logger.info(f"Failed to access job {job_id} with PasHub credentials")
             except UnauthorizedError:
+                logger.info(f"Trying CoordinationHub credentials for job {job_id}")
                 active_client = self._get_coordination_client()
                 uprn = active_client.get_uprn_by_job_id(job_id)
 

From dc3543ac5f655c7f8ec9a76dad12cf014bf94621 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Tue, 19 May 2026 11:07:41 +0000
Subject: [PATCH 64/91] =?UTF-8?q?Coordination=20Hub=20fallback=20stores=20?=
 =?UTF-8?q?correct=20file=5Fsource=20in=20DB=20=F0=9F=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/app/db/models/uploaded_file.py        |  1 +
 .../tests/test_pashub_service.py              | 29 ++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py
index f3cfee79..b6a73d5d 100644
--- a/backend/app/db/models/uploaded_file.py
+++ b/backend/app/db/models/uploaded_file.py
@@ -25,6 +25,7 @@ class FileTypeEnum(enum.Enum):
 
 class FileSourceEnum(enum.Enum):
     PAS_HUB = "pas hub"
+    COORDINATION_HUB = "coordination_hub"
     SHAREPOINT = "sharepoint"
     HUBSPOT = "hubspot"
     ECMK = "ecmk"
diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py
index 991d2a46..1d6d167f 100644
--- a/backend/pashub_fetcher/tests/test_pashub_service.py
+++ b/backend/pashub_fetcher/tests/test_pashub_service.py
@@ -1,8 +1,9 @@
 import pytest
-from typing import Callable, Optional
+from typing import Any, Callable, Optional
 from unittest.mock import MagicMock, call, patch
 
 
+from backend.app.db.models.uploaded_file import FileSourceEnum
 from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
 from backend.pashub_fetcher.pashub_service import PashubService
 from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
@@ -306,6 +307,32 @@ def test_run_raises_unauthorized_when_both_clients_401() -> None:
         service.run(make_request())
 
 
+def test_run_persists_coordination_hub_file_source_when_pas_401_on_uprn_lookup() -> None:
+    pas_client = MagicMock(spec=PashubClient)
+    pas_client.get_uprn_by_job_id.side_effect = UnauthorizedError()
+
+    coord_client = MagicMock(spec=PashubClient)
+    coord_client.get_uprn_by_job_id.return_value = "99999"
+    coord_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"]
+
+    factory = MagicMock(return_value=coord_client)
+    fake_session = MagicMock()
+
+    service = make_service(pashub_client=pas_client, coordination_client_factory=factory)
+
+    with (
+        patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"),
+        patch("backend.pashub_fetcher.pashub_service.db_session") as mock_db,
+        patch("backend.pashub_fetcher.pashub_service.os.remove"),
+    ):
+        mock_db.return_value.__enter__.return_value = fake_session
+        service.run(make_request())
+
+    fake_session.add_all.assert_called_once()
+    added: list[Any] = fake_session.add_all.call_args[0][0]
+    assert added[0].file_source == FileSourceEnum.COORDINATION_HUB.value
+
+
 def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None:
     mock_client = MagicMock(spec=PashubClient)
     mock_client.get_uprn_by_job_id.return_value = None

From 1e115ba3dee7a63f9b2ebe4e56fb2f4a22da03f7 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Tue, 19 May 2026 11:09:01 +0000
Subject: [PATCH 65/91] =?UTF-8?q?Coordination=20Hub=20fallback=20stores=20?=
 =?UTF-8?q?correct=20file=5Fsource=20in=20DB=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/pashub_service.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py
index 13498a32..f7f6ccd9 100644
--- a/backend/pashub_fetcher/pashub_service.py
+++ b/backend/pashub_fetcher/pashub_service.py
@@ -60,9 +60,10 @@ class PashubService:
         else:
             try:
                 uprn = active_client.get_uprn_by_job_id(job_id)
-                logger.info(f"Failed to access job {job_id} with PasHub credentials")
             except UnauthorizedError:
-                logger.info(f"Trying CoordinationHub credentials for job {job_id}")
+                logger.info(
+                    f"PasHub credentials unauthorized for job {job_id}; retrying with CoordinationHub credentials"
+                )
                 active_client = self._get_coordination_client()
                 uprn = active_client.get_uprn_by_job_id(job_id)
 
@@ -85,8 +86,13 @@ class PashubService:
 
         if uprn or hubspot_deal_id:
             logger.info("Uploading files to s3")
+            file_source = (
+                FileSourceEnum.PAS_HUB
+                if active_client is self._pashub_client
+                else FileSourceEnum.COORDINATION_HUB
+            )
             upload_records = self._upload_to_s3_and_update_db(
-                job_files, uprn, hubspot_deal_id
+                job_files, uprn, hubspot_deal_id, file_source
             )
             self._save_site_notes(upload_records)
 
@@ -108,6 +114,7 @@ class PashubService:
         job_files: List[str],
         uprn: Optional[str],
         hubspot_deal_id: Optional[str],
+        file_source: FileSourceEnum,
     ) -> List[_FileUploadRecord]:
         if not uprn and not hubspot_deal_id:
             return []
@@ -133,7 +140,7 @@ class PashubService:
                 s3_upload_timestamp=datetime.now(timezone.utc),
                 uprn=int(uprn) if uprn else None,
                 hubspot_deal_id=hubspot_deal_id,
-                file_source=FileSourceEnum.PAS_HUB.value,
+                file_source=file_source.value,
                 file_type=get_file_type_string(filename),
             )
             file_paths.append(file_path)

From a4ad1ca11c90f8ff5e2080977b0567ab2ff8e269 Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Tue, 19 May 2026 11:10:18 +0000
Subject: [PATCH 66/91] =?UTF-8?q?Coordination=20Hub=20file=20listing=20fal?=
 =?UTF-8?q?lback=20stores=20correct=20file=5Fsource=20in=20DB=20?=
 =?UTF-8?q?=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_pashub_service.py              | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py
index 1d6d167f..cf1c489a 100644
--- a/backend/pashub_fetcher/tests/test_pashub_service.py
+++ b/backend/pashub_fetcher/tests/test_pashub_service.py
@@ -333,6 +333,31 @@ def test_run_persists_coordination_hub_file_source_when_pas_401_on_uprn_lookup()
     assert added[0].file_source == FileSourceEnum.COORDINATION_HUB.value
 
 
+def test_run_persists_coordination_hub_file_source_when_pas_401_on_file_listing() -> None:
+    pas_client = MagicMock(spec=PashubClient)
+    pas_client.get_core_evidence_files_by_job_id.side_effect = UnauthorizedError()
+
+    coord_client = MagicMock(spec=PashubClient)
+    coord_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"]
+
+    factory = MagicMock(return_value=coord_client)
+    fake_session = MagicMock()
+
+    service = make_service(pashub_client=pas_client, coordination_client_factory=factory)
+
+    with (
+        patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"),
+        patch("backend.pashub_fetcher.pashub_service.db_session") as mock_db,
+        patch("backend.pashub_fetcher.pashub_service.os.remove"),
+    ):
+        mock_db.return_value.__enter__.return_value = fake_session
+        service.run(make_request(uprn="12345"))
+
+    fake_session.add_all.assert_called_once()
+    added: list[Any] = fake_session.add_all.call_args[0][0]
+    assert added[0].file_source == FileSourceEnum.COORDINATION_HUB.value
+
+
 def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None:
     mock_client = MagicMock(spec=PashubClient)
     mock_client.get_uprn_by_job_id.return_value = None

From 20ad0616bcdc32eb24abee7bb05f4f707475e00b Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Tue, 19 May 2026 11:10:45 +0000
Subject: [PATCH 67/91] =?UTF-8?q?PAS=20Hub=20happy=20path=20asserts=20file?=
 =?UTF-8?q?=5Fsource=20"pas=20hub"=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/pashub_fetcher/tests/test_pashub_service.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py
index cf1c489a..1f750117 100644
--- a/backend/pashub_fetcher/tests/test_pashub_service.py
+++ b/backend/pashub_fetcher/tests/test_pashub_service.py
@@ -148,10 +148,11 @@ def test_run_persists_uploaded_file_records_to_db() -> None:
         service.run(make_request(uprn="12345"))
 
     fake_session.add_all.assert_called_once()
-    added: list = fake_session.add_all.call_args[0][0]
+    added: list[Any] = fake_session.add_all.call_args[0][0]
     assert len(added) == 1
     assert added[0].s3_file_bucket == "test-bucket"
     assert added[0].uprn == 12345
+    assert added[0].file_source == FileSourceEnum.PAS_HUB.value
 
 
 # ---------------------------------------------------------------------------

From bc8ca3ead36e13b71272845baf80d01adee63e41 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 19 May 2026 12:55:30 +0000
Subject: [PATCH 68/91] deployment from infrastructure

---
 .dockerignore                                 |  2 +-
 .github/workflows/deploy_terraform.yml        | 46 +++++++++----------
 Dockerfile.test.dockerignore                  |  2 +-
 .../terraform/lambda/_template/README.md      |  2 +-
 .../lambda_with_api_gateway/variables.tf      |  2 +-
 5 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 0c7d7749..90436ffc 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -6,7 +6,7 @@ backend/.idea/*
 backend/.env
 recommendations/tests/*
 model_data/tests/*
-infrastructure/*
+deployment/*
 data_collection/*
 node_modules/*
 conservation_areas/*
diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index bd014e3d..923fc0a9 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -62,20 +62,20 @@ jobs:
       - uses: hashicorp/setup-terraform@v3
 
       - name: Terraform Init
-        working-directory: infrastructure/terraform/shared
+        working-directory: deployment/terraform/shared
         run: terraform init -reconfigure
 
       - name: Terraform Workspace
-        working-directory: infrastructure/terraform/shared
+        working-directory: deployment/terraform/shared
         run: terraform workspace select ${STAGE} || terraform workspace new ${STAGE}
 
       - name: Terraform Plan
-        working-directory: infrastructure/terraform/shared
+        working-directory: deployment/terraform/shared
         run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan
 
       - name: Terraform Apply
         if: env.TERRAFORM_APPLY == 'true'
-        working-directory: infrastructure/terraform/shared
+        working-directory: deployment/terraform/shared
         run: terraform apply -auto-approve tfplan
 
   # ============================================================
@@ -101,7 +101,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: ara_engine
-      lambda_path: infrastructure/terraform/lambda/engine
+      lambda_path: deployment/terraform/lambda/engine
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: engine-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.ara_engine_image.outputs.image_digest }}
@@ -150,7 +150,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: address2uprn
-      lambda_path: infrastructure/terraform/lambda/address2UPRN
+      lambda_path: deployment/terraform/lambda/address2UPRN
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.address2uprn_image.outputs.image_digest }}
@@ -191,7 +191,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: postcodeSplitter
-      lambda_path: infrastructure/terraform/lambda/postcodeSplitter
+      lambda_path: deployment/terraform/lambda/postcodeSplitter
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }}
@@ -231,7 +231,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: bulk_address2uprn_combiner
-      lambda_path: infrastructure/terraform/lambda/bulk_address2uprn_combiner
+      lambda_path: deployment/terraform/lambda/bulk_address2uprn_combiner
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: bulk_address2uprn_combiner-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.bulk_address2uprn_combiner_image.outputs.image_digest }}
@@ -271,7 +271,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: condition-etl
-      lambda_path: infrastructure/terraform/lambda/condition-etl
+      lambda_path: deployment/terraform/lambda/condition-etl
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.condition_etl_image.outputs.image_digest }}
@@ -311,7 +311,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: categorisation
-      lambda_path: infrastructure/terraform/lambda/categorisation
+      lambda_path: deployment/terraform/lambda/categorisation
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: categorisation-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.categorisation_image.outputs.image_digest }}
@@ -351,7 +351,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: ordnanceSurvey
-      lambda_path: infrastructure/terraform/lambda/ordnanceSurvey
+      lambda_path: deployment/terraform/lambda/ordnanceSurvey
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.ordnanceSurvey_image.outputs.image_digest }}
@@ -386,7 +386,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: pashub_to_ara
-      lambda_path: infrastructure/terraform/lambda/pashub_to_ara
+      lambda_path: deployment/terraform/lambda/pashub_to_ara
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: pashub_to_ara-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.pashub_to_ara_image.outputs.image_digest }}
@@ -419,7 +419,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: ara_fast_api
-      lambda_path: infrastructure/terraform/lambda/fast-api
+      lambda_path: deployment/terraform/lambda/fast-api
       stage: ${{ needs.determine_stage.outputs.stage }}
       terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
     secrets:
@@ -458,17 +458,17 @@ jobs:
       - uses: hashicorp/setup-terraform@v3
 
       - name: Terraform Init
-        working-directory: infrastructure/terraform/cdn_certificate
+        working-directory: deployment/terraform/cdn_certificate
         run: terraform init -reconfigure
 
       - name: Terraform Workspace
-        working-directory: infrastructure/terraform/cdn_certificate
+        working-directory: deployment/terraform/cdn_certificate
         run: |
           terraform workspace select $STAGE \
             || terraform workspace new $STAGE
 
       - name: Terraform Plan
-        working-directory: infrastructure/terraform/cdn_certificate
+        working-directory: deployment/terraform/cdn_certificate
         run: |
           terraform plan \
             -var="stage=${STAGE}" \
@@ -476,7 +476,7 @@ jobs:
 
       - name: Terraform Apply
         if: env.TERRAFORM_APPLY == 'true'
-        working-directory: infrastructure/terraform/cdn_certificate
+        working-directory: deployment/terraform/cdn_certificate
         run: terraform apply -auto-approve tfplan
 
 
@@ -503,17 +503,17 @@ jobs:
       - uses: hashicorp/setup-terraform@v3
 
       - name: Terraform Init
-        working-directory: infrastructure/terraform/cdn
+        working-directory: deployment/terraform/cdn
         run: terraform init -reconfigure
 
       - name: Terraform Workspace
-        working-directory: infrastructure/terraform/cdn
+        working-directory: deployment/terraform/cdn
         run: |
           terraform workspace select $STAGE \
             || terraform workspace new $STAGE
 
       - name: Terraform Plan
-        working-directory: infrastructure/terraform/cdn
+        working-directory: deployment/terraform/cdn
         run: |
           terraform plan \
             -var="stage=${STAGE}" \
@@ -521,7 +521,7 @@ jobs:
 
       - name: Terraform Apply
         if: env.TERRAFORM_APPLY == 'true'
-        working-directory: infrastructure/terraform/cdn
+        working-directory: deployment/terraform/cdn
         run: terraform apply -auto-approve tfplan
 
   # ============================================================
@@ -562,7 +562,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: magic_plan
-      lambda_path: infrastructure/terraform/lambda/magic_plan
+      lambda_path: deployment/terraform/lambda/magic_plan
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: magic-plan-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.magic_plan_image.outputs.image_digest }}
@@ -585,7 +585,7 @@ jobs:
     uses: ./.github/workflows/_deploy_lambda.yml
     with:
       lambda_name: hubspot-etl-to-ara
-      lambda_path: infrastructure/terraform/lambda/hubspot_deal_etl
+      lambda_path: deployment/terraform/lambda/hubspot_deal_etl
       stage: ${{ needs.determine_stage.outputs.stage }}
       ecr_repo: hubspot-etl-${{ needs.determine_stage.outputs.stage }}
       image_digest: ${{ needs.hubspot_etl_image.outputs.image_digest }}
diff --git a/Dockerfile.test.dockerignore b/Dockerfile.test.dockerignore
index 4f79c6ee..ed05c399 100644
--- a/Dockerfile.test.dockerignore
+++ b/Dockerfile.test.dockerignore
@@ -4,7 +4,7 @@ model_data/local_data/
 backend/node_modules/
 backend/.idea/
 backend/.env
-infrastructure/
+deployment/
 data_collection/
 node_modules/
 conservation_areas/
diff --git a/infrastructure/terraform/lambda/_template/README.md b/infrastructure/terraform/lambda/_template/README.md
index 5bb10627..f2a8638a 100644
--- a/infrastructure/terraform/lambda/_template/README.md
+++ b/infrastructure/terraform/lambda/_template/README.md
@@ -10,7 +10,7 @@
 ### 2. Add infrastructure prerequisites (shared stack)
 - Add a new ECR repository in:
 
-  infrastructure/terraform/shared/main.tf
+  deployment/terraform/shared/main.tf
 
 - Create a PR to deploy this to main then dev in order to deploy the shared stack
 
diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf b/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf
index 95e5acd9..b5d0515a 100644
--- a/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf
+++ b/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf
@@ -11,7 +11,7 @@ variable "zip_excludes" {
     "**/*.pyc",
     "**/.pytest_cache/**",
     "**/tests/**",
-    "**/infrastructure/**"
+    "**/deployment/**"
   ]
 }
 

From 54a674b5c88bd7907e77ac83db755c47ff4d8028 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 19 May 2026 16:35:09 +0000
Subject: [PATCH 69/91] added postcode splitter rewrite to ddd

---
 AGENTS.md                                     |  29 ----
 CLAUDE.md                                     |  29 ----
 asset_list/app.py                             |  24 ++-
 .../terraform/README.md                       |   0
 .../terraform/cdn/main.tf                     |   0
 .../terraform/cdn/provider.tf                 |   0
 .../terraform/cdn/variables.tf                |   0
 .../terraform/cdn_certificate/main.tf         |   0
 .../terraform/cdn_certificate/outputs.tf      |   0
 .../terraform/cdn_certificate/provider.tf     |   0
 .../terraform/cdn_certificate/variables.tf    |   0
 .../terraform/lambda/_template/README.md      |   0
 .../terraform/lambda/_template/main.tf        |   0
 .../terraform/lambda/_template/provider.tf    |   0
 .../terraform/lambda/_template/variables.tf   |   0
 .../terraform/lambda/address2UPRN/main.tf     |   0
 .../terraform/lambda/address2UPRN/outputs.tf  |   0
 .../terraform/lambda/address2UPRN/provider.tf |   0
 .../lambda/address2UPRN/variables.tf          |   0
 .../lambda/bulk_address2uprn_combiner/main.tf |   0
 .../bulk_address2uprn_combiner/outputs.tf     |   0
 .../bulk_address2uprn_combiner/provider.tf    |   0
 .../bulk_address2uprn_combiner/variables.tf   |   0
 .../terraform/lambda/categorisation/main.tf   |   0
 .../lambda/categorisation/outputs.tf          |   0
 .../lambda/categorisation/provider.tf         |   0
 .../lambda/categorisation/variables.tf        |   0
 .../terraform/lambda/condition-etl/main.tf    |   0
 .../lambda/condition-etl/provider.tf          |   0
 .../lambda/condition-etl/variables.tf         |   0
 .../terraform/lambda/ecmk_to_ara/main.tf      |   0
 .../terraform/lambda/ecmk_to_ara/provider.tf  |   0
 .../terraform/lambda/ecmk_to_ara/variables.tf |   0
 .../terraform/lambda/engine/main.tf           |   0
 .../terraform/lambda/engine/outputs.tf        |   0
 .../terraform/lambda/engine/provider.tf       |   0
 .../terraform/lambda/engine/variables.tf      |   0
 .../terraform/lambda/fast-api/main.tf         |   0
 .../terraform/lambda/fast-api/outputs.tf      |   0
 .../terraform/lambda/fast-api/provider.tf     |   0
 .../terraform/lambda/fast-api/variables.tf    |   0
 .../terraform/lambda/hubspot_deal_etl/main.tf |   0
 .../lambda/hubspot_deal_etl/provider.tf       |   0
 .../lambda/hubspot_deal_etl/variables.tf      |   0
 .../terraform/lambda/magic_plan/main.tf       |   0
 .../terraform/lambda/magic_plan/outputs.tf    |   0
 .../terraform/lambda/magic_plan/provider.tf   |   0
 .../terraform/lambda/magic_plan/variables.tf  |   0
 .../terraform/lambda/ordnanceSurvey/main.tf   |   0
 .../lambda/ordnanceSurvey/provider.tf         |   0
 .../lambda/ordnanceSurvey/variables.tf        |   0
 .../terraform/lambda/pashub_to_ara/main.tf    |   0
 .../terraform/lambda/pashub_to_ara/outputs.tf |   0
 .../lambda/pashub_to_ara/provider.tf          |   0
 .../lambda/pashub_to_ara/variables.tf         |   0
 .../terraform/lambda/postcodeSplitter/main.tf |   0
 .../lambda/postcodeSplitter/outputs.tf        |   0
 .../lambda/postcodeSplitter/provider.tf       |   0
 .../lambda/postcodeSplitter/variables.tf      |   0
 .../terraform/modules/acm_certificate/main.tf |   0
 .../modules/acm_certificate/outputs.tf        |   0
 .../modules/acm_certificate/variables.tf      |   0
 .../terraform/modules/cloudfront/main.tf      |   0
 .../terraform/modules/cloudfront/variables.tf |   0
 .../modules/container_registry/main.tf        |   0
 .../modules/container_registry/outputs.tf     |   0
 .../modules/container_registry/variables.tf   |   0
 .../terraform/modules/ecr/main.tf             |   0
 .../terraform/modules/ecr/outputs.tf          |   0
 .../terraform/modules/ecr/variables.tf        |   0
 .../modules/general_iam_policy/main.tf        |   0
 .../modules/general_iam_policy/outputs.tf     |   0
 .../modules/general_iam_policy/variables.tf   |   0
 .../modules/lambda_execution_role/main.tf     |   0
 .../modules/lambda_execution_role/outputs.tf  |   0
 .../lambda_execution_role/variables.tf        |   0
 .../terraform/modules/lambda_service/main.tf  |   0
 .../modules/lambda_service/outputs.tf         |   0
 .../modules/lambda_service/variables.tf       |   0
 .../modules/lambda_service_zip/main.tf        |   0
 .../modules/lambda_service_zip/variables.tf   |   0
 .../modules/lambda_sqs_trigger/main.tf        |   0
 .../modules/lambda_sqs_trigger/variables.tf   |   0
 .../modules/lambda_with_api_gateway/main.tf   |   0
 .../lambda_with_api_gateway/outputs.tf        |   0
 .../lambda_with_api_gateway/variables.tf      |   0
 .../terraform/modules/lambda_with_sqs/main.tf |   0
 .../modules/lambda_with_sqs/outputs.tf        |   0
 .../modules/lambda_with_sqs/variables.tf      |   0
 .../terraform/modules/route53/main.tf         |   0
 .../terraform/modules/route53/variables.tf    |   0
 .../terraform/modules/s3/main.tf              |   0
 .../terraform/modules/s3/outputs.tf           |   0
 .../terraform/modules/s3/variables.tf         |   0
 .../terraform/modules/s3_iam_policy/main.tf   |   0
 .../modules/s3_iam_policy/outputs.tf          |   0
 .../modules/s3_iam_policy/variables.tf        |   0
 .../modules/s3_presignable_bucket/main.tf     |   0
 .../modules/s3_presignable_bucket/outputs.tf  |   0
 .../s3_presignable_bucket/variables.tf        |   0
 .../terraform/modules/ses/main.tf             |   0
 .../terraform/modules/ses/outputs.tf          |   0
 .../terraform/modules/ses/variables.tf        |   0
 .../terraform/modules/sqs_queue/main.tf       |   0
 .../terraform/modules/sqs_queue/outputs.tf    |   0
 .../terraform/modules/sqs_queue/variables.tf  |   0
 .../terraform/modules/tf_state_bucket/main.tf |   0
 .../modules/tf_state_bucket/outputs.tf        |   0
 .../modules/tf_state_bucket/variables.tf      |   0
 .../terraform/shared/dev.tfvars               |   0
 .../terraform/shared/main.tf                  |   0
 .../terraform/shared/secrets.tf               |   0
 .../terraform/shared/variables.tf             |   0
 domain/__init__.py                            |   0
 domain/tasks/__init__.py                      |   0
 domain/tasks/subtasks.py                      |  55 +++++++
 domain/tasks/tasks.py                         |  94 +++++++++++
 infrastructure/__init__.py                    |   0
 infrastructure/postgres/__init__.py           |   0
 infrastructure/postgres/config.py             |  33 ++++
 infrastructure/postgres/engine.py             |  18 +++
 infrastructure/postgres/subtask_table.py      |  21 +++
 infrastructure/postgres/task_table.py         |  36 +++++
 orchestration/__init__.py                     |   0
 orchestration/task_orchestrator.py            |  96 +++++++++++
 repositories/__init__.py                      |   0
 repositories/tasks/__init__.py                |   0
 .../tasks/subtask_postgres_repository.py      |  89 +++++++++++
 repositories/tasks/subtask_repository.py      |  18 +++
 .../tasks/task_postgres_repository.py         |  77 +++++++++
 repositories/tasks/task_repository.py         |  15 ++
 run_backlog.sh                                |   2 -
 tests/__init__.py                             |   0
 tests/domain/__init__.py                      |   0
 tests/domain/tasks/__init__.py                |   0
 tests/domain/tasks/test_subtasks.py           |  75 +++++++++
 tests/domain/tasks/test_tasks.py              | 104 ++++++++++++
 tests/orchestration/__init__.py               |   0
 tests/orchestration/test_task_orchestrator.py | 151 ++++++++++++++++++
 tests/repositories/__init__.py                |   0
 tests/repositories/tasks/__init__.py          |   0
 tests/repositories/tasks/postgres/__init__.py |   0
 .../test_subtask_postgres_repository.py       |  81 ++++++++++
 .../postgres/test_task_postgres_repository.py |  68 ++++++++
 utilities/__init__.py                         |   0
 utilities/aws_lambda/__init__.py              |   0
 utilities/aws_lambda/default_orchestrator.py  |  26 +++
 utilities/aws_lambda/subtask_handler.py       |  67 ++++++++
 utilities/aws_lambda/subtask_trigger_body.py  |  17 ++
 utilities/aws_lambda/task_handler.py          |  98 ++++++++++++
 utilities/private.py                          |  33 ++++
 151 files changed, 1281 insertions(+), 75 deletions(-)
 delete mode 100644 AGENTS.md
 rename {infrastructure => deployment}/terraform/README.md (100%)
 rename {infrastructure => deployment}/terraform/cdn/main.tf (100%)
 rename {infrastructure => deployment}/terraform/cdn/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/cdn/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/cdn_certificate/main.tf (100%)
 rename {infrastructure => deployment}/terraform/cdn_certificate/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/cdn_certificate/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/cdn_certificate/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/_template/README.md (100%)
 rename {infrastructure => deployment}/terraform/lambda/_template/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/_template/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/_template/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/address2UPRN/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/address2UPRN/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/address2UPRN/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/address2UPRN/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/bulk_address2uprn_combiner/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/bulk_address2uprn_combiner/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/bulk_address2uprn_combiner/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/bulk_address2uprn_combiner/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/categorisation/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/categorisation/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/categorisation/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/categorisation/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/condition-etl/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/condition-etl/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/condition-etl/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/ecmk_to_ara/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/ecmk_to_ara/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/ecmk_to_ara/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/engine/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/engine/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/engine/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/engine/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/fast-api/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/fast-api/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/fast-api/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/fast-api/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/hubspot_deal_etl/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/hubspot_deal_etl/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/hubspot_deal_etl/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/magic_plan/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/magic_plan/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/magic_plan/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/magic_plan/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/ordnanceSurvey/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/ordnanceSurvey/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/ordnanceSurvey/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/pashub_to_ara/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/pashub_to_ara/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/pashub_to_ara/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/pashub_to_ara/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/postcodeSplitter/main.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/postcodeSplitter/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/postcodeSplitter/provider.tf (100%)
 rename {infrastructure => deployment}/terraform/lambda/postcodeSplitter/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/acm_certificate/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/acm_certificate/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/acm_certificate/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/cloudfront/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/cloudfront/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/container_registry/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/container_registry/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/container_registry/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/ecr/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/ecr/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/ecr/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/general_iam_policy/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/general_iam_policy/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/general_iam_policy/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_execution_role/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_execution_role/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_execution_role/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_service/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_service/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_service/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_service_zip/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_service_zip/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_sqs_trigger/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_sqs_trigger/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_with_api_gateway/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_with_api_gateway/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_with_api_gateway/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_with_sqs/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_with_sqs/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/lambda_with_sqs/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/route53/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/route53/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3_iam_policy/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3_iam_policy/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3_iam_policy/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3_presignable_bucket/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3_presignable_bucket/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/s3_presignable_bucket/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/ses/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/ses/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/ses/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/sqs_queue/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/sqs_queue/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/sqs_queue/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/tf_state_bucket/main.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/tf_state_bucket/outputs.tf (100%)
 rename {infrastructure => deployment}/terraform/modules/tf_state_bucket/variables.tf (100%)
 rename {infrastructure => deployment}/terraform/shared/dev.tfvars (100%)
 rename {infrastructure => deployment}/terraform/shared/main.tf (100%)
 rename {infrastructure => deployment}/terraform/shared/secrets.tf (100%)
 rename {infrastructure => deployment}/terraform/shared/variables.tf (100%)
 create mode 100644 domain/__init__.py
 create mode 100644 domain/tasks/__init__.py
 create mode 100644 domain/tasks/subtasks.py
 create mode 100644 domain/tasks/tasks.py
 create mode 100644 infrastructure/__init__.py
 create mode 100644 infrastructure/postgres/__init__.py
 create mode 100644 infrastructure/postgres/config.py
 create mode 100644 infrastructure/postgres/engine.py
 create mode 100644 infrastructure/postgres/subtask_table.py
 create mode 100644 infrastructure/postgres/task_table.py
 create mode 100644 orchestration/__init__.py
 create mode 100644 orchestration/task_orchestrator.py
 create mode 100644 repositories/__init__.py
 create mode 100644 repositories/tasks/__init__.py
 create mode 100644 repositories/tasks/subtask_postgres_repository.py
 create mode 100644 repositories/tasks/subtask_repository.py
 create mode 100644 repositories/tasks/task_postgres_repository.py
 create mode 100644 repositories/tasks/task_repository.py
 delete mode 100644 run_backlog.sh
 create mode 100644 tests/__init__.py
 create mode 100644 tests/domain/__init__.py
 create mode 100644 tests/domain/tasks/__init__.py
 create mode 100644 tests/domain/tasks/test_subtasks.py
 create mode 100644 tests/domain/tasks/test_tasks.py
 create mode 100644 tests/orchestration/__init__.py
 create mode 100644 tests/orchestration/test_task_orchestrator.py
 create mode 100644 tests/repositories/__init__.py
 create mode 100644 tests/repositories/tasks/__init__.py
 create mode 100644 tests/repositories/tasks/postgres/__init__.py
 create mode 100644 tests/repositories/tasks/postgres/test_subtask_postgres_repository.py
 create mode 100644 tests/repositories/tasks/postgres/test_task_postgres_repository.py
 create mode 100644 utilities/__init__.py
 create mode 100644 utilities/aws_lambda/__init__.py
 create mode 100644 utilities/aws_lambda/default_orchestrator.py
 create mode 100644 utilities/aws_lambda/subtask_handler.py
 create mode 100644 utilities/aws_lambda/subtask_trigger_body.py
 create mode 100644 utilities/aws_lambda/task_handler.py
 create mode 100644 utilities/private.py

diff --git a/AGENTS.md b/AGENTS.md
deleted file mode 100644
index aa0426a0..00000000
--- a/AGENTS.md
+++ /dev/null
@@ -1,29 +0,0 @@
-
-<!-- BACKLOG.MD MCP GUIDELINES START -->
-
-<CRITICAL_INSTRUCTION>
-
-## BACKLOG WORKFLOW INSTRUCTIONS
-
-This project uses Backlog.md MCP for all task and project management activities.
-
-**CRITICAL GUIDANCE**
-
-- If your client supports MCP resources, read `backlog://workflow/overview` to understand when and how to use Backlog for this project.
-- If your client only supports tools or the above request fails, call `backlog.get_backlog_instructions()` to load the tool-oriented overview. Use the `instruction` selector when you need `task-creation`, `task-execution`, or `task-finalization`.
-
-- **First time working here?** Read the overview resource IMMEDIATELY to learn the workflow
-- **Already familiar?** You should have the overview cached ("## Backlog.md Overview (MCP)")
-- **When to read it**: BEFORE creating tasks, or when you're unsure whether to track work
-
-These guides cover:
-- Decision framework for when to create tasks
-- Search-first workflow to avoid duplicates
-- Links to detailed guides for task creation, execution, and finalization
-- MCP tools reference
-
-You MUST read the overview resource to understand the complete workflow. The information is NOT summarized here.
-
-</CRITICAL_INSTRUCTION>
-
-<!-- BACKLOG.MD MCP GUIDELINES END -->
diff --git a/CLAUDE.md b/CLAUDE.md
index f88a59d5..2dabf532 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,33 +1,4 @@
 
-<!-- BACKLOG.MD MCP GUIDELINES START -->
-
-<CRITICAL_INSTRUCTION>
-
-## BACKLOG WORKFLOW INSTRUCTIONS
-
-This project uses Backlog.md MCP for all task and project management activities.
-
-**CRITICAL GUIDANCE**
-
-- If your client supports MCP resources, read `backlog://workflow/overview` to understand when and how to use Backlog for this project.
-- If your client only supports tools or the above request fails, call `backlog.get_backlog_instructions()` to load the tool-oriented overview. Use the `instruction` selector when you need `task-creation`, `task-execution`, or `task-finalization`.
-
-- **First time working here?** Read the overview resource IMMEDIATELY to learn the workflow
-- **Already familiar?** You should have the overview cached ("## Backlog.md Overview (MCP)")
-- **When to read it**: BEFORE creating tasks, or when you're unsure whether to track work
-
-These guides cover:
-- Decision framework for when to create tasks
-- Search-first workflow to avoid duplicates
-- Links to detailed guides for task creation, execution, and finalization
-- MCP tools reference
-
-You MUST read the overview resource to understand the complete workflow. The information is NOT summarized here.
-
-</CRITICAL_INSTRUCTION>
-
-<!-- BACKLOG.MD MCP GUIDELINES END -->
-
 ## Available Skills
 
 Five Claude Code skills are installed in this repo's dev container. Each maps to a phase of the feature lifecycle.
diff --git a/asset_list/app.py b/asset_list/app.py
index 7413c7cb..9b10d7f3 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -79,23 +79,23 @@ def app():
     """
 
     data_folder = "/workspaces/model/asset_list"
-    data_filename = "input.xlsx"
-    sheet_name = "Handovers"
-    postcode_column = "POSTCODE"
-    address1_column = "Full Addres"
+    data_filename = "lincs_address_list.xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = "Postcode"
+    address1_column = "Deal Name"
     address1_method = None
-    fulladdress_column = "Full Addres"
+    fulladdress_column = "Deal Name"
     address_cols_to_concat = []
     missing_postcodes_method = None
     landlord_year_built = None
-    landlord_os_uprn = "domna_found_uprn"
-    landlord_property_type = "PROPERTY TYPE"  # Good to include if landlord gave
-    landlord_built_form = "Type Description"  # Good to include if landlord gave
+    landlord_os_uprn = None
+    landlord_property_type = None  # Good to include if landlord gave
+    landlord_built_form = None  # Good to include if landlord gave
     landlord_wall_construction = None
     landlord_roof_construction = None
     landlord_heating_system = None
     landlord_existing_pv = None
-    landlord_property_id = "PROP REF"
+    landlord_property_id = "landlord_id"
     landlord_sap = None
     outcomes_filename = None
     outcomes_sheetname = None
@@ -468,9 +468,3 @@ def app():
                 asset_list.duplicated_addresses.to_excel(
                     writer, sheet_name="Duplicate Properties", index=False
                 )
-
-
-
-
-for key,value in dict.items():
-    lsakjfldsa
\ No newline at end of file
diff --git a/infrastructure/terraform/README.md b/deployment/terraform/README.md
similarity index 100%
rename from infrastructure/terraform/README.md
rename to deployment/terraform/README.md
diff --git a/infrastructure/terraform/cdn/main.tf b/deployment/terraform/cdn/main.tf
similarity index 100%
rename from infrastructure/terraform/cdn/main.tf
rename to deployment/terraform/cdn/main.tf
diff --git a/infrastructure/terraform/cdn/provider.tf b/deployment/terraform/cdn/provider.tf
similarity index 100%
rename from infrastructure/terraform/cdn/provider.tf
rename to deployment/terraform/cdn/provider.tf
diff --git a/infrastructure/terraform/cdn/variables.tf b/deployment/terraform/cdn/variables.tf
similarity index 100%
rename from infrastructure/terraform/cdn/variables.tf
rename to deployment/terraform/cdn/variables.tf
diff --git a/infrastructure/terraform/cdn_certificate/main.tf b/deployment/terraform/cdn_certificate/main.tf
similarity index 100%
rename from infrastructure/terraform/cdn_certificate/main.tf
rename to deployment/terraform/cdn_certificate/main.tf
diff --git a/infrastructure/terraform/cdn_certificate/outputs.tf b/deployment/terraform/cdn_certificate/outputs.tf
similarity index 100%
rename from infrastructure/terraform/cdn_certificate/outputs.tf
rename to deployment/terraform/cdn_certificate/outputs.tf
diff --git a/infrastructure/terraform/cdn_certificate/provider.tf b/deployment/terraform/cdn_certificate/provider.tf
similarity index 100%
rename from infrastructure/terraform/cdn_certificate/provider.tf
rename to deployment/terraform/cdn_certificate/provider.tf
diff --git a/infrastructure/terraform/cdn_certificate/variables.tf b/deployment/terraform/cdn_certificate/variables.tf
similarity index 100%
rename from infrastructure/terraform/cdn_certificate/variables.tf
rename to deployment/terraform/cdn_certificate/variables.tf
diff --git a/infrastructure/terraform/lambda/_template/README.md b/deployment/terraform/lambda/_template/README.md
similarity index 100%
rename from infrastructure/terraform/lambda/_template/README.md
rename to deployment/terraform/lambda/_template/README.md
diff --git a/infrastructure/terraform/lambda/_template/main.tf b/deployment/terraform/lambda/_template/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/_template/main.tf
rename to deployment/terraform/lambda/_template/main.tf
diff --git a/infrastructure/terraform/lambda/_template/provider.tf b/deployment/terraform/lambda/_template/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/_template/provider.tf
rename to deployment/terraform/lambda/_template/provider.tf
diff --git a/infrastructure/terraform/lambda/_template/variables.tf b/deployment/terraform/lambda/_template/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/_template/variables.tf
rename to deployment/terraform/lambda/_template/variables.tf
diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/deployment/terraform/lambda/address2UPRN/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/address2UPRN/main.tf
rename to deployment/terraform/lambda/address2UPRN/main.tf
diff --git a/infrastructure/terraform/lambda/address2UPRN/outputs.tf b/deployment/terraform/lambda/address2UPRN/outputs.tf
similarity index 100%
rename from infrastructure/terraform/lambda/address2UPRN/outputs.tf
rename to deployment/terraform/lambda/address2UPRN/outputs.tf
diff --git a/infrastructure/terraform/lambda/address2UPRN/provider.tf b/deployment/terraform/lambda/address2UPRN/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/address2UPRN/provider.tf
rename to deployment/terraform/lambda/address2UPRN/provider.tf
diff --git a/infrastructure/terraform/lambda/address2UPRN/variables.tf b/deployment/terraform/lambda/address2UPRN/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/address2UPRN/variables.tf
rename to deployment/terraform/lambda/address2UPRN/variables.tf
diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf b/deployment/terraform/lambda/bulk_address2uprn_combiner/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf
rename to deployment/terraform/lambda/bulk_address2uprn_combiner/main.tf
diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf b/deployment/terraform/lambda/bulk_address2uprn_combiner/outputs.tf
similarity index 100%
rename from infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf
rename to deployment/terraform/lambda/bulk_address2uprn_combiner/outputs.tf
diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf b/deployment/terraform/lambda/bulk_address2uprn_combiner/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf
rename to deployment/terraform/lambda/bulk_address2uprn_combiner/provider.tf
diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf b/deployment/terraform/lambda/bulk_address2uprn_combiner/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf
rename to deployment/terraform/lambda/bulk_address2uprn_combiner/variables.tf
diff --git a/infrastructure/terraform/lambda/categorisation/main.tf b/deployment/terraform/lambda/categorisation/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/categorisation/main.tf
rename to deployment/terraform/lambda/categorisation/main.tf
diff --git a/infrastructure/terraform/lambda/categorisation/outputs.tf b/deployment/terraform/lambda/categorisation/outputs.tf
similarity index 100%
rename from infrastructure/terraform/lambda/categorisation/outputs.tf
rename to deployment/terraform/lambda/categorisation/outputs.tf
diff --git a/infrastructure/terraform/lambda/categorisation/provider.tf b/deployment/terraform/lambda/categorisation/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/categorisation/provider.tf
rename to deployment/terraform/lambda/categorisation/provider.tf
diff --git a/infrastructure/terraform/lambda/categorisation/variables.tf b/deployment/terraform/lambda/categorisation/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/categorisation/variables.tf
rename to deployment/terraform/lambda/categorisation/variables.tf
diff --git a/infrastructure/terraform/lambda/condition-etl/main.tf b/deployment/terraform/lambda/condition-etl/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/condition-etl/main.tf
rename to deployment/terraform/lambda/condition-etl/main.tf
diff --git a/infrastructure/terraform/lambda/condition-etl/provider.tf b/deployment/terraform/lambda/condition-etl/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/condition-etl/provider.tf
rename to deployment/terraform/lambda/condition-etl/provider.tf
diff --git a/infrastructure/terraform/lambda/condition-etl/variables.tf b/deployment/terraform/lambda/condition-etl/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/condition-etl/variables.tf
rename to deployment/terraform/lambda/condition-etl/variables.tf
diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/main.tf b/deployment/terraform/lambda/ecmk_to_ara/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/ecmk_to_ara/main.tf
rename to deployment/terraform/lambda/ecmk_to_ara/main.tf
diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/provider.tf b/deployment/terraform/lambda/ecmk_to_ara/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/ecmk_to_ara/provider.tf
rename to deployment/terraform/lambda/ecmk_to_ara/provider.tf
diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/variables.tf b/deployment/terraform/lambda/ecmk_to_ara/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/ecmk_to_ara/variables.tf
rename to deployment/terraform/lambda/ecmk_to_ara/variables.tf
diff --git a/infrastructure/terraform/lambda/engine/main.tf b/deployment/terraform/lambda/engine/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/engine/main.tf
rename to deployment/terraform/lambda/engine/main.tf
diff --git a/infrastructure/terraform/lambda/engine/outputs.tf b/deployment/terraform/lambda/engine/outputs.tf
similarity index 100%
rename from infrastructure/terraform/lambda/engine/outputs.tf
rename to deployment/terraform/lambda/engine/outputs.tf
diff --git a/infrastructure/terraform/lambda/engine/provider.tf b/deployment/terraform/lambda/engine/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/engine/provider.tf
rename to deployment/terraform/lambda/engine/provider.tf
diff --git a/infrastructure/terraform/lambda/engine/variables.tf b/deployment/terraform/lambda/engine/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/engine/variables.tf
rename to deployment/terraform/lambda/engine/variables.tf
diff --git a/infrastructure/terraform/lambda/fast-api/main.tf b/deployment/terraform/lambda/fast-api/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/fast-api/main.tf
rename to deployment/terraform/lambda/fast-api/main.tf
diff --git a/infrastructure/terraform/lambda/fast-api/outputs.tf b/deployment/terraform/lambda/fast-api/outputs.tf
similarity index 100%
rename from infrastructure/terraform/lambda/fast-api/outputs.tf
rename to deployment/terraform/lambda/fast-api/outputs.tf
diff --git a/infrastructure/terraform/lambda/fast-api/provider.tf b/deployment/terraform/lambda/fast-api/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/fast-api/provider.tf
rename to deployment/terraform/lambda/fast-api/provider.tf
diff --git a/infrastructure/terraform/lambda/fast-api/variables.tf b/deployment/terraform/lambda/fast-api/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/fast-api/variables.tf
rename to deployment/terraform/lambda/fast-api/variables.tf
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/deployment/terraform/lambda/hubspot_deal_etl/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
rename to deployment/terraform/lambda/hubspot_deal_etl/main.tf
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf b/deployment/terraform/lambda/hubspot_deal_etl/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf
rename to deployment/terraform/lambda/hubspot_deal_etl/provider.tf
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf b/deployment/terraform/lambda/hubspot_deal_etl/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf
rename to deployment/terraform/lambda/hubspot_deal_etl/variables.tf
diff --git a/infrastructure/terraform/lambda/magic_plan/main.tf b/deployment/terraform/lambda/magic_plan/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/magic_plan/main.tf
rename to deployment/terraform/lambda/magic_plan/main.tf
diff --git a/infrastructure/terraform/lambda/magic_plan/outputs.tf b/deployment/terraform/lambda/magic_plan/outputs.tf
similarity index 100%
rename from infrastructure/terraform/lambda/magic_plan/outputs.tf
rename to deployment/terraform/lambda/magic_plan/outputs.tf
diff --git a/infrastructure/terraform/lambda/magic_plan/provider.tf b/deployment/terraform/lambda/magic_plan/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/magic_plan/provider.tf
rename to deployment/terraform/lambda/magic_plan/provider.tf
diff --git a/infrastructure/terraform/lambda/magic_plan/variables.tf b/deployment/terraform/lambda/magic_plan/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/magic_plan/variables.tf
rename to deployment/terraform/lambda/magic_plan/variables.tf
diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/main.tf b/deployment/terraform/lambda/ordnanceSurvey/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/ordnanceSurvey/main.tf
rename to deployment/terraform/lambda/ordnanceSurvey/main.tf
diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf b/deployment/terraform/lambda/ordnanceSurvey/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/ordnanceSurvey/provider.tf
rename to deployment/terraform/lambda/ordnanceSurvey/provider.tf
diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf b/deployment/terraform/lambda/ordnanceSurvey/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/ordnanceSurvey/variables.tf
rename to deployment/terraform/lambda/ordnanceSurvey/variables.tf
diff --git a/infrastructure/terraform/lambda/pashub_to_ara/main.tf b/deployment/terraform/lambda/pashub_to_ara/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/pashub_to_ara/main.tf
rename to deployment/terraform/lambda/pashub_to_ara/main.tf
diff --git a/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf b/deployment/terraform/lambda/pashub_to_ara/outputs.tf
similarity index 100%
rename from infrastructure/terraform/lambda/pashub_to_ara/outputs.tf
rename to deployment/terraform/lambda/pashub_to_ara/outputs.tf
diff --git a/infrastructure/terraform/lambda/pashub_to_ara/provider.tf b/deployment/terraform/lambda/pashub_to_ara/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/pashub_to_ara/provider.tf
rename to deployment/terraform/lambda/pashub_to_ara/provider.tf
diff --git a/infrastructure/terraform/lambda/pashub_to_ara/variables.tf b/deployment/terraform/lambda/pashub_to_ara/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/pashub_to_ara/variables.tf
rename to deployment/terraform/lambda/pashub_to_ara/variables.tf
diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf
similarity index 100%
rename from infrastructure/terraform/lambda/postcodeSplitter/main.tf
rename to deployment/terraform/lambda/postcodeSplitter/main.tf
diff --git a/infrastructure/terraform/lambda/postcodeSplitter/outputs.tf b/deployment/terraform/lambda/postcodeSplitter/outputs.tf
similarity index 100%
rename from infrastructure/terraform/lambda/postcodeSplitter/outputs.tf
rename to deployment/terraform/lambda/postcodeSplitter/outputs.tf
diff --git a/infrastructure/terraform/lambda/postcodeSplitter/provider.tf b/deployment/terraform/lambda/postcodeSplitter/provider.tf
similarity index 100%
rename from infrastructure/terraform/lambda/postcodeSplitter/provider.tf
rename to deployment/terraform/lambda/postcodeSplitter/provider.tf
diff --git a/infrastructure/terraform/lambda/postcodeSplitter/variables.tf b/deployment/terraform/lambda/postcodeSplitter/variables.tf
similarity index 100%
rename from infrastructure/terraform/lambda/postcodeSplitter/variables.tf
rename to deployment/terraform/lambda/postcodeSplitter/variables.tf
diff --git a/infrastructure/terraform/modules/acm_certificate/main.tf b/deployment/terraform/modules/acm_certificate/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/acm_certificate/main.tf
rename to deployment/terraform/modules/acm_certificate/main.tf
diff --git a/infrastructure/terraform/modules/acm_certificate/outputs.tf b/deployment/terraform/modules/acm_certificate/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/acm_certificate/outputs.tf
rename to deployment/terraform/modules/acm_certificate/outputs.tf
diff --git a/infrastructure/terraform/modules/acm_certificate/variables.tf b/deployment/terraform/modules/acm_certificate/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/acm_certificate/variables.tf
rename to deployment/terraform/modules/acm_certificate/variables.tf
diff --git a/infrastructure/terraform/modules/cloudfront/main.tf b/deployment/terraform/modules/cloudfront/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/cloudfront/main.tf
rename to deployment/terraform/modules/cloudfront/main.tf
diff --git a/infrastructure/terraform/modules/cloudfront/variables.tf b/deployment/terraform/modules/cloudfront/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/cloudfront/variables.tf
rename to deployment/terraform/modules/cloudfront/variables.tf
diff --git a/infrastructure/terraform/modules/container_registry/main.tf b/deployment/terraform/modules/container_registry/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/container_registry/main.tf
rename to deployment/terraform/modules/container_registry/main.tf
diff --git a/infrastructure/terraform/modules/container_registry/outputs.tf b/deployment/terraform/modules/container_registry/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/container_registry/outputs.tf
rename to deployment/terraform/modules/container_registry/outputs.tf
diff --git a/infrastructure/terraform/modules/container_registry/variables.tf b/deployment/terraform/modules/container_registry/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/container_registry/variables.tf
rename to deployment/terraform/modules/container_registry/variables.tf
diff --git a/infrastructure/terraform/modules/ecr/main.tf b/deployment/terraform/modules/ecr/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/ecr/main.tf
rename to deployment/terraform/modules/ecr/main.tf
diff --git a/infrastructure/terraform/modules/ecr/outputs.tf b/deployment/terraform/modules/ecr/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/ecr/outputs.tf
rename to deployment/terraform/modules/ecr/outputs.tf
diff --git a/infrastructure/terraform/modules/ecr/variables.tf b/deployment/terraform/modules/ecr/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/ecr/variables.tf
rename to deployment/terraform/modules/ecr/variables.tf
diff --git a/infrastructure/terraform/modules/general_iam_policy/main.tf b/deployment/terraform/modules/general_iam_policy/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/general_iam_policy/main.tf
rename to deployment/terraform/modules/general_iam_policy/main.tf
diff --git a/infrastructure/terraform/modules/general_iam_policy/outputs.tf b/deployment/terraform/modules/general_iam_policy/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/general_iam_policy/outputs.tf
rename to deployment/terraform/modules/general_iam_policy/outputs.tf
diff --git a/infrastructure/terraform/modules/general_iam_policy/variables.tf b/deployment/terraform/modules/general_iam_policy/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/general_iam_policy/variables.tf
rename to deployment/terraform/modules/general_iam_policy/variables.tf
diff --git a/infrastructure/terraform/modules/lambda_execution_role/main.tf b/deployment/terraform/modules/lambda_execution_role/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_execution_role/main.tf
rename to deployment/terraform/modules/lambda_execution_role/main.tf
diff --git a/infrastructure/terraform/modules/lambda_execution_role/outputs.tf b/deployment/terraform/modules/lambda_execution_role/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_execution_role/outputs.tf
rename to deployment/terraform/modules/lambda_execution_role/outputs.tf
diff --git a/infrastructure/terraform/modules/lambda_execution_role/variables.tf b/deployment/terraform/modules/lambda_execution_role/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_execution_role/variables.tf
rename to deployment/terraform/modules/lambda_execution_role/variables.tf
diff --git a/infrastructure/terraform/modules/lambda_service/main.tf b/deployment/terraform/modules/lambda_service/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_service/main.tf
rename to deployment/terraform/modules/lambda_service/main.tf
diff --git a/infrastructure/terraform/modules/lambda_service/outputs.tf b/deployment/terraform/modules/lambda_service/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_service/outputs.tf
rename to deployment/terraform/modules/lambda_service/outputs.tf
diff --git a/infrastructure/terraform/modules/lambda_service/variables.tf b/deployment/terraform/modules/lambda_service/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_service/variables.tf
rename to deployment/terraform/modules/lambda_service/variables.tf
diff --git a/infrastructure/terraform/modules/lambda_service_zip/main.tf b/deployment/terraform/modules/lambda_service_zip/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_service_zip/main.tf
rename to deployment/terraform/modules/lambda_service_zip/main.tf
diff --git a/infrastructure/terraform/modules/lambda_service_zip/variables.tf b/deployment/terraform/modules/lambda_service_zip/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_service_zip/variables.tf
rename to deployment/terraform/modules/lambda_service_zip/variables.tf
diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf b/deployment/terraform/modules/lambda_sqs_trigger/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_sqs_trigger/main.tf
rename to deployment/terraform/modules/lambda_sqs_trigger/main.tf
diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf b/deployment/terraform/modules/lambda_sqs_trigger/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_sqs_trigger/variables.tf
rename to deployment/terraform/modules/lambda_sqs_trigger/variables.tf
diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/main.tf b/deployment/terraform/modules/lambda_with_api_gateway/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_with_api_gateway/main.tf
rename to deployment/terraform/modules/lambda_with_api_gateway/main.tf
diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/outputs.tf b/deployment/terraform/modules/lambda_with_api_gateway/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_with_api_gateway/outputs.tf
rename to deployment/terraform/modules/lambda_with_api_gateway/outputs.tf
diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf b/deployment/terraform/modules/lambda_with_api_gateway/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf
rename to deployment/terraform/modules/lambda_with_api_gateway/variables.tf
diff --git a/infrastructure/terraform/modules/lambda_with_sqs/main.tf b/deployment/terraform/modules/lambda_with_sqs/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_with_sqs/main.tf
rename to deployment/terraform/modules/lambda_with_sqs/main.tf
diff --git a/infrastructure/terraform/modules/lambda_with_sqs/outputs.tf b/deployment/terraform/modules/lambda_with_sqs/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_with_sqs/outputs.tf
rename to deployment/terraform/modules/lambda_with_sqs/outputs.tf
diff --git a/infrastructure/terraform/modules/lambda_with_sqs/variables.tf b/deployment/terraform/modules/lambda_with_sqs/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/lambda_with_sqs/variables.tf
rename to deployment/terraform/modules/lambda_with_sqs/variables.tf
diff --git a/infrastructure/terraform/modules/route53/main.tf b/deployment/terraform/modules/route53/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/route53/main.tf
rename to deployment/terraform/modules/route53/main.tf
diff --git a/infrastructure/terraform/modules/route53/variables.tf b/deployment/terraform/modules/route53/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/route53/variables.tf
rename to deployment/terraform/modules/route53/variables.tf
diff --git a/infrastructure/terraform/modules/s3/main.tf b/deployment/terraform/modules/s3/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3/main.tf
rename to deployment/terraform/modules/s3/main.tf
diff --git a/infrastructure/terraform/modules/s3/outputs.tf b/deployment/terraform/modules/s3/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3/outputs.tf
rename to deployment/terraform/modules/s3/outputs.tf
diff --git a/infrastructure/terraform/modules/s3/variables.tf b/deployment/terraform/modules/s3/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3/variables.tf
rename to deployment/terraform/modules/s3/variables.tf
diff --git a/infrastructure/terraform/modules/s3_iam_policy/main.tf b/deployment/terraform/modules/s3_iam_policy/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3_iam_policy/main.tf
rename to deployment/terraform/modules/s3_iam_policy/main.tf
diff --git a/infrastructure/terraform/modules/s3_iam_policy/outputs.tf b/deployment/terraform/modules/s3_iam_policy/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3_iam_policy/outputs.tf
rename to deployment/terraform/modules/s3_iam_policy/outputs.tf
diff --git a/infrastructure/terraform/modules/s3_iam_policy/variables.tf b/deployment/terraform/modules/s3_iam_policy/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3_iam_policy/variables.tf
rename to deployment/terraform/modules/s3_iam_policy/variables.tf
diff --git a/infrastructure/terraform/modules/s3_presignable_bucket/main.tf b/deployment/terraform/modules/s3_presignable_bucket/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3_presignable_bucket/main.tf
rename to deployment/terraform/modules/s3_presignable_bucket/main.tf
diff --git a/infrastructure/terraform/modules/s3_presignable_bucket/outputs.tf b/deployment/terraform/modules/s3_presignable_bucket/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3_presignable_bucket/outputs.tf
rename to deployment/terraform/modules/s3_presignable_bucket/outputs.tf
diff --git a/infrastructure/terraform/modules/s3_presignable_bucket/variables.tf b/deployment/terraform/modules/s3_presignable_bucket/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/s3_presignable_bucket/variables.tf
rename to deployment/terraform/modules/s3_presignable_bucket/variables.tf
diff --git a/infrastructure/terraform/modules/ses/main.tf b/deployment/terraform/modules/ses/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/ses/main.tf
rename to deployment/terraform/modules/ses/main.tf
diff --git a/infrastructure/terraform/modules/ses/outputs.tf b/deployment/terraform/modules/ses/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/ses/outputs.tf
rename to deployment/terraform/modules/ses/outputs.tf
diff --git a/infrastructure/terraform/modules/ses/variables.tf b/deployment/terraform/modules/ses/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/ses/variables.tf
rename to deployment/terraform/modules/ses/variables.tf
diff --git a/infrastructure/terraform/modules/sqs_queue/main.tf b/deployment/terraform/modules/sqs_queue/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/sqs_queue/main.tf
rename to deployment/terraform/modules/sqs_queue/main.tf
diff --git a/infrastructure/terraform/modules/sqs_queue/outputs.tf b/deployment/terraform/modules/sqs_queue/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/sqs_queue/outputs.tf
rename to deployment/terraform/modules/sqs_queue/outputs.tf
diff --git a/infrastructure/terraform/modules/sqs_queue/variables.tf b/deployment/terraform/modules/sqs_queue/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/sqs_queue/variables.tf
rename to deployment/terraform/modules/sqs_queue/variables.tf
diff --git a/infrastructure/terraform/modules/tf_state_bucket/main.tf b/deployment/terraform/modules/tf_state_bucket/main.tf
similarity index 100%
rename from infrastructure/terraform/modules/tf_state_bucket/main.tf
rename to deployment/terraform/modules/tf_state_bucket/main.tf
diff --git a/infrastructure/terraform/modules/tf_state_bucket/outputs.tf b/deployment/terraform/modules/tf_state_bucket/outputs.tf
similarity index 100%
rename from infrastructure/terraform/modules/tf_state_bucket/outputs.tf
rename to deployment/terraform/modules/tf_state_bucket/outputs.tf
diff --git a/infrastructure/terraform/modules/tf_state_bucket/variables.tf b/deployment/terraform/modules/tf_state_bucket/variables.tf
similarity index 100%
rename from infrastructure/terraform/modules/tf_state_bucket/variables.tf
rename to deployment/terraform/modules/tf_state_bucket/variables.tf
diff --git a/infrastructure/terraform/shared/dev.tfvars b/deployment/terraform/shared/dev.tfvars
similarity index 100%
rename from infrastructure/terraform/shared/dev.tfvars
rename to deployment/terraform/shared/dev.tfvars
diff --git a/infrastructure/terraform/shared/main.tf b/deployment/terraform/shared/main.tf
similarity index 100%
rename from infrastructure/terraform/shared/main.tf
rename to deployment/terraform/shared/main.tf
diff --git a/infrastructure/terraform/shared/secrets.tf b/deployment/terraform/shared/secrets.tf
similarity index 100%
rename from infrastructure/terraform/shared/secrets.tf
rename to deployment/terraform/shared/secrets.tf
diff --git a/infrastructure/terraform/shared/variables.tf b/deployment/terraform/shared/variables.tf
similarity index 100%
rename from infrastructure/terraform/shared/variables.tf
rename to deployment/terraform/shared/variables.tf
diff --git a/domain/__init__.py b/domain/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/domain/tasks/__init__.py b/domain/tasks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/domain/tasks/subtasks.py b/domain/tasks/subtasks.py
new file mode 100644
index 00000000..bd49a6ec
--- /dev/null
+++ b/domain/tasks/subtasks.py
@@ -0,0 +1,81 @@
+"""SubTask aggregate: one unit of work tracked under a parent Task."""
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Optional
+from uuid import UUID, uuid4
+
+
+class SubTaskStatus(str, Enum):
+    """Lifecycle states a SubTask can be in."""
+
+    WAITING = "waiting"
+    IN_PROGRESS = "in progress"
+    COMPLETE = "complete"
+    FAILED = "failed"
+
+
+@dataclass
+class SubTask:
+    """One unit of work belonging to the Task identified by ``task_id``.
+
+    ``outputs`` is set to ``{"result": ...}`` by ``complete`` (when a result
+    is given) and to ``{"error": ...}`` by ``fail``.
+    """
+
+    id: UUID
+    task_id: UUID
+    status: SubTaskStatus = SubTaskStatus.WAITING
+    inputs: Optional[dict[str, Any]] = None
+    outputs: Optional[dict[str, Any]] = None
+    cloud_logs_url: Optional[str] = None
+    job_started: Optional[datetime] = None
+    job_completed: Optional[datetime] = None
+
+    @classmethod
+    def create(
+        cls, *, task_id: UUID, inputs: Optional[dict[str, Any]] = None
+    ) -> "SubTask":
+        """Return a new WAITING SubTask with a freshly generated id."""
+        fresh_id = uuid4()
+        return cls(
+            id=fresh_id,
+            task_id=task_id,
+            inputs=inputs,
+            status=SubTaskStatus.WAITING,
+        )
+
+    def start(self, cloud_logs_url: Optional[str] = None) -> None:
+        """Move to IN_PROGRESS; repeating the call while running is allowed.
+
+        ``job_started`` is stamped only if it is still unset, and
+        ``cloud_logs_url`` is overwritten only when a value is supplied.
+
+        Raises:
+            ValueError: if the status is neither WAITING nor IN_PROGRESS.
+        """
+        startable = (SubTaskStatus.WAITING, SubTaskStatus.IN_PROGRESS)
+        if self.status not in startable:
+            raise ValueError(f"cannot start subtask in status {self.status}")
+        if cloud_logs_url is not None:
+            self.cloud_logs_url = cloud_logs_url
+        if self.job_started is None:
+            self.job_started = datetime.now(timezone.utc)
+        self.status = SubTaskStatus.IN_PROGRESS
+
+    def complete(self, result: Any = None) -> None:
+        """Mark COMPLETE now; keep ``result`` in ``outputs`` unless it is None."""
+        self._finish(SubTaskStatus.COMPLETE)
+        if result is not None:
+            self.outputs = {"result": result}
+
+    def fail(self, error: BaseException) -> None:
+        """Mark FAILED now and record ``str(error)`` in ``outputs``."""
+        self._finish(SubTaskStatus.FAILED)
+        self.outputs = {"error": str(error)}
+
+    def _finish(self, final_status: SubTaskStatus) -> None:
+        """Set ``final_status`` and stamp ``job_completed`` with the UTC time."""
+        self.status = final_status
+        self.job_completed = datetime.now(timezone.utc)
diff --git a/domain/tasks/tasks.py b/domain/tasks/tasks.py
new file mode 100644
index 00000000..177258d6
--- /dev/null
+++ b/domain/tasks/tasks.py
@@ -0,0 +1,131 @@
+"""Task aggregate: a tracked job whose status follows its SubTasks."""
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Optional
+from uuid import UUID, uuid4
+
+from domain.tasks.subtasks import SubTaskStatus
+
+
+class TaskStatus(str, Enum):
+    """Lifecycle states a Task can be in."""
+
+    WAITING = "waiting"
+    IN_PROGRESS = "in progress"
+    COMPLETE = "complete"
+    FAILED = "failed"
+
+
+class Source(str, Enum):
+    """Kinds of origin a Task can record in its ``source`` field.
+
+    NOTE(review): presumably names what ``source_id`` identifies — confirm.
+    """
+
+    PORTFOLIO = "portfolio_id"
+    HUBSPOT_DEAL = "hubspot_deal_id"
+
+
+@dataclass
+class Task:
+    """A tracked job; its status can be derived from its SubTasks' statuses."""
+
+    id: UUID
+    task_source: str
+    status: TaskStatus = TaskStatus.WAITING
+    service: Optional[str] = None
+    source: Optional[Source] = None
+    source_id: Optional[str] = None
+    job_started: Optional[datetime] = None
+    job_completed: Optional[datetime] = None
+
+    @classmethod
+    def create(
+        cls,
+        *,
+        task_source: str,
+        service: Optional[str] = None,
+        source: Optional[Source] = None,
+        source_id: Optional[str] = None,
+    ) -> "Task":
+        """Return a new WAITING Task with a freshly generated id.
+
+        ``job_started`` is stamped here, at creation time.
+
+        Raises:
+            ValueError: if ``task_source`` is empty or only whitespace.
+        """
+        if task_source.strip() == "":
+            raise ValueError("task_source must be non-empty")
+        created_at = datetime.now(timezone.utc)
+        return cls(
+            id=uuid4(),
+            task_source=task_source,
+            status=TaskStatus.WAITING,
+            job_started=created_at,
+            service=service,
+            source=source,
+            source_id=source_id,
+        )
+
+    def start(self) -> None:
+        """Move to IN_PROGRESS; repeating the call while running is allowed.
+
+        ``job_started`` is stamped only if it is still unset.
+
+        Raises:
+            ValueError: if the status is neither WAITING nor IN_PROGRESS.
+        """
+        startable = (TaskStatus.WAITING, TaskStatus.IN_PROGRESS)
+        if self.status not in startable:
+            raise ValueError(f"cannot start task in status {self.status}")
+        self.status = TaskStatus.IN_PROGRESS
+        if self.job_started is None:
+            self.job_started = datetime.now(timezone.utc)
+
+    def complete(self) -> None:
+        """Mark the Task COMPLETE and stamp ``job_completed``."""
+        self._finish(TaskStatus.COMPLETE)
+
+    def fail(self) -> None:
+        """Mark the Task FAILED and stamp ``job_completed``."""
+        self._finish(TaskStatus.FAILED)
+
+    def recalculate_from_subtasks(self, statuses: list[SubTaskStatus]) -> None:
+        """Derive this Task's status from the statuses of its SubTasks.
+
+        First matching rule wins (preserved from legacy _update_task_progress):
+          - any FAILED       → FAILED
+          - all COMPLETE     → COMPLETE
+          - any IN_PROGRESS  → IN_PROGRESS
+          - otherwise        → WAITING
+
+        FAILED and COMPLETE stamp ``job_completed`` with the current UTC time;
+        the other two outcomes clear it.
+
+        Empty list is a no-op (newly-created task with no subtasks).
+        """
+        if not statuses:
+            return
+        rolled_up = self._roll_up(statuses)
+        finished = rolled_up in (TaskStatus.FAILED, TaskStatus.COMPLETE)
+        self.status = rolled_up
+        self.job_completed = datetime.now(timezone.utc) if finished else None
+
+    @staticmethod
+    def _roll_up(statuses: list[SubTaskStatus]) -> TaskStatus:
+        """Map a non-empty list of SubTask statuses to the matching TaskStatus."""
+        if SubTaskStatus.FAILED in statuses:
+            return TaskStatus.FAILED
+        if all(s is SubTaskStatus.COMPLETE for s in statuses):
+            return TaskStatus.COMPLETE
+        if SubTaskStatus.IN_PROGRESS in statuses:
+            return TaskStatus.IN_PROGRESS
+        return TaskStatus.WAITING
+
+    def _finish(self, final_status: TaskStatus) -> None:
+        """Set ``final_status`` and stamp ``job_completed`` with the UTC time."""
+        self.status = final_status
+        self.job_completed = datetime.now(timezone.utc)
diff --git a/infrastructure/__init__.py b/infrastructure/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/infrastructure/postgres/__init__.py b/infrastructure/postgres/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/infrastructure/postgres/config.py b/infrastructure/postgres/config.py
new file mode 100644
index 00000000..c39c6f30
--- /dev/null
+++ b/infrastructure/postgres/config.py
@@ -0,0 +1,58 @@
+"""Connection settings for the Postgres database."""
+
+from dataclasses import dataclass
+from typing import Mapping
+from urllib.parse import quote
+
+
+@dataclass(frozen=True)
+class PostgresConfig:
+    """Immutable Postgres connection and connection-pool settings."""
+
+    host: str
+    port: int
+    username: str
+    password: str
+    database: str
+    driver: str = "psycopg2"
+    pool_size: int = 3
+    max_overflow: int = 5
+    pool_pre_ping: bool = True
+    pool_recycle: int = 300
+
+    def url(self) -> str:
+        """Return the ``postgresql+<driver>://`` connection URL.
+
+        The username and password are percent-encoded so that reserved
+        characters in them (``@``, ``:``, ``/``, ``%`` ...) cannot be mistaken
+        for URL delimiters. Store the raw, un-encoded credentials on this
+        object; values that are already percent-encoded would be encoded twice.
+        """
+        # safe="" also encodes "/", which quote() would otherwise leave as-is.
+        username = quote(self.username, safe="")
+        password = quote(self.password, safe="")
+        return (
+            f"postgresql+{self.driver}://"
+            f"{username}:{password}@{self.host}:{self.port}/{self.database}"
+        )
+
+    @classmethod
+    def from_env(cls, env: Mapping[str, str]) -> "PostgresConfig":
+        """Build a config from POSTGRES_* entries of ``env``.
+
+        POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USERNAME, POSTGRES_PASSWORD and
+        POSTGRES_DATABASE are required; POSTGRES_DRIVER defaults to
+        ``psycopg2``. Pool settings keep their class defaults.
+
+        Raises:
+            KeyError: if a required entry is missing.
+            ValueError: if POSTGRES_PORT is not an integer.
+        """
+        return cls(
+            host=env["POSTGRES_HOST"],
+            port=int(env["POSTGRES_PORT"]),
+            username=env["POSTGRES_USERNAME"],
+            password=env["POSTGRES_PASSWORD"],
+            database=env["POSTGRES_DATABASE"],
+            driver=env.get("POSTGRES_DRIVER", "psycopg2"),
+        )
diff --git a/infrastructure/postgres/engine.py b/infrastructure/postgres/engine.py
new file mode 100644
index 00000000..0de9efcb
--- /dev/null
+++ b/infrastructure/postgres/engine.py
@@ -0,0 +1,24 @@
+"""Factories for the database engine and sessions."""
+
+from sqlalchemy.engine import Engine
+from sqlmodel import Session, create_engine
+
+from infrastructure.postgres.config import PostgresConfig
+
+
+def make_engine(config: PostgresConfig) -> Engine:
+    """Create an engine from ``config``'s URL and pool settings."""
+    connection_url = config.url()
+    engine = create_engine(
+        connection_url,
+        pool_size=config.pool_size,
+        max_overflow=config.max_overflow,
+        pool_pre_ping=config.pool_pre_ping,
+        pool_recycle=config.pool_recycle,
+    )
+    return engine
+
+
+def make_session(engine: Engine) -> Session:
+    """Return a new Session bound to ``engine``."""
+    return Session(bind=engine)
diff --git a/infrastructure/postgres/subtask_table.py b/infrastructure/postgres/subtask_table.py
new file mode 100644
index 00000000..dec34fbf
--- /dev/null
+++ b/infrastructure/postgres/subtask_table.py
@@ -0,0 +1,21 @@
+from datetime import datetime, timezone
+from typing import ClassVar, Optional
+from uuid import UUID, uuid4
+
+from sqlmodel import Field, SQLModel
+
+
+class SubTaskRow(SQLModel, table=True):  # SQLModel mapping for the "sub_task" table
+    __tablename__: ClassVar[str] = "sub_task"  # pyright: ignore[reportIncompatibleVariableOverride]
+
+    id: UUID = Field(default_factory=uuid4, primary_key=True, index=True)
+    task_id: UUID = Field(foreign_key="tasks.id")  # parent row in "tasks"
+    job_started: Optional[datetime] = None
+    job_completed: Optional[datetime] = None
+    status: str = Field(default="waiting")  # plain text, not a database enum
+    inputs: Optional[str] = None  # JSON text written by SubTaskPostgresRepository
+    outputs: Optional[str] = None  # JSON text written by SubTaskPostgresRepository
+    cloud_logs_url: Optional[str] = None
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc)  # aware UTC "now"
+    )
diff --git a/infrastructure/postgres/task_table.py b/infrastructure/postgres/task_table.py
new file mode 100644
index 00000000..32e5450b
--- /dev/null
+++ b/infrastructure/postgres/task_table.py
@@ -0,0 +1,36 @@
+from datetime import datetime, timezone
+from typing import ClassVar, Optional
+from uuid import UUID, uuid4
+
+from sqlalchemy import Column
+from sqlalchemy import Enum as SAEnum
+from sqlmodel import Field, SQLModel
+
+from domain.tasks.tasks import Source
+
+
+class TaskRow(SQLModel, table=True):  # SQLModel mapping for the "tasks" table
+    __tablename__: ClassVar[str] = "tasks"  # pyright: ignore[reportIncompatibleVariableOverride]
+
+    id: UUID = Field(default_factory=uuid4, primary_key=True, index=True)
+    task_source: str
+    job_started: Optional[datetime] = None
+    job_completed: Optional[datetime] = None
+    status: str = Field(default="waiting")  # plain text, not a database enum
+    service: Optional[str] = None
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc)  # aware UTC "now"
+    )
+    # values_callable makes the column persist each member's value, not its name.
+    source: Optional[Source] = Field(
+        default=None,
+        sa_column=Column(
+            SAEnum(
+                Source,
+                name="source",
+                values_callable=lambda cls: [m.value for m in cls],  # pyright: ignore[reportUnknownLambdaType, reportUnknownMemberType, reportUnknownVariableType]
+            ),
+            nullable=True,
+        ),
+    )
+    source_id: Optional[str] = None
diff --git a/orchestration/__init__.py b/orchestration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/orchestration/task_orchestrator.py b/orchestration/task_orchestrator.py
new file mode 100644
index 00000000..6c67d1ce
--- /dev/null
+++ b/orchestration/task_orchestrator.py
@@ -0,0 +1,112 @@
+from typing import Any, Callable, Optional
+from uuid import UUID
+
+from domain.tasks.subtasks import SubTask
+from domain.tasks.tasks import Source, Task
+from repositories.tasks.subtask_repository import SubTaskRepository
+from repositories.tasks.task_repository import TaskRepository
+from utilities.private import private
+
+
+class TaskOrchestrator:
+    """Coordinates Task + SubTask lifecycle.
+
+    Exposes primitives (start/complete/fail_subtask) for handlers that want
+    fine-grained control, and a high-level run_subtask wrapper that owns the
+    try/except so it can replace the body of the legacy subtask_handler
+    decorator in backend/utils/subtasks.py.
+
+    Each primitive saves the SubTask, then recomputes the parent Task's
+    status from all its children.
+    """
+
+    def __init__(
+        self,
+        task_repo: TaskRepository,
+        subtask_repo: SubTaskRepository,
+    ) -> None:
+        """Keep the two repositories used for every load and save."""
+        self._tasks = task_repo
+        self._subtasks = subtask_repo
+
+    def create_task_with_subtask(
+        self,
+        *,
+        task_source: str,
+        inputs: Optional[dict[str, Any]] = None,
+        service: Optional[str] = None,
+        source: Optional[Source] = None,
+        source_id: Optional[str] = None,
+    ) -> tuple[Task, SubTask]:
+        """Create and persist a Task plus its first SubTask.
+
+        The Task is stored before the SubTask. Returns the in-memory
+        ``(task, subtask)`` pair that was handed to the repositories.
+        """
+        parent = Task.create(
+            task_source=task_source,
+            service=service,
+            source=source,
+            source_id=source_id,
+        )
+        self._tasks.create(parent)
+        child = SubTask.create(task_id=parent.id, inputs=inputs)
+        self._subtasks.create(child)
+        return parent, child
+
+    def start_subtask(
+        self, subtask_id: UUID, cloud_logs_url: Optional[str] = None
+    ) -> SubTask:
+        """Start the SubTask, persist it, and refresh the parent Task."""
+        return self._transition(subtask_id, lambda st: st.start(cloud_logs_url))
+
+    def complete_subtask(
+        self, subtask_id: UUID, result: Any = None
+    ) -> SubTask:
+        """Complete the SubTask, persist it, and refresh the parent Task."""
+        return self._transition(subtask_id, lambda st: st.complete(result))
+
+    def fail_subtask(self, subtask_id: UUID, error: BaseException) -> SubTask:
+        """Fail the SubTask, persist it, and refresh the parent Task."""
+        return self._transition(subtask_id, lambda st: st.fail(error))
+
+    def run_subtask(
+        self,
+        subtask_id: UUID,
+        work: Callable[[], Any],
+        cloud_logs_url: Optional[str] = None,
+    ) -> Any:
+        """Run ``work`` as the SubTask's job and record the outcome.
+
+        The SubTask is started first. If ``work`` raises an ``Exception`` the
+        SubTask is failed and the exception is re-raised; otherwise the
+        SubTask is completed with the return value, which is also returned.
+        """
+        self.start_subtask(subtask_id, cloud_logs_url)
+        try:
+            outcome = work()
+        except Exception as exc:
+            self.fail_subtask(subtask_id, exc)
+            raise
+        else:
+            self.complete_subtask(subtask_id, outcome)
+            return outcome
+
+    @private
+    def _transition(
+        self, subtask_id: UUID, mutate: Callable[[SubTask], None]
+    ) -> SubTask:
+        """Load a SubTask, apply ``mutate``, save it, then cascade to its Task."""
+        subtask = self._subtasks.get(subtask_id)
+        mutate(subtask)
+        self._subtasks.save(subtask)
+        self._cascade(subtask.task_id)
+        return subtask
+
+    @private
+    def _cascade(self, task_id: UUID) -> None:
+        """Recompute the Task's status from its SubTasks and save the Task."""
+        children = self._subtasks.list_by_task(task_id)
+        task = self._tasks.get(task_id)
+        task.recalculate_from_subtasks([child.status for child in children])
+        self._tasks.save(task)
diff --git a/repositories/__init__.py b/repositories/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/repositories/tasks/__init__.py b/repositories/tasks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/repositories/tasks/subtask_postgres_repository.py b/repositories/tasks/subtask_postgres_repository.py
new file mode 100644
index 00000000..affc280e
--- /dev/null
+++ b/repositories/tasks/subtask_postgres_repository.py
@@ -0,0 +1,115 @@
+import json
+from datetime import datetime, timezone
+from typing import Any, Optional
+from uuid import UUID
+
+from sqlmodel import Session, select
+
+from domain.tasks.subtasks import SubTask, SubTaskStatus
+from infrastructure.postgres.subtask_table import SubTaskRow
+from repositories.tasks.subtask_repository import SubTaskRepository
+from utilities.private import private
+
+
+class SubTaskPostgresRepository(SubTaskRepository):
+    """SubTaskRepository that reads and writes SubTaskRow records.
+
+    ``inputs`` and ``outputs`` are stored as JSON text. ``create`` and
+    ``save`` each commit the session they were given.
+    """
+
+    def __init__(self, session: Session) -> None:
+        self._session = session
+
+    def create(self, subtask: SubTask) -> SubTask:
+        """Insert ``subtask``, commit, and return it as refreshed from the row."""
+        new_row = self._to_row(subtask)
+        db = self._session
+        db.add(new_row)
+        db.commit()
+        db.refresh(new_row)
+        return self._to_domain(new_row)
+
+    def get(self, subtask_id: UUID) -> SubTask:
+        """Load one SubTask by id.
+
+        Raises:
+            ValueError: if no row has that id.
+        """
+        return self._to_domain(self._require_row(subtask_id))
+
+    def save(self, subtask: SubTask) -> None:
+        """Copy ``subtask``'s mutable state onto its existing row and commit.
+
+        ``task_id`` is left untouched; ``updated_at`` becomes the current UTC
+        time.
+
+        Raises:
+            ValueError: if no row has ``subtask.id``.
+        """
+        existing = self._require_row(subtask.id)
+        existing.status = subtask.status.value
+        existing.job_started = subtask.job_started
+        existing.job_completed = subtask.job_completed
+        existing.inputs = _dumps_or_none(subtask.inputs)
+        existing.outputs = _dumps_or_none(subtask.outputs)
+        existing.cloud_logs_url = subtask.cloud_logs_url
+        existing.updated_at = datetime.now(timezone.utc)
+        self._session.add(existing)
+        self._session.commit()
+
+    def list_by_task(self, task_id: UUID) -> list[SubTask]:
+        """Return every SubTask whose ``task_id`` matches (possibly none)."""
+        query = select(SubTaskRow).where(SubTaskRow.task_id == task_id)
+        found = self._session.exec(query).all()
+        return [self._to_domain(row) for row in found]
+
+    @private
+    def _require_row(self, subtask_id: UUID) -> SubTaskRow:
+        """Fetch the row by primary key, raising ValueError if it is absent."""
+        row = self._session.get(SubTaskRow, subtask_id)
+        if row is None:
+            raise ValueError(f"SubTask {subtask_id} not found")
+        return row
+
+    @private
+    def _to_row(self, subtask: SubTask) -> SubTaskRow:
+        """Build a new row from ``subtask``, JSON-encoding inputs and outputs."""
+        return SubTaskRow(
+            id=subtask.id,
+            task_id=subtask.task_id,
+            status=subtask.status.value,
+            inputs=_dumps_or_none(subtask.inputs),
+            outputs=_dumps_or_none(subtask.outputs),
+            cloud_logs_url=subtask.cloud_logs_url,
+            job_started=subtask.job_started,
+            job_completed=subtask.job_completed,
+        )
+
+    @private
+    def _to_domain(self, row: SubTaskRow) -> SubTask:
+        """Build a SubTask from ``row``; the stored status is lower-cased first."""
+        return SubTask(
+            id=row.id,
+            task_id=row.task_id,
+            status=SubTaskStatus(row.status.lower()),
+            inputs=_loads_or_none(row.inputs),
+            outputs=_loads_or_none(row.outputs),
+            cloud_logs_url=row.cloud_logs_url,
+            job_started=row.job_started,
+            job_completed=row.job_completed,
+        )
+
+
+def _dumps_or_none(value: Optional[dict[str, Any]]) -> Optional[str]:
+    """JSON-encode ``value``, passing None through unchanged."""
+    if value is None:
+        return None
+    return json.dumps(value)
+
+
+def _loads_or_none(s: Optional[str]) -> Optional[dict[str, Any]]:
+    """JSON-decode ``s``; None and the empty string both give None."""
+    if not s:
+        return None
+    return json.loads(s)
diff --git a/repositories/tasks/subtask_repository.py b/repositories/tasks/subtask_repository.py
new file mode 100644
index 00000000..adb36f99
--- /dev/null
+++ b/repositories/tasks/subtask_repository.py
@@ -0,0 +1,18 @@
+from abc import ABC, abstractmethod
+from uuid import UUID
+
+from domain.tasks.subtasks import SubTask
+
+
+class SubTaskRepository(ABC):  # storage interface for SubTask aggregates
+    @abstractmethod
+    def create(self, subtask: SubTask) -> SubTask: ...  # store a new SubTask
+
+    @abstractmethod
+    def get(self, subtask_id: UUID) -> SubTask: ...  # load one SubTask by id
+
+    @abstractmethod
+    def save(self, subtask: SubTask) -> None: ...  # persist changes to a stored SubTask
+
+    @abstractmethod
+    def list_by_task(self, task_id: UUID) -> list[SubTask]: ...  # SubTasks of one Task
diff --git a/repositories/tasks/task_postgres_repository.py b/repositories/tasks/task_postgres_repository.py
new file mode 100644
index 00000000..d23fe91c
--- /dev/null
+++ b/repositories/tasks/task_postgres_repository.py
@@ -0,0 +1,102 @@
+"""
+Postgres implementation of TaskRepository.
+
+NOTE: this repository owns only the `tasks` table. Unlike the legacy
+backend.app.db.functions.tasks.Tasks.TasksInterface.create_task, it does NOT
+auto-create a child SubTask. Do not rewire existing Lambda callers to this
+repo until the SubTask aggregate + TaskOrchestrator slice lands — they would
+silently lose their initial SubTask row.
+"""
+
+from datetime import datetime, timezone
+from uuid import UUID
+
+from sqlmodel import Session
+
+from domain.tasks.tasks import Task, TaskStatus
+from infrastructure.postgres.task_table import TaskRow
+from repositories.tasks.task_repository import TaskRepository
+from utilities.private import private
+
+
+class TaskPostgresRepository(TaskRepository):
+    """TaskRepository that reads and writes TaskRow records.
+
+    ``create`` and ``save`` each commit the session they were given.
+    """
+
+    def __init__(self, session: Session) -> None:
+        self._session = session
+
+    def create(self, task: Task) -> Task:
+        """Insert ``task``, commit, and return it as refreshed from the row."""
+        new_row = self._to_row(task)
+        db = self._session
+        db.add(new_row)
+        db.commit()
+        db.refresh(new_row)
+        return self._to_domain(new_row)
+
+    def get(self, task_id: UUID) -> Task:
+        """Load one Task by id.
+
+        Raises:
+            ValueError: if no row has that id.
+        """
+        return self._to_domain(self._require_row(task_id))
+
+    def save(self, task: Task) -> None:
+        """Copy ``task``'s mutable state onto its existing row and commit.
+
+        ``task_source`` is left untouched; ``updated_at`` becomes the current
+        UTC time.
+
+        Raises:
+            ValueError: if no row has ``task.id``.
+        """
+        existing = self._require_row(task.id)
+        existing.status = task.status.value
+        existing.job_started = task.job_started
+        existing.job_completed = task.job_completed
+        existing.service = task.service
+        existing.source = task.source
+        existing.source_id = task.source_id
+        existing.updated_at = datetime.now(timezone.utc)
+        self._session.add(existing)
+        self._session.commit()
+
+    @private
+    def _require_row(self, task_id: UUID) -> TaskRow:
+        """Fetch the row by primary key, raising ValueError if it is absent."""
+        row = self._session.get(TaskRow, task_id)
+        if row is None:
+            raise ValueError(f"Task {task_id} not found")
+        return row
+
+    @private
+    def _to_row(self, task: Task) -> TaskRow:
+        """Build a new row from ``task``; ``updated_at`` takes the row default."""
+        return TaskRow(
+            id=task.id,
+            task_source=task.task_source,
+            status=task.status.value,
+            service=task.service,
+            source=task.source,
+            source_id=task.source_id,
+            job_started=task.job_started,
+            job_completed=task.job_completed,
+        )
+
+    @private
+    def _to_domain(self, row: TaskRow) -> Task:
+        """Build a Task from ``row``; the stored status is lower-cased first."""
+        return Task(
+            id=row.id,
+            task_source=row.task_source,
+            status=TaskStatus(row.status.lower()),
+            service=row.service,
+            source=row.source,
+            source_id=row.source_id,
+            job_started=row.job_started,
+            job_completed=row.job_completed,
+        )
diff --git a/repositories/tasks/task_repository.py b/repositories/tasks/task_repository.py
new file mode 100644
index 00000000..8bdce0cc
--- /dev/null
+++ b/repositories/tasks/task_repository.py
@@ -0,0 +1,15 @@
+from abc import ABC, abstractmethod
+from uuid import UUID
+
+from domain.tasks.tasks import Task
+
+
+class TaskRepository(ABC):  # storage interface for Task aggregates
+    @abstractmethod
+    def create(self, task: Task) -> Task: ...  # store a new Task
+
+    @abstractmethod
+    def get(self, task_id: UUID) -> Task: ...  # load one Task by id
+
+    @abstractmethod
+    def save(self, task: Task) -> None: ...  # persist changes to a stored Task
diff --git a/run_backlog.sh b/run_backlog.sh
deleted file mode 100644
index 398e921c..00000000
--- a/run_backlog.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-backlog browser --port 6421
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/domain/__init__.py b/tests/domain/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/domain/tasks/__init__.py b/tests/domain/tasks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/domain/tasks/test_subtasks.py b/tests/domain/tasks/test_subtasks.py
new file mode 100644
index 00000000..2721d38f
--- /dev/null
+++ b/tests/domain/tasks/test_subtasks.py
@@ -0,0 +1,75 @@
+from uuid import uuid4
+
+import pytest
+
+from domain.tasks.subtasks import SubTask, SubTaskStatus
+
+
+def test_create_subtask_starts_waiting() -> None:
+    task_id = uuid4()
+    # Act
+    st = SubTask.create(task_id=task_id, inputs={"foo": "bar"})
+    # Assert: a new subtask is WAITING, with no outputs or timestamps yet
+    assert st.task_id == task_id
+    assert st.status is SubTaskStatus.WAITING
+    assert st.inputs == {"foo": "bar"}
+    assert st.outputs is None
+    assert st.job_started is None
+    assert st.job_completed is None
+
+
+def test_start_transitions_to_in_progress_and_sets_cloud_logs_url() -> None:
+    st = SubTask.create(task_id=uuid4())
+    # Act
+    st.start(cloud_logs_url="https://example/log")
+    # Assert
+    assert st.status is SubTaskStatus.IN_PROGRESS
+    assert st.cloud_logs_url == "https://example/log"
+    assert st.job_started is not None
+
+
+def test_start_is_idempotent_from_in_progress() -> None:
+    st = SubTask.create(task_id=uuid4())
+    st.start()
+    first_start = st.job_started
+    # Act: a second start() while already IN_PROGRESS
+    st.start(cloud_logs_url="https://other")
+    # Assert
+    assert st.status is SubTaskStatus.IN_PROGRESS
+    assert st.job_started == first_start  # not overwritten
+    assert st.cloud_logs_url == "https://other"
+
+
+def test_start_rejects_from_terminal_status() -> None:
+    st = SubTask.create(task_id=uuid4())
+    st.complete()  # COMPLETE is terminal: start() must refuse it
+    with pytest.raises(ValueError):
+        st.start()
+
+
+def test_complete_marks_outputs_and_job_completed() -> None:
+    st = SubTask.create(task_id=uuid4())
+    st.start()
+    # Act
+    st.complete({"uprn": "123"})
+    # Assert: the result is wrapped under the "result" key
+    assert st.status is SubTaskStatus.COMPLETE
+    assert st.outputs == {"result": {"uprn": "123"}}
+    assert st.job_completed is not None
+
+
+def test_complete_without_result_leaves_outputs_unset() -> None:
+    st = SubTask.create(task_id=uuid4())
+    st.complete()  # no result passed
+    assert st.outputs is None
+
+
+def test_fail_records_error_in_outputs() -> None:
+    st = SubTask.create(task_id=uuid4())
+    err = RuntimeError("boom")
+    # Act
+    st.fail(err)
+    # Assert: outputs carries str(err) under the "error" key
+    assert st.status is SubTaskStatus.FAILED
+    assert st.outputs == {"error": "boom"}
+    assert st.job_completed is not None
diff --git a/tests/domain/tasks/test_tasks.py b/tests/domain/tasks/test_tasks.py
new file mode 100644
index 00000000..f30c0aa1
--- /dev/null
+++ b/tests/domain/tasks/test_tasks.py
@@ -0,0 +1,104 @@
+import pytest
+
+from domain.tasks.subtasks import SubTaskStatus
+from domain.tasks.tasks import Source, Task, TaskStatus
+
+
+def test_create_task_starts_waiting() -> None:
+    # Arrange / Act
+    t = Task.create(
+        task_source="manual:test", source=Source.PORTFOLIO, source_id="abc-123"
+    )
+
+    # Assert: created WAITING, but job_started is already stamped
+    assert t.status is TaskStatus.WAITING
+    assert t.source is Source.PORTFOLIO
+    assert t.source_id == "abc-123"
+    assert t.job_started is not None
+    assert t.job_completed is None
+
+
+def test_create_task_rejects_blank_task_source() -> None:
+    with pytest.raises(ValueError, match="task_source"):
+        Task.create(task_source="   ")  # whitespace only counts as blank
+
+
+def test_start_transitions_to_in_progress() -> None:
+    t = Task.create(task_source="manual:test")
+    t.start()  # WAITING -> IN_PROGRESS
+    assert t.status is TaskStatus.IN_PROGRESS
+
+
+def test_complete_marks_job_completed() -> None:
+    t = Task.create(task_source="manual:test")
+    t.start()
+    t.complete()  # IN_PROGRESS -> COMPLETE
+    assert t.status is TaskStatus.COMPLETE
+    assert t.job_completed is not None
+
+
+def test_fail_marks_job_completed() -> None:
+    t = Task.create(task_source="manual:test")
+    t.fail()  # called straight from WAITING, without start()
+    assert t.status is TaskStatus.FAILED
+    assert t.job_completed is not None
+
+
+def test_start_rejects_from_terminal_status() -> None:
+    t = Task.create(task_source="manual:test")
+    t.complete()  # COMPLETE is terminal: start() must refuse it
+    with pytest.raises(ValueError):
+        t.start()
+
+
+def test_recalculate_with_empty_statuses_is_noop() -> None:
+    t = Task.create(task_source="manual:test")
+    original_status = t.status
+    original_completed = t.job_completed
+    # Act: no subtask statuses to derive from
+    t.recalculate_from_subtasks([])
+    # Assert: nothing changed
+    assert t.status is original_status
+    assert t.job_completed is original_completed
+
+
+def test_recalculate_all_waiting_keeps_waiting() -> None:
+    t = Task.create(task_source="manual:test")
+    t.start()  # task moved to IN_PROGRESS earlier
+    t.complete()  # then COMPLETE, with job_completed set
+    # Act
+    t.recalculate_from_subtasks([SubTaskStatus.WAITING, SubTaskStatus.WAITING])
+    # Assert: status is reset to WAITING and job_completed is cleared
+    assert t.status is TaskStatus.WAITING
+    assert t.job_completed is None
+
+
+def test_recalculate_any_in_progress_marks_in_progress() -> None:
+    t = Task.create(task_source="manual:test")
+    # Act: a mix with no FAILED and at least one IN_PROGRESS
+    t.recalculate_from_subtasks(
+        [SubTaskStatus.WAITING, SubTaskStatus.IN_PROGRESS, SubTaskStatus.COMPLETE]
+    )
+    # Assert
+    assert t.status is TaskStatus.IN_PROGRESS
+    assert t.job_completed is None
+
+
+def test_recalculate_all_complete_marks_complete() -> None:
+    t = Task.create(task_source="manual:test")
+    # Act
+    t.recalculate_from_subtasks([SubTaskStatus.COMPLETE, SubTaskStatus.COMPLETE])
+    # Assert
+    assert t.status is TaskStatus.COMPLETE
+    assert t.job_completed is not None
+
+
+def test_recalculate_any_failed_marks_failed_even_with_others() -> None:
+    t = Task.create(task_source="manual:test")
+    # Act: FAILED takes precedence over every other status
+    t.recalculate_from_subtasks(
+        [SubTaskStatus.IN_PROGRESS, SubTaskStatus.COMPLETE, SubTaskStatus.FAILED]
+    )
+    # Assert
+    assert t.status is TaskStatus.FAILED
+    assert t.job_completed is not None
diff --git a/tests/orchestration/__init__.py b/tests/orchestration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/orchestration/test_task_orchestrator.py b/tests/orchestration/test_task_orchestrator.py
new file mode 100644
index 00000000..1a48127f
--- /dev/null
+++ b/tests/orchestration/test_task_orchestrator.py
@@ -0,0 +1,151 @@
+from collections.abc import Iterator
+from dataclasses import dataclass
+
+import pytest
+from sqlmodel import Session, SQLModel, create_engine
+
+from domain.tasks.subtasks import SubTask, SubTaskStatus
+from domain.tasks.tasks import Source, TaskStatus
+from orchestration.task_orchestrator import TaskOrchestrator
+from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository
+from repositories.tasks.task_postgres_repository import TaskPostgresRepository
+
+
+@dataclass
+class Harness:  # the orchestrator plus the two repositories it was built on
+    orchestrator: TaskOrchestrator
+    tasks: TaskPostgresRepository
+    subtasks: SubTaskPostgresRepository
+
+
+@pytest.fixture
+def harness() -> Iterator[Harness]:
+    engine = create_engine("sqlite://")  # in-memory SQLite, not Postgres
+    SQLModel.metadata.create_all(engine)
+    with Session(engine) as session:
+        tasks = TaskPostgresRepository(session=session)  # both repos share a session
+        subtasks = SubTaskPostgresRepository(session=session)
+        yield Harness(
+            orchestrator=TaskOrchestrator(task_repo=tasks, subtask_repo=subtasks),
+            tasks=tasks,
+            subtasks=subtasks,
+        )
+
+
+def test_create_task_with_subtask_creates_both_in_waiting(
+    harness: Harness,
+) -> None:
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test",
+        inputs={"foo": "bar"},
+        source=Source.PORTFOLIO,
+        source_id="abc",
+    )
+    # Assert: both start WAITING and the subtask points at the new task
+    assert task.status is TaskStatus.WAITING
+    assert subtask.status is SubTaskStatus.WAITING
+    assert subtask.task_id == task.id
+    assert subtask.inputs == {"foo": "bar"}
+
+
+def test_start_subtask_cascades_to_in_progress(harness: Harness) -> None:
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+    # Act
+    started = harness.orchestrator.start_subtask(
+        subtask.id, cloud_logs_url="https://example/log"
+    )
+    # Assert: the parent task follows its only subtask
+    assert started.status is SubTaskStatus.IN_PROGRESS
+    assert started.cloud_logs_url == "https://example/log"
+    assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS
+
+
+def test_complete_subtask_cascades_to_complete(harness: Harness) -> None:
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+    harness.orchestrator.start_subtask(subtask.id)
+    # Act
+    harness.orchestrator.complete_subtask(subtask.id, {"value": 42})
+    # Assert: re-read both from the repositories
+    done_subtask = harness.subtasks.get(subtask.id)
+    done_task = harness.tasks.get(task.id)
+    assert done_subtask.outputs == {"result": {"value": 42}}
+    assert done_task.status is TaskStatus.COMPLETE
+    assert done_task.job_completed is not None
+
+
+def test_fail_subtask_cascades_to_failed(harness: Harness) -> None:
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+    # Act: fail straight from WAITING
+    harness.orchestrator.fail_subtask(subtask.id, RuntimeError("boom"))
+    # Assert: re-read both from the repositories
+    failed_subtask = harness.subtasks.get(subtask.id)
+    failed_task = harness.tasks.get(task.id)
+    assert failed_subtask.outputs == {"error": "boom"}
+    assert failed_task.status is TaskStatus.FAILED
+
+
+def test_failed_subtask_locks_task_failed_even_with_others_complete(
+    harness: Harness,
+) -> None:
+    task, first = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+    second = SubTask.create(task_id=task.id)
+    harness.subtasks.create(second)
+    # Act: one subtask completes, the other fails
+    harness.orchestrator.complete_subtask(first.id)
+    harness.orchestrator.fail_subtask(second.id, RuntimeError("nope"))
+    # Assert
+    assert harness.tasks.get(task.id).status is TaskStatus.FAILED
+
+
+def test_mixed_complete_and_in_progress_keeps_task_in_progress(
+    harness: Harness,
+) -> None:
+    task, first = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+    second = SubTask.create(task_id=task.id)
+    harness.subtasks.create(second)
+    # Act: one subtask completes while the other is still running
+    harness.orchestrator.complete_subtask(first.id)
+    harness.orchestrator.start_subtask(second.id)
+    # Assert
+    assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS
+
+
+def test_run_subtask_happy_path_returns_result_and_cascades_complete(
+    harness: Harness,
+) -> None:
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+    # Act
+    result = harness.orchestrator.run_subtask(subtask.id, work=lambda: {"answer": 42})
+    # Assert: the work's return value is passed through to the caller
+    assert result == {"answer": 42}
+    assert harness.subtasks.get(subtask.id).status is SubTaskStatus.COMPLETE
+    assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE
+
+
+def test_run_subtask_failing_work_marks_failed_and_reraises(
+    harness: Harness,
+) -> None:
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    def boom() -> None:
+        raise RuntimeError("boom")
+
+    with pytest.raises(RuntimeError, match="boom"):
+        harness.orchestrator.run_subtask(subtask.id, work=boom)
+    # Assert: the failure was recorded before the error propagated
+    assert harness.subtasks.get(subtask.id).status is SubTaskStatus.FAILED
+    assert harness.tasks.get(task.id).status is TaskStatus.FAILED
diff --git a/tests/repositories/__init__.py b/tests/repositories/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/repositories/tasks/__init__.py b/tests/repositories/tasks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/repositories/tasks/postgres/__init__.py b/tests/repositories/tasks/postgres/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py b/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py
new file mode 100644
index 00000000..ac39e089
--- /dev/null
+++ b/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py
@@ -0,0 +1,81 @@
+from collections.abc import Iterator
+from uuid import uuid4
+
+import pytest
+from sqlmodel import Session, SQLModel, create_engine
+
+# Importing the SQLModel row modules registers their tables in
+# SQLModel.metadata so create_all builds both. Imports look unused; they aren't.
+import infrastructure.postgres.subtask_table  # noqa: F401  # pyright: ignore[reportUnusedImport]
+import infrastructure.postgres.task_table  # noqa: F401  # pyright: ignore[reportUnusedImport]
+from domain.tasks.subtasks import SubTask, SubTaskStatus
+from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository
+
+
+@pytest.fixture
+def session() -> Iterator[Session]:
+    in_memory = create_engine("sqlite://")  # no path => in-memory SQLite database
+    SQLModel.metadata.create_all(in_memory)
+    with Session(in_memory) as db_session:
+        yield db_session
+
+
+def test_create_and_get_round_trip_preserves_inputs(session: Session) -> None:
+    """A freshly created SubTask reads back WAITING with its inputs, no outputs."""
+    repository = SubTaskPostgresRepository(session=session)
+    parent_id = uuid4()
+    original = SubTask.create(task_id=parent_id, inputs={"address": "68 Glendon Way"})
+
+    repository.create(original)
+    loaded = repository.get(original.id)
+
+    assert (loaded.id, loaded.task_id) == (original.id, parent_id)
+    assert loaded.status is SubTaskStatus.WAITING
+    assert loaded.inputs == {"address": "68 Glendon Way"}
+    assert loaded.outputs is None
+
+
+def test_save_persists_status_and_outputs(session: Session) -> None:
+    repository = SubTaskPostgresRepository(session=session)
+    subtask = SubTask.create(task_id=uuid4())
+    repository.create(subtask)
+
+    subtask.start(cloud_logs_url="https://example/log")
+    repository.save(subtask)
+    assert repository.get(subtask.id).status is SubTaskStatus.IN_PROGRESS
+
+    subtask.complete({"uprn": "123"})
+    repository.save(subtask)
+    reloaded = repository.get(subtask.id)
+    assert reloaded.status is SubTaskStatus.COMPLETE
+    assert reloaded.outputs == {"result": {"uprn": "123"}}  # nested under "result"
+    assert reloaded.cloud_logs_url == "https://example/log"  # kept from start()
+    assert reloaded.job_completed is not None
+
+
+def test_list_by_task_filters_by_task_id(session: Session) -> None:
+    """Listing by task id returns only that task's SubTasks."""
+    repository = SubTaskPostgresRepository(session=session)
+    first_task = uuid4()
+    second_task = uuid4()
+    for owner in (first_task, first_task, second_task):
+        repository.create(SubTask.create(task_id=owner))
+
+    for_first = repository.list_by_task(first_task)
+    for_second = repository.list_by_task(second_task)
+
+    assert len(for_first) == 2
+    assert len(for_second) == 1
+    assert all(row.task_id == first_task for row in for_first)
+    assert all(row.task_id == second_task for row in for_second)
+
+
+def test_list_by_task_returns_empty_for_unknown_task(session: Session) -> None:
+    never_stored = uuid4()  # no SubTask was created for this task id
+    assert SubTaskPostgresRepository(session=session).list_by_task(never_stored) == []
+
+
+def test_get_missing_raises(session: Session) -> None:
+    repository, unknown_id = SubTaskPostgresRepository(session=session), uuid4()
+    with pytest.raises(ValueError, match="not found"):
+        repository.get(unknown_id)
diff --git a/tests/repositories/tasks/postgres/test_task_postgres_repository.py b/tests/repositories/tasks/postgres/test_task_postgres_repository.py
new file mode 100644
index 00000000..3e1aa226
--- /dev/null
+++ b/tests/repositories/tasks/postgres/test_task_postgres_repository.py
@@ -0,0 +1,68 @@
+from collections.abc import Iterator
+from uuid import uuid4
+
+import pytest
+from sqlmodel import Session, SQLModel, create_engine
+
+from domain.tasks.tasks import Source, Task, TaskStatus
+from infrastructure.postgres.task_table import TaskRow
+from repositories.tasks.task_postgres_repository import TaskPostgresRepository
+
+
+@pytest.fixture
+def session() -> Iterator[Session]:
+    sqlite_engine = create_engine("sqlite://")
+    SQLModel.metadata.create_all(sqlite_engine)  # every table registered on SQLModel
+    with Session(sqlite_engine) as open_session:
+        yield open_session
+
+
+def test_create_and_get_round_trip(session: Session) -> None:
+    # Arrange: a Task carrying an explicit source and source_id.
+    repository = TaskPostgresRepository(session=session)
+    original = Task.create(
+        task_source="manual:test", source=Source.PORTFOLIO, source_id="abc-123"
+    )
+
+    # Act: persist it, then load it back by id.
+    repository.create(original)
+    loaded = repository.get(original.id)
+
+    # Assert: identity, initial status and both source fields survive.
+    assert loaded.id == original.id
+    assert loaded.status is TaskStatus.WAITING
+    assert loaded.source is Source.PORTFOLIO
+    assert loaded.source_id == "abc-123"
+
+
+def test_save_persists_status_transition(session: Session) -> None:
+    repository = TaskPostgresRepository(session=session)
+    task = Task.create(task_source="manual:test")
+    repository.create(task)
+
+    task.start()
+    repository.save(task)
+    assert repository.get(task.id).status is TaskStatus.IN_PROGRESS
+
+    task.complete()
+    repository.save(task)
+    reloaded = repository.get(task.id)
+    assert reloaded.status is TaskStatus.COMPLETE
+    assert reloaded.job_completed is not None  # populated once completed
+
+
+def test_get_missing_raises(session: Session) -> None:
+    repository, absent_id = TaskPostgresRepository(session=session), uuid4()
+    with pytest.raises(ValueError, match="not found"):
+        repository.get(absent_id)
+
+
+def test_get_normalises_legacy_capitalised_status(session: Session) -> None:
+    # Legacy backend rows store the status as "In Progress" (capitalised), so the
+    # row is inserted directly through the session rather than the repository.
+    legacy_row = TaskRow(task_source="manual:test", status="In Progress")
+    session.add(legacy_row)
+    session.commit()
+
+    loaded = TaskPostgresRepository(session=session).get(legacy_row.id)
+    assert loaded.status is TaskStatus.IN_PROGRESS
diff --git a/utilities/__init__.py b/utilities/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/utilities/aws_lambda/__init__.py b/utilities/aws_lambda/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/utilities/aws_lambda/default_orchestrator.py b/utilities/aws_lambda/default_orchestrator.py
new file mode 100644
index 00000000..f78886b9
--- /dev/null
+++ b/utilities/aws_lambda/default_orchestrator.py
@@ -0,0 +1,26 @@
+import os
+from collections.abc import Generator
+from contextlib import contextmanager
+
+from sqlmodel import Session
+
+from infrastructure.postgres.config import PostgresConfig
+from infrastructure.postgres.engine import make_engine
+from orchestration.task_orchestrator import TaskOrchestrator
+from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository
+from repositories.tasks.task_postgres_repository import TaskPostgresRepository
+
+
+@contextmanager
+def default_orchestrator() -> Generator[TaskOrchestrator, None, None]:
+    """Build a TaskOrchestrator whose two repositories share one new Session.
+
+    PostgresConfig.from_env receives a dict snapshot of os.environ; the
+    Session is opened here and closed when the context exits.
+    """
+    config = PostgresConfig.from_env(dict(os.environ))
+    with Session(make_engine(config)) as session:
+        tasks = TaskPostgresRepository(session=session)
+        subtasks = SubTaskPostgresRepository(session=session)
+        # Both repositories are bound to the same session object.
+        yield TaskOrchestrator(task_repo=tasks, subtask_repo=subtasks)
diff --git a/utilities/aws_lambda/subtask_handler.py b/utilities/aws_lambda/subtask_handler.py
new file mode 100644
index 00000000..64c1daa6
--- /dev/null
+++ b/utilities/aws_lambda/subtask_handler.py
@@ -0,0 +1,67 @@
+"""@subtask_handler decorator for Lambdas that operate on existing SubTasks.
+
+Translates an AWS Lambda invocation (SQS-shaped or direct) into
+TaskOrchestrator.run_subtask(...) calls.
+"""
+
+import json
+from contextlib import AbstractContextManager
+from functools import wraps
+from typing import Any, Callable, Optional, cast
+
+from utilities.aws_lambda.default_orchestrator import default_orchestrator
+from utilities.aws_lambda.subtask_trigger_body import SubtaskTriggerBody
+from orchestration.task_orchestrator import TaskOrchestrator
+
+OrchestratorCM = Callable[[], AbstractContextManager[TaskOrchestrator]]
+
+
+def subtask_handler(
+    *,
+    orchestrator_cm: Optional[OrchestratorCM] = None,
+) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """Decorate a Lambda entry point so it runs as the work of existing SubTasks.
+
+    Each record body must validate as a SubtaskTriggerBody (task_id and
+    sub_task_id); the wrapped function is then invoked as func(body, context)
+    through orchestrator.run_subtask, which owns the SubTask lifecycle and the
+    cascade into the parent Task. Nothing is caught here, so validation errors
+    and exceptions re-raised by run_subtask propagate to the Lambda runtime.
+    """
+    open_orchestrator = orchestrator_cm or default_orchestrator
+
+    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+        @wraps(func)
+        def wrapper(event: dict[str, Any], context: Any) -> None:
+            with open_orchestrator() as orchestrator:
+                for record in _records(event):
+                    payload = _parse_body(record)
+                    trigger = SubtaskTriggerBody.model_validate(payload)
+                    orchestrator.run_subtask(
+                        trigger.sub_task_id,
+                        work=lambda payload=payload: func(payload, context),
+                    )
+
+        return wrapper
+
+    return decorator
+
+
+def _parse_body(record: dict[str, Any]) -> dict[str, Any]:
+    """Return the record's payload as a dict, or {} when no dict can be obtained."""
+    payload: Any = record.get("body", record)
+    if isinstance(payload, str):
+        try:
+            payload = json.loads(payload)
+        except json.JSONDecodeError:
+            return {}
+    if not isinstance(payload, dict):
+        return {}
+    return cast(dict[str, Any], payload)
+
+
+def _records(event: dict[str, Any]) -> list[dict[str, Any]]:
+    candidates = event.get("Records")
+    if not isinstance(candidates, list):
+        return [event]  # no record list: the event itself is the only record
+    return [item for item in cast(list[Any], candidates) if isinstance(item, dict)]
diff --git a/utilities/aws_lambda/subtask_trigger_body.py b/utilities/aws_lambda/subtask_trigger_body.py
new file mode 100644
index 00000000..a6b539e5
--- /dev/null
+++ b/utilities/aws_lambda/subtask_trigger_body.py
@@ -0,0 +1,17 @@
+from uuid import UUID
+
+from pydantic import BaseModel, ConfigDict
+
+
+class SubtaskTriggerBody(BaseModel):
+    """The identifiers subtask_handler requires in every record body.
+
+    `extra="allow"` keeps any additional keys on the validated model, so a
+    work payload carrying use-case-specific fields still validates; handlers
+    do their own model_validate on the full body for those fields.
+    """
+
+    model_config = ConfigDict(extra="allow")
+
+    task_id: UUID
+    sub_task_id: UUID
diff --git a/utilities/aws_lambda/task_handler.py b/utilities/aws_lambda/task_handler.py
new file mode 100644
index 00000000..82c7198e
--- /dev/null
+++ b/utilities/aws_lambda/task_handler.py
@@ -0,0 +1,98 @@
+"""@task_handler decorator for Lambdas that own the entire pipeline.
+
+Translates an AWS Lambda invocation (SQS-shaped or direct) into
+TaskOrchestrator.create_task_with_subtask(...) + run_subtask(...).
+"""
+
+import json
+from contextlib import AbstractContextManager
+from functools import wraps
+from typing import Any, Callable, Optional, cast
+
+from utilities.aws_lambda.default_orchestrator import default_orchestrator
+from domain.tasks.tasks import Source
+from orchestration.task_orchestrator import TaskOrchestrator
+
+OrchestratorCM = Callable[[], AbstractContextManager[TaskOrchestrator]]
+
+
+def task_handler(
+    *,
+    task_source: str,
+    source: Source,
+    orchestrator_cm: Optional[OrchestratorCM] = None,
+) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """Run the wrapped function as the body of a freshly-created Task + SubTask.
+
+    For each record, creates a new Task + initial SubTask, then runs the
+    wrapped function inside orchestrator.run_subtask(...). `source_id` is
+    read from body[source.value] (silent None if absent — preserved from
+    legacy ADR-0001).
+
+    Records-style events use SQS partial-batch-failure semantics: a record whose
+    Task creation or work raises is reported via {"batchItemFailures": [...]}
+    instead of propagating. Direct invocations re-raise.
+    """
+    factory = orchestrator_cm or default_orchestrator
+
+    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+        @wraps(func)
+        def wrapper(event: dict[str, Any], context: Any) -> Any:
+            with factory() as orchestrator:
+                results: list[Any] = []
+                failures: list[dict[str, Any]] = []
+
+                for record in _records(event):
+                    body = _parse_body(record)
+                    raw_source_id = body.get(source.value)
+                    source_id = (
+                        str(raw_source_id) if raw_source_id is not None else None
+                    )
+
+                    # Creation is guarded too: one bad record must not fail the batch.
+                    try:
+                        _, subtask = orchestrator.create_task_with_subtask(
+                            task_source=task_source,
+                            inputs=body,
+                            source=source,
+                            source_id=source_id,
+                        )
+                        result = orchestrator.run_subtask(
+                            subtask.id,
+                            work=lambda body=body: func(body, context),
+                        )
+                        results.append(result)
+                    except Exception:
+                        if "Records" in event:
+                            message_id = record.get("messageId", "")
+                            failures.append({"itemIdentifier": message_id})
+                        else:
+                            raise
+
+                if "Records" in event:
+                    return {"batchItemFailures": failures}
+                return results
+
+        return wrapper
+
+    return decorator
+
+
+def _parse_body(record: dict[str, Any]) -> dict[str, Any]:
+    """Extract a dict payload from the record; unusable input yields {}."""
+    candidate = record.get("body", record)
+    if isinstance(candidate, dict):
+        return cast(dict[str, Any], candidate)
+    # Only a string body is decoded as JSON; any other non-dict type is dropped.
+    try:
+        decoded = json.loads(candidate) if isinstance(candidate, str) else None
+    except json.JSONDecodeError:
+        return {}
+    return cast(dict[str, Any], decoded) if isinstance(decoded, dict) else {}
+
+
+def _records(event: dict[str, Any]) -> list[dict[str, Any]]:
+    batch = event.get("Records")
+    if not isinstance(batch, list):
+        return [event]  # direct invocation shape: one record, the event itself
+    return [entry for entry in cast(list[Any], batch) if isinstance(entry, dict)]
diff --git a/utilities/private.py b/utilities/private.py
new file mode 100644
index 00000000..77a70578
--- /dev/null
+++ b/utilities/private.py
@@ -0,0 +1,33 @@
+import inspect
+from typing import Any, Callable
+
+
+class private:
+    """Decorator: access raises unless the caller's `self` is an owner instance."""
+
+    func: Callable[..., Any]
+    name: str
+    owner: type  # assigned by __set_name__, i.e. when used inside a class body
+
+    def __init__(self, func: Callable[..., Any]) -> None:
+        self.func = func
+        self.name = getattr(func, "__name__", "<anonymous>")
+
+    def __set_name__(self, owner: type, name: str) -> None:
+        self.owner = owner
+
+    def __get__(self, instance: Any, owner: type) -> Callable[..., Any]:
+        # Only the caller's frame is bound to a local: storing currentframe()
+        # itself would make this frame reference itself (see the inspect docs).
+        caller_frame = getattr(inspect.currentframe(), "f_back", None)
+        if caller_frame is None:
+            raise RuntimeError("cannot inspect caller frame")
+        caller_self = caller_frame.f_locals.get("self")
+
+        if not isinstance(caller_self, self.owner):
+            raise RuntimeError(
+                f"{self.owner.__name__}.{self.name} is private; "
+                f"called from {caller_frame.f_code.co_name}"
+            )
+
+        return getattr(self.func, "__get__")(instance, owner)

From 6198d7a46db83ecf2b74e2b260fd0b0923010b39 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 19 May 2026 16:45:47 +0000
Subject: [PATCH 70/91] postcode_splitter: pure domain (UserAddress,
 sanitise_postcode, postcode_batching)

Slice 1/6 of the postcode_splitter refactor (Hestia-Homes/Model#1100).
Introduces the pure-domain foundation under domain/, with no AWS, Postgres,
or pandas. UserAddress is a frozen dataclass that sanitises its postcode in
__post_init__ via the canonical sanitise_postcode helper, and
iter_postcode_grouped_batches preserves the legacy splitter's batching
invariants (group-by-postcode in insertion order, never split a group,
oversize single-postcode groups dispatched whole, final flush). Updates
UBIQUITOUS_LANGUAGE.md so the User Address term covers both the dataclass
sense (preferred in domain code) and the raw upstream-string sense.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 UBIQUITOUS_LANGUAGE.md                        |  4 +-
 domain/addresses/__init__.py                  |  0
 domain/addresses/postcode_batching.py         | 87 +++++++++++++++++
 domain/addresses/user_address.py              | 36 +++++++
 domain/postcodes/__init__.py                  |  0
 domain/postcodes/sanitise.py                  | 23 +++++
 tests/domain/addresses/__init__.py            |  0
 .../addresses/test_postcode_batching.py       | 93 +++++++++++++++++++
 tests/domain/addresses/test_user_address.py   | 45 +++++++++
 tests/domain/postcodes/__init__.py            |  0
 tests/domain/postcodes/test_sanitise.py       | 28 ++++++
 11 files changed, 314 insertions(+), 2 deletions(-)
 create mode 100644 domain/addresses/__init__.py
 create mode 100644 domain/addresses/postcode_batching.py
 create mode 100644 domain/addresses/user_address.py
 create mode 100644 domain/postcodes/__init__.py
 create mode 100644 domain/postcodes/sanitise.py
 create mode 100644 tests/domain/addresses/__init__.py
 create mode 100644 tests/domain/addresses/test_postcode_batching.py
 create mode 100644 tests/domain/addresses/test_user_address.py
 create mode 100644 tests/domain/postcodes/__init__.py
 create mode 100644 tests/domain/postcodes/test_sanitise.py

diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md
index 1765cbc8..c3074c02 100644
--- a/UBIQUITOUS_LANGUAGE.md
+++ b/UBIQUITOUS_LANGUAGE.md
@@ -23,7 +23,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve
 |------|------------|------------------|
 | **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" |
 | **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" |
-| **User Address** | A free-text address string provided by a user or imported from a customer dataset, before any normalisation or matching. | "user input", "raw address", "user_inputed_address" |
+| **User Address** | A structured dataclass (`domain.addresses.user_address.UserAddress`) capturing a customer-supplied address: a free-text `user_address` line, a canonical `postcode` (sanitised on construction), and an optional `internal_reference`. The bare string sense -- the raw free-text address line as it arrives from upstream ingestion, before being wrapped -- remains valid when discussing CSV columns, API payloads, or other upstream contexts; in domain code, prefer the dataclass. | "user input", "raw address", "user_inputed_address" |
 | **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" |
 
 ## Address Matching
@@ -72,7 +72,7 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve
 
 ## Flagged ambiguities
 
-- **"address"** appears as both the raw **User Address** (free-text from customer data) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1".
+- **"address"** appears as both the raw **User Address** (free-text from customer data, or the structured `UserAddress` dataclass that wraps it) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". Within `domain/`, **User Address** specifically means the `UserAddress` dataclass; in upstream ingestion contexts (CSV columns, SQS payloads) it can still mean the raw string sense.
 - **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments.
 - **"user_inputed_address"** in `backend/address2UPRN/main.py` is a misspelling and a synonym for **User Address** — the canonical term. New code should use `user_address`.
 - **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter.
diff --git a/domain/addresses/__init__.py b/domain/addresses/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py
new file mode 100644
index 00000000..209e0784
--- /dev/null
+++ b/domain/addresses/postcode_batching.py
@@ -0,0 +1,87 @@
+"""Pure-Python postcode-grouped batching.
+
+This module preserves the batching invariants from the legacy postcode
+splitter (``backend/postcode_splitter/main.py``) without touching pandas,
+S3, or SQS:
+
+  * Addresses are grouped by **Postcode** in *insertion order* -- the first
+    Postcode seen produces the first group.
+  * A Postcode group is never split across two batches.
+  * If a single Postcode group is larger than ``max_batch_size``, it is
+    flushed as its own oversize batch (any buffered groups go out first,
+    untouched).
+  * Adding a group that would push the buffer past ``max_batch_size`` first
+    flushes the existing buffer, then starts a new buffer with the group.
+  * Whatever remains in the buffer after the input is exhausted is flushed
+    as the final batch.
+  * Empty input yields no batches.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable, Iterator
+
+from domain.addresses.user_address import UserAddress
+
+
+def iter_postcode_grouped_batches(
+    addresses: Iterable[UserAddress],
+    *,
+    max_batch_size: int = 500,
+) -> Iterator[list[UserAddress]]:
+    """Pack Postcode groups into batches without ever splitting a group.
+
+    Args:
+        addresses: The :class:`UserAddress` items to batch. They are grouped
+            by ``postcode`` up front; groups keep first-seen order and each
+            group keeps the input order of its members.
+        max_batch_size: Target ceiling for the number of addresses in one
+            batch. A group that alone reaches this size is emitted as its own
+            batch, so a batch can exceed the ceiling.
+
+    Yields:
+        Non-empty lists of ``UserAddress``.
+
+    Raises:
+        ValueError: If ``max_batch_size`` is below 1 (raised on first iteration).
+    """
+    if max_batch_size < 1:
+        raise ValueError("max_batch_size must be >= 1")
+
+    # Grouping consumes the whole input before the first batch is produced.
+    pending: list[UserAddress] = []
+    for members in _group_by_postcode_in_order(addresses).values():
+        oversize = len(members) >= max_batch_size
+        overflow = len(pending) + len(members) > max_batch_size
+
+        # Either condition means the buffered groups must leave before this
+        # group is handled, so they are never mixed with an oversize group
+        # and never pushed past the cap by a regular one.
+        if pending and (oversize or overflow):
+            yield pending
+            pending = []
+
+        if oversize:
+            # A group at or above the cap travels alone and is never buffered
+            # (the legacy ``if group_len >= batch_size`` branch).
+            yield members
+        else:
+            pending.extend(members)
+
+    # End of input: emit whatever is still buffered.
+    if pending:
+        yield pending
+
+
+def _group_by_postcode_in_order(
+    addresses: Iterable[UserAddress],
+) -> dict[str, list[UserAddress]]:
+    """Bucket addresses under their ``postcode``, keyed in first-seen order."""
+    # Plain dicts keep insertion order (guaranteed since Python 3.7).
+    by_postcode: dict[str, list[UserAddress]] = {}
+    for item in addresses:
+        bucket = by_postcode.get(item.postcode)
+        if bucket is None:
+            bucket = by_postcode[item.postcode] = []
+        bucket.append(item)
+    return by_postcode
diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py
new file mode 100644
index 00000000..e48dfdec
--- /dev/null
+++ b/domain/addresses/user_address.py
@@ -0,0 +1,36 @@
+"""The :class:`UserAddress` value object.
+
+A frozen dataclass capturing the splitter's domain entity: the raw input
+address line, a sanitised postcode, and an optional internal reference from
+the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no
+caller can construct an instance with an un-normalised postcode.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Optional
+
+from domain.postcodes.sanitise import sanitise_postcode
+
+
+@dataclass(frozen=True)
+class UserAddress:
+    """Immutable record of one customer-supplied address.
+
+    Attributes:
+        user_address: Free-text address line; stored exactly as passed in.
+        postcode: Replaced during construction by
+            ``sanitise_postcode(postcode)``, so the stored value is always
+            the canonical form whatever the caller supplied.
+        internal_reference: Optional customer-side identifier; defaults to
+            ``None``.
+    """
+
+    user_address: str
+    postcode: str
+    internal_reference: Optional[str] = None
+
+    def __post_init__(self) -> None:
+        canonical = sanitise_postcode(self.postcode)
+        object.__setattr__(self, "postcode", canonical)  # frozen: no plain assignment
diff --git a/domain/postcodes/__init__.py b/domain/postcodes/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/domain/postcodes/sanitise.py b/domain/postcodes/sanitise.py
new file mode 100644
index 00000000..94b0dcf7
--- /dev/null
+++ b/domain/postcodes/sanitise.py
@@ -0,0 +1,23 @@
+"""Canonical postcode sanitisation for the domain layer.
+
+The legacy postcode_splitter normalises postcodes inline with
+``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes
+that operation to a pure, reusable function so the same canonical form is
+applied wherever a postcode crosses a domain boundary -- including
+:class:`domain.addresses.user_address.UserAddress` construction and future
+migrations.
+"""
+
+from __future__ import annotations
+
+
+def sanitise_postcode(s: str) -> str:
+    """Canonicalise a postcode: drop every whitespace character, then uppercase.
+
+    ``"sw1a 1aa"`` becomes ``"SW1A1AA"``. For postcodes separated by plain
+    spaces this equals the legacy splitter's
+    ``str.upper().str.replace(" ", "")``; tabs and newlines are removed as well.
+    """
+    fragments = s.split()  # no separator: splits on runs of any whitespace
+    squeezed = "".join(fragments)
+    return squeezed.upper()
diff --git a/tests/domain/addresses/__init__.py b/tests/domain/addresses/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py
new file mode 100644
index 00000000..2dac46cc
--- /dev/null
+++ b/tests/domain/addresses/test_postcode_batching.py
@@ -0,0 +1,93 @@
+import pytest
+
+from domain.addresses.postcode_batching import iter_postcode_grouped_batches
+from domain.addresses.user_address import UserAddress
+
+
+def _addrs(postcode: str, n: int) -> list[UserAddress]:
+    """Return ``n`` UserAddress objects for one postcode, numbered from 0."""
+    street_lines = (f"{number} {postcode} Street" for number in range(n))
+    return [
+        UserAddress(user_address=line, postcode=postcode) for line in street_lines
+    ]
+
+
+def test_empty_input_yields_no_batches() -> None:
+    assert not list(iter_postcode_grouped_batches([]))
+
+
+def test_single_batch_under_cap() -> None:
+    # Five addresses against a cap of 500: everything fits in the one batch.
+    everything = [*_addrs("AA1 1AA", 3), *_addrs("BB2 2BB", 2)]
+    (only_batch,) = iter_postcode_grouped_batches(everything, max_batch_size=500)
+    assert only_batch == everything
+
+
+def test_multiple_postcodes_packed_into_one_batch_up_to_cap() -> None:
+    # 3 + 2 addresses land exactly on the cap of 5; reaching the cap is not
+    # overflow, so both groups must share one batch.
+    exact_fit = [*_addrs("AA1 1AA", 3), *_addrs("BB2 2BB", 2)]
+    produced = iter_postcode_grouped_batches(exact_fit, max_batch_size=5)
+    sizes = [len(batch) for batch in produced]
+    assert sizes == [5]
+
+
+def test_flush_on_overflow_before_adding_next_postcode() -> None:
+    # With a cap of 5, three buffered addresses plus a group of three would
+    # make six, so the buffer goes out first and the second group opens a
+    # new batch.
+    combined = [*_addrs("AA1 1AA", 3), *_addrs("BB2 2BB", 3)]
+    produced = iter_postcode_grouped_batches(combined, max_batch_size=5)
+    postcodes = [[item.postcode for item in batch] for batch in produced]
+    assert postcodes == [["AA11AA"] * 3, ["BB22BB"] * 3]
+
+
+def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None:
+    # Seven addresses share one postcode while the cap is 5: they must still
+    # leave together, as a single batch of seven.
+    oversize_group = _addrs("AA1 1AA", 7)
+    produced = iter_postcode_grouped_batches(oversize_group, max_batch_size=5)
+    sizes = [len(batch) for batch in produced]
+    assert sizes == [7]
+
+
+def test_oversize_group_flushes_existing_buffer_first() -> None:
+    # A buffered group of 2, then an oversize group of 7, then a trailing
+    # single address: the buffer must be emitted before the oversize batch,
+    # and the tail must not be merged into it.
+    stream = [
+        *_addrs("AA1 1AA", 2),
+        *_addrs("BB2 2BB", 7),
+        *_addrs("CC3 3CC", 1),
+    ]
+    produced = iter_postcode_grouped_batches(stream, max_batch_size=5)
+    postcodes = [[item.postcode for item in batch] for batch in produced]
+    expected = [["AA11AA"] * 2, ["BB22BB"] * 7, ["CC33CC"]]
+    assert postcodes == expected
+
+
+def test_final_flush_yields_remaining_buffer() -> None:
+    # The cap of 500 is never reached, so the end-of-input flush is the only yield.
+    leftovers = [*_addrs("AA1 1AA", 2), *_addrs("BB2 2BB", 2)]
+    produced = iter_postcode_grouped_batches(leftovers, max_batch_size=500)
+    assert list(produced) == [leftovers]
+
+
+def test_postcode_grouping_preserves_first_seen_order() -> None:
+    # ZZ9 9ZZ is seen first but sorts last, so an alphabetical grouping would
+    # put AA1 1AA ahead of it; interleaving the input also checks that the
+    # members of each postcode are brought back together.
+    z_first, z_second = _addrs("ZZ9 9ZZ", 2)
+    a_first, a_second = _addrs("AA1 1AA", 2)
+    interleaved = [z_first, a_first, z_second, a_second]
+
+    # The default cap of 500 keeps all four addresses in one batch.
+    (only_batch,) = iter_postcode_grouped_batches(interleaved)
+
+    expected = ["ZZ99ZZ", "ZZ99ZZ", "AA11AA", "AA11AA"]
+    assert [item.postcode for item in only_batch] == expected
+
+
+def test_invalid_max_batch_size_raises() -> None:
+    with pytest.raises(ValueError, match="max_batch_size"):
+        next(iter_postcode_grouped_batches([], max_batch_size=0))  # generator is lazy
diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py
new file mode 100644
index 00000000..e722077d
--- /dev/null
+++ b/tests/domain/addresses/test_user_address.py
@@ -0,0 +1,45 @@
+import dataclasses
+
+import pytest
+
+from domain.addresses.user_address import UserAddress
+
+
+def test_user_address_sanitises_postcode_on_construction() -> None:
+    addr = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
+    assert addr.postcode == "SW1A1AA"
+
+
+def test_user_address_preserves_user_address_verbatim() -> None:
+    # The free-text user_address string is intentionally NOT normalised --
+    # only the postcode is canonicalised at the boundary.
+    addr = UserAddress(user_address="  1 The   Street  ", postcode="sw1a 1aa")
+    assert addr.user_address == "  1 The   Street  "
+
+
+def test_user_address_internal_reference_defaults_to_none() -> None:
+    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    assert addr.internal_reference is None
+
+
+def test_user_address_internal_reference_accepted() -> None:
+    addr = UserAddress(
+        user_address="1 The Street",
+        postcode="SW1A1AA",
+        internal_reference="cust-42",
+    )
+    assert addr.internal_reference == "cust-42"
+
+
+def test_user_address_is_frozen() -> None:
+    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    with pytest.raises(dataclasses.FrozenInstanceError):
+        addr.postcode = "OTHER"  # type: ignore[misc]
+
+
+def test_user_address_equality_uses_sanitised_postcode() -> None:
+    # Two instances constructed with different surface forms of the same
+    # postcode must compare equal because sanitisation runs eagerly.
+    a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
+    b = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    assert a == b
diff --git a/tests/domain/postcodes/__init__.py b/tests/domain/postcodes/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/domain/postcodes/test_sanitise.py b/tests/domain/postcodes/test_sanitise.py
new file mode 100644
index 00000000..edd1679c
--- /dev/null
+++ b/tests/domain/postcodes/test_sanitise.py
@@ -0,0 +1,28 @@
+from domain.postcodes.sanitise import sanitise_postcode
+
+
+def test_sanitise_uppercases() -> None:
+    assert sanitise_postcode("sw1a1aa") == "SW1A1AA"
+
+
+def test_sanitise_strips_internal_spaces() -> None:
+    assert sanitise_postcode("sw1a 1aa") == "SW1A1AA"
+
+
+def test_sanitise_strips_leading_and_trailing_whitespace() -> None:
+    assert sanitise_postcode("  sw1a 1aa  ") == "SW1A1AA"
+
+
+def test_sanitise_strips_tabs_and_newlines() -> None:
+    # CSV ingestion occasionally introduces stray whitespace characters; the
+    # canonical form must absorb them just like literal spaces.
+    assert sanitise_postcode("sw1a\t1aa\n") == "SW1A1AA"
+
+
+def test_sanitise_already_canonical_is_idempotent() -> None:
+    assert sanitise_postcode("SW1A1AA") == "SW1A1AA"
+    assert sanitise_postcode(sanitise_postcode("sw1a 1aa")) == "SW1A1AA"
+
+
+def test_sanitise_empty_string() -> None:
+    assert sanitise_postcode("") == ""

From 7b00a33cd242e9959ac47e4e207d67477d53b8a2 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 19 May 2026 17:12:21 +0000
Subject: [PATCH 71/91] infrastructure: typed S3/SQS clients (S3Client,
 CsvS3Client, SqsClient, Address2UprnQueueClient)

Slice 3/6 of the postcode_splitter refactor (Hestia-Homes/Model#1101).
Introduces a thin typed infrastructure layer wrapping boto3 for the AWS
side of the splitter. S3Client/SqsClient are bucket-/queue-bound byte
adapters; CsvS3Client subclasses S3Client to round-trip CSV row dicts
via the existing parse_s3_uri helper in utils/s3.py; Address2UprnQueueClient
subclasses SqsClient to publish the typed {task_id, sub_task_id, s3_uri}
fan-out body the downstream consumer expects. moto[s3,sqs] is pulled into
test.requirements.txt and the new tests/infrastructure/ suite exercises
each client against the moto backend (S3 round-trip, CSV round-trip,
SQS send + body inspection, typed publish + body inspection). pyright
--strict is clean on the new modules.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 infrastructure/address2uprn_queue_client.py   | 27 ++++++++
 infrastructure/csv_s3_client.py               | 46 +++++++++++++
 infrastructure/s3_client.py                   | 31 +++++++++
 infrastructure/sqs_client.py                  | 28 ++++++++
 test.requirements.txt                         |  3 +-
 tests/infrastructure/__init__.py              | 17 +++++
 tests/infrastructure/conftest.py              | 32 +++++++++
 .../test_address2uprn_queue_client.py         | 65 +++++++++++++++++++
 tests/infrastructure/test_csv_s3_client.py    | 43 ++++++++++++
 tests/infrastructure/test_s3_client.py        | 31 +++++++++
 tests/infrastructure/test_sqs_client.py       | 38 +++++++++++
 11 files changed, 360 insertions(+), 1 deletion(-)
 create mode 100644 infrastructure/address2uprn_queue_client.py
 create mode 100644 infrastructure/csv_s3_client.py
 create mode 100644 infrastructure/s3_client.py
 create mode 100644 infrastructure/sqs_client.py
 create mode 100644 tests/infrastructure/__init__.py
 create mode 100644 tests/infrastructure/conftest.py
 create mode 100644 tests/infrastructure/test_address2uprn_queue_client.py
 create mode 100644 tests/infrastructure/test_csv_s3_client.py
 create mode 100644 tests/infrastructure/test_s3_client.py
 create mode 100644 tests/infrastructure/test_sqs_client.py

diff --git a/infrastructure/address2uprn_queue_client.py b/infrastructure/address2uprn_queue_client.py
new file mode 100644
index 00000000..d81e2dd1
--- /dev/null
+++ b/infrastructure/address2uprn_queue_client.py
@@ -0,0 +1,27 @@
+from uuid import UUID
+
+from infrastructure.sqs_client import SqsClient
+
+
+class Address2UprnQueueClient(SqsClient):
+    """Typed publisher for Address-to-UPRN fan-out messages on one SQS queue.
+
+    Every message body has the shape required by the downstream consumer:
+        ``{"task_id": str, "sub_task_id": str, "s3_uri": str}``
+    """
+
+    def publish(
+        self,
+        *,
+        parent_task_id: UUID,
+        child_subtask_id: UUID,
+        s3_uri: str,
+    ) -> str:
+        """Publish one fan-out message and return its SQS ``MessageId``."""
+        # ``send`` JSON-encodes the body, so the UUIDs are stringified first.
+        message = {
+            "task_id": str(parent_task_id),
+            "sub_task_id": str(child_subtask_id),
+            "s3_uri": s3_uri,
+        }
+        return self.send(message)
diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py
new file mode 100644
index 00000000..5163705b
--- /dev/null
+++ b/infrastructure/csv_s3_client.py
@@ -0,0 +1,46 @@
+import csv
+from io import StringIO
+
+from infrastructure.s3_client import S3Client
+from utils.s3 import parse_s3_uri
+
+
+class CsvS3Client(S3Client):
+    """Bucket-bound S3 client that reads and writes CSV objects as row dicts.
+
+    A CSV is modelled as ``list[dict[str, str]]``: one dict per data row,
+    keyed by the header names, mirroring what :class:`csv.DictReader`
+    yields and :class:`csv.DictWriter` consumes.
+    """
+
+    def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
+        """Download the CSV at ``s3_uri`` and return its rows as dicts.
+
+        Raises :class:`ValueError` when the URI names a bucket other than
+        the one this client is bound to, so a cross-bucket read fails loudly
+        instead of fetching a same-named key from the wrong bucket.
+        """
+        bucket, key = parse_s3_uri(s3_uri)
+        if bucket != self.bucket:
+            raise ValueError(
+                f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}"
+            )
+        # "utf-8-sig" drops a leading byte-order mark if one is present.
+        decoded = self.get_object(key).decode("utf-8-sig")
+        records = csv.DictReader(StringIO(decoded))
+        return [dict(record) for record in records]
+
+    def save_rows(self, rows: list[dict[str, str]], key: str) -> str:
+        """Write ``rows`` as a CSV object at ``key``; return its ``s3://`` URI.
+
+        The header is taken from the keys of the first row, so an empty
+        ``rows`` list is rejected with :class:`ValueError`.
+        """
+        if not rows:
+            raise ValueError("Cannot save an empty rows list: header is unknown")
+        header = list(rows[0])
+        out = StringIO()
+        csv_writer = csv.DictWriter(out, fieldnames=header)
+        csv_writer.writeheader()
+        csv_writer.writerows(rows)
+        return self.put_object(key, out.getvalue().encode("utf-8"))
diff --git a/infrastructure/s3_client.py b/infrastructure/s3_client.py
new file mode 100644
index 00000000..9e772881
--- /dev/null
+++ b/infrastructure/s3_client.py
@@ -0,0 +1,31 @@
+from typing import Any
+
+
+class S3Client:
+    """Minimal typed facade over a boto3 S3 client, fixed to one bucket.
+
+    Only raw byte reads and writes are offered here; format-aware helpers
+    (CSV, JSON, etc.) belong in subclasses such as :class:`CsvS3Client`.
+    Keys are passed to boto3 unchanged.
+    """
+
+    def __init__(self, boto_s3_client: Any, bucket: str) -> None:
+        self._bucket = bucket
+        self._client = boto_s3_client
+
+    @property
+    def bucket(self) -> str:
+        """Name of the bucket every operation on this client targets."""
+        return self._bucket
+
+    def get_object(self, key: str) -> bytes:
+        """Fetch the object stored at ``key`` and return its raw bytes."""
+        obj: dict[str, Any] = self._client.get_object(Bucket=self._bucket, Key=key)
+        payload: bytes = obj["Body"].read()
+        return payload
+
+    def put_object(self, key: str, body: bytes) -> str:
+        """Store ``body`` at ``key`` and return the object's ``s3://`` URI."""
+        bucket = self._bucket
+        self._client.put_object(Bucket=bucket, Key=key, Body=body)
+        return f"s3://{bucket}/{key}"
diff --git a/infrastructure/sqs_client.py b/infrastructure/sqs_client.py
new file mode 100644
index 00000000..fb053680
--- /dev/null
+++ b/infrastructure/sqs_client.py
@@ -0,0 +1,28 @@
+import json
+from typing import Any
+
+
+class SqsClient:
+    """Minimal typed facade over a boto3 SQS client, fixed to one queue URL.
+
+    Callers hand over plain dictionaries; JSON encoding happens inside
+    :meth:`send`. Typed publishers such as :class:`Address2UprnQueueClient`
+    are layered on top of that contract.
+    """
+
+    def __init__(self, boto_sqs_client: Any, queue_url: str) -> None:
+        self._queue_url = queue_url
+        self._client = boto_sqs_client
+
+    @property
+    def queue_url(self) -> str:
+        return self._queue_url
+
+    def send(self, body: dict[str, Any]) -> str:
+        """Encode ``body`` as JSON, enqueue it, and return the SQS ``MessageId``."""
+        payload = json.dumps(body)
+        reply: dict[str, Any] = self._client.send_message(
+            QueueUrl=self._queue_url, MessageBody=payload
+        )
+        message_id: str = reply["MessageId"]
+        return message_id
diff --git a/test.requirements.txt b/test.requirements.txt
index 7fdd7dc4..26125034 100644
--- a/test.requirements.txt
+++ b/test.requirements.txt
@@ -9,4 +9,5 @@ hubspot-api-client
 fuzzywuzzy
 pymupdf
 playwright==1.58.0
-msal
\ No newline at end of file
+msal
+moto[s3,sqs]
\ No newline at end of file
diff --git a/tests/infrastructure/__init__.py b/tests/infrastructure/__init__.py
new file mode 100644
index 00000000..3478bda9
--- /dev/null
+++ b/tests/infrastructure/__init__.py
@@ -0,0 +1,17 @@
+from typing import Any
+
+import boto3
+
+REGION = "us-east-1"
+
+
+def make_boto_client(service_name: str) -> Any:
+    """Construct a boto3 client typed as ``Any``.
+
+    boto3's overloaded ``client`` signature uses ``Literal[...]`` per service
+    in the installed stubs, which forces every call site to satisfy
+    ``reportArgumentType`` and ``reportUnknownMemberType`` under strict
+    pyright. Centralising the cast keeps each test file clean.
+    """
+    factory: Any = boto3.client  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
+    return factory(service_name, region_name=REGION)
diff --git a/tests/infrastructure/conftest.py b/tests/infrastructure/conftest.py
new file mode 100644
index 00000000..7ed2fdd6
--- /dev/null
+++ b/tests/infrastructure/conftest.py
@@ -0,0 +1,32 @@
+import os
+from collections.abc import Iterator
+from typing import Optional
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def _aws_creds() -> Iterator[None]:  # pyright: ignore[reportUnusedFunction]
+    """Install dummy AWS credentials for the duration of each test.
+
+    Keeps botocore from probing the host environment. Autouse, so every
+    test under ``tests/infrastructure/`` gets it; prior values (or their
+    absence) are restored on teardown.
+    """
+    stubs: dict[str, str] = {
+        "AWS_ACCESS_KEY_ID": "testing",
+        "AWS_SECRET_ACCESS_KEY": "testing",
+        "AWS_SESSION_TOKEN": "testing",
+        "AWS_DEFAULT_REGION": "us-east-1",
+    }
+    # Snapshot first so teardown can tell "was unset" apart from "was set".
+    saved: dict[str, Optional[str]] = {name: os.environ.get(name) for name in stubs}
+    os.environ.update(stubs)
+    try:
+        yield
+    finally:
+        for name, old in saved.items():
+            if old is not None:
+                os.environ[name] = old
+            else:
+                os.environ.pop(name, None)
diff --git a/tests/infrastructure/test_address2uprn_queue_client.py b/tests/infrastructure/test_address2uprn_queue_client.py
new file mode 100644
index 00000000..b4114742
--- /dev/null
+++ b/tests/infrastructure/test_address2uprn_queue_client.py
@@ -0,0 +1,65 @@
+import json
+from collections.abc import Iterator
+from typing import Any, cast
+from uuid import uuid4
+
+import pytest
+from moto import mock_aws
+
+from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
+from tests.infrastructure import make_boto_client
+
+
+@pytest.fixture
+def queue_setup() -> Iterator[tuple[Address2UprnQueueClient, Any, str]]:
+    with mock_aws():
+        boto_client = make_boto_client("sqs")
+        queue: dict[str, Any] = boto_client.create_queue(
+            QueueName="address2uprn-queue"
+        )
+        queue_url = cast(str, queue["QueueUrl"])
+        yield (
+            Address2UprnQueueClient(boto_client, queue_url),
+            boto_client,
+            queue_url,
+        )
+
+
+def test_publish_returns_message_id(
+    queue_setup: tuple[Address2UprnQueueClient, Any, str],
+) -> None:
+    client, _boto, _url = queue_setup
+    message_id = client.publish(
+        parent_task_id=uuid4(),
+        child_subtask_id=uuid4(),
+        s3_uri="s3://my-bucket/path/to/chunk.csv",
+    )
+    assert isinstance(message_id, str)
+    assert message_id
+
+
+def test_publish_body_uses_typed_shape(
+    queue_setup: tuple[Address2UprnQueueClient, Any, str],
+) -> None:
+    client, boto_client, queue_url = queue_setup
+    parent_id = uuid4()
+    child_id = uuid4()
+    s3_uri = "s3://my-bucket/path/to/chunk.csv"
+
+    client.publish(
+        parent_task_id=parent_id,
+        child_subtask_id=child_id,
+        s3_uri=s3_uri,
+    )
+
+    received: dict[str, Any] = boto_client.receive_message(
+        QueueUrl=queue_url, MaxNumberOfMessages=1
+    )
+    messages: list[dict[str, Any]] = received["Messages"]
+    assert len(messages) == 1
+    body = json.loads(messages[0]["Body"])
+    assert body == {
+        "task_id": str(parent_id),
+        "sub_task_id": str(child_id),
+        "s3_uri": s3_uri,
+    }
diff --git a/tests/infrastructure/test_csv_s3_client.py b/tests/infrastructure/test_csv_s3_client.py
new file mode 100644
index 00000000..4b9fc199
--- /dev/null
+++ b/tests/infrastructure/test_csv_s3_client.py
@@ -0,0 +1,43 @@
+from collections.abc import Iterator
+
+import pytest
+from moto import mock_aws
+
+from infrastructure.csv_s3_client import CsvS3Client
+from tests.infrastructure import make_boto_client
+
+BUCKET = "csv-bucket"
+
+
+@pytest.fixture
+def csv_client() -> Iterator[CsvS3Client]:
+    with mock_aws():
+        boto_client = make_boto_client("s3")
+        boto_client.create_bucket(Bucket=BUCKET)
+        yield CsvS3Client(boto_client, BUCKET)
+
+
+def test_save_rows_returns_s3_uri(csv_client: CsvS3Client) -> None:
+    rows = [{"address": "1 High St", "postcode": "AB1 2CD"}]
+    uri = csv_client.save_rows(rows, "uploads/addresses.csv")
+    assert uri == f"s3://{BUCKET}/uploads/addresses.csv"
+
+
+def test_round_trip_preserves_rows(csv_client: CsvS3Client) -> None:
+    rows = [
+        {"address": "1 High St", "postcode": "AB1 2CD"},
+        {"address": "2 Low St", "postcode": "XY9 8ZW"},
+    ]
+    uri = csv_client.save_rows(rows, "uploads/addresses.csv")
+    fetched = csv_client.read_rows(uri)
+    assert fetched == rows
+
+
+def test_save_rows_rejects_empty_list(csv_client: CsvS3Client) -> None:
+    with pytest.raises(ValueError, match="empty"):
+        csv_client.save_rows([], "uploads/empty.csv")
+
+
+def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None:
+    with pytest.raises(ValueError, match="does not match client bucket"):
+        csv_client.read_rows("s3://other-bucket/uploads/addresses.csv")
diff --git a/tests/infrastructure/test_s3_client.py b/tests/infrastructure/test_s3_client.py
new file mode 100644
index 00000000..7ed4c30b
--- /dev/null
+++ b/tests/infrastructure/test_s3_client.py
@@ -0,0 +1,31 @@
+from collections.abc import Iterator
+
+import pytest
+from moto import mock_aws
+
+from infrastructure.s3_client import S3Client
+from tests.infrastructure import make_boto_client
+
+BUCKET = "test-bucket"
+
+
+@pytest.fixture
+def s3_client() -> Iterator[S3Client]:
+    with mock_aws():
+        boto_client = make_boto_client("s3")
+        boto_client.create_bucket(Bucket=BUCKET)
+        yield S3Client(boto_client, BUCKET)
+
+
+def test_put_object_returns_s3_uri(s3_client: S3Client) -> None:
+    uri = s3_client.put_object("folder/data.bin", b"payload")
+    assert uri == f"s3://{BUCKET}/folder/data.bin"
+
+
+def test_get_object_returns_bytes_written_by_put_object(s3_client: S3Client) -> None:
+    s3_client.put_object("round/trip.bin", b"hello world")
+    assert s3_client.get_object("round/trip.bin") == b"hello world"
+
+
+def test_bucket_property_exposes_configured_bucket(s3_client: S3Client) -> None:
+    assert s3_client.bucket == BUCKET
diff --git a/tests/infrastructure/test_sqs_client.py b/tests/infrastructure/test_sqs_client.py
new file mode 100644
index 00000000..7f1e8f78
--- /dev/null
+++ b/tests/infrastructure/test_sqs_client.py
@@ -0,0 +1,38 @@
+import json
+from collections.abc import Iterator
+from typing import Any, cast
+
+import pytest
+from moto import mock_aws
+
+from infrastructure.sqs_client import SqsClient
+from tests.infrastructure import make_boto_client
+
+
+@pytest.fixture
+def sqs_setup() -> Iterator[tuple[SqsClient, Any, str]]:
+    with mock_aws():
+        boto_client = make_boto_client("sqs")
+        queue: dict[str, Any] = boto_client.create_queue(QueueName="test-queue")
+        queue_url = cast(str, queue["QueueUrl"])
+        yield SqsClient(boto_client, queue_url), boto_client, queue_url
+
+
+def test_send_returns_message_id(sqs_setup: tuple[SqsClient, Any, str]) -> None:
+    client, _boto, _url = sqs_setup
+    message_id = client.send({"hello": "world"})
+    assert isinstance(message_id, str)
+    assert message_id
+
+
+def test_send_json_serialises_body(sqs_setup: tuple[SqsClient, Any, str]) -> None:
+    client, boto_client, queue_url = sqs_setup
+    body = {"hello": "world", "count": 3}
+    client.send(body)
+
+    received: dict[str, Any] = boto_client.receive_message(
+        QueueUrl=queue_url, MaxNumberOfMessages=1
+    )
+    messages: list[dict[str, Any]] = received["Messages"]
+    assert len(messages) == 1
+    assert json.loads(messages[0]["Body"]) == body

From d7f14033ba76b355543ded5fb3ced93e0411b2ae Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 19 May 2026 17:19:41 +0000
Subject: [PATCH 72/91] orchestration: add
 TaskOrchestrator.create_child_subtask primitive

Adds a primitive for creating a new WAITING SubTask under an existing
parent Task, routing all SubTask creation through the orchestrator
(replacing the legacy SubTaskInterface path used by the splitter).
Skips _cascade because a new WAITING child against an IN_PROGRESS
parent is a no-op under Task.recalculate_from_subtasks.
---
 orchestration/task_orchestrator.py            | 16 ++++++++++++++
 tests/orchestration/test_task_orchestrator.py | 22 +++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/orchestration/task_orchestrator.py b/orchestration/task_orchestrator.py
index 6c67d1ce..82d95db1 100644
--- a/orchestration/task_orchestrator.py
+++ b/orchestration/task_orchestrator.py
@@ -48,6 +48,22 @@ class TaskOrchestrator:
         self._subtasks.create(subtask)
         return task, subtask
 
+    def create_child_subtask(
+        self,
+        parent_task_id: UUID,
+        *,
+        inputs: Optional[dict[str, Any]] = None,
+    ) -> SubTask:
+        """Add a new WAITING SubTask under an existing parent Task.
+
+        Skips `_cascade`: for an IN_PROGRESS parent a new WAITING child leaves
+        the status unchanged per `Task.recalculate_from_subtasks`.
+        NOTE(review): a parent in any other state is not re-evaluated here.
+        """
+        subtask = SubTask.create(task_id=parent_task_id, inputs=inputs)
+        self._subtasks.create(subtask)
+        return subtask
+
     def start_subtask(
         self, subtask_id: UUID, cloud_logs_url: Optional[str] = None
     ) -> SubTask:
diff --git a/tests/orchestration/test_task_orchestrator.py b/tests/orchestration/test_task_orchestrator.py
index 1a48127f..c0816d2d 100644
--- a/tests/orchestration/test_task_orchestrator.py
+++ b/tests/orchestration/test_task_orchestrator.py
@@ -134,6 +134,28 @@ def test_run_subtask_happy_path_returns_result_and_cascades_complete(
     assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE
 
 
+def test_create_child_subtask_adds_waiting_child_without_changing_parent_status(
+    harness: Harness,
+) -> None:
+    task, first = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+    harness.orchestrator.start_subtask(first.id)
+    assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS
+
+    child = harness.orchestrator.create_child_subtask(
+        task.id, inputs={"split": "a"}
+    )
+
+    persisted_child = harness.subtasks.get(child.id)
+    assert persisted_child.task_id == task.id
+    assert persisted_child.status is SubTaskStatus.WAITING
+    assert persisted_child.inputs == {"split": "a"}
+    assert persisted_child.id != first.id
+    # Cascade is a no-op: parent stays IN_PROGRESS.
+    assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS
+
+
 def test_run_subtask_failing_work_marks_failed_and_reraises(
     harness: Harness,
 ) -> None:

From d70e8a9e53706cd09f2cc82b85ce499fed97bda2 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 19 May 2026 17:31:27 +0000
Subject: [PATCH 73/91] utilities/aws_lambda: @subtask_handler injects
 TaskOrchestrator as third positional arg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The wrapped function now receives the decorator-owned TaskOrchestrator as
a third positional argument so handlers can compose their own use-case
orchestrator that shares the session, instead of opening a second Postgres
connection per invocation.

Both existing callers (backend/ordnanceSurvey/main.py and
backend/bulk_address2uprn_combiner/main.py) have their signatures extended
to accept the new positional argument (typed Optional[TaskOrchestrator] so
the legacy backend.utils.subtasks.subtask_handler — which only passes two
args — keeps working until the migration to the new decorator lands).

@task_handler is intentionally unchanged in this slice; symmetry is
deferred per issue #1103.
---
 backend/bulk_address2uprn_combiner/main.py    |  14 +-
 backend/ordnanceSurvey/main.py                |  12 +-
 tests/utilities/__init__.py                   |   0
 tests/utilities/aws_lambda/__init__.py        |   0
 .../aws_lambda/test_subtask_handler.py        | 144 ++++++++++++++++++
 utilities/aws_lambda/subtask_handler.py       |   2 +-
 6 files changed, 168 insertions(+), 4 deletions(-)
 create mode 100644 tests/utilities/__init__.py
 create mode 100644 tests/utilities/aws_lambda/__init__.py
 create mode 100644 tests/utilities/aws_lambda/test_subtask_handler.py

diff --git a/backend/bulk_address2uprn_combiner/main.py b/backend/bulk_address2uprn_combiner/main.py
index 44f0b3f9..37136e52 100644
--- a/backend/bulk_address2uprn_combiner/main.py
+++ b/backend/bulk_address2uprn_combiner/main.py
@@ -2,7 +2,7 @@ import os
 import boto3
 import pandas as pd
 from io import BytesIO
-from typing import Any
+from typing import Any, Optional
 from uuid import UUID
 from datetime import datetime, timezone
 
@@ -12,6 +12,7 @@ from backend.app.db.functions.bulk_address_uploads_functions import (
     set_combined_output_s3_uri,
     set_combining_status,
 )
+from orchestration.task_orchestrator import TaskOrchestrator
 
 logger = setup_logger()
 
@@ -35,7 +36,16 @@ def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame:
 
 
 @subtask_handler()
-def handler(body: dict[str, Any], context: Any) -> str:
+def handler(
+    body: dict[str, Any],
+    context: Any,
+    orchestrator: Optional[TaskOrchestrator] = None,
+) -> str:
+    # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler
+    # decorator; unused here but accepted so the contract is uniform across
+    # callers (see issue #1103).
+    del orchestrator
+
     task_id_str: str = body.get("task_id", "")
 
     if not task_id_str:
diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py
index 6e82b468..18c4e2f2 100644
--- a/backend/ordnanceSurvey/main.py
+++ b/backend/ordnanceSurvey/main.py
@@ -16,6 +16,7 @@ from backend.ordnanceSurvey.helpers import (
     os_places_results_to_dataframe,
 )
 from backend.app.config import get_settings
+from orchestration.task_orchestrator import TaskOrchestrator
 from sqlalchemy import select
 from datetime import datetime
 import uuid
@@ -105,7 +106,16 @@ def save_results_to_s3(
 
 
 @subtask_handler()  # This assumes task_id and subtask_id is defined in event.Records.body
-def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
+def handler(
+    body: dict[str, Any],
+    context: Any,
+    orchestrator: Optional[TaskOrchestrator] = None,
+    local: bool = False,
+) -> None:
+    # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler
+    # decorator; unused here but accepted so the contract is uniform across
+    # callers (see issue #1103).
+    del orchestrator
 
     # delete this line after test
     # local = True
diff --git a/tests/utilities/__init__.py b/tests/utilities/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/utilities/aws_lambda/__init__.py b/tests/utilities/aws_lambda/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/utilities/aws_lambda/test_subtask_handler.py b/tests/utilities/aws_lambda/test_subtask_handler.py
new file mode 100644
index 00000000..426b250f
--- /dev/null
+++ b/tests/utilities/aws_lambda/test_subtask_handler.py
@@ -0,0 +1,144 @@
+"""Tests for the @subtask_handler decorator.
+
+Covers the contract that the decorator owns the parent SubTask lifecycle and
+injects the decorator-owned TaskOrchestrator as a third positional argument
+to the wrapped function — so the handler can compose its own use-case
+orchestrator that shares the session.
+"""
+
+from collections.abc import Generator, Iterator
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import Any
+from uuid import UUID
+
+import pytest
+from sqlmodel import Session, SQLModel, create_engine
+
+from domain.tasks.subtasks import SubTaskStatus
+from domain.tasks.tasks import TaskStatus
+from orchestration.task_orchestrator import TaskOrchestrator
+from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository
+from repositories.tasks.task_postgres_repository import TaskPostgresRepository
+from utilities.aws_lambda.subtask_handler import subtask_handler
+
+
+@dataclass
+class Harness:
+    orchestrator: TaskOrchestrator
+    tasks: TaskPostgresRepository
+    subtasks: SubTaskPostgresRepository
+
+    @contextmanager
+    def factory(self) -> Generator[TaskOrchestrator, None, None]:
+        yield self.orchestrator
+
+
+@pytest.fixture
+def harness() -> Iterator[Harness]:
+    engine = create_engine("sqlite://")
+    SQLModel.metadata.create_all(engine)
+    with Session(engine) as session:
+        tasks = TaskPostgresRepository(session=session)
+        subtasks = SubTaskPostgresRepository(session=session)
+        yield Harness(
+            orchestrator=TaskOrchestrator(task_repo=tasks, subtask_repo=subtasks),
+            tasks=tasks,
+            subtasks=subtasks,
+        )
+
+
+def _direct_event(task_id: UUID, subtask_id: UUID) -> dict[str, Any]:
+    return {"task_id": str(task_id), "sub_task_id": str(subtask_id)}
+
+
+def test_subtask_handler_injects_orchestrator_as_third_positional_argument(
+    harness: Harness,
+) -> None:
+    """The wrapped function receives the decorator-owned TaskOrchestrator
+    so it can share the session with its own use-case orchestrator."""
+    _, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    received: dict[str, Any] = {}
+
+    @subtask_handler(orchestrator_cm=harness.factory)
+    def handler(
+        body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator
+    ) -> None:
+        received["body"] = body
+        received["context"] = context
+        received["orchestrator"] = orchestrator
+
+    handler(_direct_event(subtask.task_id, subtask.id), context="ctx-sentinel")
+
+    assert received["orchestrator"] is harness.orchestrator
+    assert received["context"] == "ctx-sentinel"
+    assert received["body"]["sub_task_id"] == str(subtask.id)
+
+
+def test_subtask_handler_completes_parent_subtask_on_success(
+    harness: Harness,
+) -> None:
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    @subtask_handler(orchestrator_cm=harness.factory)
+    def handler(
+        body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator
+    ) -> None:
+        return None
+
+    handler(_direct_event(task.id, subtask.id), context=None)
+
+    assert harness.subtasks.get(subtask.id).status is SubTaskStatus.COMPLETE
+    assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE
+
+
+def test_subtask_handler_marks_parent_failed_and_reraises_on_error(
+    harness: Harness,
+) -> None:
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    @subtask_handler(orchestrator_cm=harness.factory)
+    def handler(
+        body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator
+    ) -> None:
+        raise RuntimeError("boom")
+
+    with pytest.raises(RuntimeError, match="boom"):
+        handler(_direct_event(task.id, subtask.id), context=None)
+
+    assert harness.subtasks.get(subtask.id).status is SubTaskStatus.FAILED
+    assert harness.tasks.get(task.id).status is TaskStatus.FAILED
+
+
+def test_subtask_handler_injected_orchestrator_can_create_child_subtask(
+    harness: Harness,
+) -> None:
+    """Smoke check the share-the-session promise: the injected orchestrator
+    is the same one the decorator owns, so a handler can use it to create
+    child SubTasks under the same session."""
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    child_ids: list[UUID] = []
+
+    @subtask_handler(orchestrator_cm=harness.factory)
+    def handler(
+        body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator
+    ) -> None:
+        child = orchestrator.create_child_subtask(task.id, inputs={"split": 1})
+        child_ids.append(child.id)
+
+    handler(_direct_event(task.id, subtask.id), context=None)
+
+    assert len(child_ids) == 1
+    persisted_child = harness.subtasks.get(child_ids[0])
+    assert persisted_child.task_id == task.id
+    assert persisted_child.status is SubTaskStatus.WAITING
diff --git a/utilities/aws_lambda/subtask_handler.py b/utilities/aws_lambda/subtask_handler.py
index 64c1daa6..5ad5f6e1 100644
--- a/utilities/aws_lambda/subtask_handler.py
+++ b/utilities/aws_lambda/subtask_handler.py
@@ -39,7 +39,7 @@ def subtask_handler(
                     trigger = SubtaskTriggerBody.model_validate(body)
                     orchestrator.run_subtask(
                         trigger.sub_task_id,
-                        work=lambda body=body: func(body, context),
+                        work=lambda body=body, o=orchestrator: func(body, context, o),
                     )
 
         return wrapper

From 708f1b5d189222793cd206cf883c3c427ca63917 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 19 May 2026 17:37:02 +0000
Subject: [PATCH 74/91] repositories: UserAddressRepository +
 UserAddressCsvS3Repository (CSV-on-S3 adapter)

Adds the persistence layer for UserAddress batches:

- Abstract UserAddressRepository with load_batch / save_batch.
- Concrete UserAddressCsvS3Repository over CsvS3Client:
  - load_batch reads canonical upload columns (Address 1/2/3, Postcode,
    Internal Reference), comma-joins non-empty address parts, and
    passes Internal Reference through (None when missing/empty).
  - save_batch writes a 3-column CSV (user_address,postcode,
    internal_reference) to {path_prefix}/{ISO datetime}_{uuid8}.csv
    and returns the s3://bucket/key URI.
- Postcode sanitisation flows through UserAddress.__post_init__; the
  repo never calls sanitise_postcode directly.

Tests (moto-backed) cover: three-line address load, Address-1-only
load, missing Internal Reference, save->reload round trip, and
unique-filename-per-save. pyright --strict clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 repositories/user_address/__init__.py         |   0
 .../user_address_csv_s3_repository.py         |  87 +++++++++
 .../user_address/user_address_repository.py   |  30 +++
 tests/repositories/user_address/__init__.py   |   0
 tests/repositories/user_address/conftest.py   |  32 ++++
 .../test_user_address_csv_s3_repository.py    | 175 ++++++++++++++++++
 6 files changed, 324 insertions(+)
 create mode 100644 repositories/user_address/__init__.py
 create mode 100644 repositories/user_address/user_address_csv_s3_repository.py
 create mode 100644 repositories/user_address/user_address_repository.py
 create mode 100644 tests/repositories/user_address/__init__.py
 create mode 100644 tests/repositories/user_address/conftest.py
 create mode 100644 tests/repositories/user_address/test_user_address_csv_s3_repository.py

diff --git a/repositories/user_address/__init__.py b/repositories/user_address/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py
new file mode 100644
index 00000000..be2baa13
--- /dev/null
+++ b/repositories/user_address/user_address_csv_s3_repository.py
@@ -0,0 +1,87 @@
+"""CSV-on-S3 adapter for :class:`UserAddressRepository`.
+
+Reads canonical upload CSVs (``Address 1``, ``Address 2``, ``Address 3``,
+``Postcode``, ``Internal Reference``) and writes the splitter's compact
+3-column form (``user_address``, ``postcode``, ``internal_reference``).
+
+The frontend pre-applies the user's column mapping at upload time, so this
+adapter does NOT consult any ``BulkAddressUpload.column_mapping``: it always
+expects the canonical column names listed above.
+"""
+
+from __future__ import annotations
+
+import uuid
+from datetime import datetime, timezone
+from typing import Optional
+
+from domain.addresses.user_address import UserAddress
+from infrastructure.csv_s3_client import CsvS3Client
+from repositories.user_address.user_address_repository import UserAddressRepository
+
+_ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3")
+_POSTCODE_COLUMN: str = "Postcode"
+_INTERNAL_REFERENCE_COLUMN: str = "Internal Reference"
+
+
+class UserAddressCsvS3Repository(UserAddressRepository):
+    """Persist :class:`UserAddress` batches as CSV objects in S3.
+
+    The repo owns the unique-filename-within-prefix convention
+    (``{ISO datetime}_{8-char uuid}.csv``); callers own the directory
+    hierarchy supplied as ``path_prefix``.
+    """
+
+    def __init__(self, csv_client: CsvS3Client, bucket: str) -> None:
+        self._csv_client = csv_client
+        self._bucket = bucket
+
+    def load_batch(self, s3_uri: str) -> list[UserAddress]:
+        """Load canonical upload CSV rows into :class:`UserAddress` objects.
+
+        Joins the non-empty ``Address 1``/``Address 2``/``Address 3`` cells
+        with ``", "`` into ``user_address`` and passes ``Internal Reference``
+        through as ``internal_reference`` (``None`` when empty). A cell that
+        is absent, ``None`` or whitespace-only counts as empty, so a ragged
+        row yields blank fields instead of raising ``AttributeError``.
+        """
+        rows = self._csv_client.read_rows(s3_uri)
+        addresses: list[UserAddress] = []
+        for row in rows:
+            # ``or ""`` folds a missing key and a ``None`` value together;
+            # NOTE(review): assumes read_rows can emit ``None`` for short
+            # rows (as csv.DictReader does) -- confirm against CsvS3Client.
+            cells = [(row.get(col) or "").strip() for col in _ADDRESS_COLUMNS]
+            user_address = ", ".join(cell for cell in cells if cell)
+            postcode = row.get(_POSTCODE_COLUMN) or ""
+            raw_ref = (row.get(_INTERNAL_REFERENCE_COLUMN) or "").strip()
+            internal_reference: Optional[str] = raw_ref or None
+            addresses.append(
+                UserAddress(
+                    user_address=user_address,
+                    postcode=postcode,
+                    internal_reference=internal_reference,
+                )
+            )
+        return addresses
+
+    def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str:
+        """Write a 3-column CSV under a unique key beneath ``path_prefix``.
+
+        The key is ``{path_prefix}/{ISO-8601 datetime}_{8-char uuid}.csv``; an
+        empty prefix gives a bare filename rather than a key starting ``/``.
+        Returns the full ``s3://bucket/key`` URI.
+        """
+        rows: list[dict[str, str]] = [
+            {
+                "user_address": addr.user_address,
+                "postcode": addr.postcode,
+                "internal_reference": addr.internal_reference or "",
+            }
+            for addr in addresses
+        ]
+        stamp = datetime.now(timezone.utc).isoformat()
+        filename = f"{stamp}_{uuid.uuid4().hex[:8]}.csv"
+        prefix = path_prefix.rstrip("/")
+        key = f"{prefix}/{filename}" if prefix else filename
+        return self._csv_client.save_rows(rows, key)
diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py
new file mode 100644
index 00000000..d8c12855
--- /dev/null
+++ b/repositories/user_address/user_address_repository.py
@@ -0,0 +1,30 @@
+"""Abstract repository for :class:`UserAddress` batches.
+
+Persistence-agnostic interface for loading and saving batches of
+:class:`domain.addresses.user_address.UserAddress`. Concrete adapters --
+e.g. :class:`UserAddressCsvS3Repository` -- live alongside this module.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from domain.addresses.user_address import UserAddress
+
+
+class UserAddressRepository(ABC):
+    """Storage-independent contract for batches of :class:`UserAddress`.
+
+    Subclasses decide where batches live (S3 CSV, Postgres, in-memory,
+    ...). Whatever the backend, each record carries the address text, a
+    postcode (sanitised by ``UserAddress.__post_init__``) and an optional
+    internal reference.
+    """
+
+    @abstractmethod
+    def load_batch(self, s3_uri: str) -> list[UserAddress]:
+        """Return the addresses stored at ``s3_uri`` as domain objects."""
+
+    @abstractmethod
+    def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str:
+        """Store ``addresses`` beneath ``path_prefix``; return the URI written."""
diff --git a/tests/repositories/user_address/__init__.py b/tests/repositories/user_address/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/repositories/user_address/conftest.py b/tests/repositories/user_address/conftest.py
new file mode 100644
index 00000000..1859ff0a
--- /dev/null
+++ b/tests/repositories/user_address/conftest.py
@@ -0,0 +1,32 @@
+import os
+from collections.abc import Iterator
+from typing import Optional
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def _aws_creds() -> Iterator[None]:  # pyright: ignore[reportUnusedFunction]
+    """Pin dummy AWS credentials in ``os.environ`` around each test.
+
+    Autouse, so it wraps every test in ``tests/repositories/user_address/``.
+    Afterwards each variable is put back to its earlier value, or removed
+    again if it was not set before.
+    """
+    stubs: dict[str, str] = {
+        "AWS_ACCESS_KEY_ID": "testing",
+        "AWS_SECRET_ACCESS_KEY": "testing",
+        "AWS_SESSION_TOKEN": "testing",
+        "AWS_DEFAULT_REGION": "us-east-1",
+    }
+    # Snapshot before overwriting so teardown can undo exactly these keys.
+    saved: dict[str, Optional[str]] = {name: os.environ.get(name) for name in stubs}
+    os.environ.update(stubs)
+    try:
+        yield
+    finally:
+        for name, old in saved.items():
+            if old is not None:
+                os.environ[name] = old
+            else:
+                os.environ.pop(name, None)
diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
new file mode 100644
index 00000000..ca9e8a57
--- /dev/null
+++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
@@ -0,0 +1,175 @@
+from collections.abc import Iterator
+
+import pytest
+from moto import mock_aws
+
+from infrastructure.csv_s3_client import CsvS3Client
+from repositories.user_address.user_address_csv_s3_repository import (
+    UserAddressCsvS3Repository,
+)
+from tests.infrastructure import make_boto_client
+
+BUCKET = "user-address-bucket"
+
+
+@pytest.fixture
+def repo() -> Iterator[UserAddressCsvS3Repository]:
+    with mock_aws():
+        boto_client = make_boto_client("s3")
+        boto_client.create_bucket(Bucket=BUCKET)
+        csv_client = CsvS3Client(boto_client, BUCKET)
+        yield UserAddressCsvS3Repository(csv_client, BUCKET)
+
+
+def _upload_csv(
+    repo: UserAddressCsvS3Repository, rows: list[dict[str, str]], key: str
+) -> str:
+    return repo._csv_client.save_rows(rows, key)  # pyright: ignore[reportPrivateUsage]
+
+
+def test_load_batch_concatenates_three_address_lines(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    rows = [
+        {
+            "Address 1": "1 High Street",
+            "Address 2": "Flat 2",
+            "Address 3": "Townville",
+            "Postcode": "sw1a 1aa",
+            "Internal Reference": "REF-001",
+        }
+    ]
+    uri = _upload_csv(repo, rows, "uploads/full.csv")
+
+    addresses = repo.load_batch(uri)
+
+    assert len(addresses) == 1
+    address = addresses[0]
+    assert address.user_address == "1 High Street, Flat 2, Townville"
+    assert address.postcode == "SW1A1AA"
+    assert address.internal_reference == "REF-001"
+
+
+def test_load_batch_uses_only_address_1_when_others_missing(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    rows = [
+        {
+            "Address 1": "10 Cardiff Road",
+            "Address 2": "",
+            "Address 3": "",
+            "Postcode": "CF10 1AA",
+            "Internal Reference": "REF-002",
+        }
+    ]
+    uri = _upload_csv(repo, rows, "uploads/address1-only.csv")
+
+    addresses = repo.load_batch(uri)
+
+    assert len(addresses) == 1
+    assert addresses[0].user_address == "10 Cardiff Road"
+    assert addresses[0].postcode == "CF101AA"
+    assert addresses[0].internal_reference == "REF-002"
+
+
+def test_load_batch_handles_missing_internal_reference(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    rows = [
+        {
+            "Address 1": "5 Park Lane",
+            "Address 2": "",
+            "Address 3": "",
+            "Postcode": "M1 1AA",
+            "Internal Reference": "",
+        }
+    ]
+    uri = _upload_csv(repo, rows, "uploads/no-ref.csv")
+
+    addresses = repo.load_batch(uri)
+
+    assert len(addresses) == 1
+    assert addresses[0].user_address == "5 Park Lane"
+    assert addresses[0].postcode == "M11AA"
+    assert addresses[0].internal_reference is None
+
+
+def test_save_batch_returns_uri_under_path_prefix(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    from domain.addresses.user_address import UserAddress
+
+    addresses = [
+        UserAddress(
+            user_address="1 High Street, Flat 2, Townville",
+            postcode="SW1A 1AA",
+            internal_reference="REF-001",
+        ),
+    ]
+
+    uri = repo.save_batch(addresses, "tasks/abc/batches")
+
+    assert uri.startswith(f"s3://{BUCKET}/tasks/abc/batches/")
+    assert uri.endswith(".csv")
+
+
+def test_save_then_reload_round_trip_preserves_values(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    from domain.addresses.user_address import UserAddress
+
+    # save_batch writes the splitter's compact schema
+    # (user_address/postcode/internal_reference); load_batch reads the
+    # canonical upload schema. To round-trip through the repo we re-upload
+    # the saved CSV under the upload schema's column names.
+    original = [
+        UserAddress(
+            user_address="1 High Street",
+            postcode="SW1A 1AA",
+            internal_reference="REF-001",
+        ),
+        UserAddress(
+            user_address="2 Low Street",
+            postcode="XY9 8ZW",
+            internal_reference=None,
+        ),
+    ]
+
+    saved_uri = repo.save_batch(original, "tasks/round-trip")
+
+    # Re-shape the saved CSV into the canonical upload schema for reload.
+    saved_rows = repo._csv_client.read_rows(saved_uri)  # pyright: ignore[reportPrivateUsage]
+    upload_rows: list[dict[str, str]] = [
+        {
+            "Address 1": row["user_address"],
+            "Address 2": "",
+            "Address 3": "",
+            "Postcode": row["postcode"],
+            "Internal Reference": row["internal_reference"],
+        }
+        for row in saved_rows
+    ]
+    upload_uri = _upload_csv(repo, upload_rows, "uploads/round-trip.csv")
+
+    reloaded = repo.load_batch(upload_uri)
+
+    assert reloaded == original
+
+
+def test_save_batch_uses_unique_filename_per_call(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    from domain.addresses.user_address import UserAddress
+
+    addresses = [
+        UserAddress(
+            user_address="1 High Street",
+            postcode="SW1A 1AA",
+            internal_reference="REF-001",
+        ),
+    ]
+
+    uri_1 = repo.save_batch(addresses, "tasks/uniqueness")
+    uri_2 = repo.save_batch(addresses, "tasks/uniqueness")
+
+    assert uri_1 != uri_2

From 0a0444821756543f57625832681e28a09d264056 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 19 May 2026 17:46:12 +0000
Subject: [PATCH 75/91] applications/postcode_splitter:
 PostcodeSplitterOrchestrator + Lambda entrypoint slice

Wires slice 1-5 primitives into a deployable splitter:

- orchestration/postcode_splitter_orchestrator.py: PostcodeSplitterOrchestrator
  loads addresses via UserAddressRepository, groups by postcode via
  iter_postcode_grouped_batches, persists each batch under
  ara_postcode_splitter_batches/{task_id}/{subtask_id}/, creates a WAITING
  child SubTask, and publishes an address2UPRN SQS message per batch.

- applications/postcode_splitter/: Lambda entrypoint. handler.py is decorated
  with @subtask_handler() so the parent SubTask lifecycle is decorator-owned;
  PostcodeSplitterTriggerBody validates the body. Dockerfile is the
  python:3.11 Lambda base with the DDD-shaped source layers and no pandas.

- tests/orchestration/test_postcode_splitter_orchestrator.py: integration
  test using moto S3 + moto SQS + in-memory SQLite that exercises the full
  wiring against a fixture CSV spanning three postcode groups (one
  oversize) and asserts child count, persisted inputs, queue bodies, and
  dispatch order.

backend/postcode_splitter/ and .github/workflows/deploy_terraform.yml are
intentionally unchanged: the dockerfile_path flip is deferred until the
companion backend/address2UPRN/ migration is also ready.
---
 applications/__init__.py                      |   0
 applications/postcode_splitter/Dockerfile     |  21 ++
 applications/postcode_splitter/__init__.py    |   0
 applications/postcode_splitter/handler.py     |  70 ++++
 .../postcode_splitter_trigger_body.py         |  32 ++
 .../postcode_splitter/requirements.txt        |   4 +
 .../postcode_splitter_orchestrator.py         |  89 ++++++
 .../test_postcode_splitter_orchestrator.py    | 298 ++++++++++++++++++
 8 files changed, 514 insertions(+)
 create mode 100644 applications/__init__.py
 create mode 100644 applications/postcode_splitter/Dockerfile
 create mode 100644 applications/postcode_splitter/__init__.py
 create mode 100644 applications/postcode_splitter/handler.py
 create mode 100644 applications/postcode_splitter/postcode_splitter_trigger_body.py
 create mode 100644 applications/postcode_splitter/requirements.txt
 create mode 100644 orchestration/postcode_splitter_orchestrator.py
 create mode 100644 tests/orchestration/test_postcode_splitter_orchestrator.py

diff --git a/applications/__init__.py b/applications/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/applications/postcode_splitter/Dockerfile b/applications/postcode_splitter/Dockerfile
new file mode 100644
index 00000000..578ee7a7
--- /dev/null
+++ b/applications/postcode_splitter/Dockerfile
@@ -0,0 +1,21 @@
+FROM public.ecr.aws/lambda/python:3.11
+
+WORKDIR /var/task
+
+COPY applications/postcode_splitter/requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the layered source the handler imports from. The new splitter pulls
+# only DDD-shaped packages — no pandas, no legacy backend/.
+COPY domain/ domain/
+COPY infrastructure/ infrastructure/
+COPY orchestration/ orchestration/
+COPY repositories/ repositories/
+COPY utilities/ utilities/
+COPY applications/ applications/
+
+# Place the handler at the Lambda task root so the runtime can resolve
+# ``main.handler`` without an extra package prefix.
+COPY applications/postcode_splitter/handler.py /var/task/main.py
+
+CMD ["main.handler"]
diff --git a/applications/postcode_splitter/__init__.py b/applications/postcode_splitter/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py
new file mode 100644
index 00000000..005227a9
--- /dev/null
+++ b/applications/postcode_splitter/handler.py
@@ -0,0 +1,70 @@
+"""Lambda entrypoint for the postcode splitter slice.
+
+The :func:`handler` function is decorated with ``@subtask_handler()`` so the
+decorator owns the parent ``SubTask`` lifecycle (start/complete/fail) and
+injects the decorator-owned :class:`TaskOrchestrator` as the third positional
+argument. The handler itself does only two things:
+
+1. Build a :class:`PostcodeSplitterOrchestrator` from env-driven config.
+2. Delegate to ``split_and_dispatch`` and return its result so it lands in
+   ``SubTask.outputs["result"]``.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+import boto3
+
+from applications.postcode_splitter.postcode_splitter_trigger_body import (
+    PostcodeSplitterTriggerBody,
+)
+from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
+from infrastructure.csv_s3_client import CsvS3Client
+from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator
+from orchestration.task_orchestrator import TaskOrchestrator
+from repositories.user_address.user_address_csv_s3_repository import (
+    UserAddressCsvS3Repository,
+)
+from utilities.aws_lambda.subtask_handler import subtask_handler
+
+
+@subtask_handler()
+def handler(
+    body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator
+) -> dict[str, list[str]]:
+    """Split the CSV named in ``body`` and dispatch one child per batch.
+
+    ``body`` is validated as a ``PostcodeSplitterTriggerBody``. The bucket
+    and queue URL come from the ``S3_BUCKET_NAME`` and
+    ``ADDRESS2UPRN_QUEUE_URL`` environment variables. Returns
+    ``{"child_subtask_ids": [...]}`` with every child id as a string.
+    """
+    request = PostcodeSplitterTriggerBody.model_validate(body)
+
+    bucket_name = os.environ["S3_BUCKET_NAME"]
+    address2uprn_url = os.environ["ADDRESS2UPRN_QUEUE_URL"]
+
+    # The installed stubs overload boto3.client per service; going through an
+    # ``Any`` alias keeps the strict-mode checker from inspecting the call.
+    make_client: Any = boto3.client  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
+    s3_client: Any = make_client("s3")
+    sqs_client: Any = make_client("sqs")
+
+    splitter = PostcodeSplitterOrchestrator(
+        task_orchestrator=task_orchestrator,
+        user_address_repo=UserAddressCsvS3Repository(
+            CsvS3Client(s3_client, bucket_name), bucket_name
+        ),
+        queue_client=Address2UprnQueueClient(sqs_client, address2uprn_url),
+    )
+
+    dispatched = splitter.split_and_dispatch(
+        parent_task_id=request.task_id,
+        parent_subtask_id=request.sub_task_id,
+        input_s3_uri=request.s3_uri,
+    )
+
+    child_subtask_ids = [str(child_id) for child_id in dispatched]
+    return {"child_subtask_ids": child_subtask_ids}
diff --git a/applications/postcode_splitter/postcode_splitter_trigger_body.py b/applications/postcode_splitter/postcode_splitter_trigger_body.py
new file mode 100644
index 00000000..bc983abc
--- /dev/null
+++ b/applications/postcode_splitter/postcode_splitter_trigger_body.py
@@ -0,0 +1,32 @@
+"""Trigger payload model for the postcode splitter Lambda.
+
+The decorator (``@subtask_handler``) already validates ``task_id`` and
+``sub_task_id`` via :class:`SubtaskTriggerBody`; this model layers on the
+splitter-specific ``s3_uri`` field while keeping ``extra="allow"`` so any
+upstream-passthrough keys (e.g. ``portfolio_id``) survive untouched.
+"""
+
+from uuid import UUID
+
+from pydantic import BaseModel, ConfigDict
+
+
+class PostcodeSplitterTriggerBody(BaseModel):
+    """Validated body for the postcode splitter Lambda.
+
+    Keys beyond the three below are accepted, not rejected (``extra="allow"``).
+
+    Attributes:
+        task_id: Parent ``Task`` id; becomes the ``task_id`` input of each
+            child ``SubTask`` and the ``parent_task_id`` of each SQS message.
+        sub_task_id: The splitter's own ``SubTask`` id; second path segment
+            under ``ara_postcode_splitter_batches/{task_id}/``.
+        s3_uri: ``s3://bucket/key`` URI of the uploaded address CSV the
+            splitter must read.
+    """
+
+    model_config = ConfigDict(extra="allow")
+
+    task_id: UUID
+    sub_task_id: UUID
+    s3_uri: str
diff --git a/applications/postcode_splitter/requirements.txt b/applications/postcode_splitter/requirements.txt
new file mode 100644
index 00000000..6a85a255
--- /dev/null
+++ b/applications/postcode_splitter/requirements.txt
@@ -0,0 +1,4 @@
+boto3
+pydantic
+sqlmodel
+psycopg2-binary
diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py
new file mode 100644
index 00000000..6afa2538
--- /dev/null
+++ b/orchestration/postcode_splitter_orchestrator.py
@@ -0,0 +1,89 @@
+"""Use-case orchestrator for the postcode splitter Lambda.
+
+Wires the slice-1 domain (``iter_postcode_grouped_batches``), the slice-3
+``UserAddressRepository``, the slice-2 ``Address2UprnQueueClient``, and the
+slice-4 ``TaskOrchestrator.create_child_subtask`` primitive together.
+
+``split_and_dispatch`` loads the input batch, groups it into per-postcode
+chunks, writes each chunk back to S3 under a deterministic prefix, creates a
+WAITING child ``SubTask`` for it, and publishes the address-to-UPRN fan-out
+message that downstream consumers pick up.
+"""
+
+from __future__ import annotations
+
+from uuid import UUID
+
+from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
+from orchestration.task_orchestrator import TaskOrchestrator
+from domain.addresses.postcode_batching import iter_postcode_grouped_batches
+from repositories.user_address.user_address_repository import UserAddressRepository
+
+
+class PostcodeSplitterOrchestrator:
+    """Turn one uploaded address file into postcode-grouped child SubTasks.
+
+    All collaborators are injected: a :class:`UserAddressRepository` for
+    reading and writing batches, an :class:`Address2UprnQueueClient` for
+    publishing, and a :class:`TaskOrchestrator` for creating children. The
+    Lambda entrypoint and the tests can therefore supply different
+    instances without touching this class.
+    """
+
+    def __init__(
+        self,
+        task_orchestrator: TaskOrchestrator,
+        user_address_repo: UserAddressRepository,
+        queue_client: Address2UprnQueueClient,
+        max_batch_size: int = 500,
+    ) -> None:
+        self._max_batch_size = max_batch_size
+        self._queue_client = queue_client
+        self._user_address_repo = user_address_repo
+        self._task_orchestrator = task_orchestrator
+
+    def split_and_dispatch(
+        self,
+        *,
+        parent_task_id: UUID,
+        parent_subtask_id: UUID,
+        input_s3_uri: str,
+    ) -> list[UUID]:
+        """Fan the batch at ``input_s3_uri`` out into child SubTasks.
+
+        The loaded addresses are chunked by ``iter_postcode_grouped_batches``
+        using ``max_batch_size``. Each chunk, in the order yielded, is:
+
+        1. saved under
+           ``ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}``;
+        2. registered as a child ``SubTask`` of ``parent_task_id`` with
+           ``inputs={"task_id": str(parent_task_id), "s3_uri": batch_uri}``;
+        3. announced through the queue client's ``publish``.
+
+        Returns:
+            The child ``SubTask`` ids, in dispatch order.
+        """
+        addresses = self._user_address_repo.load_batch(input_s3_uri)
+        batches = iter_postcode_grouped_batches(
+            addresses, max_batch_size=self._max_batch_size
+        )
+        destination = (
+            f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}"
+        )
+
+        dispatched: list[UUID] = []
+        for chunk in batches:
+            # Save first: the child inputs and the queue message carry the URI.
+            chunk_uri = self._user_address_repo.save_batch(chunk, destination)
+            child = self._task_orchestrator.create_child_subtask(
+                parent_task_id,
+                inputs={"task_id": str(parent_task_id), "s3_uri": chunk_uri},
+            )
+            self._queue_client.publish(
+                parent_task_id=parent_task_id,
+                child_subtask_id=child.id,
+                s3_uri=chunk_uri,
+            )
+            dispatched.append(child.id)
+
+        return dispatched
diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py
new file mode 100644
index 00000000..57bd2133
--- /dev/null
+++ b/tests/orchestration/test_postcode_splitter_orchestrator.py
@@ -0,0 +1,298 @@
+"""Integration test: PostcodeSplitterOrchestrator wired end-to-end.
+
+Combines moto S3 + moto SQS + an in-memory SQLite session for the
+``TaskOrchestrator`` so the full slice-6 wiring is exercised through real
+infrastructure adapters (not mocks). The fixture CSV spans three postcodes
+with one oversize group, which forces both the buffer-flush-then-oversize
+branch and the final-flush branch of
+``iter_postcode_grouped_batches`` — three batches in total.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from collections.abc import Iterator
+from dataclasses import dataclass
+from typing import Any, cast
+
+import boto3
+import pytest
+from moto import mock_aws
+from sqlmodel import Session, SQLModel, create_engine
+
+from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
+from infrastructure.csv_s3_client import CsvS3Client
+from orchestration.postcode_splitter_orchestrator import PostcodeSplitterOrchestrator
+from orchestration.task_orchestrator import TaskOrchestrator
+from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository
+from repositories.tasks.task_postgres_repository import TaskPostgresRepository
+from repositories.user_address.user_address_csv_s3_repository import (
+    UserAddressCsvS3Repository,
+)
+
+BUCKET = "splitter-bucket"
+REGION = "us-east-1"
+
+
+def _make_boto_client(service_name: str) -> Any:
+    factory: Any = boto3.client  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
+    return factory(service_name, region_name=REGION)
+
+
+@pytest.fixture(autouse=True)
+def _aws_creds() -> Iterator[None]:  # pyright: ignore[reportUnusedFunction]
+    keys = (
+        "AWS_ACCESS_KEY_ID",
+        "AWS_SECRET_ACCESS_KEY",
+        "AWS_SESSION_TOKEN",
+        "AWS_DEFAULT_REGION",
+    )
+    prev: dict[str, Any] = {k: os.environ.get(k) for k in keys}
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = REGION
+    try:
+        yield
+    finally:
+        for k, v in prev.items():
+            if v is None:
+                os.environ.pop(k, None)
+            else:
+                os.environ[k] = v
+
+
+@dataclass
+class Harness:
+    splitter: PostcodeSplitterOrchestrator
+    task_orchestrator: TaskOrchestrator
+    subtasks: SubTaskPostgresRepository
+    csv_client: CsvS3Client
+    boto_sqs: Any
+    queue_url: str
+    repo: UserAddressCsvS3Repository
+
+
+@pytest.fixture
+def harness() -> Iterator[Harness]:
+    with mock_aws():
+        # Infra: S3 + SQS
+        boto_s3 = _make_boto_client("s3")
+        boto_s3.create_bucket(Bucket=BUCKET)
+        boto_sqs = _make_boto_client("sqs")
+        queue: dict[str, Any] = boto_sqs.create_queue(QueueName="address2uprn-queue")
+        queue_url = cast(str, queue["QueueUrl"])
+
+        csv_client = CsvS3Client(boto_s3, BUCKET)
+        repo = UserAddressCsvS3Repository(csv_client, BUCKET)
+        queue_client = Address2UprnQueueClient(boto_sqs, queue_url)
+
+        # DB: in-memory SQLite TaskOrchestrator
+        engine = create_engine("sqlite://")
+        SQLModel.metadata.create_all(engine)
+        with Session(engine) as session:
+            task_repo = TaskPostgresRepository(session=session)
+            subtask_repo = SubTaskPostgresRepository(session=session)
+            task_orchestrator = TaskOrchestrator(
+                task_repo=task_repo, subtask_repo=subtask_repo
+            )
+
+            splitter = PostcodeSplitterOrchestrator(
+                task_orchestrator=task_orchestrator,
+                user_address_repo=repo,
+                queue_client=queue_client,
+                max_batch_size=3,
+            )
+
+            yield Harness(
+                splitter=splitter,
+                task_orchestrator=task_orchestrator,
+                subtasks=subtask_repo,
+                csv_client=csv_client,
+                boto_sqs=boto_sqs,
+                queue_url=queue_url,
+                repo=repo,
+            )
+
+
+def _upload_fixture_csv(csv_client: CsvS3Client) -> str:
+    # Three postcode groups:
+    #   AA1 1AA × 2 (within cap)
+    #   BB2 2BB × 4 (oversize: > max_batch_size=3)
+    #   CC3 3CC × 1 (final flush)
+    # Expected batching with cap=3 and the algorithm in
+    # ``iter_postcode_grouped_batches``:
+    #   batch 1: [AA1 1AA × 2]           (flushed because oversize follows)
+    #   batch 2: [BB2 2BB × 4]           (oversize own batch)
+    #   batch 3: [CC3 3CC × 1]           (final flush)
+    rows: list[dict[str, str]] = []
+    rows.extend(
+        {
+            "Address 1": f"{i} High St",
+            "Address 2": "",
+            "Address 3": "",
+            "Postcode": "AA1 1AA",
+            "Internal Reference": f"AA-{i}",
+        }
+        for i in range(1, 3)
+    )
+    rows.extend(
+        {
+            "Address 1": f"{i} Long Road",
+            "Address 2": "",
+            "Address 3": "",
+            "Postcode": "BB2 2BB",
+            "Internal Reference": f"BB-{i}",
+        }
+        for i in range(1, 5)
+    )
+    rows.append(
+        {
+            "Address 1": "1 Final Way",
+            "Address 2": "",
+            "Address 3": "",
+            "Postcode": "CC3 3CC",
+            "Internal Reference": "CC-1",
+        }
+    )
+    return csv_client.save_rows(rows, "uploads/input.csv")
+
+
+def _drain_queue(boto_sqs: Any, queue_url: str) -> list[dict[str, Any]]:
+    bodies: list[dict[str, Any]] = []
+    while True:
+        received: dict[str, Any] = boto_sqs.receive_message(
+            QueueUrl=queue_url, MaxNumberOfMessages=10, WaitTimeSeconds=0
+        )
+        messages = cast(list[dict[str, Any]], received.get("Messages", []))
+        if not messages:
+            break
+        for message in messages:
+            bodies.append(cast(dict[str, Any], json.loads(message["Body"])))
+            boto_sqs.delete_message(
+                QueueUrl=queue_url, ReceiptHandle=message["ReceiptHandle"]
+            )
+    return bodies
+
+
+def test_split_and_dispatch_creates_three_children_for_fixture(
+    harness: Harness,
+) -> None:
+    parent_task, parent_subtask = (
+        harness.task_orchestrator.create_task_with_subtask(
+            task_source="manual:postcode-splitter-int"
+        )
+    )
+    input_uri = _upload_fixture_csv(harness.csv_client)
+
+    child_ids = harness.splitter.split_and_dispatch(
+        parent_task_id=parent_task.id,
+        parent_subtask_id=parent_subtask.id,
+        input_s3_uri=input_uri,
+    )
+
+    assert len(child_ids) == 3
+    # All child ids are unique and each is persisted as a child of the
+    # parent task (this test does not assert the WAITING status).
+    assert len(set(child_ids)) == 3
+    for cid in child_ids:
+        child = harness.subtasks.get(cid)
+        assert child.task_id == parent_task.id
+
+
+def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri(
+    harness: Harness,
+) -> None:
+    parent_task, parent_subtask = (
+        harness.task_orchestrator.create_task_with_subtask(
+            task_source="manual:postcode-splitter-int"
+        )
+    )
+    input_uri = _upload_fixture_csv(harness.csv_client)
+
+    child_ids = harness.splitter.split_and_dispatch(
+        parent_task_id=parent_task.id,
+        parent_subtask_id=parent_subtask.id,
+        input_s3_uri=input_uri,
+    )
+
+    for cid in child_ids:
+        child = harness.subtasks.get(cid)
+        assert child.inputs is not None
+        assert child.inputs["task_id"] == str(parent_task.id)
+        batch_uri = child.inputs["s3_uri"]
+        assert isinstance(batch_uri, str)
+        prefix = (
+            f"s3://{BUCKET}/ara_postcode_splitter_batches/"
+            f"{parent_task.id}/{parent_subtask.id}/"
+        )
+        assert batch_uri.startswith(prefix)
+        assert batch_uri.endswith(".csv")
+
+
+def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids(
+    harness: Harness,
+) -> None:
+    parent_task, parent_subtask = (
+        harness.task_orchestrator.create_task_with_subtask(
+            task_source="manual:postcode-splitter-int"
+        )
+    )
+    input_uri = _upload_fixture_csv(harness.csv_client)
+
+    child_ids = harness.splitter.split_and_dispatch(
+        parent_task_id=parent_task.id,
+        parent_subtask_id=parent_subtask.id,
+        input_s3_uri=input_uri,
+    )
+
+    bodies = _drain_queue(harness.boto_sqs, harness.queue_url)
+    assert len(bodies) == len(child_ids)
+
+    # Match queue messages against persisted child inputs by sub_task_id;
+    # the message body's task_id/s3_uri must agree with the SubTask inputs.
+    bodies_by_child = {body["sub_task_id"]: body for body in bodies}
+    assert set(bodies_by_child.keys()) == {str(cid) for cid in child_ids}
+    for cid in child_ids:
+        child = harness.subtasks.get(cid)
+        body = bodies_by_child[str(cid)]
+        assert child.inputs is not None
+        assert body == {
+            "task_id": str(parent_task.id),
+            "sub_task_id": str(cid),
+            "s3_uri": child.inputs["s3_uri"],
+        }
+
+
+def test_split_and_dispatch_returns_child_ids_in_dispatch_order(
+    harness: Harness,
+) -> None:
+    parent_task, parent_subtask = (
+        harness.task_orchestrator.create_task_with_subtask(
+            task_source="manual:postcode-splitter-int"
+        )
+    )
+    input_uri = _upload_fixture_csv(harness.csv_client)
+
+    child_ids = harness.splitter.split_and_dispatch(
+        parent_task_id=parent_task.id,
+        parent_subtask_id=parent_subtask.id,
+        input_s3_uri=input_uri,
+    )
+
+    # Re-load each child's saved batch and inspect the postcode column to
+    # confirm the dispatch order matches the postcode-batching algorithm:
+    # AA-batch first, BB oversize batch second, CC final-flush third.
+    postcodes_per_batch: list[set[str]] = []
+    for cid in child_ids:
+        child = harness.subtasks.get(cid)
+        assert child.inputs is not None
+        rows = harness.csv_client.read_rows(child.inputs["s3_uri"])
+        postcodes_per_batch.append({row["postcode"] for row in rows})
+
+    assert postcodes_per_batch == [
+        {"AA11AA"},
+        {"BB22BB"},
+        {"CC33CC"},
+    ]

From 914a8ed51e13bdd1cb644de4972d0d4951269adc Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 11:07:40 +0000
Subject: [PATCH 76/91] postcode splitter working e2e

---
 .gitignore                                    |   1 +
 applications/postcode_splitter/Dockerfile     |  13 ++
 .../local_handler/.env.local.example          |  34 +++++
 .../local_handler/docker-compose.yml          |   9 ++
 .../local_handler/invoke_local_lambda.py      |  37 +++++
 .../local_handler/run_local.sh                |  12 ++
 backend/address2UPRN/handler/requirements.txt |   3 +-
 .../terraform/lambda/postcodeSplitter/main.tf |  14 --
 domain/addresses/user_address.py              |  16 ++-
 infrastructure/csv_s3_client.py               |   2 +-
 infrastructure/s3_uri.py                      |  43 ++++++
 .../user_address_csv_s3_repository.py         |  57 +++++---
 tests/domain/addresses/test_user_address.py   |  26 ++++
 tests/infrastructure/test_s3_uri.py           |  32 +++++
 .../test_postcode_splitter_orchestrator.py    |  12 +-
 .../test_user_address_csv_s3_repository.py    | 127 +++++++++++-------
 .../aws_lambda/test_subtask_handler.py        | 111 +++++++++++++++
 utilities/aws_lambda/subtask_handler.py       |  67 ++++++++-
 18 files changed, 523 insertions(+), 93 deletions(-)
 create mode 100644 applications/postcode_splitter/local_handler/.env.local.example
 create mode 100644 applications/postcode_splitter/local_handler/docker-compose.yml
 create mode 100755 applications/postcode_splitter/local_handler/invoke_local_lambda.py
 create mode 100755 applications/postcode_splitter/local_handler/run_local.sh
 create mode 100644 infrastructure/s3_uri.py
 create mode 100644 tests/infrastructure/test_s3_uri.py

diff --git a/.gitignore b/.gitignore
index 888d527a..9e5df0c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -121,6 +121,7 @@ celerybeat.pid
 
 # Environments
 .env
+.env.local
 .venv
 env/
 venv/
diff --git a/applications/postcode_splitter/Dockerfile b/applications/postcode_splitter/Dockerfile
index 578ee7a7..aea1f914 100644
--- a/applications/postcode_splitter/Dockerfile
+++ b/applications/postcode_splitter/Dockerfile
@@ -1,5 +1,18 @@
 FROM public.ecr.aws/lambda/python:3.11
 
+# Postgres host/port/database are baked into the image at build time from
+# the deploy workflow's --build-arg values (GitHub Actions DEV_DB_* secrets),
+# mirroring backend/postcode_splitter/handler/Dockerfile. They map onto the
+# POSTGRES_* names PostgresConfig.from_env reads. Username/password are NOT
+# baked in -- Terraform injects those as Lambda env vars from Secrets Manager.
+ARG DEV_DB_HOST
+ARG DEV_DB_PORT
+ARG DEV_DB_NAME
+
+ENV POSTGRES_HOST=${DEV_DB_HOST}
+ENV POSTGRES_PORT=${DEV_DB_PORT}
+ENV POSTGRES_DATABASE=${DEV_DB_NAME}
+
 WORKDIR /var/task
 
 COPY applications/postcode_splitter/requirements.txt .
diff --git a/applications/postcode_splitter/local_handler/.env.local.example b/applications/postcode_splitter/local_handler/.env.local.example
new file mode 100644
index 00000000..28fa8390
--- /dev/null
+++ b/applications/postcode_splitter/local_handler/.env.local.example
@@ -0,0 +1,34 @@
+# Local-test environment for the postcode_splitter Lambda.
+#
+#   cp .env.local.example .env.local   then fill in the values below.
+#
+# .env.local is gitignored. The container hits REAL AWS and a REAL Postgres,
+# so every value here points at infrastructure that actually exists.
+#
+# NOTE: the new DDD code uses different env var names than the repo root
+# .env. The mapping (root .env name -> var here) is given per section.
+# Keep comments on their own lines — docker-compose's env_file parser folds a
+# trailing "# ..." into the value.
+
+# --- Postgres (orchestration/default_orchestrator -> PostgresConfig.from_env) ---
+# POSTGRES_HOST <- DB_HOST, PORT <- DB_PORT, USERNAME <- DB_USERNAME,
+# PASSWORD <- DB_PASSWORD, DATABASE <- DB_NAME.
+POSTGRES_HOST=
+POSTGRES_PORT=5432
+POSTGRES_USERNAME=
+POSTGRES_PASSWORD=
+POSTGRES_DATABASE=
+# Optional; defaults to psycopg2. To override, add a bare line: POSTGRES_DRIVER=psycopg2
+
+# --- Handler config (applications/postcode_splitter/handler.py) ---
+# S3_BUCKET_NAME: bucket holding the input address CSV (root .env: DATA_BUCKET).
+# ADDRESS2UPRN_QUEUE_URL: SQS queue the splitter fans batches out to; not in
+# the root .env (Terraform sets it in prod).
+S3_BUCKET_NAME=
+ADDRESS2UPRN_QUEUE_URL=
+
+# --- AWS credentials for boto3 (S3 + SQS clients) ---
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+AWS_DEFAULT_REGION=eu-west-2
+# Only for temporary/SSO credentials; add a bare line: AWS_SESSION_TOKEN=<token>
diff --git a/applications/postcode_splitter/local_handler/docker-compose.yml b/applications/postcode_splitter/local_handler/docker-compose.yml
new file mode 100644
index 00000000..68af1c40
--- /dev/null
+++ b/applications/postcode_splitter/local_handler/docker-compose.yml
@@ -0,0 +1,9 @@
+services:
+  postcode-splitter:
+    build:
+      context: ../../../
+      dockerfile: applications/postcode_splitter/Dockerfile
+    ports:
+      - "9001:8080"
+    env_file:
+      - .env.local
diff --git a/applications/postcode_splitter/local_handler/invoke_local_lambda.py b/applications/postcode_splitter/local_handler/invoke_local_lambda.py
new file mode 100755
index 00000000..c0ca89ec
--- /dev/null
+++ b/applications/postcode_splitter/local_handler/invoke_local_lambda.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""POST a single SQS-shaped event at the locally-running splitter Lambda.
+
+The container built by docker-compose runs the AWS Lambda Runtime Interface
+Emulator, which accepts invocations on the URL below. Replace the three
+placeholder values with a real parent Task id, the splitter's own SubTask id
+(both must already exist in the Postgres pointed at by .env.local), and the
+s3://... URI of an uploaded address CSV.
+"""
+
+import json
+import requests
+
+HOST = "localhost"
+PORT = "9001"
+
+LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations"
+
+payload = {
+    "Records": [
+        {
+            "body": json.dumps(
+                {
+                    "task_id": "f4b3332f-c0cc-481f-96a5-d39860a647cf",
+                    "sub_task_id": "14c042de-40c4-473b-8cd8-72c983a94a8d",
+                    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv",
+                }
+            )
+        }
+    ]
+}
+
+response = requests.post(LAMBDA_URL, json=payload)
+
+print("Status code:", response.status_code)
+print("Response:")
+print(response.text)
diff --git a/applications/postcode_splitter/local_handler/run_local.sh b/applications/postcode_splitter/local_handler/run_local.sh
new file mode 100755
index 00000000..345b60ee
--- /dev/null
+++ b/applications/postcode_splitter/local_handler/run_local.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+cd "$(dirname "$0")"
+
+if [ ! -f .env.local ]; then
+  cp .env.local.example .env.local
+  echo "Created .env.local from the template — fill it in, then re-run." >&2
+  exit 1
+fi
+
+docker compose build --no-cache
+docker compose up --force-recreate
diff --git a/backend/address2UPRN/handler/requirements.txt b/backend/address2UPRN/handler/requirements.txt
index 6ef41b2d..02aaefba 100644
--- a/backend/address2UPRN/handler/requirements.txt
+++ b/backend/address2UPRN/handler/requirements.txt
@@ -8,4 +8,5 @@ boto3==1.35.44
 sqlmodel
 sqlalchemy==2.0.36
 psycopg2-binary==2.9.10
-pydantic-settings==2.6.0
\ No newline at end of file
+pydantic-settings==2.6.0
+httpx
\ No newline at end of file
diff --git a/deployment/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf
index 94c5cd4e..325f7dc7 100644
--- a/deployment/terraform/lambda/postcodeSplitter/main.tf
+++ b/deployment/terraform/lambda/postcodeSplitter/main.tf
@@ -40,20 +40,6 @@ module "lambda" {
       LOG_LEVEL = "info"
       DB_USERNAME = local.db_credentials.db_assessment_model_username
       DB_PASSWORD = local.db_credentials.db_assessment_model_password
-      GOOGLE_SOLAR_API_KEY = "test"
-      SAP_PREDICTIONS_BUCKET = "test"
-      CARBON_PREDICTIONS_BUCKET = "test"
-      HEAT_PREDICTIONS_BUCKET = "test"
-      HEATING_KWH_PREDICTIONS_BUCKET = "test"
-      HOTWATER_KWH_PREDICTIONS_BUCKET = "test"
-      API_KEY = "test"
-      ENVIRONMENT = "test"
-      SECRET_KEY = "test"
-      PLAN_TRIGGER_BUCKET = "test"
-      DATA_BUCKET = "test"
-      EPC_AUTH_TOKEN = "test"
-      ENGINE_SQS_URL = "test"
-      ENERGY_ASSESSMENTS_BUCKET = "test"
       ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url
       S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
     },
diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py
index e48dfdec..120a3659 100644
--- a/domain/addresses/user_address.py
+++ b/domain/addresses/user_address.py
@@ -8,12 +8,17 @@ caller can construct an instance with an un-normalised postcode.
 
 from __future__ import annotations
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional
 
 from domain.postcodes.sanitise import sanitise_postcode
 
 
+def _empty_source_row() -> dict[str, str]:
+    """Typed default factory for :attr:`UserAddress.source_row`."""
+    return {}
+
+
 @dataclass(frozen=True)
 class UserAddress:
     """A user-supplied address paired with its canonical postcode.
@@ -25,11 +30,20 @@ class UserAddress:
             :meth:`__post_init__`.
         internal_reference: Optional customer-side identifier preserved for
             traceability through the matching pipeline.
+        source_row: The complete original CSV row this address was parsed
+            from, column name -> cell value. The splitter is a pass-through
+            router: it groups rows by postcode but must not drop the other
+            columns the downstream address2uprn stage relies on, so the raw
+            row travels alongside the parsed fields. Excluded from equality
+            and hashing -- identity stays defined by the parsed fields above.
     """
 
     user_address: str
     postcode: str
     internal_reference: Optional[str] = None
+    source_row: dict[str, str] = field(
+        default_factory=_empty_source_row, compare=False
+    )
 
     def __post_init__(self) -> None:
         # Frozen dataclass: bypass the descriptor with object.__setattr__.
diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py
index 5163705b..0a576b81 100644
--- a/infrastructure/csv_s3_client.py
+++ b/infrastructure/csv_s3_client.py
@@ -2,7 +2,7 @@ import csv
 from io import StringIO
 
 from infrastructure.s3_client import S3Client
-from utils.s3 import parse_s3_uri
+from infrastructure.s3_uri import parse_s3_uri
 
 
 class CsvS3Client(S3Client):
diff --git a/infrastructure/s3_uri.py b/infrastructure/s3_uri.py
new file mode 100644
index 00000000..bf97100e
--- /dev/null
+++ b/infrastructure/s3_uri.py
@@ -0,0 +1,43 @@
+"""Parse S3 URIs into ``(bucket, key)`` pairs.
+
+A pure-stdlib helper for the infrastructure layer. It deliberately pulls in
+neither pandas, boto3, nor the legacy ``utils`` package, so slim Lambda images
+that only need URI parsing do not drag the wider data stack along.
+
+Two input shapes are supported:
+
+* canonical S3 URIs --- ``s3://bucket/key``
+* AWS S3 console URLs --- ``https://.../s3/object/bucket?prefix=key``
+"""
+
+from urllib.parse import unquote
+
+
+def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
+    """Return the ``(bucket, key)`` pair addressed by ``s3_uri``.
+
+    Raises:
+        ValueError: if ``s3_uri`` is neither an ``s3://bucket/key`` URI nor a
+            ``/s3/object/`` console URL with a query (empty key if no ``prefix``).
+    """
+    if s3_uri.startswith("s3://"):
+        parts = s3_uri[len("s3://") :].split("/", 1)
+        if len(parts) < 2 or not parts[0] or not parts[1]:
+            raise ValueError("S3 URI must include both a bucket and a key")
+        return parts[0], parts[1]
+
+    if "?" not in s3_uri:
+        raise ValueError(f"Not an s3:// URI and has no query string: {s3_uri!r}")
+    base, query = s3_uri.split("?", 1)
+
+    if "/s3/object/" not in base:
+        raise ValueError(f"Console URL has no '/s3/object/' segment: {s3_uri!r}")
+    bucket = base.split("/s3/object/", 1)[1]
+
+    params: dict[str, str] = {}
+    for item in query.split("&"):
+        if "=" in item:
+            name, value = item.split("=", 1)
+            params[name] = value
+    key = unquote(params.get("prefix", ""))
+    return bucket, key
diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py
index be2baa13..7cd10bac 100644
--- a/repositories/user_address/user_address_csv_s3_repository.py
+++ b/repositories/user_address/user_address_csv_s3_repository.py
@@ -1,12 +1,16 @@
 """CSV-on-S3 adapter for :class:`UserAddressRepository`.
 
-Reads canonical upload CSVs (``Address 1``, ``Address 2``, ``Address 3``,
-``Postcode``, ``Internal Reference``) and writes the splitter's compact
-3-column form (``user_address``, ``postcode``, ``internal_reference``).
+Reads upload CSVs that carry a ``postcode`` column (plus optional
+``Address 1``/``Address 2``/``Address 3`` and ``Internal Reference``), and
+writes batch CSVs that pass *every* original column through unchanged with
+one column appended -- ``postcode_clean`` (uppercase, whitespace-stripped) --
+which the downstream address2uprn stage groups on.
 
-The frontend pre-applies the user's column mapping at upload time, so this
-adapter does NOT consult any ``BulkAddressUpload.column_mapping``: it always
-expects the canonical column names listed above.
+The splitter is a pass-through router: it must not reshape or drop columns,
+because address2uprn has not been migrated and still consumes the legacy
+splitter's full-row output. The frontend pre-applies the user's column
+mapping at upload time, so this adapter does NOT consult any
+``BulkAddressUpload.column_mapping``.
 """
 
 from __future__ import annotations
@@ -20,8 +24,9 @@ from infrastructure.csv_s3_client import CsvS3Client
 from repositories.user_address.user_address_repository import UserAddressRepository
 
 _ADDRESS_COLUMNS: tuple[str, str, str] = ("Address 1", "Address 2", "Address 3")
-_POSTCODE_COLUMN: str = "Postcode"
+_POSTCODE_COLUMN: str = "postcode"
 _INTERNAL_REFERENCE_COLUMN: str = "Internal Reference"
+_POSTCODE_CLEAN_COLUMN: str = "postcode_clean"
 
 
 class UserAddressCsvS3Repository(UserAddressRepository):
@@ -37,15 +42,27 @@ class UserAddressCsvS3Repository(UserAddressRepository):
         self._bucket = bucket
 
     def load_batch(self, s3_uri: str) -> list[UserAddress]:
-        """Load canonical upload CSV rows into :class:`UserAddress` objects.
+        """Load upload CSV rows into :class:`UserAddress` objects.
 
-        Concatenates ``Address 1``/``Address 2``/``Address 3`` with ``", "``,
-        skipping missing or empty parts, into ``user_address``. Falls back to
-        just ``Address 1`` when 2 and 3 are absent. Passes ``Internal Reference``
-        through to :attr:`UserAddress.internal_reference` (``None`` when the
-        column is missing or empty).
+        Each row's complete column set is preserved on
+        :attr:`UserAddress.source_row` so :meth:`save_batch` can pass it
+        through untouched. The parsed convenience fields are also populated:
+        ``Address 1``/``Address 2``/``Address 3`` are concatenated with
+        ``", "`` (skipping missing/empty parts) into ``user_address``, and
+        ``Internal Reference`` is threaded to
+        :attr:`UserAddress.internal_reference` (``None`` when missing/empty).
+
+        Raises:
+            ValueError: if the CSV has rows but no ``postcode`` column --
+                without it the splitter cannot group, and silently emitting
+                empty postcodes would corrupt every downstream batch.
         """
         rows = self._csv_client.read_rows(s3_uri)
+        if rows and _POSTCODE_COLUMN not in rows[0]:
+            raise ValueError(
+                f"Input CSV {s3_uri} has no {_POSTCODE_COLUMN!r} column; "
+                f"columns present: {sorted(rows[0])}"
+            )
         addresses: list[UserAddress] = []
         for row in rows:
             parts = [
@@ -62,22 +79,24 @@ class UserAddressCsvS3Repository(UserAddressRepository):
                     user_address=user_address,
                     postcode=postcode,
                     internal_reference=internal_reference,
+                    source_row=row,
                 )
             )
         return addresses
 
     def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str:
-        """Write a 3-column CSV under a unique key beneath ``path_prefix``.
+        """Write a pass-through batch CSV under a unique key.
+
+        Each output row is the address's original ``source_row`` with a
+        ``postcode_clean`` column appended (the canonical postcode the
+        downstream address2uprn stage groups on). No original column is
+        dropped or reshaped.
 
         The key is ``{path_prefix}/{ISO-8601 datetime}_{8-char uuid}.csv``.
         Returns the full ``s3://bucket/key`` URI.
         """
         rows: list[dict[str, str]] = [
-            {
-                "user_address": addr.user_address,
-                "postcode": addr.postcode,
-                "internal_reference": addr.internal_reference or "",
-            }
+            {**addr.source_row, _POSTCODE_CLEAN_COLUMN: addr.postcode}
             for addr in addresses
         ]
         filename = (
diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py
index e722077d..4d8322da 100644
--- a/tests/domain/addresses/test_user_address.py
+++ b/tests/domain/addresses/test_user_address.py
@@ -43,3 +43,29 @@ def test_user_address_equality_uses_sanitised_postcode() -> None:
     a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
     b = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
     assert a == b
+
+
+def test_user_address_source_row_defaults_to_empty_dict() -> None:
+    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    assert addr.source_row == {}
+
+
+def test_user_address_carries_source_row() -> None:
+    row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
+    addr = UserAddress(
+        user_address="1 The Street", postcode="SW1A 1AA", source_row=row
+    )
+    assert addr.source_row == row
+
+
+def test_user_address_equality_ignores_source_row() -> None:
+    # source_row is excluded from equality (and hashing): identity stays
+    # defined by the parsed fields, so two addresses parsed from rows with
+    # different incidental columns still compare equal.
+    a = UserAddress(
+        user_address="1 The Street", postcode="SW1A1AA", source_row={"x": "1"}
+    )
+    b = UserAddress(
+        user_address="1 The Street", postcode="SW1A1AA", source_row={"y": "2"}
+    )
+    assert a == b
diff --git a/tests/infrastructure/test_s3_uri.py b/tests/infrastructure/test_s3_uri.py
new file mode 100644
index 00000000..896c5959
--- /dev/null
+++ b/tests/infrastructure/test_s3_uri.py
@@ -0,0 +1,32 @@
+import pytest
+
+from infrastructure.s3_uri import parse_s3_uri
+
+
+def test_parses_simple_s3_uri() -> None:
+    assert parse_s3_uri("s3://my-bucket/file.csv") == ("my-bucket", "file.csv")
+
+
+def test_parses_s3_uri_with_nested_key() -> None:
+    bucket, key = parse_s3_uri("s3://my-bucket/nested/path/to/file.csv")
+    assert (bucket, key) == ("my-bucket", "nested/path/to/file.csv")
+
+
+def test_rejects_s3_uri_without_key() -> None:
+    with pytest.raises(ValueError, match="bucket and a key"):
+        parse_s3_uri("s3://my-bucket")
+
+
+def test_rejects_s3_uri_with_empty_key() -> None:
+    with pytest.raises(ValueError, match="bucket and a key"):
+        parse_s3_uri("s3://my-bucket/")
+
+
+def test_parses_console_url_prefix() -> None:
+    url = "https://eu-west-2.console.aws.amazon.com/s3/object/my-bucket?prefix=nested%2Ffile.csv"
+    assert parse_s3_uri(url) == ("my-bucket", "nested/file.csv")
+
+
+def test_rejects_unparseable_string() -> None:
+    with pytest.raises(ValueError):
+        parse_s3_uri("not-a-uri-at-all")
diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py
index 57bd2133..79c60974 100644
--- a/tests/orchestration/test_postcode_splitter_orchestrator.py
+++ b/tests/orchestration/test_postcode_splitter_orchestrator.py
@@ -132,7 +132,7 @@ def _upload_fixture_csv(csv_client: CsvS3Client) -> str:
             "Address 1": f"{i} High St",
             "Address 2": "",
             "Address 3": "",
-            "Postcode": "AA1 1AA",
+            "postcode": "AA1 1AA",
             "Internal Reference": f"AA-{i}",
         }
         for i in range(1, 3)
@@ -142,7 +142,7 @@ def _upload_fixture_csv(csv_client: CsvS3Client) -> str:
             "Address 1": f"{i} Long Road",
             "Address 2": "",
             "Address 3": "",
-            "Postcode": "BB2 2BB",
+            "postcode": "BB2 2BB",
             "Internal Reference": f"BB-{i}",
         }
         for i in range(1, 5)
@@ -152,7 +152,7 @@ def _upload_fixture_csv(csv_client: CsvS3Client) -> str:
             "Address 1": "1 Final Way",
             "Address 2": "",
             "Address 3": "",
-            "Postcode": "CC3 3CC",
+            "postcode": "CC3 3CC",
             "Internal Reference": "CC-1",
         }
     )
@@ -281,15 +281,15 @@ def test_split_and_dispatch_returns_child_ids_in_dispatch_order(
         input_s3_uri=input_uri,
     )
 
-    # Re-load each child's saved batch and inspect the postcode column to
-    # confirm the dispatch order matches the postcode-batching algorithm:
+    # Re-load each child's saved batch and inspect the postcode_clean column
+    # to confirm the dispatch order matches the postcode-batching algorithm:
     # AA-batch first, BB oversize batch second, CC final-flush third.
     postcodes_per_batch: list[set[str]] = []
     for cid in child_ids:
         child = harness.subtasks.get(cid)
         assert child.inputs is not None
         rows = harness.csv_client.read_rows(child.inputs["s3_uri"])
-        postcodes_per_batch.append({row["postcode"] for row in rows})
+        postcodes_per_batch.append({row["postcode_clean"] for row in rows})
 
     assert postcodes_per_batch == [
         {"AA11AA"},
diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
index ca9e8a57..48733b55 100644
--- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py
+++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
@@ -3,6 +3,7 @@ from collections.abc import Iterator
 import pytest
 from moto import mock_aws
 
+from domain.addresses.user_address import UserAddress
 from infrastructure.csv_s3_client import CsvS3Client
 from repositories.user_address.user_address_csv_s3_repository import (
     UserAddressCsvS3Repository,
@@ -27,7 +28,7 @@ def _upload_csv(
     return repo._csv_client.save_rows(rows, key)  # pyright: ignore[reportPrivateUsage]
 
 
-def test_load_batch_concatenates_three_address_lines(
+def test_load_batch_parses_address_postcode_and_reference(
     repo: UserAddressCsvS3Repository,
 ) -> None:
     rows = [
@@ -35,7 +36,7 @@ def test_load_batch_concatenates_three_address_lines(
             "Address 1": "1 High Street",
             "Address 2": "Flat 2",
             "Address 3": "Townville",
-            "Postcode": "sw1a 1aa",
+            "postcode": "sw1a 1aa",
             "Internal Reference": "REF-001",
         }
     ]
@@ -58,7 +59,7 @@ def test_load_batch_uses_only_address_1_when_others_missing(
             "Address 1": "10 Cardiff Road",
             "Address 2": "",
             "Address 3": "",
-            "Postcode": "CF10 1AA",
+            "postcode": "CF10 1AA",
             "Internal Reference": "REF-002",
         }
     ]
@@ -80,7 +81,7 @@ def test_load_batch_handles_missing_internal_reference(
             "Address 1": "5 Park Lane",
             "Address 2": "",
             "Address 3": "",
-            "Postcode": "M1 1AA",
+            "postcode": "M1 1AA",
             "Internal Reference": "",
         }
     ]
@@ -94,16 +95,67 @@ def test_load_batch_handles_missing_internal_reference(
     assert addresses[0].internal_reference is None
 
 
+def test_load_batch_captures_full_source_row(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    # A raw EPC-export-shaped row: the splitter must preserve every column,
+    # not just the ones it parses into UserAddress fields.
+    row = {
+        "Asset Reference": "511",
+        "Address 1": "9 Abingdon Road Padiham Lancashire BB12 7BX",
+        "postcode": "BB12 7BX",
+        "Property Type": "House: End Terrace",
+        "SAP Score": "69",
+    }
+    uri = _upload_csv(repo, [row], "uploads/epc.csv")
+
+    addresses = repo.load_batch(uri)
+
+    assert addresses[0].source_row == row
+
+
+def test_load_batch_raises_when_postcode_column_absent(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}]
+    uri = _upload_csv(repo, rows, "uploads/no-postcode.csv")
+
+    with pytest.raises(ValueError, match="no 'postcode' column"):
+        repo.load_batch(uri)
+
+
+def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
+    repo: UserAddressCsvS3Repository,
+) -> None:
+    row = {
+        "Asset Reference": "511",
+        "Address 1": "9 Abingdon Road Padiham Lancashire BB12 7BX",
+        "postcode": " BB12 7BX",
+        "Property Type": "House: End Terrace",
+    }
+    uri = _upload_csv(repo, [row], "uploads/epc.csv")
+    addresses = repo.load_batch(uri)
+
+    saved_uri = repo.save_batch(addresses, "tasks/passthrough")
+    saved_rows = repo._csv_client.read_rows(saved_uri)  # pyright: ignore[reportPrivateUsage]
+
+    assert len(saved_rows) == 1
+    saved = saved_rows[0]
+    # Every original column survives, byte-for-byte.
+    for column, value in row.items():
+        assert saved[column] == value
+    # Plus the one appended column the downstream address2uprn stage groups on.
+    assert saved["postcode_clean"] == "BB127BX"
+
+
 def test_save_batch_returns_uri_under_path_prefix(
     repo: UserAddressCsvS3Repository,
 ) -> None:
-    from domain.addresses.user_address import UserAddress
-
     addresses = [
         UserAddress(
-            user_address="1 High Street, Flat 2, Townville",
+            user_address="1 High Street",
             postcode="SW1A 1AA",
-            internal_reference="REF-001",
+            source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
         ),
     ]
 
@@ -113,59 +165,42 @@ def test_save_batch_returns_uri_under_path_prefix(
     assert uri.endswith(".csv")
 
 
-def test_save_then_reload_round_trip_preserves_values(
+def test_save_then_reload_round_trip_preserves_columns(
     repo: UserAddressCsvS3Repository,
 ) -> None:
-    from domain.addresses.user_address import UserAddress
-
-    # save_batch writes the splitter's compact schema
-    # (user_address/postcode/internal_reference); load_batch reads the
-    # canonical upload schema. To round-trip through the repo we re-upload
-    # the saved CSV under the upload schema's column names.
-    original = [
-        UserAddress(
-            user_address="1 High Street",
-            postcode="SW1A 1AA",
-            internal_reference="REF-001",
-        ),
-        UserAddress(
-            user_address="2 Low Street",
-            postcode="XY9 8ZW",
-            internal_reference=None,
-        ),
-    ]
-
-    saved_uri = repo.save_batch(original, "tasks/round-trip")
-
-    # Re-shape the saved CSV into the canonical upload schema for reload.
-    saved_rows = repo._csv_client.read_rows(saved_uri)  # pyright: ignore[reportPrivateUsage]
-    upload_rows: list[dict[str, str]] = [
+    rows = [
         {
-            "Address 1": row["user_address"],
-            "Address 2": "",
-            "Address 3": "",
-            "Postcode": row["postcode"],
-            "Internal Reference": row["internal_reference"],
-        }
-        for row in saved_rows
+            "Address 1": "1 High Street",
+            "postcode": "SW1A 1AA",
+            "Internal Reference": "REF-001",
+        },
+        {
+            "Address 1": "2 Low Street",
+            "postcode": "XY9 8ZW",
+            "Internal Reference": "",
+        },
     ]
-    upload_uri = _upload_csv(repo, upload_rows, "uploads/round-trip.csv")
+    uri = _upload_csv(repo, rows, "uploads/round-trip.csv")
+    addresses = repo.load_batch(uri)
 
-    reloaded = repo.load_batch(upload_uri)
+    saved_uri = repo.save_batch(addresses, "tasks/round-trip")
+    saved_rows = repo._csv_client.read_rows(saved_uri)  # pyright: ignore[reportPrivateUsage]
 
-    assert reloaded == original
+    # Original columns come back verbatim; postcode_clean is the only addition.
+    assert [
+        {k: v for k, v in r.items() if k != "postcode_clean"} for r in saved_rows
+    ] == rows
+    assert [r["postcode_clean"] for r in saved_rows] == ["SW1A1AA", "XY98ZW"]
 
 
 def test_save_batch_uses_unique_filename_per_call(
     repo: UserAddressCsvS3Repository,
 ) -> None:
-    from domain.addresses.user_address import UserAddress
-
     addresses = [
         UserAddress(
             user_address="1 High Street",
             postcode="SW1A 1AA",
-            internal_reference="REF-001",
+            source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
         ),
     ]
 
diff --git a/tests/utilities/aws_lambda/test_subtask_handler.py b/tests/utilities/aws_lambda/test_subtask_handler.py
index 426b250f..771a49f8 100644
--- a/tests/utilities/aws_lambda/test_subtask_handler.py
+++ b/tests/utilities/aws_lambda/test_subtask_handler.py
@@ -6,6 +6,7 @@ to the wrapped function — so the handler can compose its own use-case
 orchestrator that shares the session.
 """
 
+import logging
 from collections.abc import Generator, Iterator
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -13,6 +14,8 @@ from typing import Any
 from uuid import UUID
 
 import pytest
+
+_LOGGER_NAME = "utilities.aws_lambda.subtask_handler"
 from sqlmodel import Session, SQLModel, create_engine
 
 from domain.tasks.subtasks import SubTaskStatus
@@ -142,3 +145,111 @@ def test_subtask_handler_injected_orchestrator_can_create_child_subtask(
     persisted_child = harness.subtasks.get(child_ids[0])
     assert persisted_child.task_id == task.id
     assert persisted_child.status is SubTaskStatus.WAITING
+
+
+def test_subtask_handler_logs_subtask_lifecycle_on_success(
+    harness: Harness, caplog: pytest.LogCaptureFixture
+) -> None:
+    """Start and completion are logged at INFO so a successful invocation
+    leaves a CloudWatch breadcrumb (not just the Lambda runtime lines)."""
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    @subtask_handler(orchestrator_cm=harness.factory)
+    def handler(
+        body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator
+    ) -> None:
+        return None
+
+    with caplog.at_level(logging.INFO, logger=_LOGGER_NAME):
+        handler(_direct_event(task.id, subtask.id), context=None)
+
+    assert f"Running subtask {subtask.id}" in caplog.text
+    assert f"Subtask {subtask.id} completed" in caplog.text
+
+
+def test_subtask_handler_logs_exception_on_failure(
+    harness: Harness, caplog: pytest.LogCaptureFixture
+) -> None:
+    """A failing subtask is logged at ERROR with the traceback attached,
+    before the exception propagates for the Lambda runtime to surface."""
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    @subtask_handler(orchestrator_cm=harness.factory)
+    def handler(
+        body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator
+    ) -> None:
+        raise RuntimeError("boom")
+
+    with caplog.at_level(logging.INFO, logger=_LOGGER_NAME):
+        with pytest.raises(RuntimeError, match="boom"):
+            handler(_direct_event(task.id, subtask.id), context=None)
+
+    failures = [r for r in caplog.records if r.levelno == logging.ERROR]
+    assert any(
+        f"Subtask {subtask.id} failed" in r.getMessage() for r in failures
+    )
+    assert any(r.exc_info is not None for r in failures)
+
+
+def test_subtask_handler_records_cloudwatch_url_on_subtask(
+    harness: Harness, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """With the AWS Lambda runtime's log env vars present, a CloudWatch deep
+    link is built and persisted on the SubTask."""
+    monkeypatch.setenv("AWS_REGION", "eu-west-2")
+    monkeypatch.setenv(
+        "AWS_LAMBDA_LOG_GROUP_NAME", "/aws/lambda/postcode-splitter"
+    )
+    monkeypatch.setenv(
+        "AWS_LAMBDA_LOG_STREAM_NAME", "2026/05/20/[$LATEST]abc123"
+    )
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    @subtask_handler(orchestrator_cm=harness.factory)
+    def handler(
+        body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator
+    ) -> None:
+        return None
+
+    handler(_direct_event(task.id, subtask.id), context=None)
+
+    saved_url = harness.subtasks.get(subtask.id).cloud_logs_url
+    assert saved_url is not None
+    assert saved_url.startswith(
+        "https://eu-west-2.console.aws.amazon.com/cloudwatch/home"
+    )
+    # Log group / stream are console-encoded ("/" -> "$252F").
+    assert "$252Faws$252Flambda$252Fpostcode-splitter" in saved_url
+    assert "$255B$2524LATEST$255D" in saved_url
+
+
+def test_subtask_handler_leaves_cloudwatch_url_unset_outside_lambda(
+    harness: Harness, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Outside a real Lambda (e.g. the local RIE) the runtime log env vars
+    are absent, so cloud_logs_url is left unset rather than storing junk."""
+    for var in (
+        "AWS_REGION",
+        "AWS_LAMBDA_LOG_GROUP_NAME",
+        "AWS_LAMBDA_LOG_STREAM_NAME",
+    ):
+        monkeypatch.delenv(var, raising=False)
+    task, subtask = harness.orchestrator.create_task_with_subtask(
+        task_source="manual:test"
+    )
+
+    @subtask_handler(orchestrator_cm=harness.factory)
+    def handler(
+        body: dict[str, Any], context: Any, orchestrator: TaskOrchestrator
+    ) -> None:
+        return None
+
+    handler(_direct_event(task.id, subtask.id), context=None)
+
+    assert harness.subtasks.get(subtask.id).cloud_logs_url is None
diff --git a/utilities/aws_lambda/subtask_handler.py b/utilities/aws_lambda/subtask_handler.py
index 5ad5f6e1..40f116ad 100644
--- a/utilities/aws_lambda/subtask_handler.py
+++ b/utilities/aws_lambda/subtask_handler.py
@@ -1,18 +1,32 @@
 """@subtask_handler decorator for Lambdas that operate on existing SubTasks.
 
 Translates an AWS Lambda invocation (SQS-shaped or direct) into
-TaskOrchestrator.run_subtask(...) calls.
+TaskOrchestrator.run_subtask(...) calls, emitting an INFO log line for each
+subtask's start and completion and a logged exception on failure. Those lines
+land in CloudWatch via the Lambda runtime's stdout/stderr capture.
+
+Each subtask also records ``cloud_logs_url`` -- a deep link to this
+invocation's CloudWatch log stream -- so an operator can jump from a SubTask
+row straight to its logs. It is built from the environment variables the AWS
+Lambda runtime sets, so it is populated only on real Lambda invocations and
+left unset under the local RIE (which does not export them).
 """
 
 import json
+import logging
+import os
 from contextlib import AbstractContextManager
 from functools import wraps
 from typing import Any, Callable, Optional, cast
+from urllib.parse import quote
 
 from utilities.aws_lambda.default_orchestrator import default_orchestrator
 from utilities.aws_lambda.subtask_trigger_body import SubtaskTriggerBody
 from orchestration.task_orchestrator import TaskOrchestrator
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
 OrchestratorCM = Callable[[], AbstractContextManager[TaskOrchestrator]]
 
 
@@ -33,14 +47,26 @@ def subtask_handler(
     def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
         @wraps(func)
         def wrapper(event: dict[str, Any], context: Any) -> None:
+            cloud_logs_url = _cloudwatch_url()
             with factory() as orchestrator:
                 for record in _records(event):
                     body = _parse_body(record)
                     trigger = SubtaskTriggerBody.model_validate(body)
-                    orchestrator.run_subtask(
-                        trigger.sub_task_id,
-                        work=lambda body=body, o=orchestrator: func(body, context, o),
-                    )
+                    logger.info("Running subtask %s", trigger.sub_task_id)
+                    try:
+                        orchestrator.run_subtask(
+                            trigger.sub_task_id,
+                            work=lambda body=body, o=orchestrator: func(
+                                body, context, o
+                            ),
+                            cloud_logs_url=cloud_logs_url,
+                        )
+                    except Exception:
+                        logger.exception(
+                            "Subtask %s failed", trigger.sub_task_id
+                        )
+                        raise
+                    logger.info("Subtask %s completed", trigger.sub_task_id)
 
         return wrapper
 
@@ -65,3 +91,34 @@ def _records(event: dict[str, Any]) -> list[dict[str, Any]]:
     if isinstance(raw_records, list):
         return [r for r in cast(list[Any], raw_records) if isinstance(r, dict)]
     return [event]
+
+
+def _console_encode(value: str) -> str:
+    """Encode a value for a CloudWatch console deep link.
+
+    The console expects URL-encoding with the percent signs themselves
+    re-encoded as ``$25`` -- e.g. ``/`` becomes ``%2F`` becomes ``$252F``.
+    """
+    return quote(value, safe="").replace("%", "$25")
+
+
+def _cloudwatch_url() -> Optional[str]:
+    """Build a CloudWatch console URL for this invocation's log stream.
+
+    Sourced entirely from the environment variables the AWS Lambda runtime
+    sets -- ``AWS_REGION``, ``AWS_LAMBDA_LOG_GROUP_NAME`` and
+    ``AWS_LAMBDA_LOG_STREAM_NAME``. Returns None when any is absent, which is
+    the case outside a real Lambda (the local RIE does not export them) -- so
+    ``SubTask.cloud_logs_url`` is left unset rather than storing a link that
+    points nowhere.
+    """
+    region = os.environ.get("AWS_REGION")
+    log_group = os.environ.get("AWS_LAMBDA_LOG_GROUP_NAME")
+    log_stream = os.environ.get("AWS_LAMBDA_LOG_STREAM_NAME")
+    if not (region and log_group and log_stream):
+        return None
+    return (
+        f"https://{region}.console.aws.amazon.com/cloudwatch/home"
+        f"?region={region}#logsV2:log-groups/log-group/"
+        f"{_console_encode(log_group)}/log-events/{_console_encode(log_stream)}"
+    )

From 8bb90a5aa5beb495de799c481c2faa7899e6c5de Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 12:57:03 +0000
Subject: [PATCH 77/91] Replace sanitise_postcode with Postcode value object

---
 backend/bulk_address2uprn_combiner/main.py    | 14 +-----
 backend/ordnanceSurvey/main.py                | 12 +----
 domain/addresses/postcode_batching.py         |  8 ++--
 domain/addresses/user_address.py              | 20 ++++----
 domain/postcode.py                            | 40 ++++++++++++++++
 domain/postcodes/__init__.py                  |  0
 domain/postcodes/sanitise.py                  | 23 ---------
 .../user_address_csv_s3_repository.py         |  5 +-
 .../user_address/user_address_repository.py   |  4 +-
 .../addresses/test_postcode_batching.py       | 17 ++++---
 tests/domain/addresses/test_user_address.py   | 48 +++++++++++--------
 tests/domain/postcodes/__init__.py            |  0
 tests/domain/postcodes/test_sanitise.py       | 28 -----------
 tests/domain/test_postcode.py                 | 48 +++++++++++++++++++
 .../test_user_address_csv_s3_repository.py    | 11 +++--
 15 files changed, 153 insertions(+), 125 deletions(-)
 create mode 100644 domain/postcode.py
 delete mode 100644 domain/postcodes/__init__.py
 delete mode 100644 domain/postcodes/sanitise.py
 delete mode 100644 tests/domain/postcodes/__init__.py
 delete mode 100644 tests/domain/postcodes/test_sanitise.py
 create mode 100644 tests/domain/test_postcode.py

diff --git a/backend/bulk_address2uprn_combiner/main.py b/backend/bulk_address2uprn_combiner/main.py
index 37136e52..44f0b3f9 100644
--- a/backend/bulk_address2uprn_combiner/main.py
+++ b/backend/bulk_address2uprn_combiner/main.py
@@ -2,7 +2,7 @@ import os
 import boto3
 import pandas as pd
 from io import BytesIO
-from typing import Any, Optional
+from typing import Any
 from uuid import UUID
 from datetime import datetime, timezone
 
@@ -12,7 +12,6 @@ from backend.app.db.functions.bulk_address_uploads_functions import (
     set_combined_output_s3_uri,
     set_combining_status,
 )
-from orchestration.task_orchestrator import TaskOrchestrator
 
 logger = setup_logger()
 
@@ -36,16 +35,7 @@ def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame:
 
 
 @subtask_handler()
-def handler(
-    body: dict[str, Any],
-    context: Any,
-    orchestrator: Optional[TaskOrchestrator] = None,
-) -> str:
-    # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler
-    # decorator; unused here but accepted so the contract is uniform across
-    # callers (see issue #1103).
-    del orchestrator
-
+def handler(body: dict[str, Any], context: Any) -> str:
     task_id_str: str = body.get("task_id", "")
 
     if not task_id_str:
diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py
index 18c4e2f2..6e82b468 100644
--- a/backend/ordnanceSurvey/main.py
+++ b/backend/ordnanceSurvey/main.py
@@ -16,7 +16,6 @@ from backend.ordnanceSurvey.helpers import (
     os_places_results_to_dataframe,
 )
 from backend.app.config import get_settings
-from orchestration.task_orchestrator import TaskOrchestrator
 from sqlalchemy import select
 from datetime import datetime
 import uuid
@@ -106,16 +105,7 @@ def save_results_to_s3(
 
 
 @subtask_handler()  # This assumes task_id and subtask_id is defined in event.Records.body
-def handler(
-    body: dict[str, Any],
-    context: Any,
-    orchestrator: Optional[TaskOrchestrator] = None,
-    local: bool = False,
-) -> None:
-    # `orchestrator` is injected by the new utilities.aws_lambda.subtask_handler
-    # decorator; unused here but accepted so the contract is uniform across
-    # callers (see issue #1103).
-    del orchestrator
+def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
 
     # delete this line after test
     # local = True
diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py
index 209e0784..b73dc1bb 100644
--- a/domain/addresses/postcode_batching.py
+++ b/domain/addresses/postcode_batching.py
@@ -22,6 +22,7 @@ from __future__ import annotations
 from collections.abc import Iterable, Iterator
 
 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode
 
 
 def iter_postcode_grouped_batches(
@@ -75,13 +76,14 @@ def iter_postcode_grouped_batches(
 
 def _group_by_postcode_in_order(
     addresses: Iterable[UserAddress],
-) -> dict[str, list[UserAddress]]:
+) -> dict[Postcode, list[UserAddress]]:
     """Group addresses by ``postcode`` preserving first-seen order.
 
     Python dicts retain insertion order since 3.7, so a plain dict suffices
-    for the same effect as pandas ``groupby(..., sort=False)``.
+    for the same effect as pandas ``groupby(..., sort=False)``. ``Postcode``
+    is a frozen value object, hence hashable and usable as the dict key.
     """
-    groups: dict[str, list[UserAddress]] = {}
+    groups: dict[Postcode, list[UserAddress]] = {}
     for address in addresses:
         groups.setdefault(address.postcode, []).append(address)
     return groups
diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py
index 120a3659..672b2c54 100644
--- a/domain/addresses/user_address.py
+++ b/domain/addresses/user_address.py
@@ -1,9 +1,9 @@
 """The :class:`UserAddress` value object.
 
 A frozen dataclass capturing the splitter's domain entity: the raw input
-address line, a sanitised postcode, and an optional internal reference from
-the customer dataset. Postcode sanitisation runs in ``__post_init__`` so no
-caller can construct an instance with an un-normalised postcode.
+address line, a :class:`~domain.postcode.Postcode`, and an optional internal
+reference from the customer dataset. The postcode is a value object that is
+canonical by construction, so no caller can hold an un-normalised postcode.
 """
 
 from __future__ import annotations
@@ -11,7 +11,7 @@ from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Optional
 
-from domain.postcodes.sanitise import sanitise_postcode
+from domain.postcode import Postcode
 
 
 def _empty_source_row() -> dict[str, str]:
@@ -25,9 +25,9 @@ class UserAddress:
 
     Attributes:
         user_address: The free-text address string as supplied upstream.
-        postcode: The postcode; always stored in canonical form
-            (uppercased, whitespace stripped). Sanitisation is enforced by
-            :meth:`__post_init__`.
+        postcode: The postcode as a :class:`~domain.postcode.Postcode` value
+            object -- canonical (uppercased, whitespace stripped) by
+            construction.
         internal_reference: Optional customer-side identifier preserved for
             traceability through the matching pipeline.
         source_row: The complete original CSV row this address was parsed
@@ -39,12 +39,8 @@ class UserAddress:
     """
 
     user_address: str
-    postcode: str
+    postcode: Postcode
     internal_reference: Optional[str] = None
     source_row: dict[str, str] = field(
         default_factory=_empty_source_row, compare=False
     )
-
-    def __post_init__(self) -> None:
-        # Frozen dataclass: bypass the descriptor with object.__setattr__.
-        object.__setattr__(self, "postcode", sanitise_postcode(self.postcode))
diff --git a/domain/postcode.py b/domain/postcode.py
new file mode 100644
index 00000000..514e1a39
--- /dev/null
+++ b/domain/postcode.py
@@ -0,0 +1,40 @@
+"""The :class:`Postcode` value object.
+
+A frozen value object that owns postcode sanitisation. Constructing a
+``Postcode`` always yields the canonical form -- uppercase with all
+whitespace removed -- so no part of the domain can hold an un-normalised
+postcode. Like the legacy splitter's ``.str.upper().str.replace(" ", "")``,
+but tabs and newlines are removed as well as spaces.
+
+``Postcode`` is the single sanitisation point: anywhere a postcode crosses a
+domain boundary it should be wrapped in one, and ``str(postcode)`` gives the
+canonical string back for serialisation.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Postcode:
+    """A postcode held in canonical form.
+
+    The ``value`` passed to the constructor is sanitised eagerly in
+    :meth:`__post_init__` -- uppercased, with all whitespace (spaces, tabs,
+    newlines) removed -- so every ``Postcode`` instance is canonical by
+    construction. Two postcodes that differ only in surface whitespace or
+    case therefore compare equal.
+
+    Attributes:
+        value: The canonical postcode string (e.g. ``"SW1A1AA"``).
+    """
+
+    value: str
+
+    def __post_init__(self) -> None:
+        # Frozen dataclass: bypass the descriptor with object.__setattr__.
+        object.__setattr__(self, "value", "".join(self.value.split()).upper())
+
+    def __str__(self) -> str:
+        return self.value
diff --git a/domain/postcodes/__init__.py b/domain/postcodes/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/domain/postcodes/sanitise.py b/domain/postcodes/sanitise.py
deleted file mode 100644
index 94b0dcf7..00000000
--- a/domain/postcodes/sanitise.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""Canonical postcode sanitisation for the domain layer.
-
-The legacy postcode_splitter normalises postcodes inline with
-``df["postcode"].str.upper().str.replace(" ", "")``. This module promotes
-that operation to a pure, reusable function so the same canonical form is
-applied wherever a postcode crosses a domain boundary -- including
-:class:`domain.addresses.user_address.UserAddress` construction and future
-migrations.
-"""
-
-from __future__ import annotations
-
-
-def sanitise_postcode(s: str) -> str:
-    """Return the canonical form of a postcode.
-
-    The canonical form is uppercase with all whitespace removed. This matches
-    the legacy splitter's ``str.upper().str.replace(" ", "")`` for the
-    overwhelmingly common case of space-separated postcodes (e.g. ``"sw1a 1aa"``
-    becomes ``"SW1A1AA"``) while also tolerating tabs/newlines that can creep
-    in from CSV ingestion.
-    """
-    return "".join(s.split()).upper()
diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py
index 7cd10bac..2432d8e9 100644
--- a/repositories/user_address/user_address_csv_s3_repository.py
+++ b/repositories/user_address/user_address_csv_s3_repository.py
@@ -20,6 +20,7 @@ from datetime import datetime, timezone
 from typing import Optional
 
 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode
 from infrastructure.csv_s3_client import CsvS3Client
 from repositories.user_address.user_address_repository import UserAddressRepository
 
@@ -77,7 +78,7 @@ class UserAddressCsvS3Repository(UserAddressRepository):
             addresses.append(
                 UserAddress(
                     user_address=user_address,
-                    postcode=postcode,
+                    postcode=Postcode(postcode),
                     internal_reference=internal_reference,
                     source_row=row,
                 )
@@ -96,7 +97,7 @@ class UserAddressCsvS3Repository(UserAddressRepository):
         Returns the full ``s3://bucket/key`` URI.
         """
         rows: list[dict[str, str]] = [
-            {**addr.source_row, _POSTCODE_CLEAN_COLUMN: addr.postcode}
+            {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)}
             for addr in addresses
         ]
         filename = (
diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py
index d8c12855..ab9b6671 100644
--- a/repositories/user_address/user_address_repository.py
+++ b/repositories/user_address/user_address_repository.py
@@ -17,8 +17,8 @@ class UserAddressRepository(ABC):
 
     Implementations choose the underlying storage (S3 CSV, Postgres,
     in-memory, ...) but must preserve the canonical column semantics:
-    the address text, postcode (sanitised by ``UserAddress.__post_init__``),
-    and an optional internal reference.
+    the address text, postcode (a :class:`~domain.postcode.Postcode` value
+    object), and an optional internal reference.
     """
 
     @abstractmethod
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py
index 2dac46cc..6e52b581 100644
--- a/tests/domain/addresses/test_postcode_batching.py
+++ b/tests/domain/addresses/test_postcode_batching.py
@@ -2,12 +2,15 @@ import pytest
 
 from domain.addresses.postcode_batching import iter_postcode_grouped_batches
 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode
 
 
 def _addrs(postcode: str, n: int) -> list[UserAddress]:
     """Build ``n`` addresses sharing a postcode, with distinct address lines."""
     return [
-        UserAddress(user_address=f"{i} {postcode} Street", postcode=postcode)
+        UserAddress(
+            user_address=f"{i} {postcode} Street", postcode=Postcode(postcode)
+        )
         for i in range(n)
     ]
 
@@ -38,8 +41,8 @@ def test_flush_on_overflow_before_adding_next_postcode() -> None:
     addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3)
     batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5))
     assert len(batches) == 2
-    assert [a.postcode for a in batches[0]] == ["AA11AA"] * 3
-    assert [a.postcode for a in batches[1]] == ["BB22BB"] * 3
+    assert [str(a.postcode) for a in batches[0]] == ["AA11AA"] * 3
+    assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 3
 
 
 def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None:
@@ -61,9 +64,9 @@ def test_oversize_group_flushes_existing_buffer_first() -> None:
         iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)
     )
     assert len(batches) == 3
-    assert [a.postcode for a in batches[0]] == ["AA11AA", "AA11AA"]
-    assert [a.postcode for a in batches[1]] == ["BB22BB"] * 7
-    assert [a.postcode for a in batches[2]] == ["CC33CC"]
+    assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"]
+    assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 7
+    assert [str(a.postcode) for a in batches[2]] == ["CC33CC"]
 
 
 def test_final_flush_yields_remaining_buffer() -> None:
@@ -80,7 +83,7 @@ def test_postcode_grouping_preserves_first_seen_order() -> None:
     b1, b2 = _addrs("AA1 1AA", 2)
     batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2]))
     assert len(batches) == 1
-    assert [a.postcode for a in batches[0]] == [
+    assert [str(a.postcode) for a in batches[0]] == [
         "ZZ99ZZ",
         "ZZ99ZZ",
         "AA11AA",
diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py
index 4d8322da..fa44ad61 100644
--- a/tests/domain/addresses/test_user_address.py
+++ b/tests/domain/addresses/test_user_address.py
@@ -3,69 +3,77 @@ import dataclasses
 import pytest
 
 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode
 
 
-def test_user_address_sanitises_postcode_on_construction() -> None:
-    addr = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
-    assert addr.postcode == "SW1A1AA"
+def test_user_address_holds_postcode_value_object() -> None:
+    addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
+    assert addr.postcode == Postcode("SW1A1AA")
 
 
 def test_user_address_preserves_user_address_verbatim() -> None:
     # The free-text user_address string is intentionally NOT normalised --
-    # only the postcode is canonicalised at the boundary.
-    addr = UserAddress(user_address="  1 The   Street  ", postcode="sw1a 1aa")
+    # only the postcode is canonicalised, and that happens inside Postcode.
+    addr = UserAddress(
+        user_address="  1 The   Street  ", postcode=Postcode("SW1A1AA")
+    )
     assert addr.user_address == "  1 The   Street  "
 
 
 def test_user_address_internal_reference_defaults_to_none() -> None:
-    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
     assert addr.internal_reference is None
 
 
 def test_user_address_internal_reference_accepted() -> None:
     addr = UserAddress(
         user_address="1 The Street",
-        postcode="SW1A1AA",
+        postcode=Postcode("SW1A1AA"),
         internal_reference="cust-42",
     )
     assert addr.internal_reference == "cust-42"
 
 
 def test_user_address_is_frozen() -> None:
-    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
     with pytest.raises(dataclasses.FrozenInstanceError):
-        addr.postcode = "OTHER"  # type: ignore[misc]
+        addr.postcode = Postcode("OTHER")  # type: ignore[misc]
 
 
-def test_user_address_equality_uses_sanitised_postcode() -> None:
-    # Two instances constructed with different surface forms of the same
-    # postcode must compare equal because sanitisation runs eagerly.
-    a = UserAddress(user_address="1 The Street", postcode="sw1a 1aa")
-    b = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+def test_user_address_equality_uses_canonical_postcode() -> None:
+    # Postcode sanitises eagerly, so addresses built from different surface
+    # forms of the same postcode compare equal.
+    a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
+    b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
     assert a == b
 
 
 def test_user_address_source_row_defaults_to_empty_dict() -> None:
-    addr = UserAddress(user_address="1 The Street", postcode="SW1A1AA")
+    addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
     assert addr.source_row == {}
 
 
 def test_user_address_carries_source_row() -> None:
     row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
     addr = UserAddress(
-        user_address="1 The Street", postcode="SW1A 1AA", source_row=row
+        user_address="1 The Street",
+        postcode=Postcode("SW1A 1AA"),
+        source_row=row,
     )
     assert addr.source_row == row
 
 
 def test_user_address_equality_ignores_source_row() -> None:
     # source_row is excluded from equality (and hashing): identity stays
-    # defined by the parsed fields, so two addresses parsed from rows with
-    # different incidental columns still compare equal.
+    # defined by the parsed fields.
     a = UserAddress(
-        user_address="1 The Street", postcode="SW1A1AA", source_row={"x": "1"}
+        user_address="1 The Street",
+        postcode=Postcode("SW1A1AA"),
+        source_row={"x": "1"},
     )
     b = UserAddress(
-        user_address="1 The Street", postcode="SW1A1AA", source_row={"y": "2"}
+        user_address="1 The Street",
+        postcode=Postcode("SW1A1AA"),
+        source_row={"y": "2"},
     )
     assert a == b
diff --git a/tests/domain/postcodes/__init__.py b/tests/domain/postcodes/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/domain/postcodes/test_sanitise.py b/tests/domain/postcodes/test_sanitise.py
deleted file mode 100644
index edd1679c..00000000
--- a/tests/domain/postcodes/test_sanitise.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from domain.postcodes.sanitise import sanitise_postcode
-
-
-def test_sanitise_uppercases() -> None:
-    assert sanitise_postcode("sw1a1aa") == "SW1A1AA"
-
-
-def test_sanitise_strips_internal_spaces() -> None:
-    assert sanitise_postcode("sw1a 1aa") == "SW1A1AA"
-
-
-def test_sanitise_strips_leading_and_trailing_whitespace() -> None:
-    assert sanitise_postcode("  sw1a 1aa  ") == "SW1A1AA"
-
-
-def test_sanitise_strips_tabs_and_newlines() -> None:
-    # CSV ingestion occasionally introduces stray whitespace characters; the
-    # canonical form must absorb them just like literal spaces.
-    assert sanitise_postcode("sw1a\t1aa\n") == "SW1A1AA"
-
-
-def test_sanitise_already_canonical_is_idempotent() -> None:
-    assert sanitise_postcode("SW1A1AA") == "SW1A1AA"
-    assert sanitise_postcode(sanitise_postcode("sw1a 1aa")) == "SW1A1AA"
-
-
-def test_sanitise_empty_string() -> None:
-    assert sanitise_postcode("") == ""
diff --git a/tests/domain/test_postcode.py b/tests/domain/test_postcode.py
new file mode 100644
index 00000000..89d5cdc8
--- /dev/null
+++ b/tests/domain/test_postcode.py
@@ -0,0 +1,48 @@
+import dataclasses
+
+import pytest
+
+from domain.postcode import Postcode
+
+
+def test_postcode_uppercases() -> None:
+    assert Postcode("sw1a1aa").value == "SW1A1AA"
+
+
+def test_postcode_strips_internal_spaces() -> None:
+    assert Postcode("sw1a 1aa").value == "SW1A1AA"
+
+
+def test_postcode_strips_leading_and_trailing_whitespace() -> None:
+    assert Postcode("  sw1a 1aa  ").value == "SW1A1AA"
+
+
+def test_postcode_strips_tabs_and_newlines() -> None:
+    # CSV ingestion occasionally introduces stray whitespace characters; the
+    # canonical form must absorb them just like literal spaces.
+    assert Postcode("sw1a\t1aa\n").value == "SW1A1AA"
+
+
+def test_postcode_construction_is_idempotent() -> None:
+    once = Postcode("sw1a 1aa")
+    assert Postcode(once.value).value == "SW1A1AA"
+
+
+def test_postcode_empty_string() -> None:
+    assert Postcode("").value == ""
+
+
+def test_postcode_str_returns_canonical_value() -> None:
+    assert str(Postcode("sw1a 1aa")) == "SW1A1AA"
+
+
+def test_postcode_equality_ignores_surface_form() -> None:
+    # Differing case / whitespace sanitise to the same canonical value, so
+    # the value objects compare equal.
+    assert Postcode("sw1a 1aa") == Postcode("SW1A1AA")
+
+
+def test_postcode_is_frozen() -> None:
+    postcode = Postcode("SW1A1AA")
+    with pytest.raises(dataclasses.FrozenInstanceError):
+        postcode.value = "OTHER"  # type: ignore[misc]
diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
index 48733b55..c1acee32 100644
--- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py
+++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
@@ -4,6 +4,7 @@ import pytest
 from moto import mock_aws
 
 from domain.addresses.user_address import UserAddress
+from domain.postcode import Postcode
 from infrastructure.csv_s3_client import CsvS3Client
 from repositories.user_address.user_address_csv_s3_repository import (
     UserAddressCsvS3Repository,
@@ -47,7 +48,7 @@ def test_load_batch_parses_address_postcode_and_reference(
     assert len(addresses) == 1
     address = addresses[0]
     assert address.user_address == "1 High Street, Flat 2, Townville"
-    assert address.postcode == "SW1A1AA"
+    assert address.postcode == Postcode("SW1A1AA")
     assert address.internal_reference == "REF-001"
 
 
@@ -69,7 +70,7 @@ def test_load_batch_uses_only_address_1_when_others_missing(
 
     assert len(addresses) == 1
     assert addresses[0].user_address == "10 Cardiff Road"
-    assert addresses[0].postcode == "CF101AA"
+    assert addresses[0].postcode == Postcode("CF101AA")
     assert addresses[0].internal_reference == "REF-002"
 
 
@@ -91,7 +92,7 @@ def test_load_batch_handles_missing_internal_reference(
 
     assert len(addresses) == 1
     assert addresses[0].user_address == "5 Park Lane"
-    assert addresses[0].postcode == "M11AA"
+    assert addresses[0].postcode == Postcode("M11AA")
     assert addresses[0].internal_reference is None
 
 
@@ -154,7 +155,7 @@ def test_save_batch_returns_uri_under_path_prefix(
     addresses = [
         UserAddress(
             user_address="1 High Street",
-            postcode="SW1A 1AA",
+            postcode=Postcode("SW1A 1AA"),
             source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
         ),
     ]
@@ -199,7 +200,7 @@ def test_save_batch_uses_unique_filename_per_call(
     addresses = [
         UserAddress(
             user_address="1 High Street",
-            postcode="SW1A 1AA",
+            postcode=Postcode("SW1A 1AA"),
             source_row={"Address 1": "1 High Street", "postcode": "SW1A 1AA"},
         ),
     ]

From d0cf3d14ad5116d0b2926aceb23d642408ca71bc Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 13:21:11 +0000
Subject: [PATCH 78/91] Remove docstrings and comments

---
 applications/postcode_splitter/handler.py     | 18 -------
 .../local_handler/invoke_local_lambda.py      |  9 ----
 .../postcode_splitter_trigger_body.py         | 21 ---------
 domain/addresses/postcode_batching.py         | 38 ---------------
 domain/addresses/user_address.py              | 30 +-----------
 domain/postcode.py                            | 25 ----------
 infrastructure/address2uprn_queue_client.py   |  7 ---
 infrastructure/csv_s3_client.py               | 18 -------
 infrastructure/s3_client.py                   |  9 ----
 infrastructure/s3_uri.py                      | 18 -------
 infrastructure/sqs_client.py                  |  8 ----
 .../postcode_splitter_orchestrator.py         | 34 --------------
 orchestration/task_orchestrator.py            |  6 ---
 .../user_address_csv_s3_repository.py         | 47 -------------------
 .../user_address/user_address_repository.py   | 19 +-------
 .../addresses/test_postcode_batching.py       |  1 -
 tests/infrastructure/__init__.py              |  7 ---
 tests/infrastructure/conftest.py              |  4 --
 .../test_postcode_splitter_orchestrator.py    | 10 ----
 tests/repositories/user_address/conftest.py   |  4 --
 .../aws_lambda/test_subtask_handler.py        | 25 +---------
 utilities/aws_lambda/subtask_handler.py       | 24 +---------
 22 files changed, 6 insertions(+), 376 deletions(-)

diff --git a/applications/postcode_splitter/handler.py b/applications/postcode_splitter/handler.py
index 005227a9..9fb3ca6a 100644
--- a/applications/postcode_splitter/handler.py
+++ b/applications/postcode_splitter/handler.py
@@ -1,15 +1,3 @@
-"""Lambda entrypoint for the postcode splitter slice.
-
-The :func:`handler` function is decorated with ``@subtask_handler()`` so the
-decorator owns the parent ``SubTask`` lifecycle (start/complete/fail) and
-injects the decorator-owned :class:`TaskOrchestrator` as the third positional
-argument. The handler itself does only two things:
-
-1. Build a :class:`PostcodeSplitterOrchestrator` from env-driven config.
-2. Delegate to ``split_and_dispatch`` and return its result so it lands in
-   ``SubTask.outputs["result"]``.
-"""
-
 from __future__ import annotations
 
 import os
@@ -34,12 +22,6 @@ from utilities.aws_lambda.subtask_handler import subtask_handler
 def handler(
     body: dict[str, Any], context: Any, task_orchestrator: TaskOrchestrator
 ) -> dict[str, list[str]]:
-    """Validate the trigger body, build the splitter, dispatch children.
-
-    Reads ``S3_BUCKET_NAME`` and ``ADDRESS2UPRN_QUEUE_URL`` from the
-    environment to construct the typed S3/SQS clients. The return value
-    lands in ``SubTask.outputs["result"]`` via the decorator.
-    """
     trigger = PostcodeSplitterTriggerBody.model_validate(body)
 
     bucket = os.environ["S3_BUCKET_NAME"]
diff --git a/applications/postcode_splitter/local_handler/invoke_local_lambda.py b/applications/postcode_splitter/local_handler/invoke_local_lambda.py
index c0ca89ec..21fa9b9e 100755
--- a/applications/postcode_splitter/local_handler/invoke_local_lambda.py
+++ b/applications/postcode_splitter/local_handler/invoke_local_lambda.py
@@ -1,13 +1,4 @@
 #!/usr/bin/env python3
-"""POST a single SQS-shaped event at the locally-running splitter Lambda.
-
-The container built by docker-compose runs the AWS Lambda Runtime Interface
-Emulator, which accepts invocations on the URL below. Replace the three
-placeholder values with a real parent Task id, the splitter's own SubTask id
-(both must already exist in the Postgres pointed at by .env.local), and the
-s3://... URI of an uploaded address CSV.
-"""
-
 import json
 import requests
 
diff --git a/applications/postcode_splitter/postcode_splitter_trigger_body.py b/applications/postcode_splitter/postcode_splitter_trigger_body.py
index bc983abc..4c33f4a4 100644
--- a/applications/postcode_splitter/postcode_splitter_trigger_body.py
+++ b/applications/postcode_splitter/postcode_splitter_trigger_body.py
@@ -1,30 +1,9 @@
-"""Trigger payload model for the postcode splitter Lambda.
-
-The decorator (``@subtask_handler``) already validates ``task_id`` and
-``sub_task_id`` via :class:`SubtaskTriggerBody`; this model layers on the
-splitter-specific ``s3_uri`` field while keeping ``extra="allow"`` so any
-upstream-passthrough keys (e.g. ``portfolio_id``) survive untouched.
-"""
-
 from uuid import UUID
 
 from pydantic import BaseModel, ConfigDict
 
 
 class PostcodeSplitterTriggerBody(BaseModel):
-    """Validated body for the postcode splitter Lambda.
-
-    Attributes:
-        task_id: Parent ``Task`` id; used as the ``task_id`` input on each
-            child ``SubTask`` and as the ``parent_task_id`` on the fan-out
-            SQS messages.
-        sub_task_id: The splitter's own ``SubTask`` id; used as the path
-            segment under ``ara_postcode_splitter_batches/{task_id}/{...}``
-            so per-invocation outputs cannot collide.
-        s3_uri: ``s3://bucket/key`` URI of the uploaded address CSV the
-            splitter must read.
-    """
-
     model_config = ConfigDict(extra="allow")
 
     task_id: UUID
diff --git a/domain/addresses/postcode_batching.py b/domain/addresses/postcode_batching.py
index b73dc1bb..44e4d967 100644
--- a/domain/addresses/postcode_batching.py
+++ b/domain/addresses/postcode_batching.py
@@ -1,22 +1,3 @@
-"""Pure-Python postcode-grouped batching.
-
-This module preserves the batching invariants from the legacy postcode
-splitter (``backend/postcode_splitter/main.py``) without touching pandas,
-S3, or SQS:
-
-  * Addresses are grouped by **Postcode** in *insertion order* -- the first
-    Postcode seen produces the first group.
-  * A Postcode group is never split across two batches.
-  * If a single Postcode group is larger than ``max_batch_size``, it is
-    flushed as its own oversize batch (any buffered groups go out first,
-    untouched).
-  * Adding a group that would push the buffer past ``max_batch_size`` first
-    flushes the existing buffer, then starts a new buffer with the group.
-  * Whatever remains in the buffer after the input is exhausted is flushed
-    as the final batch.
-  * Empty input yields no batches.
-"""
-
 from __future__ import annotations
 
 from collections.abc import Iterable, Iterator
@@ -30,19 +11,6 @@ def iter_postcode_grouped_batches(
     *,
     max_batch_size: int = 500,
 ) -> Iterator[list[UserAddress]]:
-    """Yield batches of ``UserAddress`` grouped by Postcode.
-
-    Args:
-        addresses: An iterable of :class:`UserAddress`. Order is preserved
-            within each Postcode group, and groups are yielded in the order
-            their first member was seen.
-        max_batch_size: The soft upper bound on batch size, in number of
-            addresses. A single Postcode group larger than this cap is
-            dispatched whole (the cap is never used to split a group).
-
-    Yields:
-        Lists of ``UserAddress``. Each list is non-empty.
-    """
     if max_batch_size < 1:
         raise ValueError("max_batch_size must be >= 1")
 
@@ -77,12 +45,6 @@ def iter_postcode_grouped_batches(
 def _group_by_postcode_in_order(
     addresses: Iterable[UserAddress],
 ) -> dict[Postcode, list[UserAddress]]:
-    """Group addresses by ``postcode`` preserving first-seen order.
-
-    Python dicts retain insertion order since 3.7, so a plain dict suffices
-    for the same effect as pandas ``groupby(..., sort=False)``. ``Postcode``
-    is a frozen value object, hence hashable and usable as the dict key.
-    """
     groups: dict[Postcode, list[UserAddress]] = {}
     for address in addresses:
         groups.setdefault(address.postcode, []).append(address)
diff --git a/domain/addresses/user_address.py b/domain/addresses/user_address.py
index 672b2c54..9a28751b 100644
--- a/domain/addresses/user_address.py
+++ b/domain/addresses/user_address.py
@@ -1,11 +1,3 @@
-"""The :class:`UserAddress` value object.
-
-A frozen dataclass capturing the splitter's domain entity: the raw input
-address line, a :class:`~domain.postcode.Postcode`, and an optional internal
-reference from the customer dataset. The postcode is a value object that is
-canonical by construction, so no caller can hold an un-normalised postcode.
-"""
-
 from __future__ import annotations
 
 from dataclasses import dataclass, field
@@ -15,32 +7,12 @@ from domain.postcode import Postcode
 
 
 def _empty_source_row() -> dict[str, str]:
-    """Typed default factory for :attr:`UserAddress.source_row`."""
     return {}
 
 
 @dataclass(frozen=True)
 class UserAddress:
-    """A user-supplied address paired with its canonical postcode.
-
-    Attributes:
-        user_address: The free-text address string as supplied upstream.
-        postcode: The postcode as a :class:`~domain.postcode.Postcode` value
-            object -- canonical (uppercased, whitespace stripped) by
-            construction.
-        internal_reference: Optional customer-side identifier preserved for
-            traceability through the matching pipeline.
-        source_row: The complete original CSV row this address was parsed
-            from, column name -> cell value. The splitter is a pass-through
-            router: it groups rows by postcode but must not drop the other
-            columns the downstream address2uprn stage relies on, so the raw
-            row travels alongside the parsed fields. Excluded from equality
-            and hashing -- identity stays defined by the parsed fields above.
-    """
-
     user_address: str
     postcode: Postcode
     internal_reference: Optional[str] = None
-    source_row: dict[str, str] = field(
-        default_factory=_empty_source_row, compare=False
-    )
+    source_row: dict[str, str] = field(default_factory=_empty_source_row, compare=False)
diff --git a/domain/postcode.py b/domain/postcode.py
index 514e1a39..8e4e7c79 100644
--- a/domain/postcode.py
+++ b/domain/postcode.py
@@ -1,16 +1,3 @@
-"""The :class:`Postcode` value object.
-
-A frozen value object that owns postcode sanitisation. Constructing a
-``Postcode`` always yields the canonical form -- uppercase with all
-whitespace removed -- so no part of the domain can hold an un-normalised
-postcode. This matches the legacy splitter's
-``df["postcode"].str.upper().str.replace(" ", "")``.
-
-``Postcode`` is the single sanitisation point: anywhere a postcode crosses a
-domain boundary it should be wrapped in one, and ``str(postcode)`` gives the
-canonical string back for serialisation.
-"""
-
 from __future__ import annotations
 
 from dataclasses import dataclass
@@ -18,18 +5,6 @@ from dataclasses import dataclass
 
 @dataclass(frozen=True)
 class Postcode:
-    """A postcode held in canonical form.
-
-    The ``value`` passed to the constructor is sanitised eagerly in
-    :meth:`__post_init__` -- uppercased, with all whitespace (spaces, tabs,
-    newlines) removed -- so every ``Postcode`` instance is canonical by
-    construction. Two postcodes that differ only in surface whitespace or
-    case therefore compare equal.
-
-    Attributes:
-        value: The canonical postcode string (e.g. ``"SW1A1AA"``).
-    """
-
     value: str
 
     def __post_init__(self) -> None:
diff --git a/infrastructure/address2uprn_queue_client.py b/infrastructure/address2uprn_queue_client.py
index d81e2dd1..314e981f 100644
--- a/infrastructure/address2uprn_queue_client.py
+++ b/infrastructure/address2uprn_queue_client.py
@@ -4,12 +4,6 @@ from infrastructure.sqs_client import SqsClient
 
 
 class Address2UprnQueueClient(SqsClient):
-    """SQS client that publishes Address-to-UPRN fan-out messages.
-
-    The body shape is fixed by the downstream consumer:
-        ``{"task_id": str, "sub_task_id": str, "s3_uri": str}``
-    """
-
     def publish(
         self,
         *,
@@ -17,7 +11,6 @@ class Address2UprnQueueClient(SqsClient):
         child_subtask_id: UUID,
         s3_uri: str,
     ) -> str:
-        """Send a typed Address-to-UPRN message. Returns the SQS ``MessageId``."""
         return self.send(
             {
                 "task_id": str(parent_task_id),
diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py
index 0a576b81..055d1ce3 100644
--- a/infrastructure/csv_s3_client.py
+++ b/infrastructure/csv_s3_client.py
@@ -6,20 +6,7 @@ from infrastructure.s3_uri import parse_s3_uri
 
 
 class CsvS3Client(S3Client):
-    """:class:`S3Client` subclass that round-trips CSV row dictionaries.
-
-    Rows are represented as ``list[dict[str, str]]`` — the same shape used by
-    :func:`csv.DictReader`/``DictWriter`` — which keeps the API trivially
-    compatible with existing CSV helpers in ``utils/s3.py``.
-    """
-
     def read_rows(self, s3_uri: str) -> list[dict[str, str]]:
-        """Fetch the object at ``s3_uri`` and decode it as a CSV.
-
-        The bucket portion of the URI is validated against this client's
-        configured bucket so cross-bucket reads fail loudly rather than
-        silently fetching from the wrong place.
-        """
         bucket, key = parse_s3_uri(s3_uri)
         if bucket != self.bucket:
             raise ValueError(
@@ -31,11 +18,6 @@ class CsvS3Client(S3Client):
         return [dict(row) for row in reader]
 
     def save_rows(self, rows: list[dict[str, str]], key: str) -> str:
-        """Serialise ``rows`` to CSV under ``key`` and return the ``s3://`` URI.
-
-        An empty ``rows`` list is rejected because we cannot otherwise infer
-        a header row.
-        """
         if not rows:
             raise ValueError("Cannot save an empty rows list: header is unknown")
         buffer = StringIO()
diff --git a/infrastructure/s3_client.py b/infrastructure/s3_client.py
index 9e772881..a789fcc2 100644
--- a/infrastructure/s3_client.py
+++ b/infrastructure/s3_client.py
@@ -2,13 +2,6 @@ from typing import Any
 
 
 class S3Client:
-    """Thin typed wrapper around a boto3 S3 client bound to a single bucket.
-
-    The class is deliberately small: it exposes only the byte-level
-    operations needed by the wider infrastructure layer. Serialisation
-    (CSV, JSON, etc.) lives in subclasses such as :class:`CsvS3Client`.
-    """
-
     def __init__(self, boto_s3_client: Any, bucket: str) -> None:
         self._client = boto_s3_client
         self._bucket = bucket
@@ -18,7 +11,6 @@ class S3Client:
         return self._bucket
 
     def get_object(self, key: str) -> bytes:
-        """Return the raw bytes stored at ``key`` in this client's bucket."""
         response: dict[str, Any] = self._client.get_object(
             Bucket=self._bucket, Key=key
         )
@@ -26,6 +18,5 @@ class S3Client:
         return body
 
     def put_object(self, key: str, body: bytes) -> str:
-        """Write ``body`` to ``key`` and return the canonical ``s3://`` URI."""
         self._client.put_object(Bucket=self._bucket, Key=key, Body=body)
         return f"s3://{self._bucket}/{key}"
diff --git a/infrastructure/s3_uri.py b/infrastructure/s3_uri.py
index bf97100e..1dd5d967 100644
--- a/infrastructure/s3_uri.py
+++ b/infrastructure/s3_uri.py
@@ -1,25 +1,7 @@
-"""Parse S3 URIs into ``(bucket, key)`` pairs.
-
-A pure-stdlib helper for the infrastructure layer. It deliberately pulls in
-neither pandas, boto3, nor the legacy ``utils`` package, so slim Lambda images
-that only need URI parsing do not drag the wider data stack along.
-
-Two input shapes are supported:
-
-* canonical S3 URIs --- ``s3://bucket/key``
-* AWS S3 console URLs --- ``https://.../s3/object/bucket?prefix=key``
-"""
-
 from urllib.parse import unquote
 
 
 def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
-    """Return the ``(bucket, key)`` pair addressed by ``s3_uri``.
-
-    Raises:
-        ValueError: if ``s3_uri`` is neither a well-formed ``s3://`` URI nor
-            an AWS console URL carrying a ``prefix`` query parameter.
-    """
     if s3_uri.startswith("s3://"):
         parts = s3_uri[len("s3://") :].split("/", 1)
         if len(parts) < 2 or not parts[0] or not parts[1]:
diff --git a/infrastructure/sqs_client.py b/infrastructure/sqs_client.py
index fb053680..6fe8dd2e 100644
--- a/infrastructure/sqs_client.py
+++ b/infrastructure/sqs_client.py
@@ -3,13 +3,6 @@ from typing import Any
 
 
 class SqsClient:
-    """Thin typed wrapper around a boto3 SQS client bound to one queue URL.
-
-    The body is JSON-serialised here so callers can pass plain dictionaries
-    instead of constructing message strings themselves. Typed publish
-    helpers (e.g. :class:`Address2UprnQueueClient`) build on this contract.
-    """
-
     def __init__(self, boto_sqs_client: Any, queue_url: str) -> None:
         self._client = boto_sqs_client
         self._queue_url = queue_url
@@ -19,7 +12,6 @@ class SqsClient:
         return self._queue_url
 
     def send(self, body: dict[str, Any]) -> str:
-        """JSON-serialise ``body`` and send it. Returns the SQS ``MessageId``."""
         response: dict[str, Any] = self._client.send_message(
             QueueUrl=self._queue_url,
             MessageBody=json.dumps(body),
diff --git a/orchestration/postcode_splitter_orchestrator.py b/orchestration/postcode_splitter_orchestrator.py
index 6afa2538..36f4b515 100644
--- a/orchestration/postcode_splitter_orchestrator.py
+++ b/orchestration/postcode_splitter_orchestrator.py
@@ -1,15 +1,3 @@
-"""Use-case orchestrator for the postcode splitter Lambda.
-
-Wires the slice-1 domain (``iter_postcode_grouped_batches``), the slice-3
-``UserAddressRepository``, the slice-2 ``Address2UprnQueueClient``, and the
-slice-4 ``TaskOrchestrator.create_child_subtask`` primitive together.
-
-``split_and_dispatch`` loads the input batch, groups it into per-postcode
-chunks, writes each chunk back to S3 under a deterministic prefix, creates a
-WAITING child ``SubTask`` for it, and publishes the address-to-UPRN fan-out
-message that downstream consumers pick up.
-"""
-
 from __future__ import annotations
 
 from uuid import UUID
@@ -21,15 +9,6 @@ from repositories.user_address.user_address_repository import UserAddressReposit
 
 
 class PostcodeSplitterOrchestrator:
-    """Split an uploaded address batch into postcode-grouped child SubTasks.
-
-    The orchestrator owns the algorithm; the IO collaborators
-    (:class:`UserAddressRepository`, :class:`Address2UprnQueueClient`) and
-    the :class:`TaskOrchestrator` lifecycle primitive are injected so the
-    same wiring can be exercised against moto/SQLite in tests and against
-    real AWS in the Lambda entrypoint.
-    """
-
     def __init__(
         self,
         task_orchestrator: TaskOrchestrator,
@@ -49,19 +28,6 @@ class PostcodeSplitterOrchestrator:
         parent_subtask_id: UUID,
         input_s3_uri: str,
     ) -> list[UUID]:
-        """Split ``input_s3_uri`` into postcode batches and dispatch each.
-
-        For each yielded batch:
-
-        1. Persist it under
-           ``ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}``.
-        2. Create a WAITING child ``SubTask`` with
-           ``inputs={"task_id": str(parent_task_id), "s3_uri": batch_uri}``.
-        3. Publish an ``address2UPRN`` SQS message referencing the new child.
-
-        Returns:
-            The list of child ``SubTask`` ids, in dispatch order.
-        """
         addresses = self._user_address_repo.load_batch(input_s3_uri)
         path_prefix = (
             f"ara_postcode_splitter_batches/{parent_task_id}/{parent_subtask_id}"
diff --git a/orchestration/task_orchestrator.py b/orchestration/task_orchestrator.py
index 82d95db1..ebb71a32 100644
--- a/orchestration/task_orchestrator.py
+++ b/orchestration/task_orchestrator.py
@@ -54,12 +54,6 @@ class TaskOrchestrator:
         *,
         inputs: Optional[dict[str, Any]] = None,
     ) -> SubTask:
-        """Add a new WAITING SubTask under an existing parent Task.
-
-        Skips `_cascade`: a new WAITING child against an IN_PROGRESS parent
-        leaves the parent's status unchanged per `Task.recalculate_from_subtasks`,
-        so calling it here would be a no-op.
-        """
         subtask = SubTask.create(task_id=parent_task_id, inputs=inputs)
         self._subtasks.create(subtask)
         return subtask
diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py
index 2432d8e9..9b93b638 100644
--- a/repositories/user_address/user_address_csv_s3_repository.py
+++ b/repositories/user_address/user_address_csv_s3_repository.py
@@ -1,18 +1,3 @@
-"""CSV-on-S3 adapter for :class:`UserAddressRepository`.
-
-Reads upload CSVs that carry a ``postcode`` column (plus optional
-``Address 1``/``Address 2``/``Address 3`` and ``Internal Reference``), and
-writes batch CSVs that pass *every* original column through unchanged with
-one column appended -- ``postcode_clean`` (uppercase, whitespace-stripped) --
-which the downstream address2uprn stage groups on.
-
-The splitter is a pass-through router: it must not reshape or drop columns,
-because address2uprn has not been migrated and still consumes the legacy
-splitter's full-row output. The frontend pre-applies the user's column
-mapping at upload time, so this adapter does NOT consult any
-``BulkAddressUpload.column_mapping``.
-"""
-
 from __future__ import annotations
 
 import uuid
@@ -31,33 +16,11 @@ _POSTCODE_CLEAN_COLUMN: str = "postcode_clean"
 
 
 class UserAddressCsvS3Repository(UserAddressRepository):
-    """Persist :class:`UserAddress` batches as CSV objects in S3.
-
-    The repo owns the unique-filename-within-prefix convention
-    (``{ISO datetime}_{8-char uuid}.csv``); callers own the directory
-    hierarchy supplied as ``path_prefix``.
-    """
-
     def __init__(self, csv_client: CsvS3Client, bucket: str) -> None:
         self._csv_client = csv_client
         self._bucket = bucket
 
     def load_batch(self, s3_uri: str) -> list[UserAddress]:
-        """Load upload CSV rows into :class:`UserAddress` objects.
-
-        Each row's complete column set is preserved on
-        :attr:`UserAddress.source_row` so :meth:`save_batch` can pass it
-        through untouched. The parsed convenience fields are also populated:
-        ``Address 1``/``Address 2``/``Address 3`` are concatenated with
-        ``", "`` (skipping missing/empty parts) into ``user_address``, and
-        ``Internal Reference`` is threaded to
-        :attr:`UserAddress.internal_reference` (``None`` when missing/empty).
-
-        Raises:
-            ValueError: if the CSV has rows but no ``postcode`` column --
-                without it the splitter cannot group, and silently emitting
-                empty postcodes would corrupt every downstream batch.
-        """
         rows = self._csv_client.read_rows(s3_uri)
         if rows and _POSTCODE_COLUMN not in rows[0]:
             raise ValueError(
@@ -86,16 +49,6 @@ class UserAddressCsvS3Repository(UserAddressRepository):
         return addresses
 
     def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str:
-        """Write a pass-through batch CSV under a unique key.
-
-        Each output row is the address's original ``source_row`` with a
-        ``postcode_clean`` column appended (the canonical postcode the
-        downstream address2uprn stage groups on). No original column is
-        dropped or reshaped.
-
-        The key is ``{path_prefix}/{ISO-8601 datetime}_{8-char uuid}.csv``.
-        Returns the full ``s3://bucket/key`` URI.
-        """
         rows: list[dict[str, str]] = [
             {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)}
             for addr in addresses
diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py
index ab9b6671..170f34dd 100644
--- a/repositories/user_address/user_address_repository.py
+++ b/repositories/user_address/user_address_repository.py
@@ -1,10 +1,3 @@
-"""Abstract repository for :class:`UserAddress` batches.
-
-Persistence-agnostic interface for loading and saving batches of
-:class:`domain.addresses.user_address.UserAddress`. Concrete adapters --
-e.g. :class:`UserAddressCsvS3Repository` -- live alongside this module.
-"""
-
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
@@ -13,18 +6,10 @@ from domain.addresses.user_address import UserAddress
 
 
 class UserAddressRepository(ABC):
-    """Load and persist batches of :class:`UserAddress`.
-
-    Implementations choose the underlying storage (S3 CSV, Postgres,
-    in-memory, ...) but must preserve the canonical column semantics:
-    the address text, postcode (a :class:`~domain.postcode.Postcode` value
-    object), and an optional internal reference.
-    """
-
     @abstractmethod
     def load_batch(self, s3_uri: str) -> list[UserAddress]:
-        """Read a batch of addresses from ``s3_uri`` and return domain objects."""
+        ...
 
     @abstractmethod
     def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str:
-        """Persist ``addresses`` under ``path_prefix`` and return the URI written."""
+        ...
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py
index 6e52b581..c69722ba 100644
--- a/tests/domain/addresses/test_postcode_batching.py
+++ b/tests/domain/addresses/test_postcode_batching.py
@@ -6,7 +6,6 @@ from domain.postcode import Postcode
 
 
 def _addrs(postcode: str, n: int) -> list[UserAddress]:
-    """Build ``n`` addresses sharing a postcode, with distinct address lines."""
     return [
         UserAddress(
             user_address=f"{i} {postcode} Street", postcode=Postcode(postcode)
diff --git a/tests/infrastructure/__init__.py b/tests/infrastructure/__init__.py
index 3478bda9..f5ad62d0 100644
--- a/tests/infrastructure/__init__.py
+++ b/tests/infrastructure/__init__.py
@@ -6,12 +6,5 @@ REGION = "us-east-1"
 
 
 def make_boto_client(service_name: str) -> Any:
-    """Construct a boto3 client typed as ``Any``.
-
-    boto3's overloaded ``client`` signature uses ``Literal[...]`` per service
-    in the installed stubs, which forces every call site to satisfy
-    ``reportArgumentType`` and ``reportUnknownMemberType`` under strict
-    pyright. Centralising the cast keeps each test file clean.
-    """
     factory: Any = boto3.client  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
     return factory(service_name, region_name=REGION)
diff --git a/tests/infrastructure/conftest.py b/tests/infrastructure/conftest.py
index 7ed2fdd6..25c1ac3b 100644
--- a/tests/infrastructure/conftest.py
+++ b/tests/infrastructure/conftest.py
@@ -7,10 +7,6 @@ import pytest
 
 @pytest.fixture(autouse=True)
 def _aws_creds() -> Iterator[None]:  # pyright: ignore[reportUnusedFunction]
-    """Stub AWS creds so botocore doesn't probe the host environment.
-
-    Applied automatically to every test in ``tests/infrastructure/``.
-    """
     keys = (
         "AWS_ACCESS_KEY_ID",
         "AWS_SECRET_ACCESS_KEY",
diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py
index 79c60974..4ee2315e 100644
--- a/tests/orchestration/test_postcode_splitter_orchestrator.py
+++ b/tests/orchestration/test_postcode_splitter_orchestrator.py
@@ -1,13 +1,3 @@
-"""Integration test: PostcodeSplitterOrchestrator wired end-to-end.
-
-Combines moto S3 + moto SQS + an in-memory SQLite session for the
-``TaskOrchestrator`` so the full slice-6 wiring is exercised through real
-infrastructure adapters (not mocks). The fixture CSV spans three postcodes
-with one oversize group, which forces both the buffer-flush-then-oversize
-branch and the final-flush branch of
-``iter_postcode_grouped_batches`` — three batches in total.
-"""
-
 from __future__ import annotations
 
 import json
diff --git a/tests/repositories/user_address/conftest.py b/tests/repositories/user_address/conftest.py
index 1859ff0a..25c1ac3b 100644
--- a/tests/repositories/user_address/conftest.py
+++ b/tests/repositories/user_address/conftest.py
@@ -7,10 +7,6 @@ import pytest
 
 @pytest.fixture(autouse=True)
 def _aws_creds() -> Iterator[None]:  # pyright: ignore[reportUnusedFunction]
-    """Stub AWS creds so botocore doesn't probe the host environment.
-
-    Applied automatically to every test in ``tests/repositories/user_address/``.
-    """
     keys = (
         "AWS_ACCESS_KEY_ID",
         "AWS_SECRET_ACCESS_KEY",
diff --git a/tests/utilities/aws_lambda/test_subtask_handler.py b/tests/utilities/aws_lambda/test_subtask_handler.py
index 771a49f8..9cf68f28 100644
--- a/tests/utilities/aws_lambda/test_subtask_handler.py
+++ b/tests/utilities/aws_lambda/test_subtask_handler.py
@@ -1,11 +1,3 @@
-"""Tests for the @subtask_handler decorator.
-
-Covers the contract that the decorator owns the parent SubTask lifecycle and
-injects the decorator-owned TaskOrchestrator as a third positional argument
-to the wrapped function — so the handler can compose its own use-case
-orchestrator that shares the session.
-"""
-
 import logging
 from collections.abc import Generator, Iterator
 from contextlib import contextmanager
@@ -14,8 +6,6 @@ from typing import Any
 from uuid import UUID
 
 import pytest
-
-_LOGGER_NAME = "utilities.aws_lambda.subtask_handler"
 from sqlmodel import Session, SQLModel, create_engine
 
 from domain.tasks.subtasks import SubTaskStatus
@@ -25,6 +15,8 @@ from repositories.tasks.subtask_postgres_repository import SubTaskPostgresReposi
 from repositories.tasks.task_postgres_repository import TaskPostgresRepository
 from utilities.aws_lambda.subtask_handler import subtask_handler
 
+_LOGGER_NAME = "utilities.aws_lambda.subtask_handler"
+
 
 @dataclass
 class Harness:
@@ -58,8 +50,6 @@ def _direct_event(task_id: UUID, subtask_id: UUID) -> dict[str, Any]:
 def test_subtask_handler_injects_orchestrator_as_third_positional_argument(
     harness: Harness,
 ) -> None:
-    """The wrapped function receives the decorator-owned TaskOrchestrator
-    so it can share the session with its own use-case orchestrator."""
     _, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -123,9 +113,6 @@ def test_subtask_handler_marks_parent_failed_and_reraises_on_error(
 def test_subtask_handler_injected_orchestrator_can_create_child_subtask(
     harness: Harness,
 ) -> None:
-    """Smoke check the share-the-session promise: the injected orchestrator
-    is the same one the decorator owns, so a handler can use it to create
-    child SubTasks under the same session."""
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -150,8 +137,6 @@ def test_subtask_handler_injected_orchestrator_can_create_child_subtask(
 def test_subtask_handler_logs_subtask_lifecycle_on_success(
     harness: Harness, caplog: pytest.LogCaptureFixture
 ) -> None:
-    """Start and completion are logged at INFO so a successful invocation
-    leaves a CloudWatch breadcrumb (not just the Lambda runtime lines)."""
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -172,8 +157,6 @@ def test_subtask_handler_logs_subtask_lifecycle_on_success(
 def test_subtask_handler_logs_exception_on_failure(
     harness: Harness, caplog: pytest.LogCaptureFixture
 ) -> None:
-    """A failing subtask is logged at ERROR with the traceback attached,
-    before the exception propagates for the Lambda runtime to surface."""
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -198,8 +181,6 @@ def test_subtask_handler_logs_exception_on_failure(
 def test_subtask_handler_records_cloudwatch_url_on_subtask(
     harness: Harness, monkeypatch: pytest.MonkeyPatch
 ) -> None:
-    """With the AWS Lambda runtime's log env vars present, a CloudWatch deep
-    link is built and persisted on the SubTask."""
     monkeypatch.setenv("AWS_REGION", "eu-west-2")
     monkeypatch.setenv(
         "AWS_LAMBDA_LOG_GROUP_NAME", "/aws/lambda/postcode-splitter"
@@ -232,8 +213,6 @@ def test_subtask_handler_records_cloudwatch_url_on_subtask(
 def test_subtask_handler_leaves_cloudwatch_url_unset_outside_lambda(
     harness: Harness, monkeypatch: pytest.MonkeyPatch
 ) -> None:
-    """Outside a real Lambda (e.g. the local RIE) the runtime log env vars
-    are absent, so cloud_logs_url is left unset rather than storing junk."""
     for var in (
         "AWS_REGION",
         "AWS_LAMBDA_LOG_GROUP_NAME",
diff --git a/utilities/aws_lambda/subtask_handler.py b/utilities/aws_lambda/subtask_handler.py
index 40f116ad..592ffebf 100644
--- a/utilities/aws_lambda/subtask_handler.py
+++ b/utilities/aws_lambda/subtask_handler.py
@@ -1,15 +1,7 @@
 """@subtask_handler decorator for Lambdas that operate on existing SubTasks.
 
 Translates an AWS Lambda invocation (SQS-shaped or direct) into
-TaskOrchestrator.run_subtask(...) calls, emitting an INFO log line for each
-subtask's start and completion and a logged exception on failure. Those lines
-land in CloudWatch via the Lambda runtime's stdout/stderr capture.
-
-Each subtask also records ``cloud_logs_url`` -- a deep link to this
-invocation's CloudWatch log stream -- so an operator can jump from a SubTask
-row straight to its logs. It is built from the environment variables the AWS
-Lambda runtime sets, so it is populated only on real Lambda invocations and
-left unset under the local RIE (which does not export them).
+TaskOrchestrator.run_subtask(...) calls.
 """
 
 import json
@@ -94,24 +86,10 @@ def _records(event: dict[str, Any]) -> list[dict[str, Any]]:
 
 
 def _console_encode(value: str) -> str:
-    """Encode a value for a CloudWatch console deep link.
-
-    The console expects URL-encoding with the percent signs themselves
-    re-encoded as ``$25`` -- e.g. ``/`` becomes ``%2F`` becomes ``$252F``.
-    """
     return quote(value, safe="").replace("%", "$25")
 
 
 def _cloudwatch_url() -> Optional[str]:
-    """Build a CloudWatch console URL for this invocation's log stream.
-
-    Sourced entirely from the environment variables the AWS Lambda runtime
-    sets -- ``AWS_REGION``, ``AWS_LAMBDA_LOG_GROUP_NAME`` and
-    ``AWS_LAMBDA_LOG_STREAM_NAME``. Returns None when any is absent, which is
-    the case outside a real Lambda (the local RIE does not export them) -- so
-    ``SubTask.cloud_logs_url`` is left unset rather than storing a link that
-    points nowhere.
-    """
     region = os.environ.get("AWS_REGION")
     log_group = os.environ.get("AWS_LAMBDA_LOG_GROUP_NAME")
     log_stream = os.environ.get("AWS_LAMBDA_LOG_STREAM_NAME")

From dc159e0b457d8e72e0e64dc931d21a9ae9dfed39 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 14:00:19 +0000
Subject: [PATCH 79/91] tests framework completed

---
 .../user_address_csv_s3_repository.py         |  3 ++
 .../user_address/user_address_repository.py   |  6 +--
 tests/conftest.py                             | 48 +++++++++++++++++++
 .../addresses/test_postcode_batching.py       | 23 +++++++++
 tests/domain/addresses/test_user_address.py   | 19 ++++++++
 tests/domain/tasks/test_subtasks.py           | 20 ++++++++
 tests/domain/tasks/test_tasks.py              | 31 +++++++++++-
 tests/domain/test_postcode.py                 | 11 +++++
 .../test_address2uprn_queue_client.py         |  6 +++
 tests/infrastructure/test_csv_s3_client.py    |  8 ++++
 tests/infrastructure/test_s3_client.py        |  5 ++
 tests/infrastructure/test_s3_uri.py           |  8 ++++
 tests/infrastructure/test_sqs_client.py       |  6 +++
 .../test_postcode_splitter_orchestrator.py    | 23 ++++++---
 tests/orchestration/test_task_orchestrator.py | 34 +++++++++++--
 .../test_subtask_postgres_repository.py       | 47 ++++++++++++------
 .../postgres/test_task_postgres_repository.py | 25 ++++++----
 .../test_user_address_csv_s3_repository.py    | 26 ++++++++++
 .../aws_lambda/test_subtask_handler.py        | 31 ++++++++++--
 19 files changed, 336 insertions(+), 44 deletions(-)
 create mode 100644 tests/conftest.py

diff --git a/repositories/user_address/user_address_csv_s3_repository.py b/repositories/user_address/user_address_csv_s3_repository.py
index 9b93b638..058fd5a5 100644
--- a/repositories/user_address/user_address_csv_s3_repository.py
+++ b/repositories/user_address/user_address_csv_s3_repository.py
@@ -53,6 +53,9 @@ class UserAddressCsvS3Repository(UserAddressRepository):
             {**addr.source_row, _POSTCODE_CLEAN_COLUMN: str(addr.postcode)}
             for addr in addresses
         ]
+
+        # TODO: [New Starter Task] standardise filename generation so it is
+        # easier to read and to test in future implementations. Build that!
         filename = (
             f"{datetime.now(timezone.utc).isoformat()}_{uuid.uuid4().hex[:8]}.csv"
         )
diff --git a/repositories/user_address/user_address_repository.py b/repositories/user_address/user_address_repository.py
index 170f34dd..b2c0f866 100644
--- a/repositories/user_address/user_address_repository.py
+++ b/repositories/user_address/user_address_repository.py
@@ -7,9 +7,7 @@ from domain.addresses.user_address import UserAddress
 
 class UserAddressRepository(ABC):
     @abstractmethod
-    def load_batch(self, s3_uri: str) -> list[UserAddress]:
-        ...
+    def load_batch(self, s3_uri: str) -> list[UserAddress]: ...
 
     @abstractmethod
-    def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str:
-        ...
+    def save_batch(self, addresses: list[UserAddress], path_prefix: str) -> str: ...
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..0a246372
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,48 @@
+"""Shared pytest fixtures for the ``tests/`` tree.
+
+Provides an ephemeral PostgreSQL engine for tests that exercise SQLModel
+repositories. PostgreSQL has no true in-memory mode; ``pytest-postgresql``
+starts a real, throwaway server in a temp directory (the process is started
+once per session and a fresh database is created/dropped per test). That is
+the closest equivalent to "in-memory" and matches production behaviour far
+better than SQLite (enums, JSONB, constraint semantics, etc.).
+"""
+
+from __future__ import annotations
+
+import glob
+from collections.abc import Iterator
+from typing import Any
+
+import pytest
+from psycopg import Connection
+from pytest_postgresql import factories
+from sqlalchemy import Engine
+from sqlmodel import SQLModel, create_engine
+
+# NOTE(review): the SQLModel row modules must be imported before
+# ``create_all`` runs so their tables are registered on SQLModel.metadata.
+# No such imports are present in this file -- confirm they happen elsewhere.
+
+
+# pg_ctl ships under a versioned path and is not on PATH in the dev container.
+_PG_CTL = next(iter(sorted(glob.glob("/usr/lib/postgresql/*/bin/pg_ctl"))), "pg_ctl")
+
+postgresql_proc = factories.postgresql_proc(
+    executable=_PG_CTL
+)  # pyright: ignore[reportUnknownMemberType]
+postgresql = factories.postgresql("postgresql_proc")
+
+
+@pytest.fixture
+def db_engine(postgresql: Connection[Any]) -> Iterator[Engine]:
+    """A SQLModel engine bound to a fresh, ephemeral PostgreSQL database."""
+    info = postgresql.info
+    url = f"postgresql+psycopg://{info.user}:@{info.host}:{info.port}/{info.dbname}"
+    engine = create_engine(url)
+    SQLModel.metadata.create_all(engine)
+    try:
+        yield engine
+    finally:
+        SQLModel.metadata.drop_all(engine)
+        engine.dispose()
diff --git a/tests/domain/addresses/test_postcode_batching.py b/tests/domain/addresses/test_postcode_batching.py
index c69722ba..8ffcf1b5 100644
--- a/tests/domain/addresses/test_postcode_batching.py
+++ b/tests/domain/addresses/test_postcode_batching.py
@@ -15,12 +15,16 @@ def _addrs(postcode: str, n: int) -> list[UserAddress]:
 
 
 def test_empty_input_yields_no_batches() -> None:
+    # act / assert
     assert list(iter_postcode_grouped_batches([])) == []
 
 
 def test_single_batch_under_cap() -> None:
+    # arrange
     addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 2)
+    # act
     batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500))
+    # assert
     assert len(batches) == 1
     assert batches[0] == addrs
 
@@ -28,8 +32,11 @@ def test_single_batch_under_cap() -> None:
 def test_multiple_postcodes_packed_into_one_batch_up_to_cap() -> None:
     # Two groups whose total exactly equals the cap pack into a single
     # batch -- no premature flush.
+    # arrange
     addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 2)
+    # act
     batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5))
+    # assert
     assert len(batches) == 1
     assert len(batches[0]) == 5
 
@@ -37,8 +44,11 @@ def test_multiple_postcodes_packed_into_one_batch_up_to_cap() -> None:
 def test_flush_on_overflow_before_adding_next_postcode() -> None:
     # Cap is 5. First group fills 3 slots; second group of 3 would overflow,
     # so the buffer is flushed first and the next group starts a fresh batch.
+    # arrange
     addrs = _addrs("AA1 1AA", 3) + _addrs("BB2 2BB", 3)
+    # act
     batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5))
+    # assert
     assert len(batches) == 2
     assert [str(a.postcode) for a in batches[0]] == ["AA11AA"] * 3
     assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 3
@@ -47,8 +57,11 @@ def test_flush_on_overflow_before_adding_next_postcode() -> None:
 def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None:
     # An oversize single-postcode group goes out as one batch larger than
     # the cap -- the cap never splits a postcode.
+    # arrange
     addrs = _addrs("AA1 1AA", 7)
+    # act
     batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=5))
+    # assert
     assert len(batches) == 1
     assert len(batches[0]) == 7
 
@@ -56,12 +69,15 @@ def test_single_postcode_group_exceeding_cap_is_dispatched_whole() -> None:
 def test_oversize_group_flushes_existing_buffer_first() -> None:
     # Mirrors the legacy ``if buffer: flush`` branch when an oversize group
     # is encountered: buffered work must not be lost or interleaved.
+    # arrange
     small = _addrs("AA1 1AA", 2)
     big = _addrs("BB2 2BB", 7)
     tail = _addrs("CC3 3CC", 1)
+    # act
     batches = list(
         iter_postcode_grouped_batches(small + big + tail, max_batch_size=5)
     )
+    # assert
     assert len(batches) == 3
     assert [str(a.postcode) for a in batches[0]] == ["AA11AA", "AA11AA"]
     assert [str(a.postcode) for a in batches[1]] == ["BB22BB"] * 7
@@ -70,17 +86,23 @@ def test_oversize_group_flushes_existing_buffer_first() -> None:
 
 def test_final_flush_yields_remaining_buffer() -> None:
     # No overflow ever happens, but the trailing buffer must still come out.
+    # arrange
     addrs = _addrs("AA1 1AA", 2) + _addrs("BB2 2BB", 2)
+    # act
     batches = list(iter_postcode_grouped_batches(addrs, max_batch_size=500))
+    # assert
     assert batches == [addrs]
 
 
 def test_postcode_grouping_preserves_first_seen_order() -> None:
     # Interleaved input must still group by postcode and emit in first-seen
     # order -- never alphabetical.
+    # arrange
     a1, a2 = _addrs("ZZ9 9ZZ", 2)
     b1, b2 = _addrs("AA1 1AA", 2)
+    # act
     batches = list(iter_postcode_grouped_batches([a1, b1, a2, b2]))
+    # assert
     assert len(batches) == 1
     assert [str(a.postcode) for a in batches[0]] == [
         "ZZ99ZZ",
@@ -91,5 +113,6 @@ def test_postcode_grouping_preserves_first_seen_order() -> None:
 
 
 def test_invalid_max_batch_size_raises() -> None:
+    # act / assert
     with pytest.raises(ValueError, match="max_batch_size"):
         list(iter_postcode_grouped_batches([], max_batch_size=0))
diff --git a/tests/domain/addresses/test_user_address.py b/tests/domain/addresses/test_user_address.py
index fa44ad61..8d092df3 100644
--- a/tests/domain/addresses/test_user_address.py
+++ b/tests/domain/addresses/test_user_address.py
@@ -7,35 +7,45 @@ from domain.postcode import Postcode
 
 
 def test_user_address_holds_postcode_value_object() -> None:
+    # act
     addr = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
+    # assert
     assert addr.postcode == Postcode("SW1A1AA")
 
 
 def test_user_address_preserves_user_address_verbatim() -> None:
     # The free-text user_address string is intentionally NOT normalised --
     # only the postcode is canonicalised, and that happens inside Postcode.
+    # act
     addr = UserAddress(
         user_address="  1 The   Street  ", postcode=Postcode("SW1A1AA")
     )
+    # assert
     assert addr.user_address == "  1 The   Street  "
 
 
 def test_user_address_internal_reference_defaults_to_none() -> None:
+    # act
     addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
+    # assert
     assert addr.internal_reference is None
 
 
 def test_user_address_internal_reference_accepted() -> None:
+    # act
     addr = UserAddress(
         user_address="1 The Street",
         postcode=Postcode("SW1A1AA"),
         internal_reference="cust-42",
     )
+    # assert
     assert addr.internal_reference == "cust-42"
 
 
 def test_user_address_is_frozen() -> None:
+    # arrange
     addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
+    # act / assert
     with pytest.raises(dataclasses.FrozenInstanceError):
         addr.postcode = Postcode("OTHER")  # type: ignore[misc]
 
@@ -43,29 +53,37 @@ def test_user_address_is_frozen() -> None:
 def test_user_address_equality_uses_canonical_postcode() -> None:
     # Postcode sanitises eagerly, so addresses built from different surface
     # forms of the same postcode compare equal.
+    # arrange
     a = UserAddress(user_address="1 The Street", postcode=Postcode("sw1a 1aa"))
     b = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
+    # act / assert
     assert a == b
 
 
 def test_user_address_source_row_defaults_to_empty_dict() -> None:
+    # act
     addr = UserAddress(user_address="1 The Street", postcode=Postcode("SW1A1AA"))
+    # assert
     assert addr.source_row == {}
 
 
 def test_user_address_carries_source_row() -> None:
+    # arrange
     row = {"Address 1": "1 The Street", "postcode": "SW1A 1AA", "SAP Score": "72"}
+    # act
     addr = UserAddress(
         user_address="1 The Street",
         postcode=Postcode("SW1A 1AA"),
         source_row=row,
     )
+    # assert
     assert addr.source_row == row
 
 
 def test_user_address_equality_ignores_source_row() -> None:
     # source_row is excluded from equality (and hashing): identity stays
     # defined by the parsed fields.
+    # arrange
     a = UserAddress(
         user_address="1 The Street",
         postcode=Postcode("SW1A1AA"),
@@ -76,4 +94,5 @@ def test_user_address_equality_ignores_source_row() -> None:
         postcode=Postcode("SW1A1AA"),
         source_row={"y": "2"},
     )
+    # act / assert
     assert a == b
diff --git a/tests/domain/tasks/test_subtasks.py b/tests/domain/tasks/test_subtasks.py
index 2721d38f..8cee4496 100644
--- a/tests/domain/tasks/test_subtasks.py
+++ b/tests/domain/tasks/test_subtasks.py
@@ -6,10 +6,13 @@ from domain.tasks.subtasks import SubTask, SubTaskStatus
 
 
 def test_create_subtask_starts_waiting() -> None:
+    # arrange
     task_id = uuid4()
 
+    # act
     st = SubTask.create(task_id=task_id, inputs={"foo": "bar"})
 
+    # assert
     assert st.task_id == task_id
     assert st.status is SubTaskStatus.WAITING
     assert st.inputs == {"foo": "bar"}
@@ -19,57 +22,74 @@ def test_create_subtask_starts_waiting() -> None:
 
 
 def test_start_transitions_to_in_progress_and_sets_cloud_logs_url() -> None:
+    # arrange
     st = SubTask.create(task_id=uuid4())
 
+    # act
     st.start(cloud_logs_url="https://example/log")
 
+    # assert
     assert st.status is SubTaskStatus.IN_PROGRESS
     assert st.cloud_logs_url == "https://example/log"
     assert st.job_started is not None
 
 
 def test_start_is_idempotent_from_in_progress() -> None:
+    # arrange
     st = SubTask.create(task_id=uuid4())
     st.start()
     first_start = st.job_started
 
+    # act
     st.start(cloud_logs_url="https://other")
 
+    # assert
     assert st.status is SubTaskStatus.IN_PROGRESS
     assert st.job_started == first_start  # not overwritten
     assert st.cloud_logs_url == "https://other"
 
 
 def test_start_rejects_from_terminal_status() -> None:
+    # arrange
     st = SubTask.create(task_id=uuid4())
     st.complete()
+    # act / assert
     with pytest.raises(ValueError):
         st.start()
 
 
 def test_complete_marks_outputs_and_job_completed() -> None:
+    # arrange
     st = SubTask.create(task_id=uuid4())
     st.start()
 
+    # act
     st.complete({"uprn": "123"})
 
+    # assert
     assert st.status is SubTaskStatus.COMPLETE
     assert st.outputs == {"result": {"uprn": "123"}}
     assert st.job_completed is not None
 
 
 def test_complete_without_result_leaves_outputs_unset() -> None:
+    # arrange
     st = SubTask.create(task_id=uuid4())
+    # act
     st.complete()
+    # assert
     assert st.outputs is None
 
 
 def test_fail_records_error_in_outputs() -> None:
+    # arrange
     st = SubTask.create(task_id=uuid4())
     err = RuntimeError("boom")
 
+    # act
     st.fail(err)
 
+    # assert
     assert st.status is SubTaskStatus.FAILED
     assert st.outputs == {"error": "boom"}
     assert st.job_completed is not None
diff --git a/tests/domain/tasks/test_tasks.py b/tests/domain/tasks/test_tasks.py
index f30c0aa1..ba82412b 100644
--- a/tests/domain/tasks/test_tasks.py
+++ b/tests/domain/tasks/test_tasks.py
@@ -5,12 +5,12 @@ from domain.tasks.tasks import Source, Task, TaskStatus
 
 
 def test_create_task_starts_waiting() -> None:
-    # Arrange / Act
+    # arrange / act
     t = Task.create(
         task_source="manual:test", source=Source.PORTFOLIO, source_id="abc-123"
     )
 
-    # Assert
+    # assert
     assert t.status is TaskStatus.WAITING
     assert t.source is Source.PORTFOLIO
     assert t.source_id == "abc-123"
@@ -19,86 +19,113 @@ def test_create_task_starts_waiting() -> None:
 
 
 def test_create_task_rejects_blank_task_source() -> None:
+    # act / assert
     with pytest.raises(ValueError, match="task_source"):
         Task.create(task_source="   ")
 
 
 def test_start_transitions_to_in_progress() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
+    # act
     t.start()
+    # assert
     assert t.status is TaskStatus.IN_PROGRESS
 
 
 def test_complete_marks_job_completed() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
     t.start()
+    # act
     t.complete()
+    # assert
     assert t.status is TaskStatus.COMPLETE
     assert t.job_completed is not None
 
 
 def test_fail_marks_job_completed() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
+    # act
     t.fail()
+    # assert
     assert t.status is TaskStatus.FAILED
     assert t.job_completed is not None
 
 
 def test_start_rejects_from_terminal_status() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
     t.complete()
+    # act / assert
     with pytest.raises(ValueError):
         t.start()
 
 
 def test_recalculate_with_empty_statuses_is_noop() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
     original_status = t.status
     original_completed = t.job_completed
 
+    # act
     t.recalculate_from_subtasks([])
 
+    # assert
     assert t.status is original_status
     assert t.job_completed is original_completed
 
 
 def test_recalculate_all_waiting_keeps_waiting() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
     t.start()  # task moved to IN_PROGRESS earlier
     t.complete()  # then COMPLETE, with job_completed set
 
+    # act
     t.recalculate_from_subtasks([SubTaskStatus.WAITING, SubTaskStatus.WAITING])
 
+    # assert
     assert t.status is TaskStatus.WAITING
     assert t.job_completed is None
 
 
 def test_recalculate_any_in_progress_marks_in_progress() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
 
+    # act
     t.recalculate_from_subtasks(
         [SubTaskStatus.WAITING, SubTaskStatus.IN_PROGRESS, SubTaskStatus.COMPLETE]
     )
 
+    # assert
     assert t.status is TaskStatus.IN_PROGRESS
     assert t.job_completed is None
 
 
 def test_recalculate_all_complete_marks_complete() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
 
+    # act
     t.recalculate_from_subtasks([SubTaskStatus.COMPLETE, SubTaskStatus.COMPLETE])
 
+    # assert
     assert t.status is TaskStatus.COMPLETE
     assert t.job_completed is not None
 
 
 def test_recalculate_any_failed_marks_failed_even_with_others() -> None:
+    # arrange
     t = Task.create(task_source="manual:test")
 
+    # act
     t.recalculate_from_subtasks(
         [SubTaskStatus.IN_PROGRESS, SubTaskStatus.COMPLETE, SubTaskStatus.FAILED]
     )
 
+    # assert
     assert t.status is TaskStatus.FAILED
     assert t.job_completed is not None
diff --git a/tests/domain/test_postcode.py b/tests/domain/test_postcode.py
index 89d5cdc8..f7ce9015 100644
--- a/tests/domain/test_postcode.py
+++ b/tests/domain/test_postcode.py
@@ -6,43 +6,54 @@ from domain.postcode import Postcode
 
 
 def test_postcode_uppercases() -> None:
+    # act / assert
     assert Postcode("sw1a1aa").value == "SW1A1AA"
 
 
 def test_postcode_strips_internal_spaces() -> None:
+    # act / assert
     assert Postcode("sw1a 1aa").value == "SW1A1AA"
 
 
 def test_postcode_strips_leading_and_trailing_whitespace() -> None:
+    # act / assert
     assert Postcode("  sw1a 1aa  ").value == "SW1A1AA"
 
 
 def test_postcode_strips_tabs_and_newlines() -> None:
     # CSV ingestion occasionally introduces stray whitespace characters; the
     # canonical form must absorb them just like literal spaces.
+    # act / assert
     assert Postcode("sw1a\t1aa\n").value == "SW1A1AA"
 
 
 def test_postcode_construction_is_idempotent() -> None:
+    # arrange
     once = Postcode("sw1a 1aa")
+    # act / assert
     assert Postcode(once.value).value == "SW1A1AA"
 
 
 def test_postcode_empty_string() -> None:
+    # act / assert
     assert Postcode("").value == ""
 
 
 def test_postcode_str_returns_canonical_value() -> None:
+    # act / assert
     assert str(Postcode("sw1a 1aa")) == "SW1A1AA"
 
 
 def test_postcode_equality_ignores_surface_form() -> None:
     # Differing case / whitespace sanitise to the same canonical value, so
     # the value objects compare equal.
+    # act / assert
     assert Postcode("sw1a 1aa") == Postcode("SW1A1AA")
 
 
 def test_postcode_is_frozen() -> None:
+    # arrange
     postcode = Postcode("SW1A1AA")
+    # act / assert
     with pytest.raises(dataclasses.FrozenInstanceError):
         postcode.value = "OTHER"  # type: ignore[misc]
diff --git a/tests/infrastructure/test_address2uprn_queue_client.py b/tests/infrastructure/test_address2uprn_queue_client.py
index b4114742..c8e89ece 100644
--- a/tests/infrastructure/test_address2uprn_queue_client.py
+++ b/tests/infrastructure/test_address2uprn_queue_client.py
@@ -28,12 +28,15 @@ def queue_setup() -> Iterator[tuple[Address2UprnQueueClient, Any, str]]:
 def test_publish_returns_message_id(
     queue_setup: tuple[Address2UprnQueueClient, Any, str],
 ) -> None:
+    # arrange
     client, _boto, _url = queue_setup
+    # act
     message_id = client.publish(
         parent_task_id=uuid4(),
         child_subtask_id=uuid4(),
         s3_uri="s3://my-bucket/path/to/chunk.csv",
     )
+    # assert
     assert isinstance(message_id, str)
     assert message_id
 
@@ -41,17 +44,20 @@ def test_publish_returns_message_id(
 def test_publish_body_uses_typed_shape(
     queue_setup: tuple[Address2UprnQueueClient, Any, str],
 ) -> None:
+    # arrange
     client, boto_client, queue_url = queue_setup
     parent_id = uuid4()
     child_id = uuid4()
     s3_uri = "s3://my-bucket/path/to/chunk.csv"
 
+    # act
     client.publish(
         parent_task_id=parent_id,
         child_subtask_id=child_id,
         s3_uri=s3_uri,
     )
 
+    # assert
     received: dict[str, Any] = boto_client.receive_message(
         QueueUrl=queue_url, MaxNumberOfMessages=1
     )
diff --git a/tests/infrastructure/test_csv_s3_client.py b/tests/infrastructure/test_csv_s3_client.py
index 4b9fc199..30e27164 100644
--- a/tests/infrastructure/test_csv_s3_client.py
+++ b/tests/infrastructure/test_csv_s3_client.py
@@ -18,26 +18,34 @@ def csv_client() -> Iterator[CsvS3Client]:
 
 
 def test_save_rows_returns_s3_uri(csv_client: CsvS3Client) -> None:
+    # arrange
     rows = [{"address": "1 High St", "postcode": "AB1 2CD"}]
+    # act
     uri = csv_client.save_rows(rows, "uploads/addresses.csv")
+    # assert
     assert uri == f"s3://{BUCKET}/uploads/addresses.csv"
 
 
 def test_round_trip_preserves_rows(csv_client: CsvS3Client) -> None:
+    # arrange
     rows = [
         {"address": "1 High St", "postcode": "AB1 2CD"},
         {"address": "2 Low St", "postcode": "XY9 8ZW"},
     ]
+    # act
     uri = csv_client.save_rows(rows, "uploads/addresses.csv")
     fetched = csv_client.read_rows(uri)
+    # assert
     assert fetched == rows
 
 
 def test_save_rows_rejects_empty_list(csv_client: CsvS3Client) -> None:
+    # act / assert
     with pytest.raises(ValueError, match="empty"):
         csv_client.save_rows([], "uploads/empty.csv")
 
 
 def test_read_rows_rejects_wrong_bucket(csv_client: CsvS3Client) -> None:
+    # act / assert
     with pytest.raises(ValueError, match="does not match client bucket"):
         csv_client.read_rows("s3://other-bucket/uploads/addresses.csv")
diff --git a/tests/infrastructure/test_s3_client.py b/tests/infrastructure/test_s3_client.py
index 7ed4c30b..67db4f58 100644
--- a/tests/infrastructure/test_s3_client.py
+++ b/tests/infrastructure/test_s3_client.py
@@ -18,14 +18,19 @@ def s3_client() -> Iterator[S3Client]:
 
 
 def test_put_object_returns_s3_uri(s3_client: S3Client) -> None:
+    # act
     uri = s3_client.put_object("folder/data.bin", b"payload")
+    # assert
     assert uri == f"s3://{BUCKET}/folder/data.bin"
 
 
 def test_get_object_returns_bytes_written_by_put_object(s3_client: S3Client) -> None:
+    # arrange
     s3_client.put_object("round/trip.bin", b"hello world")
+    # act / assert
     assert s3_client.get_object("round/trip.bin") == b"hello world"
 
 
 def test_bucket_property_exposes_configured_bucket(s3_client: S3Client) -> None:
+    # act / assert
     assert s3_client.bucket == BUCKET
diff --git a/tests/infrastructure/test_s3_uri.py b/tests/infrastructure/test_s3_uri.py
index 896c5959..32fd710f 100644
--- a/tests/infrastructure/test_s3_uri.py
+++ b/tests/infrastructure/test_s3_uri.py
@@ -4,29 +4,37 @@ from infrastructure.s3_uri import parse_s3_uri
 
 
 def test_parses_simple_s3_uri() -> None:
+    # act / assert
     assert parse_s3_uri("s3://my-bucket/file.csv") == ("my-bucket", "file.csv")
 
 
 def test_parses_s3_uri_with_nested_key() -> None:
+    # act
     bucket, key = parse_s3_uri("s3://my-bucket/nested/path/to/file.csv")
+    # assert
     assert (bucket, key) == ("my-bucket", "nested/path/to/file.csv")
 
 
 def test_rejects_s3_uri_without_key() -> None:
+    # act / assert
     with pytest.raises(ValueError, match="bucket and a key"):
         parse_s3_uri("s3://my-bucket")
 
 
 def test_rejects_s3_uri_with_empty_key() -> None:
+    # act / assert
     with pytest.raises(ValueError, match="bucket and a key"):
         parse_s3_uri("s3://my-bucket/")
 
 
 def test_parses_console_url_prefix() -> None:
+    # arrange
     url = "https://eu-west-2.console.aws.amazon.com/s3/object/my-bucket?prefix=nested%2Ffile.csv"
+    # act / assert
     assert parse_s3_uri(url) == ("my-bucket", "nested/file.csv")
 
 
 def test_rejects_unparseable_string() -> None:
+    # act / assert
     with pytest.raises(ValueError):
         parse_s3_uri("not-a-uri-at-all")
diff --git a/tests/infrastructure/test_sqs_client.py b/tests/infrastructure/test_sqs_client.py
index 7f1e8f78..44186bbb 100644
--- a/tests/infrastructure/test_sqs_client.py
+++ b/tests/infrastructure/test_sqs_client.py
@@ -19,17 +19,23 @@ def sqs_setup() -> Iterator[tuple[SqsClient, Any, str]]:
 
 
 def test_send_returns_message_id(sqs_setup: tuple[SqsClient, Any, str]) -> None:
+    # arrange
     client, _boto, _url = sqs_setup
+    # act
     message_id = client.send({"hello": "world"})
+    # assert
     assert isinstance(message_id, str)
     assert message_id
 
 
 def test_send_json_serialises_body(sqs_setup: tuple[SqsClient, Any, str]) -> None:
+    # arrange
     client, boto_client, queue_url = sqs_setup
     body = {"hello": "world", "count": 3}
+    # act
     client.send(body)
 
+    # assert
     received: dict[str, Any] = boto_client.receive_message(
         QueueUrl=queue_url, MaxNumberOfMessages=1
     )
diff --git a/tests/orchestration/test_postcode_splitter_orchestrator.py b/tests/orchestration/test_postcode_splitter_orchestrator.py
index 4ee2315e..a718ffbc 100644
--- a/tests/orchestration/test_postcode_splitter_orchestrator.py
+++ b/tests/orchestration/test_postcode_splitter_orchestrator.py
@@ -9,7 +9,8 @@ from typing import Any, cast
 import boto3
 import pytest
 from moto import mock_aws
-from sqlmodel import Session, SQLModel, create_engine
+from sqlalchemy import Engine
+from sqlmodel import Session
 
 from infrastructure.address2uprn_queue_client import Address2UprnQueueClient
 from infrastructure.csv_s3_client import CsvS3Client
@@ -65,7 +66,7 @@ class Harness:
 
 
 @pytest.fixture
-def harness() -> Iterator[Harness]:
+def harness(db_engine: Engine) -> Iterator[Harness]:
     with mock_aws():
         # Infra: S3 + SQS
         boto_s3 = _make_boto_client("s3")
@@ -78,10 +79,8 @@ def harness() -> Iterator[Harness]:
         repo = UserAddressCsvS3Repository(csv_client, BUCKET)
         queue_client = Address2UprnQueueClient(boto_sqs, queue_url)
 
-        # DB: in-memory SQLite TaskOrchestrator
-        engine = create_engine("sqlite://")
-        SQLModel.metadata.create_all(engine)
-        with Session(engine) as session:
+        # DB: ephemeral PostgreSQL TaskOrchestrator
+        with Session(db_engine) as session:
             task_repo = TaskPostgresRepository(session=session)
             subtask_repo = SubTaskPostgresRepository(session=session)
             task_orchestrator = TaskOrchestrator(
@@ -169,6 +168,7 @@ def _drain_queue(boto_sqs: Any, queue_url: str) -> list[dict[str, Any]]:
 def test_split_and_dispatch_creates_three_children_for_fixture(
     harness: Harness,
 ) -> None:
+    # arrange
     parent_task, parent_subtask = (
         harness.task_orchestrator.create_task_with_subtask(
             task_source="manual:postcode-splitter-int"
@@ -176,12 +176,14 @@ def test_split_and_dispatch_creates_three_children_for_fixture(
     )
     input_uri = _upload_fixture_csv(harness.csv_client)
 
+    # act
     child_ids = harness.splitter.split_and_dispatch(
         parent_task_id=parent_task.id,
         parent_subtask_id=parent_subtask.id,
         input_s3_uri=input_uri,
     )
 
+    # assert
     assert len(child_ids) == 3
     # All child ids are unique and persisted as WAITING children of the
     # parent task.
@@ -194,6 +196,7 @@ def test_split_and_dispatch_creates_three_children_for_fixture(
 def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri(
     harness: Harness,
 ) -> None:
+    # arrange
     parent_task, parent_subtask = (
         harness.task_orchestrator.create_task_with_subtask(
             task_source="manual:postcode-splitter-int"
@@ -201,12 +204,14 @@ def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri(
     )
     input_uri = _upload_fixture_csv(harness.csv_client)
 
+    # act
     child_ids = harness.splitter.split_and_dispatch(
         parent_task_id=parent_task.id,
         parent_subtask_id=parent_subtask.id,
         input_s3_uri=input_uri,
     )
 
+    # assert
     for cid in child_ids:
         child = harness.subtasks.get(cid)
         assert child.inputs is not None
@@ -224,6 +229,7 @@ def test_split_and_dispatch_persists_child_inputs_with_task_id_and_s3_uri(
 def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids(
     harness: Harness,
 ) -> None:
+    # arrange
     parent_task, parent_subtask = (
         harness.task_orchestrator.create_task_with_subtask(
             task_source="manual:postcode-splitter-int"
@@ -231,12 +237,14 @@ def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids(
     )
     input_uri = _upload_fixture_csv(harness.csv_client)
 
+    # act
     child_ids = harness.splitter.split_and_dispatch(
         parent_task_id=parent_task.id,
         parent_subtask_id=parent_subtask.id,
         input_s3_uri=input_uri,
     )
 
+    # assert
     bodies = _drain_queue(harness.boto_sqs, harness.queue_url)
     assert len(bodies) == len(child_ids)
 
@@ -258,6 +266,7 @@ def test_split_and_dispatch_publishes_one_message_per_child_with_matching_ids(
 def test_split_and_dispatch_returns_child_ids_in_dispatch_order(
     harness: Harness,
 ) -> None:
+    # arrange
     parent_task, parent_subtask = (
         harness.task_orchestrator.create_task_with_subtask(
             task_source="manual:postcode-splitter-int"
@@ -265,12 +274,14 @@ def test_split_and_dispatch_returns_child_ids_in_dispatch_order(
     )
     input_uri = _upload_fixture_csv(harness.csv_client)
 
+    # act
     child_ids = harness.splitter.split_and_dispatch(
         parent_task_id=parent_task.id,
         parent_subtask_id=parent_subtask.id,
         input_s3_uri=input_uri,
     )
 
+    # assert
     # Re-load each child's saved batch and inspect the postcode_clean column
     # to confirm the dispatch order matches the postcode-batching algorithm:
     # AA-batch first, BB oversize batch second, CC final-flush third.
diff --git a/tests/orchestration/test_task_orchestrator.py b/tests/orchestration/test_task_orchestrator.py
index c0816d2d..ae89991d 100644
--- a/tests/orchestration/test_task_orchestrator.py
+++ b/tests/orchestration/test_task_orchestrator.py
@@ -2,7 +2,8 @@ from collections.abc import Iterator
 from dataclasses import dataclass
 
 import pytest
-from sqlmodel import Session, SQLModel, create_engine
+from sqlalchemy import Engine
+from sqlmodel import Session
 
 from domain.tasks.subtasks import SubTask, SubTaskStatus
 from domain.tasks.tasks import Source, TaskStatus
@@ -19,10 +20,8 @@ class Harness:
 
 
 @pytest.fixture
-def harness() -> Iterator[Harness]:
-    engine = create_engine("sqlite://")
-    SQLModel.metadata.create_all(engine)
-    with Session(engine) as session:
+def harness(db_engine: Engine) -> Iterator[Harness]:
+    with Session(db_engine) as session:
         tasks = TaskPostgresRepository(session=session)
         subtasks = SubTaskPostgresRepository(session=session)
         yield Harness(
@@ -35,6 +34,7 @@ def harness() -> Iterator[Harness]:
 def test_create_task_with_subtask_creates_both_in_waiting(
     harness: Harness,
 ) -> None:
+    # act
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test",
         inputs={"foo": "bar"},
@@ -42,6 +42,7 @@ def test_create_task_with_subtask_creates_both_in_waiting(
         source_id="abc",
     )
 
+    # assert
     assert task.status is TaskStatus.WAITING
     assert subtask.status is SubTaskStatus.WAITING
     assert subtask.task_id == task.id
@@ -49,27 +50,33 @@ def test_create_task_with_subtask_creates_both_in_waiting(
 
 
 def test_start_subtask_cascades_to_in_progress(harness: Harness) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
 
+    # act
     started = harness.orchestrator.start_subtask(
         subtask.id, cloud_logs_url="https://example/log"
     )
 
+    # assert
     assert started.status is SubTaskStatus.IN_PROGRESS
     assert started.cloud_logs_url == "https://example/log"
     assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS
 
 
 def test_complete_subtask_cascades_to_complete(harness: Harness) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
     harness.orchestrator.start_subtask(subtask.id)
 
+    # act
     harness.orchestrator.complete_subtask(subtask.id, {"value": 42})
 
+    # assert
     done_subtask = harness.subtasks.get(subtask.id)
     done_task = harness.tasks.get(task.id)
     assert done_subtask.outputs == {"result": {"value": 42}}
@@ -78,12 +85,15 @@ def test_complete_subtask_cascades_to_complete(harness: Harness) -> None:
 
 
 def test_fail_subtask_cascades_to_failed(harness: Harness) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
 
+    # act
     harness.orchestrator.fail_subtask(subtask.id, RuntimeError("boom"))
 
+    # assert
     failed_subtask = harness.subtasks.get(subtask.id)
     failed_task = harness.tasks.get(task.id)
     assert failed_subtask.outputs == {"error": "boom"}
@@ -93,42 +103,51 @@ def test_fail_subtask_cascades_to_failed(harness: Harness) -> None:
 def test_failed_subtask_locks_task_failed_even_with_others_complete(
     harness: Harness,
 ) -> None:
+    # arrange
     task, first = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
     second = SubTask.create(task_id=task.id)
     harness.subtasks.create(second)
 
+    # act
     harness.orchestrator.complete_subtask(first.id)
     harness.orchestrator.fail_subtask(second.id, RuntimeError("nope"))
 
+    # assert
     assert harness.tasks.get(task.id).status is TaskStatus.FAILED
 
 
 def test_mixed_complete_and_in_progress_keeps_task_in_progress(
     harness: Harness,
 ) -> None:
+    # arrange
     task, first = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
     second = SubTask.create(task_id=task.id)
     harness.subtasks.create(second)
 
+    # act
     harness.orchestrator.complete_subtask(first.id)
     harness.orchestrator.start_subtask(second.id)
 
+    # assert
     assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS
 
 
 def test_run_subtask_happy_path_returns_result_and_cascades_complete(
     harness: Harness,
 ) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
 
+    # act
     result = harness.orchestrator.run_subtask(subtask.id, work=lambda: {"answer": 42})
 
+    # assert
     assert result == {"answer": 42}
     assert harness.subtasks.get(subtask.id).status is SubTaskStatus.COMPLETE
     assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE
@@ -137,16 +156,19 @@ def test_run_subtask_happy_path_returns_result_and_cascades_complete(
 def test_create_child_subtask_adds_waiting_child_without_changing_parent_status(
     harness: Harness,
 ) -> None:
+    # arrange
     task, first = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
     harness.orchestrator.start_subtask(first.id)
     assert harness.tasks.get(task.id).status is TaskStatus.IN_PROGRESS
 
+    # act
     child = harness.orchestrator.create_child_subtask(
         task.id, inputs={"split": "a"}
     )
 
+    # assert
     persisted_child = harness.subtasks.get(child.id)
     assert persisted_child.task_id == task.id
     assert persisted_child.status is SubTaskStatus.WAITING
@@ -159,6 +181,7 @@ def test_create_child_subtask_adds_waiting_child_without_changing_parent_status(
 def test_run_subtask_failing_work_marks_failed_and_reraises(
     harness: Harness,
 ) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -166,6 +189,7 @@ def test_run_subtask_failing_work_marks_failed_and_reraises(
     def boom() -> None:
         raise RuntimeError("boom")
 
+    # act / assert
     with pytest.raises(RuntimeError, match="boom"):
         harness.orchestrator.run_subtask(subtask.id, work=boom)
 
diff --git a/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py b/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py
index ac39e089..9cec52ea 100644
--- a/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py
+++ b/tests/repositories/tasks/postgres/test_subtask_postgres_repository.py
@@ -1,33 +1,40 @@
 from collections.abc import Iterator
-from uuid import uuid4
+from uuid import UUID, uuid4
 
 import pytest
-from sqlmodel import Session, SQLModel, create_engine
+from sqlalchemy import Engine
+from sqlmodel import Session
 
-# Importing the SQLModel row modules registers their tables in
-# SQLModel.metadata so create_all builds both. Imports look unused; they aren't.
-import infrastructure.postgres.subtask_table  # noqa: F401  # pyright: ignore[reportUnusedImport]
-import infrastructure.postgres.task_table  # noqa: F401  # pyright: ignore[reportUnusedImport]
 from domain.tasks.subtasks import SubTask, SubTaskStatus
+from domain.tasks.tasks import Task
 from repositories.tasks.subtask_postgres_repository import SubTaskPostgresRepository
+from repositories.tasks.task_postgres_repository import TaskPostgresRepository
 
 
 @pytest.fixture
-def session() -> Iterator[Session]:
-    engine = create_engine("sqlite://")
-    SQLModel.metadata.create_all(engine)
-    with Session(engine) as s:
+def session(db_engine: Engine) -> Iterator[Session]:
+    with Session(db_engine) as s:
         yield s
 
 
+def _persisted_task_id(session: Session) -> UUID:
+    """Create a parent Task row so SubTask FK constraints are satisfied."""
+    task = Task.create(task_source="manual:test")
+    TaskPostgresRepository(session=session).create(task)
+    return task.id
+
+
 def test_create_and_get_round_trip_preserves_inputs(session: Session) -> None:
+    # arrange
     repo = SubTaskPostgresRepository(session=session)
-    task_id = uuid4()
+    task_id = _persisted_task_id(session)
     st = SubTask.create(task_id=task_id, inputs={"address": "68 Glendon Way"})
 
+    # act
     repo.create(st)
     fetched = repo.get(st.id)
 
+    # assert
     assert fetched.id == st.id
     assert fetched.task_id == task_id
     assert fetched.status is SubTaskStatus.WAITING
@@ -36,16 +43,21 @@ def test_create_and_get_round_trip_preserves_inputs(session: Session) -> None:
 
 
 def test_save_persists_status_and_outputs(session: Session) -> None:
+    # arrange
     repo = SubTaskPostgresRepository(session=session)
-    st = SubTask.create(task_id=uuid4())
+    st = SubTask.create(task_id=_persisted_task_id(session))
     repo.create(st)
 
+    # act
     st.start(cloud_logs_url="https://example/log")
     repo.save(st)
+    # assert
     assert repo.get(st.id).status is SubTaskStatus.IN_PROGRESS
 
+    # act
     st.complete({"uprn": "123"})
     repo.save(st)
+    # assert
     done = repo.get(st.id)
     assert done.status is SubTaskStatus.COMPLETE
     assert done.outputs == {"result": {"uprn": "123"}}
@@ -54,16 +66,19 @@ def test_save_persists_status_and_outputs(session: Session) -> None:
 
 
 def test_list_by_task_filters_by_task_id(session: Session) -> None:
+    # arrange
     repo = SubTaskPostgresRepository(session=session)
-    task_a = uuid4()
-    task_b = uuid4()
+    task_a = _persisted_task_id(session)
+    task_b = _persisted_task_id(session)
     repo.create(SubTask.create(task_id=task_a))
     repo.create(SubTask.create(task_id=task_a))
     repo.create(SubTask.create(task_id=task_b))
 
+    # act
     a_results = repo.list_by_task(task_a)
     b_results = repo.list_by_task(task_b)
 
+    # assert
     assert len(a_results) == 2
     assert len(b_results) == 1
     assert all(s.task_id == task_a for s in a_results)
@@ -71,11 +86,15 @@ def test_list_by_task_filters_by_task_id(session: Session) -> None:
 
 
 def test_list_by_task_returns_empty_for_unknown_task(session: Session) -> None:
+    # arrange
     repo = SubTaskPostgresRepository(session=session)
+    # act / assert
     assert repo.list_by_task(uuid4()) == []
 
 
 def test_get_missing_raises(session: Session) -> None:
+    # arrange
     repo = SubTaskPostgresRepository(session=session)
+    # act / assert
     with pytest.raises(ValueError, match="not found"):
         repo.get(uuid4())
diff --git a/tests/repositories/tasks/postgres/test_task_postgres_repository.py b/tests/repositories/tasks/postgres/test_task_postgres_repository.py
index 3e1aa226..8a49a861 100644
--- a/tests/repositories/tasks/postgres/test_task_postgres_repository.py
+++ b/tests/repositories/tasks/postgres/test_task_postgres_repository.py
@@ -2,7 +2,8 @@ from collections.abc import Iterator
 from uuid import uuid4
 
 import pytest
-from sqlmodel import Session, SQLModel, create_engine
+from sqlalchemy import Engine
+from sqlmodel import Session
 
 from domain.tasks.tasks import Source, Task, TaskStatus
 from infrastructure.postgres.task_table import TaskRow
@@ -10,25 +11,23 @@ from repositories.tasks.task_postgres_repository import TaskPostgresRepository
 
 
 @pytest.fixture
-def session() -> Iterator[Session]:
-    engine = create_engine("sqlite://")
-    SQLModel.metadata.create_all(engine)
-    with Session(engine) as s:
+def session(db_engine: Engine) -> Iterator[Session]:
+    with Session(db_engine) as s:
         yield s
 
 
 def test_create_and_get_round_trip(session: Session) -> None:
-    # Arrange
+    # arrange
     repo = TaskPostgresRepository(session=session)
     t = Task.create(
         task_source="manual:test", source=Source.PORTFOLIO, source_id="abc-123"
     )
 
-    # Act
+    # act
     repo.create(t)
     fetched = repo.get(t.id)
 
-    # Assert
+    # assert
     assert fetched.id == t.id
     assert fetched.status is TaskStatus.WAITING
     assert fetched.source is Source.PORTFOLIO
@@ -36,33 +35,43 @@ def test_create_and_get_round_trip(session: Session) -> None:
 
 
 def test_save_persists_status_transition(session: Session) -> None:
+    # arrange
     repo = TaskPostgresRepository(session=session)
     t = Task.create(task_source="manual:test")
     repo.create(t)
 
+    # act
     t.start()
     repo.save(t)
+    # assert
     assert repo.get(t.id).status is TaskStatus.IN_PROGRESS
 
+    # act
     t.complete()
     repo.save(t)
+    # assert
     done = repo.get(t.id)
     assert done.status is TaskStatus.COMPLETE
     assert done.job_completed is not None
 
 
 def test_get_missing_raises(session: Session) -> None:
+    # arrange
     repo = TaskPostgresRepository(session=session)
+    # act / assert
     with pytest.raises(ValueError, match="not found"):
         repo.get(uuid4())
 
 
 def test_get_normalises_legacy_capitalised_status(session: Session) -> None:
     # Existing rows written by backend code use "In Progress" (capitalised).
+    # arrange
     repo = TaskPostgresRepository(session=session)
     row = TaskRow(task_source="manual:test", status="In Progress")
     session.add(row)
     session.commit()
 
+    # act
     fetched = repo.get(row.id)
+    # assert
     assert fetched.status is TaskStatus.IN_PROGRESS
diff --git a/tests/repositories/user_address/test_user_address_csv_s3_repository.py b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
index c1acee32..9ffb250a 100644
--- a/tests/repositories/user_address/test_user_address_csv_s3_repository.py
+++ b/tests/repositories/user_address/test_user_address_csv_s3_repository.py
@@ -32,6 +32,7 @@ def _upload_csv(
 def test_load_batch_parses_address_postcode_and_reference(
     repo: UserAddressCsvS3Repository,
 ) -> None:
+    # arrange
     rows = [
         {
             "Address 1": "1 High Street",
@@ -43,8 +44,10 @@ def test_load_batch_parses_address_postcode_and_reference(
     ]
     uri = _upload_csv(repo, rows, "uploads/full.csv")
 
+    # act
     addresses = repo.load_batch(uri)
 
+    # assert
     assert len(addresses) == 1
     address = addresses[0]
     assert address.user_address == "1 High Street, Flat 2, Townville"
@@ -55,6 +58,7 @@ def test_load_batch_parses_address_postcode_and_reference(
 def test_load_batch_uses_only_address_1_when_others_missing(
     repo: UserAddressCsvS3Repository,
 ) -> None:
+    # arrange
     rows = [
         {
             "Address 1": "10 Cardiff Road",
@@ -66,8 +70,10 @@ def test_load_batch_uses_only_address_1_when_others_missing(
     ]
     uri = _upload_csv(repo, rows, "uploads/address1-only.csv")
 
+    # act
     addresses = repo.load_batch(uri)
 
+    # assert
     assert len(addresses) == 1
     assert addresses[0].user_address == "10 Cardiff Road"
     assert addresses[0].postcode == Postcode("CF101AA")
@@ -77,6 +83,7 @@ def test_load_batch_uses_only_address_1_when_others_missing(
 def test_load_batch_handles_missing_internal_reference(
     repo: UserAddressCsvS3Repository,
 ) -> None:
+    # arrange
     rows = [
         {
             "Address 1": "5 Park Lane",
@@ -88,8 +95,10 @@ def test_load_batch_handles_missing_internal_reference(
     ]
     uri = _upload_csv(repo, rows, "uploads/no-ref.csv")
 
+    # act
     addresses = repo.load_batch(uri)
 
+    # assert
     assert len(addresses) == 1
     assert addresses[0].user_address == "5 Park Lane"
     assert addresses[0].postcode == Postcode("M11AA")
@@ -101,6 +110,7 @@ def test_load_batch_captures_full_source_row(
 ) -> None:
     # A raw EPC-export-shaped row: the splitter must preserve every column,
     # not just the ones it parses into UserAddress fields.
+    # arrange
     row = {
         "Asset Reference": "511",
         "Address 1": "9 Abingdon Road Padiham Lancashire BB12 7BX",
@@ -110,17 +120,21 @@ def test_load_batch_captures_full_source_row(
     }
     uri = _upload_csv(repo, [row], "uploads/epc.csv")
 
+    # act
     addresses = repo.load_batch(uri)
 
+    # assert
     assert addresses[0].source_row == row
 
 
 def test_load_batch_raises_when_postcode_column_absent(
     repo: UserAddressCsvS3Repository,
 ) -> None:
+    # arrange
     rows = [{"Address 1": "1 High Street", "Property Type": "Flat"}]
     uri = _upload_csv(repo, rows, "uploads/no-postcode.csv")
 
+    # act / assert
     with pytest.raises(ValueError, match="no 'postcode' column"):
         repo.load_batch(uri)
 
@@ -128,6 +142,7 @@ def test_load_batch_raises_when_postcode_column_absent(
 def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
     repo: UserAddressCsvS3Repository,
 ) -> None:
+    # arrange
     row = {
         "Asset Reference": "511",
         "Address 1": "9 Abingdon Road Padiham Lancashire BB12 7BX",
@@ -137,9 +152,11 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
     uri = _upload_csv(repo, [row], "uploads/epc.csv")
     addresses = repo.load_batch(uri)
 
+    # act
     saved_uri = repo.save_batch(addresses, "tasks/passthrough")
     saved_rows = repo._csv_client.read_rows(saved_uri)  # pyright: ignore[reportPrivateUsage]
 
+    # assert
     assert len(saved_rows) == 1
     saved = saved_rows[0]
     # Every original column survives, byte-for-byte.
@@ -152,6 +169,7 @@ def test_save_batch_passes_through_all_columns_and_appends_postcode_clean(
 def test_save_batch_returns_uri_under_path_prefix(
     repo: UserAddressCsvS3Repository,
 ) -> None:
+    # arrange
     addresses = [
         UserAddress(
             user_address="1 High Street",
@@ -160,8 +178,10 @@ def test_save_batch_returns_uri_under_path_prefix(
         ),
     ]
 
+    # act
     uri = repo.save_batch(addresses, "tasks/abc/batches")
 
+    # assert
     assert uri.startswith(f"s3://{BUCKET}/tasks/abc/batches/")
     assert uri.endswith(".csv")
 
@@ -169,6 +189,7 @@ def test_save_batch_returns_uri_under_path_prefix(
 def test_save_then_reload_round_trip_preserves_columns(
     repo: UserAddressCsvS3Repository,
 ) -> None:
+    # arrange
     rows = [
         {
             "Address 1": "1 High Street",
@@ -184,9 +205,11 @@ def test_save_then_reload_round_trip_preserves_columns(
     uri = _upload_csv(repo, rows, "uploads/round-trip.csv")
     addresses = repo.load_batch(uri)
 
+    # act
     saved_uri = repo.save_batch(addresses, "tasks/round-trip")
     saved_rows = repo._csv_client.read_rows(saved_uri)  # pyright: ignore[reportPrivateUsage]
 
+    # assert
     # Original columns come back verbatim; postcode_clean is the only addition.
     assert [
         {k: v for k, v in r.items() if k != "postcode_clean"} for r in saved_rows
@@ -197,6 +220,7 @@ def test_save_then_reload_round_trip_preserves_columns(
 def test_save_batch_uses_unique_filename_per_call(
     repo: UserAddressCsvS3Repository,
 ) -> None:
+    # arrange
     addresses = [
         UserAddress(
             user_address="1 High Street",
@@ -205,7 +229,9 @@ def test_save_batch_uses_unique_filename_per_call(
         ),
     ]
 
+    # act
     uri_1 = repo.save_batch(addresses, "tasks/uniqueness")
     uri_2 = repo.save_batch(addresses, "tasks/uniqueness")
 
+    # assert
     assert uri_1 != uri_2
diff --git a/tests/utilities/aws_lambda/test_subtask_handler.py b/tests/utilities/aws_lambda/test_subtask_handler.py
index 9cf68f28..d671adc4 100644
--- a/tests/utilities/aws_lambda/test_subtask_handler.py
+++ b/tests/utilities/aws_lambda/test_subtask_handler.py
@@ -6,7 +6,8 @@ from typing import Any
 from uuid import UUID
 
 import pytest
-from sqlmodel import Session, SQLModel, create_engine
+from sqlalchemy import Engine
+from sqlmodel import Session
 
 from domain.tasks.subtasks import SubTaskStatus
 from domain.tasks.tasks import TaskStatus
@@ -30,10 +31,8 @@ class Harness:
 
 
 @pytest.fixture
-def harness() -> Iterator[Harness]:
-    engine = create_engine("sqlite://")
-    SQLModel.metadata.create_all(engine)
-    with Session(engine) as session:
+def harness(db_engine: Engine) -> Iterator[Harness]:
+    with Session(db_engine) as session:
         tasks = TaskPostgresRepository(session=session)
         subtasks = SubTaskPostgresRepository(session=session)
         yield Harness(
@@ -50,6 +49,7 @@ def _direct_event(task_id: UUID, subtask_id: UUID) -> dict[str, Any]:
 def test_subtask_handler_injects_orchestrator_as_third_positional_argument(
     harness: Harness,
 ) -> None:
+    # arrange
     _, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -64,8 +64,10 @@ def test_subtask_handler_injects_orchestrator_as_third_positional_argument(
         received["context"] = context
         received["orchestrator"] = orchestrator
 
+    # act
     handler(_direct_event(subtask.task_id, subtask.id), context="ctx-sentinel")
 
+    # assert
     assert received["orchestrator"] is harness.orchestrator
     assert received["context"] == "ctx-sentinel"
     assert received["body"]["sub_task_id"] == str(subtask.id)
@@ -74,6 +76,7 @@ def test_subtask_handler_injects_orchestrator_as_third_positional_argument(
 def test_subtask_handler_completes_parent_subtask_on_success(
     harness: Harness,
 ) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -84,8 +87,10 @@ def test_subtask_handler_completes_parent_subtask_on_success(
     ) -> None:
         return None
 
+    # act
     handler(_direct_event(task.id, subtask.id), context=None)
 
+    # assert
     assert harness.subtasks.get(subtask.id).status is SubTaskStatus.COMPLETE
     assert harness.tasks.get(task.id).status is TaskStatus.COMPLETE
 
@@ -93,6 +98,7 @@ def test_subtask_handler_completes_parent_subtask_on_success(
 def test_subtask_handler_marks_parent_failed_and_reraises_on_error(
     harness: Harness,
 ) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -103,6 +109,7 @@ def test_subtask_handler_marks_parent_failed_and_reraises_on_error(
     ) -> None:
         raise RuntimeError("boom")
 
+    # act / assert
     with pytest.raises(RuntimeError, match="boom"):
         handler(_direct_event(task.id, subtask.id), context=None)
 
@@ -113,6 +120,7 @@ def test_subtask_handler_marks_parent_failed_and_reraises_on_error(
 def test_subtask_handler_injected_orchestrator_can_create_child_subtask(
     harness: Harness,
 ) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -126,8 +134,10 @@ def test_subtask_handler_injected_orchestrator_can_create_child_subtask(
         child = orchestrator.create_child_subtask(task.id, inputs={"split": 1})
         child_ids.append(child.id)
 
+    # act
     handler(_direct_event(task.id, subtask.id), context=None)
 
+    # assert
     assert len(child_ids) == 1
     persisted_child = harness.subtasks.get(child_ids[0])
     assert persisted_child.task_id == task.id
@@ -137,6 +147,7 @@ def test_subtask_handler_injected_orchestrator_can_create_child_subtask(
 def test_subtask_handler_logs_subtask_lifecycle_on_success(
     harness: Harness, caplog: pytest.LogCaptureFixture
 ) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -147,9 +158,11 @@ def test_subtask_handler_logs_subtask_lifecycle_on_success(
     ) -> None:
         return None
 
+    # act
     with caplog.at_level(logging.INFO, logger=_LOGGER_NAME):
         handler(_direct_event(task.id, subtask.id), context=None)
 
+    # assert
     assert f"Running subtask {subtask.id}" in caplog.text
     assert f"Subtask {subtask.id} completed" in caplog.text
 
@@ -157,6 +170,7 @@ def test_subtask_handler_logs_subtask_lifecycle_on_success(
 def test_subtask_handler_logs_exception_on_failure(
     harness: Harness, caplog: pytest.LogCaptureFixture
 ) -> None:
+    # arrange
     task, subtask = harness.orchestrator.create_task_with_subtask(
         task_source="manual:test"
     )
@@ -167,6 +181,7 @@ def test_subtask_handler_logs_exception_on_failure(
     ) -> None:
         raise RuntimeError("boom")
 
+    # act / assert
     with caplog.at_level(logging.INFO, logger=_LOGGER_NAME):
         with pytest.raises(RuntimeError, match="boom"):
             handler(_direct_event(task.id, subtask.id), context=None)
@@ -181,6 +196,7 @@ def test_subtask_handler_logs_exception_on_failure(
 def test_subtask_handler_records_cloudwatch_url_on_subtask(
     harness: Harness, monkeypatch: pytest.MonkeyPatch
 ) -> None:
+    # arrange
     monkeypatch.setenv("AWS_REGION", "eu-west-2")
     monkeypatch.setenv(
         "AWS_LAMBDA_LOG_GROUP_NAME", "/aws/lambda/postcode-splitter"
@@ -198,8 +214,10 @@ def test_subtask_handler_records_cloudwatch_url_on_subtask(
     ) -> None:
         return None
 
+    # act
     handler(_direct_event(task.id, subtask.id), context=None)
 
+    # assert
     saved_url = harness.subtasks.get(subtask.id).cloud_logs_url
     assert saved_url is not None
     assert saved_url.startswith(
@@ -213,6 +231,7 @@ def test_subtask_handler_records_cloudwatch_url_on_subtask(
 def test_subtask_handler_leaves_cloudwatch_url_unset_outside_lambda(
     harness: Harness, monkeypatch: pytest.MonkeyPatch
 ) -> None:
+    # arrange
     for var in (
         "AWS_REGION",
         "AWS_LAMBDA_LOG_GROUP_NAME",
@@ -229,6 +248,8 @@ def test_subtask_handler_leaves_cloudwatch_url_unset_outside_lambda(
     ) -> None:
         return None
 
+    # act
     handler(_direct_event(task.id, subtask.id), context=None)
 
+    # assert
     assert harness.subtasks.get(subtask.id).cloud_logs_url is None

From f10947699eca992b3cbc5ef9b69b744acaf73226 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 14:13:04 +0000
Subject: [PATCH 80/91] pytest.ini

---
 pytest.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytest.ini b/pytest.ini
index 99cc8e1b..5044465b 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -4,6 +4,7 @@ log_cli = true
 log_cli_level = INFO
 addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
 testpaths =
+    tests
     recommendations/tests
     backend/tests
     backend/address2UPRN/tests

From 154b820b29f7b6ba2c24c34a3a60a98435a79df8 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 14:26:46 +0000
Subject: [PATCH 81/91] pytest.ini

---
 .github/workflows/unit_tests.yml | 12 ++++++++++++
 pytest.ini                       |  1 -
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index fa4fdf2a..15d4cfe9 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -60,3 +60,15 @@ jobs:
             -e DB_PASSWORD=test \
             -e DB_PORT=5432 \
             model-test pytest -vv -m 'not integration'
+
+      # The DDD rewrite (tests/) defines SQLModel table classes that map to the
+      # same physical tables as the legacy backend models. Both sets share the
+      # one global SQLModel.metadata, so they cannot be imported into the same
+      # pytest process. It runs as a separate invocation until the legacy
+      # models are retired. Its DB is spawned in-process by pytest-postgresql,
+      # so no DB service or env is required.
+      - name: Run DDD tests
+        run: |
+          docker run --rm \
+            --network host \
+            model-test pytest -vv tests/
diff --git a/pytest.ini b/pytest.ini
index 5044465b..99cc8e1b 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -4,7 +4,6 @@ log_cli = true
 log_cli_level = INFO
 addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
 testpaths =
-    tests
     recommendations/tests
     backend/tests
     backend/address2UPRN/tests

From 8610a0c87518c3dd7c2625b839218aa4593b9e4c Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 15:17:55 +0000
Subject: [PATCH 82/91] actually deploy postcode splitter

---
 .github/workflows/deploy_terraform.yml               | 2 +-
 deployment/terraform/lambda/postcodeSplitter/main.tf | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index 923fc0a9..8ba473ca 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -169,7 +169,7 @@ jobs:
     uses: ./.github/workflows/_build_image.yml
     with:
       ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }}
-      dockerfile_path: backend/postcode_splitter/handler/Dockerfile
+      dockerfile_path: applications/postcode_splitter/Dockerfile
       build_context: .
       build_args: |
         DEV_DB_HOST=$DEV_DB_HOST
diff --git a/deployment/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf
index 325f7dc7..721cb2ea 100644
--- a/deployment/terraform/lambda/postcodeSplitter/main.tf
+++ b/deployment/terraform/lambda/postcodeSplitter/main.tf
@@ -38,8 +38,8 @@ module "lambda" {
     {
       STAGE     = var.stage
       LOG_LEVEL = "info"
-      DB_USERNAME = local.db_credentials.db_assessment_model_username
-      DB_PASSWORD = local.db_credentials.db_assessment_model_password
+      POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username
+      POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password
       ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url
       S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
     },

From 78c1d150fa2552ad4386cf113a0ee61523d8aa9a Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 15:25:42 +0000
Subject: [PATCH 83/91] added smoke test

---
 .github/workflows/lambda_smoke_tests.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/lambda_smoke_tests.yml b/.github/workflows/lambda_smoke_tests.yml
index 5ff5420a..b562f91e 100644
--- a/.github/workflows/lambda_smoke_tests.yml
+++ b/.github/workflows/lambda_smoke_tests.yml
@@ -36,6 +36,13 @@ jobs:
       build_context: .
       service_name: postcode-splitter
 
+  postcode_splitter_ddd_smoke_test:
+    uses: ./.github/workflows/_smoke_test_lambda.yml
+    with:
+      dockerfile_path: applications/postcode_splitter/Dockerfile
+      build_context: .
+      service_name: postcode-splitter-ddd
+
   # ============================================================
   # Bulk Address2UPRN Combiner
   # ============================================================

From 53b211e951c1b2eb71ac0fce20aefeab6cd9ddc5 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 15:43:41 +0000
Subject: [PATCH 84/91] epc token added

---
 .github/workflows/_build_image.yml      | 3 +++
 .github/workflows/deploy_terraform.yml  | 2 ++
 backend/address2UPRN/handler/Dockerfile | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml
index 3435c92d..e7ad9424 100644
--- a/.github/workflows/_build_image.yml
+++ b/.github/workflows/_build_image.yml
@@ -40,6 +40,8 @@ on:
         required: false
       EPC_AUTH_TOKEN:
         required: false
+      OPEN_EPC_API_TOKEN:
+        required: false
 
 jobs:
   build:
@@ -50,6 +52,7 @@ jobs:
       DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
       DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
       EPC_AUTH_TOKEN: ${{ secrets.EPC_AUTH_TOKEN }}
+      OPEN_EPC_API_TOKEN: ${{ secrets.OPEN_EPC_API_TOKEN }}
 
     outputs:
       image_digest: ${{ steps.digest.outputs.image_digest }}
diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index 8ba473ca..7f2eb890 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -133,6 +133,7 @@ jobs:
         DEV_DB_PORT=$DEV_DB_PORT
         DEV_DB_NAME=$DEV_DB_NAME
         EPC_AUTH_TOKEN=$EPC_AUTH_TOKEN
+        OPEN_EPC_API_TOKEN=$OPEN_EPC_API_TOKEN
     secrets:
       AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
@@ -141,6 +142,7 @@ jobs:
       DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
       DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
       EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }}
+      OPEN_EPC_API_TOKEN: ${{ secrets.DEV_OPEN_EPC_API_TOKEN }}
 
   # ============================================================
   # Deploy Address 2 UPRN Lambda
diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile
index 07159357..7d174152 100644
--- a/backend/address2UPRN/handler/Dockerfile
+++ b/backend/address2UPRN/handler/Dockerfile
@@ -6,11 +6,13 @@ ARG DEV_DB_HOST
 ARG DEV_DB_PORT
 ARG DEV_DB_NAME
 ARG EPC_AUTH_TOKEN
+ARG OPEN_EPC_API_TOKEN
 
 ENV DB_HOST=${DEV_DB_HOST}
 ENV DB_PORT=${DEV_DB_PORT}
 ENV DB_NAME=${DEV_DB_NAME}
 ENV EPC_AUTH_TOKEN=${EPC_AUTH_TOKEN}
+ENV OPEN_EPC_API_TOKEN=${OPEN_EPC_API_TOKEN}
 
 
 # Set working directory (Lambda task root)

From 4e21dda328dc4a06ab1eb69e5f44857c1a6cf03f Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Wed, 20 May 2026 16:26:07 +0000
Subject: [PATCH 85/91] rename files in sharepoint to desired structure

---
 scripts/rename_sharepoint_files.py          | 128 ++++++++++++++++++++
 utils/sharepoint/domna_sharepoint_client.py |   9 ++
 utils/sharepoint/sharepoint_client.py       |  11 ++
 3 files changed, 148 insertions(+)
 create mode 100644 scripts/rename_sharepoint_files.py

diff --git a/scripts/rename_sharepoint_files.py b/scripts/rename_sharepoint_files.py
new file mode 100644
index 00000000..881b96ef
--- /dev/null
+++ b/scripts/rename_sharepoint_files.py
@@ -0,0 +1,128 @@
+"""
+Rename files in SharePoint property folders to the canonical format:
+    {UPRN}_{Street} {Postcode}_{Document Name}.ext
+
+Set DRY_RUN = False when ready to commit. Run from repo root.
+Required env vars: SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET,
+                   SHAREPOINT_TENANT_ID, SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID
+"""
+
+import csv
+import os
+from typing import Optional
+
+from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders
+from utils.logger import setup_logger
+from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
+from utils.sharepoint.domna_sites import DomnaSites
+
+DRY_RUN: bool = True
+CSV_PATH: str = "scripts/sero_address_list.csv"
+
+BASE_PATH = (
+    "Osmosis-ACD Projects/Sero-Clarion Housing/"
+    "Sero Project Documents/Property Folders"
+)
+ASSESSMENT_SUBFOLDER = "A. Assessment"
+
+logger = setup_logger()
+
+
+def build_canonical_filename(
+    uprn: str, address: str, postcode: str, original_name: str
+) -> Optional[str]:
+    """
+    Build "{uprn}_{street} {postcode}_{document name}{ext}" for a file name.
+
+    Returns None when original_name already starts with "{uprn}_".
+    A leading address prefix is stripped from the stem case-insensitively,
+    trying address+postcode, address, street+postcode, then street, where
+    street is the part of address before its first comma.
+    """
+    tag = f"{uprn}_"
+    if original_name.startswith(tag):
+        return None
+
+    base, suffix = os.path.splitext(original_name)
+    street = address.split(",")[0].strip()
+    canonical_address = f"{street} {postcode}"
+
+    # Each form is tried with the postcode appended before the bare form.
+    candidates = (
+        f"{address} {postcode}",
+        address,
+        canonical_address,
+        street,
+    )
+    lowered = base.lower()
+    matched = next((c for c in candidates if lowered.startswith(c.lower())), "")
+    remainder = base[len(matched) :]
+
+    # Drop one " - " or " _ " separator left behind by the removed prefix.
+    for separator in (" - ", " _ "):
+        if remainder.startswith(separator):
+            remainder = remainder[len(separator) :]
+            break
+    remainder = remainder.strip()
+
+    if not remainder:
+        return f"{tag}{canonical_address}{suffix}"
+    return f"{tag}{canonical_address}_{remainder}{suffix}"
+
+
+def main() -> None:
+    """Rename, or with DRY_RUN only log, the files found for each row of CSV_PATH."""
+    client = DomnaSharepointClient(DomnaSites.SOCIAL_HOUSING_WAVE_3)
+
+    with open(CSV_PATH, newline="", encoding="utf-8-sig") as csv_file:
+        reader = csv.DictReader(csv_file)
+        required = {"UPRN", "Address", "Postcode"}
+        if not reader.fieldnames or not required.issubset(set(reader.fieldnames)):
+            raise ValueError(
+                f"CSV missing required columns. Expected {required}, got {reader.fieldnames}"
+            )
+
+        for record in reader:
+            uprn, address, postcode = (
+                record[column].strip() for column in ("UPRN", "Address", "Postcode")
+            )
+            folder_path = (
+                f"{BASE_PATH}/{address}, {postcode}"
+                f"/{SharepointSubfolders.ASSESSMENT.value}/{ASSESSMENT_SUBFOLDER}"
+            )
+
+            try:
+                listing = client.get_folders_in_path(folder_path)
+            except ValueError:
+                logger.warning(f"Missing folder for UPRN {uprn}: {folder_path}")
+                continue
+
+            files = (child for child in listing.get("value", []) if "file" in child)
+            for entry in files:
+                original_name: str = entry["name"]
+                new_name = build_canonical_filename(
+                    uprn, address, postcode, original_name
+                )
+                if new_name is None:
+                    # The name already starts with "{uprn}_"; nothing to rename.
+                    continue
+
+                if DRY_RUN:
+                    logger.info(
+                        f'[DRY RUN] Renaming: "{original_name}" → "{new_name}" (UPRN: {uprn})'
+                    )
+                    continue
+
+                try:
+                    client.rename_file(entry["id"], new_name)
+                    logger.info(
+                        f'Renamed: "{original_name}" → "{new_name}" (UPRN: {uprn})'
+                    )
+                except Exception as e:
+                    logger.error(
+                        f'Failed to rename "{original_name}" → "{new_name}" (UPRN: {uprn}): {e}'
+                    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/sharepoint/domna_sharepoint_client.py b/utils/sharepoint/domna_sharepoint_client.py
index 5e0255ac..3e9168ba 100644
--- a/utils/sharepoint/domna_sharepoint_client.py
+++ b/utils/sharepoint/domna_sharepoint_client.py
@@ -125,6 +125,15 @@ class DomnaSharepointClient:
         self.logger.debug(f"Downloaded SharePoint file to: {local_path}")
         return True
 
+    def rename_file(self, item_id: str, new_name: str) -> None:
+        """Rename the drive item item_id to new_name via a new SharePointClient."""
+        SharePointClient(
+            tenant_id=self.sharepoint_tenant_id,
+            client_id=self.sharepoint_client_id,
+            client_secret=self.sharepoint_client_secret,
+            site_id=self.sharepoint_drive.value,
+        ).rename_file(item_id, new_name)
+
     def create_temp_file(self, content: BytesIO, path: str):
         # Ensure the path is under /tmp/
         new_path = os.path.join("/tmp/sharepoint", path)
diff --git a/utils/sharepoint/sharepoint_client.py b/utils/sharepoint/sharepoint_client.py
index 5807c3bd..38107dbf 100644
--- a/utils/sharepoint/sharepoint_client.py
+++ b/utils/sharepoint/sharepoint_client.py
@@ -335,6 +335,17 @@ class SharePointClient:
             if retry == "retry":
                 return self.upload_file(file_name, sharepoint_parent_id, file_stream)
 
+    @api_call_decorator
+    def rename_file(self, item_id: str, new_name: str) -> None:
+        """
+        Rename a drive item in place: PATCH /drives/{drive_id}/items/{item_id}.
+
+        Builds the (method, url, body) triple; callers should discard the result.
+        """
+        endpoint = f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/items/{item_id}"
+        payload: Dict[str, Any] = {"name": new_name}
+        return "PATCH", endpoint, payload  # type: ignore[return-value]
+
     @staticmethod
     def download_sharepoint_file(download_url: str) -> BytesIO:
         """

From e5583aac1f693fa58ed1d1f5501751d97b38bd01 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 17:36:20 +0000
Subject: [PATCH 86/91] some excel files are formatted differently

---
 .../postcode_splitter/local_handler/invoke_local_lambda.py | 2 +-
 infrastructure/csv_s3_client.py                            | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/applications/postcode_splitter/local_handler/invoke_local_lambda.py b/applications/postcode_splitter/local_handler/invoke_local_lambda.py
index 21fa9b9e..17d7e345 100755
--- a/applications/postcode_splitter/local_handler/invoke_local_lambda.py
+++ b/applications/postcode_splitter/local_handler/invoke_local_lambda.py
@@ -14,7 +14,7 @@ payload = {
                 {
                     "task_id": "f4b3332f-c0cc-481f-96a5-d39860a647cf",
                     "sub_task_id": "14c042de-40c4-473b-8cd8-72c983a94a8d",
-                    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv",
+                    "s3_uri": "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2.csv",
                 }
             )
         }
diff --git a/infrastructure/csv_s3_client.py b/infrastructure/csv_s3_client.py
index 055d1ce3..8af8de73 100644
--- a/infrastructure/csv_s3_client.py
+++ b/infrastructure/csv_s3_client.py
@@ -13,7 +13,12 @@ class CsvS3Client(S3Client):
                 f"s3_uri bucket {bucket!r} does not match client bucket {self.bucket!r}"
             )
         raw = self.get_object(key)
-        text = raw.decode("utf-8-sig")
+        try:
+            text = raw.decode("utf-8-sig")
+        except UnicodeDecodeError:
+            # Some uploads are Windows-1252 (e.g. £ as byte 0xA3), not UTF-8.
+            text = raw.decode("cp1252")
+
         reader = csv.DictReader(StringIO(text))
         return [dict(row) for row in reader]
 

From 714478a99a7a221e26367bb2a762d1a31f938ac0 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Wed, 20 May 2026 17:51:45 +0000
Subject: [PATCH 87/91] clean up sanitise postcode

---
 backend/epc_client/epc_client_service.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/backend/epc_client/epc_client_service.py b/backend/epc_client/epc_client_service.py
index 86caeea3..72dbf142 100644
--- a/backend/epc_client/epc_client_service.py
+++ b/backend/epc_client/epc_client_service.py
@@ -47,8 +47,14 @@ class EpcClientService:
         latest = max(results, key=lambda r: r.registration_date)
         return self.get_by_certificate_number(latest.certificate_number)
 
+    @staticmethod
+    def _normalise_postcode(postcode: str) -> str:
+        """Return the postcode with all whitespace removed and uppercased."""
+        return "".join(postcode.split()).upper()
+
     def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]:
-        return call_with_retry(lambda: self._search(postcode=postcode))
+        normalised = self._normalise_postcode(postcode)
+        return call_with_retry(lambda: self._search(postcode=normalised))
 
     # ------------------------------------------------------------------
     # Private helperEpcRateLimpolarss

From c5ab795f851402145bc7ed65e3b17a10cd8cd494 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 21 May 2026 09:46:47 +0000
Subject: [PATCH 88/91] redeploy old postcode splitter

---
 .github/workflows/deploy_terraform.yml        |  4 +++-
 asset_list/app.py                             | 13 ++++++-----
 .../terraform/lambda/postcodeSplitter/main.tf | 22 +++++++++++++++++--
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index 8ba473ca..1af90291 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -169,7 +169,9 @@ jobs:
     uses: ./.github/workflows/_build_image.yml
     with:
       ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }}
-      dockerfile_path: applications/postcode_splitter/Dockerfile
+      # dockerfile_path: applications/postcode_splitter/Dockerfile
+      # Switch back to the old postcode_splitter due to hyde priority - interface for the new one isn't working atm
+      dockerfile_path:  backend/postcode_splitter/handler/Dockerfile
       build_context: .
       build_args: |
         DEV_DB_HOST=$DEV_DB_HOST
diff --git a/asset_list/app.py b/asset_list/app.py
index 9b10d7f3..424f4df6 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -79,23 +79,23 @@ def app():
     """
 
     data_folder = "/workspaces/model/asset_list"
-    data_filename = "lincs_address_list.xlsx"
-    sheet_name = "Sheet1"
+    data_filename = "hyde.xlsx"
+    sheet_name = "AddressProfilingResults"
     postcode_column = "Postcode"
-    address1_column = "Deal Name"
+    address1_column = "Address"
     address1_method = None
-    fulladdress_column = "Deal Name"
+    fulladdress_column = "Postcode"
     address_cols_to_concat = []
     missing_postcodes_method = None
     landlord_year_built = None
     landlord_os_uprn = None
-    landlord_property_type = None  # Good to include if landlord gave
+    landlord_property_type = "Property Type"  # Good to include if landlord gave
     landlord_built_form = None  # Good to include if landlord gave
     landlord_wall_construction = None
     landlord_roof_construction = None
     landlord_heating_system = None
     landlord_existing_pv = None
-    landlord_property_id = "landlord_id"
+    landlord_property_id = "Organisation Reference"
     landlord_sap = None
     outcomes_filename = None
     outcomes_sheetname = None
@@ -468,3 +468,4 @@ def app():
                 asset_list.duplicated_addresses.to_excel(
                     writer, sheet_name="Duplicate Properties", index=False
                 )
+
diff --git a/deployment/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf
index 721cb2ea..e04ae00f 100644
--- a/deployment/terraform/lambda/postcodeSplitter/main.tf
+++ b/deployment/terraform/lambda/postcodeSplitter/main.tf
@@ -38,8 +38,26 @@ module "lambda" {
     {
       STAGE     = var.stage
       LOG_LEVEL = "info"
-      POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username
-      POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password
+      # POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username
+      # POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password
+      # Switch back to the old postcode_splitter due to hyde priority - interface for the new one isn't working atm
+      DB_USERNAME = local.db_credentials.db_assessment_model_username
+      DB_PASSWORD = local.db_credentials.db_assessment_model_password
+      # Placeholder values so backend/app/config.py Settings doesn't fall back to "changeme"
+      GOOGLE_SOLAR_API_KEY = "test"
+      SAP_PREDICTIONS_BUCKET = "test"
+      CARBON_PREDICTIONS_BUCKET = "test"
+      HEAT_PREDICTIONS_BUCKET = "test"
+      HEATING_KWH_PREDICTIONS_BUCKET = "test"
+      HOTWATER_KWH_PREDICTIONS_BUCKET = "test"
+      API_KEY = "test"
+      ENVIRONMENT = "test"
+      SECRET_KEY = "test"
+      PLAN_TRIGGER_BUCKET = "test"
+      DATA_BUCKET = "test"
+      EPC_AUTH_TOKEN = "test"
+      ENGINE_SQS_URL = "test"
+      ENERGY_ASSESSMENTS_BUCKET = "test"
       ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url
       S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
     },

From 856ea6eb9358f10e89e6b574a3a4367b0e92a874 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 21 May 2026 10:12:08 +0000
Subject: [PATCH 89/91] undo postcodesplitter changes

---
 .github/workflows/deploy_terraform.yml        |  4 +---
 .../terraform/lambda/postcodeSplitter/main.tf | 22 ++-----------------
 2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index 1af90291..8ba473ca 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -169,9 +169,7 @@ jobs:
     uses: ./.github/workflows/_build_image.yml
     with:
       ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }}
-      # dockerfile_path: applications/postcode_splitter/Dockerfile
-      # Switch back to the old postcode_splitter due to hyde priority - interface for the new one isn't working atm
-      dockerfile_path:  backend/postcode_splitter/handler/Dockerfile
+      dockerfile_path: applications/postcode_splitter/Dockerfile
       build_context: .
       build_args: |
         DEV_DB_HOST=$DEV_DB_HOST
diff --git a/deployment/terraform/lambda/postcodeSplitter/main.tf b/deployment/terraform/lambda/postcodeSplitter/main.tf
index e04ae00f..721cb2ea 100644
--- a/deployment/terraform/lambda/postcodeSplitter/main.tf
+++ b/deployment/terraform/lambda/postcodeSplitter/main.tf
@@ -38,26 +38,8 @@ module "lambda" {
     {
       STAGE     = var.stage
       LOG_LEVEL = "info"
-      # POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username
-      # POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password
-      # Switch back to the old postcode_splitter due to hyde priority - interface for the new one isn't working atm
-      DB_USERNAME = local.db_credentials.db_assessment_model_username
-      DB_PASSWORD = local.db_credentials.db_assessment_model_password
-      # Placeholder values so backend/app/config.py Settings doesn't fall back to "changeme"
-      GOOGLE_SOLAR_API_KEY = "test"
-      SAP_PREDICTIONS_BUCKET = "test"
-      CARBON_PREDICTIONS_BUCKET = "test"
-      HEAT_PREDICTIONS_BUCKET = "test"
-      HEATING_KWH_PREDICTIONS_BUCKET = "test"
-      HOTWATER_KWH_PREDICTIONS_BUCKET = "test"
-      API_KEY = "test"
-      ENVIRONMENT = "test"
-      SECRET_KEY = "test"
-      PLAN_TRIGGER_BUCKET = "test"
-      DATA_BUCKET = "test"
-      EPC_AUTH_TOKEN = "test"
-      ENGINE_SQS_URL = "test"
-      ENERGY_ASSESSMENTS_BUCKET = "test"
+      POSTGRES_USERNAME = local.db_credentials.db_assessment_model_username
+      POSTGRES_PASSWORD = local.db_credentials.db_assessment_model_password
       ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url
       S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
     },

From dbd03de842933fa189de077d48e5c13ecf9729f4 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 21 May 2026 10:37:13 +0000
Subject: [PATCH 90/91] local run changes

---
 .../postcode_splitter/local_handler/invoke_local_lambda.py  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/applications/postcode_splitter/local_handler/invoke_local_lambda.py b/applications/postcode_splitter/local_handler/invoke_local_lambda.py
index 17d7e345..5f4b1d36 100755
--- a/applications/postcode_splitter/local_handler/invoke_local_lambda.py
+++ b/applications/postcode_splitter/local_handler/invoke_local_lambda.py
@@ -12,9 +12,9 @@ payload = {
         {
             "body": json.dumps(
                 {
-                    "task_id": "f4b3332f-c0cc-481f-96a5-d39860a647cf",
-                    "sub_task_id": "14c042de-40c4-473b-8cd8-72c983a94a8d",
-                    "s3_uri": "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2.csv",
+                    "task_id": "e295d89b-a7c5-4a9a-8b4e-b405fab1f298",
+                    "sub_task_id": "f4a9944f-41f0-4a33-8669-5016ec574068",
+                    "s3_uri": "s3://retrofit-data-dev/bulk_onboarding_inputs/hyde2 (1).csv",
                 }
             )
         }

From 9f7c16ccbd35e00d081701d5b46393ba3736278d Mon Sep 17 00:00:00 2001
From: Daniel Roth <daniel_roth@hotmail.co.uk>
Date: Thu, 21 May 2026 15:30:03 +0000
Subject: [PATCH 91/91] add address list

---
 scripts/sero_address_list.csv | 51 +++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 scripts/sero_address_list.csv

diff --git a/scripts/sero_address_list.csv b/scripts/sero_address_list.csv
new file mode 100644
index 00000000..8c9401c9
--- /dev/null
+++ b/scripts/sero_address_list.csv
@@ -0,0 +1,51 @@
+﻿UPRN,Address,Postcode
+U1035052,"1 Sudbury Crescent, Bromley",BR1 4PY
+U1027449,"11 Station Road, Bromley",BR1 3LP
+U1021310,"126 Faringdon Avenue, Bromley",BR2 8BU
+U1010811,"13 Gilbert Road, Bromley",BR1 3QP
+U1024017,"13 Manor Way, Bromley",BR2 8ES
+U1042232,"154 Southover, Bromley",BR1 4RZ
+U1009369,"17 Minster Road, Bromley",BR1 4DY
+U1022305,"18a Lansdowne Road, Bromley",BR1 3LZ
+U1033165,"2 Laburnum Way, Bromley",BR2 8BZ
+U1035326,"2 Whitebeam Avenue, Bromley",BR2 8DL
+U1037872,"20 Sudbury Crescent, Bromley",BR1 4PZ
+U1007432,"21 Detling Road, Bromley",BR1 4SH
+U1005123,"24 Bonville Road, Bromley",BR1 4QA
+U1034810,"24 Newbury Road, Bromley",BR2 0QW
+U1020351,"27 Laburnum Way, Bromley",BR2 8BY
+U1009511,"27 Newbury Road, Bromley",BR2 0QN
+U1034985,"272 Southborough Lane, Bromley",BR2 8AS
+U1037954,"28 Treewall Gardens, Bromley",BR1 5BT
+U1038103,"29 Whitebeam Avenue, Bromley",BR2 8DJ
+U1013358,"3 Bird In Hand Lane, Bromley",BR1 2NA
+U1024709,"3 Parkfield Way, Bromley",BR2 8AE
+U1031058,"303 Keedonwood Road, Bromley",BR1 4QR
+U1014077,"32 Aylesbury Road, Bromley",BR2 0QP
+U1019564,"32 Brook Lane, Bromley",BR1 4PU
+U1020237,"33 Hornbeam Way, Bromley",BR2 8DB
+U1027493,"35 Sudbury Crescent, Bromley",BR1 4PY
+U1042298,"39 Sudbury Crescent, Bromley",BR1 4PY
+U1024698,"4 Palace View, Bromley",BR1 3EL
+U1052186,"4 Ravensleigh Gardens, Bromley",BR1 5SN
+U1042153,"4 Scotts Road, Bromley",BR1 3QD
+U1037814,"42 Stanley Road, Bromley",BR2 9JH
+U1014078,"43 Aylesbury Road, Bromley",BR2 0QR
+U1007701,"46 Harwood Avenue, Bromley",BR1 3DU
+U1036758,"46 Newbury Road, Bromley",BR2 0QW
+U1025820,"46 Princes Plain, Bromley",BR2 8LE
+U1022991,"5 Link Way, Bromley",BR2 8JH
+U1024484,"55 Mounthurst Road, Bromley",BR2 7PG
+U1014793,"59 Headcorn Road, Bromley",BR1 4SQ
+U1037465,"6 Princes Plain, Bromley",BR2 8LE
+U1009202,"63 Mead Way, Bromley",BR2 9ER
+U1021353,"66 George Lane, Bromley",BR2 7LQ
+U1042733,"68 Whitebeam Avenue, Bromley",BR2 8DL
+U1030962,"7 Ravensleigh Gardens, Bromley",BR1 5SN
+U1031294,"70 London Lane, Bromley",BR1 4HE
+U1037450,"70 Pontefract Road, Bromley",BR1 4RB
+U1014589,"71 Empress Drive, Chislehurst",BR7 5BQ
+U1052429,"76 Southover, Bromley",BR1 4RY
+U1020199,"78 Hillside Road, Bromley",BR2 0ST
+U1024511,"81 Nightingale Lane, Bromley",BR1 2SA
+U1009194,"84 Mays Hill Road, Bromley",BR2 0HT