From 4dbafe3992a30a652480e3a478a061aa8eec8237 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 16 Apr 2026 22:21:54 +0000
Subject: [PATCH 1/3] added local devconaitner

---
 asset_list/app.py                             |  12 +-
 backend/export/property_scenarios/main.py     |  69 +++++------
 .../scripts/combine_address2uprn_outputs.py   |   2 +
 devcontainer.sh                               | 110 ++++++++++++++++++
 etl/hubspot/scripts/scraper/bulk_load.py      |   4 +-
 5 files changed, 156 insertions(+), 41 deletions(-)
 create mode 100644 devcontainer.sh

diff --git a/asset_list/app.py b/asset_list/app.py
index b0030667..25c72bda 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -74,23 +74,23 @@ def app():
     """
 
     data_folder = "/workspaces/model/asset_list"
-    data_filename = "Waverley UPRN Match.xlsx"
+    data_filename = "foom (2).xlsx"
     sheet_name = "in"
     postcode_column = "postcode_clean"
-    address1_column = "domna_found_address"
+    address1_column = "address2uprn_address"
     address1_method = None
-    fulladdress_column = "domna_found_address"
+    fulladdress_column = "address2uprn_address"
     address_cols_to_concat = []
     missing_postcodes_method = None
     landlord_year_built = None
-    landlord_os_uprn = "domna_found_uprn"
-    landlord_property_type = "Property Type 1"  # Good to include if landlord gave
+    landlord_os_uprn = "address2uprn_uprn"
+    landlord_property_type = None  # Good to include if landlord gave
     landlord_built_form = None  # Good to include if landlord gave
     landlord_wall_construction = None
     landlord_roof_construction = None
     landlord_heating_system = None
     landlord_existing_pv = None
-    landlord_property_id = "WBC Ref"
+    landlord_property_id = "UPRN"
     landlord_sap = None
     outcomes_filename = None
     outcomes_sheetname = None
diff --git a/backend/export/property_scenarios/main.py b/backend/export/property_scenarios/main.py
index f3ea0100..64627e01 100644
--- a/backend/export/property_scenarios/main.py
+++ b/backend/export/property_scenarios/main.py
@@ -26,15 +26,14 @@ def has_solar_with_battery(materials_list: Optional[List[Dict[str, Any]]]) -> bo
     :return:
     """
     for m in materials_list or []:
-        if (
-            m.get("type") == "solar_pv"
-            and m.get("includes_battery") is True
-        ):
+        if m.get("type") == "solar_pv" and m.get("includes_battery") is True:
             return True
     return False
 
 
-def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str, int], pd.DataFrame]:
+def process_export(
+    payload: ExportRequest, session: Session
+) -> Dict[Union[str, int], pd.DataFrame]:
     export_files: Dict[Union[str, int], pd.DataFrame] = {}
 
     db_methods = DbMethods(session)
@@ -52,7 +51,9 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str,
     logger.info("Retrieved %s plans for export", len(plans_df))
 
     if plans_df.empty:
-        logger.info("Empty plans dataframe - no plans to export. Returning empty export.")
+        logger.info(
+            "Empty plans dataframe - no plans to export. Returning empty export."
+        )
         return export_files
     plan_ids: List[int] = plans_df["id"].tolist()
     recommendations_df: pd.DataFrame = db_methods.get_recommendations(plan_ids)
@@ -61,13 +62,12 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str,
 
     recommendations_df = db_methods.attach_materials(recommendations_df)
 
-    recommendations_df["has_solar_with_battery"] = (
-        recommendations_df["materials"].apply(has_solar_with_battery)
-    )
+    recommendations_df["has_solar_with_battery"] = recommendations_df[
+        "materials"
+    ].apply(has_solar_with_battery)
 
-    _filter = (
-        (recommendations_df["measure_type"] == "solar_pv")
-        & (recommendations_df["has_solar_with_battery"])
+    _filter = (recommendations_df["measure_type"] == "solar_pv") & (
+        recommendations_df["has_solar_with_battery"]
     )
 
     recommendations_df.loc[_filter, "measure_type"] = (
@@ -83,10 +83,13 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str,
         else:
             scenario_recs = recommendations_df[
                 recommendations_df["scenario_id"] == group_key
-                ]
+            ]
 
         if scenario_recs.empty:
-            logger.info("No recommendations found for group_key %s - skipping export for this group", group_key)
+            logger.info(
+                "No recommendations found for group_key %s - skipping export for this group",
+                group_key,
+            )
             continue
 
         measures_df: pd.DataFrame = scenario_recs[
@@ -99,14 +102,12 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str,
             values="estimated_cost",
         ).reset_index()
 
-        pivot["total_retrofit_cost"] = (
-            pivot.drop(columns=["property_id", "plan_name"]).sum(axis=1)
-        )
+        pivot["total_retrofit_cost"] = pivot.drop(
+            columns=["property_id", "plan_name"]
+        ).sum(axis=1)
 
         post_sap: pd.DataFrame = (
-            scenario_recs.groupby("property_id")[["sap_points"]]
-            .sum()
-            .reset_index()
+            scenario_recs.groupby("property_id")[["sap_points"]].sum().reset_index()
         )
 
         df: pd.DataFrame = (
@@ -117,7 +118,9 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str,
 
         df["sap_points"] = df["sap_points"].fillna(0)
         df["predicted_post_works_sap"] = df["current_sap_points"] + df["sap_points"]
-        df["predicted_post_works_epc"] = df["predicted_post_works_sap"].apply(sap_to_epc)
+        df["predicted_post_works_epc"] = df["predicted_post_works_sap"].apply(
+            sap_to_epc
+        )
 
         export_files[group_key] = df
 
@@ -128,22 +131,17 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str,
 # Lambda Handler
 # ============================================================
 
-def handler(event: Mapping[str, Any], context: Optional[Any]) -> Mapping[str, Union[int, str]]:
+
+def handler(
+    event: Mapping[str, Any], context: Optional[Any]
+) -> Mapping[str, Union[int, str]]:
     """
     Example event:
     body_dict = {
         "task_id": "test",
         "subtask_id": "test",
-        "portfolio_id": 655,
-        "scenario_ids": [],
-        "default_plans_only": True,
-    }
-
-    body_dict = {
-        "task_id": "test",
-        "subtask_id": "test",
-        "portfolio_id": 655,
-        "scenario_ids": [1174],
+        "portfolio_id": 682,
+        "scenario_ids": [1210],
         "default_plans_only": False,
     }
     :param event: Lambda event containing export request details
@@ -168,7 +166,12 @@ def handler(event: Mapping[str, Any], context: Optional[Any]) -> Mapping[str, Un
                 exported_files = process_export(payload, session)
 
             # TODO: Need to handle the exported files - e.g. upload to s3 and email a presigned url
-            _ = exported_files
+            output_path = f"/tmp/export_{payload.portfolio_id}.xlsx"
+            with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
+                for group_key, df in exported_files.items():
+                    sheet_name = str(group_key)[:31]  # Excel sheet names max 31 chars
+                    df.to_excel(writer, sheet_name=sheet_name, index=False)
+            logger.info("Exported files written to %s", output_path)
             return {
                 "statusCode": 200,
                 "body": json.dumps({}),
diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py
index 105b8639..085240a9 100644
--- a/backend/scripts/combine_address2uprn_outputs.py
+++ b/backend/scripts/combine_address2uprn_outputs.py
@@ -31,6 +31,8 @@ def download_csv(key):
 
 
 def main(task_id, output):
+    task_id = "3fb9a9b7-ff49-4c11-b9e1-9d00da955a75"
+
     print(f"Scanning task: {task_id}")
 
     csv_files = list_csv_files(task_id)
diff --git a/devcontainer.sh b/devcontainer.sh
new file mode 100644
index 00000000..32908a9e
--- /dev/null
+++ b/devcontainer.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+#
+# dc.sh — devcontainer helper for this repo
+#
+# Usage:
+#   devcontainer.sh <config> <command>
+#
+# Configs:   backend | asset_list
+# Commands:  up, shell, down, rebuild
+#
+# `shell` auto-runs `up` first if the container isn't already running,
+# so it's safe to call cold.
+#
+# Examples:
+#   ./scripts/dc.sh backend shell       # up + exec bash
+#   ./scripts/dc.sh asset_list up
+#   ./scripts/dc.sh backend rebuild
+#   ./scripts/dc.sh backend down
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." &>/dev/null && pwd)"
+
+VALID_CONFIGS=(backend asset_list)
+VALID_COMMANDS=(up shell down rebuild)
+
+# --- helpers ---------------------------------------------------------------
+
+usage() {
+  sed -n '3,20p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//'
+  exit "${1:-0}"
+}
+
+die() {
+  echo "error: $*" >&2
+  exit 1
+}
+
+in_list() {
+  # in_list <needle> <haystack...>
+  local needle="$1"
+  shift
+  local item
+  for item in "$@"; do
+    [[ "${item}" == "${needle}" ]] && return 0
+  done
+  return 1
+}
+
+container_id_for() {
+  # Find a running container for the given config path via devcontainer labels.
+  local config_path="$1"
+  docker ps -q \
+    --filter "label=devcontainer.local_folder=${REPO_ROOT}" \
+    --filter "label=devcontainer.config_file=${config_path}"
+}
+
+# --- argument parsing ------------------------------------------------------
+
+[[ $# -eq 2 ]] || usage 1
+
+CONFIG_NAME="$1"
+COMMAND="$2"
+
+in_list "${CONFIG_NAME}" "${VALID_CONFIGS[@]}" \
+  || die "invalid config '${CONFIG_NAME}' (expected: ${VALID_CONFIGS[*]})"
+in_list "${COMMAND}" "${VALID_COMMANDS[@]}" \
+  || die "invalid command '${COMMAND}' (expected: ${VALID_COMMANDS[*]})"
+
+CONFIG_PATH="${REPO_ROOT}/.devcontainer/${CONFIG_NAME}/devcontainer.json"
+[[ -f "${CONFIG_PATH}" ]] || die "config not found: ${CONFIG_PATH}"
+
+DC_ARGS=(--workspace-folder "${REPO_ROOT}" --config "${CONFIG_PATH}")
+
+# --- dispatch --------------------------------------------------------------
+
+case "${COMMAND}" in
+  up)
+    echo ">> bringing up '${CONFIG_NAME}'"
+    devcontainer up "${DC_ARGS[@]}"
+    ;;
+
+  shell)
+    # Auto-up if not already running. `devcontainer up` is idempotent —
+    # it reuses an existing container, so this is cheap on warm starts.
+    if [[ -z "$(container_id_for "${CONFIG_PATH}")" ]]; then
+      echo ">> '${CONFIG_NAME}' not running, bringing it up first"
+      devcontainer up "${DC_ARGS[@]}"
+    fi
+    echo ">> attaching shell to '${CONFIG_NAME}'"
+    devcontainer exec "${DC_ARGS[@]}" bash 2>/dev/null \
+      || devcontainer exec "${DC_ARGS[@]}" sh
+    ;;
+
+  down)
+    cid="$(container_id_for "${CONFIG_PATH}")"
+    if [[ -z "${cid}" ]]; then
+      echo ">> '${CONFIG_NAME}' not running, nothing to stop"
+      exit 0
+    fi
+    echo ">> stopping '${CONFIG_NAME}'"
+    docker stop "${cid}"
+    ;;
+
+  rebuild)
+    echo ">> rebuilding '${CONFIG_NAME}' from scratch"
+    devcontainer up "${DC_ARGS[@]}" --remove-existing-container --build-no-cache
+    ;;
+esac
diff --git a/etl/hubspot/scripts/scraper/bulk_load.py b/etl/hubspot/scripts/scraper/bulk_load.py
index 91aa89e2..f0529905 100644
--- a/etl/hubspot/scripts/scraper/bulk_load.py
+++ b/etl/hubspot/scripts/scraper/bulk_load.py
@@ -9,8 +9,8 @@ PIPELINE_ID = Pipeline.OPERATIONS_SOCIAL_HOUSING.value
 companies = list(
     [
         # Companies.THE_GUINESS_PARTNERSHIP,
-        # Companies.SOUTHERN_HOUSING_GROUP,
-        Companies.CALICO_HOMES,
+        Companies.SOUTHERN_HOUSING_GROUP,
+        # Companies.CALICO_HOMES,
     ]
 )
 

From c188a3ad2615a9d8ed4ec100e295a99456e97ad7 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 17 Apr 2026 14:50:57 +0000
Subject: [PATCH 2/3] add dev container

---
 devcontainer.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/devcontainer.sh b/devcontainer.sh
index 32908a9e..ba35e7f0 100644
--- a/devcontainer.sh
+++ b/devcontainer.sh
@@ -3,7 +3,7 @@
 # dc.sh — devcontainer helper for this repo
 #
 # Usage:
-#   devcontainer.sh <config> <command>
+#   ./devcontainer.sh <config> <command>
 #
 # Configs:   backend | asset_list
 # Commands:  up, shell, down, rebuild
@@ -12,15 +12,15 @@
 # so it's safe to call cold.
 #
 # Examples:
-#   ./scripts/dc.sh backend shell       # up + exec bash
-#   ./scripts/dc.sh asset_list up
-#   ./scripts/dc.sh backend rebuild
-#   ./scripts/dc.sh backend down
+#   ./devcontainer.sh backend shell       # up + exec bash
+#   ./devcontainer.sh asset_list up
+#   ./devcontainer.sh backend rebuild
+#   ./devcontainer.sh backend down
 
 set -euo pipefail
 
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." &>/dev/null && pwd)"
+REPO_ROOT="${SCRIPT_DIR}"
 
 VALID_CONFIGS=(backend asset_list)
 VALID_COMMANDS=(up shell down rebuild)

From ec4c870465499d66449e3b2efd855a62e9b92769 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 17 Apr 2026 19:08:34 +0000
Subject: [PATCH 3/3] added bulk address 2 uprn lmabda

---
 .github/workflows/deploy_terraform.yml        | 40 ++++++++++
 .gitignore                                    |  5 ++
 backend/app/db/models/bulk_address_uploads.py | 30 ++++++++
 .../bulk_address2uprn_combiner/__init__.py    |  0
 .../handler/Dockerfile                        | 23 ++++++
 .../handler/requirements.txt                  |  7 ++
 backend/bulk_address2uprn_combiner/main.py    | 74 +++++++++++++++++++
 .../lambda/bulk_address2uprn_combiner/main.tf | 44 +++++++++++
 .../bulk_address2uprn_combiner/outputs.tf     | 14 ++++
 .../bulk_address2uprn_combiner/provider.tf    | 16 ++++
 .../bulk_address2uprn_combiner/variables.tf   | 38 ++++++++++
 infrastructure/terraform/shared/main.tf       | 28 +++++++
 12 files changed, 319 insertions(+)
 create mode 100644 backend/app/db/models/bulk_address_uploads.py
 create mode 100644 backend/bulk_address2uprn_combiner/__init__.py
 create mode 100644 backend/bulk_address2uprn_combiner/handler/Dockerfile
 create mode 100644 backend/bulk_address2uprn_combiner/handler/requirements.txt
 create mode 100644 backend/bulk_address2uprn_combiner/main.py
 create mode 100644 infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf
 create mode 100644 infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf
 create mode 100644 infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf
 create mode 100644 infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf

diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index 22f16fee..7e1281b7 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -201,6 +201,46 @@ jobs:
       AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
       AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
 
+  # ============================================================
+  # Build Bulk Address2UPRN Combiner image and Push
+  # ============================================================
+  bulk_address2uprn_combiner_image:
+    needs: [determine_stage, shared_terraform]
+    uses: ./.github/workflows/_build_image.yml
+    with:
+      ecr_repo: bulk_address2uprn_combiner-${{ needs.determine_stage.outputs.stage }}
+      dockerfile_path: backend/bulk_address2uprn_combiner/handler/Dockerfile
+      build_context: .
+      build_args: |
+        DEV_DB_HOST=$DEV_DB_HOST
+        DEV_DB_PORT=$DEV_DB_PORT
+        DEV_DB_NAME=$DEV_DB_NAME
+    secrets:
+      AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
+      AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
+      DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }}
+      DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
+      DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
+
+  # ============================================================
+  # Deploy Bulk Address2UPRN Combiner Lambda
+  # ============================================================
+  bulk_address2uprn_combiner_lambda:
+    needs: [bulk_address2uprn_combiner_image, determine_stage, shared_terraform]
+    uses: ./.github/workflows/_deploy_lambda.yml
+    with:
+      lambda_name: bulk_address2uprn_combiner
+      lambda_path: infrastructure/terraform/lambda/bulk_address2uprn_combiner
+      stage: ${{ needs.determine_stage.outputs.stage }}
+      ecr_repo: bulk_address2uprn_combiner-${{ needs.determine_stage.outputs.stage }}
+      image_digest: ${{ needs.bulk_address2uprn_combiner_image.outputs.image_digest }}
+      terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
+    secrets:
+      AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
+      AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
+
   # ============================================================
   # Condition ETL image and Push
   # ============================================================
diff --git a/.gitignore b/.gitignore
index 299e03d4..51a32a0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -278,6 +278,11 @@ cache/
 
 *.png
 *.pptx
+*.csv
+*.xlsx
+*.pdf
+**/Chunks/
+*.ipynb
 
 local_data*
 
diff --git a/backend/app/db/models/bulk_address_uploads.py b/backend/app/db/models/bulk_address_uploads.py
new file mode 100644
index 00000000..335a4c45
--- /dev/null
+++ b/backend/app/db/models/bulk_address_uploads.py
@@ -0,0 +1,30 @@
+from typing import Optional
+from uuid import UUID, uuid4
+from datetime import datetime, timezone
+
+from sqlmodel import SQLModel, Field, select
+
+from backend.app.db.connection import get_db_session
+
+
+class BulkAddressUpload(SQLModel, table=True):
+    __tablename__ = "bulk_address_uploads"
+
+    id: UUID = Field(default_factory=uuid4, primary_key=True, index=True)
+    task_id: UUID = Field(foreign_key="tasks.id", index=True)
+    combined_csv_s3_uri: Optional[str] = Field(default=None)
+    updated_at: datetime = Field(default_factory=datetime.utcnow)
+
+
+def set_combined_csv_s3_uri(task_id: UUID, s3_uri: str) -> None:
+    now = datetime.now(timezone.utc)
+    with get_db_session() as session:
+        row = session.exec(
+            select(BulkAddressUpload).where(BulkAddressUpload.task_id == task_id)
+        ).first()
+        if not row:
+            raise ValueError(f"No bulk_address_uploads row for task_id {task_id}")
+        row.combined_csv_s3_uri = s3_uri
+        row.updated_at = now
+        session.add(row)
+        session.commit()
diff --git a/backend/bulk_address2uprn_combiner/__init__.py b/backend/bulk_address2uprn_combiner/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/bulk_address2uprn_combiner/handler/Dockerfile b/backend/bulk_address2uprn_combiner/handler/Dockerfile
new file mode 100644
index 00000000..35f91d09
--- /dev/null
+++ b/backend/bulk_address2uprn_combiner/handler/Dockerfile
@@ -0,0 +1,23 @@
+FROM public.ecr.aws/lambda/python:3.11
+
+ARG DEV_DB_HOST
+ARG DEV_DB_PORT
+ARG DEV_DB_NAME
+
+ENV DB_HOST=${DEV_DB_HOST}
+ENV DB_PORT=${DEV_DB_PORT}
+ENV DB_NAME=${DEV_DB_NAME}
+
+WORKDIR /var/task
+
+COPY backend/bulk_address2uprn_combiner/handler/requirements.txt .
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY utils/ utils/
+COPY backend/ backend/
+COPY datatypes/ datatypes/
+
+COPY backend/bulk_address2uprn_combiner/main.py .
+
+CMD ["main.handler"]
diff --git a/backend/bulk_address2uprn_combiner/handler/requirements.txt b/backend/bulk_address2uprn_combiner/handler/requirements.txt
new file mode 100644
index 00000000..14d13e0d
--- /dev/null
+++ b/backend/bulk_address2uprn_combiner/handler/requirements.txt
@@ -0,0 +1,7 @@
+pandas==2.2.2
+numpy<2.0
+boto3==1.35.44
+sqlmodel
+sqlalchemy==2.0.36
+psycopg2-binary==2.9.10
+pydantic-settings==2.6.0
diff --git a/backend/bulk_address2uprn_combiner/main.py b/backend/bulk_address2uprn_combiner/main.py
new file mode 100644
index 00000000..9b4dc6cb
--- /dev/null
+++ b/backend/bulk_address2uprn_combiner/main.py
@@ -0,0 +1,74 @@
+import os
+import boto3
+import pandas as pd
+from io import BytesIO
+from typing import Any
+from uuid import UUID
+from datetime import datetime, timezone
+
+from utils.logger import setup_logger
+from backend.utils.subtasks import subtask_handler
+from backend.app.db.models.bulk_address_uploads import set_combined_csv_s3_uri
+
+logger = setup_logger()
+
+S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
+
+
+def list_csv_files(s3_client, bucket: str, task_id: str) -> list[str]:
+    paginator = s3_client.get_paginator("list_objects_v2")
+    prefix = f"ara_raw_outputs/{task_id}/"
+    keys = []
+    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+        for obj in page.get("Contents", []):
+            if obj["Key"].endswith(".csv"):
+                keys.append(obj["Key"])
+    return keys
+
+
+def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame:
+    obj = s3_client.get_object(Bucket=bucket, Key=key)
+    return pd.read_csv(BytesIO(obj["Body"].read()))
+
+
+@subtask_handler()
+def handler(body: dict[str, Any], context: Any) -> str:
+    task_id_str: str = body.get("task_id", "")
+
+    if not task_id_str:
+        raise RuntimeError("Missing task_id in message body")
+
+    bucket = S3_BUCKET_NAME
+    if not bucket:
+        raise RuntimeError("S3_BUCKET_NAME env var not set")
+
+    s3 = boto3.client("s3")
+
+    logger.info(f"Combining ara_raw_outputs for task {task_id_str}")
+
+    csv_keys = list_csv_files(s3, bucket, task_id_str)
+    if not csv_keys:
+        raise RuntimeError(f"No CSV files found under ara_raw_outputs/{task_id_str}/")
+
+    logger.info(f"Found {len(csv_keys)} CSV files")
+
+    dfs = [download_csv(s3, bucket, key) for key in csv_keys]
+    combined = pd.concat(dfs, ignore_index=True)
+    logger.info(f"Combined {len(combined)} rows from {len(dfs)} files")
+
+    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
+    output_key = f"bulk_final_outputs/{task_id_str}/combined_{timestamp}.csv"
+
+    csv_buffer = BytesIO()
+    combined.to_csv(csv_buffer, index=False)
+    csv_buffer.seek(0)
+    s3.put_object(Bucket=bucket, Key=output_key, Body=csv_buffer.getvalue())
+
+    s3_uri = f"s3://{bucket}/{output_key}"
+    logger.info(f"Saved combined CSV to {s3_uri}")
+    print(f"OUTPUT_S3_URI: {s3_uri}")
+
+    set_combined_csv_s3_uri(UUID(task_id_str), s3_uri)
+    logger.info(f"Persisted combined_csv_s3_uri for task {task_id_str}")
+
+    return s3_uri
diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf
new file mode 100644
index 00000000..4be4fc20
--- /dev/null
+++ b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf
@@ -0,0 +1,44 @@
+data "terraform_remote_state" "shared" {
+  backend = "s3"
+  config = {
+    bucket = "assessment-model-terraform-state"
+    key    = "env:/${var.stage}/terraform.tfstate"
+    region = "eu-west-2"
+  }
+}
+
+data "aws_secretsmanager_secret_version" "db_credentials" {
+  secret_id = "${var.stage}/assessment_model/db_credentials"
+}
+
+locals {
+  db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)
+}
+
+module "lambda" {
+  source = "../../modules/lambda_with_sqs"
+
+  name  = "bulk-address2uprn-combiner"
+  stage = var.stage
+
+  image_uri = local.image_uri
+
+  timeout     = 900
+  memory_size = 2048
+
+  maximum_concurrency = var.maximum_concurrency
+  batch_size          = var.batch_size
+
+  environment = {
+    STAGE          = var.stage
+    LOG_LEVEL      = "info"
+    S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
+    DB_USERNAME    = local.db_credentials.db_assessment_model_username
+    DB_PASSWORD    = local.db_credentials.db_assessment_model_password
+  }
+}
+
+resource "aws_iam_role_policy_attachment" "bulk_address2uprn_combiner_s3" {
+  role       = module.lambda.role_name
+  policy_arn = data.terraform_remote_state.shared.outputs.bulk_address2uprn_combiner_s3_arn
+}
diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf
new file mode 100644
index 00000000..e5155388
--- /dev/null
+++ b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf
@@ -0,0 +1,14 @@
+output "bulk_address2uprn_combiner_queue_url" {
+  value       = module.lambda.queue_url
+  description = "URL of the bulk_address2uprn_combiner SQS queue"
+}
+
+output "bulk_address2uprn_combiner_queue_arn" {
+  value       = module.lambda.queue_arn
+  description = "ARN of the bulk_address2uprn_combiner SQS queue"
+}
+
+output "bulk_address2uprn_combiner_lambda_arn" {
+  value       = module.lambda.lambda_arn
+  description = "ARN of the bulk_address2uprn_combiner Lambda function"
+}
diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf
new file mode 100644
index 00000000..45422d9f
--- /dev/null
+++ b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf
@@ -0,0 +1,16 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 5.0"
+    }
+  }
+
+  backend "s3" {
+    bucket = "bulk-address2uprn-combiner-terraform-state"
+    key    = "terraform.tfstate"
+    region = "eu-west-2"
+  }
+
+  required_version = ">= 1.2.0"
+}
diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf
new file mode 100644
index 00000000..6e1c84a2
--- /dev/null
+++ b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf
@@ -0,0 +1,38 @@
+variable "lambda_name" {
+  type        = string
+  description = "Logical name of the lambda"
+}
+
+variable "stage" {
+  description = "Deployment stage (e.g. dev, prod)"
+  type        = string
+}
+
+variable "ecr_repo_url" {
+  type        = string
+  description = "ECR repository URL (no tag, no digest)"
+}
+
+variable "image_digest" {
+  type        = string
+  description = "Image digest (sha256:...)"
+}
+
+variable "maximum_concurrency" {
+  type        = number
+  default     = 2
+  description = "Maximum concurrent Lambda invocations from SQS (2-1000)."
+}
+
+variable "batch_size" {
+  type    = number
+  default = 1
+}
+
+locals {
+  image_uri = "${var.ecr_repo_url}@${var.image_digest}"
+}
+
+output "resolved_image_uri" {
+  value = local.image_uri
+}
diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf
index 47866c92..fbd09565 100644
--- a/infrastructure/terraform/shared/main.tf
+++ b/infrastructure/terraform/shared/main.tf
@@ -477,6 +477,34 @@ output "postcode_splitter_s3_read_arn" {
   value = module.postcode_splitter_s3_read.policy_arn
 }
 
+################################################
+# Bulk Address2UPRN Combiner – Lambda ECR
+################################################
+module "bulk_address2uprn_combiner_state_bucket" {
+  source      = "../modules/tf_state_bucket"
+  bucket_name = "bulk-address2uprn-combiner-terraform-state"
+}
+
+module "bulk_address2uprn_combiner_registry" {
+  source = "../modules/container_registry"
+  name   = "bulk_address2uprn_combiner"
+  stage  = var.stage
+}
+
+module "bulk_address2uprn_combiner_s3" {
+  source = "../modules/s3_iam_policy"
+
+  policy_name        = "BulkAddress2UprnCombinerS3"
+  policy_description = "Allow bulk_address2uprn_combiner Lambda to read ara_raw_outputs and write bulk_final_outputs"
+  bucket_arns        = ["arn:aws:s3:::retrofit-data-${var.stage}"]
+  actions            = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"]
+  resource_paths     = ["/ara_raw_outputs/*", "/bulk_final_outputs/*"]
+}
+
+output "bulk_address2uprn_combiner_s3_arn" {
+  value = module.bulk_address2uprn_combiner_s3.policy_arn
+}
+
 ################################################
 # Categorisation – Lambda ECR
 ################################################