From 4dbafe3992a30a652480e3a478a061aa8eec8237 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 16 Apr 2026 22:21:54 +0000 Subject: [PATCH 1/3] added local devconaitner --- asset_list/app.py | 12 +- backend/export/property_scenarios/main.py | 69 +++++------ .../scripts/combine_address2uprn_outputs.py | 2 + devcontainer.sh | 110 ++++++++++++++++++ etl/hubspot/scripts/scraper/bulk_load.py | 4 +- 5 files changed, 156 insertions(+), 41 deletions(-) create mode 100644 devcontainer.sh diff --git a/asset_list/app.py b/asset_list/app.py index b0030667..25c72bda 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -74,23 +74,23 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - data_filename = "Waverley UPRN Match.xlsx" + data_filename = "foom (2).xlsx" sheet_name = "in" postcode_column = "postcode_clean" - address1_column = "domna_found_address" + address1_column = "address2uprn_address" address1_method = None - fulladdress_column = "domna_found_address" + fulladdress_column = "address2uprn_address" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "domna_found_uprn" - landlord_property_type = "Property Type 1" # Good to include if landlord gave + landlord_os_uprn = "address2uprn_uprn" + landlord_property_type = None # Good to include if landlord gave landlord_built_form = None # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "WBC Ref" + landlord_property_id = "UPRN" landlord_sap = None outcomes_filename = None outcomes_sheetname = None diff --git a/backend/export/property_scenarios/main.py b/backend/export/property_scenarios/main.py index f3ea0100..64627e01 100644 --- a/backend/export/property_scenarios/main.py +++ b/backend/export/property_scenarios/main.py @@ -26,15 +26,14 @@ def has_solar_with_battery(materials_list: Optional[List[Dict[str, Any]]]) -> bo :return: """ for m in materials_list or []: - if ( - m.get("type") == "solar_pv" - and m.get("includes_battery") is True - ): + if m.get("type") == "solar_pv" and m.get("includes_battery") is True: return True return False -def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str, int], pd.DataFrame]: +def process_export( + payload: ExportRequest, session: Session +) -> Dict[Union[str, int], pd.DataFrame]: export_files: Dict[Union[str, int], pd.DataFrame] = {} db_methods = DbMethods(session) @@ -52,7 +51,9 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str, logger.info("Retrieved %s plans for export", len(plans_df)) if plans_df.empty: - logger.info("Empty plans dataframe - no plans to export. Returning empty export.") + logger.info( + "Empty plans dataframe - no plans to export. Returning empty export." + ) return export_files plan_ids: List[int] = plans_df["id"].tolist() recommendations_df: pd.DataFrame = db_methods.get_recommendations(plan_ids) @@ -61,13 +62,12 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str, recommendations_df = db_methods.attach_materials(recommendations_df) - recommendations_df["has_solar_with_battery"] = ( - recommendations_df["materials"].apply(has_solar_with_battery) - ) + recommendations_df["has_solar_with_battery"] = recommendations_df[ + "materials" + ].apply(has_solar_with_battery) - _filter = ( - (recommendations_df["measure_type"] == "solar_pv") - & (recommendations_df["has_solar_with_battery"]) + _filter = (recommendations_df["measure_type"] == "solar_pv") & ( + recommendations_df["has_solar_with_battery"] ) recommendations_df.loc[_filter, "measure_type"] = ( @@ -83,10 +83,13 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str, else: scenario_recs = recommendations_df[ recommendations_df["scenario_id"] == group_key - ] + ] if scenario_recs.empty: - logger.info("No recommendations found for group_key %s - skipping export for this group", group_key) + logger.info( + "No recommendations found for group_key %s - skipping export for this group", + group_key, + ) continue measures_df: pd.DataFrame = scenario_recs[ @@ -99,14 +102,12 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str, values="estimated_cost", ).reset_index() - pivot["total_retrofit_cost"] = ( - pivot.drop(columns=["property_id", "plan_name"]).sum(axis=1) - ) + pivot["total_retrofit_cost"] = pivot.drop( + columns=["property_id", "plan_name"] + ).sum(axis=1) post_sap: pd.DataFrame = ( - scenario_recs.groupby("property_id")[["sap_points"]] - .sum() - .reset_index() + scenario_recs.groupby("property_id")[["sap_points"]].sum().reset_index() ) df: pd.DataFrame = ( @@ -117,7 +118,9 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str, df["sap_points"] = df["sap_points"].fillna(0) df["predicted_post_works_sap"] = df["current_sap_points"] + df["sap_points"] - df["predicted_post_works_epc"] = df["predicted_post_works_sap"].apply(sap_to_epc) + df["predicted_post_works_epc"] = df["predicted_post_works_sap"].apply( + sap_to_epc + ) export_files[group_key] = df @@ -128,22 +131,17 @@ def process_export(payload: ExportRequest, session: Session) -> Dict[Union[str, # Lambda Handler # ============================================================ -def handler(event: Mapping[str, Any], context: Optional[Any]) -> Mapping[str, Union[int, str]]: + +def handler( + event: Mapping[str, Any], context: Optional[Any] +) -> Mapping[str, Union[int, str]]: """ Example event: body_dict = { "task_id": "test", "subtask_id": "test", - "portfolio_id": 655, - "scenario_ids": [], - "default_plans_only": True, - } - - body_dict = { - "task_id": "test", - "subtask_id": "test", - "portfolio_id": 655, - "scenario_ids": [1174], + "portfolio_id": 682, + "scenario_ids": [1210], "default_plans_only": False, } :param event: Lambda event containing export request details @@ -168,7 +166,12 @@ def handler(event: Mapping[str, Any], context: Optional[Any]) -> Mapping[str, Un exported_files = process_export(payload, session) # TODO: Need to handle the exported files - e.g. upload to s3 and email a presigned url - _ = exported_files + output_path = f"/tmp/export_{payload.portfolio_id}.xlsx" + with pd.ExcelWriter(output_path, engine="openpyxl") as writer: + for group_key, df in exported_files.items(): + sheet_name = str(group_key)[:31] # Excel sheet names max 31 chars + df.to_excel(writer, sheet_name=sheet_name, index=False) + logger.info("Exported files written to %s", output_path) return { "statusCode": 200, "body": json.dumps({}), diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py index 105b8639..085240a9 100644 --- a/backend/scripts/combine_address2uprn_outputs.py +++ b/backend/scripts/combine_address2uprn_outputs.py @@ -31,6 +31,8 @@ def download_csv(key): def main(task_id, output): + task_id = "3fb9a9b7-ff49-4c11-b9e1-9d00da955a75" + print(f"Scanning task: {task_id}") csv_files = list_csv_files(task_id) diff --git a/devcontainer.sh b/devcontainer.sh new file mode 100644 index 00000000..32908a9e --- /dev/null +++ b/devcontainer.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# +# dc.sh — devcontainer helper for this repo +# +# Usage: +# devcontainer.sh +# +# Configs: backend | asset_list +# Commands: up, shell, down, rebuild +# +# `shell` auto-runs `up` first if the container isn't already running, +# so it's safe to call cold. +# +# Examples: +# ./scripts/dc.sh backend shell # up + exec bash +# ./scripts/dc.sh asset_list up +# ./scripts/dc.sh backend rebuild +# ./scripts/dc.sh backend down + +set -euo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." &>/dev/null && pwd)" + +VALID_CONFIGS=(backend asset_list) +VALID_COMMANDS=(up shell down rebuild) + +# --- helpers --------------------------------------------------------------- + +usage() { + sed -n '3,20p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//' + exit "${1:-0}" +} + +die() { + echo "error: $*" >&2 + exit 1 +} + +in_list() { + # in_list + local needle="$1" + shift + local item + for item in "$@"; do + [[ "${item}" == "${needle}" ]] && return 0 + done + return 1 +} + +container_id_for() { + # Find a running container for the given config path via devcontainer labels. + local config_path="$1" + docker ps -q \ + --filter "label=devcontainer.local_folder=${REPO_ROOT}" \ + --filter "label=devcontainer.config_file=${config_path}" +} + +# --- argument parsing ------------------------------------------------------ + +[[ $# -eq 2 ]] || usage 1 + +CONFIG_NAME="$1" +COMMAND="$2" + +in_list "${CONFIG_NAME}" "${VALID_CONFIGS[@]}" \ + || die "invalid config '${CONFIG_NAME}' (expected: ${VALID_CONFIGS[*]})" +in_list "${COMMAND}" "${VALID_COMMANDS[@]}" \ + || die "invalid command '${COMMAND}' (expected: ${VALID_COMMANDS[*]})" + +CONFIG_PATH="${REPO_ROOT}/.devcontainer/${CONFIG_NAME}/devcontainer.json" +[[ -f "${CONFIG_PATH}" ]] || die "config not found: ${CONFIG_PATH}" + +DC_ARGS=(--workspace-folder "${REPO_ROOT}" --config "${CONFIG_PATH}") + +# --- dispatch -------------------------------------------------------------- + +case "${COMMAND}" in + up) + echo ">> bringing up '${CONFIG_NAME}'" + devcontainer up "${DC_ARGS[@]}" + ;; + + shell) + # Auto-up if not already running. `devcontainer up` is idempotent — + # it reuses an existing container, so this is cheap on warm starts. + if [[ -z "$(container_id_for "${CONFIG_PATH}")" ]]; then + echo ">> '${CONFIG_NAME}' not running, bringing it up first" + devcontainer up "${DC_ARGS[@]}" + fi + echo ">> attaching shell to '${CONFIG_NAME}'" + devcontainer exec "${DC_ARGS[@]}" bash 2>/dev/null \ + || devcontainer exec "${DC_ARGS[@]}" sh + ;; + + down) + cid="$(container_id_for "${CONFIG_PATH}")" + if [[ -z "${cid}" ]]; then + echo ">> '${CONFIG_NAME}' not running, nothing to stop" + exit 0 + fi + echo ">> stopping '${CONFIG_NAME}'" + docker stop "${cid}" + ;; + + rebuild) + echo ">> rebuilding '${CONFIG_NAME}' from scratch" + devcontainer up "${DC_ARGS[@]}" --remove-existing-container --build-no-cache + ;; +esac diff --git a/etl/hubspot/scripts/scraper/bulk_load.py b/etl/hubspot/scripts/scraper/bulk_load.py index 91aa89e2..f0529905 100644 --- a/etl/hubspot/scripts/scraper/bulk_load.py +++ b/etl/hubspot/scripts/scraper/bulk_load.py @@ -9,8 +9,8 @@ PIPELINE_ID = Pipeline.OPERATIONS_SOCIAL_HOUSING.value companies = list( [ # Companies.THE_GUINESS_PARTNERSHIP, - # Companies.SOUTHERN_HOUSING_GROUP, - Companies.CALICO_HOMES, + Companies.SOUTHERN_HOUSING_GROUP, + # Companies.CALICO_HOMES, ] ) From c188a3ad2615a9d8ed4ec100e295a99456e97ad7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 17 Apr 2026 14:50:57 +0000 Subject: [PATCH 2/3] add dev container --- devcontainer.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/devcontainer.sh b/devcontainer.sh index 32908a9e..ba35e7f0 100644 --- a/devcontainer.sh +++ b/devcontainer.sh @@ -3,7 +3,7 @@ # dc.sh — devcontainer helper for this repo # # Usage: -# devcontainer.sh +# ./devcontainer.sh # # Configs: backend | asset_list # Commands: up, shell, down, rebuild @@ -12,15 +12,15 @@ # so it's safe to call cold. # # Examples: -# ./scripts/dc.sh backend shell # up + exec bash -# ./scripts/dc.sh asset_list up -# ./scripts/dc.sh backend rebuild -# ./scripts/dc.sh backend down +# ./devcontainer.sh backend shell # up + exec bash +# ./devcontainer.sh asset_list up +# ./devcontainer.sh backend rebuild +# ./devcontainer.sh backend down set -euo pipefail SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." &>/dev/null && pwd)" +REPO_ROOT="${SCRIPT_DIR}" VALID_CONFIGS=(backend asset_list) VALID_COMMANDS=(up shell down rebuild) From ec4c870465499d66449e3b2efd855a62e9b92769 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 17 Apr 2026 19:08:34 +0000 Subject: [PATCH 3/3] added bulk address 2 uprn lmabda --- .github/workflows/deploy_terraform.yml | 40 ++++++++++ .gitignore | 5 ++ backend/app/db/models/bulk_address_uploads.py | 30 ++++++++ .../bulk_address2uprn_combiner/__init__.py | 0 .../handler/Dockerfile | 23 ++++++ .../handler/requirements.txt | 7 ++ backend/bulk_address2uprn_combiner/main.py | 74 +++++++++++++++++++ .../lambda/bulk_address2uprn_combiner/main.tf | 44 +++++++++++ .../bulk_address2uprn_combiner/outputs.tf | 14 ++++ .../bulk_address2uprn_combiner/provider.tf | 16 ++++ .../bulk_address2uprn_combiner/variables.tf | 38 ++++++++++ infrastructure/terraform/shared/main.tf | 28 +++++++ 12 files changed, 319 insertions(+) create mode 100644 backend/app/db/models/bulk_address_uploads.py create mode 100644 backend/bulk_address2uprn_combiner/__init__.py create mode 100644 backend/bulk_address2uprn_combiner/handler/Dockerfile create mode 100644 backend/bulk_address2uprn_combiner/handler/requirements.txt create mode 100644 backend/bulk_address2uprn_combiner/main.py create mode 100644 infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf create mode 100644 infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf create mode 100644 infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf create mode 100644 infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 22f16fee..7e1281b7 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -201,6 +201,46 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + # ============================================================ + # Build Bulk Address2UPRN Combiner image and Push + # ============================================================ + bulk_address2uprn_combiner_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: bulk_address2uprn_combiner-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/bulk_address2uprn_combiner/handler/Dockerfile + build_context: . + build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + + # ============================================================ + # Deploy Bulk Address2UPRN Combiner Lambda + # ============================================================ + bulk_address2uprn_combiner_lambda: + needs: [bulk_address2uprn_combiner_image, determine_stage, shared_terraform] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: bulk_address2uprn_combiner + lambda_path: infrastructure/terraform/lambda/bulk_address2uprn_combiner + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: bulk_address2uprn_combiner-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.bulk_address2uprn_combiner_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + # ============================================================ # Condition ETL image and Push # ============================================================ diff --git a/.gitignore b/.gitignore index 299e03d4..51a32a0d 100644 --- a/.gitignore +++ b/.gitignore @@ -278,6 +278,11 @@ cache/ *.png *.pptx +*.csv +*.xlsx +*.pdf +**/Chunks/ +*.ipynb local_data* diff --git a/backend/app/db/models/bulk_address_uploads.py b/backend/app/db/models/bulk_address_uploads.py new file mode 100644 index 00000000..335a4c45 --- /dev/null +++ b/backend/app/db/models/bulk_address_uploads.py @@ -0,0 +1,30 @@ +from typing import Optional +from uuid import UUID, uuid4 +from datetime import datetime, timezone + +from sqlmodel import SQLModel, Field, select + +from backend.app.db.connection import get_db_session + + +class BulkAddressUpload(SQLModel, table=True): + __tablename__ = "bulk_address_uploads" + + id: UUID = Field(default_factory=uuid4, primary_key=True, index=True) + task_id: UUID = Field(foreign_key="tasks.id", index=True) + combined_csv_s3_uri: Optional[str] = Field(default=None) + updated_at: datetime = Field(default_factory=datetime.utcnow) + + +def set_combined_csv_s3_uri(task_id: UUID, s3_uri: str) -> None: + now = datetime.now(timezone.utc) + with get_db_session() as session: + row = session.exec( + select(BulkAddressUpload).where(BulkAddressUpload.task_id == task_id) + ).first() + if not row: + raise ValueError(f"No bulk_address_uploads row for task_id {task_id}") + row.combined_csv_s3_uri = s3_uri + row.updated_at = now + session.add(row) + session.commit() diff --git a/backend/bulk_address2uprn_combiner/__init__.py b/backend/bulk_address2uprn_combiner/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/bulk_address2uprn_combiner/handler/Dockerfile b/backend/bulk_address2uprn_combiner/handler/Dockerfile new file mode 100644 index 00000000..35f91d09 --- /dev/null +++ b/backend/bulk_address2uprn_combiner/handler/Dockerfile @@ -0,0 +1,23 @@ +FROM public.ecr.aws/lambda/python:3.11 + +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV DB_HOST=${DEV_DB_HOST} +ENV DB_PORT=${DEV_DB_PORT} +ENV DB_NAME=${DEV_DB_NAME} + +WORKDIR /var/task + +COPY backend/bulk_address2uprn_combiner/handler/requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +COPY utils/ utils/ +COPY backend/ backend/ +COPY datatypes/ datatypes/ + +COPY backend/bulk_address2uprn_combiner/main.py . + +CMD ["main.handler"] diff --git a/backend/bulk_address2uprn_combiner/handler/requirements.txt b/backend/bulk_address2uprn_combiner/handler/requirements.txt new file mode 100644 index 00000000..14d13e0d --- /dev/null +++ b/backend/bulk_address2uprn_combiner/handler/requirements.txt @@ -0,0 +1,7 @@ +pandas==2.2.2 +numpy<2.0 +boto3==1.35.44 +sqlmodel +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 diff --git a/backend/bulk_address2uprn_combiner/main.py b/backend/bulk_address2uprn_combiner/main.py new file mode 100644 index 00000000..9b4dc6cb --- /dev/null +++ b/backend/bulk_address2uprn_combiner/main.py @@ -0,0 +1,74 @@ +import os +import boto3 +import pandas as pd +from io import BytesIO +from typing import Any +from uuid import UUID +from datetime import datetime, timezone + +from utils.logger import setup_logger +from backend.utils.subtasks import subtask_handler +from backend.app.db.models.bulk_address_uploads import set_combined_csv_s3_uri + +logger = setup_logger() + +S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME") + + +def list_csv_files(s3_client, bucket: str, task_id: str) -> list[str]: + paginator = s3_client.get_paginator("list_objects_v2") + prefix = f"ara_raw_outputs/{task_id}/" + keys = [] + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + for obj in page.get("Contents", []): + if obj["Key"].endswith(".csv"): + keys.append(obj["Key"]) + return keys + + +def download_csv(s3_client, bucket: str, key: str) -> pd.DataFrame: + obj = s3_client.get_object(Bucket=bucket, Key=key) + return pd.read_csv(BytesIO(obj["Body"].read())) + + +@subtask_handler() +def handler(body: dict[str, Any], context: Any) -> str: + task_id_str: str = body.get("task_id", "") + + if not task_id_str: + raise RuntimeError("Missing task_id in message body") + + bucket = S3_BUCKET_NAME + if not bucket: + raise RuntimeError("S3_BUCKET_NAME env var not set") + + s3 = boto3.client("s3") + + logger.info(f"Combining ara_raw_outputs for task {task_id_str}") + + csv_keys = list_csv_files(s3, bucket, task_id_str) + if not csv_keys: + raise RuntimeError(f"No CSV files found under ara_raw_outputs/{task_id_str}/") + + logger.info(f"Found {len(csv_keys)} CSV files") + + dfs = [download_csv(s3, bucket, key) for key in csv_keys] + combined = pd.concat(dfs, ignore_index=True) + logger.info(f"Combined {len(combined)} rows from {len(dfs)} files") + + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S") + output_key = f"bulk_final_outputs/{task_id_str}/combined_{timestamp}.csv" + + csv_buffer = BytesIO() + combined.to_csv(csv_buffer, index=False) + csv_buffer.seek(0) + s3.put_object(Bucket=bucket, Key=output_key, Body=csv_buffer.getvalue()) + + s3_uri = f"s3://{bucket}/{output_key}" + logger.info(f"Saved combined CSV to {s3_uri}") + print(f"OUTPUT_S3_URI: {s3_uri}") + + set_combined_csv_s3_uri(UUID(task_id_str), s3_uri) + logger.info(f"Persisted combined_csv_s3_uri for task {task_id_str}") + + return s3_uri diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf new file mode 100644 index 00000000..4be4fc20 --- /dev/null +++ b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/main.tf @@ -0,0 +1,44 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + +module "lambda" { + source = "../../modules/lambda_with_sqs" + + name = "bulk-address2uprn-combiner" + stage = var.stage + + image_uri = local.image_uri + + timeout = 900 + memory_size = 2048 + + maximum_concurrency = var.maximum_concurrency + batch_size = var.batch_size + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + } +} + +resource "aws_iam_role_policy_attachment" "bulk_address2uprn_combiner_s3" { + role = module.lambda.role_name + policy_arn = data.terraform_remote_state.shared.outputs.bulk_address2uprn_combiner_s3_arn +} diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf new file mode 100644 index 00000000..e5155388 --- /dev/null +++ b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/outputs.tf @@ -0,0 +1,14 @@ +output "bulk_address2uprn_combiner_queue_url" { + value = module.lambda.queue_url + description = "URL of the bulk_address2uprn_combiner SQS queue" +} + +output "bulk_address2uprn_combiner_queue_arn" { + value = module.lambda.queue_arn + description = "ARN of the bulk_address2uprn_combiner SQS queue" +} + +output "bulk_address2uprn_combiner_lambda_arn" { + value = module.lambda.lambda_arn + description = "ARN of the bulk_address2uprn_combiner Lambda function" +} diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf new file mode 100644 index 00000000..45422d9f --- /dev/null +++ b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "bulk-address2uprn-combiner-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} diff --git a/infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf new file mode 100644 index 00000000..6e1c84a2 --- /dev/null +++ b/infrastructure/terraform/lambda/bulk_address2uprn_combiner/variables.tf @@ -0,0 +1,38 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} + +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "maximum_concurrency" { + type = number + default = 2 + description = "Maximum concurrent Lambda invocations from SQS (2-1000)." +} + +variable "batch_size" { + type = number + default = 1 +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 47866c92..fbd09565 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -477,6 +477,34 @@ output "postcode_splitter_s3_read_arn" { value = module.postcode_splitter_s3_read.policy_arn } +################################################ +# Bulk Address2UPRN Combiner – Lambda ECR +################################################ +module "bulk_address2uprn_combiner_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "bulk-address2uprn-combiner-terraform-state" +} + +module "bulk_address2uprn_combiner_registry" { + source = "../modules/container_registry" + name = "bulk_address2uprn_combiner" + stage = var.stage +} + +module "bulk_address2uprn_combiner_s3" { + source = "../modules/s3_iam_policy" + + policy_name = "BulkAddress2UprnCombinerS3" + policy_description = "Allow bulk_address2uprn_combiner Lambda to read ara_raw_outputs and write bulk_final_outputs" + bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] + actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"] + resource_paths = ["/ara_raw_outputs/*", "/bulk_final_outputs/*"] +} + +output "bulk_address2uprn_combiner_s3_arn" { + value = module.bulk_address2uprn_combiner_s3.policy_arn +} + ################################################ # Categorisation – Lambda ECR ################################################