diff --git a/.devcontainer/asset_list/Dockerfile b/.devcontainer/asset_list/Dockerfile index 72a5de53..be869637 100644 --- a/.devcontainer/asset_list/Dockerfile +++ b/.devcontainer/asset_list/Dockerfile @@ -21,7 +21,7 @@ RUN git clone --depth 1 https://github.com/openvenues/libpostal /tmp/libpostal \ && rm -rf /tmp/libpostal # 3) Create the user and grant sudo privileges -RUN useradd -m -s /usr/bin/bash ${USER} \ +RUN useradd -m -s /bin/bash ${USER} \ && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \ && chmod 0440 /etc/sudoers.d/${USER} @@ -32,6 +32,11 @@ ADD asset_list/requirements.txt requirements1.txt RUN cat requirements1.txt requirements2.txt >> requirements.txt RUN pip install -r requirements.txt + +# Install code server +RUN curl -fsSL https://code-server.dev/install.sh | sh + + # 5) Workdir WORKDIR /workspaces/model diff --git a/.devcontainer/asset_list/devcontainer.json b/.devcontainer/asset_list/devcontainer.json index 945dcd88..dfa9ba4d 100644 --- a/.devcontainer/asset_list/devcontainer.json +++ b/.devcontainer/asset_list/devcontainer.json @@ -2,13 +2,14 @@ "name": "SAL ENV", "dockerComposeFile": "docker-compose.yml", "service": "model-sal", - "remoteUser": "vscode", + // "remoteUser": "vscode", "workspaceFolder": "/workspaces/model", - "postStartCommand": "bash .devcontainer/post-install.sh", + "postStartCommand": "bash .devcontainer/asset_list/post-install.sh", "mounts": [ // Optional, just makes getting from Downloads (local env) easier - "source=${localEnv:HOME},target=/workspaces/home,type=bind" + "source=${localEnv:HOME},target=/home/vscode,type=bind" ], + "forwardPorts": [8081], "customizations": { "vscode": { "extensions": [ @@ -24,7 +25,8 @@ "ms-python.vscode-python-envs", "ms-python.black-formatter", "GrapeCity.gc-excelviewer", - "jakobhoeg.vscode-pokemon" + "jakobhoeg.vscode-pokemon", + "eamodio.gitlens" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/.devcontainer/asset_list/docker-compose.yml b/.devcontainer/asset_list/docker-compose.yml index 06e4124d..0568393b 100644 --- a/.devcontainer/asset_list/docker-compose.yml +++ b/.devcontainer/asset_list/docker-compose.yml @@ -2,15 +2,17 @@ version: '3.8' services: model-sal: - user: "${UID}:${GID}" build: context: ../.. dockerfile: .devcontainer/asset_list/Dockerfile - command: sleep infinity + command: code-server --bind-addr 0.0.0.0:8080 + user: vscode volumes: - ../../:/workspaces/model networks: - model-net + ports: + - "8081:8080" networks: model-net: diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index ac654ac1..48a58bd6 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -27,7 +27,9 @@ "GrapeCity.gc-excelviewer", "jakobhoeg.vscode-pokemon", "github.vscode-github-actions", - "me-dutour-mathieu.vscode-github-actions" + "me-dutour-mathieu.vscode-github-actions", + "anthropic.claude-code", + "eamodio.gitlens" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/.devcontainer/backend/post-install.sh b/.devcontainer/backend/post-install.sh index 48fbfde1..20c699d6 100644 --- a/.devcontainer/backend/post-install.sh +++ b/.devcontainer/backend/post-install.sh @@ -1,14 +1,14 @@ -mkdir -p ~/.ipython/profile_default/startup +# mkdir -p ~/.ipython/profile_default/startup -cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py -from dotenv import load_dotenv -import os +# cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py +# from dotenv import load_dotenv +# import os -# Adjust path as needed -env_path = "/workspaces/model/backend/.env" -if os.path.exists(env_path): - load_dotenv(env_path) - print("✔ Loaded .env into Jupyter kernel") -else: - print("⚠ No .env file found to load") -EOF +# # Adjust path as needed +# env_path = "/workspaces/model/backend/.env" +# if os.path.exists(env_path): +# load_dotenv(env_path) +# print("✔ Loaded .env into Jupyter kernel") +# else: +# print("⚠ No .env file found to load") +# EOF diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index ce1a0e77..dab98d8b 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -16,12 +16,14 @@ on: type: string ecr_repo: - required: true + required: false type: string + default: '' image_digest: - required: true + required: false type: string + default: '' terraform_apply: required: false @@ -58,6 +60,8 @@ on: required: false TF_VAR_google_solar_api_key: required: false + TF_VAR_ordnance_survey_api_key: + required: false jobs: deploy: @@ -115,12 +119,23 @@ jobs: TF_VAR_domain_name: ${{ secrets.TF_VAR_domain_name }} TF_VAR_epc_auth_token: ${{ secrets.TF_VAR_epc_auth_token }} TF_VAR_google_solar_api_key: ${{ secrets.TF_VAR_google_solar_api_key }} + TF_VAR_ordnance_survey_api_key: ${{ secrets.TF_VAR_ordnance_survey_api_key }} run: | + ECR_REPO_URL_VAR="" + if [[ -n "${{ inputs.ecr_repo }}" ]]; then + ECR_REPO_URL_VAR="-var=ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" + fi + + IMAGE_DIGEST_VAR="" + if [[ -n "${{ inputs.ecr_repo }}" ]]; then + IMAGE_DIGEST_VAR="-var=image_digest=${{ inputs.image_digest }}" + fi + terraform plan \ -var="stage=${{ inputs.stage }}" \ -var="lambda_name=${{ inputs.lambda_name }}" \ - -var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ - -var="image_digest=${{ inputs.image_digest }}" \ + $ECR_REPO_URL_VAR \ + $IMAGE_DIGEST_VAR \ -out=lambdaplan - name: Terraform Apply @@ -140,9 +155,14 @@ jobs: TF_VAR_domain_name: ${{ secrets.TF_VAR_domain_name }} TF_VAR_epc_auth_token: ${{ secrets.TF_VAR_epc_auth_token }} TF_VAR_google_solar_api_key: ${{ secrets.TF_VAR_google_solar_api_key }} + TF_VAR_ordnance_survey_api_key: ${{ secrets.TF_VAR_ordnance_survey_api_key }} run: | + EXTRA_VARS="" + if [[ -n "${{ inputs.ecr_repo }}" ]]; then + EXTRA_VARS="-var=ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }} -var=image_digest=${{ inputs.image_digest }}" + fi + terraform destroy -auto-approve \ -var="stage=${{ inputs.stage }}" \ -var="lambda_name=${{ inputs.lambda_name }}" \ - -var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ - -var="image_digest=${{ inputs.image_digest }}" + $EXTRA_VARS diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index c360aadf..e41534e6 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -41,7 +41,7 @@ jobs: fi # ============================================================ - # 1️⃣ Shared Terraform (infra) + # Shared Terraform (infra) # ============================================================ shared_terraform: needs: determine_stage @@ -77,9 +77,49 @@ jobs: if: env.TERRAFORM_APPLY == 'true' working-directory: infrastructure/terraform/shared run: terraform apply -auto-approve tfplan + + # ============================================================ + # Ara Engine image and Push + # ============================================================ + ara_engine_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: engine-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/docker/engine.Dockerfile + build_context: . + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + # ============================================================ + # Deploy Ara Engine Lambda + # ============================================================ + ara_engine_lambda: + needs: [ara_engine_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: ara_engine + lambda_path: infrastructure/terraform/lambda/engine + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: engine-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.ara_engine_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }} + TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }} + TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }} + TF_VAR_api_key: ${{ secrets.DEV_API_KEY }} + TF_VAR_secret_key: ${{ secrets.DEV_SECRET_KEY }} + TF_VAR_epc_auth_token: ${{ secrets.DEV_EPC_AUTH_TOKEN }} + TF_VAR_google_solar_api_key: ${{ secrets.DEV_GOOGLE_SOLAR_API_KEY }} # ============================================================ - # 2️⃣ Build Address 2 UPRN image and Push + # Build Address 2 UPRN image and Push # ============================================================ address2uprn_image: needs: [determine_stage, shared_terraform] @@ -103,7 +143,7 @@ jobs: EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} # ============================================================ - # 3️⃣ Deploy Address 2 UPRN Lambda + # Deploy Address 2 UPRN Lambda # ============================================================ address2uprn_lambda: needs: [address2uprn_image, determine_stage] @@ -122,7 +162,7 @@ jobs: # ============================================================ - # 2️⃣ Build Postcode Splitter image and Push + # Build Postcode Splitter image and Push # ============================================================ postcodeSplitter_image: needs: [determine_stage, shared_terraform] @@ -144,7 +184,7 @@ jobs: DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} # ============================================================ - # 3️⃣ Deploy Postcode Splitter Lambda + # Deploy Postcode Splitter Lambda # ============================================================ postcodeSplitter_lambda: needs: [postcodeSplitter_image, determine_stage, address2uprn_lambda] @@ -242,32 +282,56 @@ jobs: AWS_REGION: ${{ secrets.DEV_AWS_REGION }} # ============================================================ - # Ara Engine image and Push + # Build OrdanceSurvey image and Push # ============================================================ - ara_engine_image: + ordnanceSurvey_image: needs: [determine_stage, shared_terraform] uses: ./.github/workflows/_build_image.yml with: - ecr_repo: engine-${{ needs.determine_stage.outputs.stage }} - dockerfile_path: backend/docker/engine.Dockerfile + ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/ordnanceSurvey/handler/Dockerfile build_context: . + build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} # ============================================================ - # Deploy Categorisation Lambda + # Deploy OrdanceSurvey Lambda # ============================================================ - ara_engine_lambda: - needs: [ara_engine_image, determine_stage] + ordnanceSurvey_lambda: + needs: [ordnanceSurvey_image, determine_stage] uses: ./.github/workflows/_deploy_lambda.yml with: - lambda_name: ara_engine - lambda_path: infrastructure/terraform/lambda/engine + lambda_name: ordnanceSurvey + lambda_path: infrastructure/terraform/lambda/ordnanceSurvey + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.ordnanceSurvey_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + TF_VAR_ORDNANCE_SURVEY_API_KEY: ${{ secrets.ORDNANCE_SURVEY_API_KEY }} + + # ============================================================ + # Deploy FastAPI Lambda + # ============================================================ + fast_api_lambda: + needs: [determine_stage, ara_engine_lambda, categorisation_lambda] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: ara_fast_api + lambda_path: infrastructure/terraform/lambda/fast-api stage: ${{ needs.determine_stage.outputs.stage }} - ecr_repo: engine-${{ needs.determine_stage.outputs.stage }} - image_digest: ${{ needs.ara_engine_image.outputs.image_digest }} terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} @@ -276,8 +340,97 @@ jobs: TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }} TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }} TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }} - TF_VAR_api_key: ${{ secrets.DEV_API_KEY }} - TF_VAR_secret_key: ${{ secrets.DEV_SECRET_KEY }} - TF_VAR_domain_name: ${{ secrets.DEV_DOMAIN_NAME }} + TF_VAR_api_key: ${{ secrets.FASTAPI_API_KEY }} + TF_VAR_secret_key: ${{ secrets.NEXTAUTH_SECRET }} + TF_VAR_domain_name: ${{ secrets.ARA_DEV_DOMAIN_NAME }} TF_VAR_epc_auth_token: ${{ secrets.DEV_EPC_AUTH_TOKEN }} - TF_VAR_google_solar_api_key: ${{ secrets.DEV_GOOGLE_SOLAR_API_KEY }} \ No newline at end of file + TF_VAR_google_solar_api_key: ${{ secrets.DEV_GOOGLE_SOLAR_API_KEY }} + + # ============================================================ + # Deploy ACM Certificate for Cloudfront + # ============================================================ + cloudfront_acm: + needs: [determine_stage, shared_terraform, fast_api_lambda] + runs-on: ubuntu-latest + + env: + STAGE: ${{ needs.determine_stage.outputs.stage }} + TERRAFORM_APPLY: ${{ needs.determine_stage.outputs.terraform_apply }} + + steps: + - uses: actions/checkout@v4 + + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.DEV_AWS_REGION }} + + - uses: hashicorp/setup-terraform@v3 + + - name: Terraform Init + working-directory: infrastructure/terraform/cdn_certificate + run: terraform init -reconfigure + + - name: Terraform Workspace + working-directory: infrastructure/terraform/cdn_certificate + run: | + terraform workspace select $STAGE \ + || terraform workspace new $STAGE + + - name: Terraform Plan + working-directory: infrastructure/terraform/cdn_certificate + run: | + terraform plan \ + -var="stage=${STAGE}" \ + -out=tfplan + + - name: Terraform Apply + if: env.TERRAFORM_APPLY == 'true' + working-directory: infrastructure/terraform/cdn_certificate + run: terraform apply -auto-approve tfplan + + + # ============================================================ + # Deploy Cloudfront CDN + # ============================================================ + cloudfront_cdn: + needs: [determine_stage, cloudfront_acm] + runs-on: ubuntu-latest + + env: + STAGE: ${{ needs.determine_stage.outputs.stage }} + TERRAFORM_APPLY: ${{ needs.determine_stage.outputs.terraform_apply }} + + steps: + - uses: actions/checkout@v4 + + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.DEV_AWS_REGION }} + + - uses: hashicorp/setup-terraform@v3 + + - name: Terraform Init + working-directory: infrastructure/terraform/cdn + run: terraform init -reconfigure + + - name: Terraform Workspace + working-directory: infrastructure/terraform/cdn + run: | + terraform workspace select $STAGE \ + || terraform workspace new $STAGE + + - name: Terraform Plan + working-directory: infrastructure/terraform/cdn + run: | + terraform plan \ + -var="stage=${STAGE}" \ + -out=tfplan + + - name: Terraform Apply + if: env.TERRAFORM_APPLY == 'true' + working-directory: infrastructure/terraform/cdn + run: terraform apply -auto-approve tfplan \ No newline at end of file diff --git a/.gitignore b/.gitignore index 68e66052..15050bdd 100644 --- a/.gitignore +++ b/.gitignore @@ -246,7 +246,7 @@ etl/epc/local_data/* /backend/condition/sample_data/peabody/* *.DS_Store -infrastructure/terraform/.terraform* +**/.terraform* # Don't commit packages up serverless packages .serverless diff --git a/.vscode/settings.json b/.vscode/settings.json index b294c736..56299a40 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -16,7 +16,13 @@ "python.languageServer": "Pylance", "python.analysis.typeCheckingMode": "strict", "python.analysis.autoSearchPaths": true, - "python.analysis.extraPaths": ["./src"] + "python.analysis.extraPaths": ["./src"], + + "vim.useCtrlKeys": true, + "vim.handleKeys": { + "": false, + "": false + } // Hot reload setting that needs to be in user settings // "jupyter.runStartupCommands": [ diff --git a/asset_list/app.py b/asset_list/app.py index 8becbd3e..a97bb8e0 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -73,24 +73,25 @@ def app(): Property UPRN """ - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/March 2026 SAL" - data_filename = "Domna System Review - Livewest.xlsx" + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lifespace Rentals/Missed" + # data_filename = "For Modelling - Final - reviewed.xlsx" + data_filename = "Missed Properties - with address.xlsx" sheet_name = "Sheet1" postcode_column = "Postcode" - address1_column = None - address1_method = "house_number_extraction" - fulladdress_column = "Address" + address1_column = "address1" + address1_method = None + fulladdress_column = "address1" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "gov UPRN" - landlord_property_type = "AssetType" - landlord_built_form = "AssetType" + landlord_os_uprn = "UPRN" + landlord_property_type = "Type" + landlord_built_form = None landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "landlord_uprn" + landlord_property_id = "Reference" landlord_sap = None outcomes_filename = None outcomes_sheetname = None diff --git a/backend/README.md b/backend/README.md index b8e859c2..9671b7b3 100644 --- a/backend/README.md +++ b/backend/README.md @@ -364,5 +364,5 @@ Here's what you should do: function. By following these steps, you should have your custom domain properly configured and pointing to your AWS Lambda -function via the CloudFront distribution. +function via the CloudFront distribution diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md index b4876340..6d26f281 100644 --- a/backend/address2UPRN/README.md +++ b/backend/address2UPRN/README.md @@ -1,20 +1,58 @@ -We have list of address as input. - -It'll come in batches of the same post code and from then we want to somehow convert that into UPRN - -if this lambda/function can do that we'll be speeding ahead +So you want to fetch UPRN for an address list? -Energy Performance Information: https://epc.opendatacommunities.org/ +Before you run: -guidance page: https://epc.opendatacommunities.org/docs/guidance#field_domestic_LMK_KEY +Step 1) Get the list and ensure the following columns exists -Example of past khalims code that he wrote some tests for: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/tests/test_search_epc.py#L11 +* Address 1 +* Address 2 +* Address 3 +* postcode + +And save it as a .csv file -Example of EPC search: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/SearchEpc.py#L118 +Step 2) + +Before we run this, we need to upload it into S3 as well as put initiate a subtask + task + +* S3 upload I'll recommend somewhere in retrofit-data-dev and get the s3_uri + +For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv" + +Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key + +task_id = a7b70a02-4df4-45b5-a50b-196e095910bb +sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e + +Step 3) Alright, now lets make the input for postcode-splitter sqs to get the ball rolling +postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev + +{ + "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb", + "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv" +} +Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches///.csv + +outputs of address2uprn ( which is automatically triggered on postcodesplitter) will be saved on retrofit-data-dev/ara_raw_outputs///.csv +Run the script in backend/scripts/combine_address2uprn_outputs.py with . +This will combine all the outputs of the files for each address2uprn into one big file -Khalim has made a python package to help scrape data: https://github.com/KhalimCK/epc-api-python +Find out which ones have missing uprn and save that as a seperate sheet and save it somewhere in s3://retrofit-data-dev +I uploaded the missing uprn here: s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv + +ordnance_survey sqs is => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2FordnanceSurvey-queue-dev + +{ + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv", + "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb", + "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e" +} + + +outputs are at s3://retrofit-data-dev/ara_ordnance_survey_outputs// \ No newline at end of file diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index af29a095..d0ba36e6 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -1,13 +1,11 @@ +from typing import Optional + from epc_api.client import EpcClient import os from urllib.parse import urlencode import pandas as pd -from difflib import SequenceMatcher from utils.logger import setup_logger -import re -from typing import Set import json -import requests from uuid import UUID import uuid from backend.app.db.functions.tasks.Tasks import SubTaskInterface @@ -18,6 +16,8 @@ from utils.s3 import ( ) from datetime import datetime +from backend.utils.addressMatch import AddressMatch + logger = setup_logger() @@ -29,191 +29,6 @@ if EPC_AUTH_TOKEN is None: raise RuntimeError("EPC_AUTH_TOKEN not defined in env") -def is_valid_postcode(postcode_clean: str) -> bool: - """ - Validate postcode using postcodes.io. - - Expects a sanitised postcode (e.g. E84SQ). - Returns True if valid, False otherwise. - """ - POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" - if not postcode_clean: - return False - - try: - resp = requests.get( - POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), - timeout=5, - ) - resp.raise_for_status() - return resp.json().get("result", False) - except requests.RequestException: - # Network issues, rate limits, etc. - return False - - -def levenshtein(a: str, b: str) -> float: - """ - Address similarity score in [0, 1]. - - Strategy: - - Normalise - - Strongly penalise mismatched house/flat numbers - - Combine token overlap + character similarity - """ - - def extract_number_sequence(s: str) -> list[str]: - return re.findall(r"\d+[a-z]?", s) - - def extract_numbers(s: str) -> Set[str]: - return set(extract_number_sequence(s)) - - def tokenise(s: str) -> Set[str]: - return set(s.split()) - - def extract_building_number(s: str) -> str | None: - """ - Extract the main building number (NOT flat/unit). - Assumes formats like: - - '42 moreton road' - - 'flat 3 42 moreton road' - """ - tokens = s.split() - - # remove flat/unit context - cleaned = [] - skip_next = False - for t in tokens: - if t in ("flat", "apt", "apartment", "unit"): - skip_next = True - continue - if skip_next: - skip_next = False - continue - cleaned.append(t) - - # first remaining number is building number - for t in cleaned: - if re.fullmatch(r"\d+[a-z]?", t): - return t - - return None - - a_norm = normalise_address(a) - b_norm = normalise_address(b) - - # --- hard signal: numbers --- - nums_a = extract_numbers(a_norm) - nums_b = extract_numbers(b_norm) - - if nums_a and not nums_b: - return 0.0 - - # No shared numbers at all → impossible match - if nums_a and nums_b and nums_a.isdisjoint(nums_b): - return 0.0 - - # 🔒 HARD GUARD: building number must match - bld_a = extract_building_number(a_norm) - bld_b = extract_building_number(b_norm) - - if bld_a and bld_b and bld_a != bld_b: - return 0.0 - - # --- order-sensitive flat/building guard --- - seq_a = extract_number_sequence(a_norm) - seq_b = extract_number_sequence(b_norm) - - has_flat_token_user = any( - tok in a_norm for tok in ("flat", "apt", "apartment", "unit") - ) - has_flat_token_epc = "flat" in b_norm - - if ( - len(seq_a) == 2 - and len(seq_b) >= 2 - and has_flat_token_epc - and not has_flat_token_user - and seq_a != seq_b[:2] - ): - return 0.0 - - # --- token similarity (order-independent) --- - toks_a = tokenise(a_norm) - toks_b = tokenise(b_norm) - - if not toks_a or not toks_b: - token_score = 0.0 - else: - token_score = len(toks_a & toks_b) / len(toks_a | toks_b) - - # --- character similarity (soft signal) --- - char_score = SequenceMatcher(None, a_norm, b_norm).ratio() - - # --- weighted blend --- - return round( - 0.65 * token_score + 0.35 * char_score, - 4, - ) - - -def normalise_address(s: str) -> str: - """ - Canonical UK-focused address normalisation. - - - Lowercases - - Removes punctuation (keeps / for flats) - - Normalises whitespace - - Applies synonym compression at token level - """ - - if not s: - return "" - - ADDRESS_SYNONYMS = { - # street types - "rd": "road", - "rd.": "road", - "st": "street", - "st.": "street", - "ave": "avenue", - "ave.": "avenue", - "ln": "lane", - "ln.": "lane", - "cres": "crescent", - "ct": "court", - "dr": "drive", - # flats / units - "apt": "flat", - "apartment": "flat", - "unit": "flat", - "ste": "suite", - # numbering noise - "no": "", - "no.": "", - } - # 1. lowercase - s = s.lower() - - # 1.5 split digit-letter suffixes - s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) - - # 2. remove punctuation except / - s = re.sub(r"[^\w\s/]", " ", s) - - # 3. normalise whitespace - s = re.sub(r"\s+", " ", s).strip() - - # 4. tokenise + synonym normalisation - tokens = [] - for tok in s.split(): - replacement = ADDRESS_SYNONYMS.get(tok, tok) - if replacement: - tokens.append(replacement) - - return " ".join(tokens) - - def score_addresses( df: pd.DataFrame, user_address: str, @@ -222,7 +37,7 @@ def score_addresses( if column not in df.columns: raise ValueError(f"Missing column: {column}") - return df[column].apply(lambda x: levenshtein(user_address, x)) + return df[column].apply(lambda x: AddressMatch.score(user_address, x)) def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): @@ -314,9 +129,11 @@ def get_uprn_candidates( out = df.copy() - user_norm = normalise_address(user_address) + user_norm = AddressMatch.normalise_address(user_address) - out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x)) + out["lexiscore"] = out[address_column].apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) # Normalise UPRN to string out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) @@ -480,7 +297,10 @@ def resolve_uprns_for_postcode_group( def save_results_to_s3( - results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None + results_df: pd.DataFrame, + task_id: str, + sub_task_id: str, + bucket_name: Optional[str] = None, ) -> bool: """ Save results DataFrame to S3 as CSV. @@ -533,7 +353,7 @@ def handler(event, context, local=False): { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", - "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv", + "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv", } ) } @@ -621,19 +441,6 @@ def handler(event, context, local=False): # Process the rows logger.info(f"Processing {len(df)} rows for task {task_id}") - # Create user_input column by concatenating Address columns if not already present - if "user_input" not in df.columns: - df["user_input"] = ( - df["Address 1"].fillna("") - + " " - + df["Address 2"].fillna("") - + " " - + df["Address 3"].fillna("") - ).str.strip() - logger.info(f"Created user_input column from Address 1 and Address 2") - else: - logger.info(f"user_input column already present in data") - clean_df = df.dropna(subset=["postcode_clean"]) postcode_to_addresses = { @@ -653,7 +460,7 @@ def handler(event, context, local=False): ) # Validate postcode before processing - if not is_valid_postcode(postcode): + if not AddressMatch.is_valid_postcode(postcode): logger.warning(f"Postcode {postcode} is invalid, skipping") continue @@ -672,57 +479,67 @@ def handler(event, context, local=False): # Process each address in this postcode with the same EPC data for row in postcode_rows: try: - user_input = row.get("user_input", "") - if not user_input: + # Concatenate Address columns directly + address2uprn_user_input = ( + str(row.get("Address 1", "")).strip() + + " " + + str(row.get("Address 2", "")).strip() + + " " + + str(row.get("Address 3", "")).strip() + ).strip() + + if not address2uprn_user_input: logger.warning( - f"Skipping row with missing user_input for postcode {postcode}" + f"Skipping row with missing address components for postcode {postcode}" ) continue # Get UPRN using the pre-fetched EPC data with all return options result = get_uprn_with_epc_df( - user_inputed_address=user_input, epc_df=epc_df, verbose=True + user_inputed_address=address2uprn_user_input, + epc_df=epc_df, + verbose=True, ) # Parse result tuple if successful if result: uprn, found_address, score = result logger.info( - f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})" + f"Found UPRN for {address2uprn_user_input} in {postcode}: {uprn} (score: {score})" ) results_data.append( { **row, # Include all original data - "uprn": uprn, - "domna_found_address": found_address, - "domna_lexiscore": score, + "address2uprn_uprn": uprn, + "address2uprn_address": found_address, + "address2uprn_lexiscore": score, } ) else: logger.warning( - f"No UPRN found for {user_input} in {postcode}" + f"No UPRN found for {address2uprn_user_input} in {postcode}" ) results_data.append( { **row, # Include all original data - "uprn": None, - "domna_found_address": None, - "domna_lexiscore": None, + "address2uprn_uprn": None, + "address2uprn_address": None, + "address2uprn_lexiscore": None, } ) except Exception as e: logger.error( - f"Error processing address {row.get('user_input', 'unknown')}: {e}" + f"Error processing address {row.get('address2uprn_user_input', 'unknown')}: {e}" ) # Still add the row with error markers results_data.append( { **row, - "uprn": None, - "domna_found_address": None, - "domna_lexiscore": None, + "address2uprn_uprn": None, + "address2uprn_address": None, + "address2uprn_lexiscore": None, "error": str(e), } ) diff --git a/backend/app/config.py b/backend/app/config.py index 26fb6b8b..6604fec9 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -35,7 +35,7 @@ class Settings(BaseSettings): SECRET_KEY: str = "changeme" ENVIRONMENT: str = "changeme" DATA_BUCKET: str = "changeme" - PLAN_TRIGGER_BUCKET: str + PLAN_TRIGGER_BUCKET: str = "changeme" ENGINE_SQS_URL: str = "changeme" CATEGORISATION_SQS_URL: str = "changeme" @@ -63,6 +63,8 @@ class Settings(BaseSettings): # Other S3 buckts ENERGY_ASSESSMENTS_BUCKET: str = "changeme" + ORDNANCE_SURVEY_API_KEY: str = "changeme" + # Optional AWS creds (only required in local) AWS_ACCESS_KEY_ID: Optional[str] = None AWS_SECRET_KEY_ID: Optional[str] = None diff --git a/backend/app/db/models/postcode_search.py b/backend/app/db/models/postcode_search.py new file mode 100644 index 00000000..1e3e03b8 --- /dev/null +++ b/backend/app/db/models/postcode_search.py @@ -0,0 +1,24 @@ +import pytz +import datetime +from sqlalchemy import ( + Column, + BigInteger, + Text, + DateTime, +) +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.ext.declarative import declarative_base + +Base = declarative_base() + + +class PostcodeSearchModel(Base): + __tablename__ = "postcode_search" + + id = Column(BigInteger, primary_key=True, autoincrement=True) + postcode = Column(Text, nullable=False) + result_data = Column(JSONB, nullable=True) + + created_at = Column( + DateTime(timezone=True), nullable=False, default=datetime.datetime.now(pytz.utc) + ) diff --git a/backend/app/utils.py b/backend/app/utils.py index b3843206..c1ad54f6 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -43,7 +43,7 @@ def generate_api_key(): # Define the characters that will be used to generate the api key characters = string.ascii_letters + string.digits # Generate a 40 character long api key - api_key = ''.join(secrets.choice(characters) for _ in range(40)) + api_key = "".join(secrets.choice(characters) for _ in range(40)) return api_key @@ -113,7 +113,7 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key): df.to_parquet(parquet_buffer) # Create the boto3 client - s3 = boto3.resource('s3') + s3 = boto3.resource("s3") # Upload the Parquet file to S3 s3.Object(bucket_name, file_key).put(Body=parquet_buffer.getvalue()) diff --git a/backend/ordnanceSurvey/__init__.py b/backend/ordnanceSurvey/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/ordnanceSurvey/handler/Dockerfile b/backend/ordnanceSurvey/handler/Dockerfile new file mode 100644 index 00000000..6a3cbe26 --- /dev/null +++ b/backend/ordnanceSurvey/handler/Dockerfile @@ -0,0 +1,25 @@ +FROM public.ecr.aws/lambda/python:3.11 + +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV DB_HOST=${DEV_DB_HOST} +ENV DB_PORT=${DEV_DB_PORT} +ENV DB_NAME=${DEV_DB_NAME} + +# Set working directory (Lambda task root) +WORKDIR /var/task + +COPY backend/ordnanceSurvey/handler/requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +# Copy necessary files for database and utility imports +COPY utils/ utils/ +COPY backend/ backend/ +COPY datatypes/ datatypes/ + +# Lambda handler +CMD ["backend/ordnanceSurvey/main.handler"] + diff --git a/backend/ordnanceSurvey/handler/requirements.txt b/backend/ordnanceSurvey/handler/requirements.txt new file mode 100644 index 00000000..6ef41b2d --- /dev/null +++ b/backend/ordnanceSurvey/handler/requirements.txt @@ -0,0 +1,11 @@ +pandas==2.2.2 +numpy<2.0 +requests +tqdm +openpyxl +epc-api-python==1.0.2 +boto3==1.35.44 +sqlmodel +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 \ No newline at end of file diff --git a/backend/ordnanceSurvey/helpers.py b/backend/ordnanceSurvey/helpers.py new file mode 100644 index 00000000..c0d6583b --- /dev/null +++ b/backend/ordnanceSurvey/helpers.py @@ -0,0 +1,47 @@ +import urllib.parse +from pydantic import ValidationError +import requests +import pandas as pd +from utils.logger import setup_logger + +logger = setup_logger() + + +def os_places_results_to_dataframe(data: dict) -> pd.DataFrame: + """ + Flatten the OS Places API response results into a DataFrame. + Each result contains either a DPA or LPI record. + """ + results = data.get("results", []) + rows = [] + for r in results: + if "DPA" in r: + rows.append(r["DPA"]) + elif "LPI" in r: + rows.append(r["LPI"]) + return pd.DataFrame(rows) + + +def lookup_os_places(postcode: str, api_key: str) -> dict: + """ + Lookup a postcode using the OS Places API. + Returns the full API response data or an error dict. + """ + if not api_key: + return {"error": "Ordnance Survey API key not specified", "status": 400} + + encoded_postcode = urllib.parse.quote(postcode) + url = ( + f"https://api.os.uk/search/places/v1/postcode?postcode={encoded_postcode}" + f"&dataset=DPA,LPI&key={api_key}" + ) + + response = requests.get(url) + if response.status_code != 200: + logger.error( + f"OS Places API error for postcode {postcode}: {response.status_code}" + ) + return {"error": "Failed to fetch address data", "status": response.status_code} + + data = response.json() + return {"data": data, "status": 200} diff --git a/backend/ordnanceSurvey/local_handler/docker-compose.yml b/backend/ordnanceSurvey/local_handler/docker-compose.yml new file mode 100644 index 00000000..5f54e7da --- /dev/null +++ b/backend/ordnanceSurvey/local_handler/docker-compose.yml @@ -0,0 +1,11 @@ +version: "3.9" + +services: + ordnance-survey-lambda: + build: + context: ../../../ + dockerfile: backend/ordnanceSurvey/handler/Dockerfile + ports: + - "9000:8080" + env_file: + - ../../../.env \ No newline at end of file diff --git a/backend/ordnanceSurvey/local_handler/invoke_local_lambda.py b/backend/ordnanceSurvey/local_handler/invoke_local_lambda.py new file mode 100644 index 00000000..e5272732 --- /dev/null +++ b/backend/ordnanceSurvey/local_handler/invoke_local_lambda.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9000" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = { + "Records": [ + { + "body": json.dumps( + { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv", + } + ) + } + ] +} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py new file mode 100644 index 00000000..6e82b468 --- /dev/null +++ b/backend/ordnanceSurvey/main.py @@ -0,0 +1,223 @@ +from typing import Any, Optional +import json +from utils.logger import setup_logger +import logging +from backend.utils.subtasks import subtask_handler +from utils.s3 import ( + save_csv_to_s3, + read_csv_from_s3 as read_csv_from_s3_dict, + parse_s3_uri, +) +from backend.utils.addressMatch import AddressMatch +from backend.app.db.connection import get_db_session +from backend.app.db.models.postcode_search import PostcodeSearchModel +from backend.ordnanceSurvey.helpers import ( + lookup_os_places, + os_places_results_to_dataframe, +) +from backend.app.config import get_settings +from sqlalchemy import select +from datetime import datetime +import uuid +import os + +import pandas as pd + +logger: logging.Logger = setup_logger() + + +def check_if_post_code_exists_in_db_cache(postcode): + + with get_db_session() as session: + result = ( + session.execute( + select(PostcodeSearchModel).where( + PostcodeSearchModel.postcode == postcode + ) + ) + .scalars() + .first() + ) + if result: + return os_places_results_to_dataframe(result.result_data) + + # Cache miss — fetch from OS Places API + api_key = get_settings().ORDNANCE_SURVEY_API_KEY + response = lookup_os_places(postcode, api_key) + + if response.get("status") != 200 or "data" not in response: + logger.error(f"OS Places API failed for {postcode}: {response}") + return pd.DataFrame() + + # Save to cache + new_record = PostcodeSearchModel( + postcode=postcode, + result_data=response["data"], + ) + session.add(new_record) + session.commit() + + return os_places_results_to_dataframe(response["data"]) + + +def save_results_to_s3( + results_df: pd.DataFrame, + task_id: str, + sub_task_id: str, + bucket_name: Optional[str] = None, +) -> bool: + """ + Save results DataFrame to S3 as CSV in a parent folder structure. + + :param results_df: The DataFrame containing results + :param task_id: The task ID (used for file naming) + :param sub_task_id: The subtask ID (used for file naming) + :param bucket_name: The S3 bucket name (defaults to env variable) + :return: True if successful, False otherwise + """ + if bucket_name is None: + bucket_name = os.getenv("S3_BUCKET_NAME") + + if not bucket_name: + logger.error( + "S3 bucket name not provided and S3_BUCKET_NAME environment variable not set" + ) + return False + + try: + # Create a filename with timestamp and UUID + file_name = f"{datetime.now().isoformat()}_{str(uuid.uuid4())[:8]}" + file_key = f"ara_ordnance_survey_outputs/{task_id}/{sub_task_id}/ordnanceSurvey/{file_name}.csv" + + # Save to S3 + success = save_csv_to_s3(results_df, bucket_name, file_key) + + if success: + logger.info(f"Successfully saved results to s3://{bucket_name}/{file_key}") + return True + else: + logger.error(f"Failed to save results to S3") + return False + + except Exception as e: + logger.error(f"Error saving results to S3: {str(e)}") + return False + + +@subtask_handler() # This assumes task_id and subtask_id is defined in event.Records.body +def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: + + # delete this line after test + # local = True + # Example SQS message for testing (copy and paste into SQS): + if local is True: + body = { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv", + "lexiscore_column": "address2uprn_lexiscore", + } + + s3_uri: str = body.get("s3_uri", "") + lexiscore_threshold: float = body.get("lexiscore_threshold", 0.5) + lexiscore_column: Optional[str] = body.get("lexiscore_column", None) + task_id: str = body.get("task_id", "") + sub_task_id: str = body.get("sub_task_id", "") + + if s3_uri == "": + raise RuntimeError("Missing s3_uri in message body") + + bucket, key = parse_s3_uri(s3_uri) + + # Assumption designing with address2uprn was ran first + csv_data = read_csv_from_s3_dict(bucket, key) + df = pd.DataFrame(csv_data) + # df = df.head(5) + + # If lexiscore_column is specified, use it; otherwise process all rows + if lexiscore_column and lexiscore_column in df.columns: + df[lexiscore_column] = pd.to_numeric(df[lexiscore_column], errors="coerce") + needs_processing = df[ + df[lexiscore_column].isna() | (df[lexiscore_column] < lexiscore_threshold) + ] + else: + # Default: process all rows + needs_processing = df + + grouped = needs_processing.groupby("postcode_clean") + + # Initialise new columns + df["ordnance_survey_address"] = None + df["ordnance_survey_uprn"] = None + df["ordnance_survey_lexiscore"] = None + + # Process each postcode group at a time + for postcode, group in grouped: + print(f"Processing postcode: {postcode} ({len(group)} rows)") + valid_group = AddressMatch.is_valid_postcode(postcode) + if not valid_group: + logger.warning(f"Postcode {postcode} is invalid, skipping") + for idx in group.index: + df.at[idx, "ordnance_survey_address"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_uprn"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_lexiscore"] = ( + "postcode not found in ordnance survey" + ) + continue + + postcode_cache = check_if_post_code_exists_in_db_cache(postcode) + if postcode_cache.empty: + logger.warning(f"No OS Places data for {postcode}") + for idx in group.index: + df.at[idx, "ordnance_survey_address"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_uprn"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_lexiscore"] = ( + "postcode not found in ordnance survey" + ) + continue + + for idx, row in group.iterrows(): + # Concatenate Address columns directly + ordnancy_survey_user_input = ( + str(row.get("Address 1", "")).strip() + + " " + + str(row.get("Address 2", "")).strip() + + " " + + str(row.get("Address 3", "")).strip() + ).strip() + + if not ordnancy_survey_user_input: + continue + + # Score against OS Places addresses + scores = postcode_cache["ADDRESS"].apply( + lambda addr: AddressMatch.score(ordnancy_survey_user_input, addr) + ) + best_idx = scores.idxmax() + best_score = scores[best_idx] + + if best_score <= 0: + continue + + df.at[idx, "ordnance_survey_address"] = postcode_cache.at[ + best_idx, "ADDRESS" + ] + df.at[idx, "ordnance_survey_uprn"] = postcode_cache.at[best_idx, "UPRN"] + df.at[idx, "ordnance_survey_lexiscore"] = best_score + + # Save results locally + if local: + df.to_csv("ordnance_survey_results.csv", index=False) + print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)") + + # Save results to S3 + if task_id and sub_task_id: + save_results_to_s3(df, task_id, sub_task_id) diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py new file mode 100644 index 00000000..be17f610 --- /dev/null +++ b/backend/scripts/combine_address2uprn_outputs.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +import argparse +import boto3 +import pandas as pd +from io import BytesIO + +BUCKET = "retrofit-data-dev" + + +def list_csv_files(task_id): + s3 = boto3.client("s3") + paginator = s3.get_paginator("list_objects_v2") + + prefix = f"ara_raw_outputs/{task_id}" + csv_files = [] + + for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix): + for obj in page.get("Contents", []): + key = obj["Key"] + if key.endswith(".csv"): + csv_files.append(key) + + return csv_files + + +def download_csv(key): + s3 = boto3.client("s3") + obj = s3.get_object(Bucket=BUCKET, Key=key) + return pd.read_csv(BytesIO(obj["Body"].read())) + + +def main(task_id, output): + print(f"Scanning task: {task_id}") + + csv_files = list_csv_files(task_id) + + if not csv_files: + print("No CSV files found") + return + + print(f"Found {len(csv_files)} CSV files") + + dfs = [] + for key in csv_files: + print(f"Downloading {key}") + df = download_csv(key) + dfs.append(df) + + combined = pd.concat(dfs, ignore_index=True) + + combined.to_csv(output, index=False) + + print(f"Combined CSV saved to {output}") + print(f"Total rows: {len(combined)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("task_id", help="Task ID folder in S3") + parser.add_argument("--output", default="combined.csv") + + args = parser.parse_args() + + main(args.task_id, args.output) diff --git a/backend/utils/__init__.py b/backend/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py new file mode 100644 index 00000000..411bb07c --- /dev/null +++ b/backend/utils/addressMatch.py @@ -0,0 +1,201 @@ +import re +from typing import Any, Optional +from difflib import SequenceMatcher +import requests + + +class AddressMatch: + def __init__(self): + return None + + @staticmethod + def score(a: str, b: str) -> float: + score: float = AddressMatch.levenshtein(a, b) + + return score + + @staticmethod + def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. + """ + POSTCODES_IO_VALIDATE_URL = ( + "https://api.postcodes.io/postcodes/{postcode}/validate" + ) + if not postcode_clean: + return False + + try: + resp = requests.get( + POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), + timeout=5, + ) + resp.raise_for_status() + return resp.json().get("result", False) + except requests.RequestException: + # Network issues, rate limits, etc. + return False + + @staticmethod + def normalise_address(s: str) -> str: + """ + Canonical UK-focused address normalisation. + + - Lowercases + - Removes punctuation (keeps / for flats) + - Normalises whitespace + - Applies synonym compression at token level + """ + + if not s: + return "" + + ADDRESS_SYNONYMS = { + # street types + "rd": "road", + "rd.": "road", + "st": "street", + "st.": "street", + "ave": "avenue", + "ave.": "avenue", + "ln": "lane", + "ln.": "lane", + "cres": "crescent", + "ct": "court", + "dr": "drive", + # flats / units + "apt": "flat", + "apartment": "flat", + "unit": "flat", + "ste": "suite", + # numbering noise + "no": "", + "no.": "", + } + # 1. lowercase + s = s.lower() + + # 1.5 split digit-letter suffixes + s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) + + # 2. remove punctuation except / + s = re.sub(r"[^\w\s/]", " ", s) + + # 3. normalise whitespace + s = re.sub(r"\s+", " ", s).strip() + + # 4. tokenise + synonym normalisation + tokens: list[str] = [] + for tok in s.split(): + replacement = ADDRESS_SYNONYMS.get(tok, tok) + if replacement: + tokens.append(replacement) + return " ".join(tokens) + + @staticmethod + def levenshtein(a: str, b: str) -> float: + """ + Address similarity score in [0, 1]. + + Strategy: + - Normalise + - Strongly penalise mismatched house/flat numbers + - Combine token overlap + character similarity + """ + + def extract_number_sequence(s: str) -> list[str]: + return re.findall(r"\d+[a-z]?", s) + + def extract_numbers(s: str) -> set[str]: + return set(extract_number_sequence(s)) + + def tokenise(s: str) -> set[str]: + return set(s.split()) + + def extract_building_number(s: str) -> Optional[str]: + """ + Extract the main building number (NOT flat/unit). + Assumes formats like: + - '42 moreton road' + - 'flat 3 42 moreton road' + """ + tokens = s.split() + + # remove flat/unit context + cleaned: list[Any] = [] + skip_next = False + for t in tokens: + if t in ("flat", "apt", "apartment", "unit"): + skip_next = True + continue + if skip_next: + skip_next = False + continue + cleaned.append(t) + + # first remaining number is building number + for t in cleaned: + if re.fullmatch(r"\d+[a-z]?", t): + return t + + return None + + a_norm = AddressMatch.normalise_address(a) + b_norm = AddressMatch.normalise_address(b) + + # --- hard signal: numbers --- + nums_a = extract_numbers(a_norm) + nums_b = extract_numbers(b_norm) + + if nums_a and not nums_b: + return 0.0 + + # No shared numbers at all → impossible match + if nums_a and nums_b and nums_a.isdisjoint(nums_b): + return 0.0 + + # 🔒 HARD GUARD: building number must match + bld_a = extract_building_number(a_norm) + bld_b = extract_building_number(b_norm) + + if bld_a and bld_b and bld_a != bld_b: + return 0.0 + + # --- order-sensitive flat/building guard --- + seq_a = extract_number_sequence(a_norm) + seq_b = extract_number_sequence(b_norm) + + has_flat_token_user = any( + tok in a_norm for tok in ("flat", "apt", "apartment", "unit") + ) + has_flat_token_epc = "flat" in b_norm + + if ( + len(seq_a) == 2 + and len(seq_b) >= 2 + and has_flat_token_epc + and not has_flat_token_user + and seq_a != seq_b[:2] + ): + return 0.0 + + # --- token similarity (order-independent) --- + toks_a: set[str] = tokenise(a_norm) + toks_b: set[str] = tokenise(b_norm) + + if not toks_a or not toks_b: + token_score = 0.0 + else: + token_score = len(toks_a & toks_b) / len(toks_a | toks_b) + + # --- character similarity (soft signal) --- + char_score: float = SequenceMatcher(None, a_norm, b_norm).ratio() + + # --- weighted blend --- + return round( + 0.65 * token_score + 0.35 * char_score, + 4, + ) diff --git a/backend/utils/subtasks.py b/backend/utils/subtasks.py new file mode 100644 index 00000000..041494e9 --- /dev/null +++ b/backend/utils/subtasks.py @@ -0,0 +1,95 @@ +# decorators/subtask_handler.py + +from functools import wraps +from typing import Callable, Any +from uuid import UUID +import json + +from backend.app.db.functions.tasks.Tasks import SubTaskInterface + + +def subtask_handler(): + """ + Decorator that wraps your existing handler and automatically: + + - Extracts task_id + sub_task_id from event + - Marks subtask as in progress + - Executes handler logic + - Marks subtask complete on success + - Marks failed on exception + """ + + def decorator(func: Callable[..., Any]): + + @wraps(func) + def wrapper(event: dict[str, Any], context: Any, *args, **kwargs): + + records = event.get("Records", [event]) + + interface = SubTaskInterface() + + for record in records: + + # ------------------------------- + # Parse body safely + # ------------------------------- + body = {} + + if isinstance(record.get("body"), str): + try: + body = json.loads(record["body"]) + except Exception: + body = {} + else: + body = record.get("body", {}) or {} + + task_id_raw = body.get("task_id") + subtask_id_raw = body.get("sub_task_id") + + task_id = UUID(task_id_raw) if isinstance(task_id_raw, str) else None + subtask_id = ( + UUID(subtask_id_raw) if isinstance(subtask_id_raw, str) else None + ) + + if not task_id or not subtask_id: + raise RuntimeError("task_id or sub_task_id missing") + + # ------------------------------- + # Mark in progress + # ------------------------------- + interface.update_subtask_status( + subtask_id=subtask_id, + status="in progress", + ) + + try: + # Pass the parsed body into your function + result = func(body, context, *args, **kwargs) + + # ------------------------------- + # Success → mark complete + # ------------------------------- + interface.update_subtask_status( + subtask_id=subtask_id, + status="complete", + outputs={"result": result} if result else None, + ) + + except Exception as e: + + # ------------------------------- + # Failure → mark failed + # ------------------------------- + interface.update_subtask_status( + subtask_id=subtask_id, + status="failed", + outputs={"error": str(e)}, + ) + + raise + + return None + + return wrapper + + return decorator diff --git a/infrastructure/terraform/cdn/main.tf b/infrastructure/terraform/cdn/main.tf new file mode 100644 index 00000000..56b2f52b --- /dev/null +++ b/infrastructure/terraform/cdn/main.tf @@ -0,0 +1,64 @@ +############################################ +# Load Shared Terraform State +############################################ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +############################################ +# Load FastAPI Terraform State +############################################ +data "terraform_remote_state" "fast_api" { + backend = "s3" + config = { + bucket = "ara-fast-api-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +############################################ +# Load CDN Certificate Terraform State +############################################ +data "terraform_remote_state" "cdn_certificate" { + backend = "s3" + config = { + bucket = "cdn-certificate-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +############################################ +# CloudFront for API +############################################ +module "cdn" { + source = "../modules/cloudfront" + + aliases = [data.terraform_remote_state.fast_api.outputs.domain_name] + + acm_certificate_arn = data.terraform_remote_state.cdn_certificate.outputs.certificate_arn + + origins = [ + # ---- S3 ---- + { + origin_type = "s3" + origin_domain_name = data.terraform_remote_state.shared.outputs.retrofit_datalake_bucket_domain_name + origin_id = "s3-origin" + bucket_id = data.terraform_remote_state.shared.outputs.retrofit_datalake_bucket_id + bucket_arn = data.terraform_remote_state.shared.outputs.retrofit_datalake_bucket_arn + }, + + # ---- API Gateway ---- + { + origin_type = "api" + origin_domain_name = replace(data.terraform_remote_state.fast_api.outputs.invoke_url, "/^https?://([^/]*).*/", "$1") + origin_id = "api-origin" + } + ] +} \ No newline at end of file diff --git a/infrastructure/terraform/cdn/provider.tf b/infrastructure/terraform/cdn/provider.tf new file mode 100644 index 00000000..186adc10 --- /dev/null +++ b/infrastructure/terraform/cdn/provider.tf @@ -0,0 +1,14 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "ara-cdn-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } +} \ No newline at end of file diff --git a/infrastructure/terraform/cdn/variables.tf b/infrastructure/terraform/cdn/variables.tf new file mode 100644 index 00000000..423f0b0f --- /dev/null +++ b/infrastructure/terraform/cdn/variables.tf @@ -0,0 +1,3 @@ +variable "stage" { + type = string +} \ No newline at end of file diff --git a/infrastructure/terraform/cdn_certificate/main.tf b/infrastructure/terraform/cdn_certificate/main.tf new file mode 100644 index 00000000..5c0c178c --- /dev/null +++ b/infrastructure/terraform/cdn_certificate/main.tf @@ -0,0 +1,28 @@ +############################################ +# Load FastAPI Terraform State +############################################ +data "terraform_remote_state" "fast_api" { + backend = "s3" + config = { + bucket = "ara-fast-api-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +############################################ +# Define Certificate +############################################ +module "cdn_certificate" { + source = "../modules/acm_certificate" + + providers = { + aws = aws.us_east_1 + } + + domain_name = data.terraform_remote_state.fast_api.outputs.domain_name + + tags = { + Environment = var.stage + } +} \ No newline at end of file diff --git a/infrastructure/terraform/cdn_certificate/outputs.tf b/infrastructure/terraform/cdn_certificate/outputs.tf new file mode 100644 index 00000000..e72c6e13 --- /dev/null +++ b/infrastructure/terraform/cdn_certificate/outputs.tf @@ -0,0 +1,3 @@ +output "certificate_arn" { + value = module.cdn_certificate.certificate_arn +} \ No newline at end of file diff --git a/infrastructure/terraform/cdn_certificate/provider.tf b/infrastructure/terraform/cdn_certificate/provider.tf new file mode 100644 index 00000000..38b01d82 --- /dev/null +++ b/infrastructure/terraform/cdn_certificate/provider.tf @@ -0,0 +1,23 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "cdn-certificate-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } +} + +provider "aws" { + region = var.region +} + +provider "aws" { + alias = "us_east_1" + region = "us-east-1" +} \ No newline at end of file diff --git a/infrastructure/terraform/cdn_certificate/variables.tf b/infrastructure/terraform/cdn_certificate/variables.tf new file mode 100644 index 00000000..423f0b0f --- /dev/null +++ b/infrastructure/terraform/cdn_certificate/variables.tf @@ -0,0 +1,3 @@ +variable "stage" { + type = string +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/_template/main.tf b/infrastructure/terraform/lambda/_template/main.tf index c6015ea1..81b1c7f1 100644 --- a/infrastructure/terraform/lambda/_template/main.tf +++ b/infrastructure/terraform/lambda/_template/main.tf @@ -26,7 +26,7 @@ data "terraform_remote_state" "shared" { } module "lambda" { - source = "../modules/lambda_with_sqs" + source = "../../modules/lambda_with_sqs" name = REPLACE ME #"address2uprn" for example stage = var.stage diff --git a/infrastructure/terraform/lambda/_template/provider.tf b/infrastructure/terraform/lambda/_template/provider.tf index 37c412ce..3d66f392 100644 --- a/infrastructure/terraform/lambda/_template/provider.tf +++ b/infrastructure/terraform/lambda/_template/provider.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.16" + version = ">= 5.0" } } diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index 2d185497..bc6f9e67 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -15,7 +15,7 @@ locals { } module "address2uprn" { - source = "../modules/lambda_with_sqs" + source = "../../modules/lambda_with_sqs" name = "address2uprn" stage = var.stage @@ -33,19 +33,6 @@ module "address2uprn" { LOG_LEVEL = "info" DB_USERNAME = local.db_credentials.db_assessment_model_username DB_PASSWORD = local.db_credentials.db_assessment_model_password - GOOGLE_SOLAR_API_KEY = "test" - SAP_PREDICTIONS_BUCKET = "test" - CARBON_PREDICTIONS_BUCKET = "test" - HEAT_PREDICTIONS_BUCKET = "test" - HEATING_KWH_PREDICTIONS_BUCKET = "test" - HOTWATER_KWH_PREDICTIONS_BUCKET = "test" - API_KEY = "test" - ENVIRONMENT = "test" - SECRET_KEY = "test" - PLAN_TRIGGER_BUCKET = "test" - DATA_BUCKET = "test" - ENGINE_SQS_URL = "test" - ENERGY_ASSESSMENTS_BUCKET = "test" S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name }, ) diff --git a/infrastructure/terraform/lambda/address2UPRN/provider.tf b/infrastructure/terraform/lambda/address2UPRN/provider.tf index ad873717..3cfa2400 100644 --- a/infrastructure/terraform/lambda/address2UPRN/provider.tf +++ b/infrastructure/terraform/lambda/address2UPRN/provider.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.16" + version = ">= 5.0" } } diff --git a/infrastructure/terraform/lambda/categorisation/main.tf b/infrastructure/terraform/lambda/categorisation/main.tf index b7193da4..01e16213 100644 --- a/infrastructure/terraform/lambda/categorisation/main.tf +++ b/infrastructure/terraform/lambda/categorisation/main.tf @@ -16,7 +16,7 @@ locals { } module "lambda" { - source = "../modules/lambda_with_sqs" + source = "../../modules/lambda_with_sqs" name = "categorisation" stage = var.stage diff --git a/infrastructure/terraform/lambda/categorisation/outputs.tf b/infrastructure/terraform/lambda/categorisation/outputs.tf new file mode 100644 index 00000000..8e33b8e0 --- /dev/null +++ b/infrastructure/terraform/lambda/categorisation/outputs.tf @@ -0,0 +1,9 @@ +output "categorisation_queue_url" { + value = module.lambda.queue_url + description = "URL of the Categorisation SQS queue" +} + +output "categorisation_queue_arn" { + value = module.lambda.queue_arn + description = "ARN of the Categorisation SQS queue" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/categorisation/provider.tf b/infrastructure/terraform/lambda/categorisation/provider.tf index fe497c81..30e73ed2 100644 --- a/infrastructure/terraform/lambda/categorisation/provider.tf +++ b/infrastructure/terraform/lambda/categorisation/provider.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.16" + version = ">= 5.0" } } diff --git a/infrastructure/terraform/lambda/condition-etl/main.tf b/infrastructure/terraform/lambda/condition-etl/main.tf index 0128f975..d654223c 100644 --- a/infrastructure/terraform/lambda/condition-etl/main.tf +++ b/infrastructure/terraform/lambda/condition-etl/main.tf @@ -17,7 +17,7 @@ locals { module "lambda" { - source = "../modules/lambda_with_sqs" + source = "../../modules/lambda_with_sqs" name = "condition-etl" stage = var.stage diff --git a/infrastructure/terraform/lambda/condition-etl/provider.tf b/infrastructure/terraform/lambda/condition-etl/provider.tf index c633d238..f7adf65a 100644 --- a/infrastructure/terraform/lambda/condition-etl/provider.tf +++ b/infrastructure/terraform/lambda/condition-etl/provider.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.16" + version = ">= 5.0" } } diff --git a/infrastructure/terraform/lambda/engine/main.tf b/infrastructure/terraform/lambda/engine/main.tf index 9d44c9ed..1f3ce017 100644 --- a/infrastructure/terraform/lambda/engine/main.tf +++ b/infrastructure/terraform/lambda/engine/main.tf @@ -17,15 +17,17 @@ locals { module "lambda" { - source = "../modules/lambda_with_sqs" + source = "../../modules/lambda_with_sqs" name = "engine" stage = var.stage image_uri = local.image_uri - # Optional: Set maximum_concurrency to limit concurrent SQS-triggered invocations (2-1000) maximum_concurrency = var.maximum_concurrency + batch_size = var.batch_size + timeout = var.timeout + memory_size = var.memory_size environment = merge( { @@ -42,7 +44,6 @@ module "lambda" { DB_PORT = var.db_port API_KEY = var.api_key SECRET_KEY = var.secret_key - DOMAIN_NAME = var.domain_name EPC_AUTH_TOKEN = var.epc_auth_token GOOGLE_SOLAR_API_KEY = var.google_solar_api_key diff --git a/infrastructure/terraform/lambda/engine/outputs.tf b/infrastructure/terraform/lambda/engine/outputs.tf new file mode 100644 index 00000000..c59e0809 --- /dev/null +++ b/infrastructure/terraform/lambda/engine/outputs.tf @@ -0,0 +1,9 @@ +output "ara_engine_queue_url" { + value = module.lambda.queue_url + description = "URL of the Engine SQS queue" +} + +output "ara_engine_queue_arn" { + value = module.lambda.queue_arn + description = "ARN of the Engine SQS queue" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/engine/provider.tf b/infrastructure/terraform/lambda/engine/provider.tf index 2895d039..74021fd0 100644 --- a/infrastructure/terraform/lambda/engine/provider.tf +++ b/infrastructure/terraform/lambda/engine/provider.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.16" + version = ">= 5.0" } } diff --git a/infrastructure/terraform/lambda/engine/variables.tf b/infrastructure/terraform/lambda/engine/variables.tf index 9805d409..bf0a42a2 100644 --- a/infrastructure/terraform/lambda/engine/variables.tf +++ b/infrastructure/terraform/lambda/engine/variables.tf @@ -23,6 +23,23 @@ variable "maximum_concurrency" { description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." } +variable "batch_size" { + type = number + default = 1 +} + +variable "timeout" { + type = number + default = 900 + description = "Lambda timeout in seconds" +} + +variable "memory_size" { + type = number + default = 3008 + description = "Lambda memory size in MB" +} + variable "db_host" { type = string sensitive = true @@ -48,10 +65,6 @@ variable "secret_key" { sensitive = true } -variable "domain_name" { - type = string -} - variable "epc_auth_token" { type = string sensitive = true diff --git a/infrastructure/terraform/lambda/fast-api/main.tf b/infrastructure/terraform/lambda/fast-api/main.tf new file mode 100644 index 00000000..f71b6f60 --- /dev/null +++ b/infrastructure/terraform/lambda/fast-api/main.tf @@ -0,0 +1,124 @@ +############################################ +# Load Terraform State +############################################ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +data "terraform_remote_state" "engine" { + backend = "s3" + config = { + bucket = "ara-engine-terraform-state", + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +data "terraform_remote_state" "categorisation" { + backend = "s3" + config = { + bucket = "categorisation-terraform-state", + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +############################################ +# Load Credentials +############################################ +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + +############################################ +# FastAPI Lambda + API Gateway +############################################ +module "fastapi" { + source = "../../modules/lambda_with_api_gateway" + + name = "fastapi" + stage = var.stage + source_dir = "${path.root}/../../../../" + handler = "backend.app.main.handler" + runtime = "python3.11" + timeout = 600 + memory_size = 512 + artifact_bucket = data.terraform_remote_state.shared.outputs.ara_fast_api_state_bucket + requirements_file = "${path.root}/../../../../backend/app/requirements/requirements.txt" + + domain_name = "api.${var.domain_name}" + + environment = { + ENVIRONMENT = var.stage + API_KEY = var.api_key + SECRET_KEY = var.secret_key + # DOMAIN_NAME = var.domain_name + EPC_AUTH_TOKEN = var.epc_auth_token + GOOGLE_SOLAR_API_KEY = var.google_solar_api_key + + DB_HOST = var.db_host + DB_NAME = var.db_name + DB_PORT = var.db_port + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + + PLAN_TRIGGER_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_plan_trigger_bucket_name + DATA_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name + SAP_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_sap_predictions_bucket_name + CARBON_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_carbon_predictions_bucket_name + HEAT_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_heat_predictions_bucket_name + HEATING_KWH_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_heating_kwh_predictions_bucket_name + HOTWATER_KWH_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_hotwater_kwh_predictions_bucket_name + ENERGY_ASSESSMENTS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_energy_assessments_bucket_name + + ENGINE_SQS_URL = data.terraform_remote_state.engine.outputs.ara_engine_queue_url + CATEGORISATION_SQS_URL = data.terraform_remote_state.categorisation.outputs.categorisation_queue_url + } +} + +############################################ +# IAM policy attachments +############################################ +# SQS +module "fastapi_sqs_policy" { + source = "../../modules/general_iam_policy" + + policy_name = "fastapi-sqs-send-${var.stage}" + policy_description = "Allow FastAPI to send messages to engine & categorisation queues" + + actions = [ + "sqs:SendMessage" + ] + + resources = [ + data.terraform_remote_state.engine.outputs.ara_engine_queue_arn, + data.terraform_remote_state.categorisation.outputs.categorisation_queue_arn + ] + + conditions = null + + tags = { + Service = "fastapi" + Stage = var.stage + } +} + +resource "aws_iam_role_policy_attachment" "fastapi_sqs_send" { + role = module.fastapi.role_name + policy_arn = module.fastapi_sqs_policy.policy_arn +} + +# S3 +resource "aws_iam_role_policy_attachment" "fastapi_s3_read_and_write" { + role = module.fastapi.role_name + policy_arn = data.terraform_remote_state.shared.outputs.fast_api_s3_read_and_write_arn +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/fast-api/outputs.tf b/infrastructure/terraform/lambda/fast-api/outputs.tf new file mode 100644 index 00000000..c9fc6f86 --- /dev/null +++ b/infrastructure/terraform/lambda/fast-api/outputs.tf @@ -0,0 +1,7 @@ +output "domain_name" { + value = module.fastapi.domain_name +} + +output "invoke_url" { + value = module.fastapi.invoke_url +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/fast-api/provider.tf b/infrastructure/terraform/lambda/fast-api/provider.tf new file mode 100644 index 00000000..afe6f3f6 --- /dev/null +++ b/infrastructure/terraform/lambda/fast-api/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "ara-fast-api-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/fast-api/variables.tf b/infrastructure/terraform/lambda/fast-api/variables.tf new file mode 100644 index 00000000..a3157590 --- /dev/null +++ b/infrastructure/terraform/lambda/fast-api/variables.tf @@ -0,0 +1,44 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + type = string +} + +variable "db_host" { + type = string +} + +variable "db_name" { + type = string +} + +variable "db_port" { + type = string +} + +variable "api_key" { + type = string + sensitive = true +} + +variable "secret_key" { + type = string + sensitive = true +} + +variable "domain_name" { + type = string +} + +variable "epc_auth_token" { + type = string + sensitive = true +} + +variable "google_solar_api_key" { + type = string + sensitive = true +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/main.tf b/infrastructure/terraform/lambda/ordnanceSurvey/main.tf new file mode 100644 index 00000000..ace03ffc --- /dev/null +++ b/infrastructure/terraform/lambda/ordnanceSurvey/main.tf @@ -0,0 +1,45 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + +module "ordnance" { + source = "../../modules/lambda_with_sqs" + + name = "ordnanceSurvey" #"address2uprn" for example + stage = var.stage + + image_uri = local.image_uri + + timeout = 900 + + # Optional: Set maximum_concurrency to limit concurrent SQS-triggered invocations (2-1000) + maximum_concurrency = var.maximum_concurrency + + environment = merge( + { + STAGE = var.stage + LOG_LEVEL = "info" + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name + ORDNANCE_SURVEY_API_KEY = var.ordnance_survey_api_key + }, + ) +} + +# Attach S3 read policy to the Lambda execution role +resource "aws_iam_role_policy_attachment" "ordanceSurvey_read_and_write" { + role = module.ordnance.role_name + policy_arn = data.terraform_remote_state.shared.outputs.ordnance_s3_read_and_write_arn +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf b/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf new file mode 100644 index 00000000..12bd0f85 --- /dev/null +++ b/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "ordnance-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf b/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf new file mode 100644 index 00000000..936aebc9 --- /dev/null +++ b/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf @@ -0,0 +1,43 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "maximum_concurrency" { + type = number + default = null + description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." +} + +variable "batch_size" { + type = number + default = 1 +} + +variable "ordnance_survey_api_key" { + type = string + sensitive = true +} + + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index d37a01c9..94c5cd4e 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -26,7 +26,7 @@ data "terraform_remote_state" "address2uprn" { } module "lambda" { - source = "../modules/lambda_with_sqs" + source = "../../modules/lambda_with_sqs" name = "postcode-splitter" stage = var.stage diff --git a/infrastructure/terraform/lambda/postcodeSplitter/provider.tf b/infrastructure/terraform/lambda/postcodeSplitter/provider.tf index dbe323f2..5749143d 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/provider.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/provider.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.16" + version = ">= 5.0" } } diff --git a/infrastructure/terraform/modules/acm_certificate/main.tf b/infrastructure/terraform/modules/acm_certificate/main.tf new file mode 100644 index 00000000..9f813a77 --- /dev/null +++ b/infrastructure/terraform/modules/acm_certificate/main.tf @@ -0,0 +1,11 @@ +resource "aws_acm_certificate" "this" { + domain_name = var.domain_name + subject_alternative_names = var.subject_alternative_names + validation_method = "DNS" + + lifecycle { + create_before_destroy = false + } + + tags = var.tags +} \ No newline at end of file diff --git a/infrastructure/terraform/modules/acm_certificate/outputs.tf b/infrastructure/terraform/modules/acm_certificate/outputs.tf new file mode 100644 index 00000000..990a6db9 --- /dev/null +++ b/infrastructure/terraform/modules/acm_certificate/outputs.tf @@ -0,0 +1,7 @@ +output "certificate_arn" { + value = aws_acm_certificate.this.arn +} + +output "domain_validation_options" { + value = aws_acm_certificate.this.domain_validation_options +} \ No newline at end of file diff --git a/infrastructure/terraform/modules/acm_certificate/variables.tf b/infrastructure/terraform/modules/acm_certificate/variables.tf new file mode 100644 index 00000000..bd30501f --- /dev/null +++ b/infrastructure/terraform/modules/acm_certificate/variables.tf @@ -0,0 +1,16 @@ +variable "domain_name" { + description = "Primary domain name for the certificate" + type = string +} + +variable "subject_alternative_names" { + description = "Additional domains for the certificate" + type = list(string) + default = [] +} + +variable "tags" { + description = "Tags to apply to the certificate" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/infrastructure/terraform/modules/cloudfront/main.tf b/infrastructure/terraform/modules/cloudfront/main.tf index 281ff09f..03082604 100644 --- a/infrastructure/terraform/modules/cloudfront/main.tf +++ b/infrastructure/terraform/modules/cloudfront/main.tf @@ -1,65 +1,165 @@ -resource "aws_cloudfront_distribution" "s3_distribution" { - origin { - domain_name = var.bucket_domain_name - origin_id = "S3-${var.bucket_name}" +############################################# +# Use Managed Caching and Forwarding Policies +############################################# +data "aws_cloudfront_cache_policy" "caching_disabled" { + name = "Managed-CachingDisabled" +} - s3_origin_config { - origin_access_identity = aws_cloudfront_origin_access_identity.oai.cloudfront_access_identity_path +data "aws_cloudfront_origin_request_policy" "all_viewer_except_host_header" { + name = "Managed-AllViewerExceptHostHeader" +} + +############################################ +# CloudFront Distribution +############################################ + +resource "aws_cloudfront_distribution" "this" { + + ########################################## + # Origins + ########################################## + + dynamic "origin" { + for_each = { for o in var.origins : o.origin_id => o } + + content { + domain_name = origin.value.origin_domain_name + origin_id = origin.value.origin_id + + ###################################### + # S3 Origin + ###################################### + dynamic "s3_origin_config" { + for_each = origin.value.origin_type == "s3" ? [1] : [] + + content { + origin_access_identity = aws_cloudfront_origin_access_identity.oai[origin.key].cloudfront_access_identity_path + } + } + + ###################################### + # API Gateway Origin + ###################################### + dynamic "custom_origin_config" { + for_each = origin.value.origin_type == "api" ? [1] : [] + + content { + http_port = 80 + https_port = 443 + origin_protocol_policy = "https-only" + origin_ssl_protocols = ["TLSv1.2"] + } + } } } enabled = true + aliases = var.aliases + + ########################################## + # Default Cache Behavior (S3) + ########################################## default_cache_behavior { - allowed_methods = ["GET", "HEAD"] - cached_methods = ["GET", "HEAD"] - target_origin_id = "S3-${var.bucket_name}" + target_origin_id = "s3-origin" + viewer_protocol_policy = "redirect-to-https" - compress = true + + allowed_methods = ["GET", "HEAD"] + cached_methods = ["GET", "HEAD"] forwarded_values { query_string = false + cookies { forward = "none" } } - min_ttl = 0 - default_ttl = 86400 - max_ttl = 31536000 + compress = true + min_ttl = 0 + default_ttl = 3600 + max_ttl = 86400 + } + + ########################################## + # API Behavior + ########################################## + + ordered_cache_behavior { + path_pattern = "/v1/*" + target_origin_id = "api-origin" + + viewer_protocol_policy = "redirect-to-https" + + allowed_methods = ["GET","HEAD","OPTIONS","PUT","POST","PATCH","DELETE"] + cached_methods = ["GET","HEAD"] + + cache_policy_id = data.aws_cloudfront_cache_policy.caching_disabled.id + origin_request_policy_id = data.aws_cloudfront_origin_request_policy.all_viewer_except_host_header.id } price_class = "PriceClass_All" + ########################################## + # Geo Restrictions + ########################################## + restrictions { geo_restriction { restriction_type = "none" } } + ########################################## + # SSL Certificate + ########################################## + viewer_certificate { - cloudfront_default_certificate = true + acm_certificate_arn = var.acm_certificate_arn + ssl_support_method = var.acm_certificate_arn != null ? "sni-only" : null + minimum_protocol_version = var.acm_certificate_arn != null ? "TLSv1.2_2021" : null + + cloudfront_default_certificate = var.acm_certificate_arn == null } } +############################################ +# Origin Access Identities (S3 only) +############################################ + resource "aws_cloudfront_origin_access_identity" "oai" { - comment = "OAI for ${var.bucket_name}" + for_each = { + for o in var.origins : o.origin_id => o + if o.origin_type == "s3" + } + + comment = "OAI for ${each.key}" } +############################################ +# S3 Bucket Policy (S3 only) +############################################ + resource "aws_s3_bucket_policy" "bucket_policy" { - bucket = var.bucket_id + for_each = { + for o in var.origins : o.origin_id => o + if o.origin_type == "s3" + } + + bucket = each.value.bucket_id policy = jsonencode({ - Version = "2012-10-17" + Version = "2012-10-17" Statement = [ { - Effect = "Allow" + Effect = "Allow" Principal = { - AWS = "arn:aws:iam::cloudfront:user/CloudFront Origin Access Identity ${aws_cloudfront_origin_access_identity.oai.id}" + AWS = aws_cloudfront_origin_access_identity.oai[each.key].iam_arn } Action = "s3:GetObject" - Resource = "${var.bucket_arn}/*" - }, + Resource = "${each.value.bucket_arn}/*" + } ] }) -} +} \ No newline at end of file diff --git a/infrastructure/terraform/modules/cloudfront/variables.tf b/infrastructure/terraform/modules/cloudfront/variables.tf index 88f770a8..4721d3d1 100644 --- a/infrastructure/terraform/modules/cloudfront/variables.tf +++ b/infrastructure/terraform/modules/cloudfront/variables.tf @@ -1,24 +1,20 @@ -variable "bucket_name" { - description = "The name of the bucket" - type = string +variable "origins" { + type = list(object({ + origin_type = string # "s3" or "api" + origin_domain_name = string + origin_id = string + + bucket_id = optional(string) + bucket_arn = optional(string) + })) } -variable "stage" { - description = "The deployment stage" - type = string +variable "aliases" { + type = list(string) } -variable "bucket_id" { - description = "The ID of the S3 bucket" - type = string -} - -variable "bucket_arn" { - description = "The ARN of the S3 bucket" - type = string -} - -variable "bucket_domain_name" { - description = "The regional domain name of the S3 bucket" +variable "acm_certificate_arn" { + description = "ACM certificate ARN for custom aliases" type = string + default = null } \ No newline at end of file diff --git a/infrastructure/terraform/modules/lambda_service_zip/main.tf b/infrastructure/terraform/modules/lambda_service_zip/main.tf new file mode 100644 index 00000000..d52f5ba4 --- /dev/null +++ b/infrastructure/terraform/modules/lambda_service_zip/main.tf @@ -0,0 +1,27 @@ +resource "aws_lambda_function" "this" { + function_name = var.name + role = var.role_arn + package_type = "Zip" + # filename = var.filename + source_code_hash = var.source_code_hash + handler = var.handler + runtime = var.runtime + timeout = var.timeout + memory_size = var.memory_size + publish = true + + s3_bucket = var.s3_bucket + s3_key = var.s3_key + + environment { + variables = var.environment + } +} + +output "lambda_arn" { + value = aws_lambda_function.this.arn +} + +output "function_name" { + value = aws_lambda_function.this.function_name +} diff --git a/infrastructure/terraform/modules/lambda_service_zip/variables.tf b/infrastructure/terraform/modules/lambda_service_zip/variables.tf new file mode 100644 index 00000000..095d4a81 --- /dev/null +++ b/infrastructure/terraform/modules/lambda_service_zip/variables.tf @@ -0,0 +1,20 @@ +variable "name" { type = string } +variable "role_arn" { type = string } +# variable "filename" { type = string } +variable "source_code_hash" { type = string } +variable "handler" { type = string } +variable "runtime" { type = string } +variable "timeout" { + type = number + default = 30 +} +variable "memory_size" { + type = number + default = 128 +} +variable "environment" { + type = map(string) + default = {} +} +variable "s3_bucket" { type = string } +variable "s3_key" { type = string } \ No newline at end of file diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/main.tf b/infrastructure/terraform/modules/lambda_with_api_gateway/main.tf new file mode 100644 index 00000000..99d12efa --- /dev/null +++ b/infrastructure/terraform/modules/lambda_with_api_gateway/main.tf @@ -0,0 +1,117 @@ +############################################ +# IAM role +############################################ +module "role" { + source = "../lambda_execution_role" + name = "${var.name}-lambda-${var.stage}" +} + +############################################ +# Cloudwatch log group +############################################ +resource "aws_cloudwatch_log_group" "api_logs" { + name = "/aws/apigateway/${var.name}-${var.stage}" + retention_in_days = 14 +} + +############################################ +# Install python packages +############################################ +resource "null_resource" "pip_install" { + count = var.requirements_file != null ? 1 : 0 + + triggers = { + always_run = timestamp() + } + + provisioner "local-exec" { + command = "pip install -r ${var.requirements_file} -t ${var.source_dir} --platform manylinux2014_x86_64 --implementation cp --python-version 3.11 --only-binary=:all: --upgrade" + } +} + +############################################ +# Zip the source code +############################################ +data "archive_file" "this" { + depends_on = [null_resource.pip_install] + type = "zip" + source_dir = var.source_dir + output_path = "${path.module}/lambda_package.zip" + excludes = var.zip_excludes +} + +############################################ +# Upload zip to S3 +############################################ +resource "aws_s3_object" "lambda_zip" { + bucket = var.artifact_bucket + key = "env:/${var.stage}/${var.name}.zip" + source = data.archive_file.this.output_path + etag = data.archive_file.this.output_md5 +} + +############################################ +# Lambda +############################################ +module "lambda" { + source = "../lambda_service_zip" + + name = "${var.name}-${var.stage}" + role_arn = module.role.role_arn + s3_bucket = var.artifact_bucket + s3_key = aws_s3_object.lambda_zip.key + source_code_hash = data.archive_file.this.output_base64sha256 + handler = var.handler + runtime = var.runtime + timeout = var.timeout + memory_size = var.memory_size + environment = var.environment +} + +############################################ +# API Gateway +############################################ +resource "aws_apigatewayv2_api" "this" { + name = "${var.name}-api-${var.stage}" + protocol_type = "HTTP" +} + +resource "aws_apigatewayv2_stage" "this" { + api_id = aws_apigatewayv2_api.this.id + name = "$default" + auto_deploy = true + + access_log_settings { + destination_arn = aws_cloudwatch_log_group.api_logs.arn + + format = jsonencode({ + requestId = "$context.requestId" + domainName = "$context.domainName" + path = "$context.path" + status = "$context.status" + sourceIp = "$context.identity.sourceIp" + userAgent = "$context.identity.userAgent" + }) + } +} + +resource "aws_apigatewayv2_integration" "this" { + api_id = aws_apigatewayv2_api.this.id + integration_type = "AWS_PROXY" + integration_uri = module.lambda.lambda_arn + payload_format_version = "2.0" +} + +resource "aws_apigatewayv2_route" "catch_all" { + api_id = aws_apigatewayv2_api.this.id + route_key = "$default" + target = "integrations/${aws_apigatewayv2_integration.this.id}" +} + +resource "aws_lambda_permission" "apigw_invoke" { + statement_id = "AllowAPIGatewayInvoke" + action = "lambda:InvokeFunction" + function_name = module.lambda.lambda_arn + principal = "apigateway.amazonaws.com" + source_arn = "${aws_apigatewayv2_api.this.execution_arn}/*/*" +} diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/outputs.tf b/infrastructure/terraform/modules/lambda_with_api_gateway/outputs.tf new file mode 100644 index 00000000..eae0f7d7 --- /dev/null +++ b/infrastructure/terraform/modules/lambda_with_api_gateway/outputs.tf @@ -0,0 +1,11 @@ +output "role_name" { + value = module.role.role_name +} + +output "domain_name" { + value = var.domain_name +} + +output "invoke_url" { + value = aws_apigatewayv2_stage.this.invoke_url +} \ No newline at end of file diff --git a/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf b/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf new file mode 100644 index 00000000..95e5acd9 --- /dev/null +++ b/infrastructure/terraform/modules/lambda_with_api_gateway/variables.tf @@ -0,0 +1,48 @@ +variable "name" { type = string } +variable "stage" { type = string } +variable "source_dir" { type = string } +variable "handler" { type = string } +variable "runtime" { type = string } + +variable "zip_excludes" { + type = list(string) + default = [ + "**/__pycache__/**", + "**/*.pyc", + "**/.pytest_cache/**", + "**/tests/**", + "**/infrastructure/**" + ] +} + +variable "timeout" { + type = number + default = 600 +} +variable "memory_size" { + type = number + default = 512 +} +variable "environment" { + type = map(string) + default = {} +} + +variable "domain_name" { + type = string + default = null +} +variable "certificate_arn" { + type = string + default = null +} +variable "route53_zone_id" { + type = string + default = null +} +variable "artifact_bucket" { type = string } + +variable "requirements_file" { + type = string + default = null +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf b/infrastructure/terraform/modules/lambda_with_sqs/main.tf similarity index 85% rename from infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf rename to infrastructure/terraform/modules/lambda_with_sqs/main.tf index 74345d24..35626487 100644 --- a/infrastructure/terraform/lambda/modules/lambda_with_sqs/main.tf +++ b/infrastructure/terraform/modules/lambda_with_sqs/main.tf @@ -2,7 +2,7 @@ # IAM role ############################################ module "role" { - source = "../../../modules/lambda_execution_role" + source = "../lambda_execution_role" name = "${var.name}-lambda-${var.stage}" } @@ -14,7 +14,7 @@ output "role_name" { # SQS queue + DLQ ############################################ module "queue" { - source = "../../../modules/sqs_queue" + source = "../sqs_queue" name = "${var.name}-queue-${var.stage}" } @@ -22,7 +22,7 @@ module "queue" { # Lambda ############################################ module "lambda" { - source = "../../../modules/lambda_service" + source = "../lambda_service" name = "${var.name}-${var.stage}" role_arn = module.role.role_arn @@ -38,7 +38,7 @@ module "lambda" { # SQS → Lambda trigger ############################################ module "sqs_trigger" { - source = "../../../modules/lambda_sqs_trigger" + source = "../lambda_sqs_trigger" lambda_arn = module.lambda.lambda_arn lambda_role_name = module.role.role_name diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf b/infrastructure/terraform/modules/lambda_with_sqs/outputs.tf similarity index 100% rename from infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf rename to infrastructure/terraform/modules/lambda_with_sqs/outputs.tf diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/variables.tf b/infrastructure/terraform/modules/lambda_with_sqs/variables.tf similarity index 100% rename from infrastructure/terraform/lambda/modules/lambda_with_sqs/variables.tf rename to infrastructure/terraform/modules/lambda_with_sqs/variables.tf diff --git a/infrastructure/terraform/modules/s3_iam_policy/main.tf b/infrastructure/terraform/modules/s3_iam_policy/main.tf index 397bd963..0ef5c4be 100644 --- a/infrastructure/terraform/modules/s3_iam_policy/main.tf +++ b/infrastructure/terraform/modules/s3_iam_policy/main.tf @@ -2,9 +2,10 @@ locals { # Generate full resource ARNs by combining bucket ARNs with resource paths resources = flatten([ - for bucket_arn in var.bucket_arns : [ - for path in var.resource_paths : "${bucket_arn}${path}" - ] + for bucket_arn in var.bucket_arns : concat( + [bucket_arn], # bare ARN for bucket-level actions like ListBucket + [for path in var.resource_paths : "${bucket_arn}${path}"] + ) ]) } diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index c19e3a0c..fdc7f203 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -2,20 +2,20 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.16" + version = ">= 5.0" } } backend "s3" { - bucket = "assessment-model-terraform-state" - region = "eu-west-2" - key = "terraform.tfstate" + bucket = "assessment-model-terraform-state" + region = "eu-west-2" + key = "terraform.tfstate" } required_version = ">= 1.2.0" } provider "aws" { - region = var.region + region = var.region } # Additional provider for resources that need to be in us-east-1, specifically the SSL certificate @@ -47,30 +47,30 @@ resource "aws_security_group" "allow_db" { ingress { # TLS (change to whatever ports you need) - from_port = 5432 - to_port = 5432 - protocol = "tcp" + from_port = 5432 + to_port = 5432 + protocol = "tcp" cidr_blocks = ["0.0.0.0/0"] } egress { - from_port = 0 - to_port = 0 - protocol = "-1" + from_port = 0 + to_port = 0 + protocol = "-1" cidr_blocks = ["0.0.0.0/0"] } } resource "aws_db_instance" "default" { - allocated_storage = var.allocated_storage - engine = "postgres" - engine_version = "14.17" - instance_class = var.instance_class - db_name = var.database_name - username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"] - password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"] - parameter_group_name = "default.postgres14" - skip_final_snapshot = true + allocated_storage = var.allocated_storage + engine = "postgres" + engine_version = "14.17" + instance_class = var.instance_class + db_name = var.database_name + username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"] + password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"] + parameter_group_name = "default.postgres14" + skip_final_snapshot = true vpc_security_group_ids = [aws_security_group.allow_db.id] lifecycle { prevent_destroy = true @@ -87,7 +87,7 @@ resource "aws_db_instance" "default" { storage_type = "gp3" # Automated backups configuration - backup_retention_period = 14 + backup_retention_period = 14 backup_window = "03:00-04:00" maintenance_window = "Sun:02:00-Sun:02:30" copy_tags_to_snapshot = true @@ -103,7 +103,7 @@ module "s3_presignable_bucket" { } output "retrofit_plan_trigger_bucket_name" { - value = module.s3_presignable_bucket.bucket_name + value = module.s3_presignable_bucket.bucket_name description = "Name of the retrofit plan trigger bucket" } @@ -127,6 +127,22 @@ module "s3" { allowed_origins = var.allowed_origins } +output "retrofit_datalake_bucket_id" { + value = module.s3.bucket_id +} + +output "retrofit_datalake_bucket_arn" { + value = module.s3.bucket_arn +} + +output "retrofit_datalake_bucket_domain_name" { + value = module.s3.bucket_domain_name +} + +output "retrofit_datalake_bucket_name" { + value = module.s3.bucket_name +} + module "model_directory" { source = "../modules/s3" bucketname = "retrofit-model-directory-${var.stage}" @@ -140,7 +156,7 @@ module "retrofit_sap_predictions" { } output "retrofit_sap_predictions_bucket_name" { - value = module.retrofit_sap_predictions.bucket_name + value = module.retrofit_sap_predictions.bucket_name description = "Name of the retrofit SAP predictions bucket" } @@ -151,7 +167,7 @@ module "retrofit_sap_data" { } output "retrofit_sap_data_bucket_name" { - value = module.retrofit_sap_data.bucket_name + value = module.retrofit_sap_data.bucket_name description = "Name of the retrofit SAP data bucket" } @@ -162,7 +178,7 @@ module "retrofit_carbon_predictions" { } output "retrofit_carbon_predictions_bucket_name" { - value = module.retrofit_carbon_predictions.bucket_name + value = module.retrofit_carbon_predictions.bucket_name description = "Name of the retrofit carbon predictions bucket" } @@ -173,7 +189,7 @@ module "retrofit_heat_predictions" { } output "retrofit_heat_predictions_bucket_name" { - value = module.retrofit_heat_predictions.bucket_name + value = module.retrofit_heat_predictions.bucket_name description = "Name of the retrofit heat predictions bucket" } @@ -202,7 +218,7 @@ module "retrofit_heating_kwh_predictions" { } output "retrofit_heating_kwh_predictions_bucket_name" { - value = module.retrofit_heating_kwh_predictions.bucket_name + value = module.retrofit_heating_kwh_predictions.bucket_name description = "Name of the retrofit heating kWh predictions bucket" } @@ -213,7 +229,7 @@ module "retrofit_hotwater_kwh_predictions" { } output "retrofit_hotwater_kwh_predictions_bucket_name" { - value = module.retrofit_hotwater_kwh_predictions.bucket_name + value = module.retrofit_hotwater_kwh_predictions.bucket_name description = "Name of the retrofit hotwater kWh predictions bucket" } @@ -232,7 +248,7 @@ module "retrofit_energy_assessments" { } output "retrofit_energy_assessments_bucket_name" { - value = module.retrofit_energy_assessments.bucket_name + value = module.retrofit_energy_assessments.bucket_name description = "Name of the retrofit energy assessments bucket" } @@ -311,16 +327,14 @@ module "sap_baseline_ecr" { source = "../modules/ecr" } -############################################## -# CDN - Cloudfront -############################################## -module "cloudfront_distribution" { - source = "../modules/cloudfront" - bucket_name = module.s3.bucket_name - bucket_id = module.s3.bucket_id - bucket_arn = module.s3.bucket_arn - bucket_domain_name = module.s3.bucket_domain_name - stage = var.stage +module "heat_baseline_ecr" { + ecr_name = "heat-baseline-prediction-${var.stage}" + source = "../modules/ecr" +} + +module "carbon_baseline_ecr" { + ecr_name = "carbon-baseline-prediction-${var.stage}" + source = "../modules/ecr" } ################################################ @@ -348,7 +362,7 @@ module "address2uprn_state_bucket" { module "address2uprn_registry" { source = "../modules/container_registry" name = "address2uprn" - stage = var.stage + stage = var.stage } @@ -379,14 +393,14 @@ module "condition_etl_state_bucket" { module "condition_etl_registry" { source = "../modules/container_registry" name = "condition-etl" - stage = var.stage + stage = var.stage } # Condition Data S3 Bucket to store initial data module "condition_data_bucket" { - source = "../modules/s3" - bucketname = "condition-data-${var.stage}" + source = "../modules/s3" + bucketname = "condition-data-${var.stage}" allowed_origins = var.allowed_origins } @@ -417,7 +431,7 @@ module "postcode_splitter_state_bucket" { module "postcode_splitter_registry" { source = "../modules/container_registry" name = "postcode_splitter" - stage = var.stage + stage = var.stage } @@ -448,7 +462,39 @@ module "categorisation_state_bucket" { module "categorisation_registry" { source = "../modules/container_registry" name = "categorisation" - stage = var.stage + stage = var.stage +} + + +################################################ +# OrdnanceSurveyAPI – Lambda +################################################ +module "ordnance_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "ordnance-terraform-state" + +} + +module "ordnance_registry" { + source = "../modules/container_registry" + name = "ordnance" + stage = var.stage + +} + +# S3 policy for postcode splitter to read from retrofit data bucket +module "ordnance_s3_read_and_write" { + source = "../modules/s3_iam_policy" + + policy_name = "OrdnanceSurveyReadandWriteS3" + policy_description = "Allow ordnance Lambda to read and write from retrofit-data bucket" + bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] + actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"] + resource_paths = ["/*"] +} + +output "ordnance_s3_read_and_write_arn" { + value = module.ordnance_s3_read_and_write.policy_arn } ################################################ @@ -463,7 +509,7 @@ module "engine_state_bucket" { module "engine_registry" { source = "../modules/container_registry" name = "engine" - stage = var.stage + stage = var.stage } # S3 policy for Engine to read and write from various S3 buckets @@ -472,7 +518,7 @@ module "engine_s3_read_and_write" { policy_name = "EngineReadandWriteS3" policy_description = "Allow Engine Lambda to read from and write to various S3 buckets" - bucket_arns = [ + bucket_arns = [ "arn:aws:s3:::${module.s3_presignable_bucket.bucket_name}", "arn:aws:s3:::${module.retrofit_sap_data.bucket_name}", "arn:aws:s3:::${module.retrofit_sap_predictions.bucket_name}", @@ -482,10 +528,70 @@ module "engine_s3_read_and_write" { "arn:aws:s3:::${module.retrofit_hotwater_kwh_predictions.bucket_name}", "arn:aws:s3:::${module.retrofit_energy_assessments.bucket_name}" ] - actions = ["s3:*"] - resource_paths = ["/*"] + actions = ["s3:*"] + resource_paths = ["/*"] } output "engine_s3_read_and_write_arn" { value = module.engine_s3_read_and_write.policy_arn } + +################################################ +# FastAPI – Lambda +################################################ +module "ara_fast_api_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "ara-fast-api-terraform-state" +} + +output "ara_fast_api_state_bucket" { + value = module.ara_fast_api_state_bucket.bucket_name +} + +# S3 policy for FastAPI app to read and write from various S3 buckets +module "fast_api_s3_read_and_write" { + source = "../modules/s3_iam_policy" + + policy_name = "FastAPIReadandWriteS3" + policy_description = "Allow FastAPI Lambda to read from and write to various S3 buckets" + bucket_arns = [ + "arn:aws:s3:::${module.s3_presignable_bucket.bucket_name}", + "arn:aws:s3:::${module.retrofit_sap_data.bucket_name}", + "arn:aws:s3:::${module.retrofit_sap_predictions.bucket_name}", + "arn:aws:s3:::${module.retrofit_carbon_predictions.bucket_name}", + "arn:aws:s3:::${module.retrofit_heat_predictions.bucket_name}", + "arn:aws:s3:::${module.retrofit_heating_kwh_predictions.bucket_name}", + "arn:aws:s3:::${module.retrofit_hotwater_kwh_predictions.bucket_name}", + "arn:aws:s3:::${module.retrofit_energy_assessments.bucket_name}" + ] + actions = ["s3:GetObject", "s3:ListBucket"] + resource_paths = ["/*"] +} + +output "fast_api_s3_read_and_write_arn" { + value = module.fast_api_s3_read_and_write.policy_arn +} + +################################################ +# CDN Certificate +################################################ +module "cdn_certificate_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "cdn-certificate-terraform-state" +} + +output "cdn_certificate_state_bucket" { + value = module.cdn_certificate_state_bucket.bucket_name +} + +################################################ +# CDN +################################################ +module "cdn_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "ara-cdn-terraform-state" +} + +output "cdn_state_bucket" { + value = module.cdn_state_bucket.bucket_name +} diff --git a/scripts/download_cotality_evidence.py b/scripts/download_cotality_evidence.py new file mode 100644 index 00000000..43f9afea --- /dev/null +++ b/scripts/download_cotality_evidence.py @@ -0,0 +1,73 @@ +import requests +import json + +TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1EUTRNRU5GUTBVNU9FUXpOelk1TVRFME0wUkdOMFpFUkRoR1JVVkJNVGMxT1RFNFJERXlPQSJ9.eyJodHRwOi8vZW1haWwiOiJzZWJhc3RpYW5Ab3Ntb3Npcy1hY2QuY29tIiwiaHR0cDovL2NsdWsudG9rZW4vbGFzdFBhc3N3b3JkQ2hhbmdlIjoiMjAyNS0wOC0yNlQwOTo1NDoyNi4zMjZaIiwiaHR0cDovL2NsdWsudG9rZW4vY29ubmVjdGlvbiI6ImVUZWNoSUQiLCJodHRwOi8vY2x1ay50b2tlbi9zdHJhdGVneSI6ImF1dGgwIiwiaHR0cDovL2NsdWsudG9rZW4vc3RyYXRlZ3lUeXBlIjoiZGF0YWJhc2UiLCJpc3MiOiJodHRwczovL2V0ZWNoaWQuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDY4YWQ4NDUyZDI2YzI1ZmMyMzkwZmYxYSIsImF1ZCI6WyJodHRwczovL3Bhc2h1Yi5hcGkuZXRlY2gubmV0IiwiaHR0cHM6Ly9ldGVjaGlkLmV1LmF1dGgwLmNvbS91c2VyaW5mbyJdLCJpYXQiOjE3NzMyMzc4MjQsImV4cCI6MTc3MzI0NTAyNCwic2NvcGUiOiJvcGVuaWQiLCJhenAiOiJEaVp6d3VVaTVkVmozOXR3NG00bWZ6emZvRm5MdmVLZyJ9.mkkxeZiD_ByHY4TJKpLQ-trmeGs15s0ekL6u1n-ek9j-EzNyf6qalEHCyHf8gzdNhU_vay96bIOMRHp4vXFaLqSANwKZayIS3EoA_b9-u2FAZpooxEvReAMNJGoZ6WLD01AQXWv-l7ww1ZqAnQzw0moL_Oma6hVmA5oa-RJKJ3MerS7e0Wei97Db48E140-EAbQf2iPcKYYtCNRA4il6n8DFiqGeoUMGo99jkR1ceZAvMpOAj8RhKX-4qSiDfX6yXUS2G96U5m7S_GWI-DEj5TazkN10Af3TyOY3EVjmZoJcRpiAR4cFmlfcTydjrShU03DWmPZm1QItf2McxfCpNA" + +base = "https://pashub.net/api" + +headers = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/json"} + +company_id = "cb5249e2-8f31-4ef4-aefd-08ddaccb1fa2" + +# 1️⃣ get jobs +params = { + "pageIndex": 0, + "pageSize": 20, + "orderBy": "createdUtc", + "orderDesc": "true", + "addressUprn": "100061885568", + "companyId": company_id, +} + +r = requests.get(f"{base}/jobs", headers=headers, params=params) + +payload = r.json() + +property_id = payload["results"][0]["id"] + +print("JOB:", property_id) + +# 2️⃣ get evidence list +r = requests.get(f"{base}/jobs/{property_id}/evidence", headers=headers) + +print(r.status_code) + +evidence = r.json() + +print(evidence) + + +# 3️⃣ get evidence metadata + +if evidence: + evidence_id = evidence["results"][0]["fileId"] + + meta_url = f"https://pashub.net/api/jobs/{property_id}/evidenceMetadata" + + meta_params = {"evidenceIds": evidence_id} + + r = requests.get(meta_url, headers=headers, params=meta_params) + r.raise_for_status() + + meta = r.json() + + container = meta["containerName"] + blob_uri = meta["blobUri"] + + file = meta["files"][0] + file_id = file["fileId"] + file_name = file["fileName"] + + base, sas = blob_uri.split("?", 1) + + download_url = f"{base}{container}/{file_id}?{sas}" + + print("Download URL:", download_url) + + pdf = requests.get(download_url) + pdf.raise_for_status() + + with open(file_name, "wb") as f: + f.write(pdf.content) + + print("Saved:", file_name) diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index b6a33ae1..519636be 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -29,9 +29,7 @@ from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] PORTFOLIO_ID = 581 -SCENARIOS = [ - 1124 -] +SCENARIOS = [1124] scenario_names = { 1124: "EPC C - Solar Focused", } @@ -234,7 +232,7 @@ for scenario_id in SCENARIOS: # Get recs for this scenario recommended_measures_df = recommendations_df[ recommendations_df["scenario_id"] == scenario_id - ][["property_id", "measure_type", "estimated_cost", "default"]] + ][["property_id", "measure_type", "estimated_cost", "default"]] recommended_measures_df = recommended_measures_df[ recommended_measures_df["default"] ] @@ -242,7 +240,7 @@ for scenario_id in SCENARIOS: post_install_sap = recommendations_df[ recommendations_df["scenario_id"] == scenario_id - ][["property_id", "default", "sap_points"]] + ][["property_id", "default", "sap_points"]] post_install_sap = post_install_sap[post_install_sap["default"]] # Sum up the sap points by property id post_install_sap = ( @@ -320,7 +318,7 @@ for scenario_id in SCENARIOS: z = df2[ (df2["predicted_post_works_epc"] != "D") & (df2["post_epc_rating"].astype(str) == "Epc.D") - ] + ] df2["predicted_post_works_epc"].value_counts() df2["post_epc_rating"].astype(str).value_counts() @@ -330,8 +328,6 @@ for scenario_id in SCENARIOS: getting_works = df[df["total_retrofit_cost"] > 0] getting_works["predicted_post_works_epc"].value_counts() - 32565 / getting_works.shape[0] - df[df["predicted_post_works_sap"] == ""] # Expected columns list diff --git a/utils/s3.py b/utils/s3.py index b3a96dba..6aa3f44e 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -6,6 +6,7 @@ from io import BytesIO, StringIO from urllib.parse import unquote from utils.logger import setup_logger from botocore.exceptions import NoCredentialsError, PartialCredentialsError +from typing import Any logger = setup_logger() @@ -316,7 +317,7 @@ def save_excel_to_s3(df, bucket_name, file_key): logger.info(f"Excel file saved to S3 bucket '{bucket_name}' with key '{file_key}'") -def read_csv_from_s3(bucket_name, filepath): +def read_csv_from_s3(bucket_name: str, filepath: str) -> list[dict[str, str]]: logger.info( f"Reading CSV file from S3 bucket '{bucket_name}' with key '{filepath}'" )