Merge branch 'main' of https://github.com/Hestia-Homes/Model into feature/ara-rebaselining

# Conflicts:
#	asset_list/app.py
This commit is contained in:
Khalim Conn-Kowlessar 2026-03-19 11:00:32 +00:00
commit 6e73d807a9
74 changed files with 2191 additions and 431 deletions

View file

@ -21,7 +21,7 @@ RUN git clone --depth 1 https://github.com/openvenues/libpostal /tmp/libpostal \
&& rm -rf /tmp/libpostal && rm -rf /tmp/libpostal
# 3) Create the user and grant sudo privileges # 3) Create the user and grant sudo privileges
RUN useradd -m -s /usr/bin/bash ${USER} \ RUN useradd -m -s /bin/bash ${USER} \
&& echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \ && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \
&& chmod 0440 /etc/sudoers.d/${USER} && chmod 0440 /etc/sudoers.d/${USER}
@ -32,6 +32,11 @@ ADD asset_list/requirements.txt requirements1.txt
RUN cat requirements1.txt requirements2.txt >> requirements.txt RUN cat requirements1.txt requirements2.txt >> requirements.txt
RUN pip install -r requirements.txt RUN pip install -r requirements.txt
# Install code server
RUN curl -fsSL https://code-server.dev/install.sh | sh
# 5) Workdir # 5) Workdir
WORKDIR /workspaces/model WORKDIR /workspaces/model

View file

@ -2,13 +2,14 @@
"name": "SAL ENV", "name": "SAL ENV",
"dockerComposeFile": "docker-compose.yml", "dockerComposeFile": "docker-compose.yml",
"service": "model-sal", "service": "model-sal",
"remoteUser": "vscode", // "remoteUser": "vscode",
"workspaceFolder": "/workspaces/model", "workspaceFolder": "/workspaces/model",
"postStartCommand": "bash .devcontainer/post-install.sh", "postStartCommand": "bash .devcontainer/asset_list/post-install.sh",
"mounts": [ "mounts": [
// Optional, just makes getting from Downloads (local env) easier // Optional, just makes getting from Downloads (local env) easier
"source=${localEnv:HOME},target=/workspaces/home,type=bind" "source=${localEnv:HOME},target=/home/vscode,type=bind"
], ],
"forwardPorts": [8081],
"customizations": { "customizations": {
"vscode": { "vscode": {
"extensions": [ "extensions": [
@ -24,7 +25,8 @@
"ms-python.vscode-python-envs", "ms-python.vscode-python-envs",
"ms-python.black-formatter", "ms-python.black-formatter",
"GrapeCity.gc-excelviewer", "GrapeCity.gc-excelviewer",
"jakobhoeg.vscode-pokemon" "jakobhoeg.vscode-pokemon",
"eamodio.gitlens"
], ],
"settings": { "settings": {
"files.defaultWorkspace": "/workspaces/model", "files.defaultWorkspace": "/workspaces/model",

View file

@ -2,15 +2,17 @@ version: '3.8'
services: services:
model-sal: model-sal:
user: "${UID}:${GID}"
build: build:
context: ../.. context: ../..
dockerfile: .devcontainer/asset_list/Dockerfile dockerfile: .devcontainer/asset_list/Dockerfile
command: sleep infinity command: code-server --bind-addr 0.0.0.0:8080
user: vscode
volumes: volumes:
- ../../:/workspaces/model - ../../:/workspaces/model
networks: networks:
- model-net - model-net
ports:
- "8081:8080"
networks: networks:
model-net: model-net:

View file

@ -27,7 +27,9 @@
"GrapeCity.gc-excelviewer", "GrapeCity.gc-excelviewer",
"jakobhoeg.vscode-pokemon", "jakobhoeg.vscode-pokemon",
"github.vscode-github-actions", "github.vscode-github-actions",
"me-dutour-mathieu.vscode-github-actions" "me-dutour-mathieu.vscode-github-actions",
"anthropic.claude-code",
"eamodio.gitlens"
], ],
"settings": { "settings": {
"files.defaultWorkspace": "/workspaces/model", "files.defaultWorkspace": "/workspaces/model",

View file

@ -1,14 +1,14 @@
mkdir -p ~/.ipython/profile_default/startup # mkdir -p ~/.ipython/profile_default/startup
cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py # cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py
from dotenv import load_dotenv # from dotenv import load_dotenv
import os # import os
# Adjust path as needed # # Adjust path as needed
env_path = "/workspaces/model/backend/.env" # env_path = "/workspaces/model/backend/.env"
if os.path.exists(env_path): # if os.path.exists(env_path):
load_dotenv(env_path) # load_dotenv(env_path)
print("✔ Loaded .env into Jupyter kernel") # print("✔ Loaded .env into Jupyter kernel")
else: # else:
print("⚠ No .env file found to load") # print("⚠ No .env file found to load")
EOF # EOF

View file

@ -16,12 +16,14 @@ on:
type: string type: string
ecr_repo: ecr_repo:
required: true required: false
type: string type: string
default: ''
image_digest: image_digest:
required: true required: false
type: string type: string
default: ''
terraform_apply: terraform_apply:
required: false required: false
@ -58,6 +60,8 @@ on:
required: false required: false
TF_VAR_google_solar_api_key: TF_VAR_google_solar_api_key:
required: false required: false
TF_VAR_ordnance_survey_api_key:
required: false
jobs: jobs:
deploy: deploy:
@ -115,12 +119,23 @@ jobs:
TF_VAR_domain_name: ${{ secrets.TF_VAR_domain_name }} TF_VAR_domain_name: ${{ secrets.TF_VAR_domain_name }}
TF_VAR_epc_auth_token: ${{ secrets.TF_VAR_epc_auth_token }} TF_VAR_epc_auth_token: ${{ secrets.TF_VAR_epc_auth_token }}
TF_VAR_google_solar_api_key: ${{ secrets.TF_VAR_google_solar_api_key }} TF_VAR_google_solar_api_key: ${{ secrets.TF_VAR_google_solar_api_key }}
TF_VAR_ordnance_survey_api_key: ${{ secrets.TF_VAR_ordnance_survey_api_key }}
run: | run: |
ECR_REPO_URL_VAR=""
if [[ -n "${{ inputs.ecr_repo }}" ]]; then
ECR_REPO_URL_VAR="-var=ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}"
fi
IMAGE_DIGEST_VAR=""
if [[ -n "${{ inputs.ecr_repo }}" ]]; then
IMAGE_DIGEST_VAR="-var=image_digest=${{ inputs.image_digest }}"
fi
terraform plan \ terraform plan \
-var="stage=${{ inputs.stage }}" \ -var="stage=${{ inputs.stage }}" \
-var="lambda_name=${{ inputs.lambda_name }}" \ -var="lambda_name=${{ inputs.lambda_name }}" \
-var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ $ECR_REPO_URL_VAR \
-var="image_digest=${{ inputs.image_digest }}" \ $IMAGE_DIGEST_VAR \
-out=lambdaplan -out=lambdaplan
- name: Terraform Apply - name: Terraform Apply
@ -140,9 +155,14 @@ jobs:
TF_VAR_domain_name: ${{ secrets.TF_VAR_domain_name }} TF_VAR_domain_name: ${{ secrets.TF_VAR_domain_name }}
TF_VAR_epc_auth_token: ${{ secrets.TF_VAR_epc_auth_token }} TF_VAR_epc_auth_token: ${{ secrets.TF_VAR_epc_auth_token }}
TF_VAR_google_solar_api_key: ${{ secrets.TF_VAR_google_solar_api_key }} TF_VAR_google_solar_api_key: ${{ secrets.TF_VAR_google_solar_api_key }}
TF_VAR_ordnance_survey_api_key: ${{ secrets.TF_VAR_ordnance_survey_api_key }}
run: | run: |
EXTRA_VARS=""
if [[ -n "${{ inputs.ecr_repo }}" ]]; then
EXTRA_VARS="-var=ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }} -var=image_digest=${{ inputs.image_digest }}"
fi
terraform destroy -auto-approve \ terraform destroy -auto-approve \
-var="stage=${{ inputs.stage }}" \ -var="stage=${{ inputs.stage }}" \
-var="lambda_name=${{ inputs.lambda_name }}" \ -var="lambda_name=${{ inputs.lambda_name }}" \
-var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ $EXTRA_VARS
-var="image_digest=${{ inputs.image_digest }}"

View file

@ -41,7 +41,7 @@ jobs:
fi fi
# ============================================================ # ============================================================
# 1 Shared Terraform (infra) # Shared Terraform (infra)
# ============================================================ # ============================================================
shared_terraform: shared_terraform:
needs: determine_stage needs: determine_stage
@ -77,9 +77,49 @@ jobs:
if: env.TERRAFORM_APPLY == 'true' if: env.TERRAFORM_APPLY == 'true'
working-directory: infrastructure/terraform/shared working-directory: infrastructure/terraform/shared
run: terraform apply -auto-approve tfplan run: terraform apply -auto-approve tfplan
# ============================================================
# Ara Engine image and Push
# ============================================================
ara_engine_image:
needs: [determine_stage, shared_terraform]
uses: ./.github/workflows/_build_image.yml
with:
ecr_repo: engine-${{ needs.determine_stage.outputs.stage }}
dockerfile_path: backend/docker/engine.Dockerfile
build_context: .
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
# ============================================================
# Deploy Ara Engine Lambda
# ============================================================
ara_engine_lambda:
needs: [ara_engine_image, determine_stage]
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: ara_engine
lambda_path: infrastructure/terraform/lambda/engine
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: engine-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.ara_engine_image.outputs.image_digest }}
terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }}
TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }}
TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }}
TF_VAR_api_key: ${{ secrets.DEV_API_KEY }}
TF_VAR_secret_key: ${{ secrets.DEV_SECRET_KEY }}
TF_VAR_epc_auth_token: ${{ secrets.DEV_EPC_AUTH_TOKEN }}
TF_VAR_google_solar_api_key: ${{ secrets.DEV_GOOGLE_SOLAR_API_KEY }}
# ============================================================ # ============================================================
# 2⃣ Build Address 2 UPRN image and Push # Build Address 2 UPRN image and Push
# ============================================================ # ============================================================
address2uprn_image: address2uprn_image:
needs: [determine_stage, shared_terraform] needs: [determine_stage, shared_terraform]
@ -103,7 +143,7 @@ jobs:
EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }}
# ============================================================ # ============================================================
# 3 Deploy Address 2 UPRN Lambda # Deploy Address 2 UPRN Lambda
# ============================================================ # ============================================================
address2uprn_lambda: address2uprn_lambda:
needs: [address2uprn_image, determine_stage] needs: [address2uprn_image, determine_stage]
@ -122,7 +162,7 @@ jobs:
# ============================================================ # ============================================================
# 2 Build Postcode Splitter image and Push # Build Postcode Splitter image and Push
# ============================================================ # ============================================================
postcodeSplitter_image: postcodeSplitter_image:
needs: [determine_stage, shared_terraform] needs: [determine_stage, shared_terraform]
@ -144,7 +184,7 @@ jobs:
DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
# ============================================================ # ============================================================
# 3 Deploy Postcode Splitter Lambda # Deploy Postcode Splitter Lambda
# ============================================================ # ============================================================
postcodeSplitter_lambda: postcodeSplitter_lambda:
needs: [postcodeSplitter_image, determine_stage, address2uprn_lambda] needs: [postcodeSplitter_image, determine_stage, address2uprn_lambda]
@ -242,32 +282,56 @@ jobs:
AWS_REGION: ${{ secrets.DEV_AWS_REGION }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
# ============================================================ # ============================================================
# Ara Engine image and Push # Build OrdanceSurvey image and Push
# ============================================================ # ============================================================
ara_engine_image: ordnanceSurvey_image:
needs: [determine_stage, shared_terraform] needs: [determine_stage, shared_terraform]
uses: ./.github/workflows/_build_image.yml uses: ./.github/workflows/_build_image.yml
with: with:
ecr_repo: engine-${{ needs.determine_stage.outputs.stage }} ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }}
dockerfile_path: backend/docker/engine.Dockerfile dockerfile_path: backend/ordnanceSurvey/handler/Dockerfile
build_context: . build_context: .
build_args: |
DEV_DB_HOST=$DEV_DB_HOST
DEV_DB_PORT=$DEV_DB_PORT
DEV_DB_NAME=$DEV_DB_NAME
secrets: secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }}
DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
# ============================================================ # ============================================================
# Deploy Categorisation Lambda # Deploy OrdanceSurvey Lambda
# ============================================================ # ============================================================
ara_engine_lambda: ordnanceSurvey_lambda:
needs: [ara_engine_image, determine_stage] needs: [ordnanceSurvey_image, determine_stage]
uses: ./.github/workflows/_deploy_lambda.yml uses: ./.github/workflows/_deploy_lambda.yml
with: with:
lambda_name: ara_engine lambda_name: ordnanceSurvey
lambda_path: infrastructure/terraform/lambda/engine lambda_path: infrastructure/terraform/lambda/ordnanceSurvey
stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.ordnanceSurvey_image.outputs.image_digest }}
terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
TF_VAR_ORDNANCE_SURVEY_API_KEY: ${{ secrets.ORDNANCE_SURVEY_API_KEY }}
# ============================================================
# Deploy FastAPI Lambda
# ============================================================
fast_api_lambda:
needs: [determine_stage, ara_engine_lambda, categorisation_lambda]
uses: ./.github/workflows/_deploy_lambda.yml
with:
lambda_name: ara_fast_api
lambda_path: infrastructure/terraform/lambda/fast-api
stage: ${{ needs.determine_stage.outputs.stage }} stage: ${{ needs.determine_stage.outputs.stage }}
ecr_repo: engine-${{ needs.determine_stage.outputs.stage }}
image_digest: ${{ needs.ara_engine_image.outputs.image_digest }}
terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
secrets: secrets:
AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
@ -276,8 +340,97 @@ jobs:
TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }} TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }}
TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }} TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }}
TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }} TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }}
TF_VAR_api_key: ${{ secrets.DEV_API_KEY }} TF_VAR_api_key: ${{ secrets.FASTAPI_API_KEY }}
TF_VAR_secret_key: ${{ secrets.DEV_SECRET_KEY }} TF_VAR_secret_key: ${{ secrets.NEXTAUTH_SECRET }}
TF_VAR_domain_name: ${{ secrets.DEV_DOMAIN_NAME }} TF_VAR_domain_name: ${{ secrets.ARA_DEV_DOMAIN_NAME }}
TF_VAR_epc_auth_token: ${{ secrets.DEV_EPC_AUTH_TOKEN }} TF_VAR_epc_auth_token: ${{ secrets.DEV_EPC_AUTH_TOKEN }}
TF_VAR_google_solar_api_key: ${{ secrets.DEV_GOOGLE_SOLAR_API_KEY }} TF_VAR_google_solar_api_key: ${{ secrets.DEV_GOOGLE_SOLAR_API_KEY }}
# ============================================================
# Deploy ACM Certificate for Cloudfront
# ============================================================
cloudfront_acm:
needs: [determine_stage, shared_terraform, fast_api_lambda]
runs-on: ubuntu-latest
env:
STAGE: ${{ needs.determine_stage.outputs.stage }}
TERRAFORM_APPLY: ${{ needs.determine_stage.outputs.terraform_apply }}
steps:
- uses: actions/checkout@v4
- uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.DEV_AWS_REGION }}
- uses: hashicorp/setup-terraform@v3
- name: Terraform Init
working-directory: infrastructure/terraform/cdn_certificate
run: terraform init -reconfigure
- name: Terraform Workspace
working-directory: infrastructure/terraform/cdn_certificate
run: |
terraform workspace select $STAGE \
|| terraform workspace new $STAGE
- name: Terraform Plan
working-directory: infrastructure/terraform/cdn_certificate
run: |
terraform plan \
-var="stage=${STAGE}" \
-out=tfplan
- name: Terraform Apply
if: env.TERRAFORM_APPLY == 'true'
working-directory: infrastructure/terraform/cdn_certificate
run: terraform apply -auto-approve tfplan
# ============================================================
# Deploy Cloudfront CDN
# ============================================================
cloudfront_cdn:
needs: [determine_stage, cloudfront_acm]
runs-on: ubuntu-latest
env:
STAGE: ${{ needs.determine_stage.outputs.stage }}
TERRAFORM_APPLY: ${{ needs.determine_stage.outputs.terraform_apply }}
steps:
- uses: actions/checkout@v4
- uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.DEV_AWS_REGION }}
- uses: hashicorp/setup-terraform@v3
- name: Terraform Init
working-directory: infrastructure/terraform/cdn
run: terraform init -reconfigure
- name: Terraform Workspace
working-directory: infrastructure/terraform/cdn
run: |
terraform workspace select $STAGE \
|| terraform workspace new $STAGE
- name: Terraform Plan
working-directory: infrastructure/terraform/cdn
run: |
terraform plan \
-var="stage=${STAGE}" \
-out=tfplan
- name: Terraform Apply
if: env.TERRAFORM_APPLY == 'true'
working-directory: infrastructure/terraform/cdn
run: terraform apply -auto-approve tfplan

2
.gitignore vendored
View file

@ -246,7 +246,7 @@ etl/epc/local_data/*
/backend/condition/sample_data/peabody/* /backend/condition/sample_data/peabody/*
*.DS_Store *.DS_Store
infrastructure/terraform/.terraform* **/.terraform*
# Don't commit packages up serverless packages # Don't commit packages up serverless packages
.serverless .serverless

View file

@ -16,7 +16,13 @@
"python.languageServer": "Pylance", "python.languageServer": "Pylance",
"python.analysis.typeCheckingMode": "strict", "python.analysis.typeCheckingMode": "strict",
"python.analysis.autoSearchPaths": true, "python.analysis.autoSearchPaths": true,
"python.analysis.extraPaths": ["./src"] "python.analysis.extraPaths": ["./src"],
"vim.useCtrlKeys": true,
"vim.handleKeys": {
"<C-c>": false,
"<C-v>": false
}
// Hot reload setting that needs to be in user settings // Hot reload setting that needs to be in user settings
// "jupyter.runStartupCommands": [ // "jupyter.runStartupCommands": [

View file

@ -73,24 +73,25 @@ def app():
Property UPRN Property UPRN
""" """
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/March 2026 SAL" data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lifespace Rentals/Missed"
data_filename = "Domna System Review - Livewest.xlsx" # data_filename = "For Modelling - Final - reviewed.xlsx"
data_filename = "Missed Properties - with address.xlsx"
sheet_name = "Sheet1" sheet_name = "Sheet1"
postcode_column = "Postcode" postcode_column = "Postcode"
address1_column = None address1_column = "address1"
address1_method = "house_number_extraction" address1_method = None
fulladdress_column = "Address" fulladdress_column = "address1"
address_cols_to_concat = [] address_cols_to_concat = []
missing_postcodes_method = None missing_postcodes_method = None
landlord_year_built = None landlord_year_built = None
landlord_os_uprn = "gov UPRN" landlord_os_uprn = "UPRN"
landlord_property_type = "AssetType" landlord_property_type = "Type"
landlord_built_form = "AssetType" landlord_built_form = None
landlord_wall_construction = None landlord_wall_construction = None
landlord_roof_construction = None landlord_roof_construction = None
landlord_heating_system = None landlord_heating_system = None
landlord_existing_pv = None landlord_existing_pv = None
landlord_property_id = "landlord_uprn" landlord_property_id = "Reference"
landlord_sap = None landlord_sap = None
outcomes_filename = None outcomes_filename = None
outcomes_sheetname = None outcomes_sheetname = None

View file

@ -364,5 +364,5 @@ Here's what you should do:
function. function.
By following these steps, you should have your custom domain properly configured and pointing to your AWS Lambda By following these steps, you should have your custom domain properly configured and pointing to your AWS Lambda
function via the CloudFront distribution. function via the CloudFront distribution

View file

@ -1,20 +1,58 @@
We have list of address as input. So you want to fetch UPRN for an address list?
It'll come in batches of the same post code and from then we want to somehow convert that into UPRN
if this lambda/function can do that we'll be speeding ahead
Energy Performance Information: https://epc.opendatacommunities.org/ Before you run:
guidance page: https://epc.opendatacommunities.org/docs/guidance#field_domestic_LMK_KEY Step 1) Get the list and ensure the following columns exists
Example of past khalims code that he wrote some tests for: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/tests/test_search_epc.py#L11 * Address 1
* Address 2
* Address 3
* postcode
And save it as a .csv file
Example of EPC search: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/SearchEpc.py#L118 Step 2)
Before we run this, we need to upload it into S3 as well as put initiate a subtask + task
* S3 upload I'll recommend somewhere in retrofit-data-dev and get the s3_uri
For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key
task_id = a7b70a02-4df4-45b5-a50b-196e095910bb
sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e
Step 3) Alright, now lets make the input for postcode-splitter sqs to get the ball rolling
postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev
{
"task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb",
"sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e",
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
}
Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
outputs of address2uprn ( which is automatically triggered on postcodesplitter) will be saved on retrofit-data-dev/ara_raw_outputs/<task-id>/<subtask-id>/<timestamp:uuid4>.csv
Run the script in backend/scripts/combine_address2uprn_outputs.py with <task-id>.
This will combine all the outputs of the files for each address2uprn into one big file
Khalim has made a python package to help scrape data: https://github.com/KhalimCK/epc-api-python Find out which ones have missing uprn and save that as a seperate sheet and save it somewhere in s3://retrofit-data-dev
I uploaded the missing uprn here: s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv
ordnance_survey sqs is => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2FordnanceSurvey-queue-dev
{
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv",
"task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb",
"sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e"
}
outputs are at s3://retrofit-data-dev/ara_ordnance_survey_outputs/<task-id>/<sub-task-id>

View file

@ -1,13 +1,11 @@
from typing import Optional
from epc_api.client import EpcClient from epc_api.client import EpcClient
import os import os
from urllib.parse import urlencode from urllib.parse import urlencode
import pandas as pd import pandas as pd
from difflib import SequenceMatcher
from utils.logger import setup_logger from utils.logger import setup_logger
import re
from typing import Set
import json import json
import requests
from uuid import UUID from uuid import UUID
import uuid import uuid
from backend.app.db.functions.tasks.Tasks import SubTaskInterface from backend.app.db.functions.tasks.Tasks import SubTaskInterface
@ -18,6 +16,8 @@ from utils.s3 import (
) )
from datetime import datetime from datetime import datetime
from backend.utils.addressMatch import AddressMatch
logger = setup_logger() logger = setup_logger()
@ -29,191 +29,6 @@ if EPC_AUTH_TOKEN is None:
raise RuntimeError("EPC_AUTH_TOKEN not defined in env") raise RuntimeError("EPC_AUTH_TOKEN not defined in env")
def is_valid_postcode(postcode_clean: str) -> bool:
"""
Validate postcode using postcodes.io.
Expects a sanitised postcode (e.g. E84SQ).
Returns True if valid, False otherwise.
"""
POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"
if not postcode_clean:
return False
try:
resp = requests.get(
POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean),
timeout=5,
)
resp.raise_for_status()
return resp.json().get("result", False)
except requests.RequestException:
# Network issues, rate limits, etc.
return False
def levenshtein(a: str, b: str) -> float:
"""
Address similarity score in [0, 1].
Strategy:
- Normalise
- Strongly penalise mismatched house/flat numbers
- Combine token overlap + character similarity
"""
def extract_number_sequence(s: str) -> list[str]:
return re.findall(r"\d+[a-z]?", s)
def extract_numbers(s: str) -> Set[str]:
return set(extract_number_sequence(s))
def tokenise(s: str) -> Set[str]:
return set(s.split())
def extract_building_number(s: str) -> str | None:
"""
Extract the main building number (NOT flat/unit).
Assumes formats like:
- '42 moreton road'
- 'flat 3 42 moreton road'
"""
tokens = s.split()
# remove flat/unit context
cleaned = []
skip_next = False
for t in tokens:
if t in ("flat", "apt", "apartment", "unit"):
skip_next = True
continue
if skip_next:
skip_next = False
continue
cleaned.append(t)
# first remaining number is building number
for t in cleaned:
if re.fullmatch(r"\d+[a-z]?", t):
return t
return None
a_norm = normalise_address(a)
b_norm = normalise_address(b)
# --- hard signal: numbers ---
nums_a = extract_numbers(a_norm)
nums_b = extract_numbers(b_norm)
if nums_a and not nums_b:
return 0.0
# No shared numbers at all → impossible match
if nums_a and nums_b and nums_a.isdisjoint(nums_b):
return 0.0
# 🔒 HARD GUARD: building number must match
bld_a = extract_building_number(a_norm)
bld_b = extract_building_number(b_norm)
if bld_a and bld_b and bld_a != bld_b:
return 0.0
# --- order-sensitive flat/building guard ---
seq_a = extract_number_sequence(a_norm)
seq_b = extract_number_sequence(b_norm)
has_flat_token_user = any(
tok in a_norm for tok in ("flat", "apt", "apartment", "unit")
)
has_flat_token_epc = "flat" in b_norm
if (
len(seq_a) == 2
and len(seq_b) >= 2
and has_flat_token_epc
and not has_flat_token_user
and seq_a != seq_b[:2]
):
return 0.0
# --- token similarity (order-independent) ---
toks_a = tokenise(a_norm)
toks_b = tokenise(b_norm)
if not toks_a or not toks_b:
token_score = 0.0
else:
token_score = len(toks_a & toks_b) / len(toks_a | toks_b)
# --- character similarity (soft signal) ---
char_score = SequenceMatcher(None, a_norm, b_norm).ratio()
# --- weighted blend ---
return round(
0.65 * token_score + 0.35 * char_score,
4,
)
def normalise_address(s: str) -> str:
"""
Canonical UK-focused address normalisation.
- Lowercases
- Removes punctuation (keeps / for flats)
- Normalises whitespace
- Applies synonym compression at token level
"""
if not s:
return ""
ADDRESS_SYNONYMS = {
# street types
"rd": "road",
"rd.": "road",
"st": "street",
"st.": "street",
"ave": "avenue",
"ave.": "avenue",
"ln": "lane",
"ln.": "lane",
"cres": "crescent",
"ct": "court",
"dr": "drive",
# flats / units
"apt": "flat",
"apartment": "flat",
"unit": "flat",
"ste": "suite",
# numbering noise
"no": "",
"no.": "",
}
# 1. lowercase
s = s.lower()
# 1.5 split digit-letter suffixes
s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s)
# 2. remove punctuation except /
s = re.sub(r"[^\w\s/]", " ", s)
# 3. normalise whitespace
s = re.sub(r"\s+", " ", s).strip()
# 4. tokenise + synonym normalisation
tokens = []
for tok in s.split():
replacement = ADDRESS_SYNONYMS.get(tok, tok)
if replacement:
tokens.append(replacement)
return " ".join(tokens)
def score_addresses( def score_addresses(
df: pd.DataFrame, df: pd.DataFrame,
user_address: str, user_address: str,
@ -222,7 +37,7 @@ def score_addresses(
if column not in df.columns: if column not in df.columns:
raise ValueError(f"Missing column: {column}") raise ValueError(f"Missing column: {column}")
return df[column].apply(lambda x: levenshtein(user_address, x)) return df[column].apply(lambda x: AddressMatch.score(user_address, x))
def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
@ -314,9 +129,11 @@ def get_uprn_candidates(
out = df.copy() out = df.copy()
user_norm = normalise_address(user_address) user_norm = AddressMatch.normalise_address(user_address)
out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x)) out["lexiscore"] = out[address_column].apply(
lambda x: AddressMatch.levenshtein(user_norm, x)
)
# Normalise UPRN to string # Normalise UPRN to string
out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True)
@ -480,7 +297,10 @@ def resolve_uprns_for_postcode_group(
def save_results_to_s3( def save_results_to_s3(
results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None results_df: pd.DataFrame,
task_id: str,
sub_task_id: str,
bucket_name: Optional[str] = None,
) -> bool: ) -> bool:
""" """
Save results DataFrame to S3 as CSV. Save results DataFrame to S3 as CSV.
@ -533,7 +353,7 @@ def handler(event, context, local=False):
{ {
"task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
"sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
"s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv", "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
} }
) )
} }
@ -621,19 +441,6 @@ def handler(event, context, local=False):
# Process the rows # Process the rows
logger.info(f"Processing {len(df)} rows for task {task_id}") logger.info(f"Processing {len(df)} rows for task {task_id}")
# Create user_input column by concatenating Address columns if not already present
if "user_input" not in df.columns:
df["user_input"] = (
df["Address 1"].fillna("")
+ " "
+ df["Address 2"].fillna("")
+ " "
+ df["Address 3"].fillna("")
).str.strip()
logger.info(f"Created user_input column from Address 1 and Address 2")
else:
logger.info(f"user_input column already present in data")
clean_df = df.dropna(subset=["postcode_clean"]) clean_df = df.dropna(subset=["postcode_clean"])
postcode_to_addresses = { postcode_to_addresses = {
@ -653,7 +460,7 @@ def handler(event, context, local=False):
) )
# Validate postcode before processing # Validate postcode before processing
if not is_valid_postcode(postcode): if not AddressMatch.is_valid_postcode(postcode):
logger.warning(f"Postcode {postcode} is invalid, skipping") logger.warning(f"Postcode {postcode} is invalid, skipping")
continue continue
@ -672,57 +479,67 @@ def handler(event, context, local=False):
# Process each address in this postcode with the same EPC data # Process each address in this postcode with the same EPC data
for row in postcode_rows: for row in postcode_rows:
try: try:
user_input = row.get("user_input", "") # Concatenate Address columns directly
if not user_input: address2uprn_user_input = (
str(row.get("Address 1", "")).strip()
+ " "
+ str(row.get("Address 2", "")).strip()
+ " "
+ str(row.get("Address 3", "")).strip()
).strip()
if not address2uprn_user_input:
logger.warning( logger.warning(
f"Skipping row with missing user_input for postcode {postcode}" f"Skipping row with missing address components for postcode {postcode}"
) )
continue continue
# Get UPRN using the pre-fetched EPC data with all return options # Get UPRN using the pre-fetched EPC data with all return options
result = get_uprn_with_epc_df( result = get_uprn_with_epc_df(
user_inputed_address=user_input, epc_df=epc_df, verbose=True user_inputed_address=address2uprn_user_input,
epc_df=epc_df,
verbose=True,
) )
# Parse result tuple if successful # Parse result tuple if successful
if result: if result:
uprn, found_address, score = result uprn, found_address, score = result
logger.info( logger.info(
f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})" f"Found UPRN for {address2uprn_user_input} in {postcode}: {uprn} (score: {score})"
) )
results_data.append( results_data.append(
{ {
**row, # Include all original data **row, # Include all original data
"uprn": uprn, "address2uprn_uprn": uprn,
"domna_found_address": found_address, "address2uprn_address": found_address,
"domna_lexiscore": score, "address2uprn_lexiscore": score,
} }
) )
else: else:
logger.warning( logger.warning(
f"No UPRN found for {user_input} in {postcode}" f"No UPRN found for {address2uprn_user_input} in {postcode}"
) )
results_data.append( results_data.append(
{ {
**row, # Include all original data **row, # Include all original data
"uprn": None, "address2uprn_uprn": None,
"domna_found_address": None, "address2uprn_address": None,
"domna_lexiscore": None, "address2uprn_lexiscore": None,
} }
) )
except Exception as e: except Exception as e:
logger.error( logger.error(
f"Error processing address {row.get('user_input', 'unknown')}: {e}" f"Error processing address {row.get('address2uprn_user_input', 'unknown')}: {e}"
) )
# Still add the row with error markers # Still add the row with error markers
results_data.append( results_data.append(
{ {
**row, **row,
"uprn": None, "address2uprn_uprn": None,
"domna_found_address": None, "address2uprn_address": None,
"domna_lexiscore": None, "address2uprn_lexiscore": None,
"error": str(e), "error": str(e),
} }
) )

View file

@ -35,7 +35,7 @@ class Settings(BaseSettings):
SECRET_KEY: str = "changeme" SECRET_KEY: str = "changeme"
ENVIRONMENT: str = "changeme" ENVIRONMENT: str = "changeme"
DATA_BUCKET: str = "changeme" DATA_BUCKET: str = "changeme"
PLAN_TRIGGER_BUCKET: str PLAN_TRIGGER_BUCKET: str = "changeme"
ENGINE_SQS_URL: str = "changeme" ENGINE_SQS_URL: str = "changeme"
CATEGORISATION_SQS_URL: str = "changeme" CATEGORISATION_SQS_URL: str = "changeme"
@ -63,6 +63,8 @@ class Settings(BaseSettings):
# Other S3 buckts # Other S3 buckts
ENERGY_ASSESSMENTS_BUCKET: str = "changeme" ENERGY_ASSESSMENTS_BUCKET: str = "changeme"
ORDNANCE_SURVEY_API_KEY: str = "changeme"
# Optional AWS creds (only required in local) # Optional AWS creds (only required in local)
AWS_ACCESS_KEY_ID: Optional[str] = None AWS_ACCESS_KEY_ID: Optional[str] = None
AWS_SECRET_KEY_ID: Optional[str] = None AWS_SECRET_KEY_ID: Optional[str] = None

View file

@ -0,0 +1,24 @@
import pytz
import datetime
from sqlalchemy import (
Column,
BigInteger,
Text,
DateTime,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class PostcodeSearchModel(Base):
__tablename__ = "postcode_search"
id = Column(BigInteger, primary_key=True, autoincrement=True)
postcode = Column(Text, nullable=False)
result_data = Column(JSONB, nullable=True)
created_at = Column(
DateTime(timezone=True), nullable=False, default=datetime.datetime.now(pytz.utc)
)

View file

@ -43,7 +43,7 @@ def generate_api_key():
# Define the characters that will be used to generate the api key # Define the characters that will be used to generate the api key
characters = string.ascii_letters + string.digits characters = string.ascii_letters + string.digits
# Generate a 40 character long api key # Generate a 40 character long api key
api_key = ''.join(secrets.choice(characters) for _ in range(40)) api_key = "".join(secrets.choice(characters) for _ in range(40))
return api_key return api_key
@ -113,7 +113,7 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
df.to_parquet(parquet_buffer) df.to_parquet(parquet_buffer)
# Create the boto3 client # Create the boto3 client
s3 = boto3.resource('s3') s3 = boto3.resource("s3")
# Upload the Parquet file to S3 # Upload the Parquet file to S3
s3.Object(bucket_name, file_key).put(Body=parquet_buffer.getvalue()) s3.Object(bucket_name, file_key).put(Body=parquet_buffer.getvalue())

View file

View file

@ -0,0 +1,25 @@
FROM public.ecr.aws/lambda/python:3.11
ARG DEV_DB_HOST
ARG DEV_DB_PORT
ARG DEV_DB_NAME
ENV DB_HOST=${DEV_DB_HOST}
ENV DB_PORT=${DEV_DB_PORT}
ENV DB_NAME=${DEV_DB_NAME}
# Set working directory (Lambda task root)
WORKDIR /var/task
COPY backend/ordnanceSurvey/handler/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy necessary files for database and utility imports
COPY utils/ utils/
COPY backend/ backend/
COPY datatypes/ datatypes/
# Lambda handler
CMD ["backend/ordnanceSurvey/main.handler"]

View file

@ -0,0 +1,11 @@
pandas==2.2.2
numpy<2.0
requests
tqdm
openpyxl
epc-api-python==1.0.2
boto3==1.35.44
sqlmodel
sqlalchemy==2.0.36
psycopg2-binary==2.9.10
pydantic-settings==2.6.0

View file

@ -0,0 +1,47 @@
import urllib.parse
from pydantic import ValidationError
import requests
import pandas as pd
from utils.logger import setup_logger
logger = setup_logger()
def os_places_results_to_dataframe(data: dict) -> pd.DataFrame:
"""
Flatten the OS Places API response results into a DataFrame.
Each result contains either a DPA or LPI record.
"""
results = data.get("results", [])
rows = []
for r in results:
if "DPA" in r:
rows.append(r["DPA"])
elif "LPI" in r:
rows.append(r["LPI"])
return pd.DataFrame(rows)
def lookup_os_places(postcode: str, api_key: str) -> dict:
"""
Lookup a postcode using the OS Places API.
Returns the full API response data or an error dict.
"""
if not api_key:
return {"error": "Ordnance Survey API key not specified", "status": 400}
encoded_postcode = urllib.parse.quote(postcode)
url = (
f"https://api.os.uk/search/places/v1/postcode?postcode={encoded_postcode}"
f"&dataset=DPA,LPI&key={api_key}"
)
response = requests.get(url)
if response.status_code != 200:
logger.error(
f"OS Places API error for postcode {postcode}: {response.status_code}"
)
return {"error": "Failed to fetch address data", "status": response.status_code}
data = response.json()
return {"data": data, "status": 200}

View file

@ -0,0 +1,11 @@
version: "3.9"
services:
ordnance-survey-lambda:
build:
context: ../../../
dockerfile: backend/ordnanceSurvey/handler/Dockerfile
ports:
- "9000:8080"
env_file:
- ../../../.env

View file

@ -0,0 +1,28 @@
#!/usr/bin/env python3
import json
import requests
HOST = "localhost"
PORT = "9000"
LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations"
payload = {
"Records": [
{
"body": json.dumps(
{
"task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
"sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0",
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv",
}
)
}
]
}
response = requests.post(LAMBDA_URL, json=payload)
print("Status code:", response.status_code)
print("Response:")
print(response.text)

View file

@ -0,0 +1,223 @@
from typing import Any, Optional
import json
from utils.logger import setup_logger
import logging
from backend.utils.subtasks import subtask_handler
from utils.s3 import (
save_csv_to_s3,
read_csv_from_s3 as read_csv_from_s3_dict,
parse_s3_uri,
)
from backend.utils.addressMatch import AddressMatch
from backend.app.db.connection import get_db_session
from backend.app.db.models.postcode_search import PostcodeSearchModel
from backend.ordnanceSurvey.helpers import (
lookup_os_places,
os_places_results_to_dataframe,
)
from backend.app.config import get_settings
from sqlalchemy import select
from datetime import datetime
import uuid
import os
import pandas as pd
logger: logging.Logger = setup_logger()
def check_if_post_code_exists_in_db_cache(postcode):
with get_db_session() as session:
result = (
session.execute(
select(PostcodeSearchModel).where(
PostcodeSearchModel.postcode == postcode
)
)
.scalars()
.first()
)
if result:
return os_places_results_to_dataframe(result.result_data)
# Cache miss — fetch from OS Places API
api_key = get_settings().ORDNANCE_SURVEY_API_KEY
response = lookup_os_places(postcode, api_key)
if response.get("status") != 200 or "data" not in response:
logger.error(f"OS Places API failed for {postcode}: {response}")
return pd.DataFrame()
# Save to cache
new_record = PostcodeSearchModel(
postcode=postcode,
result_data=response["data"],
)
session.add(new_record)
session.commit()
return os_places_results_to_dataframe(response["data"])
def save_results_to_s3(
results_df: pd.DataFrame,
task_id: str,
sub_task_id: str,
bucket_name: Optional[str] = None,
) -> bool:
"""
Save results DataFrame to S3 as CSV in a parent folder structure.
:param results_df: The DataFrame containing results
:param task_id: The task ID (used for file naming)
:param sub_task_id: The subtask ID (used for file naming)
:param bucket_name: The S3 bucket name (defaults to env variable)
:return: True if successful, False otherwise
"""
if bucket_name is None:
bucket_name = os.getenv("S3_BUCKET_NAME")
if not bucket_name:
logger.error(
"S3 bucket name not provided and S3_BUCKET_NAME environment variable not set"
)
return False
try:
# Create a filename with timestamp and UUID
file_name = f"{datetime.now().isoformat()}_{str(uuid.uuid4())[:8]}"
file_key = f"ara_ordnance_survey_outputs/{task_id}/{sub_task_id}/ordnanceSurvey/{file_name}.csv"
# Save to S3
success = save_csv_to_s3(results_df, bucket_name, file_key)
if success:
logger.info(f"Successfully saved results to s3://{bucket_name}/{file_key}")
return True
else:
logger.error(f"Failed to save results to S3")
return False
except Exception as e:
logger.error(f"Error saving results to S3: {str(e)}")
return False
@subtask_handler() # This assumes task_id and subtask_id is defined in event.Records.body
def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
# delete this line after test
# local = True
# Example SQS message for testing (copy and paste into SQS):
if local is True:
body = {
"task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
"sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0",
"s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv",
"lexiscore_column": "address2uprn_lexiscore",
}
s3_uri: str = body.get("s3_uri", "")
lexiscore_threshold: float = body.get("lexiscore_threshold", 0.5)
lexiscore_column: Optional[str] = body.get("lexiscore_column", None)
task_id: str = body.get("task_id", "")
sub_task_id: str = body.get("sub_task_id", "")
if s3_uri == "":
raise RuntimeError("Missing s3_uri in message body")
bucket, key = parse_s3_uri(s3_uri)
# Assumption designing with address2uprn was ran first
csv_data = read_csv_from_s3_dict(bucket, key)
df = pd.DataFrame(csv_data)
# df = df.head(5)
# If lexiscore_column is specified, use it; otherwise process all rows
if lexiscore_column and lexiscore_column in df.columns:
df[lexiscore_column] = pd.to_numeric(df[lexiscore_column], errors="coerce")
needs_processing = df[
df[lexiscore_column].isna() | (df[lexiscore_column] < lexiscore_threshold)
]
else:
# Default: process all rows
needs_processing = df
grouped = needs_processing.groupby("postcode_clean")
# Initialise new columns
df["ordnance_survey_address"] = None
df["ordnance_survey_uprn"] = None
df["ordnance_survey_lexiscore"] = None
# Process each postcode group at a time
for postcode, group in grouped:
print(f"Processing postcode: {postcode} ({len(group)} rows)")
valid_group = AddressMatch.is_valid_postcode(postcode)
if not valid_group:
logger.warning(f"Postcode {postcode} is invalid, skipping")
for idx in group.index:
df.at[idx, "ordnance_survey_address"] = (
"postcode not found in ordnance survey"
)
df.at[idx, "ordnance_survey_uprn"] = (
"postcode not found in ordnance survey"
)
df.at[idx, "ordnance_survey_lexiscore"] = (
"postcode not found in ordnance survey"
)
continue
postcode_cache = check_if_post_code_exists_in_db_cache(postcode)
if postcode_cache.empty:
logger.warning(f"No OS Places data for {postcode}")
for idx in group.index:
df.at[idx, "ordnance_survey_address"] = (
"postcode not found in ordnance survey"
)
df.at[idx, "ordnance_survey_uprn"] = (
"postcode not found in ordnance survey"
)
df.at[idx, "ordnance_survey_lexiscore"] = (
"postcode not found in ordnance survey"
)
continue
for idx, row in group.iterrows():
# Concatenate Address columns directly
ordnancy_survey_user_input = (
str(row.get("Address 1", "")).strip()
+ " "
+ str(row.get("Address 2", "")).strip()
+ " "
+ str(row.get("Address 3", "")).strip()
).strip()
if not ordnancy_survey_user_input:
continue
# Score against OS Places addresses
scores = postcode_cache["ADDRESS"].apply(
lambda addr: AddressMatch.score(ordnancy_survey_user_input, addr)
)
best_idx = scores.idxmax()
best_score = scores[best_idx]
if best_score <= 0:
continue
df.at[idx, "ordnance_survey_address"] = postcode_cache.at[
best_idx, "ADDRESS"
]
df.at[idx, "ordnance_survey_uprn"] = postcode_cache.at[best_idx, "UPRN"]
df.at[idx, "ordnance_survey_lexiscore"] = best_score
# Save results locally
if local:
df.to_csv("ordnance_survey_results.csv", index=False)
print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)")
# Save results to S3
if task_id and sub_task_id:
save_results_to_s3(df, task_id, sub_task_id)

View file

@ -0,0 +1,65 @@
#!/usr/bin/env python3
import argparse
import boto3
import pandas as pd
from io import BytesIO
BUCKET = "retrofit-data-dev"
def list_csv_files(task_id):
s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")
prefix = f"ara_raw_outputs/{task_id}"
csv_files = []
for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix):
for obj in page.get("Contents", []):
key = obj["Key"]
if key.endswith(".csv"):
csv_files.append(key)
return csv_files
def download_csv(key):
s3 = boto3.client("s3")
obj = s3.get_object(Bucket=BUCKET, Key=key)
return pd.read_csv(BytesIO(obj["Body"].read()))
def main(task_id, output):
print(f"Scanning task: {task_id}")
csv_files = list_csv_files(task_id)
if not csv_files:
print("No CSV files found")
return
print(f"Found {len(csv_files)} CSV files")
dfs = []
for key in csv_files:
print(f"Downloading {key}")
df = download_csv(key)
dfs.append(df)
combined = pd.concat(dfs, ignore_index=True)
combined.to_csv(output, index=False)
print(f"Combined CSV saved to {output}")
print(f"Total rows: {len(combined)}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("task_id", help="Task ID folder in S3")
parser.add_argument("--output", default="combined.csv")
args = parser.parse_args()
main(args.task_id, args.output)

View file

View file

@ -0,0 +1,201 @@
import re
from typing import Any, Optional
from difflib import SequenceMatcher
import requests
class AddressMatch:
def __init__(self):
return None
@staticmethod
def score(a: str, b: str) -> float:
score: float = AddressMatch.levenshtein(a, b)
return score
@staticmethod
def is_valid_postcode(postcode_clean: str) -> bool:
"""
Validate postcode using postcodes.io.
Expects a sanitised postcode (e.g. E84SQ).
Returns True if valid, False otherwise.
"""
POSTCODES_IO_VALIDATE_URL = (
"https://api.postcodes.io/postcodes/{postcode}/validate"
)
if not postcode_clean:
return False
try:
resp = requests.get(
POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean),
timeout=5,
)
resp.raise_for_status()
return resp.json().get("result", False)
except requests.RequestException:
# Network issues, rate limits, etc.
return False
@staticmethod
def normalise_address(s: str) -> str:
"""
Canonical UK-focused address normalisation.
- Lowercases
- Removes punctuation (keeps / for flats)
- Normalises whitespace
- Applies synonym compression at token level
"""
if not s:
return ""
ADDRESS_SYNONYMS = {
# street types
"rd": "road",
"rd.": "road",
"st": "street",
"st.": "street",
"ave": "avenue",
"ave.": "avenue",
"ln": "lane",
"ln.": "lane",
"cres": "crescent",
"ct": "court",
"dr": "drive",
# flats / units
"apt": "flat",
"apartment": "flat",
"unit": "flat",
"ste": "suite",
# numbering noise
"no": "",
"no.": "",
}
# 1. lowercase
s = s.lower()
# 1.5 split digit-letter suffixes
s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s)
# 2. remove punctuation except /
s = re.sub(r"[^\w\s/]", " ", s)
# 3. normalise whitespace
s = re.sub(r"\s+", " ", s).strip()
# 4. tokenise + synonym normalisation
tokens: list[str] = []
for tok in s.split():
replacement = ADDRESS_SYNONYMS.get(tok, tok)
if replacement:
tokens.append(replacement)
return " ".join(tokens)
@staticmethod
def levenshtein(a: str, b: str) -> float:
"""
Address similarity score in [0, 1].
Strategy:
- Normalise
- Strongly penalise mismatched house/flat numbers
- Combine token overlap + character similarity
"""
def extract_number_sequence(s: str) -> list[str]:
return re.findall(r"\d+[a-z]?", s)
def extract_numbers(s: str) -> set[str]:
return set(extract_number_sequence(s))
def tokenise(s: str) -> set[str]:
return set(s.split())
def extract_building_number(s: str) -> Optional[str]:
"""
Extract the main building number (NOT flat/unit).
Assumes formats like:
- '42 moreton road'
- 'flat 3 42 moreton road'
"""
tokens = s.split()
# remove flat/unit context
cleaned: list[Any] = []
skip_next = False
for t in tokens:
if t in ("flat", "apt", "apartment", "unit"):
skip_next = True
continue
if skip_next:
skip_next = False
continue
cleaned.append(t)
# first remaining number is building number
for t in cleaned:
if re.fullmatch(r"\d+[a-z]?", t):
return t
return None
a_norm = AddressMatch.normalise_address(a)
b_norm = AddressMatch.normalise_address(b)
# --- hard signal: numbers ---
nums_a = extract_numbers(a_norm)
nums_b = extract_numbers(b_norm)
if nums_a and not nums_b:
return 0.0
# No shared numbers at all → impossible match
if nums_a and nums_b and nums_a.isdisjoint(nums_b):
return 0.0
# 🔒 HARD GUARD: building number must match
bld_a = extract_building_number(a_norm)
bld_b = extract_building_number(b_norm)
if bld_a and bld_b and bld_a != bld_b:
return 0.0
# --- order-sensitive flat/building guard ---
seq_a = extract_number_sequence(a_norm)
seq_b = extract_number_sequence(b_norm)
has_flat_token_user = any(
tok in a_norm for tok in ("flat", "apt", "apartment", "unit")
)
has_flat_token_epc = "flat" in b_norm
if (
len(seq_a) == 2
and len(seq_b) >= 2
and has_flat_token_epc
and not has_flat_token_user
and seq_a != seq_b[:2]
):
return 0.0
# --- token similarity (order-independent) ---
toks_a: set[str] = tokenise(a_norm)
toks_b: set[str] = tokenise(b_norm)
if not toks_a or not toks_b:
token_score = 0.0
else:
token_score = len(toks_a & toks_b) / len(toks_a | toks_b)
# --- character similarity (soft signal) ---
char_score: float = SequenceMatcher(None, a_norm, b_norm).ratio()
# --- weighted blend ---
return round(
0.65 * token_score + 0.35 * char_score,
4,
)

95
backend/utils/subtasks.py Normal file
View file

@ -0,0 +1,95 @@
# decorators/subtask_handler.py
from functools import wraps
from typing import Callable, Any
from uuid import UUID
import json
from backend.app.db.functions.tasks.Tasks import SubTaskInterface
def subtask_handler():
"""
Decorator that wraps your existing handler and automatically:
- Extracts task_id + sub_task_id from event
- Marks subtask as in progress
- Executes handler logic
- Marks subtask complete on success
- Marks failed on exception
"""
def decorator(func: Callable[..., Any]):
@wraps(func)
def wrapper(event: dict[str, Any], context: Any, *args, **kwargs):
records = event.get("Records", [event])
interface = SubTaskInterface()
for record in records:
# -------------------------------
# Parse body safely
# -------------------------------
body = {}
if isinstance(record.get("body"), str):
try:
body = json.loads(record["body"])
except Exception:
body = {}
else:
body = record.get("body", {}) or {}
task_id_raw = body.get("task_id")
subtask_id_raw = body.get("sub_task_id")
task_id = UUID(task_id_raw) if isinstance(task_id_raw, str) else None
subtask_id = (
UUID(subtask_id_raw) if isinstance(subtask_id_raw, str) else None
)
if not task_id or not subtask_id:
raise RuntimeError("task_id or sub_task_id missing")
# -------------------------------
# Mark in progress
# -------------------------------
interface.update_subtask_status(
subtask_id=subtask_id,
status="in progress",
)
try:
# Pass the parsed body into your function
result = func(body, context, *args, **kwargs)
# -------------------------------
# Success → mark complete
# -------------------------------
interface.update_subtask_status(
subtask_id=subtask_id,
status="complete",
outputs={"result": result} if result else None,
)
except Exception as e:
# -------------------------------
# Failure → mark failed
# -------------------------------
interface.update_subtask_status(
subtask_id=subtask_id,
status="failed",
outputs={"error": str(e)},
)
raise
return None
return wrapper
return decorator

View file

@ -0,0 +1,64 @@
############################################
# Load Shared Terraform State
############################################
data "terraform_remote_state" "shared" {
backend = "s3"
config = {
bucket = "assessment-model-terraform-state"
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
############################################
# Load FastAPI Terraform State
############################################
data "terraform_remote_state" "fast_api" {
backend = "s3"
config = {
bucket = "ara-fast-api-terraform-state"
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
############################################
# Load CDN Certificate Terraform State
############################################
data "terraform_remote_state" "cdn_certificate" {
backend = "s3"
config = {
bucket = "cdn-certificate-terraform-state"
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
############################################
# CloudFront for API
############################################
module "cdn" {
source = "../modules/cloudfront"
aliases = [data.terraform_remote_state.fast_api.outputs.domain_name]
acm_certificate_arn = data.terraform_remote_state.cdn_certificate.outputs.certificate_arn
origins = [
# ---- S3 ----
{
origin_type = "s3"
origin_domain_name = data.terraform_remote_state.shared.outputs.retrofit_datalake_bucket_domain_name
origin_id = "s3-origin"
bucket_id = data.terraform_remote_state.shared.outputs.retrofit_datalake_bucket_id
bucket_arn = data.terraform_remote_state.shared.outputs.retrofit_datalake_bucket_arn
},
# ---- API Gateway ----
{
origin_type = "api"
origin_domain_name = replace(data.terraform_remote_state.fast_api.outputs.invoke_url, "/^https?://([^/]*).*/", "$1")
origin_id = "api-origin"
}
]
}

View file

@ -0,0 +1,14 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.0"
}
}
backend "s3" {
bucket = "ara-cdn-terraform-state"
key = "terraform.tfstate"
region = "eu-west-2"
}
}

View file

@ -0,0 +1,3 @@
variable "stage" {
type = string
}

View file

@ -0,0 +1,28 @@
############################################
# Load FastAPI Terraform State
############################################
data "terraform_remote_state" "fast_api" {
backend = "s3"
config = {
bucket = "ara-fast-api-terraform-state"
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
############################################
# Define Certificate
############################################
module "cdn_certificate" {
source = "../modules/acm_certificate"
providers = {
aws = aws.us_east_1
}
domain_name = data.terraform_remote_state.fast_api.outputs.domain_name
tags = {
Environment = var.stage
}
}

View file

@ -0,0 +1,3 @@
output "certificate_arn" {
value = module.cdn_certificate.certificate_arn
}

View file

@ -0,0 +1,23 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.0"
}
}
backend "s3" {
bucket = "cdn-certificate-terraform-state"
key = "terraform.tfstate"
region = "eu-west-2"
}
}
provider "aws" {
region = var.region
}
provider "aws" {
alias = "us_east_1"
region = "us-east-1"
}

View file

@ -0,0 +1,3 @@
variable "stage" {
type = string
}

View file

@ -26,7 +26,7 @@ data "terraform_remote_state" "shared" {
} }
module "lambda" { module "lambda" {
source = "../modules/lambda_with_sqs" source = "../../modules/lambda_with_sqs"
name = REPLACE ME #"address2uprn" for example name = REPLACE ME #"address2uprn" for example
stage = var.stage stage = var.stage

View file

@ -2,7 +2,7 @@ terraform {
required_providers { required_providers {
aws = { aws = {
source = "hashicorp/aws" source = "hashicorp/aws"
version = "~> 4.16" version = ">= 5.0"
} }
} }

View file

@ -15,7 +15,7 @@ locals {
} }
module "address2uprn" { module "address2uprn" {
source = "../modules/lambda_with_sqs" source = "../../modules/lambda_with_sqs"
name = "address2uprn" name = "address2uprn"
stage = var.stage stage = var.stage
@ -33,19 +33,6 @@ module "address2uprn" {
LOG_LEVEL = "info" LOG_LEVEL = "info"
DB_USERNAME = local.db_credentials.db_assessment_model_username DB_USERNAME = local.db_credentials.db_assessment_model_username
DB_PASSWORD = local.db_credentials.db_assessment_model_password DB_PASSWORD = local.db_credentials.db_assessment_model_password
GOOGLE_SOLAR_API_KEY = "test"
SAP_PREDICTIONS_BUCKET = "test"
CARBON_PREDICTIONS_BUCKET = "test"
HEAT_PREDICTIONS_BUCKET = "test"
HEATING_KWH_PREDICTIONS_BUCKET = "test"
HOTWATER_KWH_PREDICTIONS_BUCKET = "test"
API_KEY = "test"
ENVIRONMENT = "test"
SECRET_KEY = "test"
PLAN_TRIGGER_BUCKET = "test"
DATA_BUCKET = "test"
ENGINE_SQS_URL = "test"
ENERGY_ASSESSMENTS_BUCKET = "test"
S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
}, },
) )

View file

@ -2,7 +2,7 @@ terraform {
required_providers { required_providers {
aws = { aws = {
source = "hashicorp/aws" source = "hashicorp/aws"
version = "~> 4.16" version = ">= 5.0"
} }
} }

View file

@ -16,7 +16,7 @@ locals {
} }
module "lambda" { module "lambda" {
source = "../modules/lambda_with_sqs" source = "../../modules/lambda_with_sqs"
name = "categorisation" name = "categorisation"
stage = var.stage stage = var.stage

View file

@ -0,0 +1,9 @@
output "categorisation_queue_url" {
value = module.lambda.queue_url
description = "URL of the Categorisation SQS queue"
}
output "categorisation_queue_arn" {
value = module.lambda.queue_arn
description = "ARN of the Categorisation SQS queue"
}

View file

@ -2,7 +2,7 @@ terraform {
required_providers { required_providers {
aws = { aws = {
source = "hashicorp/aws" source = "hashicorp/aws"
version = "~> 4.16" version = ">= 5.0"
} }
} }

View file

@ -17,7 +17,7 @@ locals {
module "lambda" { module "lambda" {
source = "../modules/lambda_with_sqs" source = "../../modules/lambda_with_sqs"
name = "condition-etl" name = "condition-etl"
stage = var.stage stage = var.stage

View file

@ -2,7 +2,7 @@ terraform {
required_providers { required_providers {
aws = { aws = {
source = "hashicorp/aws" source = "hashicorp/aws"
version = "~> 4.16" version = ">= 5.0"
} }
} }

View file

@ -17,15 +17,17 @@ locals {
module "lambda" { module "lambda" {
source = "../modules/lambda_with_sqs" source = "../../modules/lambda_with_sqs"
name = "engine" name = "engine"
stage = var.stage stage = var.stage
image_uri = local.image_uri image_uri = local.image_uri
# Optional: Set maximum_concurrency to limit concurrent SQS-triggered invocations (2-1000)
maximum_concurrency = var.maximum_concurrency maximum_concurrency = var.maximum_concurrency
batch_size = var.batch_size
timeout = var.timeout
memory_size = var.memory_size
environment = merge( environment = merge(
{ {
@ -42,7 +44,6 @@ module "lambda" {
DB_PORT = var.db_port DB_PORT = var.db_port
API_KEY = var.api_key API_KEY = var.api_key
SECRET_KEY = var.secret_key SECRET_KEY = var.secret_key
DOMAIN_NAME = var.domain_name
EPC_AUTH_TOKEN = var.epc_auth_token EPC_AUTH_TOKEN = var.epc_auth_token
GOOGLE_SOLAR_API_KEY = var.google_solar_api_key GOOGLE_SOLAR_API_KEY = var.google_solar_api_key

View file

@ -0,0 +1,9 @@
output "ara_engine_queue_url" {
value = module.lambda.queue_url
description = "URL of the Engine SQS queue"
}
output "ara_engine_queue_arn" {
value = module.lambda.queue_arn
description = "ARN of the Engine SQS queue"
}

View file

@ -2,7 +2,7 @@ terraform {
required_providers { required_providers {
aws = { aws = {
source = "hashicorp/aws" source = "hashicorp/aws"
version = "~> 4.16" version = ">= 5.0"
} }
} }

View file

@ -23,6 +23,23 @@ variable "maximum_concurrency" {
description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit."
} }
variable "batch_size" {
type = number
default = 1
}
variable "timeout" {
type = number
default = 900
description = "Lambda timeout in seconds"
}
variable "memory_size" {
type = number
default = 3008
description = "Lambda memory size in MB"
}
variable "db_host" { variable "db_host" {
type = string type = string
sensitive = true sensitive = true
@ -48,10 +65,6 @@ variable "secret_key" {
sensitive = true sensitive = true
} }
variable "domain_name" {
type = string
}
variable "epc_auth_token" { variable "epc_auth_token" {
type = string type = string
sensitive = true sensitive = true

View file

@ -0,0 +1,124 @@
############################################
# Load Terraform State
############################################
data "terraform_remote_state" "shared" {
backend = "s3"
config = {
bucket = "assessment-model-terraform-state"
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
data "terraform_remote_state" "engine" {
backend = "s3"
config = {
bucket = "ara-engine-terraform-state",
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
data "terraform_remote_state" "categorisation" {
backend = "s3"
config = {
bucket = "categorisation-terraform-state",
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
############################################
# Load Credentials
############################################
data "aws_secretsmanager_secret_version" "db_credentials" {
secret_id = "${var.stage}/assessment_model/db_credentials"
}
locals {
db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)
}
############################################
# FastAPI Lambda + API Gateway
############################################
module "fastapi" {
source = "../../modules/lambda_with_api_gateway"
name = "fastapi"
stage = var.stage
source_dir = "${path.root}/../../../../"
handler = "backend.app.main.handler"
runtime = "python3.11"
timeout = 600
memory_size = 512
artifact_bucket = data.terraform_remote_state.shared.outputs.ara_fast_api_state_bucket
requirements_file = "${path.root}/../../../../backend/app/requirements/requirements.txt"
domain_name = "api.${var.domain_name}"
environment = {
ENVIRONMENT = var.stage
API_KEY = var.api_key
SECRET_KEY = var.secret_key
# DOMAIN_NAME = var.domain_name
EPC_AUTH_TOKEN = var.epc_auth_token
GOOGLE_SOLAR_API_KEY = var.google_solar_api_key
DB_HOST = var.db_host
DB_NAME = var.db_name
DB_PORT = var.db_port
DB_USERNAME = local.db_credentials.db_assessment_model_username
DB_PASSWORD = local.db_credentials.db_assessment_model_password
PLAN_TRIGGER_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_plan_trigger_bucket_name
DATA_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
SAP_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_sap_predictions_bucket_name
CARBON_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_carbon_predictions_bucket_name
HEAT_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_heat_predictions_bucket_name
HEATING_KWH_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_heating_kwh_predictions_bucket_name
HOTWATER_KWH_PREDICTIONS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_hotwater_kwh_predictions_bucket_name
ENERGY_ASSESSMENTS_BUCKET = data.terraform_remote_state.shared.outputs.retrofit_energy_assessments_bucket_name
ENGINE_SQS_URL = data.terraform_remote_state.engine.outputs.ara_engine_queue_url
CATEGORISATION_SQS_URL = data.terraform_remote_state.categorisation.outputs.categorisation_queue_url
}
}
############################################
# IAM policy attachments
############################################
# SQS
module "fastapi_sqs_policy" {
source = "../../modules/general_iam_policy"
policy_name = "fastapi-sqs-send-${var.stage}"
policy_description = "Allow FastAPI to send messages to engine & categorisation queues"
actions = [
"sqs:SendMessage"
]
resources = [
data.terraform_remote_state.engine.outputs.ara_engine_queue_arn,
data.terraform_remote_state.categorisation.outputs.categorisation_queue_arn
]
conditions = null
tags = {
Service = "fastapi"
Stage = var.stage
}
}
resource "aws_iam_role_policy_attachment" "fastapi_sqs_send" {
role = module.fastapi.role_name
policy_arn = module.fastapi_sqs_policy.policy_arn
}
# S3
resource "aws_iam_role_policy_attachment" "fastapi_s3_read_and_write" {
role = module.fastapi.role_name
policy_arn = data.terraform_remote_state.shared.outputs.fast_api_s3_read_and_write_arn
}

View file

@ -0,0 +1,7 @@
output "domain_name" {
value = module.fastapi.domain_name
}
output "invoke_url" {
value = module.fastapi.invoke_url
}

View file

@ -0,0 +1,16 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.0"
}
}
backend "s3" {
bucket = "ara-fast-api-terraform-state"
key = "terraform.tfstate"
region = "eu-west-2"
}
required_version = ">= 1.2.0"
}

View file

@ -0,0 +1,44 @@
variable "lambda_name" {
type = string
description = "Logical name of the lambda (e.g. address2uprn)"
}
variable "stage" {
type = string
}
variable "db_host" {
type = string
}
variable "db_name" {
type = string
}
variable "db_port" {
type = string
}
variable "api_key" {
type = string
sensitive = true
}
variable "secret_key" {
type = string
sensitive = true
}
variable "domain_name" {
type = string
}
variable "epc_auth_token" {
type = string
sensitive = true
}
variable "google_solar_api_key" {
type = string
sensitive = true
}

View file

@ -0,0 +1,45 @@
data "terraform_remote_state" "shared" {
backend = "s3"
config = {
bucket = "assessment-model-terraform-state"
key = "env:/${var.stage}/terraform.tfstate"
region = "eu-west-2"
}
}
data "aws_secretsmanager_secret_version" "db_credentials" {
secret_id = "${var.stage}/assessment_model/db_credentials"
}
locals {
db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)
}
module "ordnance" {
source = "../../modules/lambda_with_sqs"
name = "ordnanceSurvey" #"address2uprn" for example
stage = var.stage
image_uri = local.image_uri
timeout = 900
# Optional: Set maximum_concurrency to limit concurrent SQS-triggered invocations (2-1000)
maximum_concurrency = var.maximum_concurrency
environment = merge(
{
STAGE = var.stage
LOG_LEVEL = "info"
DB_USERNAME = local.db_credentials.db_assessment_model_username
DB_PASSWORD = local.db_credentials.db_assessment_model_password
S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name
ORDNANCE_SURVEY_API_KEY = var.ordnance_survey_api_key
},
)
}
# Attach S3 read policy to the Lambda execution role
resource "aws_iam_role_policy_attachment" "ordanceSurvey_read_and_write" {
role = module.ordnance.role_name
policy_arn = data.terraform_remote_state.shared.outputs.ordnance_s3_read_and_write_arn
}

View file

@ -0,0 +1,16 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.0"
}
}
backend "s3" {
bucket = "ordnance-terraform-state"
key = "terraform.tfstate"
region = "eu-west-2"
}
required_version = ">= 1.2.0"
}

View file

@ -0,0 +1,43 @@
variable "lambda_name" {
type = string
description = "Logical name of the lambda (e.g. address2uprn)"
}
variable "stage" {
description = "Deployment stage (e.g. dev, prod)"
type = string
}
variable "ecr_repo_url" {
type = string
description = "ECR repository URL (no tag, no digest)"
}
variable "image_digest" {
type = string
description = "Image digest (sha256:...)"
}
variable "maximum_concurrency" {
type = number
default = null
description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit."
}
variable "batch_size" {
type = number
default = 1
}
variable "ordnance_survey_api_key" {
type = string
sensitive = true
}
locals {
image_uri = "${var.ecr_repo_url}@${var.image_digest}"
}
output "resolved_image_uri" {
value = local.image_uri
}

View file

@ -26,7 +26,7 @@ data "terraform_remote_state" "address2uprn" {
} }
module "lambda" { module "lambda" {
source = "../modules/lambda_with_sqs" source = "../../modules/lambda_with_sqs"
name = "postcode-splitter" name = "postcode-splitter"
stage = var.stage stage = var.stage

View file

@ -2,7 +2,7 @@ terraform {
required_providers { required_providers {
aws = { aws = {
source = "hashicorp/aws" source = "hashicorp/aws"
version = "~> 4.16" version = ">= 5.0"
} }
} }

View file

@ -0,0 +1,11 @@
resource "aws_acm_certificate" "this" {
domain_name = var.domain_name
subject_alternative_names = var.subject_alternative_names
validation_method = "DNS"
lifecycle {
create_before_destroy = false
}
tags = var.tags
}

View file

@ -0,0 +1,7 @@
output "certificate_arn" {
value = aws_acm_certificate.this.arn
}
output "domain_validation_options" {
value = aws_acm_certificate.this.domain_validation_options
}

View file

@ -0,0 +1,16 @@
variable "domain_name" {
description = "Primary domain name for the certificate"
type = string
}
variable "subject_alternative_names" {
description = "Additional domains for the certificate"
type = list(string)
default = []
}
variable "tags" {
description = "Tags to apply to the certificate"
type = map(string)
default = {}
}

View file

@ -1,65 +1,165 @@
resource "aws_cloudfront_distribution" "s3_distribution" { #############################################
origin { # Use Managed Caching and Forwarding Policies
domain_name = var.bucket_domain_name #############################################
origin_id = "S3-${var.bucket_name}" data "aws_cloudfront_cache_policy" "caching_disabled" {
name = "Managed-CachingDisabled"
}
s3_origin_config { data "aws_cloudfront_origin_request_policy" "all_viewer_except_host_header" {
origin_access_identity = aws_cloudfront_origin_access_identity.oai.cloudfront_access_identity_path name = "Managed-AllViewerExceptHostHeader"
}
############################################
# CloudFront Distribution
############################################
resource "aws_cloudfront_distribution" "this" {
##########################################
# Origins
##########################################
dynamic "origin" {
for_each = { for o in var.origins : o.origin_id => o }
content {
domain_name = origin.value.origin_domain_name
origin_id = origin.value.origin_id
######################################
# S3 Origin
######################################
dynamic "s3_origin_config" {
for_each = origin.value.origin_type == "s3" ? [1] : []
content {
origin_access_identity = aws_cloudfront_origin_access_identity.oai[origin.key].cloudfront_access_identity_path
}
}
######################################
# API Gateway Origin
######################################
dynamic "custom_origin_config" {
for_each = origin.value.origin_type == "api" ? [1] : []
content {
http_port = 80
https_port = 443
origin_protocol_policy = "https-only"
origin_ssl_protocols = ["TLSv1.2"]
}
}
} }
} }
enabled = true enabled = true
aliases = var.aliases
##########################################
# Default Cache Behavior (S3)
##########################################
default_cache_behavior { default_cache_behavior {
allowed_methods = ["GET", "HEAD"] target_origin_id = "s3-origin"
cached_methods = ["GET", "HEAD"]
target_origin_id = "S3-${var.bucket_name}"
viewer_protocol_policy = "redirect-to-https" viewer_protocol_policy = "redirect-to-https"
compress = true
allowed_methods = ["GET", "HEAD"]
cached_methods = ["GET", "HEAD"]
forwarded_values { forwarded_values {
query_string = false query_string = false
cookies { cookies {
forward = "none" forward = "none"
} }
} }
min_ttl = 0 compress = true
default_ttl = 86400 min_ttl = 0
max_ttl = 31536000 default_ttl = 3600
max_ttl = 86400
}
##########################################
# API Behavior
##########################################
ordered_cache_behavior {
path_pattern = "/v1/*"
target_origin_id = "api-origin"
viewer_protocol_policy = "redirect-to-https"
allowed_methods = ["GET","HEAD","OPTIONS","PUT","POST","PATCH","DELETE"]
cached_methods = ["GET","HEAD"]
cache_policy_id = data.aws_cloudfront_cache_policy.caching_disabled.id
origin_request_policy_id = data.aws_cloudfront_origin_request_policy.all_viewer_except_host_header.id
} }
price_class = "PriceClass_All" price_class = "PriceClass_All"
##########################################
# Geo Restrictions
##########################################
restrictions { restrictions {
geo_restriction { geo_restriction {
restriction_type = "none" restriction_type = "none"
} }
} }
##########################################
# SSL Certificate
##########################################
viewer_certificate { viewer_certificate {
cloudfront_default_certificate = true acm_certificate_arn = var.acm_certificate_arn
ssl_support_method = var.acm_certificate_arn != null ? "sni-only" : null
minimum_protocol_version = var.acm_certificate_arn != null ? "TLSv1.2_2021" : null
cloudfront_default_certificate = var.acm_certificate_arn == null
} }
} }
############################################
# Origin Access Identities (S3 only)
############################################
resource "aws_cloudfront_origin_access_identity" "oai" { resource "aws_cloudfront_origin_access_identity" "oai" {
comment = "OAI for ${var.bucket_name}" for_each = {
for o in var.origins : o.origin_id => o
if o.origin_type == "s3"
}
comment = "OAI for ${each.key}"
} }
############################################
# S3 Bucket Policy (S3 only)
############################################
resource "aws_s3_bucket_policy" "bucket_policy" { resource "aws_s3_bucket_policy" "bucket_policy" {
bucket = var.bucket_id for_each = {
for o in var.origins : o.origin_id => o
if o.origin_type == "s3"
}
bucket = each.value.bucket_id
policy = jsonencode({ policy = jsonencode({
Version = "2012-10-17" Version = "2012-10-17"
Statement = [ Statement = [
{ {
Effect = "Allow" Effect = "Allow"
Principal = { Principal = {
AWS = "arn:aws:iam::cloudfront:user/CloudFront Origin Access Identity ${aws_cloudfront_origin_access_identity.oai.id}" AWS = aws_cloudfront_origin_access_identity.oai[each.key].iam_arn
} }
Action = "s3:GetObject" Action = "s3:GetObject"
Resource = "${var.bucket_arn}/*" Resource = "${each.value.bucket_arn}/*"
}, }
] ]
}) })
} }

View file

@ -1,24 +1,20 @@
variable "bucket_name" { variable "origins" {
description = "The name of the bucket" type = list(object({
type = string origin_type = string # "s3" or "api"
origin_domain_name = string
origin_id = string
bucket_id = optional(string)
bucket_arn = optional(string)
}))
} }
variable "stage" { variable "aliases" {
description = "The deployment stage" type = list(string)
type = string
} }
variable "bucket_id" { variable "acm_certificate_arn" {
description = "The ID of the S3 bucket" description = "ACM certificate ARN for custom aliases"
type = string
}
variable "bucket_arn" {
description = "The ARN of the S3 bucket"
type = string
}
variable "bucket_domain_name" {
description = "The regional domain name of the S3 bucket"
type = string type = string
default = null
} }

View file

@ -0,0 +1,27 @@
resource "aws_lambda_function" "this" {
function_name = var.name
role = var.role_arn
package_type = "Zip"
# filename = var.filename
source_code_hash = var.source_code_hash
handler = var.handler
runtime = var.runtime
timeout = var.timeout
memory_size = var.memory_size
publish = true
s3_bucket = var.s3_bucket
s3_key = var.s3_key
environment {
variables = var.environment
}
}
output "lambda_arn" {
value = aws_lambda_function.this.arn
}
output "function_name" {
value = aws_lambda_function.this.function_name
}

View file

@ -0,0 +1,20 @@
variable "name" { type = string }
variable "role_arn" { type = string }
# variable "filename" { type = string }
variable "source_code_hash" { type = string }
variable "handler" { type = string }
variable "runtime" { type = string }
variable "timeout" {
type = number
default = 30
}
variable "memory_size" {
type = number
default = 128
}
variable "environment" {
type = map(string)
default = {}
}
variable "s3_bucket" { type = string }
variable "s3_key" { type = string }

View file

@ -0,0 +1,117 @@
############################################
# IAM role
############################################
module "role" {
source = "../lambda_execution_role"
name = "${var.name}-lambda-${var.stage}"
}
############################################
# Cloudwatch log group
############################################
resource "aws_cloudwatch_log_group" "api_logs" {
name = "/aws/apigateway/${var.name}-${var.stage}"
retention_in_days = 14
}
############################################
# Install python packages
############################################
resource "null_resource" "pip_install" {
count = var.requirements_file != null ? 1 : 0
triggers = {
always_run = timestamp()
}
provisioner "local-exec" {
command = "pip install -r ${var.requirements_file} -t ${var.source_dir} --platform manylinux2014_x86_64 --implementation cp --python-version 3.11 --only-binary=:all: --upgrade"
}
}
############################################
# Zip the source code
############################################
data "archive_file" "this" {
depends_on = [null_resource.pip_install]
type = "zip"
source_dir = var.source_dir
output_path = "${path.module}/lambda_package.zip"
excludes = var.zip_excludes
}
############################################
# Upload zip to S3
############################################
resource "aws_s3_object" "lambda_zip" {
bucket = var.artifact_bucket
key = "env:/${var.stage}/${var.name}.zip"
source = data.archive_file.this.output_path
etag = data.archive_file.this.output_md5
}
############################################
# Lambda
############################################
module "lambda" {
source = "../lambda_service_zip"
name = "${var.name}-${var.stage}"
role_arn = module.role.role_arn
s3_bucket = var.artifact_bucket
s3_key = aws_s3_object.lambda_zip.key
source_code_hash = data.archive_file.this.output_base64sha256
handler = var.handler
runtime = var.runtime
timeout = var.timeout
memory_size = var.memory_size
environment = var.environment
}
############################################
# API Gateway
############################################
resource "aws_apigatewayv2_api" "this" {
name = "${var.name}-api-${var.stage}"
protocol_type = "HTTP"
}
resource "aws_apigatewayv2_stage" "this" {
api_id = aws_apigatewayv2_api.this.id
name = "$default"
auto_deploy = true
access_log_settings {
destination_arn = aws_cloudwatch_log_group.api_logs.arn
format = jsonencode({
requestId = "$context.requestId"
domainName = "$context.domainName"
path = "$context.path"
status = "$context.status"
sourceIp = "$context.identity.sourceIp"
userAgent = "$context.identity.userAgent"
})
}
}
resource "aws_apigatewayv2_integration" "this" {
api_id = aws_apigatewayv2_api.this.id
integration_type = "AWS_PROXY"
integration_uri = module.lambda.lambda_arn
payload_format_version = "2.0"
}
resource "aws_apigatewayv2_route" "catch_all" {
api_id = aws_apigatewayv2_api.this.id
route_key = "$default"
target = "integrations/${aws_apigatewayv2_integration.this.id}"
}
resource "aws_lambda_permission" "apigw_invoke" {
statement_id = "AllowAPIGatewayInvoke"
action = "lambda:InvokeFunction"
function_name = module.lambda.lambda_arn
principal = "apigateway.amazonaws.com"
source_arn = "${aws_apigatewayv2_api.this.execution_arn}/*/*"
}

View file

@ -0,0 +1,11 @@
output "role_name" {
value = module.role.role_name
}
output "domain_name" {
value = var.domain_name
}
output "invoke_url" {
value = aws_apigatewayv2_stage.this.invoke_url
}

View file

@ -0,0 +1,48 @@
variable "name" { type = string }
variable "stage" { type = string }
variable "source_dir" { type = string }
variable "handler" { type = string }
variable "runtime" { type = string }
variable "zip_excludes" {
type = list(string)
default = [
"**/__pycache__/**",
"**/*.pyc",
"**/.pytest_cache/**",
"**/tests/**",
"**/infrastructure/**"
]
}
variable "timeout" {
type = number
default = 600
}
variable "memory_size" {
type = number
default = 512
}
variable "environment" {
type = map(string)
default = {}
}
variable "domain_name" {
type = string
default = null
}
variable "certificate_arn" {
type = string
default = null
}
variable "route53_zone_id" {
type = string
default = null
}
variable "artifact_bucket" { type = string }
variable "requirements_file" {
type = string
default = null
}

View file

@ -2,7 +2,7 @@
# IAM role # IAM role
############################################ ############################################
module "role" { module "role" {
source = "../../../modules/lambda_execution_role" source = "../lambda_execution_role"
name = "${var.name}-lambda-${var.stage}" name = "${var.name}-lambda-${var.stage}"
} }
@ -14,7 +14,7 @@ output "role_name" {
# SQS queue + DLQ # SQS queue + DLQ
############################################ ############################################
module "queue" { module "queue" {
source = "../../../modules/sqs_queue" source = "../sqs_queue"
name = "${var.name}-queue-${var.stage}" name = "${var.name}-queue-${var.stage}"
} }
@ -22,7 +22,7 @@ module "queue" {
# Lambda # Lambda
############################################ ############################################
module "lambda" { module "lambda" {
source = "../../../modules/lambda_service" source = "../lambda_service"
name = "${var.name}-${var.stage}" name = "${var.name}-${var.stage}"
role_arn = module.role.role_arn role_arn = module.role.role_arn
@ -38,7 +38,7 @@ module "lambda" {
# SQS Lambda trigger # SQS Lambda trigger
############################################ ############################################
module "sqs_trigger" { module "sqs_trigger" {
source = "../../../modules/lambda_sqs_trigger" source = "../lambda_sqs_trigger"
lambda_arn = module.lambda.lambda_arn lambda_arn = module.lambda.lambda_arn
lambda_role_name = module.role.role_name lambda_role_name = module.role.role_name

View file

@ -2,9 +2,10 @@
locals { locals {
# Generate full resource ARNs by combining bucket ARNs with resource paths # Generate full resource ARNs by combining bucket ARNs with resource paths
resources = flatten([ resources = flatten([
for bucket_arn in var.bucket_arns : [ for bucket_arn in var.bucket_arns : concat(
for path in var.resource_paths : "${bucket_arn}${path}" [bucket_arn], # bare ARN for bucket-level actions like ListBucket
] [for path in var.resource_paths : "${bucket_arn}${path}"]
)
]) ])
} }

View file

@ -2,20 +2,20 @@ terraform {
required_providers { required_providers {
aws = { aws = {
source = "hashicorp/aws" source = "hashicorp/aws"
version = "~> 4.16" version = ">= 5.0"
} }
} }
backend "s3" { backend "s3" {
bucket = "assessment-model-terraform-state" bucket = "assessment-model-terraform-state"
region = "eu-west-2" region = "eu-west-2"
key = "terraform.tfstate" key = "terraform.tfstate"
} }
required_version = ">= 1.2.0" required_version = ">= 1.2.0"
} }
provider "aws" { provider "aws" {
region = var.region region = var.region
} }
# Additional provider for resources that need to be in us-east-1, specifically the SSL certificate # Additional provider for resources that need to be in us-east-1, specifically the SSL certificate
@ -47,30 +47,30 @@ resource "aws_security_group" "allow_db" {
ingress { ingress {
# TLS (change to whatever ports you need) # TLS (change to whatever ports you need)
from_port = 5432 from_port = 5432
to_port = 5432 to_port = 5432
protocol = "tcp" protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"] cidr_blocks = ["0.0.0.0/0"]
} }
egress { egress {
from_port = 0 from_port = 0
to_port = 0 to_port = 0
protocol = "-1" protocol = "-1"
cidr_blocks = ["0.0.0.0/0"] cidr_blocks = ["0.0.0.0/0"]
} }
} }
resource "aws_db_instance" "default" { resource "aws_db_instance" "default" {
allocated_storage = var.allocated_storage allocated_storage = var.allocated_storage
engine = "postgres" engine = "postgres"
engine_version = "14.17" engine_version = "14.17"
instance_class = var.instance_class instance_class = var.instance_class
db_name = var.database_name db_name = var.database_name
username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"] username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"]
password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"] password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"]
parameter_group_name = "default.postgres14" parameter_group_name = "default.postgres14"
skip_final_snapshot = true skip_final_snapshot = true
vpc_security_group_ids = [aws_security_group.allow_db.id] vpc_security_group_ids = [aws_security_group.allow_db.id]
lifecycle { lifecycle {
prevent_destroy = true prevent_destroy = true
@ -87,7 +87,7 @@ resource "aws_db_instance" "default" {
storage_type = "gp3" storage_type = "gp3"
# Automated backups configuration # Automated backups configuration
backup_retention_period = 14 backup_retention_period = 14
backup_window = "03:00-04:00" backup_window = "03:00-04:00"
maintenance_window = "Sun:02:00-Sun:02:30" maintenance_window = "Sun:02:00-Sun:02:30"
copy_tags_to_snapshot = true copy_tags_to_snapshot = true
@ -103,7 +103,7 @@ module "s3_presignable_bucket" {
} }
output "retrofit_plan_trigger_bucket_name" { output "retrofit_plan_trigger_bucket_name" {
value = module.s3_presignable_bucket.bucket_name value = module.s3_presignable_bucket.bucket_name
description = "Name of the retrofit plan trigger bucket" description = "Name of the retrofit plan trigger bucket"
} }
@ -127,6 +127,22 @@ module "s3" {
allowed_origins = var.allowed_origins allowed_origins = var.allowed_origins
} }
output "retrofit_datalake_bucket_id" {
value = module.s3.bucket_id
}
output "retrofit_datalake_bucket_arn" {
value = module.s3.bucket_arn
}
output "retrofit_datalake_bucket_domain_name" {
value = module.s3.bucket_domain_name
}
output "retrofit_datalake_bucket_name" {
value = module.s3.bucket_name
}
module "model_directory" { module "model_directory" {
source = "../modules/s3" source = "../modules/s3"
bucketname = "retrofit-model-directory-${var.stage}" bucketname = "retrofit-model-directory-${var.stage}"
@ -140,7 +156,7 @@ module "retrofit_sap_predictions" {
} }
output "retrofit_sap_predictions_bucket_name" { output "retrofit_sap_predictions_bucket_name" {
value = module.retrofit_sap_predictions.bucket_name value = module.retrofit_sap_predictions.bucket_name
description = "Name of the retrofit SAP predictions bucket" description = "Name of the retrofit SAP predictions bucket"
} }
@ -151,7 +167,7 @@ module "retrofit_sap_data" {
} }
output "retrofit_sap_data_bucket_name" { output "retrofit_sap_data_bucket_name" {
value = module.retrofit_sap_data.bucket_name value = module.retrofit_sap_data.bucket_name
description = "Name of the retrofit SAP data bucket" description = "Name of the retrofit SAP data bucket"
} }
@ -162,7 +178,7 @@ module "retrofit_carbon_predictions" {
} }
output "retrofit_carbon_predictions_bucket_name" { output "retrofit_carbon_predictions_bucket_name" {
value = module.retrofit_carbon_predictions.bucket_name value = module.retrofit_carbon_predictions.bucket_name
description = "Name of the retrofit carbon predictions bucket" description = "Name of the retrofit carbon predictions bucket"
} }
@ -173,7 +189,7 @@ module "retrofit_heat_predictions" {
} }
output "retrofit_heat_predictions_bucket_name" { output "retrofit_heat_predictions_bucket_name" {
value = module.retrofit_heat_predictions.bucket_name value = module.retrofit_heat_predictions.bucket_name
description = "Name of the retrofit heat predictions bucket" description = "Name of the retrofit heat predictions bucket"
} }
@ -202,7 +218,7 @@ module "retrofit_heating_kwh_predictions" {
} }
output "retrofit_heating_kwh_predictions_bucket_name" { output "retrofit_heating_kwh_predictions_bucket_name" {
value = module.retrofit_heating_kwh_predictions.bucket_name value = module.retrofit_heating_kwh_predictions.bucket_name
description = "Name of the retrofit heating kWh predictions bucket" description = "Name of the retrofit heating kWh predictions bucket"
} }
@ -213,7 +229,7 @@ module "retrofit_hotwater_kwh_predictions" {
} }
output "retrofit_hotwater_kwh_predictions_bucket_name" { output "retrofit_hotwater_kwh_predictions_bucket_name" {
value = module.retrofit_hotwater_kwh_predictions.bucket_name value = module.retrofit_hotwater_kwh_predictions.bucket_name
description = "Name of the retrofit hotwater kWh predictions bucket" description = "Name of the retrofit hotwater kWh predictions bucket"
} }
@ -232,7 +248,7 @@ module "retrofit_energy_assessments" {
} }
output "retrofit_energy_assessments_bucket_name" { output "retrofit_energy_assessments_bucket_name" {
value = module.retrofit_energy_assessments.bucket_name value = module.retrofit_energy_assessments.bucket_name
description = "Name of the retrofit energy assessments bucket" description = "Name of the retrofit energy assessments bucket"
} }
@ -311,16 +327,14 @@ module "sap_baseline_ecr" {
source = "../modules/ecr" source = "../modules/ecr"
} }
############################################## module "heat_baseline_ecr" {
# CDN - Cloudfront ecr_name = "heat-baseline-prediction-${var.stage}"
############################################## source = "../modules/ecr"
module "cloudfront_distribution" { }
source = "../modules/cloudfront"
bucket_name = module.s3.bucket_name module "carbon_baseline_ecr" {
bucket_id = module.s3.bucket_id ecr_name = "carbon-baseline-prediction-${var.stage}"
bucket_arn = module.s3.bucket_arn source = "../modules/ecr"
bucket_domain_name = module.s3.bucket_domain_name
stage = var.stage
} }
################################################ ################################################
@ -348,7 +362,7 @@ module "address2uprn_state_bucket" {
module "address2uprn_registry" { module "address2uprn_registry" {
source = "../modules/container_registry" source = "../modules/container_registry"
name = "address2uprn" name = "address2uprn"
stage = var.stage stage = var.stage
} }
@ -379,14 +393,14 @@ module "condition_etl_state_bucket" {
module "condition_etl_registry" { module "condition_etl_registry" {
source = "../modules/container_registry" source = "../modules/container_registry"
name = "condition-etl" name = "condition-etl"
stage = var.stage stage = var.stage
} }
# Condition Data S3 Bucket to store initial data # Condition Data S3 Bucket to store initial data
module "condition_data_bucket" { module "condition_data_bucket" {
source = "../modules/s3" source = "../modules/s3"
bucketname = "condition-data-${var.stage}" bucketname = "condition-data-${var.stage}"
allowed_origins = var.allowed_origins allowed_origins = var.allowed_origins
} }
@ -417,7 +431,7 @@ module "postcode_splitter_state_bucket" {
module "postcode_splitter_registry" { module "postcode_splitter_registry" {
source = "../modules/container_registry" source = "../modules/container_registry"
name = "postcode_splitter" name = "postcode_splitter"
stage = var.stage stage = var.stage
} }
@ -448,7 +462,39 @@ module "categorisation_state_bucket" {
module "categorisation_registry" { module "categorisation_registry" {
source = "../modules/container_registry" source = "../modules/container_registry"
name = "categorisation" name = "categorisation"
stage = var.stage stage = var.stage
}
################################################
# OrdnanceSurveyAPI Lambda
################################################
module "ordnance_state_bucket" {
source = "../modules/tf_state_bucket"
bucket_name = "ordnance-terraform-state"
}
module "ordnance_registry" {
source = "../modules/container_registry"
name = "ordnance"
stage = var.stage
}
# S3 policy for postcode splitter to read from retrofit data bucket
module "ordnance_s3_read_and_write" {
source = "../modules/s3_iam_policy"
policy_name = "OrdnanceSurveyReadandWriteS3"
policy_description = "Allow ordnance Lambda to read and write from retrofit-data bucket"
bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"]
actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"]
resource_paths = ["/*"]
}
output "ordnance_s3_read_and_write_arn" {
value = module.ordnance_s3_read_and_write.policy_arn
} }
################################################ ################################################
@ -463,7 +509,7 @@ module "engine_state_bucket" {
module "engine_registry" { module "engine_registry" {
source = "../modules/container_registry" source = "../modules/container_registry"
name = "engine" name = "engine"
stage = var.stage stage = var.stage
} }
# S3 policy for Engine to read and write from various S3 buckets # S3 policy for Engine to read and write from various S3 buckets
@ -472,7 +518,7 @@ module "engine_s3_read_and_write" {
policy_name = "EngineReadandWriteS3" policy_name = "EngineReadandWriteS3"
policy_description = "Allow Engine Lambda to read from and write to various S3 buckets" policy_description = "Allow Engine Lambda to read from and write to various S3 buckets"
bucket_arns = [ bucket_arns = [
"arn:aws:s3:::${module.s3_presignable_bucket.bucket_name}", "arn:aws:s3:::${module.s3_presignable_bucket.bucket_name}",
"arn:aws:s3:::${module.retrofit_sap_data.bucket_name}", "arn:aws:s3:::${module.retrofit_sap_data.bucket_name}",
"arn:aws:s3:::${module.retrofit_sap_predictions.bucket_name}", "arn:aws:s3:::${module.retrofit_sap_predictions.bucket_name}",
@ -482,10 +528,70 @@ module "engine_s3_read_and_write" {
"arn:aws:s3:::${module.retrofit_hotwater_kwh_predictions.bucket_name}", "arn:aws:s3:::${module.retrofit_hotwater_kwh_predictions.bucket_name}",
"arn:aws:s3:::${module.retrofit_energy_assessments.bucket_name}" "arn:aws:s3:::${module.retrofit_energy_assessments.bucket_name}"
] ]
actions = ["s3:*"] actions = ["s3:*"]
resource_paths = ["/*"] resource_paths = ["/*"]
} }
output "engine_s3_read_and_write_arn" { output "engine_s3_read_and_write_arn" {
value = module.engine_s3_read_and_write.policy_arn value = module.engine_s3_read_and_write.policy_arn
} }
################################################
# FastAPI Lambda
################################################
module "ara_fast_api_state_bucket" {
source = "../modules/tf_state_bucket"
bucket_name = "ara-fast-api-terraform-state"
}
output "ara_fast_api_state_bucket" {
value = module.ara_fast_api_state_bucket.bucket_name
}
# S3 policy for FastAPI app to read and write from various S3 buckets
module "fast_api_s3_read_and_write" {
source = "../modules/s3_iam_policy"
policy_name = "FastAPIReadandWriteS3"
policy_description = "Allow FastAPI Lambda to read from and write to various S3 buckets"
bucket_arns = [
"arn:aws:s3:::${module.s3_presignable_bucket.bucket_name}",
"arn:aws:s3:::${module.retrofit_sap_data.bucket_name}",
"arn:aws:s3:::${module.retrofit_sap_predictions.bucket_name}",
"arn:aws:s3:::${module.retrofit_carbon_predictions.bucket_name}",
"arn:aws:s3:::${module.retrofit_heat_predictions.bucket_name}",
"arn:aws:s3:::${module.retrofit_heating_kwh_predictions.bucket_name}",
"arn:aws:s3:::${module.retrofit_hotwater_kwh_predictions.bucket_name}",
"arn:aws:s3:::${module.retrofit_energy_assessments.bucket_name}"
]
actions = ["s3:GetObject", "s3:ListBucket"]
resource_paths = ["/*"]
}
output "fast_api_s3_read_and_write_arn" {
value = module.fast_api_s3_read_and_write.policy_arn
}
################################################
# CDN Certificate
################################################
module "cdn_certificate_state_bucket" {
source = "../modules/tf_state_bucket"
bucket_name = "cdn-certificate-terraform-state"
}
output "cdn_certificate_state_bucket" {
value = module.cdn_certificate_state_bucket.bucket_name
}
################################################
# CDN
################################################
module "cdn_state_bucket" {
source = "../modules/tf_state_bucket"
bucket_name = "ara-cdn-terraform-state"
}
output "cdn_state_bucket" {
value = module.cdn_state_bucket.bucket_name
}

View file

@ -0,0 +1,73 @@
import requests
import json
TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1EUTRNRU5GUTBVNU9FUXpOelk1TVRFME0wUkdOMFpFUkRoR1JVVkJNVGMxT1RFNFJERXlPQSJ9.eyJodHRwOi8vZW1haWwiOiJzZWJhc3RpYW5Ab3Ntb3Npcy1hY2QuY29tIiwiaHR0cDovL2NsdWsudG9rZW4vbGFzdFBhc3N3b3JkQ2hhbmdlIjoiMjAyNS0wOC0yNlQwOTo1NDoyNi4zMjZaIiwiaHR0cDovL2NsdWsudG9rZW4vY29ubmVjdGlvbiI6ImVUZWNoSUQiLCJodHRwOi8vY2x1ay50b2tlbi9zdHJhdGVneSI6ImF1dGgwIiwiaHR0cDovL2NsdWsudG9rZW4vc3RyYXRlZ3lUeXBlIjoiZGF0YWJhc2UiLCJpc3MiOiJodHRwczovL2V0ZWNoaWQuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDY4YWQ4NDUyZDI2YzI1ZmMyMzkwZmYxYSIsImF1ZCI6WyJodHRwczovL3Bhc2h1Yi5hcGkuZXRlY2gubmV0IiwiaHR0cHM6Ly9ldGVjaGlkLmV1LmF1dGgwLmNvbS91c2VyaW5mbyJdLCJpYXQiOjE3NzMyMzc4MjQsImV4cCI6MTc3MzI0NTAyNCwic2NvcGUiOiJvcGVuaWQiLCJhenAiOiJEaVp6d3VVaTVkVmozOXR3NG00bWZ6emZvRm5MdmVLZyJ9.mkkxeZiD_ByHY4TJKpLQ-trmeGs15s0ekL6u1n-ek9j-EzNyf6qalEHCyHf8gzdNhU_vay96bIOMRHp4vXFaLqSANwKZayIS3EoA_b9-u2FAZpooxEvReAMNJGoZ6WLD01AQXWv-l7ww1ZqAnQzw0moL_Oma6hVmA5oa-RJKJ3MerS7e0Wei97Db48E140-EAbQf2iPcKYYtCNRA4il6n8DFiqGeoUMGo99jkR1ceZAvMpOAj8RhKX-4qSiDfX6yXUS2G96U5m7S_GWI-DEj5TazkN10Af3TyOY3EVjmZoJcRpiAR4cFmlfcTydjrShU03DWmPZm1QItf2McxfCpNA"
base = "https://pashub.net/api"
headers = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/json"}
company_id = "cb5249e2-8f31-4ef4-aefd-08ddaccb1fa2"
# 1⃣ get jobs
params = {
"pageIndex": 0,
"pageSize": 20,
"orderBy": "createdUtc",
"orderDesc": "true",
"addressUprn": "100061885568",
"companyId": company_id,
}
r = requests.get(f"{base}/jobs", headers=headers, params=params)
payload = r.json()
property_id = payload["results"][0]["id"]
print("JOB:", property_id)
# 2⃣ get evidence list
r = requests.get(f"{base}/jobs/{property_id}/evidence", headers=headers)
print(r.status_code)
evidence = r.json()
print(evidence)
# 3⃣ get evidence metadata
if evidence:
evidence_id = evidence["results"][0]["fileId"]
meta_url = f"https://pashub.net/api/jobs/{property_id}/evidenceMetadata"
meta_params = {"evidenceIds": evidence_id}
r = requests.get(meta_url, headers=headers, params=meta_params)
r.raise_for_status()
meta = r.json()
container = meta["containerName"]
blob_uri = meta["blobUri"]
file = meta["files"][0]
file_id = file["fileId"]
file_name = file["fileName"]
base, sas = blob_uri.split("?", 1)
download_url = f"{base}{container}/{file_id}?{sas}"
print("Download URL:", download_url)
pdf = requests.get(download_url)
pdf.raise_for_status()
with open(file_name, "wb") as f:
f.write(pdf.content)
print("Saved:", file_name)

View file

@ -29,9 +29,7 @@ from sqlalchemy import func
# PORTFOLIO_ID = 206 # PORTFOLIO_ID = 206
# SCENARIOS = [389] # SCENARIOS = [389]
PORTFOLIO_ID = 581 PORTFOLIO_ID = 581
SCENARIOS = [ SCENARIOS = [1124]
1124
]
scenario_names = { scenario_names = {
1124: "EPC C - Solar Focused", 1124: "EPC C - Solar Focused",
} }
@ -234,7 +232,7 @@ for scenario_id in SCENARIOS:
# Get recs for this scenario # Get recs for this scenario
recommended_measures_df = recommendations_df[ recommended_measures_df = recommendations_df[
recommendations_df["scenario_id"] == scenario_id recommendations_df["scenario_id"] == scenario_id
][["property_id", "measure_type", "estimated_cost", "default"]] ][["property_id", "measure_type", "estimated_cost", "default"]]
recommended_measures_df = recommended_measures_df[ recommended_measures_df = recommended_measures_df[
recommended_measures_df["default"] recommended_measures_df["default"]
] ]
@ -242,7 +240,7 @@ for scenario_id in SCENARIOS:
post_install_sap = recommendations_df[ post_install_sap = recommendations_df[
recommendations_df["scenario_id"] == scenario_id recommendations_df["scenario_id"] == scenario_id
][["property_id", "default", "sap_points"]] ][["property_id", "default", "sap_points"]]
post_install_sap = post_install_sap[post_install_sap["default"]] post_install_sap = post_install_sap[post_install_sap["default"]]
# Sum up the sap points by property id # Sum up the sap points by property id
post_install_sap = ( post_install_sap = (
@ -320,7 +318,7 @@ for scenario_id in SCENARIOS:
z = df2[ z = df2[
(df2["predicted_post_works_epc"] != "D") (df2["predicted_post_works_epc"] != "D")
& (df2["post_epc_rating"].astype(str) == "Epc.D") & (df2["post_epc_rating"].astype(str) == "Epc.D")
] ]
df2["predicted_post_works_epc"].value_counts() df2["predicted_post_works_epc"].value_counts()
df2["post_epc_rating"].astype(str).value_counts() df2["post_epc_rating"].astype(str).value_counts()
@ -330,8 +328,6 @@ for scenario_id in SCENARIOS:
getting_works = df[df["total_retrofit_cost"] > 0] getting_works = df[df["total_retrofit_cost"] > 0]
getting_works["predicted_post_works_epc"].value_counts() getting_works["predicted_post_works_epc"].value_counts()
32565 / getting_works.shape[0]
df[df["predicted_post_works_sap"] == ""] df[df["predicted_post_works_sap"] == ""]
# Expected columns list # Expected columns list

View file

@ -6,6 +6,7 @@ from io import BytesIO, StringIO
from urllib.parse import unquote from urllib.parse import unquote
from utils.logger import setup_logger from utils.logger import setup_logger
from botocore.exceptions import NoCredentialsError, PartialCredentialsError from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from typing import Any
logger = setup_logger() logger = setup_logger()
@ -316,7 +317,7 @@ def save_excel_to_s3(df, bucket_name, file_key):
logger.info(f"Excel file saved to S3 bucket '{bucket_name}' with key '{file_key}'") logger.info(f"Excel file saved to S3 bucket '{bucket_name}' with key '{file_key}'")
def read_csv_from_s3(bucket_name, filepath): def read_csv_from_s3(bucket_name: str, filepath: str) -> list[dict[str, str]]:
logger.info( logger.info(
f"Reading CSV file from S3 bucket '{bucket_name}' with key '{filepath}'" f"Reading CSV file from S3 bucket '{bucket_name}' with key '{filepath}'"
) )