From 68a95d02965ce78045118a51d6522f391c03fc39 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 5 Feb 2026 17:46:23 +0000 Subject: [PATCH 001/135] merged peters code --- .devcontainer/asset_list/requirements.txt | 2 +- .devcontainer/backend/requirements.txt | 2 +- asset_list/app.py | 53 ++++------------------- backend/address2UPRN/main.py | 13 ++++-- backend/address2UPRN/script.py | 15 ++++--- backend/app/requirements/requirements.txt | 2 +- sfr/principal_pitch/2_export_data.py | 6 +-- 7 files changed, 34 insertions(+), 59 deletions(-) diff --git a/.devcontainer/asset_list/requirements.txt b/.devcontainer/asset_list/requirements.txt index fe536a81..28730ed5 100644 --- a/.devcontainer/asset_list/requirements.txt +++ b/.devcontainer/asset_list/requirements.txt @@ -7,7 +7,7 @@ mangum==0.19.0 # AWS boto3==1.35.44 # Data -openpyxl==3.1.2 +openpyxl==3.1.5 # Basic pytz uvicorn[standard] diff --git a/.devcontainer/backend/requirements.txt b/.devcontainer/backend/requirements.txt index 9562aa6a..9814c8d4 100644 --- a/.devcontainer/backend/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -9,7 +9,7 @@ mangum==0.19.0 # AWS boto3==1.35.44 # Data -openpyxl==3.1.2 +openpyxl==3.1.5 # Basic pytz uvicorn[standard] diff --git a/asset_list/app.py b/asset_list/app.py index b46254f9..9bb0c1f4 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -69,61 +69,24 @@ def app(): Property UPRN """ - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hackney" - data_filename = "Domna SHF Wave 3 (3).xlsx" - sheet_name = "Domna Wave 3" - postcode_column = "Postcode" - address1_column = "Address 1" - address1_method = None - fulladdress_column = None - address_cols_to_concat = ["Address 1"] - missing_postcodes_method = None - landlord_year_built = "Construction Years" - landlord_os_uprn = "UPRN" - landlord_property_type = "Type" - landlord_built_form = "Attachment" - landlord_wall_construction = "Wall type" - landlord_roof_construction = None - landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "Row ID" - landlord_sap = None - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_id_colnames = [] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = None - asset_list_header = 0 - landlord_block_reference = None - - # Peabody data for cleaning - data_folder = ( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " - "Project/data_validation" - ) - data_filename = "to_standardise_uprns.xlsx" + data_folder = "/workspaces/model/asset_list/" + data_filename = "assets.xlsx" sheet_name = "Sheet1" postcode_column = "Postcode" - address1_column = None - address1_method = "house_number_extraction" - fulladdress_column = "Address" - address_cols_to_concat = None + address1_column = "junte found address" + address1_method = None + fulladdress_column = None + address_cols_to_concat = ["junte found address"] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = None + landlord_os_uprn = "juntes uprn" landlord_property_type = None landlord_built_form = None landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "LLUPRN" + landlord_property_id = "landlordid" landlord_sap = None outcomes_filename = None outcomes_sheetname = None diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index ba386e0a..5f4fed74 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -12,6 +12,7 @@ import re EPC_AUTH_TOKEN = os.getenv( "EPC_AUTH_TOKEN", + "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=", ) if EPC_AUTH_TOKEN is None: @@ -300,7 +301,9 @@ def get_uprn_candidates( ) -def get_uprn(user_inputed_address: str, postcode: str, return_address=False): +def get_uprn( + user_inputed_address: str, postcode: str, return_address=False, return_EPC=False +): """ Return uprn (str) Return False if failed to find a sensible matching epc @@ -331,8 +334,9 @@ def get_uprn(user_inputed_address: str, postcode: str, return_address=False): address = top_rank_df["address"].values[0] lexiscore = float(top_rank_df["lexiscore"].values[0]) + epc = top_rank_df["current-energy-rating"].values[0] - logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}") + # logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}") # Safe to return the agreed UPRN found_uprn = top_rank_df.iloc[0]["uprn"] @@ -340,7 +344,10 @@ def get_uprn(user_inputed_address: str, postcode: str, return_address=False): return None if return_address: - return found_uprn, address + if return_EPC is False: + return found_uprn, address + else: + return found_uprn, address, epc return found_uprn diff --git a/backend/address2UPRN/script.py b/backend/address2UPRN/script.py index a71b5827..0582450b 100644 --- a/backend/address2UPRN/script.py +++ b/backend/address2UPRN/script.py @@ -5,12 +5,15 @@ from backend.address2UPRN.main import get_uprn # Enable tqdm for pandas tqdm.pandas() -df = pd.read_excel("address2.xlsx") +file_name = "brentwood.xlsx" + +df = pd.read_excel(file_name) def extract_uprn(row): - print(row["User Input"], row["Postcode"]) - result = get_uprn(row["User Input"], row["Postcode"], return_address=True) + user_input = "Address" + postcode = "Postcode" + result = get_uprn(row[user_input], row[postcode], return_address=True) if result is None: return pd.Series([None, None]) @@ -19,6 +22,8 @@ def extract_uprn(row): return pd.Series([uprn, found_address]) -df[["juntes uprn", "junte found address"]] = df.progress_apply(extract_uprn, axis=1) +df[["juntes uprn", "junte found address", "junte found epc"]] = df.progress_apply( + extract_uprn, axis=1 +) -df.to_excel("outputs2.xlsx", index=False) +df.to_excel(f"{file_name}_outputs.xlsx", index=False) diff --git a/backend/app/requirements/requirements.txt b/backend/app/requirements/requirements.txt index 3124034e..9fdbfe4c 100644 --- a/backend/app/requirements/requirements.txt +++ b/backend/app/requirements/requirements.txt @@ -10,7 +10,7 @@ mangum==0.19.0 # AWS boto3==1.35.44 # Data -openpyxl==3.1.2 +openpyxl==3.1.5 # Basic pytz sqlmodel \ No newline at end of file diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index a65509d5..4e8cd157 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -28,12 +28,12 @@ from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 524 +PORTFOLIO_ID = 506 SCENARIOS = [ - 1009, + 987, ] scenario_names = { - 1009: "EPC C; Most Economic", + 987: "EPC C", } From d29ccecefb20c2cf15d44efa67c9a1e5fb5cb94f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 5 Feb 2026 17:54:10 +0000 Subject: [PATCH 002/135] more logs --- .github/workflows/deploy_terraform.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index f8718119..61ab586a 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -10,13 +10,23 @@ jobs: runs-on: ubuntu-latest outputs: stage: ${{ steps.set-stage.outputs.stage }} - + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} steps: - name: Determine stage from branch id: set-stage shell: bash run: | + echo $AWS_ACCESS_KEY_ID + echo $AWS_SECRET_ACCESS_KEY + echo $AWS_REGION + echo $DEV_DB_HOST + env + BRANCH="${GITHUB_REF_NAME}" if [[ "$BRANCH" == "prod" ]]; then From 09905cf68170b5c97c1d927c9ebc5c30f3e3bdec Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 5 Feb 2026 17:55:24 +0000 Subject: [PATCH 003/135] more logs --- .github/workflows/deploy_terraform.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 61ab586a..963160ae 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -24,6 +24,7 @@ jobs: echo $AWS_SECRET_ACCESS_KEY echo $AWS_REGION echo $DEV_DB_HOST + echo " dev db host${{ secrets.DEV_DB_HOST }}"" env From f986f85cfade72ea68fd23bb88fbd2621f2869ce Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 5 Feb 2026 17:56:22 +0000 Subject: [PATCH 004/135] m ore logs --- .github/workflows/deploy_terraform.yml | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 963160ae..4f941462 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -8,34 +8,30 @@ on: jobs: determine_stage: runs-on: ubuntu-latest + outputs: stage: ${{ steps.set-stage.outputs.stage }} - secrets: + + env: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + steps: - name: Determine stage from branch id: set-stage shell: bash run: | - echo $AWS_ACCESS_KEY_ID - echo $AWS_SECRET_ACCESS_KEY - echo $AWS_REGION - echo $DEV_DB_HOST - echo " dev db host${{ secrets.DEV_DB_HOST }}"" - - env + echo "AWS_ACCESS_KEY_ID is set? ${AWS_ACCESS_KEY_ID:+yes}" + echo "AWS_SECRET_ACCESS_KEY is set? ${AWS_SECRET_ACCESS_KEY:+yes}" + echo "AWS_REGION=$AWS_REGION" + echo "DEV_DB_HOST=$DEV_DB_HOST" BRANCH="${GITHUB_REF_NAME}" if [[ "$BRANCH" == "prod" ]]; then echo "stage=prod" >> "$GITHUB_OUTPUT" - - elif [[ "$BRANCH" == "dev" ]]; then - echo "stage=dev" >> "$GITHUB_OUTPUT" - else echo "stage=dev" >> "$GITHUB_OUTPUT" fi From 7c8a3858e79862d5db8fe8c1c482784d4cf9fb8d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 5 Feb 2026 18:03:35 +0000 Subject: [PATCH 005/135] DEV DB_HSOT --- .github/workflows/_build_image.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml index fce856b6..8b0d74ef 100644 --- a/.github/workflows/_build_image.yml +++ b/.github/workflows/_build_image.yml @@ -40,6 +40,8 @@ on: jobs: build: runs-on: ubuntu-latest + env: + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} outputs: image_digest: ${{ steps.digest.outputs.image_digest }} From 18396d94944d4ec130e20af340de561aeb2baa23 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Feb 2026 15:45:25 +0000 Subject: [PATCH 006/135] temporary script built --- .devcontainer/asset_list/devcontainer.json | 3 ++- .devcontainer/backend/devcontainer.json | 3 ++- asset_list/app.py | 14 ++++++------- backend/address2UPRN/main.py | 17 +++++++++++++-- backend/address2UPRN/script.py | 24 +++++++++++++++------- sfr/principal_pitch/2_export_data.py | 10 +++++---- 6 files changed, 49 insertions(+), 22 deletions(-) diff --git a/.devcontainer/asset_list/devcontainer.json b/.devcontainer/asset_list/devcontainer.json index 4834d559..7c597859 100644 --- a/.devcontainer/asset_list/devcontainer.json +++ b/.devcontainer/asset_list/devcontainer.json @@ -22,7 +22,8 @@ "jgclark.vscode-todo-highlight", "corentinartaud.pdfpreview", "ms-python.vscode-python-envs", - "ms-python.black-formatter" + "ms-python.black-formatter", + "GrapeCity.gc-excelviewer" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index c672b1bf..377adf1e 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -22,7 +22,8 @@ "corentinartaud.pdfpreview", "ms-python.vscode-python-envs", "ms-python.black-formatter", - "waderyan.gitblame" + "waderyan.gitblame", + "GrapeCity.gc-excelviewer" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/asset_list/app.py b/asset_list/app.py index 9bb0c1f4..da4eb6bb 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -70,23 +70,23 @@ def app(): """ data_folder = "/workspaces/model/asset_list/" - data_filename = "assets.xlsx" - sheet_name = "Sheet1" - postcode_column = "Postcode" - address1_column = "junte found address" + data_filename = "manchester.xlsx" + sheet_name = "PW0099 - Property List" + postcode_column = "post Code" + address1_column = "address" address1_method = None fulladdress_column = None - address_cols_to_concat = ["junte found address"] + address_cols_to_concat = ["address"] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "juntes uprn" + landlord_os_uprn = None landlord_property_type = None landlord_built_form = None landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "landlordid" + landlord_property_id = "UHTprop Ref" landlord_sap = None outcomes_filename = None outcomes_sheetname = None diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 5f4fed74..1b3a6c8a 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -302,7 +302,11 @@ def get_uprn_candidates( def get_uprn( - user_inputed_address: str, postcode: str, return_address=False, return_EPC=False + user_inputed_address: str, + postcode: str, + return_address=False, + return_EPC=False, + return_score=True, ): """ Return uprn (str) @@ -335,6 +339,7 @@ def get_uprn( address = top_rank_df["address"].values[0] lexiscore = float(top_rank_df["lexiscore"].values[0]) epc = top_rank_df["current-energy-rating"].values[0] + score = float(top_rank_df["lexiscore"].values[0]) # logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}") # Safe to return the agreed UPRN @@ -347,7 +352,15 @@ def get_uprn( if return_EPC is False: return found_uprn, address else: - return found_uprn, address, epc + if return_score is False: + return found_uprn, address, epc + else: + return ( + found_uprn, + address, + epc, + score, + ) return found_uprn diff --git a/backend/address2UPRN/script.py b/backend/address2UPRN/script.py index 0582450b..59855dbc 100644 --- a/backend/address2UPRN/script.py +++ b/backend/address2UPRN/script.py @@ -5,7 +5,7 @@ from backend.address2UPRN.main import get_uprn # Enable tqdm for pandas tqdm.pandas() -file_name = "brentwood.xlsx" +file_name = "forhousing.xlsx" df = pd.read_excel(file_name) @@ -13,17 +13,27 @@ df = pd.read_excel(file_name) def extract_uprn(row): user_input = "Address" postcode = "Postcode" - result = get_uprn(row[user_input], row[postcode], return_address=True) + result = get_uprn( + row[user_input], + row[postcode], + return_address=True, + return_EPC=True, + return_score=True, + ) if result is None: - return pd.Series([None, None]) + return pd.Series([None, None, None, None]) - uprn, found_address = result - return pd.Series([uprn, found_address]) + uprn, found_address, epc, score = result + return pd.Series([uprn, found_address, epc, score]) -df[["juntes uprn", "junte found address", "junte found epc"]] = df.progress_apply( - extract_uprn, axis=1 +df[["juntes uprn", "junte found address", "junte found epc", "junte score"]] = ( + df.progress_apply(extract_uprn, axis=1) ) df.to_excel(f"{file_name}_outputs.xlsx", index=False) + +# TODO: add lexiscore +# TODO: run it +# TODO: give it to danny diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index 4e8cd157..1841cf3f 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -28,14 +28,16 @@ from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 506 +PORTFOLIO_ID = 544 SCENARIOS = [ - 987, + 1027, ] scenario_names = { - 987: "EPC C", + 1027: "EPC C", } +project_name = "manchester" + def get_data(portfolio_id, scenario_ids): session = sessionmaker(bind=db_engine)() @@ -329,6 +331,6 @@ for scenario_id in SCENARIOS: df[df["predicted_post_works_sap"] == ""] # Create excel to store to - filename = f"{scenario_names[scenario_id]} - 20250113 final.xlsx" + filename = f"{scenario_names[scenario_id]} - {project_name}.xlsx" with pd.ExcelWriter(filename) as writer: df.to_excel(writer, sheet_name="properties", index=False) From 47fce5f3f8afce2f1b59b25b9c81b19901f72ea0 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 11:35:00 +0000 Subject: [PATCH 007/135] added postcode splittelr handler code --- .devcontainer/asset_list/devcontainer.json | 3 ++- .devcontainer/backend/devcontainer.json | 3 ++- backend/postcode_splitter/handler/Dockerfile | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.devcontainer/asset_list/devcontainer.json b/.devcontainer/asset_list/devcontainer.json index 7c597859..945dcd88 100644 --- a/.devcontainer/asset_list/devcontainer.json +++ b/.devcontainer/asset_list/devcontainer.json @@ -23,7 +23,8 @@ "corentinartaud.pdfpreview", "ms-python.vscode-python-envs", "ms-python.black-formatter", - "GrapeCity.gc-excelviewer" + "GrapeCity.gc-excelviewer", + "jakobhoeg.vscode-pokemon" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index 377adf1e..5d728dcd 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -23,7 +23,8 @@ "ms-python.vscode-python-envs", "ms-python.black-formatter", "waderyan.gitblame", - "GrapeCity.gc-excelviewer" + "GrapeCity.gc-excelviewer", + "jakobhoeg.vscode-pokemon" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 7c1a7989..4c002f1d 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -3,6 +3,12 @@ FROM public.ecr.aws/lambda/python:3.10 # Set working directory (Lambda task root) WORKDIR /var/task +COPY backend/postcode_splitter/handler/requirements.txt + +RUN pip install --no-cache-dir -r requirements.txt + +COPY utils/ utils/ +COPY backend/postcode_splitter/main.py . # ----------------------------- # Lambda handler # ----------------------------- From 53367bcb980aaa13b18c05a0f281d51ff6499c34 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 11:43:01 +0000 Subject: [PATCH 008/135] docker build was wrong --- backend/postcode_splitter/handler/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 4c002f1d..3f77f38f 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -3,7 +3,7 @@ FROM public.ecr.aws/lambda/python:3.10 # Set working directory (Lambda task root) WORKDIR /var/task -COPY backend/postcode_splitter/handler/requirements.txt +COPY backend/postcode_splitter/handler/requirements.txt . RUN pip install --no-cache-dir -r requirements.txt From 277588e629413e848e8d8776025ee55ac7447283 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 11:49:49 +0000 Subject: [PATCH 009/135] check out manual button --- .github/workflows/_deploy_lambda.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index bff106c5..be7ac95b 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -86,6 +86,13 @@ jobs: -var="image_digest=${{ inputs.image_digest }}" \ -out=lambdaplan + - name: Manual Approval + uses: trstringer/manual-approval@v1 + with: + secret: ${{ github.TOKEN }} + approvers: ${{ github.repository_owner }} + issue-title: "Approve Terraform Apply for ${{ inputs.lambda_name }} (${{ inputs.stage }})" + - name: Terraform Apply working-directory: ${{ inputs.lambda_path }} run: terraform apply -auto-approve lambdaplan From 00ea86500687dddb51614b51611b7315b6645802 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 11:58:20 +0000 Subject: [PATCH 010/135] check out manual button --- .github/workflows/_deploy_lambda.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index be7ac95b..24db77c5 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -86,12 +86,13 @@ jobs: -var="image_digest=${{ inputs.image_digest }}" \ -out=lambdaplan - - name: Manual Approval + - name: Wait for Approval uses: trstringer/manual-approval@v1 with: - secret: ${{ github.TOKEN }} - approvers: ${{ github.repository_owner }} - issue-title: "Approve Terraform Apply for ${{ inputs.lambda_name }} (${{ inputs.stage }})" + secret: ${{ secrets.GITHUB_TOKEN }} + approvers: ${{ github.actor }} + issue-title: "Click to approve Terraform Apply for ${{ inputs.lambda_name }} (${{ inputs.stage }})" + issue-body: "Press approve to proceed with Terraform Apply" - name: Terraform Apply working-directory: ${{ inputs.lambda_path }} From 3a2abca7472dae4f673194c38b8f44cf22bac79f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 12:05:28 +0000 Subject: [PATCH 011/135] check out manual button --- .github/workflows/_deploy_lambda.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 24db77c5..02d95525 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -1,5 +1,9 @@ name: Deploy Lambda (Terraform) +permissions: + contents: write + issues: write + on: workflow_call: inputs: From 969084c649b64097d30911b0e6b96616f9ae65de Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 12:11:27 +0000 Subject: [PATCH 012/135] check out manual button --- .github/workflows/_deploy_lambda.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 02d95525..24db77c5 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -1,9 +1,5 @@ name: Deploy Lambda (Terraform) -permissions: - contents: write - issues: write - on: workflow_call: inputs: From e6d994e0b0249a44fb512859ef1a9f63f536d0c1 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 12:16:52 +0000 Subject: [PATCH 013/135] developers --- .github/workflows/_deploy_lambda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 24db77c5..8d399cde 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -90,7 +90,7 @@ jobs: uses: trstringer/manual-approval@v1 with: secret: ${{ secrets.GITHUB_TOKEN }} - approvers: ${{ github.actor }} + approvers: developers issue-title: "Click to approve Terraform Apply for ${{ inputs.lambda_name }} (${{ inputs.stage }})" issue-body: "Press approve to proceed with Terraform Apply" From ffbb6212822662aeb352095a0026f1d927370d9a Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 12:26:59 +0000 Subject: [PATCH 014/135] made terraform apply work --- .github/workflows/_deploy_lambda.yml | 17 +++++++++-------- .github/workflows/deploy_terraform.yml | 2 ++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 8d399cde..d3a9f79a 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -23,6 +23,14 @@ on: required: true type: string + terraform_apply: + required: false + type: choice + default: 'false' + options: + - 'true' + - 'false' + secrets: AWS_ACCESS_KEY_ID: required: true @@ -86,14 +94,7 @@ jobs: -var="image_digest=${{ inputs.image_digest }}" \ -out=lambdaplan - - name: Wait for Approval - uses: trstringer/manual-approval@v1 - with: - secret: ${{ secrets.GITHUB_TOKEN }} - approvers: developers - issue-title: "Click to approve Terraform Apply for ${{ inputs.lambda_name }} (${{ inputs.stage }})" - issue-body: "Press approve to proceed with Terraform Apply" - - name: Terraform Apply + if: inputs.terraform_apply == 'true' || inputs.stage == 'dev' || inputs.stage == 'main' working-directory: ${{ inputs.lambda_path }} run: terraform apply -auto-approve lambdaplan diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 4f941462..1356b341 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -133,6 +133,8 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }} + # This should not be deployed in production!!!! + terraform_apply: 'true' secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} From 50018934907014d979b33773f8515bb136d57bc2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 12:27:53 +0000 Subject: [PATCH 015/135] terraform apply as a string --- .github/workflows/_deploy_lambda.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index d3a9f79a..b3ca4583 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -25,11 +25,8 @@ on: terraform_apply: required: false - type: choice + type: string default: 'false' - options: - - 'true' - - 'false' secrets: AWS_ACCESS_KEY_ID: From 2881ecd2879d637ad9f5b544229a69521a5834d2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 12:35:18 +0000 Subject: [PATCH 016/135] terraform apply based on branch name --- .github/workflows/_deploy_lambda.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index b3ca4583..9bd686aa 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -27,6 +27,7 @@ on: required: false type: string default: 'false' + # can only be 'true' or 'false' secrets: AWS_ACCESS_KEY_ID: @@ -92,6 +93,6 @@ jobs: -out=lambdaplan - name: Terraform Apply - if: inputs.terraform_apply == 'true' || inputs.stage == 'dev' || inputs.stage == 'main' + if: inputs.terraform_apply == 'true' || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/main' working-directory: ${{ inputs.lambda_path }} run: terraform apply -auto-approve lambdaplan From 555544fc2da2e24923044bd6719f720225c53de0 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 13:04:37 +0000 Subject: [PATCH 017/135] added requirements txt file --- backend/postcode_splitter/handler/requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/postcode_splitter/handler/requirements.txt b/backend/postcode_splitter/handler/requirements.txt index e69de29b..f6618d2b 100644 --- a/backend/postcode_splitter/handler/requirements.txt +++ b/backend/postcode_splitter/handler/requirements.txt @@ -0,0 +1,5 @@ +pandas>=1.3.0 +requests>=2.28.0 +tqdm>=4.64.0 +epc-api>=0.1.0 +openpyxl>=3.8.0 From 14dbc802c2644792ec8fe2b3df5c6d58bd881929 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 13:58:45 +0000 Subject: [PATCH 018/135] postcode spliter --- backend/address2UPRN/handler/Dockerfile | 4 +++- backend/address2UPRN/handler/requirements.txt | 7 +++++-- backend/postcode_splitter/handler/Dockerfile | 8 ++++---- backend/postcode_splitter/handler/requirements.txt | 11 ++++++----- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index 3f7567d3..5ccb5590 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -1,4 +1,5 @@ -FROM public.ecr.aws/lambda/python:3.10 +# FROM public.ecr.aws/lambda/python:3.10 +# FROM python:3.11.10-bullseye # This is not going to be permenant - but until we solve for env variables in live prod ENV EPC_AUTH_TOKEN=a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzAg @@ -11,6 +12,7 @@ WORKDIR /var/task # ----------------------------- COPY backend/address2UPRN/handler/requirements.txt . + # Install dependencies into Lambda runtime RUN pip install --no-cache-dir -r requirements.txt diff --git a/backend/address2UPRN/handler/requirements.txt b/backend/address2UPRN/handler/requirements.txt index bc753841..eba2c846 100644 --- a/backend/address2UPRN/handler/requirements.txt +++ b/backend/address2UPRN/handler/requirements.txt @@ -1,3 +1,6 @@ -epc-api-python==1.0.2 +pandas==2.2.2 +numpy<2.0 +requests tqdm -pandas \ No newline at end of file +openpyxl +epc-api-python==1.0.2 diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 3f77f38f..f8196297 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -1,4 +1,4 @@ -FROM public.ecr.aws/lambda/python:3.10 +FROM public.ecr.aws/lambda/python:3.11 # Set working directory (Lambda task root) WORKDIR /var/task @@ -9,7 +9,7 @@ RUN pip install --no-cache-dir -r requirements.txt COPY utils/ utils/ COPY backend/postcode_splitter/main.py . -# ----------------------------- -# Lambda handler -# ----------------------------- +# # ----------------------------- +# # Lambda handler +# # ----------------------------- CMD ["main.handler"] diff --git a/backend/postcode_splitter/handler/requirements.txt b/backend/postcode_splitter/handler/requirements.txt index f6618d2b..8adea4e7 100644 --- a/backend/postcode_splitter/handler/requirements.txt +++ b/backend/postcode_splitter/handler/requirements.txt @@ -1,5 +1,6 @@ -pandas>=1.3.0 -requests>=2.28.0 -tqdm>=4.64.0 -epc-api>=0.1.0 -openpyxl>=3.8.0 +pandas==2.2.2 +numpy<2.0 +requests +tqdm +openpyxl +epc-api-python==1.0.2 \ No newline at end of file From 9506b9f591fa107c8530a12f124adf428439c808 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 14:01:28 +0000 Subject: [PATCH 019/135] lol compeltely skipped lambda --- backend/address2UPRN/handler/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index 5ccb5590..c6dc1180 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -1,4 +1,4 @@ -# FROM public.ecr.aws/lambda/python:3.10 +FROM public.ecr.aws/lambda/python:3.10 # FROM python:3.11.10-bullseye # This is not going to be permenant - but until we solve for env variables in live prod From 455a89aa1a2af649ae8bb235ea641c603bdcfc5e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 14:27:05 +0000 Subject: [PATCH 020/135] added backend code --- backend/postcode_splitter/handler/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index f8196297..ae9056ed 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -9,6 +9,12 @@ RUN pip install --no-cache-dir -r requirements.txt COPY utils/ utils/ COPY backend/postcode_splitter/main.py . + +COPY utils/ utils/ +COPY backend/ backend/ + +COPY backend/__init__.py backend/__init__.py + # # ----------------------------- # # Lambda handler # # ----------------------------- From 11510fbe836cb41197c713862935807404f7ed99 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 15:41:22 +0000 Subject: [PATCH 021/135] added backend code --- backend/postcode_splitter/handler/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index ae9056ed..72ce3094 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -19,3 +19,4 @@ COPY backend/__init__.py backend/__init__.py # # Lambda handler # # ----------------------------- CMD ["main.handler"] + From dd30d0d2a88eaefbd4aa839a03500cc2763c6585 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 16:15:14 +0000 Subject: [PATCH 022/135] exr Pull remove --- .../modules/lambda_execution_role/main.tf | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/infrastructure/terraform/modules/lambda_execution_role/main.tf b/infrastructure/terraform/modules/lambda_execution_role/main.tf index fa657afd..af035ebb 100644 --- a/infrastructure/terraform/modules/lambda_execution_role/main.tf +++ b/infrastructure/terraform/modules/lambda_execution_role/main.tf @@ -19,19 +19,19 @@ resource "aws_iam_role_policy_attachment" "basic_logs" { policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" } -resource "aws_iam_role_policy" "ecr_pull" { - role = aws_iam_role.this.name +# resource "aws_iam_role_policy" "ecr_pull" { +# role = aws_iam_role.this.name - policy = jsonencode({ - Version = "2012-10-17" - Statement = [{ - Effect = "Allow" - Action = [ - "ecr:GetAuthorizationToken", - "ecr:BatchGetImage", - "ecr:GetDownloadUrlForLayer" - ] - Resource = "*" - }] - }) -} +# policy = jsonencode({ +# Version = "2012-10-17" +# Statement = [{ +# Effect = "Allow" +# Action = [ +# "ecr:GetAuthorizationToken", +# "ecr:BatchGetImage", +# "ecr:GetDownloadUrlForLayer" +# ] +# Resource = "*" +# }] +# }) +# } From e1ce16e3cdf00e461b24ca619002e2e6c065c09b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 16:28:33 +0000 Subject: [PATCH 023/135] polciy --- .../modules/lambda_execution_role/main.tf | 16 ---------------- .../terraform/modules/lambda_sqs_trigger/main.tf | 15 --------------- 2 files changed, 31 deletions(-) diff --git a/infrastructure/terraform/modules/lambda_execution_role/main.tf b/infrastructure/terraform/modules/lambda_execution_role/main.tf index af035ebb..e593b17c 100644 --- a/infrastructure/terraform/modules/lambda_execution_role/main.tf +++ b/infrastructure/terraform/modules/lambda_execution_role/main.tf @@ -19,19 +19,3 @@ resource "aws_iam_role_policy_attachment" "basic_logs" { policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" } -# resource "aws_iam_role_policy" "ecr_pull" { -# role = aws_iam_role.this.name - -# policy = jsonencode({ -# Version = "2012-10-17" -# Statement = [{ -# Effect = "Allow" -# Action = [ -# "ecr:GetAuthorizationToken", -# "ecr:BatchGetImage", -# "ecr:GetDownloadUrlForLayer" -# ] -# Resource = "*" -# }] -# }) -# } diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf b/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf index 5919e10f..0cf9a353 100644 --- a/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf +++ b/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf @@ -5,19 +5,4 @@ resource "aws_lambda_event_source_mapping" "this" { enabled = true } -resource "aws_iam_role_policy" "allow_sqs" { - role = var.lambda_role_name - policy = jsonencode({ - Version = "2012-10-17" - Statement = [{ - Effect = "Allow" - Action = [ - "sqs:ReceiveMessage", - "sqs:DeleteMessage", - "sqs:GetQueueAttributes" - ] - Resource = var.queue_arn - }] - }) -} From 65daf388da8c1f5c877f6f43e8939bee5b7ccc77 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 16:43:46 +0000 Subject: [PATCH 024/135] sqs policy --- .../terraform/modules/lambda_sqs_trigger/main.tf | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf b/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf index 0cf9a353..5919e10f 100644 --- a/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf +++ b/infrastructure/terraform/modules/lambda_sqs_trigger/main.tf @@ -5,4 +5,19 @@ resource "aws_lambda_event_source_mapping" "this" { enabled = true } +resource "aws_iam_role_policy" "allow_sqs" { + role = var.lambda_role_name + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "sqs:ReceiveMessage", + "sqs:DeleteMessage", + "sqs:GetQueueAttributes" + ] + Resource = var.queue_arn + }] + }) +} From b9d31fa6157112525f5b2f482831652ae6f49881 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 18:26:41 +0000 Subject: [PATCH 025/135] sqs policy --- .../terraform/lambda/modules/lambda_with_sqs/outputs.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf b/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf index afc9246d..b408593f 100644 --- a/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf +++ b/infrastructure/terraform/lambda/modules/lambda_with_sqs/outputs.tf @@ -9,3 +9,4 @@ output "queue_arn" { output "queue_url" { value = module.queue.queue_url } + From 10c552772b4efff0a04d4ed1556b415633e225f3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 18:53:49 +0000 Subject: [PATCH 026/135] more useful logs --- backend/postcode_splitter/main.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index d55f618a..dda1163a 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -119,8 +119,17 @@ def main(): def handler(event, context): - print("hello Postcode splitter world") - return {"statusCode": 200, "body": "hello world"} + print(f"Function: {context.function_name}") + print(f"Function Version: {context.function_version}") + print(f"Log Group: {context.log_group_name}") + print(f"Log Stream: {context.log_stream_name}") + print(f"Request ID: {context.aws_request_id}") + print(f"Memory Limit: {context.memory_limit_in_mb} MB") + print(f"Remaining Time: {context.get_remaining_time_in_millis()} ms") + print(f"Event: {event}") + + print("Postcode splitter handler invoked") + return {"statusCode": 200, "body": "postcode splitter executed"} if __name__ == "__main__": From 79eb81fd94c474e21cd911d704d6bc73dc3f1f54 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 20:28:16 +0000 Subject: [PATCH 027/135] force it to rerun --- backend/postcode_splitter/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index dda1163a..da15a48a 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -127,6 +127,7 @@ def handler(event, context): print(f"Memory Limit: {context.memory_limit_in_mb} MB") print(f"Remaining Time: {context.get_remaining_time_in_millis()} ms") print(f"Event: {event}") + print(f"Event: {event}") print("Postcode splitter handler invoked") return {"statusCode": 200, "body": "postcode splitter executed"} From 53ec9c261c807c7b84ac8d16841956a2c3c5d1d5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 21:26:37 +0000 Subject: [PATCH 028/135] test post code splitter with csv file --- backend/postcode_splitter/main.py | 149 ++++++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 9 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index da15a48a..d5fe3b1b 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -1,12 +1,34 @@ +import json import pandas as pd import requests +from uuid import UUID +from urllib.parse import unquote from backend.address2UPRN.main import ( resolve_uprns_for_postcode_group, get_epc_data_with_postcode, ) +from backend.app.db.functions.tasks.Tasks import SubTaskInterface +from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict from tqdm import tqdm +def parse_s3_console_url(s3_uri: str) -> tuple[str, str]: + """ + Parse AWS console S3 URL to extract bucket and key. + + Format: https://account-id-hash.region.console.aws.amazon.com/s3/object/bucket?region=...&prefix=path + """ + if "console.aws.amazon.com" in s3_uri and "?prefix=" in s3_uri: + base, query = s3_uri.split("?", 1) + path_parts = base.split("/s3/object/") + if len(path_parts) > 1: + bucket = path_parts[1] + params = dict(item.split("=") for item in query.split("&") if "=" in item) + key = unquote(params.get("prefix", "")) + return bucket, key + raise ValueError(f"Could not parse S3 URI: {s3_uri}") + + def sanitise_postcode(postcode: str) -> str | None: """ Normalise postcode for grouping. @@ -120,17 +142,126 @@ def main(): def handler(event, context): print(f"Function: {context.function_name}") - print(f"Function Version: {context.function_version}") - print(f"Log Group: {context.log_group_name}") - print(f"Log Stream: {context.log_stream_name}") print(f"Request ID: {context.aws_request_id}") - print(f"Memory Limit: {context.memory_limit_in_mb} MB") - print(f"Remaining Time: {context.get_remaining_time_in_millis()} ms") - print(f"Event: {event}") - print(f"Event: {event}") - print("Postcode splitter handler invoked") - return {"statusCode": 200, "body": "postcode splitter executed"} + # Example SQS message for testing (copy and paste into SQS): + # { + # "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + # "s3_uri": "https://337213553626-7ovirzjr.eu-west-2.console.aws.amazon.com/s3/object/retrofit-data-dev?region=eu-west-2&prefix=ara_raw_inputs/peabody/2025_11_11+-+Peabody+-+Data+Extracts+for+Domna_transformed.csv" + # } + + # Handle both single event and batch events (SQS, etc.) + records = event.get("Records", [event]) + results = [] + errors = [] + subtask_interface = SubTaskInterface() + + for record in records: + task_id = None + subtask_id = None + try: + # Parse body + if isinstance(record.get("body"), str): + body = json.loads(record["body"]) + else: + body = record.get("body", {}) + + # Validate required fields + task_id = body.get("task_id") + s3_uri = body.get("s3_uri") + + if not task_id: + errors.append({"error": "Missing required field: task_id"}) + continue + + if not s3_uri: + errors.append({"error": "Missing required field: s3_uri"}) + continue + + # Convert task_id to UUID + try: + task_id = UUID(task_id) if isinstance(task_id, str) else task_id + except ValueError as e: + errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"}) + continue + + # Create a new subtask for this postcode splitter invocation + subtask_id = subtask_interface.create_subtask( + task_id=task_id, inputs={"s3_uri": s3_uri} + ) + print(f"Created subtask {subtask_id} for task {task_id}") + + # Process normal flow + print(f"Processing task_id: {task_id}") + print(f"Processing s3_uri: {s3_uri}") + + # Read CSV from S3 + print("Reading CSV from S3...") + bucket, key = parse_s3_console_url(s3_uri) + print(f"Parsed S3 - Bucket: {bucket}, Key: {key}") + csv_data = read_csv_from_s3_dict(bucket, key) + df = pd.DataFrame(csv_data) + print(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") + + # Get head for demo + df_head = df.head() + print("DataFrame head:") + print(df_head) + df_head_dict = df_head.to_dict("records") + + results.append( + { + "message": "Postcode splitter processing started", + "task_id": str(task_id), + "s3_uri": s3_uri, + "subtask_id": str(subtask_id), + } + ) + + # Mark subtask as complete after successful processing + subtask_interface.update_subtask_status( + subtask_id, + "complete", + outputs={ + "status": "processing_complete", + "s3_uri": s3_uri, + "rows_processed": len(df), + }, + ) + print(f"Subtask {subtask_id} marked as complete") + + except json.JSONDecodeError as e: + errors.append({"error": "Invalid JSON in request body", "details": str(e)}) + # Mark subtask as failed if we have one + if subtask_id: + try: + subtask_interface.update_subtask_status( + subtask_id, "failed", outputs={"error": str(e)} + ) + except Exception as db_error: + print(f"Failed to update subtask status: {db_error}") + except Exception as e: + print(f"Unexpected error processing record: {e}") + errors.append({"error": "Unexpected error", "details": str(e)}) + # Mark subtask as failed if we have one + if subtask_id: + try: + subtask_interface.update_subtask_status( + subtask_id, "failed", outputs={"error": str(e)} + ) + except Exception as db_error: + print(f"Failed to update subtask status: {db_error}") + + # Return error if all records failed + if errors and not results: + return {"statusCode": 500, "body": json.dumps({"errors": errors})} + + return { + "statusCode": 200, + "body": json.dumps( + {"processed": results, "errors": errors if errors else None} + ), + } if __name__ == "__main__": From e5cf3a426e3d0b762e95af0984b883eeb6c31972 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 21:32:26 +0000 Subject: [PATCH 029/135] imports --- backend/postcode_splitter/handler/Dockerfile | 18 +++++++++++------- .../postcode_splitter/handler/requirements.txt | 6 +++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 72ce3094..7ddd1e11 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -7,16 +7,20 @@ COPY backend/postcode_splitter/handler/requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -COPY utils/ utils/ -COPY backend/postcode_splitter/main.py . - +# Copy necessary files for database and utility imports COPY utils/ utils/ COPY backend/ backend/ -COPY backend/__init__.py backend/__init__.py +# Copy the handler +COPY backend/postcode_splitter/main.py . -# # ----------------------------- -# # Lambda handler -# # ----------------------------- +# Ensure __init__.py files exist for proper module importing +RUN touch backend/__init__.py +RUN touch backend/app/__init__.py +RUN touch backend/db/__init__.py +RUN touch backend/postcode_splitter/__init__.py +RUN touch utils/__init__.py + +# Lambda handler CMD ["main.handler"] diff --git a/backend/postcode_splitter/handler/requirements.txt b/backend/postcode_splitter/handler/requirements.txt index 8adea4e7..a718b818 100644 --- a/backend/postcode_splitter/handler/requirements.txt +++ b/backend/postcode_splitter/handler/requirements.txt @@ -3,4 +3,8 @@ numpy<2.0 requests tqdm openpyxl -epc-api-python==1.0.2 \ No newline at end of file +epc-api-python==1.0.2 +boto3==1.35.44 +sqlmodel +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 \ No newline at end of file From e3e024f70c869cc5ef73ee84eea9ba740f111468 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 21:37:02 +0000 Subject: [PATCH 030/135] imports --- backend/postcode_splitter/handler/Dockerfile | 7 ------- 1 file changed, 7 deletions(-) diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 7ddd1e11..0ec53108 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -14,13 +14,6 @@ COPY backend/ backend/ # Copy the handler COPY backend/postcode_splitter/main.py . -# Ensure __init__.py files exist for proper module importing -RUN touch backend/__init__.py -RUN touch backend/app/__init__.py -RUN touch backend/db/__init__.py -RUN touch backend/postcode_splitter/__init__.py -RUN touch utils/__init__.py - # Lambda handler CMD ["main.handler"] From c673604ec4b98a1fcae55ef010c236d62a658e5f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 21:43:03 +0000 Subject: [PATCH 031/135] imports --- backend/postcode_splitter/handler/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 0ec53108..13ac309e 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -10,6 +10,7 @@ RUN pip install --no-cache-dir -r requirements.txt # Copy necessary files for database and utility imports COPY utils/ utils/ COPY backend/ backend/ +COPY datatypes/ datatypes/ # Copy the handler COPY backend/postcode_splitter/main.py . From 45026b402fb6004bbbe4d7178f78466d4fb0bdbf Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 21:47:23 +0000 Subject: [PATCH 032/135] pydantic settings --- backend/postcode_splitter/handler/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/postcode_splitter/handler/requirements.txt b/backend/postcode_splitter/handler/requirements.txt index a718b818..6ef41b2d 100644 --- a/backend/postcode_splitter/handler/requirements.txt +++ b/backend/postcode_splitter/handler/requirements.txt @@ -7,4 +7,5 @@ epc-api-python==1.0.2 boto3==1.35.44 sqlmodel sqlalchemy==2.0.36 -psycopg2-binary==2.9.10 \ No newline at end of file +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 \ No newline at end of file From 5a995c8443de38b184cfff9ed82bb95fad5b7df0 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 21:57:19 +0000 Subject: [PATCH 033/135] save a random port number --- backend/.env.local | 2 +- backend/postcode_splitter/main.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/.env.local b/backend/.env.local index 22e1db35..9b478e53 100644 --- a/backend/.env.local +++ b/backend/.env.local @@ -30,7 +30,7 @@ GOOGLE_SOLAR_API_KEY="test" DB_HOST="test" DB_PASSWORD="test" DB_USERNAME="test" -DB_PORT="test" +DB_PORT="5432" DB_NAME="test" SAP_PREDICTIONS_BUCKET="test" CARBON_PREDICTIONS_BUCKET="test" diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index d5fe3b1b..740d1c7d 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -146,8 +146,8 @@ def handler(event, context): # Example SQS message for testing (copy and paste into SQS): # { - # "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - # "s3_uri": "https://337213553626-7ovirzjr.eu-west-2.console.aws.amazon.com/s3/object/retrofit-data-dev?region=eu-west-2&prefix=ara_raw_inputs/peabody/2025_11_11+-+Peabody+-+Data+Extracts+for+Domna_transformed.csv" + # "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + # "s3_uri": "https://337213553626-7ovirzjr.eu-west-2.console.aws.amazon.com/s3/object/retrofit-data-dev?region=eu-west-2&prefix=ara_raw_inputs/peabody/2025_11_11+-+Peabody+-+Data+Extracts+for+Domna_transformed.csv", # } # Handle both single event and batch events (SQS, etc.) From 851432b3573bebe56a3b9d9c439710670b9c4d16 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 22:10:27 +0000 Subject: [PATCH 034/135] database things --- .github/workflows/_build_image.yml | 15 ++++----- .github/workflows/deploy_terraform.yml | 4 +++ backend/postcode_splitter/handler/Dockerfile | 8 +++++ .../terraform/lambda/postcodeSplitter/main.tf | 31 ++++++++++++++++--- 4 files changed, 46 insertions(+), 12 deletions(-) diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml index 8b0d74ef..641e31f9 100644 --- a/.github/workflows/_build_image.yml +++ b/.github/workflows/_build_image.yml @@ -34,14 +34,19 @@ on: required: true DEV_DB_HOST: required: false - REAL_DB_HOST: + DEV_DB_PORT: + required: false + DEV_DB_NAME: required: false jobs: build: runs-on: ubuntu-latest + env: DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} outputs: image_digest: ${{ steps.digest.outputs.image_digest }} @@ -82,11 +87,7 @@ jobs: temp=$(eval echo "$line") BUILD_ARGS="$BUILD_ARGS --build-arg $temp" done <<< "${{ inputs.build_args }}" - - echo "dev db host: $DEV_DB_HOST" - echo "real db host: $REAL_DB_HOST" - echo "aws_key_id: $AWS_ACCESS_KEY_ID" - + docker build \ -f ${{ inputs.dockerfile_path }} \ $BUILD_ARGS \ @@ -103,4 +104,4 @@ jobs: --image-ids imageTag=${GITHUB_SHA} \ --query 'imageDetails[0].imageDigest' \ --output text) - echo "image_digest=$DIGEST" >> "$GITHUB_OUTPUT" + echo "image_digest=$DIGEST" >> "$GITHUB_OUTPUT" \ No newline at end of file diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 1356b341..ab42d4b9 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -116,6 +116,10 @@ jobs: ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} dockerfile_path: backend/postcode_splitter/handler/Dockerfile build_context: . + build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 13ac309e..74c00b9f 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -1,5 +1,13 @@ FROM public.ecr.aws/lambda/python:3.11 +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV DB_HOST=${DEV_DB_HOST} +ENV DB_PORT=${DEV_DB_PORT} +ENV DB_NAME=${DEV_DB_NAME} + # Set working directory (Lambda task root) WORKDIR /var/task diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index ebbdbfdc..7ba4506c 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -1,3 +1,20 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" # TODO: dont hardcode this + region = "eu-west-2" + } +} +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + module "lambda" { source = "../modules/lambda_with_sqs" @@ -7,8 +24,12 @@ module "lambda" { image_uri = local.image_uri - environment = { - STAGE = var.stage - LOG_LEVEL = "info" - } -} + environment = merge( + { + STAGE = var.stage + LOG_LEVEL = "info" + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + }, + ) +} \ No newline at end of file From 091edfdd3a9c93cbea5c55e767d7dd23a65adcec Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 22:12:11 +0000 Subject: [PATCH 035/135] database things --- .github/workflows/deploy_terraform.yml | 2 -- backend/condition/handler/Dockerfile | 2 -- backend/condition/handler/handler.py | 4 ---- 3 files changed, 8 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index ab42d4b9..9a9b4421 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -157,7 +157,6 @@ jobs: build_args: | JUNTE=best DEV_DB_HOST=$DEV_DB_HOST - REAL_DB_HOST=$REAL_DB_HOST AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID AWS_REGION=$AWS_REGION secrets: @@ -165,7 +164,6 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} - REAL_DB_HOST: ${{ secrets.dev_DB_HOST }} # ============================================================ # Deploy Condition ETL Lambda diff --git a/backend/condition/handler/Dockerfile b/backend/condition/handler/Dockerfile index 5cb95532..8759dff3 100644 --- a/backend/condition/handler/Dockerfile +++ b/backend/condition/handler/Dockerfile @@ -12,8 +12,6 @@ ENV JUNTE=${JUNTE} ARG DEV_DB_HOST ENV DEV_DB_HOST=${DEV_DB_HOST} -ARG REAL_DB_HOST -ENV REAL_DB_HOST=${REAL_DB_HOST} ARG AWS_ACCESS_KEY_ID ENV AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} diff --git a/backend/condition/handler/handler.py b/backend/condition/handler/handler.py index 21fa6928..0f8dd940 100644 --- a/backend/condition/handler/handler.py +++ b/backend/condition/handler/handler.py @@ -23,10 +23,6 @@ def handler(event: Mapping[str, Any], context: Any) -> None: "hello DEV DB HOST:", os.getenv("DEV_DB_HOST", "empty db"), ) - print( - "hello REAL DB HOST:", - os.getenv("REAL_DB_HOST", "empty db"), - ) print( "hello access key", os.getenv("AWS_ACCESS_KEY_ID", "empty key"), From 72df7fbb745294f38f622f9b297c16bd9ae6b8b6 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 22:13:10 +0000 Subject: [PATCH 036/135] database things --- .github/workflows/deploy_terraform.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 9a9b4421..b9fc533e 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -154,16 +154,10 @@ jobs: ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }} dockerfile_path: backend/condition/handler/Dockerfile build_context: . - build_args: | - JUNTE=best - DEV_DB_HOST=$DEV_DB_HOST - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID - AWS_REGION=$AWS_REGION secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} - DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} # ============================================================ # Deploy Condition ETL Lambda From 68ddced1af7f9b18d6e93215cc0d128b1b9c72f4 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 22:21:58 +0000 Subject: [PATCH 037/135] pass in secrets --- .github/workflows/deploy_terraform.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index b9fc533e..c863f6f1 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -124,6 +124,9 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} # ============================================================ # 3️⃣ Deploy Postcode Splitter Lambda From c56789a5023816fdd4e7831a2494b1316cdf550b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 22:31:04 +0000 Subject: [PATCH 038/135] show me secrets --- backend/postcode_splitter/main.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 740d1c7d..d51866a4 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -1,3 +1,12 @@ +import os +import sys +print("=" * 60) +print("ENVIRONMENT AT STARTUP:") +print("=" * 60) +for k, v in sorted(os.environ.items()): + print(f"{k}={v}") +print("=" * 60) + import json import pandas as pd import requests From 477ebcef6705738f11fad88d8016db475e3a0155 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 22:40:08 +0000 Subject: [PATCH 039/135] add more logging --- backend/postcode_splitter/main.py | 39 +++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index d51866a4..14610171 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -7,18 +7,33 @@ for k, v in sorted(os.environ.items()): print(f"{k}={v}") print("=" * 60) -import json -import pandas as pd -import requests -from uuid import UUID -from urllib.parse import unquote -from backend.address2UPRN.main import ( - resolve_uprns_for_postcode_group, - get_epc_data_with_postcode, -) -from backend.app.db.functions.tasks.Tasks import SubTaskInterface -from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict -from tqdm import tqdm +try: + import json + print("✓ json imported") + import pandas as pd + print("✓ pandas imported") + import requests + print("✓ requests imported") + from uuid import UUID + print("✓ UUID imported") + from urllib.parse import unquote + print("✓ urllib.parse imported") + from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict + print("✓ utils.s3 imported") + from tqdm import tqdm + print("✓ tqdm imported") + from backend.address2UPRN.main import ( + resolve_uprns_for_postcode_group, + get_epc_data_with_postcode, + ) + print("✓ backend.address2UPRN imported") + from backend.app.db.functions.tasks.Tasks import SubTaskInterface + print("✓ SubTaskInterface imported") +except Exception as e: + print(f"✗ IMPORT ERROR: {type(e).__name__}: {e}") + import traceback + traceback.print_exc() + raise def parse_s3_console_url(s3_uri: str) -> tuple[str, str]: From dd8a490210252f5b2c0c8de893c9cb7ab109663e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 22:57:23 +0000 Subject: [PATCH 040/135] lets do subtasks first --- backend/address2UPRN/main.py | 7 ++----- backend/postcode_splitter/main.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 1b3a6c8a..293ce3d9 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -5,10 +5,11 @@ import pandas as pd from difflib import SequenceMatcher from tqdm import tqdm from utils.logger import setup_logger +import re +from typing import Set logger = setup_logger() -import re EPC_AUTH_TOKEN = os.getenv( "EPC_AUTH_TOKEN", @@ -18,10 +19,6 @@ EPC_AUTH_TOKEN = os.getenv( if EPC_AUTH_TOKEN is None: raise RuntimeError("EPC_AUTH_TOKEN not defined in env") -import re -from difflib import SequenceMatcher -from typing import Set - def levenshtein(a: str, b: str) -> float: """ diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 14610171..e3a8c438 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -1,5 +1,6 @@ import os import sys + print("=" * 60) print("ENVIRONMENT AT STARTUP:") print("=" * 60) @@ -9,29 +10,39 @@ print("=" * 60) try: import json + print("✓ json imported") import pandas as pd + print("✓ pandas imported") import requests + print("✓ requests imported") from uuid import UUID + print("✓ UUID imported") from urllib.parse import unquote + print("✓ urllib.parse imported") from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict + print("✓ utils.s3 imported") from tqdm import tqdm + print("✓ tqdm imported") + from backend.app.db.functions.tasks.Tasks import SubTaskInterface + + print("✓ SubTaskInterface imported") from backend.address2UPRN.main import ( resolve_uprns_for_postcode_group, get_epc_data_with_postcode, ) + print("✓ backend.address2UPRN imported") - from backend.app.db.functions.tasks.Tasks import SubTaskInterface - print("✓ SubTaskInterface imported") except Exception as e: print(f"✗ IMPORT ERROR: {type(e).__name__}: {e}") import traceback + traceback.print_exc() raise From 1a0d463e2eeeb4c4d85a84a8e7cdaae74fc4d006 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 23:07:51 +0000 Subject: [PATCH 041/135] missing init.py --- backend/app/db/functions/tasks/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 backend/app/db/functions/tasks/__init__.py diff --git a/backend/app/db/functions/tasks/__init__.py b/backend/app/db/functions/tasks/__init__.py new file mode 100644 index 00000000..e69de29b From c0efa07d2a415697ae96ec41415c1d9152f7abb7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 23:15:53 +0000 Subject: [PATCH 042/135] handler remap --- backend/postcode_splitter/handler/Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 74c00b9f..ad0d1d69 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -20,9 +20,6 @@ COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ -# Copy the handler -COPY backend/postcode_splitter/main.py . - # Lambda handler -CMD ["main.handler"] +CMD ["backend.postcode_splitter.main.handler"] From f5981e91474e88d072479b82b0d1060a61e438fc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 23:22:55 +0000 Subject: [PATCH 043/135] imports are working now? --- backend/postcode_splitter/handler/Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index ad0d1d69..74c00b9f 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -20,6 +20,9 @@ COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ -# Lambda handler -CMD ["backend.postcode_splitter.main.handler"] +# Copy the handler +COPY backend/postcode_splitter/main.py . + +# Lambda handler +CMD ["main.handler"] From 8325bb53cf188274a8a2a3c92714601b8b50b288 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 23:25:52 +0000 Subject: [PATCH 044/135] added more logs --- backend/postcode_splitter/main.py | 32 ++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index e3a8c438..282e432a 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -176,8 +176,13 @@ def main(): def handler(event, context): + print("=" * 60) + print("HANDLER INVOKED") + print("=" * 60) print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") + print(f"Event received: {type(event)}") + print(f"Event keys: {event.keys() if isinstance(event, dict) else 'N/A'}") # Example SQS message for testing (copy and paste into SQS): # { @@ -186,24 +191,33 @@ def handler(event, context): # } # Handle both single event and batch events (SQS, etc.) + print("Extracting records from event...") records = event.get("Records", [event]) + print(f"Found {len(records)} record(s) to process") results = [] errors = [] + + print("Initializing SubTaskInterface...") subtask_interface = SubTaskInterface() + print("✓ SubTaskInterface initialized") for record in records: + print("Processing record...") task_id = None subtask_id = None try: # Parse body + print("Parsing body from record...") if isinstance(record.get("body"), str): body = json.loads(record["body"]) else: body = record.get("body", {}) + print(f"Body parsed: {body}") # Validate required fields task_id = body.get("task_id") s3_uri = body.get("s3_uri") + print(f"task_id: {task_id}, s3_uri: {s3_uri}") if not task_id: errors.append({"error": "Missing required field: task_id"}) @@ -214,13 +228,16 @@ def handler(event, context): continue # Convert task_id to UUID + print("Converting task_id to UUID...") try: task_id = UUID(task_id) if isinstance(task_id, str) else task_id + print(f"UUID conversion successful: {task_id}") except ValueError as e: errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"}) continue # Create a new subtask for this postcode splitter invocation + print(f"Creating subtask for task {task_id}...") subtask_id = subtask_interface.create_subtask( task_id=task_id, inputs={"s3_uri": s3_uri} ) @@ -231,19 +248,26 @@ def handler(event, context): print(f"Processing s3_uri: {s3_uri}") # Read CSV from S3 - print("Reading CSV from S3...") + print("Parsing S3 URI...") bucket, key = parse_s3_console_url(s3_uri) - print(f"Parsed S3 - Bucket: {bucket}, Key: {key}") + print(f"Bucket: {bucket}, Key: {key}") + + print("Fetching CSV from S3...") csv_data = read_csv_from_s3_dict(bucket, key) + print(f"CSV fetched: {len(csv_data)} rows") + + print("Creating DataFrame...") df = pd.DataFrame(csv_data) - print(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") + print(f"DataFrame created: {len(df)} rows, {len(df.columns)} columns") # Get head for demo + print("Getting DataFrame head...") df_head = df.head() print("DataFrame head:") print(df_head) df_head_dict = df_head.to_dict("records") + print("Appending result...") results.append( { "message": "Postcode splitter processing started", @@ -252,8 +276,10 @@ def handler(event, context): "subtask_id": str(subtask_id), } ) + print("Result appended") # Mark subtask as complete after successful processing + print("Updating subtask status to complete...") subtask_interface.update_subtask_status( subtask_id, "complete", From 94524379e480ca885cbbab4270578bbd977cbe00 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Feb 2026 23:34:02 +0000 Subject: [PATCH 045/135] even more logs --- backend/postcode_splitter/main.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 282e432a..8210bf78 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -203,14 +203,21 @@ def handler(event, context): for record in records: print("Processing record...") + print(f"Record type: {type(record)}") + print(f"Record: {record}") task_id = None subtask_id = None try: # Parse body print("Parsing body from record...") + print(f"record.get('body'): {record.get('body')}") + print(f"isinstance(record.get('body'), str): {isinstance(record.get('body'), str)}") + if isinstance(record.get("body"), str): + print("Body is string, parsing JSON...") body = json.loads(record["body"]) else: + print("Body is not string, using directly...") body = record.get("body", {}) print(f"Body parsed: {body}") From 8121e6d5b67d87b8e60b5f28a6a03edae2d7e465 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 07:53:54 +0000 Subject: [PATCH 046/135] more logs for s3 --- backend/postcode_splitter/main.py | 146 +++++++++++------------------- 1 file changed, 53 insertions(+), 93 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 8210bf78..1d0e56a0 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -1,50 +1,20 @@ import os import sys +import json +import pandas as pd +import requests +from uuid import UUID +from urllib.parse import unquote +from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict +from utils.logger import setup_logger +from tqdm import tqdm +from backend.app.db.functions.tasks.Tasks import SubTaskInterface +from backend.address2UPRN.main import ( + resolve_uprns_for_postcode_group, + get_epc_data_with_postcode, +) -print("=" * 60) -print("ENVIRONMENT AT STARTUP:") -print("=" * 60) -for k, v in sorted(os.environ.items()): - print(f"{k}={v}") -print("=" * 60) - -try: - import json - - print("✓ json imported") - import pandas as pd - - print("✓ pandas imported") - import requests - - print("✓ requests imported") - from uuid import UUID - - print("✓ UUID imported") - from urllib.parse import unquote - - print("✓ urllib.parse imported") - from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict - - print("✓ utils.s3 imported") - from tqdm import tqdm - - print("✓ tqdm imported") - from backend.app.db.functions.tasks.Tasks import SubTaskInterface - - print("✓ SubTaskInterface imported") - from backend.address2UPRN.main import ( - resolve_uprns_for_postcode_group, - get_epc_data_with_postcode, - ) - - print("✓ backend.address2UPRN imported") -except Exception as e: - print(f"✗ IMPORT ERROR: {type(e).__name__}: {e}") - import traceback - - traceback.print_exc() - raise +logger = setup_logger() def parse_s3_console_url(s3_uri: str) -> tuple[str, str]: @@ -53,15 +23,41 @@ def parse_s3_console_url(s3_uri: str) -> tuple[str, str]: Format: https://account-id-hash.region.console.aws.amazon.com/s3/object/bucket?region=...&prefix=path """ - if "console.aws.amazon.com" in s3_uri and "?prefix=" in s3_uri: + logger.info(f"Parsing S3 URI: {s3_uri}") + + if "console.aws.amazon.com" not in s3_uri: + logger.error("URI does not contain 'console.aws.amazon.com'") + raise ValueError(f"Could not parse S3 URI: {s3_uri}") + + if "?prefix=" not in s3_uri: + logger.error("URI does not contain '?prefix='") + raise ValueError(f"Could not parse S3 URI: {s3_uri}") + + try: base, query = s3_uri.split("?", 1) + logger.debug(f"Base: {base}") + logger.debug(f"Query: {query}") + path_parts = base.split("/s3/object/") + logger.debug(f"Path parts: {path_parts}") + if len(path_parts) > 1: bucket = path_parts[1] + logger.info(f"Extracted bucket: {bucket}") + params = dict(item.split("=") for item in query.split("&") if "=" in item) + logger.debug(f"Query params: {params}") + key = unquote(params.get("prefix", "")) + logger.info(f"Extracted key: {key}") + return bucket, key - raise ValueError(f"Could not parse S3 URI: {s3_uri}") + else: + logger.error(f"Could not find '/s3/object/' in URI") + raise ValueError(f"Could not parse S3 URI: {s3_uri}") + except Exception as e: + logger.error(f"Error parsing S3 URI: {type(e).__name__}: {e}") + raise ValueError(f"Could not parse S3 URI: {s3_uri}") from e def sanitise_postcode(postcode: str) -> str | None: @@ -176,13 +172,8 @@ def main(): def handler(event, context): - print("=" * 60) - print("HANDLER INVOKED") - print("=" * 60) print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") - print(f"Event received: {type(event)}") - print(f"Event keys: {event.keys() if isinstance(event, dict) else 'N/A'}") # Example SQS message for testing (copy and paste into SQS): # { @@ -191,40 +182,24 @@ def handler(event, context): # } # Handle both single event and batch events (SQS, etc.) - print("Extracting records from event...") records = event.get("Records", [event]) - print(f"Found {len(records)} record(s) to process") results = [] errors = [] - - print("Initializing SubTaskInterface...") subtask_interface = SubTaskInterface() - print("✓ SubTaskInterface initialized") for record in records: - print("Processing record...") - print(f"Record type: {type(record)}") - print(f"Record: {record}") task_id = None subtask_id = None try: # Parse body - print("Parsing body from record...") - print(f"record.get('body'): {record.get('body')}") - print(f"isinstance(record.get('body'), str): {isinstance(record.get('body'), str)}") - if isinstance(record.get("body"), str): - print("Body is string, parsing JSON...") body = json.loads(record["body"]) else: - print("Body is not string, using directly...") body = record.get("body", {}) - print(f"Body parsed: {body}") # Validate required fields task_id = body.get("task_id") s3_uri = body.get("s3_uri") - print(f"task_id: {task_id}, s3_uri: {s3_uri}") if not task_id: errors.append({"error": "Missing required field: task_id"}) @@ -235,46 +210,32 @@ def handler(event, context): continue # Convert task_id to UUID - print("Converting task_id to UUID...") try: task_id = UUID(task_id) if isinstance(task_id, str) else task_id - print(f"UUID conversion successful: {task_id}") except ValueError as e: errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"}) continue # Create a new subtask for this postcode splitter invocation - print(f"Creating subtask for task {task_id}...") subtask_id = subtask_interface.create_subtask( task_id=task_id, inputs={"s3_uri": s3_uri} ) - print(f"Created subtask {subtask_id} for task {task_id}") - - # Process normal flow - print(f"Processing task_id: {task_id}") - print(f"Processing s3_uri: {s3_uri}") + logger.info(f"Created subtask {subtask_id} for task {task_id}") # Read CSV from S3 - print("Parsing S3 URI...") + logger.info(f"Processing S3 URI: {s3_uri}") bucket, key = parse_s3_console_url(s3_uri) - print(f"Bucket: {bucket}, Key: {key}") + logger.info(f"S3 Bucket: {bucket}, Key: {key}") - print("Fetching CSV from S3...") csv_data = read_csv_from_s3_dict(bucket, key) - print(f"CSV fetched: {len(csv_data)} rows") - - print("Creating DataFrame...") df = pd.DataFrame(csv_data) - print(f"DataFrame created: {len(df)} rows, {len(df.columns)} columns") + logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") # Get head for demo - print("Getting DataFrame head...") df_head = df.head() - print("DataFrame head:") - print(df_head) - df_head_dict = df_head.to_dict("records") + logger.info("DataFrame head:") + logger.info(f"\n{df_head}") - print("Appending result...") results.append( { "message": "Postcode splitter processing started", @@ -283,10 +244,8 @@ def handler(event, context): "subtask_id": str(subtask_id), } ) - print("Result appended") # Mark subtask as complete after successful processing - print("Updating subtask status to complete...") subtask_interface.update_subtask_status( subtask_id, "complete", @@ -296,9 +255,10 @@ def handler(event, context): "rows_processed": len(df), }, ) - print(f"Subtask {subtask_id} marked as complete") + logger.info(f"Subtask {subtask_id} marked as complete") except json.JSONDecodeError as e: + logger.error(f"Invalid JSON in request body: {e}") errors.append({"error": "Invalid JSON in request body", "details": str(e)}) # Mark subtask as failed if we have one if subtask_id: @@ -307,9 +267,9 @@ def handler(event, context): subtask_id, "failed", outputs={"error": str(e)} ) except Exception as db_error: - print(f"Failed to update subtask status: {db_error}") + logger.error(f"Failed to update subtask status: {db_error}") except Exception as e: - print(f"Unexpected error processing record: {e}") + logger.error(f"Unexpected error processing record: {e}", exc_info=True) errors.append({"error": "Unexpected error", "details": str(e)}) # Mark subtask as failed if we have one if subtask_id: @@ -318,7 +278,7 @@ def handler(event, context): subtask_id, "failed", outputs={"error": str(e)} ) except Exception as db_error: - print(f"Failed to update subtask status: {db_error}") + logger.error(f"Failed to update subtask status: {db_error}") # Return error if all records failed if errors and not results: From a94e5ca592fd1e83d320bc2d8ae0bf2c34996282 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 08:04:57 +0000 Subject: [PATCH 047/135] s3 url processing --- backend/postcode_splitter/main.py | 43 ++++++++++++------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 1d0e56a0..adb8e5c9 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -23,41 +23,32 @@ def parse_s3_console_url(s3_uri: str) -> tuple[str, str]: Format: https://account-id-hash.region.console.aws.amazon.com/s3/object/bucket?region=...&prefix=path """ - logger.info(f"Parsing S3 URI: {s3_uri}") - - if "console.aws.amazon.com" not in s3_uri: - logger.error("URI does not contain 'console.aws.amazon.com'") - raise ValueError(f"Could not parse S3 URI: {s3_uri}") - - if "?prefix=" not in s3_uri: - logger.error("URI does not contain '?prefix='") - raise ValueError(f"Could not parse S3 URI: {s3_uri}") + logger.info("Parsing S3 console URL") try: + # Split base URL and query string + if "?" not in s3_uri: + raise ValueError("No query string found") + base, query = s3_uri.split("?", 1) - logger.debug(f"Base: {base}") - logger.debug(f"Query: {query}") + + # Extract bucket from base URL + if "/s3/object/" not in base: + raise ValueError("No '/s3/object/' found in URL path") path_parts = base.split("/s3/object/") - logger.debug(f"Path parts: {path_parts}") + bucket = path_parts[1] + logger.info(f"Extracted bucket: {bucket}") - if len(path_parts) > 1: - bucket = path_parts[1] - logger.info(f"Extracted bucket: {bucket}") + # Extract prefix from query parameters + params = dict(item.split("=") for item in query.split("&") if "=" in item) + key = unquote(params.get("prefix", "")) + logger.info(f"Extracted key: {key}") - params = dict(item.split("=") for item in query.split("&") if "=" in item) - logger.debug(f"Query params: {params}") - - key = unquote(params.get("prefix", "")) - logger.info(f"Extracted key: {key}") - - return bucket, key - else: - logger.error(f"Could not find '/s3/object/' in URI") - raise ValueError(f"Could not parse S3 URI: {s3_uri}") + return bucket, key except Exception as e: logger.error(f"Error parsing S3 URI: {type(e).__name__}: {e}") - raise ValueError(f"Could not parse S3 URI: {s3_uri}") from e + raise ValueError(f"Could not parse S3 URI") from e def sanitise_postcode(postcode: str) -> str | None: From 507ecfb8a14e7af0945e6609a08d652a89b0320b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 15:49:04 +0000 Subject: [PATCH 048/135] terrform files --- .../terraform/lambda/_template/main.tf | 49 ++++++++++++++++ .../terraform/lambda/postcodeSplitter/main.tf | 6 ++ .../terraform/modules/s3_iam_policy/main.tf | 29 ++++++++++ .../modules/s3_iam_policy/outputs.tf | 14 +++++ .../modules/s3_iam_policy/variables.tf | 39 +++++++++++++ infrastructure/terraform/shared/main.tf | 57 +++++++++++-------- 6 files changed, 170 insertions(+), 24 deletions(-) create mode 100644 infrastructure/terraform/modules/s3_iam_policy/main.tf create mode 100644 infrastructure/terraform/modules/s3_iam_policy/outputs.tf create mode 100644 infrastructure/terraform/modules/s3_iam_policy/variables.tf diff --git a/infrastructure/terraform/lambda/_template/main.tf b/infrastructure/terraform/lambda/_template/main.tf index 3010aa8a..2b767ce1 100644 --- a/infrastructure/terraform/lambda/_template/main.tf +++ b/infrastructure/terraform/lambda/_template/main.tf @@ -1,3 +1,30 @@ +# ============================================================================== +# TEMPLATE: Lambda Configuration with Optional S3 IAM Policy +# ============================================================================== +# Instructions: +# 1. Replace "REPLACE ME" with your lambda name (e.g., "my-lambda-name") +# 2. Add any additional environment variables as needed +# 3. To attach S3 IAM policies from shared state: +# - Uncomment the S3 policy attachment section below +# - Update the policy_arn to match the output from shared/main.tf +# - Available shared outputs (examples): +# - data.terraform_remote_state.shared.outputs.condition_etl_s3_read_arn +# - data.terraform_remote_state.shared.outputs.postcode_splitter_s3_read_arn +# 4. To create a NEW S3 policy: +# - Add a new module "lambda_s3_policy" in shared/main.tf using the +# s3_iam_policy module (see examples in shared/main.tf) +# - Then reference it here using data.terraform_remote_state.shared.outputs +# ============================================================================== + +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + module "lambda" { source = "../modules/lambda_with_sqs" @@ -12,3 +39,25 @@ module "lambda" { LOG_LEVEL = "info" } } + +# ====================================================================== +# OPTIONAL: Attach S3 IAM policy to Lambda execution role +# ====================================================================== +# Uncomment and configure the resource below to attach S3 permissions +# +# Example 1: Attach existing policy from shared state +# resource "aws_iam_role_policy_attachment" "lambda_s3_policy" { +# role = module.lambda.lambda_role_name +# policy_arn = data.terraform_remote_state.shared.outputs.YOUR_POLICY_OUTPUT_NAME_arn +# } +# +# Example 2: Attach multiple policies +# resource "aws_iam_role_policy_attachment" "lambda_read_policy" { +# role = module.lambda.lambda_role_name +# policy_arn = data.terraform_remote_state.shared.outputs.postcode_splitter_s3_read_arn +# } +# +# resource "aws_iam_role_policy_attachment" "lambda_write_policy" { +# role = module.lambda.lambda_role_name +# policy_arn = data.terraform_remote_state.shared.outputs.another_policy_arn +# } diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index 7ba4506c..9bbd1b26 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -32,4 +32,10 @@ module "lambda" { DB_PASSWORD = local.db_credentials.db_assessment_model_password }, ) +} + +# Attach S3 read policy to the Lambda execution role +resource "aws_iam_role_policy_attachment" "postcode_splitter_s3_read" { + role = module.lambda.lambda_role_name + policy_arn = data.terraform_remote_state.shared.outputs.postcode_splitter_s3_read_arn } \ No newline at end of file diff --git a/infrastructure/terraform/modules/s3_iam_policy/main.tf b/infrastructure/terraform/modules/s3_iam_policy/main.tf new file mode 100644 index 00000000..e4e1e2f9 --- /dev/null +++ b/infrastructure/terraform/modules/s3_iam_policy/main.tf @@ -0,0 +1,29 @@ +# Dynamically build S3 resources list from bucket ARNs and resource paths +locals { + # Generate full resource ARNs by combining bucket ARNs with resource paths + resources = flatten([ + for bucket_arn in var.bucket_arns : [ + for path in var.resource_paths : "${bucket_arn}${path}" + ] + ]) +} + +# IAM Policy with dynamic actions and resources +resource "aws_iam_policy" "s3_policy" { + name = var.policy_name + description = var.policy_description + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = var.actions + Resource = local.resources + Condition = var.conditions != null ? var.conditions : null + } + ] + }) + + tags = var.tags +} diff --git a/infrastructure/terraform/modules/s3_iam_policy/outputs.tf b/infrastructure/terraform/modules/s3_iam_policy/outputs.tf new file mode 100644 index 00000000..85defd9c --- /dev/null +++ b/infrastructure/terraform/modules/s3_iam_policy/outputs.tf @@ -0,0 +1,14 @@ +output "policy_arn" { + description = "ARN of the S3 IAM policy" + value = aws_iam_policy.s3_policy.arn +} + +output "policy_name" { + description = "Name of the S3 IAM policy" + value = aws_iam_policy.s3_policy.name +} + +output "policy_id" { + description = "ID of the S3 IAM policy" + value = aws_iam_policy.s3_policy.id +} diff --git a/infrastructure/terraform/modules/s3_iam_policy/variables.tf b/infrastructure/terraform/modules/s3_iam_policy/variables.tf new file mode 100644 index 00000000..ed53ea1f --- /dev/null +++ b/infrastructure/terraform/modules/s3_iam_policy/variables.tf @@ -0,0 +1,39 @@ +variable "policy_name" { + description = "Name of the IAM policy" + type = string +} + +variable "policy_description" { + description = "Description of the IAM policy" + type = string + default = "" +} + +variable "bucket_arns" { + description = "List of S3 bucket ARNs to grant access to" + type = list(string) +} + +variable "actions" { + description = "List of S3 actions to allow (e.g., ['s3:GetObject'], ['s3:PutObject'], ['s3:DeleteObject'])" + type = list(string) + default = ["s3:GetObject"] +} + +variable "resource_paths" { + description = "List of resource paths within buckets (e.g., ['/*'] for all objects, ['/specific-prefix/*'] for specific prefix)" + type = list(string) + default = ["/*"] +} + +variable "conditions" { + description = "Optional IAM policy conditions to apply to the statement" + type = any + default = null +} + +variable "tags" { + description = "Tags to apply to the policy" + type = map(string) + default = {} +} diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index b1474055..5e189dc9 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -321,6 +321,28 @@ module "condition_etl_registry" { } +# Condition Data S3 Bucket to store initial data +module "condition_data_bucket" { + source = "../modules/s3" + bucketname = "condition-data-${var.stage}" + allowed_origins = var.allowed_origins +} + +module "condition_etl_s3_read" { + source = "../modules/s3_iam_policy" + + policy_name = "ConditionETLReadS3" + policy_description = "Allow Lambda to read objects from condition-data-${var.stage}" + bucket_arns = ["arn:aws:s3:::condition-data-${var.stage}"] + actions = ["s3:GetObject"] + resource_paths = ["/*"] +} + +output "condition_etl_s3_read_arn" { + value = module.condition_etl_s3_read.policy_arn +} + + ################################################ # Postcode Splitter – Lambda ECR ################################################ @@ -337,30 +359,17 @@ module "postcode_splitter_registry" { } -################################################ -# Conidition data – S3 bucket -################################################ -module "condition_data_bucket" { - source = "../modules/s3" - bucketname = "condition-data-${var.stage}" - allowed_origins = var.allowed_origins +# S3 policy for postcode splitter to read from retrofit data bucket +module "postcode_splitter_s3_read" { + source = "../modules/s3_iam_policy" + + policy_name = "PostcodeSplitterReadS3" + policy_description = "Allow postcode splitter Lambda to read from retrofit-data bucket" + bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] + actions = ["s3:GetObject"] + resource_paths = ["/*"] } -resource "aws_iam_policy" "condition_etl_s3_read" { - name = "ConditionETLReadS3" - description = "Allow Lambda to read objects from condition-data-${var.stage}" - policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Effect = "Allow" - Action = ["s3:GetObject"] - Resource = "arn:aws:s3:::condition-data-${var.stage}/*" - } - ] - }) -} - -output "condition_etl_s3_read_arn" { - value = aws_iam_policy.condition_etl_s3_read.arn +output "postcode_splitter_s3_read_arn" { + value = module.postcode_splitter_s3_read.policy_arn } \ No newline at end of file From 8955082ac517f25aa23aff0205827499542240ed Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 15:54:10 +0000 Subject: [PATCH 049/135] wrong lambda --- infrastructure/terraform/lambda/_template/main.tf | 6 +++--- infrastructure/terraform/lambda/postcodeSplitter/main.tf | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/infrastructure/terraform/lambda/_template/main.tf b/infrastructure/terraform/lambda/_template/main.tf index 2b767ce1..7f60d684 100644 --- a/infrastructure/terraform/lambda/_template/main.tf +++ b/infrastructure/terraform/lambda/_template/main.tf @@ -47,17 +47,17 @@ module "lambda" { # # Example 1: Attach existing policy from shared state # resource "aws_iam_role_policy_attachment" "lambda_s3_policy" { -# role = module.lambda.lambda_role_name +# role = module.lambda.role_name # policy_arn = data.terraform_remote_state.shared.outputs.YOUR_POLICY_OUTPUT_NAME_arn # } # # Example 2: Attach multiple policies # resource "aws_iam_role_policy_attachment" "lambda_read_policy" { -# role = module.lambda.lambda_role_name +# role = module.lambda.role_name # policy_arn = data.terraform_remote_state.shared.outputs.postcode_splitter_s3_read_arn # } # # resource "aws_iam_role_policy_attachment" "lambda_write_policy" { -# role = module.lambda.lambda_role_name +# role = module.lambda.role_name # policy_arn = data.terraform_remote_state.shared.outputs.another_policy_arn # } diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index 9bbd1b26..68c433d1 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -36,6 +36,6 @@ module "lambda" { # Attach S3 read policy to the Lambda execution role resource "aws_iam_role_policy_attachment" "postcode_splitter_s3_read" { - role = module.lambda.lambda_role_name + role = module.lambda.role_name policy_arn = data.terraform_remote_state.shared.outputs.postcode_splitter_s3_read_arn } \ No newline at end of file From 6a29967b1bdf29b4cb4401e2addd2d867335eae8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 15:57:31 +0000 Subject: [PATCH 050/135] only run if the file gets changed --- .github/workflows/deploy_terraform.yml | 5 +++++ .github/workflows/unit_tests.yml | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 0d235ab1..5248383b 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -4,6 +4,11 @@ on: push: branches: - "**" + paths: + - 'infrastructure/terraform/**' + - '.github/workflows/deploy_terraform.yml' + - '.github/workflows/_build_image.yml' + - '.github/workflows/_deploy_lambda.yml' jobs: determine_stage: diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 14d5a06f..d3a92463 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -4,9 +4,6 @@ on: pull_request: branches: - "**" - push: - branches: - - "**" jobs: From 0c9dada6426d785dcefe42ca7cd2e7b89e87d6be Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 15:58:28 +0000 Subject: [PATCH 051/135] run for production --- .github/workflows/deploy_terraform.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 5248383b..88a84257 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -74,7 +74,7 @@ jobs: run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - name: Terraform Apply - if: env.STAGE == 'prod' + # if: env.STAGE == 'prod' working-directory: infrastructure/terraform/shared run: terraform apply -auto-approve tfplan From 12185bffa6fdebf6eb4f991ee0fc6978e22d3ab8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 16:17:28 +0000 Subject: [PATCH 052/135] destroy condition --- .github/workflows/_deploy_lambda.yml | 13 ++++++++++++- .github/workflows/deploy_terraform.yml | 1 + .../terraform/modules/s3_iam_policy/main.tf | 14 ++++++++------ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 9bd686aa..1ab50e8d 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -29,6 +29,12 @@ on: default: 'false' # can only be 'true' or 'false' + terraform_destroy: + required: false + type: string + default: 'false' + # can only be 'true' or 'false' + secrets: AWS_ACCESS_KEY_ID: required: true @@ -93,6 +99,11 @@ jobs: -out=lambdaplan - name: Terraform Apply - if: inputs.terraform_apply == 'true' || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/main' + if: (inputs.terraform_apply == 'true' || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/main') && inputs.terraform_destroy != 'true' working-directory: ${{ inputs.lambda_path }} run: terraform apply -auto-approve lambdaplan + + - name: Terraform Destroy + if: inputs.terraform_destroy == 'true' + working-directory: ${{ inputs.lambda_path }} + run: terraform destroy -auto-approve diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 88a84257..4c504ba9 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -186,6 +186,7 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.condition_etl_image.outputs.image_digest }} + terraform_destroy: 'true' secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} diff --git a/infrastructure/terraform/modules/s3_iam_policy/main.tf b/infrastructure/terraform/modules/s3_iam_policy/main.tf index e4e1e2f9..397bd963 100644 --- a/infrastructure/terraform/modules/s3_iam_policy/main.tf +++ b/infrastructure/terraform/modules/s3_iam_policy/main.tf @@ -16,12 +16,14 @@ resource "aws_iam_policy" "s3_policy" { policy = jsonencode({ Version = "2012-10-17" Statement = [ - { - Effect = "Allow" - Action = var.actions - Resource = local.resources - Condition = var.conditions != null ? var.conditions : null - } + merge( + { + Effect = "Allow" + Action = var.actions + Resource = local.resources + }, + var.conditions != null ? { Condition = var.conditions } : {} + ) ] }) From a9b8f09d9a217339430f8b30fa5c98273cc5c687 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 16:22:34 +0000 Subject: [PATCH 053/135] don't run apply yet must destroy first --- .github/workflows/deploy_terraform.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 4c504ba9..397eb6ee 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -74,7 +74,7 @@ jobs: run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - name: Terraform Apply - # if: env.STAGE == 'prod' + if: env.STAGE == 'prod' working-directory: infrastructure/terraform/shared run: terraform apply -auto-approve tfplan From cb6f0925c1c3c3eaff5aafa1e4337d3519c6836a Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 17:31:38 +0000 Subject: [PATCH 054/135] get rid of duplicagte env --- .github/workflows/deploy_terraform.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 1cdaaf79..a89eb42b 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -24,12 +24,6 @@ jobs: AWS_REGION: ${{ secrets.DEV_AWS_REGION }} DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} - env: - AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} - AWS_REGION: ${{ secrets.DEV_AWS_REGION }} - DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} - steps: - name: Determine stage from branch id: set-stage From 3f9e8b303c70b3e4882550cd182c9b1b714307c7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 18:08:03 +0000 Subject: [PATCH 055/135] terraform destroy --- .devcontainer/backend/Dockerfile | 15 ++++++++++++++- .github/workflows/_deploy_lambda.yml | 7 ++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile index 4c5d16f5..99cd66d6 100644 --- a/.devcontainer/backend/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -43,4 +43,17 @@ WORKDIR /workspaces/model # 6) Make Python find your package # Add project root to PYTHONPATH for all processes -ENV PYTHONPATH=/workspaces/model:${PYTHONPATH} \ No newline at end of file +ENV PYTHONPATH=/workspaces/model:${PYTHONPATH} + + +# Install terraform +RUN apt-get update && sudo apt-get install -y gnupg software-properties-common +RUN wget -O- https://apt.releases.hashicorp.com/gpg | \ +gpg --dearmor | \ +sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg > /dev/null +RUN echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ +https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ +tee /etc/apt/sources.list.d/hashicorp.list +RUN apt update +RUN apt-get install terraform +RUN terraform -install-autocomplete \ No newline at end of file diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index e0da2f2b..b8731446 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -106,4 +106,9 @@ jobs: - name: Terraform Destroy if: inputs.terraform_destroy == 'true' && inputs.terraform_apply != 'true' working-directory: ${{ inputs.lambda_path }} - run: terraform destroy -auto-approve + run: | + terraform destroy -auto-approve \ + -var="stage=${{ inputs.stage }}" \ + -var="lambda_name=${{ inputs.lambda_name }}" \ + -var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ + -var="image_digest=${{ inputs.image_digest }}" From eb393eb0e88a22bca26d4151922f02983a9da53f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 18:13:56 +0000 Subject: [PATCH 056/135] terraform apply new env --- .github/workflows/deploy_terraform.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index a89eb42b..3a46e9a1 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -76,7 +76,7 @@ jobs: run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - name: Terraform Apply - if: env.STAGE == 'prod' + # if: env.STAGE == 'prod' working-directory: infrastructure/terraform/shared run: terraform apply -auto-approve tfplan @@ -148,7 +148,8 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }} - terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + terraform_apply: 'true' secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} @@ -189,7 +190,8 @@ jobs: ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.condition_etl_image.outputs.image_digest }} # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} - terraform_destroy: 'true' + # terraform_destroy: 'true' + terraform_apply: 'true' secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} From e2fa13e2cc3d0eb6020ba348a8608e508d84902e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 18:17:58 +0000 Subject: [PATCH 057/135] delete it in a comment --- infrastructure/terraform/shared/main.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 5e189dc9..fc3d086a 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -328,19 +328,19 @@ module "condition_data_bucket" { allowed_origins = var.allowed_origins } -module "condition_etl_s3_read" { - source = "../modules/s3_iam_policy" +# module "condition_etl_s3_read" { +# source = "../modules/s3_iam_policy" - policy_name = "ConditionETLReadS3" - policy_description = "Allow Lambda to read objects from condition-data-${var.stage}" - bucket_arns = ["arn:aws:s3:::condition-data-${var.stage}"] - actions = ["s3:GetObject"] - resource_paths = ["/*"] -} +# policy_name = "ConditionETLReadS3" +# policy_description = "Allow Lambda to read objects from condition-data-${var.stage}" +# bucket_arns = ["arn:aws:s3:::condition-data-${var.stage}"] +# actions = ["s3:GetObject"] +# resource_paths = ["/*"] +# } -output "condition_etl_s3_read_arn" { - value = module.condition_etl_s3_read.policy_arn -} +# output "condition_etl_s3_read_arn" { +# value = module.condition_etl_s3_read.policy_arn +# } ################################################ From 0e5ea0f490f1a88d502f34eacb90b39ba134b76c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 18:19:54 +0000 Subject: [PATCH 058/135] now re deploy --- infrastructure/terraform/shared/main.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index fc3d086a..5e189dc9 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -328,19 +328,19 @@ module "condition_data_bucket" { allowed_origins = var.allowed_origins } -# module "condition_etl_s3_read" { -# source = "../modules/s3_iam_policy" +module "condition_etl_s3_read" { + source = "../modules/s3_iam_policy" -# policy_name = "ConditionETLReadS3" -# policy_description = "Allow Lambda to read objects from condition-data-${var.stage}" -# bucket_arns = ["arn:aws:s3:::condition-data-${var.stage}"] -# actions = ["s3:GetObject"] -# resource_paths = ["/*"] -# } + policy_name = "ConditionETLReadS3" + policy_description = "Allow Lambda to read objects from condition-data-${var.stage}" + bucket_arns = ["arn:aws:s3:::condition-data-${var.stage}"] + actions = ["s3:GetObject"] + resource_paths = ["/*"] +} -# output "condition_etl_s3_read_arn" { -# value = module.condition_etl_s3_read.policy_arn -# } +output "condition_etl_s3_read_arn" { + value = module.condition_etl_s3_read.policy_arn +} ################################################ From e549eae8202b838d1e8956d79798afd6c77481c7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 18:30:15 +0000 Subject: [PATCH 059/135] time out --- infrastructure/terraform/lambda/condition-etl/main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/infrastructure/terraform/lambda/condition-etl/main.tf b/infrastructure/terraform/lambda/condition-etl/main.tf index 4219f209..0128f975 100644 --- a/infrastructure/terraform/lambda/condition-etl/main.tf +++ b/infrastructure/terraform/lambda/condition-etl/main.tf @@ -23,7 +23,6 @@ module "lambda" { stage = var.stage image_uri = local.image_uri - timeout = 180 environment = merge( From 526d1a79631c3a1aaf6e6e0de1d9aeb15589aa9f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 18:46:25 +0000 Subject: [PATCH 060/135] default variables --- .github/workflows/deploy_terraform.yml | 4 +--- .../terraform/lambda/postcodeSplitter/main.tf | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 3a46e9a1..39132944 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -189,9 +189,7 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: condition-etl-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.condition_etl_image.outputs.image_digest }} - # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} - # terraform_destroy: 'true' - terraform_apply: 'true' + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index 68c433d1..2e2e91da 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -30,6 +30,20 @@ module "lambda" { LOG_LEVEL = "info" DB_USERNAME = local.db_credentials.db_assessment_model_username DB_PASSWORD = local.db_credentials.db_assessment_model_password + GOOGLE_SOLAR_API_KEY = "test" + SAP_PREDICTIONS_BUCKET = "test" + CARBON_PREDICTIONS_BUCKET = "test" + HEAT_PREDICTIONS_BUCKET = "test" + HEATING_KWH_PREDICTIONS_BUCKET = "test" + HOTWATER_KWH_PREDICTIONS_BUCKET = "test" + API_KEY = "test" + ENVIRONMENT = "test" + SECRET_KEY = "test" + PLAN_TRIGGER_BUCKET = "test" + DATA_BUCKET = "test" + EPC_AUTH_TOKEN = "test" + ENGINE_SQS_URL = "test" + ENERGY_ASSESSMENTS_BUCKET = "test" }, ) } From a8d89dc2863e7c0e9791d3190cb8c3d64ddfe980 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 19:12:34 +0000 Subject: [PATCH 061/135] s3 policy --- infrastructure/terraform/shared/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 5e189dc9..83845185 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -366,7 +366,7 @@ module "postcode_splitter_s3_read" { policy_name = "PostcodeSplitterReadS3" policy_description = "Allow postcode splitter Lambda to read from retrofit-data bucket" bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] - actions = ["s3:GetObject"] + actions = ["s3:GetObject", "s3:ListBucket"] resource_paths = ["/*"] } From 663f3755e7fed28c9ae1561188742fc524f992de Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 19:17:02 +0000 Subject: [PATCH 062/135] apply new s3 policy --- .github/workflows/deploy_terraform.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 39132944..ef1887ee 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -76,7 +76,7 @@ jobs: run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - name: Terraform Apply - # if: env.STAGE == 'prod' + if: env.STAGE == 'prod' working-directory: infrastructure/terraform/shared run: terraform apply -auto-approve tfplan From 9dc5e0b98447c3f3a623fcf1eed14ef2f1a7967d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 19:26:58 +0000 Subject: [PATCH 063/135] apply new s3 policy --- .github/workflows/deploy_terraform.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index ef1887ee..39132944 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -76,7 +76,7 @@ jobs: run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - name: Terraform Apply - if: env.STAGE == 'prod' + # if: env.STAGE == 'prod' working-directory: infrastructure/terraform/shared run: terraform apply -auto-approve tfplan From 7911bb4db0746f94bd7f01c7e82f8ffdc47c39bc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 21:08:39 +0000 Subject: [PATCH 064/135] parse uri --- backend/postcode_splitter/main.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index adb8e5c9..5a63d920 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -17,15 +17,30 @@ from backend.address2UPRN.main import ( logger = setup_logger() -def parse_s3_console_url(s3_uri: str) -> tuple[str, str]: +def parse_s3_uri(s3_uri: str) -> tuple[str, str]: """ - Parse AWS console S3 URL to extract bucket and key. + Parse S3 URI to extract bucket and key. - Format: https://account-id-hash.region.console.aws.amazon.com/s3/object/bucket?region=...&prefix=path + Supports two formats: + 1. S3 URI format: s3://bucket/key + 2. AWS console URL: https://account-id-hash.region.console.aws.amazon.com/s3/object/bucket?region=...&prefix=path """ - logger.info("Parsing S3 console URL") + logger.info("Parsing S3 URI") try: + # Check if it's an S3 URI format + if s3_uri.startswith("s3://"): + parts = s3_uri[5:].split("/", 1) + if len(parts) < 2: + raise ValueError("S3 URI must include both bucket and key") + bucket = parts[0] + key = parts[1] + logger.info(f"Extracted bucket: {bucket}, key: {key}") + return bucket, key + + # Otherwise, treat as AWS console URL + logger.info("Parsing as AWS console URL") + # Split base URL and query string if "?" not in s3_uri: raise ValueError("No query string found") @@ -215,7 +230,7 @@ def handler(event, context): # Read CSV from S3 logger.info(f"Processing S3 URI: {s3_uri}") - bucket, key = parse_s3_console_url(s3_uri) + bucket, key = parse_s3_uri(s3_uri) logger.info(f"S3 Bucket: {bucket}, Key: {key}") csv_data = read_csv_from_s3_dict(bucket, key) From 76e362520df88526514c0e5c9da5f93062e7b129 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 10 Feb 2026 21:15:14 +0000 Subject: [PATCH 065/135] parse uri --- infrastructure/terraform/lambda/postcodeSplitter/variables.tf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/infrastructure/terraform/lambda/postcodeSplitter/variables.tf b/infrastructure/terraform/lambda/postcodeSplitter/variables.tf index 9ce45fa5..0c8ba5b2 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/variables.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/variables.tf @@ -24,3 +24,6 @@ locals { output "resolved_image_uri" { value = local.image_uri } + + + From b7e201f3d47e088d71f66381f01d9ad05e727710 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 09:46:45 +0000 Subject: [PATCH 066/135] redploy my lambda without list and see if it works --- backend/address2UPRN/main.py | 2 +- backend/condition/condition_trigger_request.py | 2 +- backend/postcode_splitter/main.py | 1 - infrastructure/terraform/shared/main.tf | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 293ce3d9..2cc604cb 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -335,7 +335,7 @@ def get_uprn( address = top_rank_df["address"].values[0] lexiscore = float(top_rank_df["lexiscore"].values[0]) - epc = top_rank_df["current-energy-rating"].values[0] + epc = top_rank_df["current-energy-efficiency"].values[0] score = float(top_rank_df["lexiscore"].values[0]) # logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}") diff --git a/backend/condition/condition_trigger_request.py b/backend/condition/condition_trigger_request.py index 03bd6ad1..daa82949 100644 --- a/backend/condition/condition_trigger_request.py +++ b/backend/condition/condition_trigger_request.py @@ -29,5 +29,5 @@ class ConditionTriggerRequest(BaseModel): # { # "file_type": "LBWF", # "trigger_file_bucket": "condition-data-dev", -# "trigger_file_key": "input/lbwf/LBWF - Example Asset Data September 2025.xlsx", +# "trigger_file_key": "input/lbwf/LBWF - Example Asset Data September 2025.xlsx" # } diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 5a63d920..06a9d1a3 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -23,7 +23,6 @@ def parse_s3_uri(s3_uri: str) -> tuple[str, str]: Supports two formats: 1. S3 URI format: s3://bucket/key - 2. AWS console URL: https://account-id-hash.region.console.aws.amazon.com/s3/object/bucket?region=...&prefix=path """ logger.info("Parsing S3 URI") diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 83845185..5e189dc9 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -366,7 +366,7 @@ module "postcode_splitter_s3_read" { policy_name = "PostcodeSplitterReadS3" policy_description = "Allow postcode splitter Lambda to read from retrofit-data bucket" bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] - actions = ["s3:GetObject", "s3:ListBucket"] + actions = ["s3:GetObject"] resource_paths = ["/*"] } From d4ac6aee71df211e5c31238fc046a23991839faf Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 11:50:02 +0000 Subject: [PATCH 067/135] mount home directory to devcontainer home directory --- .devcontainer/backend/devcontainer.json | 2 +- asset_list/AssetList.py | 2 +- asset_list/app.py | 82 ++++---------- backend/address2UPRN/main.py | 23 ++++ backend/postcode_splitter/main.py | 143 ++++++------------------ 5 files changed, 76 insertions(+), 176 deletions(-) diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index 5d728dcd..6e2edc93 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -6,7 +6,7 @@ "workspaceFolder": "/workspaces/model", "postStartCommand": "bash .devcontainer/backend/post-install.sh", "mounts": [ - "source=${localEnv:HOME},target=/workspaces/home,type=bind" + "source=${localEnv:HOME},target=/home/vscode,type=bind" ], "customizations": { "vscode": { diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index ea4d8b34..36b3d58e 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -34,7 +34,7 @@ from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "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") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") diff --git a/asset_list/app.py b/asset_list/app.py index 43c653a7..02557831 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -13,11 +13,15 @@ from asset_list.utils import get_data from dotenv import load_dotenv from backend.SearchEpc import SearchEpc -load_dotenv(dotenv_path="backend/.env") +load_dotenv(dotenv_path="../backend/.env") EPC_AUTH_TOKEN = os.getenv( "EPC_AUTH_TOKEN", ) +OPENAI_API_KEY = os.getenv( + "OPENAI_API_KEY", +) + def extract_address1( asset_list, full_address_col, postcode_col, method="first_two_words" @@ -69,72 +73,24 @@ def app(): Property UPRN """ -<<<<<<< HEAD - data_folder = "/workspaces/model/asset_list/" - data_filename = "manchester.xlsx" - sheet_name = "PW0099 - Property List" - postcode_column = "post Code" - address1_column = "address" - address1_method = None - fulladdress_column = None - address_cols_to_concat = ["address"] -======= - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Aspire" - data_filename = "ASPIRE ASSET LIST.xlsx" - sheet_name = "Asset List" - postcode_column = "Postcode" + data_folder = "/workspaces/model/asset_list" + data_filename = "assets.xlsx" + sheet_name = "Sheet1" + postcode_column = "POSTCODE" address1_column = None address1_method = "house_number_extraction" - fulladdress_column = "Address" + fulladdress_column = "ADDRESS" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_built_form = None - landlord_wall_construction = None - landlord_roof_construction = None - landlord_heating_system = None + landlord_property_type = "PROPERTY TYPE" + landlord_built_form = None # Skipped as empty + landlord_wall_construction = "wall combined" # combin F + G + landlord_roof_construction = "HEATING SYSTEM" # Combine I + J + landlord_heating_system = None # Check with Khalim landlord_existing_pv = None - landlord_property_id = "LLUPRN" - landlord_sap = None - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_id_colnames = [] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = None - asset_list_header = 0 - landlord_block_reference = None - - # Peabody data for cleaning - data_folder = ( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " - "Project/data_validation" - ) - data_filename = "to_standardise_uprns.xlsx" - sheet_name = "Sheet1" - postcode_column = "Postcode" - address1_column = None - address1_method = "house_number_extraction" - fulladdress_column = "Address" - address_cols_to_concat = None ->>>>>>> d4064da36565f87c2b72d10e9f3604cc6c37bdb6 - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = None - landlord_built_form = None - landlord_wall_construction = None - landlord_roof_construction = None - landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "UHTprop Ref" + landlord_property_id = "UPRN" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -286,7 +242,7 @@ def app(): if skip is not None and not force_retrieve_data: if i <= skip: continue - chunk = asset_list.standardised_asset_list[i: i + chunk_size] + chunk = asset_list.standardised_asset_list[i : i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, @@ -429,7 +385,7 @@ def app(): # Retrieve just the data we need epc_df = epc_df[ [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) - ].rename(columns=asset_list.EPC_API_DATA_NAMES) + ].rename(columns=asset_list.EPC_API_DATA_NAMES) # Look for columns not in the find my EPC data, which will have happened if we didn't # retrieve it in the first place @@ -446,7 +402,7 @@ def app(): find_my_epc_data[ [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) - ].rename(columns=asset_list.FIND_EPC_DATA_NAMES), + ].rename(columns=asset_list.FIND_EPC_DATA_NAMES), how="left", on=asset_list.DOMNA_PROPERTY_ID, ) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 2cc604cb..fb812d67 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -20,6 +20,29 @@ if EPC_AUTH_TOKEN is None: raise RuntimeError("EPC_AUTH_TOKEN not defined in env") +def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. + """ + POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" + if not postcode_clean: + return False + + try: + resp = requests.get( + POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), + timeout=5, + ) + resp.raise_for_status() + return resp.json().get("result", False) + except requests.RequestException: + # Network issues, rate limits, etc. + return False + + def levenshtein(a: str, b: str) -> float: """ Address similarity score in [0, 1]. diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 06a9d1a3..0f21a67f 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -78,112 +78,14 @@ def sanitise_postcode(postcode: str) -> str | None: return postcode.upper().replace(" ", "") -def is_valid_postcode(postcode_clean: str) -> bool: - """ - Validate postcode using postcodes.io. - - Expects a sanitised postcode (e.g. E84SQ). - Returns True if valid, False otherwise. - """ - POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" - if not postcode_clean: - return False - - try: - resp = requests.get( - POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), - timeout=5, - ) - resp.raise_for_status() - return resp.json().get("result", False) - except requests.RequestException: - # Network issues, rate limits, etc. - return False - - -def main(): - df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability") - df = df.head(500) - - # Sanitise postcodes - df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) - - # --- validate AFTER grouping (save API calls) --- - - # Get unique, non-null postcodes - unique_postcodes = df["postcode_clean"].dropna().unique() - - # Validate each postcode once, TODOadd a progress bar - postcode_validity = { - pc: is_valid_postcode(pc) - for pc in tqdm(unique_postcodes, total=len(unique_postcodes)) - } - - # Map validity back onto dataframe - df["postcode_valid"] = df["postcode_clean"].map(postcode_validity) - - results = [] - - for postcode, group_df in tqdm( - df[df["postcode_valid"]].groupby("postcode_clean"), - desc="Resolving UPRNs by postcode", - ): - try: - epc_df = get_epc_data_with_postcode(postcode) - - if epc_df.empty: - tmp = group_df.copy() - tmp["found_uprn"] = None - tmp["status"] = "no_epc_results" - results.append(tmp) - continue - - resolved = resolve_uprns_for_postcode_group( - group_df=group_df, - epc_df=epc_df, - ) - - results.append(resolved) - - except Exception as e: - tmp = group_df.copy() - tmp["found_uprn"] = None - tmp["status"] = "exception" - tmp["error"] = str(e) - results.append(tmp) - - final_df = pd.concat(results, ignore_index=True) - a = final_df[ - [ - "best_match_lexiscore", - "Address 1", - "best_match_address", - "Postcode", - "UPRN", - "best_match_uprn", - ] - ] # add levi score to viewing - b = final_df[final_df["best_match_lexiscore"] > 0] # add levi score to viewing - b = b[ - [ - "best_match_lexiscore", - "Address 1", - "best_match_address", - "Postcode", - "UPRN", - "best_match_uprn", - ] - ] - - -def handler(event, context): +def handler(event, context, local=False): print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") # Example SQS message for testing (copy and paste into SQS): # { - # "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - # "s3_uri": "https://337213553626-7ovirzjr.eu-west-2.console.aws.amazon.com/s3/object/retrofit-data-dev?region=eu-west-2&prefix=ara_raw_inputs/peabody/2025_11_11+-+Peabody+-+Data+Extracts+for+Domna_transformed.csv", + # "task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917", + # "s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv" # } # Handle both single event and batch events (SQS, etc.) @@ -196,7 +98,13 @@ def handler(event, context): task_id = None subtask_id = None try: - # Parse body + # For local development + if local is True: + record = {} + record["body"] = ( + '{"task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917","s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv"}' + ) + # Parse body (inputs) if isinstance(record.get("body"), str): body = json.loads(record["body"]) else: @@ -236,17 +144,33 @@ def handler(event, context): df = pd.DataFrame(csv_data) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") - # Get head for demo - df_head = df.head() - logger.info("DataFrame head:") - logger.info(f"\n{df_head}") + # Sanitise postcodes + df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) + + # Group by sanitised postcode (excluding null values) + grouped_data = [] + for postcode, group_df in df.dropna(subset=["postcode_clean"]).groupby( + "postcode_clean" + ): + group_info = { + "postcode": postcode, + "row_count": len(group_df), + "rows": group_df.to_dict(orient="records"), + } + grouped_data.append(group_info) + logger.info(f"Postcode: {postcode}, Rows: {len(group_df)}") + + logger.info(f"Total postcodes: {len(grouped_data)}") results.append( { - "message": "Postcode splitter processing started", + "message": "Postcode splitter processing completed", "task_id": str(task_id), "s3_uri": s3_uri, "subtask_id": str(subtask_id), + "total_rows": len(df), + "total_postcodes": len(grouped_data), + "grouped_data": grouped_data, } ) @@ -258,6 +182,7 @@ def handler(event, context): "status": "processing_complete", "s3_uri": s3_uri, "rows_processed": len(df), + "total_postcodes": len(grouped_data), }, ) logger.info(f"Subtask {subtask_id} marked as complete") @@ -295,7 +220,3 @@ def handler(event, context): {"processed": results, "errors": errors if errors else None} ), } - - -if __name__ == "__main__": - main() From ffb840da81e131bcdeb2d1fd784f909b72493f68 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 13:11:31 +0000 Subject: [PATCH 068/135] added address2uprn and postcodesplitter link --- .github/workflows/deploy_terraform.yml | 5 +- backend/address2UPRN/main.py | 98 +-------- backend/postcode_splitter/main.py | 186 +++++++++++++----- .../terraform/lambda/postcodeSplitter/main.tf | 33 ++++ 4 files changed, 180 insertions(+), 142 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 39132944..514fc7af 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -107,7 +107,8 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.address2uprn_image.outputs.image_digest }} - terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + terraform_apply: 'true' secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} @@ -140,7 +141,7 @@ jobs: # 3️⃣ Deploy Postcode Splitter Lambda # ============================================================ postcodeSplitter_lambda: - needs: [postcodeSplitter_image, determine_stage] + needs: [postcodeSplitter_image, determine_stage, address2uprn_lambda] uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: postcodeSplitter diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index fb812d67..33c37760 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -506,99 +506,13 @@ def run_all_test(): ) -if __name__ == "__main__": - INPUT_FILE = "hackney.xlsx" - - ADDRESS_COL = "Address 1" - POSTCODE_COL = "Postcode" - UPRN_COL = "UPRN" - - df = pd.read_excel(INPUT_FILE) - - failures = [] - - for _, row in tqdm( - df.iterrows(), - total=len(df), - desc="Auditing UPRNs", - ): - input_address = str(row[ADDRESS_COL]).strip() - postcode = str(row[POSTCODE_COL]).strip() - - expected_uprn = None if pd.isna(row[UPRN_COL]) else str(int(row[UPRN_COL])) - - try: - epc_df = get_epc_data_with_postcode(postcode) - - if epc_df.empty: - failures.append( - { - **row.to_dict(), - "found_uprn": None, - "best_match_uprn": None, - "best_match_address": None, - "best_match_lexiscore": None, - "status": "no_epc_results", - } - ) - continue - - scored_df = get_uprn_candidates( - epc_df, - user_address=input_address, - ) - - best_row = scored_df.iloc[0] - - best_match_uprn = str(best_row["uprn"]) - best_match_address = best_row["address"] - best_match_lexiscore = round(float(best_row["lexiscore"]), 4) - - found_uprn = get_uprn(input_address, postcode) - - except Exception as e: - failures.append( - { - **row.to_dict(), - "found_uprn": None, - "best_match_uprn": None, - "best_match_address": None, - "best_match_lexiscore": None, - "status": "exception", - "error": str(e), - } - ) - continue - - found_uprn_norm = None if not found_uprn else str(found_uprn) - - if found_uprn_norm != expected_uprn: - failures.append( - { - **row.to_dict(), - "found_uprn": found_uprn_norm, - "best_match_uprn": best_match_uprn, - "best_match_address": best_match_address, - "best_match_lexiscore": best_match_lexiscore, - "status": ("no_match" if found_uprn_norm is None else "mismatch"), - } - ) - - failures_df = pd.DataFrame(failures) - - print("===================================") - print(f"Total rows : {len(df)}") - print(f"Failures : {len(failures_df)}") - print("===================================") - - failures_df.to_excel( - "hackney_uprn_failures.xlsx", - index=False, - ) - - def handler(event, context): - print("hello world") + print("=== Address2UPRN Lambda Handler ===") + print(f"Function: {context.function_name}") + print(f"Request ID: {context.aws_request_id}") + print(f"Event: {json.dumps(event, indent=2, default=str)}") + print(f"Context: {context}") + print("===================================") return {"statusCode": 200, "body": "hello world"} diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 0f21a67f..d515a21f 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -3,16 +3,13 @@ import sys import json import pandas as pd import requests +import boto3 from uuid import UUID from urllib.parse import unquote from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict from utils.logger import setup_logger from tqdm import tqdm from backend.app.db.functions.tasks.Tasks import SubTaskInterface -from backend.address2UPRN.main import ( - resolve_uprns_for_postcode_group, - get_epc_data_with_postcode, -) logger = setup_logger() @@ -65,17 +62,39 @@ def parse_s3_uri(s3_uri: str) -> tuple[str, str]: raise ValueError(f"Could not parse S3 URI") from e -def sanitise_postcode(postcode: str) -> str | None: +def send_to_address2uprn_queue(task_id: str, rows: list) -> str: """ - Normalise postcode for grouping. + Send a postcode group to the address2UPRN SQS queue. - - Uppercase - - Remove all whitespace + Args: + task_id: The parent task ID + rows: List of row dictionaries for this postcode group + + Returns: + Message ID from SQS """ - if pd.isna(postcode): - return None + sqs_client = boto3.client("sqs") + queue_url = os.getenv("ADDRESS2UPRN_QUEUE_URL") - return postcode.upper().replace(" ", "") + if not queue_url: + raise ValueError("ADDRESS2UPRN_QUEUE_URL environment variable not set") + + message_body = { + "task_id": task_id, + "rows": rows, + } + + response = sqs_client.send_message( + QueueUrl=queue_url, + MessageBody=json.dumps(message_body), + ) + + logger.info( + f"Sent message to address2UPRN queue. " + f"Task: {task_id}, MessageId: {response['MessageId']}" + ) + + return response["MessageId"] def handler(event, context, local=False): @@ -142,50 +161,121 @@ def handler(event, context, local=False): csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) + # just do 5 well we are testing, sqs connection + df = df.head(5) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") # Sanitise postcodes - df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) + df["postcode_clean"] = df["postcode"].str.upper().str.replace(" ", "") - # Group by sanitised postcode (excluding null values) - grouped_data = [] - for postcode, group_df in df.dropna(subset=["postcode_clean"]).groupby( - "postcode_clean" - ): - group_info = { - "postcode": postcode, - "row_count": len(group_df), - "rows": group_df.to_dict(orient="records"), - } - grouped_data.append(group_info) - logger.info(f"Postcode: {postcode}, Rows: {len(group_df)}") + clean_df = df.dropna(subset=["postcode_clean"]) - logger.info(f"Total postcodes: {len(grouped_data)}") + postcode_to_addresses = { + postcode: group.to_dict(orient="records") + for postcode, group in clean_df.groupby("postcode_clean", sort=False) + } - results.append( - { - "message": "Postcode splitter processing completed", - "task_id": str(task_id), - "s3_uri": s3_uri, - "subtask_id": str(subtask_id), - "total_rows": len(df), - "total_postcodes": len(grouped_data), - "grouped_data": grouped_data, - } - ) + logger.info(f"Total postcodes: {len(postcode_to_addresses)}") - # Mark subtask as complete after successful processing - subtask_interface.update_subtask_status( - subtask_id, - "complete", - outputs={ - "status": "processing_complete", - "s3_uri": s3_uri, - "rows_processed": len(df), - "total_postcodes": len(grouped_data), - }, - ) - logger.info(f"Subtask {subtask_id} marked as complete") + # Batch rows in groups of 500 + batch_rows = [] + batch_size = 500 + + for postcode, rows in postcode_to_addresses.items(): + # If postcode itself is larger than batch_size, send it individually + if len(rows) > batch_size: + # First, send the current batch if it has data + if batch_rows: + try: + send_to_address2uprn_queue( + task_id=str(task_id), + rows=batch_rows, + ) + logger.info( + f"Sent batch of {len(batch_rows)} rows to address2UPRN queue" + ) + batch_rows = [] + except Exception as e: + logger.error( + f"Failed to send batch to address2UPRN queue: {e}", + exc_info=True, + ) + errors.append( + { + "error": "Failed to send to address2UPRN queue", + "details": str(e), + } + ) + + # Send the large postcode on its own + try: + send_to_address2uprn_queue( + task_id=str(task_id), + rows=rows, + ) + logger.info( + f"Sent large postcode {postcode} ({len(rows)} rows) to address2UPRN queue" + ) + except Exception as e: + logger.error( + f"Failed to send large postcode to address2UPRN queue: {e}", + exc_info=True, + ) + errors.append( + { + "error": "Failed to send to address2UPRN queue", + "details": str(e), + } + ) + continue + + # If adding this postcode's rows would exceed batch_size, send current batch + if batch_rows and len(batch_rows) + len(rows) > batch_size: + try: + send_to_address2uprn_queue( + task_id=str(task_id), + rows=batch_rows, + ) + logger.info( + f"Sent batch of {len(batch_rows)} rows to address2UPRN queue" + ) + batch_rows = [] + except Exception as e: + logger.error( + f"Failed to send batch to address2UPRN queue: {e}", + exc_info=True, + ) + errors.append( + { + "error": "Failed to send to address2UPRN queue", + "details": str(e), + } + ) + + # Add current postcode's rows to batch + batch_rows.extend(rows) + + # Send remaining batch + if batch_rows: + try: + send_to_address2uprn_queue( + task_id=str(task_id), + rows=batch_rows, + ) + logger.info( + f"Sent final batch of {len(batch_rows)} rows to address2UPRN queue" + ) + except Exception as e: + logger.error( + f"Failed to send final batch to address2UPRN queue: {e}", + exc_info=True, + ) + errors.append( + { + "error": "Failed to send to address2UPRN queue", + "details": str(e), + } + ) except json.JSONDecodeError as e: logger.error(f"Invalid JSON in request body: {e}") diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index 2e2e91da..69b80011 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -15,6 +15,16 @@ locals { db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) } +# Reference the existing address2UPRN Lambda outputs from shared state +data "terraform_remote_state" "address2uprn" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + module "lambda" { source = "../modules/lambda_with_sqs" @@ -44,6 +54,7 @@ module "lambda" { EPC_AUTH_TOKEN = "test" ENGINE_SQS_URL = "test" ENERGY_ASSESSMENTS_BUCKET = "test" + ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url }, ) } @@ -52,4 +63,26 @@ module "lambda" { resource "aws_iam_role_policy_attachment" "postcode_splitter_s3_read" { role = module.lambda.role_name policy_arn = data.terraform_remote_state.shared.outputs.postcode_splitter_s3_read_arn +} + +# Create SQS send policy for address2UPRN queue +module "postcode_splitter_sqs_policy" { + source = "../../modules/general_iam_policy" + + policy_name = "postcode-splitter-sqs-send-${var.stage}" + policy_description = "Allow postcode-splitter Lambda to send messages to address2UPRN queue" + + actions = [ + "sqs:SendMessage" + ] + + resources = [ + data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_arn + ] +} + +# Attach SQS policy to the Lambda execution role +resource "aws_iam_role_policy_attachment" "postcode_splitter_sqs_send" { + role = module.lambda.role_name + policy_arn = module.postcode_splitter_sqs_policy.policy_arn } \ No newline at end of file From 203843c387adafbba7eb3e1f47627343e296958d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 13:16:11 +0000 Subject: [PATCH 069/135] added new files --- .../terraform/lambda/address2UPRN/outputs.tf | 14 ++++++++ .../modules/general_iam_policy/main.tf | 21 ++++++++++++ .../modules/general_iam_policy/outputs.tf | 9 ++++++ .../modules/general_iam_policy/variables.tf | 32 +++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 infrastructure/terraform/lambda/address2UPRN/outputs.tf create mode 100644 infrastructure/terraform/modules/general_iam_policy/main.tf create mode 100644 infrastructure/terraform/modules/general_iam_policy/outputs.tf create mode 100644 infrastructure/terraform/modules/general_iam_policy/variables.tf diff --git a/infrastructure/terraform/lambda/address2UPRN/outputs.tf b/infrastructure/terraform/lambda/address2UPRN/outputs.tf new file mode 100644 index 00000000..e4645a0a --- /dev/null +++ b/infrastructure/terraform/lambda/address2UPRN/outputs.tf @@ -0,0 +1,14 @@ +output "address2uprn_queue_url" { + value = module.address2uprn.queue_url + description = "URL of the address2UPRN SQS queue" +} + +output "address2uprn_queue_arn" { + value = module.address2uprn.queue_arn + description = "ARN of the address2UPRN SQS queue" +} + +output "address2uprn_lambda_arn" { + value = module.address2uprn.lambda_arn + description = "ARN of the address2UPRN Lambda function" +} diff --git a/infrastructure/terraform/modules/general_iam_policy/main.tf b/infrastructure/terraform/modules/general_iam_policy/main.tf new file mode 100644 index 00000000..f7ffe4a1 --- /dev/null +++ b/infrastructure/terraform/modules/general_iam_policy/main.tf @@ -0,0 +1,21 @@ +# IAM Policy with dynamic actions and resources +resource "aws_iam_policy" "policy" { + name = var.policy_name + description = var.policy_description + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + merge( + { + Effect = "Allow" + Action = var.actions + Resource = var.resources + }, + var.conditions != null ? { Condition = var.conditions } : {} + ) + ] + }) + + tags = var.tags +} diff --git a/infrastructure/terraform/modules/general_iam_policy/outputs.tf b/infrastructure/terraform/modules/general_iam_policy/outputs.tf new file mode 100644 index 00000000..cfceab05 --- /dev/null +++ b/infrastructure/terraform/modules/general_iam_policy/outputs.tf @@ -0,0 +1,9 @@ +output "policy_arn" { + value = aws_iam_policy.policy.arn + description = "ARN of the created IAM policy" +} + +output "policy_name" { + value = aws_iam_policy.policy.name + description = "Name of the created IAM policy" +} diff --git a/infrastructure/terraform/modules/general_iam_policy/variables.tf b/infrastructure/terraform/modules/general_iam_policy/variables.tf new file mode 100644 index 00000000..0d824eb5 --- /dev/null +++ b/infrastructure/terraform/modules/general_iam_policy/variables.tf @@ -0,0 +1,32 @@ +variable "policy_name" { + description = "Name of the IAM policy" + type = string +} + +variable "policy_description" { + description = "Description of the IAM policy" + type = string + default = "" +} + +variable "actions" { + description = "List of IAM actions allowed by this policy" + type = list(string) +} + +variable "resources" { + description = "List of AWS resources this policy applies to" + type = list(string) +} + +variable "conditions" { + description = "Optional IAM policy conditions" + type = any + default = null +} + +variable "tags" { + description = "Tags to apply to the policy" + type = map(string) + default = {} +} From b2f67bfa785efe8af887930168f41533ed751cd5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 13:25:41 +0000 Subject: [PATCH 070/135] address2 uprn --- infrastructure/terraform/lambda/postcodeSplitter/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index 69b80011..0350a139 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -19,7 +19,7 @@ locals { data "terraform_remote_state" "address2uprn" { backend = "s3" config = { - bucket = "assessment-model-terraform-state" + bucket = "address2uprn-terraform-state" key = "env:/${var.stage}/terraform.tfstate" region = "eu-west-2" } From ef0b0d6142c2833565bf797f70a0467e8ad0cebf Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 13:31:47 +0000 Subject: [PATCH 071/135] add json --- backend/address2UPRN/main.py | 1 + infrastructure/terraform/lambda/postcodeSplitter/main.tf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 33c37760..30066bcb 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -7,6 +7,7 @@ from tqdm import tqdm from utils.logger import setup_logger import re from typing import Set +import json logger = setup_logger() diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index 0350a139..81120772 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -15,7 +15,7 @@ locals { db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) } -# Reference the existing address2UPRN Lambda outputs from shared state +# Reference the existing address2UPRN Lambda outputs from address2uprn state data "terraform_remote_state" "address2uprn" { backend = "s3" config = { From 5a0e0c0a698f858abdfcb39554370dabd2e35c25 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 13:45:06 +0000 Subject: [PATCH 072/135] add more logic to batch and also missing libraries --- backend/address2UPRN/main.py | 1 + backend/postcode_splitter/main.py | 153 +++++++++++++++++++----------- 2 files changed, 96 insertions(+), 58 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 30066bcb..777dde0e 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -8,6 +8,7 @@ from utils.logger import setup_logger import re from typing import Set import json +import requests logger = setup_logger() diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index d515a21f..eb7cf044 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -177,23 +177,103 @@ def handler(event, context, local=False): logger.info(f"Total postcodes: {len(postcode_to_addresses)}") - # Batch rows in groups of 500 - batch_rows = [] + # Calculate total rows to send + total_rows = sum(len(rows) for rows in postcode_to_addresses.values()) + logger.info(f"Total rows to send: {total_rows}") + batch_size = 500 - for postcode, rows in postcode_to_addresses.items(): - # If postcode itself is larger than batch_size, send it individually - if len(rows) > batch_size: - # First, send the current batch if it has data - if batch_rows: + # If all rows fit in one batch, just send them all at once + if total_rows <= batch_size: + all_rows = [] + for postcode, rows in postcode_to_addresses.items(): + all_rows.extend(rows) + try: + send_to_address2uprn_queue( + task_id=str(task_id), + rows=all_rows, + ) + logger.info(f"Sent all {len(all_rows)} rows in single batch to address2UPRN queue") + except Exception as e: + logger.error( + f"Failed to send all rows to address2UPRN queue: {e}", + exc_info=True, + ) + errors.append( + { + "error": "Failed to send to address2UPRN queue", + "details": str(e), + } + ) + else: + # Multi-batch processing for large datasets + batch_rows = [] + total_sent = 0 + + for postcode, rows in postcode_to_addresses.items(): + logger.info(f"Processing postcode {postcode} with {len(rows)} rows") + # If postcode itself is larger than batch_size, send it individually + if len(rows) > batch_size: + # First, send the current batch if it has data + if batch_rows: + try: + send_to_address2uprn_queue( + task_id=str(task_id), + rows=batch_rows, + ) + logger.info( + f"Sent batch of {len(batch_rows)} rows to address2UPRN queue" + ) + batch_rows = [] + except Exception as e: + logger.error( + f"Failed to send batch to address2UPRN queue: {e}", + exc_info=True, + ) + errors.append( + { + "error": "Failed to send to address2UPRN queue", + "details": str(e), + } + ) + + # Send the large postcode on its own + try: + send_to_address2uprn_queue( + task_id=str(task_id), + rows=rows, + ) + logger.info( + f"Sent large postcode {postcode} ({len(rows)} rows) to address2UPRN queue" + ) + except Exception as e: + logger.error( + f"Failed to send large postcode to address2UPRN queue: {e}", + exc_info=True, + ) + errors.append( + { + "error": "Failed to send to address2UPRN queue", + "details": str(e), + } + ) + continue + + # If adding this postcode's rows would exceed batch_size, send current batch + current_batch_size = len(batch_rows) + len(rows) + if batch_rows and current_batch_size > batch_size: + logger.info( + f"Batch threshold reached: current {len(batch_rows)} + next postcode {len(rows)} = {current_batch_size} > {batch_size}" + ) try: send_to_address2uprn_queue( task_id=str(task_id), rows=batch_rows, ) logger.info( - f"Sent batch of {len(batch_rows)} rows to address2UPRN queue" + f"Sent batch of {len(batch_rows)} rows to address2UPRN queue (total sent: {total_sent})" ) + total_sent += len(batch_rows) batch_rows = [] except Exception as e: logger.error( @@ -207,42 +287,24 @@ def handler(event, context, local=False): } ) - # Send the large postcode on its own - try: - send_to_address2uprn_queue( - task_id=str(task_id), - rows=rows, - ) - logger.info( - f"Sent large postcode {postcode} ({len(rows)} rows) to address2UPRN queue" - ) - except Exception as e: - logger.error( - f"Failed to send large postcode to address2UPRN queue: {e}", - exc_info=True, - ) - errors.append( - { - "error": "Failed to send to address2UPRN queue", - "details": str(e), - } - ) - continue + # Add current postcode's rows to batch + batch_rows.extend(rows) - # If adding this postcode's rows would exceed batch_size, send current batch - if batch_rows and len(batch_rows) + len(rows) > batch_size: + # Send remaining batch + if batch_rows: try: send_to_address2uprn_queue( task_id=str(task_id), rows=batch_rows, ) + total_sent += len(batch_rows) logger.info( - f"Sent batch of {len(batch_rows)} rows to address2UPRN queue" + f"Sent final batch of {len(batch_rows)} rows to address2UPRN queue (total sent: {total_sent})" ) batch_rows = [] except Exception as e: logger.error( - f"Failed to send batch to address2UPRN queue: {e}", + f"Failed to send final batch to address2UPRN queue: {e}", exc_info=True, ) errors.append( @@ -252,31 +314,6 @@ def handler(event, context, local=False): } ) - # Add current postcode's rows to batch - batch_rows.extend(rows) - - # Send remaining batch - if batch_rows: - try: - send_to_address2uprn_queue( - task_id=str(task_id), - rows=batch_rows, - ) - logger.info( - f"Sent final batch of {len(batch_rows)} rows to address2UPRN queue" - ) - except Exception as e: - logger.error( - f"Failed to send final batch to address2UPRN queue: {e}", - exc_info=True, - ) - errors.append( - { - "error": "Failed to send to address2UPRN queue", - "details": str(e), - } - ) - except json.JSONDecodeError as e: logger.error(f"Invalid JSON in request body: {e}") errors.append({"error": "Invalid JSON in request body", "details": str(e)}) From 655d7dbd6ff432709e702a787a98dbd96c651d53 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 13:52:39 +0000 Subject: [PATCH 073/135] add more logic to batch and also missing libraries --- .../terraform/lambda/postcodeSplitter/variables.tf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/infrastructure/terraform/lambda/postcodeSplitter/variables.tf b/infrastructure/terraform/lambda/postcodeSplitter/variables.tf index 0c8ba5b2..7bd68543 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/variables.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/variables.tf @@ -27,3 +27,9 @@ output "resolved_image_uri" { + + + + + + From 9b414924d06876c24f7db2663556bd07325fd275 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 16:37:55 +0000 Subject: [PATCH 074/135] run this end to end --- backend/address2UPRN/main.py | 301 +++++++++++++++++++++++++-- sfr/principal_pitch/2_export_data.py | 30 ++- 2 files changed, 309 insertions(+), 22 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 777dde0e..0f735f2a 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -9,6 +9,8 @@ import re from typing import Set import json import requests +from uuid import UUID +from backend.app.db.functions.tasks.Tasks import SubTaskInterface logger = setup_logger() @@ -323,32 +325,41 @@ def get_uprn_candidates( ) -def get_uprn( +def get_uprn_with_epc_df( user_inputed_address: str, - postcode: str, + epc_df: pd.DataFrame, return_address=False, return_EPC=False, return_score=True, ): """ - Return uprn (str) - Return False if failed to find a sensible matching epc - Return Nons when epc found but no UPRN - """ - df = get_epc_data_with_postcode(postcode=postcode) + Return uprn (str) using a pre-fetched EPC dataframe. + This avoids calling the API multiple times for the same postcode. - if df.empty: + Args: + user_inputed_address: The user's address string + epc_df: Pre-fetched EPC data for the postcode + return_address: Whether to return the matched address + return_EPC: Whether to return the EPC rating + return_score: Whether to return the lexiscore + + Returns: + uprn (str), or tuple if return_address/return_EPC/return_score are True + Returns None if no match found, lexiscore < 0.7, or UPRN is empty + """ + if epc_df.empty: return None scored_df = get_uprn_candidates( - df, + epc_df, user_address=user_inputed_address, ) # Best score best_score = scored_df.iloc[0]["lexiscore"] - if best_score <= 0: + # Return None if score is below threshold + if best_score < 0.7: return None # All rank-1 rows (possible draw) @@ -386,6 +397,32 @@ def get_uprn( return found_uprn +def get_uprn( + user_inputed_address: str, + postcode: str, + return_address=False, + return_EPC=False, + return_score=True, +): + """ + Return uprn (str) + Return False if failed to find a sensible matching epc + Return None when epc found but no UPRN + + This function fetches EPC data via API for a single postcode. + For processing multiple addresses in the same postcode, use get_uprn_with_epc_df instead. + """ + df = get_epc_data_with_postcode(postcode=postcode) + + return get_uprn_with_epc_df( + user_inputed_address=user_inputed_address, + epc_df=df, + return_address=return_address, + return_EPC=return_EPC, + return_score=return_score, + ) + + def resolve_uprns_for_postcode_group( group_df: pd.DataFrame, epc_df: pd.DataFrame, @@ -508,20 +545,246 @@ def run_all_test(): ) -def handler(event, context): +def handler(event, context, local=False): print("=== Address2UPRN Lambda Handler ===") print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") + + # Handle local testing + if local is True: + event = { + "Records": [ + { + "body": json.dumps({ + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "rows": [ + { + "landlord_property_id": "00000002POR", + "UPRN": "766019911", + "Address 1": "9 Redland Way", + "Address 2": "Aylesbury Vale", + "postcode": "HP21 9RJ", + "landlord_property_type": "House", + "postcode_clean": "HP219RJ" + }, + { + "landlord_property_id": "00000003MTR", + "UPRN": "100120781544", + "Address 1": "16 Lime Crescent", + "Address 2": "BICESTER", + "postcode": "OX26 3XJ", + "landlord_property_type": "House", + "postcode_clean": "OX263XJ" + }, + { + "landlord_property_id": "00000004HBY", + "UPRN": "14033542", + "Address 1": "14 Dunbar Drive", + "Address 2": "Woodley", + "postcode": "RG5 4HA", + "landlord_property_type": "House", + "postcode_clean": "RG54HA" + } + ] + }) + } + ] + } + print(f"Event: {json.dumps(event, indent=2, default=str)}") - print(f"Context: {context}") print("===================================") - return {"statusCode": 200, "body": "hello world"} + # Handle both single event and batch events (SQS, etc.) + records = event.get("Records", [event]) + results = [] + errors = [] + subtask_interface = SubTaskInterface() -# TO do function dispatcher, + for record in records: + task_id = None + subtask_id = None + try: + # Parse body (inputs) + if isinstance(record.get("body"), str): + body = json.loads(record["body"]) + else: + body = record.get("body", {}) -# get_uprn_candidates(get_epc_data_with_postcode("E9 5NH"),"Flat 1, 5 Semley Gate" and Flat 5, 1 Semley Gate) -# fix that -# Look again at flat 1 -# pandas reader the seperate postcode_splitter -# dump into s3 + # Validate required fields + task_id = body.get("task_id") + rows = body.get("rows", []) + + if not task_id: + errors.append({"error": "Missing required field: task_id"}) + continue + + if not rows: + errors.append({"error": "Missing or empty rows data"}) + continue + + # Convert task_id to UUID + try: + task_id = UUID(task_id) if isinstance(task_id, str) else task_id + except ValueError as e: + errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"}) + continue + + # Create a subtask for this batch + subtask_id = subtask_interface.create_subtask( + task_id=task_id, inputs={"row_count": len(rows)} + ) + logger.info(f"Created subtask {subtask_id} for task {task_id} with {len(rows)} rows") + + # Process the rows + logger.info(f"Processing {len(rows)} rows for task {task_id}") + + # Convert rows to DataFrame + df = pd.DataFrame(rows) + + # Create user_input column by concatenating Address 1 and Address 2 + df["user_input"] = (df["Address 1"].fillna("") + " " + df["Address 2"].fillna("")).str.strip() + logger.info(f"Created user_input column from Address 1 and Address 2") + + clean_df = df.dropna(subset=["postcode_clean"]) + + postcode_to_addresses = { + postcode: group.to_dict(orient="records") + for postcode, group in clean_df.groupby("postcode_clean", sort=False) + } + + logger.info(f"Total postcodes: {len(postcode_to_addresses)}") + + # Process each postcode group + postcodes_processed = 0 + addresses_processed = 0 + uprns_found = 0 + results_data = [] + + for postcode, postcode_rows in postcode_to_addresses.items(): + logger.info(f"Processing postcode: {postcode} with {len(postcode_rows)} rows") + + # Validate postcode before processing + if not is_valid_postcode(postcode): + logger.warning(f"Postcode {postcode} is invalid, skipping") + continue + + # Fetch EPC data once per postcode + try: + epc_df = get_epc_data_with_postcode(postcode=postcode) + logger.info(f"Fetched {len(epc_df)} EPC records for postcode {postcode}") + except Exception as e: + logger.error(f"Failed to fetch EPC data for postcode {postcode}: {e}") + continue + + # Process each address in this postcode with the same EPC data + for row in postcode_rows: + try: + user_input = row.get("user_input", "") + if not user_input: + logger.warning(f"Skipping row with missing user_input for postcode {postcode}") + continue + + # Get UPRN using the pre-fetched EPC data with all return options + result = get_uprn_with_epc_df( + user_inputed_address=user_input, + epc_df=epc_df, + return_address=True, + return_EPC=True, + return_score=True + ) + + # Parse result tuple if successful + if result: + uprn, found_address, epc, score = result + uprns_found += 1 + logger.info(f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})") + + results_data.append({ + **row, # Include all original data + "found_uprn": uprn, + "found_address": found_address, + "epc_rating": epc, + "lexiscore": score + }) + else: + logger.warning(f"No UPRN found for {user_input} in {postcode}") + results_data.append({ + **row, # Include all original data + "found_uprn": None, + "found_address": None, + "epc_rating": None, + "lexiscore": None + }) + + addresses_processed += 1 + + except Exception as e: + logger.error(f"Error processing address {row.get('user_input', 'unknown')}: {e}") + # Still add the row with error markers + results_data.append({ + **row, + "found_uprn": None, + "found_address": None, + "epc_rating": None, + "score": None, + "error": str(e) + }) + continue + + postcodes_processed += 1 + + # Create results DataFrame + result_df = pd.DataFrame(results_data) + logger.info(f"Created results DataFrame with {len(result_df)} rows") + + results.append({ + "subtask_id": str(subtask_id), + "rows_processed": len(rows), + "postcodes_processed": postcodes_processed, + "addresses_processed": addresses_processed, + "uprns_found": uprns_found, + "status": "processed" + }) + + # Mark subtask as completed + try: + subtask_interface.update_subtask_status( + subtask_id, "completed", outputs={"rows_processed": len(rows)} + ) + logger.info(f"Marked subtask {subtask_id} as completed") + except Exception as db_error: + logger.error(f"Failed to mark subtask as completed: {db_error}") + + except json.JSONDecodeError as e: + logger.error(f"Invalid JSON in request body: {e}") + errors.append({"error": "Invalid JSON in request body", "details": str(e)}) + # Mark subtask as failed if we have one + if subtask_id: + try: + subtask_interface.update_subtask_status( + subtask_id, "failed", outputs={"error": str(e)} + ) + except Exception as db_error: + logger.error(f"Failed to update subtask status: {db_error}") + except Exception as e: + logger.error(f"Unexpected error processing record: {e}", exc_info=True) + errors.append({"error": "Unexpected error", "details": str(e)}) + # Mark subtask as failed if we have one + if subtask_id: + try: + subtask_interface.update_subtask_status( + subtask_id, "failed", outputs={"error": str(e)} + ) + except Exception as db_error: + logger.error(f"Failed to update subtask status: {db_error}") + + # Return error if all records failed + if errors and not results: + return {"statusCode": 500, "body": json.dumps({"errors": errors})} + + return { + "statusCode": 200, + "body": json.dumps( + {"processed": results, "errors": errors if errors else None} + ), + } diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index 1841cf3f..9470710d 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -28,12 +28,12 @@ from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 544 +PORTFOLIO_ID = 476 SCENARIOS = [ - 1027, + 953, ] scenario_names = { - 1027: "EPC C", + 953: "All Properties, Most Economic", } project_name = "manchester" @@ -330,6 +330,30 @@ for scenario_id in SCENARIOS: df[df["predicted_post_works_sap"] == ""] + # Expected columns list + expected_columns = [ + "suspended_floor_insulation", + "solid_floor_insulation", + "external_wall_insulation", + "internal_wall_insulation", + "cavity_wall_insulation", + "loft_insulation", + "flat_roof_insulation", + "room_roof_insulation", + "secondary_glazing", + "double_glazing", + "solar_pv", + "high_heat_retention_storage_heaters", + "air_source_heat_pump", + "boiler_upgrade", + "roomstat_programmer_trvs", + "time_temperature_zone_control", + ] + # Add missing columns with default values + for col in expected_columns: + if col not in df.columns: + df[col] = "" + # Create excel to store to filename = f"{scenario_names[scenario_id]} - {project_name}.xlsx" with pd.ExcelWriter(filename) as writer: From 762dccde01761b6c026dc83820a65e2279ac4d1b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 16:44:08 +0000 Subject: [PATCH 075/135] run this end to end --- backend/address2UPRN/main.py | 179 +++++++++++------- .../modules/s3_iam_policy/variables.tf | 3 + 2 files changed, 109 insertions(+), 73 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 0f735f2a..6841d6a6 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -555,38 +555,40 @@ def handler(event, context, local=False): event = { "Records": [ { - "body": json.dumps({ - "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - "rows": [ - { - "landlord_property_id": "00000002POR", - "UPRN": "766019911", - "Address 1": "9 Redland Way", - "Address 2": "Aylesbury Vale", - "postcode": "HP21 9RJ", - "landlord_property_type": "House", - "postcode_clean": "HP219RJ" - }, - { - "landlord_property_id": "00000003MTR", - "UPRN": "100120781544", - "Address 1": "16 Lime Crescent", - "Address 2": "BICESTER", - "postcode": "OX26 3XJ", - "landlord_property_type": "House", - "postcode_clean": "OX263XJ" - }, - { - "landlord_property_id": "00000004HBY", - "UPRN": "14033542", - "Address 1": "14 Dunbar Drive", - "Address 2": "Woodley", - "postcode": "RG5 4HA", - "landlord_property_type": "House", - "postcode_clean": "RG54HA" - } - ] - }) + "body": json.dumps( + { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "rows": [ + { + "landlord_property_id": "00000002POR", + "UPRN": "766019911", + "Address 1": "9 Redland Way", + "Address 2": "Aylesbury Vale", + "postcode": "HP21 9RJ", + "landlord_property_type": "House", + "postcode_clean": "HP219RJ", + }, + { + "landlord_property_id": "00000003MTR", + "UPRN": "100120781544", + "Address 1": "16 Lime Crescent", + "Address 2": "BICESTER", + "postcode": "OX26 3XJ", + "landlord_property_type": "House", + "postcode_clean": "OX263XJ", + }, + { + "landlord_property_id": "00000004HBY", + "UPRN": "14033542", + "Address 1": "14 Dunbar Drive", + "Address 2": "Woodley", + "postcode": "RG5 4HA", + "landlord_property_type": "House", + "postcode_clean": "RG54HA", + }, + ], + } + ) } ] } @@ -633,7 +635,9 @@ def handler(event, context, local=False): subtask_id = subtask_interface.create_subtask( task_id=task_id, inputs={"row_count": len(rows)} ) - logger.info(f"Created subtask {subtask_id} for task {task_id} with {len(rows)} rows") + logger.info( + f"Created subtask {subtask_id} for task {task_id} with {len(rows)} rows" + ) # Process the rows logger.info(f"Processing {len(rows)} rows for task {task_id}") @@ -642,11 +646,13 @@ def handler(event, context, local=False): df = pd.DataFrame(rows) # Create user_input column by concatenating Address 1 and Address 2 - df["user_input"] = (df["Address 1"].fillna("") + " " + df["Address 2"].fillna("")).str.strip() + df["user_input"] = ( + df["Address 1"].fillna("") + " " + df["Address 2"].fillna("") + ).str.strip() logger.info(f"Created user_input column from Address 1 and Address 2") clean_df = df.dropna(subset=["postcode_clean"]) - + postcode_to_addresses = { postcode: group.to_dict(orient="records") for postcode, group in clean_df.groupby("postcode_clean", sort=False) @@ -661,7 +667,9 @@ def handler(event, context, local=False): results_data = [] for postcode, postcode_rows in postcode_to_addresses.items(): - logger.info(f"Processing postcode: {postcode} with {len(postcode_rows)} rows") + logger.info( + f"Processing postcode: {postcode} with {len(postcode_rows)} rows" + ) # Validate postcode before processing if not is_valid_postcode(postcode): @@ -671,9 +679,13 @@ def handler(event, context, local=False): # Fetch EPC data once per postcode try: epc_df = get_epc_data_with_postcode(postcode=postcode) - logger.info(f"Fetched {len(epc_df)} EPC records for postcode {postcode}") + logger.info( + f"Fetched {len(epc_df)} EPC records for postcode {postcode}" + ) except Exception as e: - logger.error(f"Failed to fetch EPC data for postcode {postcode}: {e}") + logger.error( + f"Failed to fetch EPC data for postcode {postcode}: {e}" + ) continue # Process each address in this postcode with the same EPC data @@ -681,7 +693,9 @@ def handler(event, context, local=False): try: user_input = row.get("user_input", "") if not user_input: - logger.warning(f"Skipping row with missing user_input for postcode {postcode}") + logger.warning( + f"Skipping row with missing user_input for postcode {postcode}" + ) continue # Get UPRN using the pre-fetched EPC data with all return options @@ -690,45 +704,57 @@ def handler(event, context, local=False): epc_df=epc_df, return_address=True, return_EPC=True, - return_score=True + return_score=True, ) # Parse result tuple if successful if result: uprn, found_address, epc, score = result uprns_found += 1 - logger.info(f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})") + logger.info( + f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})" + ) - results_data.append({ - **row, # Include all original data - "found_uprn": uprn, - "found_address": found_address, - "epc_rating": epc, - "lexiscore": score - }) + results_data.append( + { + **row, # Include all original data + "found_uprn": uprn, + "found_address": found_address, + "epc_rating": epc, + "lexiscore": score, + } + ) else: - logger.warning(f"No UPRN found for {user_input} in {postcode}") - results_data.append({ - **row, # Include all original data - "found_uprn": None, - "found_address": None, - "epc_rating": None, - "lexiscore": None - }) + logger.warning( + f"No UPRN found for {user_input} in {postcode}" + ) + results_data.append( + { + **row, # Include all original data + "found_uprn": None, + "found_address": None, + "epc_rating": None, + "lexiscore": None, + } + ) addresses_processed += 1 except Exception as e: - logger.error(f"Error processing address {row.get('user_input', 'unknown')}: {e}") + logger.error( + f"Error processing address {row.get('user_input', 'unknown')}: {e}" + ) # Still add the row with error markers - results_data.append({ - **row, - "found_uprn": None, - "found_address": None, - "epc_rating": None, - "score": None, - "error": str(e) - }) + results_data.append( + { + **row, + "found_uprn": None, + "found_address": None, + "epc_rating": None, + "score": None, + "error": str(e), + } + ) continue postcodes_processed += 1 @@ -737,14 +763,16 @@ def handler(event, context, local=False): result_df = pd.DataFrame(results_data) logger.info(f"Created results DataFrame with {len(result_df)} rows") - results.append({ - "subtask_id": str(subtask_id), - "rows_processed": len(rows), - "postcodes_processed": postcodes_processed, - "addresses_processed": addresses_processed, - "uprns_found": uprns_found, - "status": "processed" - }) + results.append( + { + "subtask_id": str(subtask_id), + "rows_processed": len(rows), + "postcodes_processed": postcodes_processed, + "addresses_processed": addresses_processed, + "uprns_found": uprns_found, + "status": "processed", + } + ) # Mark subtask as completed try: @@ -788,3 +816,8 @@ def handler(event, context, local=False): {"processed": results, "errors": errors if errors else None} ), } + + +# TODO: +# Don't add results to return messages as its too verbose +# capture the exepection as e, into s3, to find the logs go to s3 diff --git a/infrastructure/terraform/modules/s3_iam_policy/variables.tf b/infrastructure/terraform/modules/s3_iam_policy/variables.tf index ed53ea1f..e2b3d7a8 100644 --- a/infrastructure/terraform/modules/s3_iam_policy/variables.tf +++ b/infrastructure/terraform/modules/s3_iam_policy/variables.tf @@ -37,3 +37,6 @@ variable "tags" { type = map(string) default = {} } + + + From 538f207d2f4d5950d9a14b53bb0f28a27211ff13 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 16:57:27 +0000 Subject: [PATCH 076/135] env variables added --- .github/workflows/deploy_terraform.yml | 7 +++ backend/address2UPRN/handler/Dockerfile | 19 ++++++-- backend/address2UPRN/main.py | 1 + .../terraform/lambda/address2UPRN/main.tf | 43 ++++++++++++++++--- 4 files changed, 62 insertions(+), 8 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 514fc7af..20242ec8 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -90,10 +90,17 @@ jobs: ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} dockerfile_path: backend/address2UPRN/handler/Dockerfile build_context: . + build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} # ============================================================ # 3️⃣ Deploy Address 2 UPRN Lambda diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index d01550a2..419b4d66 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -1,6 +1,16 @@ FROM public.ecr.aws/lambda/python:3.10 # FROM python:3.11.10-bullseye + +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV DB_HOST=${DEV_DB_HOST} +ENV DB_PORT=${DEV_DB_PORT} +ENV DB_NAME=${DEV_DB_NAME} + + # Set working directory (Lambda task root) WORKDIR /var/task @@ -13,10 +23,13 @@ COPY backend/address2UPRN/handler/requirements.txt . # Install dependencies into Lambda runtime RUN pip install --no-cache-dir -r requirements.txt -# ----------------------------- -# Copy application code -# ----------------------------- + +# Copy necessary files for database and utility imports COPY utils/ utils/ +COPY backend/ backend/ +COPY datatypes/ datatypes/ + +# Copy the handler COPY backend/address2UPRN/main.py . # ----------------------------- diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 6841d6a6..d361db15 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -821,3 +821,4 @@ def handler(event, context, local=False): # TODO: # Don't add results to return messages as its too verbose # capture the exepection as e, into s3, to find the logs go to s3 +# Upload results to s3 as well as csv diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index 46b193f2..4a82d634 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -1,3 +1,19 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" # TODO: dont hardcode this + region = "eu-west-2" + } +} +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + module "address2uprn" { source = "../modules/lambda_with_sqs" @@ -6,9 +22,26 @@ module "address2uprn" { image_uri = local.image_uri - - environment = { - STAGE = var.stage - LOG_LEVEL = "info" - } + environment = merge( + { + STAGE = var.stage + LOG_LEVEL = "info" + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + GOOGLE_SOLAR_API_KEY = "test" + SAP_PREDICTIONS_BUCKET = "test" + CARBON_PREDICTIONS_BUCKET = "test" + HEAT_PREDICTIONS_BUCKET = "test" + HEATING_KWH_PREDICTIONS_BUCKET = "test" + HOTWATER_KWH_PREDICTIONS_BUCKET = "test" + API_KEY = "test" + ENVIRONMENT = "test" + SECRET_KEY = "test" + PLAN_TRIGGER_BUCKET = "test" + DATA_BUCKET = "test" + EPC_AUTH_TOKEN = "test" + ENGINE_SQS_URL = "test" + ENERGY_ASSESSMENTS_BUCKET = "test" + }, + ) } From a7509aecdc827806d4ed092f4788912c45001eae Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 16:59:57 +0000 Subject: [PATCH 077/135] added very serious logs --- backend/address2UPRN/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index d361db15..2cec8a2e 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -807,6 +807,7 @@ def handler(event, context, local=False): logger.error(f"Failed to update subtask status: {db_error}") # Return error if all records failed + logger.fatal(results) if errors and not results: return {"statusCode": 500, "body": json.dumps({"errors": errors})} From 3ee12c5f0ede5b6a6b0af0fe6c825826b429b5ba Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:00:09 +0000 Subject: [PATCH 078/135] redploy --- .github/workflows/deploy_terraform.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 20242ec8..ebdeb32d 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -201,4 +201,7 @@ jobs: secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} - AWS_REGION: ${{ secrets.DEV_AWS_REGION }} \ No newline at end of file + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + + \ No newline at end of file From d4fcf0c6cd309b4674638128af4cf1744c2979b3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:06:41 +0000 Subject: [PATCH 079/135] add requirements --- .github/workflows/deploy_terraform.yml | 3 +++ backend/address2UPRN/handler/requirements.txt | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index ebdeb32d..8a889833 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -204,4 +204,7 @@ jobs: AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + + \ No newline at end of file diff --git a/backend/address2UPRN/handler/requirements.txt b/backend/address2UPRN/handler/requirements.txt index eba2c846..6ef41b2d 100644 --- a/backend/address2UPRN/handler/requirements.txt +++ b/backend/address2UPRN/handler/requirements.txt @@ -4,3 +4,8 @@ requests tqdm openpyxl epc-api-python==1.0.2 +boto3==1.35.44 +sqlmodel +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 \ No newline at end of file From 47c14e798c10c67a3ecbc17e6526ff3c70f28778 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:20:32 +0000 Subject: [PATCH 080/135] add epc auth token --- .github/workflows/_build_image.yml | 3 +++ .github/workflows/deploy_terraform.yml | 3 ++- infrastructure/terraform/lambda/address2UPRN/main.tf | 1 - 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml index 641e31f9..a5e16a51 100644 --- a/.github/workflows/_build_image.yml +++ b/.github/workflows/_build_image.yml @@ -38,6 +38,8 @@ on: required: false DEV_DB_NAME: required: false + EPC_AUTH_TOKEN: + required: false jobs: build: @@ -47,6 +49,7 @@ jobs: DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + EPC_AUTH_TOKEN: ${{ secrets.EPC_AUTH_TOKEN }} outputs: image_digest: ${{ steps.digest.outputs.image_digest }} diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 8a889833..c089d0c5 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -94,6 +94,7 @@ jobs: DEV_DB_HOST=$DEV_DB_HOST DEV_DB_PORT=$DEV_DB_PORT DEV_DB_NAME=$DEV_DB_NAME + EPC_AUTH_TOKEN=$EPC_AUTH_TOKEN secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} @@ -101,6 +102,7 @@ jobs: DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} # ============================================================ # 3️⃣ Deploy Address 2 UPRN Lambda @@ -207,4 +209,3 @@ jobs: - \ No newline at end of file diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index 4a82d634..caf06785 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -39,7 +39,6 @@ module "address2uprn" { SECRET_KEY = "test" PLAN_TRIGGER_BUCKET = "test" DATA_BUCKET = "test" - EPC_AUTH_TOKEN = "test" ENGINE_SQS_URL = "test" ENERGY_ASSESSMENTS_BUCKET = "test" }, From c3ff4c9d6b5f14eec9a8adf904875e7e5f91b250 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:21:12 +0000 Subject: [PATCH 081/135] add epc auth token --- backend/address2UPRN/handler/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index 419b4d66..155c37ad 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -5,10 +5,12 @@ FROM public.ecr.aws/lambda/python:3.10 ARG DEV_DB_HOST ARG DEV_DB_PORT ARG DEV_DB_NAME +ARG EPC_AUTH_TOKEN ENV DB_HOST=${DEV_DB_HOST} ENV DB_PORT=${DEV_DB_PORT} ENV DB_NAME=${DEV_DB_NAME} +ENV EPC_AUTH_TOKEN=${EPC_AUTH_TOKEN}} # Set working directory (Lambda task root) From 6618eafa8ccf9098992c09950127e7d68be534bb Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:22:24 +0000 Subject: [PATCH 082/135] additional bracket removed --- backend/address2UPRN/handler/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index 155c37ad..07159357 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -10,7 +10,7 @@ ARG EPC_AUTH_TOKEN ENV DB_HOST=${DEV_DB_HOST} ENV DB_PORT=${DEV_DB_PORT} ENV DB_NAME=${DEV_DB_NAME} -ENV EPC_AUTH_TOKEN=${EPC_AUTH_TOKEN}} +ENV EPC_AUTH_TOKEN=${EPC_AUTH_TOKEN} # Set working directory (Lambda task root) From d4cd63d749785b003bf9da2558aaa7cd1647a40e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:22:33 +0000 Subject: [PATCH 083/135] additional bracket removed --- .github/workflows/deploy_terraform.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index c089d0c5..c5ed7e93 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -209,3 +209,7 @@ jobs: + + + + From e7691570fdf5ae1cd5651001bc310e180473ecd3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:42:30 +0000 Subject: [PATCH 084/135] merge --- .github/workflows/deploy_terraform.yml | 3 +++ backend/address2UPRN/main.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index c5ed7e93..122fb2e1 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -213,3 +213,6 @@ jobs: + + + diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 2cec8a2e..7e001b8d 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -807,7 +807,7 @@ def handler(event, context, local=False): logger.error(f"Failed to update subtask status: {db_error}") # Return error if all records failed - logger.fatal(results) + logger.info(results) if errors and not results: return {"statusCode": 500, "body": json.dumps({"errors": errors})} From b1164ffd90b89b054e05d4755408b77da501cfb2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:50:47 +0000 Subject: [PATCH 085/135] get rid of local --- backend/address2UPRN/main.py | 7 ++++--- backend/postcode_splitter/main.py | 7 +++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 7e001b8d..812b9206 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -358,9 +358,9 @@ def get_uprn_with_epc_df( # Best score best_score = scored_df.iloc[0]["lexiscore"] - # Return None if score is below threshold - if best_score < 0.7: - return None + # # Return None if score is below threshold + # if best_score < 0.7: + # return None # All rank-1 rows (possible draw) top_rank_df = scored_df[scored_df["lexirank"] == 1] @@ -807,6 +807,7 @@ def handler(event, context, local=False): logger.error(f"Failed to update subtask status: {db_error}") # Return error if all records failed + logger.info(results_data) logger.info(results) if errors and not results: return {"statusCode": 500, "body": json.dumps({"errors": errors})} diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index eb7cf044..943435b9 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -162,7 +162,8 @@ def handler(event, context, local=False): csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) # just do 5 well we are testing, sqs connection - df = df.head(5) + if local: + df = df.head(5) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") # Sanitise postcodes @@ -193,7 +194,9 @@ def handler(event, context, local=False): task_id=str(task_id), rows=all_rows, ) - logger.info(f"Sent all {len(all_rows)} rows in single batch to address2UPRN queue") + logger.info( + f"Sent all {len(all_rows)} rows in single batch to address2UPRN queue" + ) except Exception as e: logger.error( f"Failed to send all rows to address2UPRN queue: {e}", From c9ec097a438b8b8a49b5d9bfcdf23f0d5b9e138d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 17:55:43 +0000 Subject: [PATCH 086/135] pr review --- .github/workflows/deploy_terraform.yml | 18 ++---------------- backend/address2UPRN/main.py | 1 - 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 122fb2e1..da98f4d9 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -116,8 +116,7 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.address2uprn_image.outputs.image_digest }} - # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} - terraform_apply: 'true' + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} @@ -158,8 +157,7 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }} - # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} - terraform_apply: 'true' + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} @@ -204,15 +202,3 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} - - - - - - - - - - - - diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 812b9206..8d1ba21d 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -17,7 +17,6 @@ logger = setup_logger() EPC_AUTH_TOKEN = os.getenv( "EPC_AUTH_TOKEN", - "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=", ) if EPC_AUTH_TOKEN is None: From 958ab72e0acefcca541559f8608ed3252c21d7eb Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 12:24:47 +0000 Subject: [PATCH 087/135] deploy to main with new policy --- backend/address2UPRN/main.py | 51 ++++++++++++++++++++++++- backend/postcode_splitter/main.py | 6 +++ infrastructure/terraform/shared/main.tf | 15 ++++++++ utils/s3.py | 1 - 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 8d1ba21d..0aedd082 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -10,11 +10,13 @@ from typing import Set import json import requests from uuid import UUID +import uuid from backend.app.db.functions.tasks.Tasks import SubTaskInterface +from utils.s3 import save_csv_to_s3 +from datetime import datetime logger = setup_logger() - EPC_AUTH_TOKEN = os.getenv( "EPC_AUTH_TOKEN", ) @@ -502,6 +504,46 @@ def resolve_uprns_for_postcode_group( ) +def save_results_to_s3( + results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None +) -> bool: + """ + Save results DataFrame to S3 as CSV. + + :param results_df: The DataFrame containing results + :param task_id: The task ID (used for file naming) + :param bucket_name: The S3 bucket name (defaults to env variable) + :return: True if successful, False otherwise + """ + if bucket_name is None: + bucket_name = os.getenv("S3_BUCKET_NAME") + + if not bucket_name: + logger.error( + "S3 bucket name not provided and S3_BUCKET_NAME environment variable not set" + ) + return False + + try: + # Create a filename with the task ID + file_name = f"{datetime.now().isoformat()}_{str(uuid.uuid4())[:8]}" + file_key = f"ara_raw_outputs/{task_id}/{sub_task_id}/{file_name}.csv" + + # Save to S3 + success = save_csv_to_s3(results_df, bucket_name, file_key) + + if success: + logger.info(f"Successfully saved results to s3://{bucket_name}/{file_key}") + return True + else: + logger.error(f"Failed to save results to S3") + return False + + except Exception as e: + logger.error(f"Error saving results to S3: {str(e)}") + return False + + def test(a, b): assert a == b, f"erorr: {a}{type(a)} != {b}: {type(b)}" @@ -760,7 +802,12 @@ def handler(event, context, local=False): # Create results DataFrame result_df = pd.DataFrame(results_data) - logger.info(f"Created results DataFrame with {len(result_df)} rows") + + # Save results to S3 + try: + save_results_to_s3(result_df, str(task_id), str(subtask_id)) + except Exception as s3_error: + logger.error(f"Failed to save results to S3: {s3_error}") results.append( { diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 943435b9..73a79d2c 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -164,6 +164,12 @@ def handler(event, context, local=False): # just do 5 well we are testing, sqs connection if local: df = df.head(5) + + # TODO: DELETE ME, if you see this in the PR. + # TODO: DELETE ME, if you see this in the PR. + # TODO: DELETE ME, if you see this in the PR. + df = df.head(5) + logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") # Sanitise postcodes diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 5e189dc9..4ec57c3e 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -305,6 +305,21 @@ module "address2uprn_registry" { } +# S3 policy for postcode splitter to read from retrofit data bucket +module "address2uprn_s3_read_and_write" { + source = "../modules/s3_iam_policy" + + policy_name = "Address2UPRNReadandWriteS3" + policy_description = "Allow address2uprn Lambda to read and write from retrofit-data bucket" + bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] + actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"] + resource_paths = ["/*"] +} + +output "postcode_splitter_s3_read_arn" { + value = module.postcode_splitter_s3_read.policy_arn +} + ################################################ # Condition ETL – Lambda ECR ################################################ diff --git a/utils/s3.py b/utils/s3.py index 2e67d4f0..0e79c26b 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -8,7 +8,6 @@ from botocore.exceptions import NoCredentialsError, PartialCredentialsError logger = setup_logger() - def read_from_s3(bucket_name, s3_file_name): """ Read an object from s3. Decoding of the data is left for outside of this function From d9708fe516b276b931f45f5f4da6251ae3afab22 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 12:30:28 +0000 Subject: [PATCH 088/135] push policy --- infrastructure/terraform/lambda/address2UPRN/main.tf | 6 ++++++ infrastructure/terraform/shared/main.tf | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index caf06785..12f0a4b3 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -44,3 +44,9 @@ module "address2uprn" { }, ) } + +# Attach S3 read policy to the Lambda execution role +resource "aws_iam_role_policy_attachment" "address2uprn_read_and_write" { + role = module.lambda.role_name + policy_arn = data.terraform_remote_state.shared.outputs.address_2_uprn_s3_read_and_write_arn +} \ No newline at end of file diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 4ec57c3e..9733f5f9 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -316,8 +316,8 @@ module "address2uprn_s3_read_and_write" { resource_paths = ["/*"] } -output "postcode_splitter_s3_read_arn" { - value = module.postcode_splitter_s3_read.policy_arn +output "address_2_uprn_s3_read_and_write_arn" { + value = module.address2uprn_s3_read_and_write.policy_arn } ################################################ From 37c89fb6ef35e6db86440c025b610ddc695c24c1 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 12:34:58 +0000 Subject: [PATCH 089/135] address2uprn --- infrastructure/terraform/lambda/address2UPRN/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index 12f0a4b3..a6f56074 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -47,6 +47,6 @@ module "address2uprn" { # Attach S3 read policy to the Lambda execution role resource "aws_iam_role_policy_attachment" "address2uprn_read_and_write" { - role = module.lambda.role_name + role = module.address2uprn.role_name policy_arn = data.terraform_remote_state.shared.outputs.address_2_uprn_s3_read_and_write_arn } \ No newline at end of file From d7a76821457104071fdf1addd2f0910d0a850fa3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 12:40:39 +0000 Subject: [PATCH 090/135] terraform version --- .github/workflows/deploy_terraform.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index da98f4d9..e8e82edf 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -116,7 +116,8 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.address2uprn_image.outputs.image_digest }} - terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + terraform_apply: 'true' secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} @@ -157,7 +158,8 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }} - terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + terraform_apply: 'true' secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} From f296a865ff9416d315759ea7416d29e35ad30600 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 13:04:40 +0000 Subject: [PATCH 091/135] added s3 bucket name --- infrastructure/terraform/lambda/address2UPRN/main.tf | 1 + infrastructure/terraform/lambda/postcodeSplitter/main.tf | 1 + 2 files changed, 2 insertions(+) diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index a6f56074..79e2bb2f 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -41,6 +41,7 @@ module "address2uprn" { DATA_BUCKET = "test" ENGINE_SQS_URL = "test" ENERGY_ASSESSMENTS_BUCKET = "test" + S3_BUCKET_NAME = data.terraform_remote_state.retrofit_sap_data.outputs.bucket_name }, ) } diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index 81120772..78d927d3 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -55,6 +55,7 @@ module "lambda" { ENGINE_SQS_URL = "test" ENERGY_ASSESSMENTS_BUCKET = "test" ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url + S3_BUCKET_NAME = "retrofit-data-dev" # Hardcoded as deployed via serverless i believe }, ) } From 1bf322005c0599067fa2f41aa3707230f3167d7f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 13:55:03 +0000 Subject: [PATCH 092/135] added outputs --- infrastructure/terraform/lambda/address2UPRN/main.tf | 2 +- infrastructure/terraform/shared/main.tf | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index 79e2bb2f..5f0c4a11 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -41,7 +41,7 @@ module "address2uprn" { DATA_BUCKET = "test" ENGINE_SQS_URL = "test" ENERGY_ASSESSMENTS_BUCKET = "test" - S3_BUCKET_NAME = data.terraform_remote_state.retrofit_sap_data.outputs.bucket_name + S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name }, ) } diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 9733f5f9..eb2a679d 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -133,6 +133,11 @@ module "retrofit_sap_data" { allowed_origins = var.allowed_origins } +output "retrofit_sap_data_bucket_name" { + value = module.retrofit_sap_data.bucket_name + description = "Name of the retrofit SAP data bucket" +} + module "retrofit_carbon_predictions" { source = "../modules/s3" bucketname = "retrofit-carbon-predictions-${var.stage}" From 3bdd4a4a97efc87fc24eeded8e6f3a2f58cf70f6 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 15:03:38 +0000 Subject: [PATCH 093/135] test first with just 5 --- .devcontainer/backend/Dockerfile | 2 + .devcontainer/backend/devcontainer.json | 3 +- backend/address2UPRN/main.py | 52 ++++++++----------------- 3 files changed, 20 insertions(+), 37 deletions(-) diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile index 99cd66d6..f48fb99f 100644 --- a/.devcontainer/backend/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -3,6 +3,8 @@ FROM python:3.11.10-bullseye ARG USER=vscode ARG DEBIAN_FRONTEND=noninteractive +ARG DOCKER_GID=1003 + # 1) Toolchain + utilities for building libpostal RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index 6e2edc93..73348c4d 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -6,7 +6,8 @@ "workspaceFolder": "/workspaces/model", "postStartCommand": "bash .devcontainer/backend/post-install.sh", "mounts": [ - "source=${localEnv:HOME},target=/home/vscode,type=bind" + "source=${localEnv:HOME},target=/home/vscode,type=bind", + "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ], "customizations": { "vscode": { diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 0aedd082..e635b305 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -329,9 +329,6 @@ def get_uprn_candidates( def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, - return_address=False, - return_EPC=False, - return_score=True, ): """ Return uprn (str) using a pre-fetched EPC dataframe. @@ -371,8 +368,6 @@ def get_uprn_with_epc_df( return None address = top_rank_df["address"].values[0] - lexiscore = float(top_rank_df["lexiscore"].values[0]) - epc = top_rank_df["current-energy-efficiency"].values[0] score = float(top_rank_df["lexiscore"].values[0]) # logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}") @@ -382,20 +377,7 @@ def get_uprn_with_epc_df( if found_uprn == "": return None - if return_address: - if return_EPC is False: - return found_uprn, address - else: - if return_score is False: - return found_uprn, address, epc - else: - return ( - found_uprn, - address, - epc, - score, - ) - return found_uprn + return (found_uprn, address, score) def get_uprn( @@ -688,7 +670,11 @@ def handler(event, context, local=False): # Create user_input column by concatenating Address 1 and Address 2 df["user_input"] = ( - df["Address 1"].fillna("") + " " + df["Address 2"].fillna("") + df["Address 1"].fillna("") + + " " + + df["Address 2"].fillna("") + + " " + + df["Address 3"].fillna("") ).str.strip() logger.info(f"Created user_input column from Address 1 and Address 2") @@ -743,14 +729,11 @@ def handler(event, context, local=False): result = get_uprn_with_epc_df( user_inputed_address=user_input, epc_df=epc_df, - return_address=True, - return_EPC=True, - return_score=True, ) # Parse result tuple if successful if result: - uprn, found_address, epc, score = result + uprn, found_address, score = result uprns_found += 1 logger.info( f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})" @@ -759,10 +742,9 @@ def handler(event, context, local=False): results_data.append( { **row, # Include all original data - "found_uprn": uprn, - "found_address": found_address, - "epc_rating": epc, - "lexiscore": score, + "uprn": uprn, + "domna_found_address": found_address, + "domna_lexiscore": score, } ) else: @@ -772,10 +754,9 @@ def handler(event, context, local=False): results_data.append( { **row, # Include all original data - "found_uprn": None, - "found_address": None, - "epc_rating": None, - "lexiscore": None, + "uprn": None, + "domna_found_address": None, + "domna_lexiscore": None, } ) @@ -789,10 +770,9 @@ def handler(event, context, local=False): results_data.append( { **row, - "found_uprn": None, - "found_address": None, - "epc_rating": None, - "score": None, + "uprn": None, + "domna_found_address": None, + "domna_lexiscore": None, "error": str(e), } ) From c2f29e86dfd5658dd6979b4da0b91a541814ff00 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 15:11:20 +0000 Subject: [PATCH 094/135] made tests pass and redploy --- .github/workflows/deploy_terraform.yml | 3 +++ backend/address2UPRN/main.py | 17 ++++++++--------- backend/postcode_splitter/main.py | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index e8e82edf..90595632 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -204,3 +204,6 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + + diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index e635b305..f4aa0dc9 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -329,6 +329,7 @@ def get_uprn_candidates( def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, + verbose=False, ): """ Return uprn (str) using a pre-fetched EPC dataframe. @@ -377,15 +378,16 @@ def get_uprn_with_epc_df( if found_uprn == "": return None - return (found_uprn, address, score) + if verbose: + return (found_uprn, address, score) + else: + return found_uprn def get_uprn( user_inputed_address: str, postcode: str, - return_address=False, - return_EPC=False, - return_score=True, + verbose=False, ): """ Return uprn (str) @@ -400,9 +402,7 @@ def get_uprn( return get_uprn_with_epc_df( user_inputed_address=user_inputed_address, epc_df=df, - return_address=return_address, - return_EPC=return_EPC, - return_score=return_score, + verbose=verbose, ) @@ -727,8 +727,7 @@ def handler(event, context, local=False): # Get UPRN using the pre-fetched EPC data with all return options result = get_uprn_with_epc_df( - user_inputed_address=user_input, - epc_df=epc_df, + user_inputed_address=user_input, epc_df=epc_df, verbose=True ) # Parse result tuple if successful diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 73a79d2c..8c0048e2 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -168,7 +168,7 @@ def handler(event, context, local=False): # TODO: DELETE ME, if you see this in the PR. # TODO: DELETE ME, if you see this in the PR. # TODO: DELETE ME, if you see this in the PR. - df = df.head(5) + df = df.head(1983) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") From c4e30a0d561db675a368eb9f2778953803475a6c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 15:11:36 +0000 Subject: [PATCH 095/135] made tests pass and redploy --- backend/postcode_splitter/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 8c0048e2..73a79d2c 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -168,7 +168,7 @@ def handler(event, context, local=False): # TODO: DELETE ME, if you see this in the PR. # TODO: DELETE ME, if you see this in the PR. # TODO: DELETE ME, if you see this in the PR. - df = df.head(1983) + df = df.head(5) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") From 1c2b1422fe89f25784dfd523c7f1096e996dafcd Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 15:24:38 +0000 Subject: [PATCH 096/135] running 1983 --- backend/postcode_splitter/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 73a79d2c..8c0048e2 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -168,7 +168,7 @@ def handler(event, context, local=False): # TODO: DELETE ME, if you see this in the PR. # TODO: DELETE ME, if you see this in the PR. # TODO: DELETE ME, if you see this in the PR. - df = df.head(5) + df = df.head(1983) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") From 5dc9cea564517844b29b6a11687ea0a478a6d182 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 15:25:49 +0000 Subject: [PATCH 097/135] running 1983 --- .github/workflows/deploy_fastapi_backend.yml | 1 + .github/workflows/deploy_terraform.yml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/deploy_fastapi_backend.yml b/.github/workflows/deploy_fastapi_backend.yml index 32e30bfa..b60fa1d1 100644 --- a/.github/workflows/deploy_fastapi_backend.yml +++ b/.github/workflows/deploy_fastapi_backend.yml @@ -135,3 +135,4 @@ jobs: # Deploy to AWS Lambda via Serverless sls deploy --stage ${{ github.ref_name }} --verbose + diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 90595632..834a60c2 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -207,3 +207,6 @@ jobs: + + + From 04cc6468dd18307586e4dde0c6c4ce48e6959d4d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 15:44:36 +0000 Subject: [PATCH 098/135] save --- .github/workflows/_deploy_lambda.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index b8731446..b2f2ce49 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -112,3 +112,5 @@ jobs: -var="lambda_name=${{ inputs.lambda_name }}" \ -var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ -var="image_digest=${{ inputs.image_digest }}" + + From 4325bdf9900b3abc4e1d8f17c572f181136e18c8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 16:05:16 +0000 Subject: [PATCH 099/135] get rid of local is true to remove suspicion --- backend/postcode_splitter/main.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 8c0048e2..e834c44e 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -97,7 +97,7 @@ def send_to_address2uprn_queue(task_id: str, rows: list) -> str: return response["MessageId"] -def handler(event, context, local=False): +def handler(event, context): print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") @@ -117,12 +117,6 @@ def handler(event, context, local=False): task_id = None subtask_id = None try: - # For local development - if local is True: - record = {} - record["body"] = ( - '{"task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917","s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv"}' - ) # Parse body (inputs) if isinstance(record.get("body"), str): body = json.loads(record["body"]) @@ -161,13 +155,7 @@ def handler(event, context, local=False): csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) - # just do 5 well we are testing, sqs connection - if local: - df = df.head(5) - # TODO: DELETE ME, if you see this in the PR. - # TODO: DELETE ME, if you see this in the PR. - # TODO: DELETE ME, if you see this in the PR. df = df.head(1983) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") From 385a1b8e84ad39fb9b309489e3e9b113e5f4fe7a Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 16:07:53 +0000 Subject: [PATCH 100/135] get rid of local is true to remove suspicion --- .github/workflows/deploy_terraform.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 834a60c2..7e24f60f 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -210,3 +210,9 @@ jobs: + + + + + + From 51e910ce6ec1031467efa300352d267f2a515487 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 16:28:27 +0000 Subject: [PATCH 101/135] add a workflow button --- .github/workflows/deploy_terraform.yml | 1 + sfr/principal_pitch/2_export_data.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 7e24f60f..02bb1b76 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -9,6 +9,7 @@ on: - '.github/workflows/deploy_terraform.yml' - '.github/workflows/_build_image.yml' - '.github/workflows/_deploy_lambda.yml' + workflow_dispatch: jobs: determine_stage: diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index 9470710d..81e7a9fc 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -28,12 +28,12 @@ from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 476 +PORTFOLIO_ID = 561 SCENARIOS = [ - 953, + 1053, ] scenario_names = { - 953: "All Properties, Most Economic", + 1053: "EPC C", } project_name = "manchester" @@ -286,6 +286,8 @@ for scenario_id in SCENARIOS: "current_sap_points", "total_floor_area", "number_of_rooms", + "lodgement_date", + "is_expired", "id", ] ] From d07fc351a59292a57c3b47eb8b0436d9434f6346 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:04:27 +0000 Subject: [PATCH 102/135] added permission to add --- backend/postcode_splitter/main.py | 152 +++++++++++++++--- .../terraform/lambda/postcodeSplitter/main.tf | 2 +- infrastructure/terraform/shared/main.tf | 2 +- 3 files changed, 132 insertions(+), 24 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index e834c44e..2714f330 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -4,12 +4,13 @@ import json import pandas as pd import requests import boto3 -from uuid import UUID +from uuid import UUID, uuid4 from urllib.parse import unquote -from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict +from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict, save_csv_to_s3 from utils.logger import setup_logger from tqdm import tqdm from backend.app.db.functions.tasks.Tasks import SubTaskInterface +from datetime import datetime logger = setup_logger() @@ -62,13 +63,55 @@ def parse_s3_uri(s3_uri: str) -> tuple[str, str]: raise ValueError(f"Could not parse S3 URI") from e -def send_to_address2uprn_queue(task_id: str, rows: list) -> str: +def upload_batch_to_s3(batch_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None) -> str: """ - Send a postcode group to the address2UPRN SQS queue. + Upload batch DataFrame to S3 as CSV. + + Args: + batch_df: The DataFrame containing batch data + task_id: The parent task ID (used for file path) + sub_task_id: The subtask ID (used for file path) + bucket_name: The S3 bucket name (defaults to env variable) + + Returns: + S3 URI (s3://bucket/key) of the uploaded file + """ + if bucket_name is None: + bucket_name = os.getenv("S3_BUCKET_NAME") + + if not bucket_name: + logger.error( + "S3 bucket name not provided and S3_BUCKET_NAME environment variable not set" + ) + raise ValueError("S3_BUCKET_NAME not configured") + + try: + file_name = f"{datetime.now().isoformat()}_{str(uuid4())[:8]}" + file_key = f"ara_postcode_splitter_batches/{task_id}/{sub_task_id}/{file_name}.csv" + + success = save_csv_to_s3(batch_df, bucket_name, file_key) + + if success: + s3_uri = f"s3://{bucket_name}/{file_key}" + logger.info(f"Successfully uploaded batch to {s3_uri}") + return s3_uri + else: + logger.error(f"Failed to upload batch to S3") + raise ValueError("Failed to save CSV to S3") + + except Exception as e: + logger.error(f"Error uploading batch to S3: {str(e)}") + raise + + +def send_to_address2uprn_queue(task_id: str, sub_task_id: str, s3_uri: str) -> str: + """ + Send a batch to the address2UPRN SQS queue with S3 reference. Args: task_id: The parent task ID - rows: List of row dictionaries for this postcode group + sub_task_id: The new subtask ID for this batch + s3_uri: S3 URI pointing to the batch CSV file Returns: Message ID from SQS @@ -81,7 +124,8 @@ def send_to_address2uprn_queue(task_id: str, rows: list) -> str: message_body = { "task_id": task_id, - "rows": rows, + "sub_task_id": sub_task_id, + "s3_uri": s3_uri, } response = sqs_client.send_message( @@ -91,12 +135,59 @@ def send_to_address2uprn_queue(task_id: str, rows: list) -> str: logger.info( f"Sent message to address2UPRN queue. " - f"Task: {task_id}, MessageId: {response['MessageId']}" + f"Task: {task_id}, SubTask: {sub_task_id}, MessageId: {response['MessageId']}" ) return response["MessageId"] +def create_batch_and_send_to_address2uprn( + batch_rows: list, + task_id: str, + subtask_interface: SubTaskInterface, + bucket_name: str, +) -> str: + """ + Create a batch DataFrame, upload to S3, create subtask, and send to address2UPRN queue. + + Args: + batch_rows: List of row dictionaries for this batch + task_id: The parent task ID + subtask_interface: SubTaskInterface instance + bucket_name: S3 bucket name + + Returns: + The created batch subtask ID + """ + # Generate unique batch subtask ID + batch_sub_task_id = str(uuid4()) + + # Upload batch to S3 + batch_df = pd.DataFrame(batch_rows) + s3_uri = upload_batch_to_s3(batch_df, str(task_id), batch_sub_task_id, bucket_name) + + # Create a new subtask for this batch with all inputs + created_batch_sub_task_id = subtask_interface.create_subtask( + task_id=task_id, + inputs={ + "task_id": str(task_id), + "sub_task_id": batch_sub_task_id, + "batch_size": len(batch_rows), + "s3_uri": s3_uri, + } + ) + logger.info(f"Created batch subtask {created_batch_sub_task_id}") + + # Send message with S3 reference + send_to_address2uprn_queue( + task_id=str(task_id), + sub_task_id=batch_sub_task_id, + s3_uri=s3_uri, + ) + + return created_batch_sub_task_id + + def handler(event, context): print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") @@ -112,6 +203,7 @@ def handler(event, context): results = [] errors = [] subtask_interface = SubTaskInterface() + bucket_name = os.getenv("S3_BUCKET_NAME") for record in records: task_id = None @@ -148,6 +240,12 @@ def handler(event, context): ) logger.info(f"Created subtask {subtask_id} for task {task_id}") + # Mark subtask as in progress + subtask_interface.update_subtask_status( + subtask_id, "in progress" + ) + logger.info(f"Marked subtask {subtask_id} as in progress") + # Read CSV from S3 logger.info(f"Processing S3 URI: {s3_uri}") bucket, key = parse_s3_uri(s3_uri) @@ -184,9 +282,11 @@ def handler(event, context): for postcode, rows in postcode_to_addresses.items(): all_rows.extend(rows) try: - send_to_address2uprn_queue( - task_id=str(task_id), - rows=all_rows, + create_batch_and_send_to_address2uprn( + batch_rows=all_rows, + task_id=task_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, ) logger.info( f"Sent all {len(all_rows)} rows in single batch to address2UPRN queue" @@ -214,9 +314,11 @@ def handler(event, context): # First, send the current batch if it has data if batch_rows: try: - send_to_address2uprn_queue( - task_id=str(task_id), - rows=batch_rows, + create_batch_and_send_to_address2uprn( + batch_rows=batch_rows, + task_id=task_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, ) logger.info( f"Sent batch of {len(batch_rows)} rows to address2UPRN queue" @@ -236,9 +338,11 @@ def handler(event, context): # Send the large postcode on its own try: - send_to_address2uprn_queue( - task_id=str(task_id), - rows=rows, + create_batch_and_send_to_address2uprn( + batch_rows=rows, + task_id=task_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, ) logger.info( f"Sent large postcode {postcode} ({len(rows)} rows) to address2UPRN queue" @@ -263,9 +367,11 @@ def handler(event, context): f"Batch threshold reached: current {len(batch_rows)} + next postcode {len(rows)} = {current_batch_size} > {batch_size}" ) try: - send_to_address2uprn_queue( - task_id=str(task_id), - rows=batch_rows, + create_batch_and_send_to_address2uprn( + batch_rows=batch_rows, + task_id=task_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, ) logger.info( f"Sent batch of {len(batch_rows)} rows to address2UPRN queue (total sent: {total_sent})" @@ -290,9 +396,11 @@ def handler(event, context): # Send remaining batch if batch_rows: try: - send_to_address2uprn_queue( - task_id=str(task_id), - rows=batch_rows, + create_batch_and_send_to_address2uprn( + batch_rows=batch_rows, + task_id=task_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, ) total_sent += len(batch_rows) logger.info( diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index 78d927d3..e17d272d 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -55,7 +55,7 @@ module "lambda" { ENGINE_SQS_URL = "test" ENERGY_ASSESSMENTS_BUCKET = "test" ADDRESS2UPRN_QUEUE_URL = data.terraform_remote_state.address2uprn.outputs.address2uprn_queue_url - S3_BUCKET_NAME = "retrofit-data-dev" # Hardcoded as deployed via serverless i believe + S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name }, ) } diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index eb2a679d..acf8c281 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -386,7 +386,7 @@ module "postcode_splitter_s3_read" { policy_name = "PostcodeSplitterReadS3" policy_description = "Allow postcode splitter Lambda to read from retrofit-data bucket" bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] - actions = ["s3:GetObject"] + actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"] resource_paths = ["/*"] } From dac676f538844d8c0b97c5ed23cddc9738750d27 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:05:29 +0000 Subject: [PATCH 103/135] don't bombard yet --- backend/postcode_splitter/main.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 2714f330..7aaf1fbb 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -63,7 +63,9 @@ def parse_s3_uri(s3_uri: str) -> tuple[str, str]: raise ValueError(f"Could not parse S3 URI") from e -def upload_batch_to_s3(batch_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None) -> str: +def upload_batch_to_s3( + batch_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None +) -> str: """ Upload batch DataFrame to S3 as CSV. @@ -87,7 +89,9 @@ def upload_batch_to_s3(batch_df: pd.DataFrame, task_id: str, sub_task_id: str, b try: file_name = f"{datetime.now().isoformat()}_{str(uuid4())[:8]}" - file_key = f"ara_postcode_splitter_batches/{task_id}/{sub_task_id}/{file_name}.csv" + file_key = ( + f"ara_postcode_splitter_batches/{task_id}/{sub_task_id}/{file_name}.csv" + ) success = save_csv_to_s3(batch_df, bucket_name, file_key) @@ -128,10 +132,11 @@ def send_to_address2uprn_queue(task_id: str, sub_task_id: str, s3_uri: str) -> s "s3_uri": s3_uri, } - response = sqs_client.send_message( - QueueUrl=queue_url, - MessageBody=json.dumps(message_body), - ) + # Don't run on sqs yet + # response = sqs_client.send_message( + # QueueUrl=queue_url, + # MessageBody=json.dumps(message_body), + # ) logger.info( f"Sent message to address2UPRN queue. " @@ -174,7 +179,7 @@ def create_batch_and_send_to_address2uprn( "sub_task_id": batch_sub_task_id, "batch_size": len(batch_rows), "s3_uri": s3_uri, - } + }, ) logger.info(f"Created batch subtask {created_batch_sub_task_id}") @@ -241,9 +246,7 @@ def handler(event, context): logger.info(f"Created subtask {subtask_id} for task {task_id}") # Mark subtask as in progress - subtask_interface.update_subtask_status( - subtask_id, "in progress" - ) + subtask_interface.update_subtask_status(subtask_id, "in progress") logger.info(f"Marked subtask {subtask_id} as in progress") # Read CSV from S3 From df141e4122e020b8f037e31a56838ff234daf367 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:08:00 +0000 Subject: [PATCH 104/135] post code splitter main py --- backend/postcode_splitter/main.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 7aaf1fbb..85dbc2da 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -132,18 +132,19 @@ def send_to_address2uprn_queue(task_id: str, sub_task_id: str, s3_uri: str) -> s "s3_uri": s3_uri, } - # Don't run on sqs yet + # # Don't run on sqs yet # response = sqs_client.send_message( # QueueUrl=queue_url, # MessageBody=json.dumps(message_body), # ) - logger.info( - f"Sent message to address2UPRN queue. " - f"Task: {task_id}, SubTask: {sub_task_id}, MessageId: {response['MessageId']}" - ) + # logger.info( + # f"Sent message to address2UPRN queue. " + # f"Task: {task_id}, SubTask: {sub_task_id}, MessageId: {response['MessageId']}" + # ) - return response["MessageId"] + # return response["MessageId"] + return str(uuid4()) def create_batch_and_send_to_address2uprn( From 5f8eca84b62452bf6c3708f0c5bfb03af4ef1700 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:12:11 +0000 Subject: [PATCH 105/135] deploy --- .github/workflows/deploy_terraform.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 02bb1b76..776bbd38 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -217,3 +217,5 @@ jobs: + + From bf7b8d87e5b380d71ae77b249cfccfb7afa99b19 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:20:28 +0000 Subject: [PATCH 106/135] add docker file and specify lambda images --- backend/address2UPRN/handler/Dockerfile | 2 +- backend/condition/handler/Dockerfile | 2 +- backend/postcode_splitter/handler/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index 07159357..5f274456 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -1,4 +1,4 @@ -FROM public.ecr.aws/lambda/python:3.10 +FROM --platform=linux/amd64 public.ecr.aws/lambda/python:3.10 # FROM python:3.11.10-bullseye diff --git a/backend/condition/handler/Dockerfile b/backend/condition/handler/Dockerfile index 71556895..be0d5ca5 100644 --- a/backend/condition/handler/Dockerfile +++ b/backend/condition/handler/Dockerfile @@ -1,4 +1,4 @@ -FROM public.ecr.aws/lambda/python:3.11 +FROM --platform=linux/amd64 public.ecr.aws/lambda/python:3.11 # For local running: # FROM python:3.11.10-bullseye diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 74c00b9f..8e30f9e3 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -1,4 +1,4 @@ -FROM public.ecr.aws/lambda/python:3.11 +FROM --platform=linux/amd64 public.ecr.aws/lambda/python:3.11 ARG DEV_DB_HOST ARG DEV_DB_PORT From ee8554314b951e165d281967d09c4963c36c4932 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:23:35 +0000 Subject: [PATCH 107/135] add docker file and specify lambda images --- .github/workflows/deploy_terraform.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 776bbd38..990dbdfa 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -209,13 +209,3 @@ jobs: - - - - - - - - - - From 0ab0d5505f4c5aababc9c6f57d988b91c984c2bf Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:29:11 +0000 Subject: [PATCH 108/135] no cache --- .github/workflows/_build_image.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml index a5e16a51..caf1ccb8 100644 --- a/.github/workflows/_build_image.yml +++ b/.github/workflows/_build_image.yml @@ -92,6 +92,7 @@ jobs: done <<< "${{ inputs.build_args }}" docker build \ + --no-cache \ -f ${{ inputs.dockerfile_path }} \ $BUILD_ARGS \ -t $IMAGE_URI \ From 3af620a61a0ce4a91ea8c2923eea5c23778c52ef Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:38:18 +0000 Subject: [PATCH 109/135] ensure we don't use any platform but linux/amd64 --- .github/workflows/_build_image.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml index caf1ccb8..f4b94fc0 100644 --- a/.github/workflows/_build_image.yml +++ b/.github/workflows/_build_image.yml @@ -93,6 +93,7 @@ jobs: docker build \ --no-cache \ + --platform linux/amd64 \ -f ${{ inputs.dockerfile_path }} \ $BUILD_ARGS \ -t $IMAGE_URI \ From 0f4c1c0029706474317997420f70290f442455b5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:52:11 +0000 Subject: [PATCH 110/135] only in docker build --- backend/address2UPRN/handler/Dockerfile | 2 +- backend/condition/handler/Dockerfile | 2 +- backend/postcode_splitter/handler/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index 5f274456..07159357 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 public.ecr.aws/lambda/python:3.10 +FROM public.ecr.aws/lambda/python:3.10 # FROM python:3.11.10-bullseye diff --git a/backend/condition/handler/Dockerfile b/backend/condition/handler/Dockerfile index be0d5ca5..71556895 100644 --- a/backend/condition/handler/Dockerfile +++ b/backend/condition/handler/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 public.ecr.aws/lambda/python:3.11 +FROM public.ecr.aws/lambda/python:3.11 # For local running: # FROM python:3.11.10-bullseye diff --git a/backend/postcode_splitter/handler/Dockerfile b/backend/postcode_splitter/handler/Dockerfile index 8e30f9e3..74c00b9f 100644 --- a/backend/postcode_splitter/handler/Dockerfile +++ b/backend/postcode_splitter/handler/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 public.ecr.aws/lambda/python:3.11 +FROM public.ecr.aws/lambda/python:3.11 ARG DEV_DB_HOST ARG DEV_DB_PORT From c7bd70e17f3d339099040976e66a04047f0eaded Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 18:52:23 +0000 Subject: [PATCH 111/135] only in docker build --- .github/workflows/deploy_terraform.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 990dbdfa..6ee9de11 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -209,3 +209,7 @@ jobs: + + + + From 7637e87c3c7f2188e5c06fdcd50b3151fc75818c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 19:03:49 +0000 Subject: [PATCH 112/135] deleted all images in ecr --- .github/workflows/_deploy_lambda.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index b2f2ce49..1a690e02 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -114,3 +114,4 @@ jobs: -var="image_digest=${{ inputs.image_digest }}" + From ff78ddc5a0dbc299a47a21b4f2456f1f6c82f45e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Feb 2026 19:09:43 +0000 Subject: [PATCH 113/135] deleted all images in ecr --- .github/workflows/_build_image.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml index f4b94fc0..5e5b5155 100644 --- a/.github/workflows/_build_image.yml +++ b/.github/workflows/_build_image.yml @@ -91,15 +91,16 @@ jobs: BUILD_ARGS="$BUILD_ARGS --build-arg $temp" done <<< "${{ inputs.build_args }}" - docker build \ + docker buildx build \ --no-cache \ --platform linux/amd64 \ + --provenance=false \ + --sbom=false \ + --push \ -f ${{ inputs.dockerfile_path }} \ $BUILD_ARGS \ -t $IMAGE_URI \ ${{ inputs.build_context }} - - docker push $IMAGE_URI - name: Resolve image digest id: digest From 1814c5988c151759c90e9a9807c636162a95c14d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 11:05:05 +0000 Subject: [PATCH 114/135] run on sqs --- .github/workflows/_build_image.yml | 2 +- backend/postcode_splitter/main.py | 20 +++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.github/workflows/_build_image.yml b/.github/workflows/_build_image.yml index 5e5b5155..3435c92d 100644 --- a/.github/workflows/_build_image.yml +++ b/.github/workflows/_build_image.yml @@ -90,7 +90,7 @@ jobs: temp=$(eval echo "$line") BUILD_ARGS="$BUILD_ARGS --build-arg $temp" done <<< "${{ inputs.build_args }}" - + docker buildx build \ --no-cache \ --platform linux/amd64 \ diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 85dbc2da..3d0f0d8d 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -132,19 +132,17 @@ def send_to_address2uprn_queue(task_id: str, sub_task_id: str, s3_uri: str) -> s "s3_uri": s3_uri, } - # # Don't run on sqs yet - # response = sqs_client.send_message( - # QueueUrl=queue_url, - # MessageBody=json.dumps(message_body), - # ) + response = sqs_client.send_message( + QueueUrl=queue_url, + MessageBody=json.dumps(message_body), + ) - # logger.info( - # f"Sent message to address2UPRN queue. " - # f"Task: {task_id}, SubTask: {sub_task_id}, MessageId: {response['MessageId']}" - # ) + logger.info( + f"Sent message to address2UPRN queue. " + f"Task: {task_id}, SubTask: {sub_task_id}, MessageId: {response['MessageId']}" + ) - # return response["MessageId"] - return str(uuid4()) + return response["MessageId"] def create_batch_and_send_to_address2uprn( From 8152dc516666ce6d9183e73b3879a2f5f028cbd7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 11:15:15 +0000 Subject: [PATCH 115/135] deploy with new address2uprn handling --- backend/address2UPRN/main.py | 163 ++++++++++++------------------ backend/postcode_splitter/main.py | 51 +--------- utils/s3.py | 51 ++++++++++ 3 files changed, 118 insertions(+), 147 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index f4aa0dc9..f843d28a 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -12,11 +12,16 @@ import requests from uuid import UUID import uuid from backend.app.db.functions.tasks.Tasks import SubTaskInterface -from utils.s3 import save_csv_to_s3 +from utils.s3 import ( + save_csv_to_s3, + read_csv_from_s3 as read_csv_from_s3_dict, + parse_s3_uri, +) from datetime import datetime logger = setup_logger() + EPC_AUTH_TOKEN = os.getenv( "EPC_AUTH_TOKEN", ) @@ -526,48 +531,6 @@ def save_results_to_s3( return False -def test(a, b): - assert a == b, f"erorr: {a}{type(a)} != {b}: {type(b)}" - - -def run_all_test(): - # Basic usage with different post codes styles - test(get_epc_data_with_postcode("b93 8sy").shape[0], 63) - test(get_epc_data_with_postcode("B938sy").shape[0], 63) - test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) - test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) - - test(get_uprn("68", "b93 8sy"), "100070989938") - test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938") - test(get_uprn("Flat A, 28, Nelgarde Road", "se6 4tf"), "100023278633") - test(get_uprn("28 A", "se6 4tf"), "100023278633") - test(get_uprn("28A", "se6 4tf"), "100023278633") - test(get_uprn("6 Aitken Close", "E8 4SQ"), False) - - # unique case - test(get_uprn("Flat 5, 1, Semley Gate", "e9 5nh"), "10008238198") - test(get_uprn("5 , 1 Semley Gate", "e9 5nh"), "10008238198") - test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198") - test(get_uprn("1, 5 Semley Gate", "e9 5nh"), False) - test( - get_uprn("1 Semley Gate", "e9 5nh"), "10008238188" - ) # this one return "flat 1, in 1 semley gate" - test( - get_uprn("48 Oswald Street", "E5 0BT"), False - ) # this one return "flat 1, in 1 semley gate" - test( - get_uprn("42 Oswald Street", "E5 0BT"), False - ) # this one return "flat 1, in 1 semley gate" - test( - get_uprn("46 Oswald Street", "E5 0BT"), False - ) # this one return "flat 1, in 1 semley gate" - get_uprn_candidates(get_epc_data_with_postcode("e5 0bt"), "48 Oswald Street") - get_uprn_candidates( - get_epc_data_with_postcode("Cr2 7dl"), - "FLAT 3; 42 MORETON ROAD, SOUTH CROYDON, SURREY", - ) - - def handler(event, context, local=False): print("=== Address2UPRN Lambda Handler ===") print(f"Function: {context.function_name}") @@ -581,35 +544,8 @@ def handler(event, context, local=False): "body": json.dumps( { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - "rows": [ - { - "landlord_property_id": "00000002POR", - "UPRN": "766019911", - "Address 1": "9 Redland Way", - "Address 2": "Aylesbury Vale", - "postcode": "HP21 9RJ", - "landlord_property_type": "House", - "postcode_clean": "HP219RJ", - }, - { - "landlord_property_id": "00000003MTR", - "UPRN": "100120781544", - "Address 1": "16 Lime Crescent", - "Address 2": "BICESTER", - "postcode": "OX26 3XJ", - "landlord_property_type": "House", - "postcode_clean": "OX263XJ", - }, - { - "landlord_property_id": "00000004HBY", - "UPRN": "14033542", - "Address 1": "14 Dunbar Drive", - "Address 2": "Woodley", - "postcode": "RG5 4HA", - "landlord_property_type": "House", - "postcode_clean": "RG54HA", - }, - ], + "sub_task_id": "a1b2c3d4-e5f6-7a8b-9c0d-e1f2a3b4c5d6", + "s3_uri": "", } ) } @@ -637,14 +573,19 @@ def handler(event, context, local=False): # Validate required fields task_id = body.get("task_id") - rows = body.get("rows", []) + sub_task_id = body.get("sub_task_id") + s3_uri = body.get("s3_uri") if not task_id: errors.append({"error": "Missing required field: task_id"}) continue - if not rows: - errors.append({"error": "Missing or empty rows data"}) + if not sub_task_id: + errors.append({"error": "Missing required field: sub_task_id"}) + continue + + if not s3_uri: + errors.append({"error": "Missing required field: s3_uri"}) continue # Convert task_id to UUID @@ -654,29 +595,56 @@ def handler(event, context, local=False): errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"}) continue - # Create a subtask for this batch - subtask_id = subtask_interface.create_subtask( - task_id=task_id, inputs={"row_count": len(rows)} - ) - logger.info( - f"Created subtask {subtask_id} for task {task_id} with {len(rows)} rows" - ) + # Convert sub_task_id to UUID + try: + subtask_id = ( + UUID(sub_task_id) if isinstance(sub_task_id, str) else sub_task_id + ) + except ValueError as e: + errors.append( + {"error": f"Invalid UUID format for sub_task_id: {str(e)}"} + ) + continue + + # Update existing subtask to 'in progress' + subtask_interface.update_subtask_status(subtask_id, "in progress") + logger.info(f"Processing subtask {subtask_id} for task {task_id}") + + # Parse S3 URI and read CSV from S3 + logger.info(f"Reading data from S3: {s3_uri}") + try: + bucket, key = parse_s3_uri(s3_uri) + csv_data = read_csv_from_s3_dict(bucket, key) + df = pd.DataFrame(csv_data) + logger.info(f"Loaded {len(df)} rows from S3") + except Exception as s3_error: + logger.error(f"Failed to read data from S3: {s3_error}") + errors.append( + {"error": "Failed to read data from S3", "details": str(s3_error)} + ) + try: + subtask_interface.update_subtask_status( + subtask_id, "failed", outputs={"error": str(s3_error)} + ) + except Exception as db_error: + logger.error(f"Failed to update subtask status: {db_error}") + continue # Process the rows - logger.info(f"Processing {len(rows)} rows for task {task_id}") + logger.info(f"Processing {len(df)} rows for task {task_id}") - # Convert rows to DataFrame - df = pd.DataFrame(rows) - - # Create user_input column by concatenating Address 1 and Address 2 - df["user_input"] = ( - df["Address 1"].fillna("") - + " " - + df["Address 2"].fillna("") - + " " - + df["Address 3"].fillna("") - ).str.strip() - logger.info(f"Created user_input column from Address 1 and Address 2") + # Create user_input column by concatenating Address columns if not already present + if "user_input" not in df.columns: + df["user_input"] = ( + df["Address 1"].fillna("") + + " " + + df["Address 2"].fillna("") + + " " + + df["Address 3"].fillna("") + ).str.strip() + logger.info(f"Created user_input column from Address 1 and Address 2") + else: + logger.info(f"user_input column already present in data") clean_df = df.dropna(subset=["postcode_clean"]) @@ -791,7 +759,6 @@ def handler(event, context, local=False): results.append( { "subtask_id": str(subtask_id), - "rows_processed": len(rows), "postcodes_processed": postcodes_processed, "addresses_processed": addresses_processed, "uprns_found": uprns_found, @@ -802,7 +769,9 @@ def handler(event, context, local=False): # Mark subtask as completed try: subtask_interface.update_subtask_status( - subtask_id, "completed", outputs={"rows_processed": len(rows)} + subtask_id, + "completed", + outputs={"rows_processed": "todo -> show sensible output"}, ) logger.info(f"Marked subtask {subtask_id} as completed") except Exception as db_error: diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 3d0f0d8d..930fac7f 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -5,8 +5,7 @@ import pandas as pd import requests import boto3 from uuid import UUID, uuid4 -from urllib.parse import unquote -from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict, save_csv_to_s3 +from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict, save_csv_to_s3, parse_s3_uri from utils.logger import setup_logger from tqdm import tqdm from backend.app.db.functions.tasks.Tasks import SubTaskInterface @@ -15,54 +14,6 @@ from datetime import datetime logger = setup_logger() -def parse_s3_uri(s3_uri: str) -> tuple[str, str]: - """ - Parse S3 URI to extract bucket and key. - - Supports two formats: - 1. S3 URI format: s3://bucket/key - """ - logger.info("Parsing S3 URI") - - try: - # Check if it's an S3 URI format - if s3_uri.startswith("s3://"): - parts = s3_uri[5:].split("/", 1) - if len(parts) < 2: - raise ValueError("S3 URI must include both bucket and key") - bucket = parts[0] - key = parts[1] - logger.info(f"Extracted bucket: {bucket}, key: {key}") - return bucket, key - - # Otherwise, treat as AWS console URL - logger.info("Parsing as AWS console URL") - - # Split base URL and query string - if "?" not in s3_uri: - raise ValueError("No query string found") - - base, query = s3_uri.split("?", 1) - - # Extract bucket from base URL - if "/s3/object/" not in base: - raise ValueError("No '/s3/object/' found in URL path") - - path_parts = base.split("/s3/object/") - bucket = path_parts[1] - logger.info(f"Extracted bucket: {bucket}") - - # Extract prefix from query parameters - params = dict(item.split("=") for item in query.split("&") if "=" in item) - key = unquote(params.get("prefix", "")) - logger.info(f"Extracted key: {key}") - - return bucket, key - except Exception as e: - logger.error(f"Error parsing S3 URI: {type(e).__name__}: {e}") - raise ValueError(f"Could not parse S3 URI") from e - - def upload_batch_to_s3( batch_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None ) -> str: diff --git a/utils/s3.py b/utils/s3.py index 0e79c26b..0ba036f7 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -3,11 +3,62 @@ import boto3 import csv import pandas as pd from io import BytesIO, StringIO +from urllib.parse import unquote from utils.logger import setup_logger from botocore.exceptions import NoCredentialsError, PartialCredentialsError logger = setup_logger() + +def parse_s3_uri(s3_uri: str) -> tuple[str, str]: + """ + Parse S3 URI to extract bucket and key. + + Supports two formats: + 1. S3 URI format: s3://bucket/key + 2. AWS console URL format with query parameters + """ + logger.info("Parsing S3 URI") + + try: + # Check if it's an S3 URI format + if s3_uri.startswith("s3://"): + parts = s3_uri[5:].split("/", 1) + if len(parts) < 2: + raise ValueError("S3 URI must include both bucket and key") + bucket = parts[0] + key = parts[1] + logger.info(f"Extracted bucket: {bucket}, key: {key}") + return bucket, key + + # Otherwise, treat as AWS console URL + logger.info("Parsing as AWS console URL") + + # Split base URL and query string + if "?" not in s3_uri: + raise ValueError("No query string found") + + base, query = s3_uri.split("?", 1) + + # Extract bucket from base URL + if "/s3/object/" not in base: + raise ValueError("No '/s3/object/' found in URL path") + + path_parts = base.split("/s3/object/") + bucket = path_parts[1] + logger.info(f"Extracted bucket: {bucket}") + + # Extract prefix from query parameters + params = dict(item.split("=") for item in query.split("&") if "=" in item) + key = unquote(params.get("prefix", "")) + logger.info(f"Extracted key: {key}") + + return bucket, key + except Exception as e: + logger.error(f"Error parsing S3 URI: {type(e).__name__}: {e}") + raise ValueError(f"Could not parse S3 URI") from e + + def read_from_s3(bucket_name, s3_file_name): """ Read an object from s3. Decoding of the data is left for outside of this function From 0dbc5f985cb80c12b00b6653cb62dfa4e5e95f71 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 12:37:53 +0000 Subject: [PATCH 116/135] wrong subtask id being sent --- backend/postcode_splitter/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 930fac7f..e49a7f0d 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -136,7 +136,7 @@ def create_batch_and_send_to_address2uprn( # Send message with S3 reference send_to_address2uprn_queue( task_id=str(task_id), - sub_task_id=batch_sub_task_id, + sub_task_id=created_batch_sub_task_id, s3_uri=s3_uri, ) From e70a8b3c62c998d7596df2869f8a67ca08570d21 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 12:40:53 +0000 Subject: [PATCH 117/135] wrong subtask id being sent --- .github/workflows/deploy_terraform.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 6ee9de11..d2fd7b5b 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -205,11 +205,3 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} - - - - - - - - From 581f0ad49fb8859a7e983e05db6058e31ffb8a79 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 12:57:36 +0000 Subject: [PATCH 118/135] uudi needs to be str --- backend/postcode_splitter/main.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index e49a7f0d..b3c78b20 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -5,7 +5,11 @@ import pandas as pd import requests import boto3 from uuid import UUID, uuid4 -from utils.s3 import read_csv_from_s3 as read_csv_from_s3_dict, save_csv_to_s3, parse_s3_uri +from utils.s3 import ( + read_csv_from_s3 as read_csv_from_s3_dict, + save_csv_to_s3, + parse_s3_uri, +) from utils.logger import setup_logger from tqdm import tqdm from backend.app.db.functions.tasks.Tasks import SubTaskInterface @@ -136,7 +140,7 @@ def create_batch_and_send_to_address2uprn( # Send message with S3 reference send_to_address2uprn_queue( task_id=str(task_id), - sub_task_id=created_batch_sub_task_id, + sub_task_id=str(created_batch_sub_task_id), s3_uri=s3_uri, ) From d99ee337670800fc5955331e27d9926afb99efd9 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 12:57:47 +0000 Subject: [PATCH 119/135] uudi needs to be str --- .github/workflows/_deploy_lambda.yml | 1 + .github/workflows/unit_tests.yml | 46 ++++++++++++++-------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 1a690e02..9f8619f9 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -115,3 +115,4 @@ jobs: + diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index cc6431b8..5521a481 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -1,30 +1,30 @@ -name: Run unit tests +# name: Run unit tests -on: - pull_request: - branches: - - "**" +# on: +# pull_request: +# branches: +# - "**" -jobs: - test: - runs-on: ubuntu-latest +# jobs: +# test: +# runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 +# steps: +# - name: Checkout code +# uses: actions/checkout@v4 - - name: Set up Python 3.11 - uses: actions/setup-python@v4 - with: - python-version: '3.11' +# - name: Set up Python 3.11 +# uses: actions/setup-python@v4 +# with: +# python-version: '3.11' - - name: Install tox via Makefile - run: | - make setup +# - name: Install tox via Makefile +# run: | +# make setup - - name: Run tests with tox via Makefile - env: - EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} - run: | - make test \ No newline at end of file +# - name: Run tests with tox via Makefile +# env: +# EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} +# run: | +# make test \ No newline at end of file From a4b259959f37d22ac01011db5e8453bb561bb8f3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 13:35:05 +0000 Subject: [PATCH 120/135] set defaults --- backend/app/config.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/backend/app/config.py b/backend/app/config.py index 41552ae5..feb312b4 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -18,37 +18,37 @@ def resolve_env_file() -> Optional[str]: class Settings(BaseSettings): - API_KEY: str + API_KEY: str = "changeme" API_KEY_NAME: str = "X-API-KEY" - SECRET_KEY: str - ENVIRONMENT: str - DATA_BUCKET: str + SECRET_KEY: str = "changeme" + ENVIRONMENT: str = "changeme" + DATA_BUCKET: str = "changeme" PLAN_TRIGGER_BUCKET: str - ENGINE_SQS_URL: str + ENGINE_SQS_URL: str = "changeme" # Third parties - EPC_AUTH_TOKEN: str - GOOGLE_SOLAR_API_KEY: str + EPC_AUTH_TOKEN: str = "changeme" + GOOGLE_SOLAR_API_KEY: str = "changeme" # Database settings - DB_HOST: str - DB_PASSWORD: str - DB_USERNAME: str - DB_PORT: str - DB_NAME: str + DB_HOST: str = "changeme" + DB_PASSWORD: str = "changeme" + DB_USERNAME: str = "changeme" + DB_PORT: str = "changeme" + DB_NAME: str = "changeme" # Prediction buckets - SAP_PREDICTIONS_BUCKET: str - CARBON_PREDICTIONS_BUCKET: str - HEAT_PREDICTIONS_BUCKET: str + SAP_PREDICTIONS_BUCKET: str = "changeme" + CARBON_PREDICTIONS_BUCKET: str = "changeme" + HEAT_PREDICTIONS_BUCKET: str = "changeme" # LIGHTING_COST_PREDICTIONS_BUCKET: str # HEATING_COST_PREDICTIONS_BUCKET: str # HOT_WATER_COST_PREDICTIONS_BUCKET: str - HEATING_KWH_PREDICTIONS_BUCKET: str - HOTWATER_KWH_PREDICTIONS_BUCKET: str + HEATING_KWH_PREDICTIONS_BUCKET: str = "changeme" + HOTWATER_KWH_PREDICTIONS_BUCKET: str = "changeme" # Other S3 buckts - ENERGY_ASSESSMENTS_BUCKET: str + ENERGY_ASSESSMENTS_BUCKET: str = "changeme" # Optional AWS creds (only required in local) AWS_ACCESS_KEY_ID: Optional[str] = None From 5770e0f066ebf514116f0e6a18d9bca9c5a7ff0f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 13:35:27 +0000 Subject: [PATCH 121/135] set defaults --- .github/workflows/_deploy_lambda.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 9f8619f9..528300f8 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -113,6 +113,3 @@ jobs: -var="ecr_repo_url=${{ steps.repo.outputs.ecr_repo_url }}" \ -var="image_digest=${{ inputs.image_digest }}" - - - From da79ccf7595927cb105f9b0b2f727c43c8ad563f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 14:08:09 +0000 Subject: [PATCH 122/135] just do 5 --- backend/postcode_splitter/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index b3c78b20..1049295b 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -211,7 +211,8 @@ def handler(event, context): csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) - df = df.head(1983) + # df = df.head(1983) + df = df.head(5) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") From d6ea88adf3860d7715f173820199291bf227e2c6 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 14:08:38 +0000 Subject: [PATCH 123/135] just do 5 --- .github/workflows/deploy_terraform.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index d2fd7b5b..4dcbf129 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -205,3 +205,4 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + From 8e574c24014ee15534de3847762e3800690f521f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Feb 2026 18:30:47 +0000 Subject: [PATCH 124/135] post code splitter works --- .github/workflows/deploy_terraform.yml | 2 +- backend/address2UPRN/main.py | 31 +-- backend/postcode_splitter/main.py | 361 +++++++++---------------- 3 files changed, 130 insertions(+), 264 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 4dcbf129..2fd12fe6 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -77,7 +77,7 @@ jobs: run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - name: Terraform Apply - # if: env.STAGE == 'prod' + if: env.STAGE == 'prod' working-directory: infrastructure/terraform/shared run: terraform apply -auto-approve tfplan diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index f843d28a..7fc11570 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -544,8 +544,8 @@ def handler(event, context, local=False): "body": json.dumps( { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - "sub_task_id": "a1b2c3d4-e5f6-7a8b-9c0d-e1f2a3b4c5d6", - "s3_uri": "", + "sub_task_id": "1c09df07-fd29-4de7-b146-fafb591856a9", + "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-13T15:54:58.568594_67557923.csv", } ) } @@ -573,14 +573,14 @@ def handler(event, context, local=False): # Validate required fields task_id = body.get("task_id") - sub_task_id = body.get("sub_task_id") + subtask_id = body.get("sub_task_id") s3_uri = body.get("s3_uri") if not task_id: errors.append({"error": "Missing required field: task_id"}) continue - if not sub_task_id: + if not subtask_id: errors.append({"error": "Missing required field: sub_task_id"}) continue @@ -598,7 +598,7 @@ def handler(event, context, local=False): # Convert sub_task_id to UUID try: subtask_id = ( - UUID(sub_task_id) if isinstance(sub_task_id, str) else sub_task_id + UUID(subtask_id) if isinstance(subtask_id, str) else subtask_id ) except ValueError as e: errors.append( @@ -756,16 +756,6 @@ def handler(event, context, local=False): except Exception as s3_error: logger.error(f"Failed to save results to S3: {s3_error}") - results.append( - { - "subtask_id": str(subtask_id), - "postcodes_processed": postcodes_processed, - "addresses_processed": addresses_processed, - "uprns_found": uprns_found, - "status": "processed", - } - ) - # Mark subtask as completed try: subtask_interface.update_subtask_status( @@ -777,17 +767,6 @@ def handler(event, context, local=False): except Exception as db_error: logger.error(f"Failed to mark subtask as completed: {db_error}") - except json.JSONDecodeError as e: - logger.error(f"Invalid JSON in request body: {e}") - errors.append({"error": "Invalid JSON in request body", "details": str(e)}) - # Mark subtask as failed if we have one - if subtask_id: - try: - subtask_interface.update_subtask_status( - subtask_id, "failed", outputs={"error": str(e)} - ) - except Exception as db_error: - logger.error(f"Failed to update subtask status: {db_error}") except Exception as e: logger.error(f"Unexpected error processing record: {e}", exc_info=True) errors.append({"error": "Unexpected error", "details": str(e)}) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 1049295b..6d8d1095 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -101,8 +101,9 @@ def send_to_address2uprn_queue(task_id: str, sub_task_id: str, s3_uri: str) -> s def create_batch_and_send_to_address2uprn( - batch_rows: list, + batch_df: pd.DataFrame, task_id: str, + sub_task_id: str, subtask_interface: SubTaskInterface, bucket_name: str, ) -> str: @@ -118,291 +119,177 @@ def create_batch_and_send_to_address2uprn( Returns: The created batch subtask ID """ - # Generate unique batch subtask ID - batch_sub_task_id = str(uuid4()) - # Upload batch to S3 - batch_df = pd.DataFrame(batch_rows) - s3_uri = upload_batch_to_s3(batch_df, str(task_id), batch_sub_task_id, bucket_name) + + s3_uri = upload_batch_to_s3(batch_df, str(task_id), str(sub_task_id), bucket_name) # Create a new subtask for this batch with all inputs created_batch_sub_task_id = subtask_interface.create_subtask( task_id=task_id, inputs={ "task_id": str(task_id), - "sub_task_id": batch_sub_task_id, - "batch_size": len(batch_rows), "s3_uri": s3_uri, }, ) + logger.info(f"Created batch subtask {created_batch_sub_task_id}") - # Send message with S3 reference - send_to_address2uprn_queue( - task_id=str(task_id), - sub_task_id=str(created_batch_sub_task_id), - s3_uri=s3_uri, - ) + # # Send message with S3 reference + # send_to_address2uprn_queue( + # task_id=str(task_id), + # sub_task_id=str(created_batch_sub_task_id), + # s3_uri=s3_uri, + # ) return created_batch_sub_task_id -def handler(event, context): +def handler(event, context, local=False): print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") # Example SQS message for testing (copy and paste into SQS): - # { - # "task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917", - # "s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv" - # } - + if local is True: + event = { + "Records": [ + { + "body": json.dumps( + { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv", + } + ) + } + ] + } # Handle both single event and batch events (SQS, etc.) records = event.get("Records", [event]) results = [] errors = [] subtask_interface = SubTaskInterface() bucket_name = os.getenv("S3_BUCKET_NAME") + if local: + bucket_name = "retrofit-data-dev" for record in records: + if local: + record = records[0] task_id = None subtask_id = None - try: - # Parse body (inputs) - if isinstance(record.get("body"), str): - body = json.loads(record["body"]) - else: - body = record.get("body", {}) + # Parse body (inputs) - # Validate required fields - task_id = body.get("task_id") - s3_uri = body.get("s3_uri") + if isinstance(record.get("body"), str): + body = json.loads(record["body"]) + else: + body = record.get("body", {}) - if not task_id: - errors.append({"error": "Missing required field: task_id"}) - continue + # Validate required fields + task_id = body.get("task_id") + subtask_id = body.get("sub_task_id") + s3_uri = body.get("s3_uri") - if not s3_uri: - errors.append({"error": "Missing required field: s3_uri"}) - continue + # Convert task_id to UUID + task_id = UUID(task_id) if isinstance(task_id, str) else task_id + subtask_id = UUID(subtask_id) if isinstance(subtask_id, str) else subtask_id - # Convert task_id to UUID - try: - task_id = UUID(task_id) if isinstance(task_id, str) else task_id - except ValueError as e: - errors.append({"error": f"Invalid UUID format for task_id: {str(e)}"}) - continue + # Mark subtask as in progress + subtask_interface.update_subtask_status(subtask_id, "in progress") + logger.info(f"Marked subtask {subtask_id} as in progress") - # Create a new subtask for this postcode splitter invocation - subtask_id = subtask_interface.create_subtask( - task_id=task_id, inputs={"s3_uri": s3_uri} + # Read CSV from S3 + bucket, key = parse_s3_uri(s3_uri) + logger.info(f"S3 Bucket: {bucket}, Key: {key}") + + csv_data = read_csv_from_s3_dict(bucket, key) + df = pd.DataFrame(csv_data) + + # TODO: Change the input to the file you want + # df = df.head(1983) + df = df.head(502) + + logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") + + # Sanitise postcodes + df["postcode_clean"] = df["postcode"].str.upper().str.replace(" ", "") + + df = df.dropna(subset=["postcode_clean"]) + + batch_size = 500 + if df.shape[0] < batch_size: + create_batch_and_send_to_address2uprn( + batch_df=df, + task_id=task_id, + sub_task_id=subtask_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, ) - logger.info(f"Created subtask {subtask_id} for task {task_id}") - - # Mark subtask as in progress - subtask_interface.update_subtask_status(subtask_id, "in progress") - logger.info(f"Marked subtask {subtask_id} as in progress") - - # Read CSV from S3 - logger.info(f"Processing S3 URI: {s3_uri}") - bucket, key = parse_s3_uri(s3_uri) - logger.info(f"S3 Bucket: {bucket}, Key: {key}") - - csv_data = read_csv_from_s3_dict(bucket, key) - df = pd.DataFrame(csv_data) - - # df = df.head(1983) - df = df.head(5) - - logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") - - # Sanitise postcodes - df["postcode_clean"] = df["postcode"].str.upper().str.replace(" ", "") - - clean_df = df.dropna(subset=["postcode_clean"]) - + else: postcode_to_addresses = { - postcode: group.to_dict(orient="records") - for postcode, group in clean_df.groupby("postcode_clean", sort=False) + postcode: group + for postcode, group in df.groupby("postcode_clean", sort=False) } - logger.info(f"Total postcodes: {len(postcode_to_addresses)}") + count = 0 + buffer = [] - # Calculate total rows to send - total_rows = sum(len(rows) for rows in postcode_to_addresses.values()) - logger.info(f"Total rows to send: {total_rows}") + for postcode, group_df in postcode_to_addresses.items(): + group_len = len(group_df) - batch_size = 500 - - # If all rows fit in one batch, just send them all at once - if total_rows <= batch_size: - all_rows = [] - for postcode, rows in postcode_to_addresses.items(): - all_rows.extend(rows) - try: - create_batch_and_send_to_address2uprn( - batch_rows=all_rows, - task_id=task_id, - subtask_interface=subtask_interface, - bucket_name=bucket_name, - ) - logger.info( - f"Sent all {len(all_rows)} rows in single batch to address2UPRN queue" - ) - except Exception as e: - logger.error( - f"Failed to send all rows to address2UPRN queue: {e}", - exc_info=True, - ) - errors.append( - { - "error": "Failed to send to address2UPRN queue", - "details": str(e), - } - ) - else: - # Multi-batch processing for large datasets - batch_rows = [] - total_sent = 0 - - for postcode, rows in postcode_to_addresses.items(): - logger.info(f"Processing postcode {postcode} with {len(rows)} rows") - # If postcode itself is larger than batch_size, send it individually - if len(rows) > batch_size: - # First, send the current batch if it has data - if batch_rows: - try: - create_batch_and_send_to_address2uprn( - batch_rows=batch_rows, - task_id=task_id, - subtask_interface=subtask_interface, - bucket_name=bucket_name, - ) - logger.info( - f"Sent batch of {len(batch_rows)} rows to address2UPRN queue" - ) - batch_rows = [] - except Exception as e: - logger.error( - f"Failed to send batch to address2UPRN queue: {e}", - exc_info=True, - ) - errors.append( - { - "error": "Failed to send to address2UPRN queue", - "details": str(e), - } - ) - - # Send the large postcode on its own - try: - create_batch_and_send_to_address2uprn( - batch_rows=rows, - task_id=task_id, - subtask_interface=subtask_interface, - bucket_name=bucket_name, - ) - logger.info( - f"Sent large postcode {postcode} ({len(rows)} rows) to address2UPRN queue" - ) - except Exception as e: - logger.error( - f"Failed to send large postcode to address2UPRN queue: {e}", - exc_info=True, - ) - errors.append( - { - "error": "Failed to send to address2UPRN queue", - "details": str(e), - } - ) - continue - - # If adding this postcode's rows would exceed batch_size, send current batch - current_batch_size = len(batch_rows) + len(rows) - if batch_rows and current_batch_size > batch_size: - logger.info( - f"Batch threshold reached: current {len(batch_rows)} + next postcode {len(rows)} = {current_batch_size} > {batch_size}" - ) - try: - create_batch_and_send_to_address2uprn( - batch_rows=batch_rows, - task_id=task_id, - subtask_interface=subtask_interface, - bucket_name=bucket_name, - ) - logger.info( - f"Sent batch of {len(batch_rows)} rows to address2UPRN queue (total sent: {total_sent})" - ) - total_sent += len(batch_rows) - batch_rows = [] - except Exception as e: - logger.error( - f"Failed to send batch to address2UPRN queue: {e}", - exc_info=True, - ) - errors.append( - { - "error": "Failed to send to address2UPRN queue", - "details": str(e), - } - ) - - # Add current postcode's rows to batch - batch_rows.extend(rows) - - # Send remaining batch - if batch_rows: - try: + # If single postcode is bigger than batch_size → send directly + if group_len >= batch_size: + if buffer: create_batch_and_send_to_address2uprn( - batch_rows=batch_rows, + batch_df=pd.concat(buffer, ignore_index=True), task_id=task_id, + sub_task_id=subtask_id, subtask_interface=subtask_interface, bucket_name=bucket_name, ) - total_sent += len(batch_rows) - logger.info( - f"Sent final batch of {len(batch_rows)} rows to address2UPRN queue (total sent: {total_sent})" - ) - batch_rows = [] - except Exception as e: - logger.error( - f"Failed to send final batch to address2UPRN queue: {e}", - exc_info=True, - ) - errors.append( - { - "error": "Failed to send to address2UPRN queue", - "details": str(e), - } - ) + buffer = [] + count = 0 - except json.JSONDecodeError as e: - logger.error(f"Invalid JSON in request body: {e}") - errors.append({"error": "Invalid JSON in request body", "details": str(e)}) - # Mark subtask as failed if we have one - if subtask_id: - try: - subtask_interface.update_subtask_status( - subtask_id, "failed", outputs={"error": str(e)} + create_batch_and_send_to_address2uprn( + batch_df=group_df, + task_id=task_id, + sub_task_id=subtask_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, ) - except Exception as db_error: - logger.error(f"Failed to update subtask status: {db_error}") - except Exception as e: - logger.error(f"Unexpected error processing record: {e}", exc_info=True) - errors.append({"error": "Unexpected error", "details": str(e)}) - # Mark subtask as failed if we have one - if subtask_id: - try: - subtask_interface.update_subtask_status( - subtask_id, "failed", outputs={"error": str(e)} - ) - except Exception as db_error: - logger.error(f"Failed to update subtask status: {db_error}") + continue - # Return error if all records failed - if errors and not results: - return {"statusCode": 500, "body": json.dumps({"errors": errors})} + # If adding would exceed batch → flush first + if count + group_len > batch_size: + create_batch_and_send_to_address2uprn( + batch_df=pd.concat(buffer, ignore_index=True), + task_id=task_id, + sub_task_id=subtask_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, + ) + buffer = [] + count = 0 + + # Add group + buffer.append(group_df) + count += group_len + + # Final flush + if buffer: + create_batch_and_send_to_address2uprn( + batch_df=pd.concat(buffer, ignore_index=True), + task_id=task_id, + sub_task_id=subtask_id, + subtask_interface=subtask_interface, + bucket_name=bucket_name, + ) + + # Mark subtask as completed + subtask_interface.update_subtask_status( + subtask_id, + "completed", + outputs={"rows_processed": "todo -> show sensible output"}, + ) return { "statusCode": 200, From c1f784b87fd90e09a5af74ab1189d9f04e017f33 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 12:13:16 +0000 Subject: [PATCH 125/135] address 2uprn and postcode splitter works locally --- backend/address2UPRN/main.py | 6 ++++-- backend/postcode_splitter/main.py | 6 +----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 7fc11570..c51171e5 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -504,6 +504,8 @@ def save_results_to_s3( """ if bucket_name is None: bucket_name = os.getenv("S3_BUCKET_NAME") + if bucket_name is None: + bucket_name = "retrofit-data-dev" if not bucket_name: logger.error( @@ -544,8 +546,8 @@ def handler(event, context, local=False): "body": json.dumps( { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - "sub_task_id": "1c09df07-fd29-4de7-b146-fafb591856a9", - "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-13T15:54:58.568594_67557923.csv", + "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", + "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv", } ) } diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 6d8d1095..6cc40fc4 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -204,10 +204,6 @@ def handler(event, context, local=False): csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) - # TODO: Change the input to the file you want - # df = df.head(1983) - df = df.head(502) - logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") # Sanitise postcodes @@ -288,7 +284,7 @@ def handler(event, context, local=False): subtask_interface.update_subtask_status( subtask_id, "completed", - outputs={"rows_processed": "todo -> show sensible output"}, + outputs={"rows_processed": "completed"}, ) return { From a6c827c47fb298b31cb4e7c0a1d033033f84ecfa Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 12:30:57 +0000 Subject: [PATCH 126/135] terraform apply --- .github/workflows/deploy_terraform.yml | 6 ++-- .github/workflows/unit_tests.yml | 46 +++++++++++++------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 2fd12fe6..e7c8fb94 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -117,8 +117,7 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: address2uprn-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.address2uprn_image.outputs.image_digest }} - # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} - terraform_apply: 'true' + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} @@ -159,8 +158,7 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.postcodeSplitter_image.outputs.image_digest }} - # terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} - terraform_apply: 'true' + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 5521a481..cc6431b8 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -1,30 +1,30 @@ -# name: Run unit tests +name: Run unit tests -# on: -# pull_request: -# branches: -# - "**" +on: + pull_request: + branches: + - "**" -# jobs: -# test: -# runs-on: ubuntu-latest +jobs: + test: + runs-on: ubuntu-latest -# steps: -# - name: Checkout code -# uses: actions/checkout@v4 + steps: + - name: Checkout code + uses: actions/checkout@v4 -# - name: Set up Python 3.11 -# uses: actions/setup-python@v4 -# with: -# python-version: '3.11' + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: '3.11' -# - name: Install tox via Makefile -# run: | -# make setup + - name: Install tox via Makefile + run: | + make setup -# - name: Run tests with tox via Makefile -# env: -# EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} -# run: | -# make test \ No newline at end of file + - name: Run tests with tox via Makefile + env: + EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} + run: | + make test \ No newline at end of file From dbba066ba57e6026a86c645d2daf0077d74e64f2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 12:51:56 +0000 Subject: [PATCH 127/135] remove docker as i don't need locally working workflows anymore --- .devcontainer/backend/Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile index f48fb99f..99cd66d6 100644 --- a/.devcontainer/backend/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -3,8 +3,6 @@ FROM python:3.11.10-bullseye ARG USER=vscode ARG DEBIAN_FRONTEND=noninteractive -ARG DOCKER_GID=1003 - # 1) Toolchain + utilities for building libpostal RUN apt-get update && apt-get install -y --no-install-recommends \ From 62a8f543f60f4548f2376886337d1a46053947e5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 13:04:27 +0000 Subject: [PATCH 128/135] get rid of comments --- backend/address2UPRN/main.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index c51171e5..6ca2fd5c 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -3,7 +3,6 @@ import os from urllib.parse import urlencode import pandas as pd from difflib import SequenceMatcher -from tqdm import tqdm from utils.logger import setup_logger import re from typing import Set @@ -334,22 +333,10 @@ def get_uprn_candidates( def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, - verbose=False, ): """ Return uprn (str) using a pre-fetched EPC dataframe. This avoids calling the API multiple times for the same postcode. - - Args: - user_inputed_address: The user's address string - epc_df: Pre-fetched EPC data for the postcode - return_address: Whether to return the matched address - return_EPC: Whether to return the EPC rating - return_score: Whether to return the lexiscore - - Returns: - uprn (str), or tuple if return_address/return_EPC/return_score are True - Returns None if no match found, lexiscore < 0.7, or UPRN is empty """ if epc_df.empty: return None From ed8d5629170ab328c7bed6d5b249916a839e91db Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 13:49:49 +0000 Subject: [PATCH 129/135] added logger and verbose --- backend/address2UPRN/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 6ca2fd5c..73fe7c7d 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -333,6 +333,7 @@ def get_uprn_candidates( def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, + verbose: bool = False, ): """ Return uprn (str) using a pre-fetched EPC dataframe. @@ -363,7 +364,7 @@ def get_uprn_with_epc_df( address = top_rank_df["address"].values[0] score = float(top_rank_df["lexiscore"].values[0]) - # logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}") + logger.info(f"Address found to be: {address}, with lexiscore {score}") # Safe to return the agreed UPRN found_uprn = top_rank_df.iloc[0]["uprn"] @@ -379,7 +380,7 @@ def get_uprn_with_epc_df( def get_uprn( user_inputed_address: str, postcode: str, - verbose=False, + verbose: bool = False, ): """ Return uprn (str) From 61377497ff5405a7af0cd1414e5a8c71eb32dadc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 14:07:23 +0000 Subject: [PATCH 130/135] get rid of unneccsary variable declartion --- backend/address2UPRN/main.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 73fe7c7d..a067593e 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -646,9 +646,7 @@ def handler(event, context, local=False): logger.info(f"Total postcodes: {len(postcode_to_addresses)}") # Process each postcode group - postcodes_processed = 0 - addresses_processed = 0 - uprns_found = 0 + results_data = [] for postcode, postcode_rows in postcode_to_addresses.items(): @@ -691,7 +689,6 @@ def handler(event, context, local=False): # Parse result tuple if successful if result: uprn, found_address, score = result - uprns_found += 1 logger.info( f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})" ) @@ -717,8 +714,6 @@ def handler(event, context, local=False): } ) - addresses_processed += 1 - except Exception as e: logger.error( f"Error processing address {row.get('user_input', 'unknown')}: {e}" @@ -735,8 +730,6 @@ def handler(event, context, local=False): ) continue - postcodes_processed += 1 - # Create results DataFrame result_df = pd.DataFrame(results_data) From 4ca538ecb2efe27128ac2460966ff962bedd950c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 14:12:09 +0000 Subject: [PATCH 131/135] added commnets on script --- backend/address2UPRN/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/address2UPRN/script.py b/backend/address2UPRN/script.py index 59855dbc..090ac5ae 100644 --- a/backend/address2UPRN/script.py +++ b/backend/address2UPRN/script.py @@ -1,3 +1,5 @@ +# one time script for a customer forhousing + import pandas as pd from tqdm import tqdm from backend.address2UPRN.main import get_uprn From 0a87ba786c61a089fba8f22533727813128960f8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 14:14:01 +0000 Subject: [PATCH 132/135] local run stuff --- backend/address2UPRN/main.py | 2 -- backend/postcode_splitter/main.py | 9 --------- 2 files changed, 11 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index a067593e..af29a095 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -492,8 +492,6 @@ def save_results_to_s3( """ if bucket_name is None: bucket_name = os.getenv("S3_BUCKET_NAME") - if bucket_name is None: - bucket_name = "retrofit-data-dev" if not bucket_name: logger.error( diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 6cc40fc4..70ecf5f1 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -23,15 +23,6 @@ def upload_batch_to_s3( ) -> str: """ Upload batch DataFrame to S3 as CSV. - - Args: - batch_df: The DataFrame containing batch data - task_id: The parent task ID (used for file path) - sub_task_id: The subtask ID (used for file path) - bucket_name: The S3 bucket name (defaults to env variable) - - Returns: - S3 URI (s3://bucket/key) of the uploaded file """ if bucket_name is None: bucket_name = os.getenv("S3_BUCKET_NAME") From 12b99669822b72f54a09901c804372044255ffce Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 14:16:57 +0000 Subject: [PATCH 133/135] send message to address2uprn --- backend/postcode_splitter/main.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 70ecf5f1..4f63ed4b 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -101,14 +101,6 @@ def create_batch_and_send_to_address2uprn( """ Create a batch DataFrame, upload to S3, create subtask, and send to address2UPRN queue. - Args: - batch_rows: List of row dictionaries for this batch - task_id: The parent task ID - subtask_interface: SubTaskInterface instance - bucket_name: S3 bucket name - - Returns: - The created batch subtask ID """ # Upload batch to S3 @@ -125,12 +117,12 @@ def create_batch_and_send_to_address2uprn( logger.info(f"Created batch subtask {created_batch_sub_task_id}") - # # Send message with S3 reference - # send_to_address2uprn_queue( - # task_id=str(task_id), - # sub_task_id=str(created_batch_sub_task_id), - # s3_uri=s3_uri, - # ) + # Send message with S3 reference + send_to_address2uprn_queue( + task_id=str(task_id), + sub_task_id=str(created_batch_sub_task_id), + s3_uri=s3_uri, + ) return created_batch_sub_task_id From 9f6d61b178d6ef6c8e6902d0dc4032117c94a818 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 14:21:44 +0000 Subject: [PATCH 134/135] get rid of todo --- infrastructure/terraform/lambda/address2UPRN/main.tf | 2 +- infrastructure/terraform/lambda/postcodeSplitter/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index 5f0c4a11..5a36153e 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -2,7 +2,7 @@ data "terraform_remote_state" "shared" { backend = "s3" config = { bucket = "assessment-model-terraform-state" - key = "env:/${var.stage}/terraform.tfstate" # TODO: dont hardcode this + key = "env:/${var.stage}/terraform.tfstate" region = "eu-west-2" } } diff --git a/infrastructure/terraform/lambda/postcodeSplitter/main.tf b/infrastructure/terraform/lambda/postcodeSplitter/main.tf index e17d272d..d37a01c9 100644 --- a/infrastructure/terraform/lambda/postcodeSplitter/main.tf +++ b/infrastructure/terraform/lambda/postcodeSplitter/main.tf @@ -2,7 +2,7 @@ data "terraform_remote_state" "shared" { backend = "s3" config = { bucket = "assessment-model-terraform-state" - key = "env:/${var.stage}/terraform.tfstate" # TODO: dont hardcode this + key = "env:/${var.stage}/terraform.tfstate" region = "eu-west-2" } } From 42cac343576a4cf1f0bb2c02df145dd8e53ed293 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 16 Feb 2026 15:50:01 +0000 Subject: [PATCH 135/135] only run on branches it was told to --- .github/workflows/deploy_terraform.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index e7c8fb94..6280abcd 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -77,10 +77,10 @@ jobs: run: terraform plan -var-file=${STAGE}.tfvars -out=tfplan - name: Terraform Apply - if: env.STAGE == 'prod' + if: env.TERRAFORM_APPLY == 'true' working-directory: infrastructure/terraform/shared run: terraform apply -auto-approve tfplan - + # ============================================================ # 2️⃣ Build Address 2 UPRN image and Push # ============================================================