From 1b53b47048500ef30142714c13211f5f740f43a1 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 17 Mar 2026 12:37:50 +0000 Subject: [PATCH] add this in a sensible branch --- backend/address2UPRN/README.md | 14 ++++++++------ backend/address2UPRN/main.py | 9 ++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md index 6d26f281..e34e45f6 100644 --- a/backend/address2UPRN/README.md +++ b/backend/address2UPRN/README.md @@ -5,10 +5,11 @@ Before you run: Step 1) Get the list and ensure the following columns exists +Column names are case-sensitive (main.py reads the `Postcode` column by its exact name), so match the capitalisation below exactly: * Address 1 * Address 2 * Address 3 -* postcode +* Postcode And save it as a .csv file @@ -23,16 +24,17 @@ For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Cal Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key -task_id = a7b70a02-4df4-45b5-a50b-196e095910bb -sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e +task_id = 169ea9b0-01b5-48dc-9f90-ae1989491d09 +sub_task_id = e5704f9e-29fe-43c8-8913-05be09f2440f +s3 => s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv Step 3) Alright, now lets make the input for postcode-splitter sqs to get the ball rolling postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev { - "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb", - "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e", - "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv" + "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09", + "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching (1)(Sheet1).csv" } Each batch of csv should be saved in 
retrofit-data-dev/ara_postcode_splitter_batches///.csv diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index d0ba36e6..c458e40d 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -351,9 +351,9 @@ def handler(event, context, local=False): { "body": json.dumps( { - "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", - "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv", + "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09", + "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv", } ) } @@ -441,6 +441,9 @@ def handler(event, context, local=False): # Process the rows logger.info(f"Processing {len(df)} rows for task {task_id}") + df["postcode_clean"] = ( + df["Postcode"].astype(str).str.upper().str.strip().str.replace(" ", "") + ) clean_df = df.dropna(subset=["postcode_clean"]) postcode_to_addresses = {