add this in a sensible branch

2026-06-08 11:17:27 +00:00 · 2026-03-17 12:37:50 +00:00 · 2026-03-17 12:37:50 +00:00 · 1b53b47048
commit 1b53b47048
parent ad189b4cac
2 changed files with 14 additions and 9 deletions
--- a/backend/address2UPRN/README.md
+++ b/backend/address2UPRN/README.md
@ -5,10 +5,11 @@ Before you run:

 Step 1) Get the list and ensure the following columns exist

+Column names are case-sensitive (the code reads the `Postcode` column by that exact name), so match the capitalisation below:
 * Address 1
 * Address 2
 * Address 3
-* postcode
+* Postcode

 And save it as a .csv file

@ -23,16 +24,17 @@ For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Cal

 Go to Ara DB and make a new task_id with a randomly generated uuid as the primary key

-task_id = a7b70a02-4df4-45b5-a50b-196e095910bb
-sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e
+task_id = 169ea9b0-01b5-48dc-9f90-ae1989491d09 
+sub_task_id = e5704f9e-29fe-43c8-8913-05be09f2440f
+s3 => s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv

 Step 3) Alright, now let's make the input for postcode-splitter sqs to get the ball rolling
 postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev

 {
-    "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb",
-    "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e",
-    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
+    "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
+    "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
+    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching (1)(Sheet1).csv"
 }
 Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv

--- a/backend/address2UPRN/main.py
+++ b/backend/address2UPRN/main.py
@ -351,9 +351,9 @@ def handler(event, context, local=False):
                {
                    "body": json.dumps(
                        {
-                            "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
-                            "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
-                            "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
+                            "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
+                            "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
+                            "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv",
                        }
                    )
                }
@ -441,6 +441,9 @@ def handler(event, context, local=False):
            # Process the rows
            logger.info(f"Processing {len(df)} rows for task {task_id}")

+            df["postcode_clean"] = (
+                df["Postcode"].astype(str).str.upper().str.strip().str.replace(" ", "")
+            )
            clean_df = df.dropna(subset=["postcode_clean"])

            postcode_to_addresses = {