From 1b53b47048500ef30142714c13211f5f740f43a1 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 17 Mar 2026 12:37:50 +0000
Subject: [PATCH] add this in a sensible branch

---
 backend/address2UPRN/README.md | 14 ++++++++------
 backend/address2UPRN/main.py   |  9 ++++++---
 2 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md
index 6d26f281..e34e45f6 100644
--- a/backend/address2UPRN/README.md
+++ b/backend/address2UPRN/README.md
@@ -5,10 +5,11 @@ Before you run:
 
 Step 1) Get the list and ensure the following columns exists
 
+Column names are case-sensitive (the code reads `Postcode` exactly), so match these:
 * Address 1
 * Address 2
 * Address 3
-* postcode
+* Postcode
 
 And save it as a .csv file
 
@@ -23,16 +24,17 @@ For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Cal
 
 Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key
 
-task_id = a7b70a02-4df4-45b5-a50b-196e095910bb
-sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e
+task_id = 169ea9b0-01b5-48dc-9f90-ae1989491d09
+sub_task_id = e5704f9e-29fe-43c8-8913-05be09f2440f
+s3 => s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv
 
 Step 3) Alright, now lets make the input for postcode-splitter sqs to get the ball rolling
 postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev
 
 {
-    "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb",
-    "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e",
-    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
+    "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
+    "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
+    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv"
 }
 Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
 
diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py
index d0ba36e6..c458e40d 100644
--- a/backend/address2UPRN/main.py
+++ b/backend/address2UPRN/main.py
@@ -351,9 +351,9 @@ def handler(event, context, local=False):
                 {
                     "body": json.dumps(
                         {
-                            "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
-                            "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
-                            "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
+                            "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
+                            "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
+                            "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv",
                         }
                     )
                 }
@@ -441,6 +441,9 @@ def handler(event, context, local=False):
             # Process the rows
             logger.info(f"Processing {len(df)} rows for task {task_id}")
 
+            df["postcode_clean"] = (
+                df["Postcode"].astype(str).str.upper().str.strip().str.replace(" ", "")
+            )
             clean_df = df.dropna(subset=["postcode_clean"])
 
             postcode_to_addresses = {