import numpy as np
import pandas as pd

# TODO: Temp read from local machine - move to s3
_DEFAULT_ASSET_LIST_PATH = (
    "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx"
)

# Zero-based row of the workbook that holds the column headers.
_HEADER_ROW = 4

# Number of rows at the bottom of the sheet that are completely missing.
_TRAILING_EMPTY_ROWS = 4

# Spreadsheet column -> internal column name. Only these columns are kept,
# in this order.
_COLUMN_RENAMES = {
    "Osm. ID": "internal_id",
    "Org. ref.": "customer_asset_id",
    "Postcode": "postcode",
    "House no": "house_number",
    "Name": "address1",
    "Address line 2": "address2",
    "City/Town": "city_town",
    "County": "county",
    "Address ID": "external_address_id",  # This is not uprn
}


def _join_with_commas(parts):
    """Concatenate the given Series element-wise, separated by ", "."""
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + ", " + part
    return joined


def _build_full_address(asset_list):
    """Return an array of full addresses, leaving out address2 where it is null."""
    address1 = asset_list["address1"]
    address2 = asset_list["address2"]
    # City/town is title-cased before it goes into the full address.
    tail = [asset_list["city_town"].str.title(), asset_list["county"], asset_list["postcode"]]

    # Any other null component makes the concatenation null for that row;
    # the caller turns that into an error.
    return np.where(
        ~pd.isnull(address2),
        _join_with_commas([address1, address2, *tail]),
        _join_with_commas([address1, *tail]),
    )


def _prepare_asset_list(raw_asset_list):
    """Trim, select, rename and enrich the raw asset list; raise if an address is missing."""
    # Drop the bottom rows, which are completely missing
    asset_list = raw_asset_list.head(-_TRAILING_EMPTY_ROWS)

    # Keep just the columns we're interested in, under our own names
    asset_list = asset_list[list(_COLUMN_RENAMES)].rename(columns=_COLUMN_RENAMES)

    asset_list["full_address"] = _build_full_address(asset_list)

    if pd.isnull(asset_list["full_address"]).any():
        raise ValueError("Missing full addresses")

    return asset_list


def app(asset_list_path: str = _DEFAULT_ASSET_LIST_PATH) -> pd.DataFrame:
    """
    This script handles the preparation of the data from Stonewater, to archetype a collection
    of 5.3k properties and reduce that down to a representative set of 450 properties.

    Here, we prepare the input data for clustering: the asset list workbook is read, the
    empty rows at the bottom are dropped, the columns of interest are kept and renamed, and
    a ``full_address`` column is built from the address components.

    :param asset_list_path: Location of the asset list workbook. Defaults to the temporary
        local path used so far.
    :return: The prepared asset list, one row per property, including ``full_address``.
    :raises ValueError: If a full address could not be built for at least one row, i.e. a
        component other than ``address2`` is null.
    """
    raw_asset_list = pd.read_excel(asset_list_path, header=_HEADER_ROW)
    return _prepare_asset_list(raw_asset_list)