diff --git a/.idea/Model.iml b/.idea/Model.iml
index 4413bb06..b0f9c00d 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 6f308057..1122b380 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
-
+
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
new file mode 100644
index 00000000..1a84f1d4
--- /dev/null
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -0,0 +1,75 @@
+import numpy as np
+import pandas as pd
+
+
+def _build_full_address(asset_list):
+    """
+    Join the address parts of each row into one comma-separated string.
+
+    Rows with no address line 2 leave that part out, otherwise the missing value would
+    propagate through the concatenation and blank out the whole address. Any other missing
+    part still yields a null address, so that the caller can detect incomplete rows.
+
+    :param asset_list: frame with address1, address2, city_town, county and postcode columns
+    :return: array of full addresses, one per row of asset_list
+    """
+    # Town names are title-cased; the other parts are used as they appear in the sheet
+    town = asset_list["city_town"].str.title()
+    address_tail = town + ", " + asset_list["county"] + ", " + asset_list["postcode"]
+    return np.where(
+        asset_list["address2"].notna(),
+        # Address line 2 present: include it after address1
+        asset_list["address1"] + ", " + asset_list["address2"] + ", " + address_tail,
+        # Address line 2 missing: go straight from address1 to the town
+        asset_list["address1"] + ", " + address_tail,
+    )
+
+
+def app():
+    """
+    Prepare the Stonewater asset list as input data for clustering.
+
+    This script handles the preparation of the data from Stonewater, to archetype a collection
+    of 5.3k properties and reduce that down to a representative set of 450 properties.
+
+    The triage workbook is read, cut down to the identifier and address columns, renamed to
+    our internal column names, and given a single ``full_address`` column.
+
+    NOTE(review): the prepared frame is neither returned nor written anywhere yet - presumably
+    the clustering step is still to be added; confirm with the author.
+
+    :return: None
+    :raises ValueError: if any row ends up without a full address
+    """
+
+    # TODO: Temp read from local machine - move to s3
+    # header=4 takes the column names from row 4 (0-indexed) of the sheet
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
+    )
+
+    # Drop the bottom 4 rows, which are completely missing
+    # NOTE(review): assumes the sheet always ends with exactly 4 empty rows - confirm
+    asset_list = asset_list.head(-4)
+
+    # Keep just the columns we're interested in, mapped to our internal names
+    column_names = {
+        "Osm. ID": "internal_id",
+        "Org. ref.": "customer_asset_id",
+        "Postcode": "postcode",
+        "House no": "house_number",
+        "Name": "address1",
+        "Address line 2": "address2",
+        "City/Town": "city_town",
+        "County": "county",
+        "Address ID": "external_address_id",  # This is not uprn
+    }
+    asset_list = asset_list[list(column_names)].rename(columns=column_names)
+
+    # Create full address (rows with a null address line 2 are handled by the helper)
+    asset_list["full_address"] = _build_full_address(asset_list)
+
+    # A null address here means one of the other parts (address1, town, county, postcode)
+    # was missing for that row; fail loudly rather than pass incomplete addresses on.
+    if pd.isnull(asset_list["full_address"]).any():
+        raise ValueError("Missing full addresses")