From 461cdd23674eb556c2e072ba55030068f8dbaacb Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 6 Jun 2024 11:18:35 +0100
Subject: [PATCH] set up basic data preparation process

---
 .idea/Model.iml                               |  2 +-
 .idea/misc.xml                                |  2 +-
 etl/customers/stonewater/shdf_3_clustering.py | 75 +++++++++++++++++++
 3 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 etl/customers/stonewater/shdf_3_clustering.py
diff --git a/.idea/Model.iml b/.idea/Model.iml
index 4413bb06..b0f9c00d 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 6f308057..1122b380 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
new file mode 100644
index 00000000..1a84f1d4
--- /dev/null
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -0,0 +1,75 @@
+import numpy as np
+import pandas as pd
+
+
+def app(
+    source_path: str = "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+) -> pd.DataFrame:
+    """
+    Prepare the Stonewater asset list as input for clustering.
+
+    This script handles the preparation of the data from Stonewater, to archetype a collection
+    of 5.3k properties and reduce that down to a representative set of 450 properties.
+    Here, we only load the workbook, keep the id and address columns and build a full address.
+
+    :param source_path: location of the Stonewater triage workbook. The default is the
+        path that was previously hard-coded, so calling ``app()`` still behaves as before.
+    :return: the asset list with renamed columns plus a ``full_address`` column
+    :raises ValueError: if any row ends up without a full address, i.e. one of its
+        components other than address2 is missing
+    """
+
+    # Source column name -> name used from here on. Also fixes the output column order.
+    column_names = {
+        "Osm. ID": "internal_id",
+        "Org. ref.": "customer_asset_id",
+        "Postcode": "postcode",
+        "House no": "house_number",
+        "Name": "address1",
+        "Address line 2": "address2",
+        "City/Town": "city_town",
+        "County": "county",
+        "Address ID": "external_address_id",  # This is not uprn
+    }
+
+    # TODO: Temp read from local machine - move to s3
+    # header=4: the column headings are on the fifth row of the sheet
+    asset_list = pd.read_excel(source_path, header=4)
+
+    # Drop the bottom 4 rows, which are completely missing
+    asset_list = asset_list.head(-4)
+
+    # Keep just the columns we're interested in
+    asset_list = asset_list[list(column_names)].rename(columns=column_names)
+
+    # Create full address
+    asset_list["full_address"] = _build_full_address(asset_list)
+
+    if asset_list["full_address"].isna().sum():
+        raise ValueError("Missing full addresses")
+
+    return asset_list
+
+
+def _build_full_address(asset_list: pd.DataFrame) -> np.ndarray:
+    """
+    Join the address components of every row into one comma separated string.
+
+    A missing address2 is left out rather than blanking the whole address. Any other
+    missing component still gives a null result, which the caller checks for.
+
+    :param asset_list: frame with address1, address2, city_town, county and postcode
+    :return: one full address (or null) per row, in row order
+    """
+    # TODO: handle cases where one of the other components is null
+    address_start = asset_list["address1"] + ", "
+    address_end = (
+        asset_list["city_town"].str.title() + ", " +
+        asset_list["county"] + ", " +
+        asset_list["postcode"]
+    )
+    return np.where(
+        asset_list["address2"].notna(),
+        address_start + asset_list["address2"] + ", " + address_end,
+        address_start + address_end,
+    )