set up basic data preparation process

2026-07-27 23:35:01 +00:00 · 2024-06-06 11:18:35 +01:00 · 2024-06-06 11:18:35 +01:00 · 461cdd2367
commit 461cdd2367
parent 9217ef67f4
3 changed files with 77 additions and 2 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -0,0 +1,75 @@
+import numpy as np
+import pandas as pd
+
+
+def app():
+    """
+    Prepare the Stonewater asset list for clustering (archetyping ~5.3k properties down to a
+    representative set of 450): load the triage spreadsheet, keep and rename the address
+    columns, and build a ``full_address`` string for every row.
+    :return: None - nothing is returned by the code shown here
+    :raises ValueError: if any row ends up with a null ``full_address``
+    """
+
+    # TODO: Temp read from a hard-coded path on a local machine - move to s3
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
+    )  # header=4: the column labels are taken from the row at 0-based index 4
+
+    # Drop the bottom 4 rows, which are completely missing (head(-4) keeps all but the last 4)
+    asset_list = asset_list.head(-4)
+
+    # Keep just the columns we're interested in, then rename them to snake_case names
+    asset_list = asset_list[
+        [
+            "Osm. ID",
+            "Org. ref.",
+            "Postcode",
+            "House no",
+            "Name",
+            "Address line 2",
+            "City/Town",
+            "County",
+            "Address ID",  # This is not uprn
+        ]
+    ].rename(
+        columns={
+            "Osm. ID": "internal_id",
+            "Org. ref.": "customer_asset_id",
+            "Postcode": "postcode",
+            "House no": "house_number",
+            "Name": "address1",
+            "Address line 2": "address2",
+            "City/Town": "city_town",
+            "County": "county",
+            "Address ID": "external_address_id",
+        }
+    )
+
+    # Create full address. NOTE(review): this first version is overwritten by the np.where below.
+    # TODO: handle cases where one of these is null (presumably a null part nulls the whole string)
+    asset_list["full_address"] = (
+        asset_list["address1"] + ", " +
+        asset_list["address2"] + ", " +
+        asset_list["city_town"] + ", " +
+        asset_list["county"] + ", " +
+        asset_list["postcode"]
+    )
+
+    asset_list["full_address"] = np.where(
+        ~pd.isnull(asset_list["address2"]),  # rows that do have an address2
+        (
+            asset_list["address1"] + ", " +
+            asset_list["address2"] + ", " +
+            asset_list["city_town"].str.title() + ", " +
+            asset_list["county"] + ", " +
+            asset_list["postcode"]
+        ),
+        asset_list["address1"] + ", " +  # no address2: leave it out of the address
+        asset_list["city_town"].str.title() + ", " +
+        asset_list["county"] + ", " +
+        asset_list["postcode"]  # NOTE(review): house_number is never included - confirm intended
+    )
+
+    if pd.isnull(asset_list["full_address"]).sum():  # number of null addresses; non-zero is truthy
+        raise ValueError("Missing full addresses")