From 5b9a36d6d28981b030e7f63d4652318ae811b26c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 13 Jun 2024 11:28:12 +0100
Subject: [PATCH] finished stonewater

---
 backend/SearchEpc.py                          | 21 ++--
 etl/customers/stonewater/shdf_3_clustering.py | 97 ++++++++++++++-----
 2 files changed, 89 insertions(+), 29 deletions(-)

diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index 275669cc..37c2b7f9 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -482,15 +482,22 @@ class SearchEpc:
                 if lmks_to_drop is not None:
                     epc_data = epc_data[~epc_data["lmk-key"].isin(lmks_to_drop)]
 
-                if not epc_data.empty:
-                    # Further processing of the EPC data
+                try:
+                    epc_data['lodgement-datetime'] = pd.to_datetime(
+                        epc_data['lodgement-datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
+                    )
+                except Exception as e:
+                    logger.error("Problem formatting lodgement-datime, appling fallback: " + str(e))
                     epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], errors='coerce')
 
-                    if exclude_old:
-                        # Exclude EPC data older than 10 years
-                        epc_data = epc_data[
-                            epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
-                            ]
+                if exclude_old:
+                    # Exclude EPC data older than 10 years
+                    epc_data = epc_data[
+                        epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
+                        ]
+
+                if not epc_data.empty:
+                    # Further processing of the EPC data
 
                     epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
                     epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
index c853fa94..5129dfb1 100644
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -11,7 +11,8 @@ from fuzzywuzzy import fuzz
 import numpy as np
 import pandas as pd
 import time
-from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet
+from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
+    save_dataframe_to_s3_parquet, save_pickle_to_s3
 
 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@@ -1360,7 +1361,10 @@ def compile_data_final():
                 )
                 p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
 
-            searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
+            if not p_os_df.empty:
+                searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
+            else:
+                searcher.ordnance_survey_client.property_type = ""
             # Now we estimate
             searcher.newest_epc = searcher.estimate_epc(
                 property_type=searcher.ordnance_survey_client.property_type,
@@ -1395,20 +1399,19 @@ def compile_data_final():
 
         if searcher.older_epcs is not None:
             older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
-
     # Store in S3
     # TODO - read in instead of running
-    # save_data_to_s3(
-    #     data=json.dumps(epc_data_batch_2),
-    #     s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.json",
-    #     bucket_name="retrofit-data-dev"
-    # )
-    # 
-    # save_data_to_s3(
-    #     data=json.dumps(older_epcs_batch_2),
-    #     s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.json",
-    #     bucket_name="retrofit-data-dev"
-    # )
+    save_pickle_to_s3(
+        data=epc_data_batch_2,
+        s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
+        bucket_name="retrofit-data-dev"
+    )
+
+    save_pickle_to_s3(
+        data=older_epcs_batch_2,
+        s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
+        bucket_name="retrofit-data-dev"
+    )
 
     epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
     complete_epcs = pd.concat([epc_data, epc_data_batch_2])
@@ -1439,15 +1442,15 @@ def compile_data_final():
         spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
         spatial_data_to_uprn.append(spatial_df)
 
-    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
-
     # TODO: Let's store this in s3
-    # save_data_to_s3(
-    #     data=json.dumps(spatial_data_to_uprn.to_dict("records")),
-    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
+    # save_pickle_to_s3(
+    #     data=spatial_data_to_uprn,
+    #     s3_file_name="customers/Stonewater/clustering/spatial_data_to_uprn.pkl",
     #     bucket_name="retrofit-data-dev"
     # )
 
+    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+
     spatial_data_to_uprn = spatial_data_to_uprn.drop(
         columns=["partition", "filename"]
     ).rename(columns={"UPRN": "uprn"})
@@ -1455,10 +1458,16 @@ def compile_data_final():
 
     property_attributes = complete_epcs.merge(
         spatial_data_to_uprn,
-        how="left",
+        how="inner",
         on="uprn"
     )
 
+    property_attributes = property_attributes.merge(
+        asset_list[["internal_id", "owner", "match_type"]], how="left", on="internal_id"
+    )
+
+    # TODO: Add on data from the asset list such as ownership
+
     # We drop the columns we don't care about for clustering
     property_attributes = property_attributes.drop(
         columns=[
@@ -1502,7 +1511,7 @@ def compile_data_final():
 
     # Fields to transform: lodgement-datetime
     property_attributes["days_since_last_epc"] = (
-        datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"])
+        datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"], errors="coerce")
     ).dt.days
 
     property_attributes = property_attributes.drop(columns=["lodgement-datetime"])
@@ -1561,6 +1570,7 @@ def compile_data_final():
         "mainheatc-env-eff": "N",
         "floor-level": "NODATA!",
         "hot-water-energy-eff": "N/A",
+        "glazed-type": "unknown"
     }
 
     # Consolidation columns to single value
@@ -1608,6 +1618,19 @@ def compile_data_final():
 
     property_attributes["estimated"] = property_attributes["estimated"].fillna(False)
     property_attributes["conservation_status"] = property_attributes["conservation_status"].fillna(False)
+    property_attributes["days_since_last_epc"] = property_attributes["days_since_last_epc"].fillna(
+        property_attributes["days_since_last_epc"].mean()
+    )
+
+    missings = pd.isnull(property_attributes).sum()
+    missings = missings[missings > 0]
+
+    # Save this
+    # save_pickle_to_s3(
+    #     data=property_attributes,
+    #     bucket_name="retrofit-data-dev",
+    #     s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
+    # )
 
     # CLUSTERING!!
 
@@ -1680,7 +1703,7 @@ def compile_data_final():
     )
 
     pipeline = Pipeline(steps=[('preprocessor', preprocessor),
-                               ('kmeans', KMeans(n_clusters=10, random_state=0))])
+                               ('kmeans', KMeans(n_clusters=450, random_state=0))])
 
     # Fit the pipeline to the data
     pipeline.fit(property_attributes)
@@ -1718,6 +1741,36 @@ def compile_data_final():
     # Sorting to verify
     property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
 
+    ################################################
+    # Prepare outputs!!!!
+    ################################################
+    property_attributes.reset_index(inplace=True)
+    property_attributes["archetype_representative"] = property_attributes["rank"] == 1
+
+    asset_list_with_archetypes = asset_list.merge(
+        property_attributes[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
+        on="internal_id"
+    )
+
+    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].fillna(-999)
+    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].astype(int).astype(str)
+    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].replace("-999", "NO ARCHETYPE")
+
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
+
+    asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
+        "archetype_representative"].fillna(False)
+
+    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
+
+    stonewater_uprn_lookup = asset_list_with_archetypes[
+        ["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
+    ]
+
+    stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
+
     ################################################
     # Agglomertive Clustering
     ################################################