finished stonewater

2026-07-27 23:35:01 +00:00 · 2024-06-13 11:28:12 +01:00 · 2024-06-13 11:28:12 +01:00 · 5b9a36d6d2
commit 5b9a36d6d2
parent 496ae8c969
2 changed files with 89 additions and 29 deletions
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@ -482,15 +482,22 @@ class SearchEpc:
                if lmks_to_drop is not None:
                    epc_data = epc_data[~epc_data["lmk-key"].isin(lmks_to_drop)]

-                if not epc_data.empty:
-                    # Further processing of the EPC data
+                try:
+                    epc_data['lodgement-datetime'] = pd.to_datetime(
+                        epc_data['lodgement-datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
+                    )
+                except Exception as e:
+                    logger.error("Problem formatting lodgement-datime, appling fallback: " + str(e))
                    epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], errors='coerce')

-                    if exclude_old:
-                        # Exclude EPC data older than 10 years
-                        epc_data = epc_data[
-                            epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
-                            ]
+                if exclude_old:
+                    # Exclude EPC data older than 10 years
+                    epc_data = epc_data[
+                        epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
+                        ]
+
+                if not epc_data.empty:
+                    # Further processing of the EPC data

                    epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
                    epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -11,7 +11,8 @@ from fuzzywuzzy import fuzz
 import numpy as np
 import pandas as pd
 import time
-from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet
+from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
+    save_dataframe_to_s3_parquet, save_pickle_to_s3

 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -1360,7 +1361,10 @@ def compile_data_final():
                )
                p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]

-            searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
+            if not p_os_df.empty:
+                searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
+            else:
+                searcher.ordnance_survey_client.property_type = ""
            # Now we estimate
            searcher.newest_epc = searcher.estimate_epc(
                property_type=searcher.ordnance_survey_client.property_type,
@ -1395,20 +1399,19 @@ def compile_data_final():

        if searcher.older_epcs is not None:
            older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
-
    # Store in S3
    # TODO - read in instead of running
-    # save_data_to_s3(
-    #     data=json.dumps(epc_data_batch_2),
-    #     s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.json",
-    #     bucket_name="retrofit-data-dev"
-    # )
-    # 
-    # save_data_to_s3(
-    #     data=json.dumps(older_epcs_batch_2),
-    #     s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.json",
-    #     bucket_name="retrofit-data-dev"
-    # )
+    save_pickle_to_s3(
+        data=epc_data_batch_2,
+        s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
+        bucket_name="retrofit-data-dev"
+    )
+
+    save_pickle_to_s3(
+        data=older_epcs_batch_2,
+        s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
+        bucket_name="retrofit-data-dev"
+    )

    epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
    complete_epcs = pd.concat([epc_data, epc_data_batch_2])
@ -1439,15 +1442,15 @@ def compile_data_final():
        spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
        spatial_data_to_uprn.append(spatial_df)

-    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
-
    # TODO: Let's store this in s3
-    # save_data_to_s3(
-    #     data=json.dumps(spatial_data_to_uprn.to_dict("records")),
-    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
+    # save_pickle_to_s3(
+    #     data=spatial_data_to_uprn,
+    #     s3_file_name="customers/Stonewater/clustering/spatial_data_to_uprn.pkl",
    #     bucket_name="retrofit-data-dev"
    # )

+    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+
    spatial_data_to_uprn = spatial_data_to_uprn.drop(
        columns=["partition", "filename"]
    ).rename(columns={"UPRN": "uprn"})
@ -1455,10 +1458,16 @@ def compile_data_final():

    property_attributes = complete_epcs.merge(
        spatial_data_to_uprn,
-        how="left",
+        how="inner",
        on="uprn"
    )

+    property_attributes = property_attributes.merge(
+        asset_list[["internal_id", "owner", "match_type"]], how="left", on="internal_id"
+    )
+
+    # TODO: Add on data from the asset list such as ownership
+
    # We drop the columns we don't care about for clustering
    property_attributes = property_attributes.drop(
        columns=[
@ -1502,7 +1511,7 @@ def compile_data_final():

    # Fields to transform: lodgement-datetime
    property_attributes["days_since_last_epc"] = (
-        datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"])
+        datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"], errors="coerce")
    ).dt.days

    property_attributes = property_attributes.drop(columns=["lodgement-datetime"])
@ -1561,6 +1570,7 @@ def compile_data_final():
        "mainheatc-env-eff": "N",
        "floor-level": "NODATA!",
        "hot-water-energy-eff": "N/A",
+        "glazed-type": "unknown"
    }

    # Consolidation columns to single value
@ -1608,6 +1618,19 @@ def compile_data_final():

    property_attributes["estimated"] = property_attributes["estimated"].fillna(False)
    property_attributes["conservation_status"] = property_attributes["conservation_status"].fillna(False)
+    property_attributes["days_since_last_epc"] = property_attributes["days_since_last_epc"].fillna(
+        property_attributes["days_since_last_epc"].mean()
+    )
+
+    missings = pd.isnull(property_attributes).sum()
+    missings = missings[missings > 0]
+
+    # Save this
+    # save_pickle_to_s3(
+    #     data=property_attributes,
+    #     bucket_name="retrofit-data-dev",
+    #     s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
+    # )

    # CLUSTERING!!

@ -1680,7 +1703,7 @@ def compile_data_final():
    )

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
-                               ('kmeans', KMeans(n_clusters=10, random_state=0))])
+                               ('kmeans', KMeans(n_clusters=450, random_state=0))])

    # Fit the pipeline to the data
    pipeline.fit(property_attributes)
@ -1718,6 +1741,36 @@ def compile_data_final():
    # Sorting to verify
    property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)

+    ################################################
+    # Prepare outputs!!!!
+    ################################################
+    property_attributes.reset_index(inplace=True)
+    property_attributes["archetype_representative"] = property_attributes["rank"] == 1
+
+    asset_list_with_archetypes = asset_list.merge(
+        property_attributes[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
+        on="internal_id"
+    )
+
+    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].fillna(-999)
+    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].astype(int).astype(str)
+    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].replace("-999", "NO ARCHETYPE")
+
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
+
+    asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
+        "archetype_representative"].fillna(False)
+
+    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
+
+    stonewater_uprn_lookup = asset_list_with_archetypes[
+        ["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
+    ]
+
+    stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
+
    ################################################
     # Agglomerative Clustering
    ################################################