From 5b9a36d6d28981b030e7f63d4652318ae811b26c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Jun 2024 11:28:12 +0100 Subject: [PATCH] finished stonewater --- backend/SearchEpc.py | 21 ++-- etl/customers/stonewater/shdf_3_clustering.py | 97 ++++++++++++++----- 2 files changed, 89 insertions(+), 29 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 275669cc..37c2b7f9 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -482,15 +482,22 @@ class SearchEpc: if lmks_to_drop is not None: epc_data = epc_data[~epc_data["lmk-key"].isin(lmks_to_drop)] - if not epc_data.empty: - # Further processing of the EPC data + try: + epc_data['lodgement-datetime'] = pd.to_datetime( + epc_data['lodgement-datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce' + ) + except Exception as e: + logger.error("Problem formatting lodgement-datime, appling fallback: " + str(e)) epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], errors='coerce') - if exclude_old: - # Exclude EPC data older than 10 years - epc_data = epc_data[ - epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10)) - ] + if exclude_old: + # Exclude EPC data older than 10 years + epc_data = epc_data[ + epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10)) + ] + + if not epc_data.empty: + # Further processing of the EPC data epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1) epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1)) diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index c853fa94..5129dfb1 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -11,7 +11,8 @@ from fuzzywuzzy import fuzz import numpy as np import pandas as pd import time -from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet +from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \ + save_dataframe_to_s3_parquet, save_pickle_to_s3 load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") @@ -1360,7 +1361,10 @@ def compile_data_final(): ) p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]] - searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0]) + if not p_os_df.empty: + searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0]) + else: + searcher.ordnance_survey_client.property_type = "" # Now we estimate searcher.newest_epc = searcher.estimate_epc( property_type=searcher.ordnance_survey_client.property_type, @@ -1395,20 +1399,19 @@ def compile_data_final(): if searcher.older_epcs is not None: older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs - # Store in S3 # TODO - read in instead of running - # save_data_to_s3( - # data=json.dumps(epc_data_batch_2), - # s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.json", - # bucket_name="retrofit-data-dev" - # ) - # - # save_data_to_s3( - # data=json.dumps(older_epcs_batch_2), - # s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.json", - # bucket_name="retrofit-data-dev" - # ) + save_pickle_to_s3( + data=epc_data_batch_2, + s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", + bucket_name="retrofit-data-dev" + ) + + save_pickle_to_s3( + data=older_epcs_batch_2, + s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl", + bucket_name="retrofit-data-dev" + ) epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) complete_epcs = pd.concat([epc_data, epc_data_batch_2]) @@ -1439,15 +1442,15 @@ def compile_data_final(): spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)] spatial_data_to_uprn.append(spatial_df) - spatial_data_to_uprn = pd.concat(spatial_data_to_uprn) - # TODO: Let's store this in s3 - # save_data_to_s3( - # data=json.dumps(spatial_data_to_uprn.to_dict("records")), - # s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json", + # save_pickle_to_s3( + # data=spatial_data_to_uprn, + # s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", # bucket_name="retrofit-data-dev" # ) + spatial_data_to_uprn = pd.concat(spatial_data_to_uprn) + spatial_data_to_uprn = spatial_data_to_uprn.drop( columns=["partition", "filename"] ).rename(columns={"UPRN": "uprn"}) @@ -1455,10 +1458,16 @@ def compile_data_final(): property_attributes = complete_epcs.merge( spatial_data_to_uprn, - how="left", + how="inner", on="uprn" ) + property_attributes = property_attributes.merge( + asset_list[["internal_id", "owner", "match_type"]], how="left", on="internal_id" + ) + + # TODO: Add on data from the asset list such as ownership + # We drop the columns we don't care about for clustering property_attributes = property_attributes.drop( columns=[ @@ -1502,7 +1511,7 @@ def compile_data_final(): # Fields to transform: lodgement-datetime property_attributes["days_since_last_epc"] = ( - datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"]) + datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"], errors="coerce") ).dt.days property_attributes = property_attributes.drop(columns=["lodgement-datetime"]) @@ -1561,6 +1570,7 @@ def compile_data_final(): "mainheatc-env-eff": "N", "floor-level": "NODATA!", "hot-water-energy-eff": "N/A", + "glazed-type": "unknown" } # Consolidation columns to single value @@ -1608,6 +1618,19 @@ def compile_data_final(): property_attributes["estimated"] = property_attributes["estimated"].fillna(False) property_attributes["conservation_status"] = property_attributes["conservation_status"].fillna(False) + property_attributes["days_since_last_epc"] = property_attributes["days_since_last_epc"].fillna( + property_attributes["days_since_last_epc"].mean() + ) + + missings = pd.isnull(property_attributes).sum() + missings = missings[missings > 0] + + # Save this + # save_pickle_to_s3( + # data=property_attributes, + # bucket_name="retrofit-data-dev", + # s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl" + # ) # CLUSTERING!! @@ -1680,7 +1703,7 @@ def compile_data_final(): ) pipeline = Pipeline(steps=[('preprocessor', preprocessor), - ('kmeans', KMeans(n_clusters=10, random_state=0))]) + ('kmeans', KMeans(n_clusters=450, random_state=0))]) # Fit the pipeline to the data pipeline.fit(property_attributes) @@ -1718,6 +1741,36 @@ def compile_data_final(): # Sorting to verify property_attributes.sort_values(by=['cluster', 'rank'], inplace=True) + ################################################ + # Prepare outputs!!!! + ################################################ + property_attributes.reset_index(inplace=True) + property_attributes["archetype_representative"] = property_attributes["rank"] == 1 + + asset_list_with_archetypes = asset_list.merge( + property_attributes[["internal_id", "cluster", "archetype_representative", "rank"]], how="left", + on="internal_id" + ) + + asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].fillna(-999) + asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].astype(int).astype(str) + asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].replace("-999", "NO ARCHETYPE") + + asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999) + asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str) + asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE") + + asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[ + "archetype_representative"].fillna(False) + + asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False) + + stonewater_uprn_lookup = asset_list_with_archetypes[ + ["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"] + ] + + stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx") + ################################################ # Agglomertive Clustering ################################################