From 36b3b4bea55c136a34f46acb2cc13ed4a3aa2529 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 23 Jul 2024 20:07:50 +0100
Subject: [PATCH] revised clustering

---
 .../stonewater/outputs 27th June 2024.py      | 99 +++++++++++--------
 etl/customers/stonewater/shdf_3_clustering.py | 64 ++++++++++--
 2 files changed, 112 insertions(+), 51 deletions(-)

diff --git a/etl/customers/stonewater/outputs 27th June 2024.py b/etl/customers/stonewater/outputs 27th June 2024.py
index 7a78469c..cf0c6478 100644
--- a/etl/customers/stonewater/outputs 27th June 2024.py	
+++ b/etl/customers/stonewater/outputs 27th June 2024.py	
@@ -11,8 +11,9 @@ In this script, we do the following things:
 import pandas as pd
 import json
 from utils.s3 import read_pickle_from_s3
+from backend.app.utils import sap_to_epc
 
-stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
+stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V3.csv")
 archetyped_asset_list = stonewater_asset_list[
     [
         "internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
@@ -25,28 +26,15 @@ archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
 archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])
 
 # Read in and merge on clustering features
-clustering_features = read_pickle_from_s3(
-    bucket_name="retrofit-data-dev",
-    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
+clustering_features = pd.read_csv(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv",
 )
 
-# Move property-type and built-form to the first two columns
-columns_to_move = ['property-type', 'built-form']
-
-# Get the remaining columns
-remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
-
-# Create the new column order
-new_column_order = columns_to_move + remaining_columns
-
-# Reorder the DataFrame
-clustering_features = clustering_features[new_column_order]
-
 archetyped_asset_list = archetyped_asset_list.merge(
-    clustering_features,
-    on="internal_id",
-    how="inner"
-)
+    clustering_features.drop(columns=['uprn', 'Address ID', "rank", "cluster", "archetype_representative"]),
+    left_on="internal_id",
+    right_on="Osm. ID"
+).drop(columns=["Osm. ID"])
 
 archetyped_asset_list = archetyped_asset_list.rename(
     columns={
@@ -82,12 +70,47 @@ archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
 # archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
 
 # We store the location data, which will be used for the mapping. We just need the longitude and latitude
+stonewater_asset_list["uprn"] = stonewater_asset_list["uprn"].astype('Int64')
+
 mapping_data = stonewater_asset_list[
     stonewater_asset_list["archetype_representative"]
-][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
+][["internal_id", "uprn", "standardised_address", "standardised_postcode"]].merge(
+    archetyped_asset_list[["uprn", "Walls", "Roofs", "Main Fuel", "Heating", "Age", "Property Type"]],
+    how="left",
+    on="uprn"
+)
+
+# We need to merge on longitude and latitude
+spatial_data_to_uprn = read_pickle_from_s3(
+    s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+    bucket_name="retrofit-data-dev"
+)
+
+
+# Function to convert specific columns to bool dtype
+def convert_specific_columns_to_bool(df, columns):
+    """Cast each of ``columns`` that exists in ``df`` to bool, in place, and return ``df``."""
+    for name in (candidate for candidate in columns if candidate in df.columns):
+        df[name] = df[name].astype(bool)
+    return df
+
+
+spatial_data_to_uprn = [convert_specific_columns_to_bool(
+    df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
+) for df in spatial_data_to_uprn]
+
+spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+spatial_data_to_uprn = spatial_data_to_uprn.drop(
+    columns=["partition", "filename"]
+).rename(columns={"UPRN": "uprn"})
+spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str).astype("Int64")
 
 mapping_data = mapping_data.merge(
-    clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
+    spatial_data_to_uprn[
+        ["uprn", "LONGITUDE", "LATITUDE", "conservation_status", "is_listed_building", "is_heritage_building"]
+    ],
+    how="left",
+    on="uprn"
 )
 mapping_data = mapping_data.drop(columns=["internal_id"])
 
@@ -95,38 +118,28 @@ with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w")
     f.write(json.dumps(mapping_data.to_dict(orient="records")))
 
 # We also include some data for visualising the breakdown of EPCS
-proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
 # Invert the true and false
-proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
-proportion_of_real_epcs = proportion_of_real_epcs.rename(
-    columns={"estimated": "is_real_epc"}
-)
+# proportion_of_real_epcs = (~clustering_features["estimated"]).value_counts().to_frame().reset_index()
+# proportion_of_real_epcs = proportion_of_real_epcs.rename(
+#     columns={"estimated": "is_real_epc"}
+# )
+#
+# with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
+#     f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
 
-with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
-    f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
+# Produce the breakdown of EPC ratings for properties to be surveyed
+clustering_features["representative_epc"] = clustering_features["representative_sap"].apply(sap_to_epc)
 
-# Produce the breakdown of EPC ratings
 epc_rating_breakdown = (
-    clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
+    clustering_features[clustering_features["archetype_representative"]]["representative_epc"]
     .value_counts()
     .to_frame()
     .reset_index()
 )
 
 epc_rating_breakdown = epc_rating_breakdown.rename(
-    columns={"current-energy-rating": "EPC"}
+    columns=dict(zip(epc_rating_breakdown.columns, ["EPC", "count"]))  # positional: labels vary by pandas version
 )
 
 with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
     f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
-
-epc_a_properties = clustering_features[
-    (clustering_features["current-energy-rating"] == "A")
-    & (~clustering_features["estimated"])
-    ]
-
-epc_a_properties = epc_a_properties.merge(
-    stonewater_asset_list,
-    on="internal_id",
-    how="inner"
-)
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
index 7d0b6336..18cfee79 100644
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -1084,11 +1084,11 @@ def compile_data():
     spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
 
     # TODO: Let's store this in s3
-    save_data_to_s3(
-        data=json.dumps(spatial_data_to_uprn.to_dict("records")),
-        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
-        bucket_name="retrofit-data-dev"
-    )
+    # save_data_to_s3(
+    #     data=json.dumps(spatial_data_to_uprn.to_dict("records")),
+    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
+    #     bucket_name="retrofit-data-dev"
+    # )
 
     # We merge this spatial data onto final EPCS
 
@@ -2070,7 +2070,7 @@ def updated_version():
         [
             "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
             "city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
-            "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date"
+            "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
         ]
     ]
 
@@ -2192,6 +2192,7 @@ def updated_version():
             'conservation_status',
             'is_listed_building',
             'is_heritage_building',
+            "county",
             # Walls
             "walls_reduced",
             # Roof
@@ -2204,9 +2205,12 @@ def updated_version():
             "Age",
             "Total Floor Area",
             "representative_sap",
+            "days_since_lodgement",
         ]
     ]
 
+    clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)
+
     def split_property_type(row):
         parts = row.split(':')
         property_type = parts[0].strip()
@@ -2224,10 +2228,11 @@ def updated_version():
         "property_type",
         "walls_reduced",
         "roof_type",
-        "Main Fuel"
+        "Main Fuel",
+        "county",
     ]
 
-    def combine_small_groups(clustering_features, grouping_columns, threshold=1):
+    def combine_small_groups(clustering_features, grouping_columns, threshold=2):
         # Identify small groups
         group_sizes = clustering_features.groupby(grouping_columns).size()
         small_groups = group_sizes[group_sizes <= threshold].index.tolist()
@@ -2397,6 +2402,49 @@ def updated_version():
         asset_list_with_archetypes["cluster"]
     )
 
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
+
+    asset_list_with_archetypes["archetype_representative"] = (
+        asset_list_with_archetypes["archetype_representative"].fillna(False)
+    )
+
+    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.csv", index=False)
+
+    # Produce the archetyping features
+    archetyping_features_csv = assigned_clusters[
+        [
+            "internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
+            "is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
+        ]
+    ].merge(
+        asset_list[
+            ["internal_id", "uprn", "external_address_id"]
+        ],
+        how="left",
+        on="internal_id"
+    ).merge(
+        master_sheet_clustering_features,
+        how="left",
+        right_on="Address ID",
+        left_on="external_address_id"
+    ).drop(columns=["Address ID"]).rename(
+        columns={
+            "internal_id": "Osm. ID",
+            "external_address_id": "Address ID",
+        }
+    )
+
+    archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
+    archetyping_features_csv.to_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
+    )
+
+    representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
+    print(representatives["postal_region"].nunique())
+    print(representatives["county"].nunique())
+
 
 def read_asset_list():
     asset_list = pd.read_excel(