From 4456ab29eeac9a3407408d84b39ccc328dd8983a Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 28 Jun 2024 11:03:22 +0100
Subject: [PATCH] added the grouped clustering

---
 .../stonewater/outputs 27th June 2024.py      |  31 ++-
 etl/customers/stonewater/shdf_3_clustering.py | 206 ++++++++++++------
 2 files changed, 161 insertions(+), 76 deletions(-)

diff --git a/etl/customers/stonewater/outputs 27th June 2024.py b/etl/customers/stonewater/outputs 27th June 2024.py
index ebb6fc5b..d8bf43be 100644
--- a/etl/customers/stonewater/outputs 27th June 2024.py	
+++ b/etl/customers/stonewater/outputs 27th June 2024.py	
@@ -11,7 +11,7 @@ In this script, we do the following things:
 import pandas as pd
 from utils.s3 import read_pickle_from_s3
 
-archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes.csv")
+archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
 archetyped_asset_list = archetyped_asset_list[
     [
         "internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
@@ -34,15 +34,22 @@ archetyped_asset_list = archetyped_asset_list.merge(
     how="inner"
 )
 
+# Look at number of combinations
+# - If we look at the number of combinations of property type & built form, we have 25 unique combinations
+# - If we look at the number of combinations of property type, built form, and walls description, this jumps
+# massively to 237 unique combinations
+# - Adding roof description to the mix, we have 857 unique combinations
+# - Adding floor description, we have 1278 unique combinations
+# This doesn't even begin to consider the other variables that we have in the dataset, such as the property dimensions,
+# location, and other factors.
+# Ideally, we would perfectly separate these variables but this is not possible, given the constraint of needing ~450
+# archetypes. We will need to make some compromises here. This is where a clustering algorithm can help us.
+# We don't end up with perfect separation but we can get a good enough separation to make the archetypes useful, and can
+# base the archetypes on a number of energy performance metrics, as well as location and other factors.
+# archetyped_asset_list[
+#     ["property-type", "built-form", "walls-description", "roof-description",
+#      "floor-description"]].drop_duplicates().shape
+
 property_type_archetypes = archetyped_asset_list[
-    ["cluster", "rank", "property-type", "built-form", "walls-description"]]
-
-# Key variables for separation:
-# - property-type
-# - built-form
-# - walls-description
-# - roof-description
-
-clustering_features[["property-type", "built-form", "walls-description"]].drop_duplicates().shape
-
-clustering_features["walls-description"].value_counts()
+    ["cluster", "rank", "property-type", "built-form", "walls-description"]
+]
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
index caaf84a6..fa6551b7 100644
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -14,6 +14,11 @@ import pandas as pd
 import time
 from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
     save_dataframe_to_s3_parquet, save_pickle_to_s3
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from scipy.spatial.distance import cdist
 
 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@@ -1090,6 +1095,26 @@ def concatenate_row(row):
     return ', '.join(row.dropna().replace('', None).dropna().astype(str))
 
 
+def adjust_clusters(cluster_allocation, total_clusters):
+    current_total = sum(cluster_allocation.values())
+    adjustment = total_clusters - current_total
+    if adjustment > 0:
+        # Increase clusters, start from the largest group
+        for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]):
+            cluster_allocation[group] += 1
+            adjustment -= 1
+            if adjustment == 0:
+                break
+    elif adjustment < 0:
+        # Decrease clusters, start from the largest group
+        for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]):
+            cluster_allocation[group] -= 1
+            adjustment += 1
+            if adjustment == 0:
+                break
+    return cluster_allocation
+
+
 def compile_data_final():
     # Updated version:
 
@@ -1667,7 +1692,7 @@ def compile_data_final():
         'windows-description': WindowAttributes,
         'lighting-description': LightingAttributes
     }
-    
+
     for variable_to_clean in cleaned.keys():
 
         unique_descriptions = property_attributes[variable_to_clean].unique()
@@ -1691,28 +1716,45 @@ def compile_data_final():
             descriptions_to_append = pd.DataFrame(descriptions_to_append)
             clean_df = pd.concat([clean_df, descriptions_to_append])
 
-            starting_size = len(property_attributes)
-            property_attributes = property_attributes.merge(
-                clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
-            )
-            if starting_size != property_attributes.shape[0]:
-                raise Exception("something went wrong")
-            property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
-            # Fill missings
-            for k in clean_df.columns:
-                if k in property_attributes.columns:
-                    property_attributes[k] = property_attributes[k].fillna("missing")
+        clean_df = clean_df.rename(
+            columns={
+                "thermal_transmittance": f"{variable_to_clean}_thermal_transmittance",
+                "is_assumed": f"{variable_to_clean}_is_assumed",
+            }
+        )
+
+        if 'thermal_transmittance_unit' in clean_df.columns:
+            clean_df = clean_df.drop(columns=['thermal_transmittance_unit'])
+
+        starting_size = len(property_attributes)
+        property_attributes = property_attributes.merge(
+            clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
+        )
+        if starting_size != property_attributes.shape[0]:
+            raise Exception("something went wrong")
+        property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
+        # Fill missings
+        for k in clean_df.columns:
+            if k in property_attributes.columns:
+                property_attributes[k] = property_attributes[k].fillna("missing")
 
     # We group some variables such as thermal transmittance for walls, roof, floors
+    # ranges = {
+    #     "< 0.1": (0, 0.1),
+    #     "0.1 - 0.3": (0.1, 0.3),
+    #     "0.3 - 0.5": (0.3, 0.5),
+    #     "0.5 - 0.7": (0.5, 0.7),
+    #     "0.9 - 1": (0.9, 1),
+    #     "1 - 1.5": (1, 1.5),
+    #     "1.5 - 2": (1.5, 2),
+    #     "2+": (2, 2.5)
+    # }
+
     ranges = {
         "< 0.1": (0, 0.1),
         "0.1 - 0.3": (0.1, 0.3),
         "0.3 - 0.5": (0.3, 0.5),
-        "0.5 - 0.7": (0.5, 0.7),
-        "0.9 - 1": (0.9, 1),
-        "1 - 1.5": (1, 1.5),
-        "1.5 - 2": (1.5, 2),
-        "2+": (2, 2.5)
+        "0.5+": (0.5, 2.5),
     }
 
     # Generate the lookup table
@@ -1733,7 +1775,7 @@ def compile_data_final():
     ]
     for i, col in enumerate(thermal_transmittance_cols):
         # Perform the mapping
-        to_col = f"to_{i}"
+        to_col = f"to_{col}"
         property_attributes[col] = property_attributes[col].astype(str)
         property_attributes = property_attributes.merge(
             thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
@@ -1750,72 +1792,108 @@ def compile_data_final():
     # Perform the mapping
 
     # CLUSTERING!!
-
-    from sklearn.cluster import KMeans
-    from sklearn.preprocessing import StandardScaler, OneHotEncoder
-    from sklearn.compose import ColumnTransformer
-    from sklearn.pipeline import Pipeline
-    from scipy.spatial.distance import cdist
-    id_column = 'internal_id'
-    property_attributes.set_index(id_column, inplace=True)
+    grouping_columns = [
+        'is_cavity_wall', 'is_solid_brick', 'built-form', 'property-type'
+    ]
 
     # Define the preprocessing for numerical and categorical features
     numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
     categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
+    categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
 
     for col in categorical_features:
         property_attributes[col] = property_attributes[col].astype(str)
 
-    preprocessor = ColumnTransformer(
-        transformers=[
-            ('num', StandardScaler(), numerical_features),
-            ('cat', OneHotEncoder(), categorical_features)
+    id_column = 'internal_id'
+    n_clusters = 450
+    random_state = 0
+
+    training_data_grouped = property_attributes.groupby(grouping_columns)
+    group_sizes = {name: len(group) for name, group in training_data_grouped}
+    total_size = sum(group_sizes.values())
+    cluster_allocation = {
+        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
+    }
+
+    # Adjust cluster allocation to ensure total clusters sum to 450
+    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
+
+    # TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to
+    #       collect the results of the clustering and then perform the transformations afterwards
+
+    final_clusters = []
+    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
+
+        group_n_clusters = cluster_allocation[group_variables]
+        group_data.set_index(id_column, inplace=True)
+
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', StandardScaler(), numerical_features),
+                ('cat', OneHotEncoder(), categorical_features)
+            ]
+        )
+
+        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
+                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
+
+        # Fit the pipeline to the data
+        pipeline.fit(group_data)
+
+        # Transform the data using the fitted pipeline
+        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
+
+        # Get cluster labels
+        group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
+
+        # Get centroids (already in the same transformed space)
+        centroids = pipeline.named_steps['kmeans'].cluster_centers_
+
+        # if the data isn't an array, make it one
+        if not isinstance(processed_data, np.ndarray):
+            processed_data = processed_data.toarray()
+
+        # Calculate distances from each point to the centroid of its cluster
+        distances_to_centroids = [
+            cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
+            for i, label in enumerate(group_data['cluster'])
         ]
-    )
 
-    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
-                               ('kmeans', KMeans(n_clusters=450, random_state=0))])
+        group_data['distance_to_centroid'] = distances_to_centroids
 
-    # Fit the pipeline to the data
-    pipeline.fit(property_attributes)
+        # for cluster_id in group_data['cluster'].unique():
+        #     cluster_data = group_data[group_data['cluster'] == cluster_id]
+        #     min_distance = cluster_data['distance_to_centroid'].min()
+        #     print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
+        #     if min_distance != 0:
+        #         print(f"No point with zero distance found in cluster {cluster_id}")
 
-    # Transform the data using the fitted pipeline
-    processed_data = pipeline.named_steps['preprocessor'].transform(property_attributes)
+        # Ranking rows by distance within each cluster
+        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
 
-    # Get cluster labels
-    property_attributes['cluster'] = pipeline.named_steps['kmeans'].labels_
+        # Sorting to verify
+        group_data.sort_values(by=['cluster', 'rank'], inplace=True)
+        group_data.reset_index(inplace=True)
 
-    # Get centroids (already in the same transformed space)
-    centroids = pipeline.named_steps['kmeans'].cluster_centers_
+        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
+        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
+        final_clusters.append(to_append)
 
-    processed_data = processed_data.toarray()
+    final_clusters = pd.concat(final_clusters)
+    # remap the clusters from the current names to 1 -> n_clusters
 
-    # Calculate distances from each point to the centroid of its cluster
-    distances_to_centroids = [
-        cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
-        for i, label in enumerate(property_attributes['cluster'])
-    ]
-
-    property_attributes['distance_to_centroid'] = distances_to_centroids
-
-    for cluster_id in property_attributes['cluster'].unique():
-        cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
-        min_distance = cluster_data['distance_to_centroid'].min()
-        print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
-        if min_distance != 0:
-            print(f"No point with zero distance found in cluster {cluster_id}")
-
-    # Ranking rows by distance within each cluster
-    property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(
-        method='first')
-
-    # Sorting to verify
-    property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
+    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
+    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
+    final_clusters["cluster"] = final_clusters["cluster"].astype(str)
 
     ################################################
     # Prepare outputs!!!!
     ################################################
+
     property_attributes.reset_index(inplace=True)
+    property_attributes = property_attributes.merge(
+        final_clusters, how="left", on="internal_id"
+    )
     property_attributes["archetype_representative"] = property_attributes["rank"] == 1
 
     asset_list_with_archetypes = asset_list.merge(
@@ -1834,7 +1912,7 @@ def compile_data_final():
     asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
         "archetype_representative"].fillna(False)
 
-    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
+    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False)
 
     stonewater_uprn_lookup = asset_list_with_archetypes[
         ["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]