From 4456ab29eeac9a3407408d84b39ccc328dd8983a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 28 Jun 2024 11:03:22 +0100 Subject: [PATCH] added the grouped clustering --- .../stonewater/outputs 27th June 2024.py | 31 ++- etl/customers/stonewater/shdf_3_clustering.py | 206 ++++++++++++------ 2 files changed, 161 insertions(+), 76 deletions(-) diff --git a/etl/customers/stonewater/outputs 27th June 2024.py b/etl/customers/stonewater/outputs 27th June 2024.py index ebb6fc5b..d8bf43be 100644 --- a/etl/customers/stonewater/outputs 27th June 2024.py +++ b/etl/customers/stonewater/outputs 27th June 2024.py @@ -11,7 +11,7 @@ In this script, we do the following things: import pandas as pd from utils.s3 import read_pickle_from_s3 -archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes.csv") +archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv") archetyped_asset_list = archetyped_asset_list[ [ "internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank" @@ -34,15 +34,22 @@ archetyped_asset_list = archetyped_asset_list.merge( how="inner" ) +# Look at number of combinations +# - If we look at the number of combinations of property type & built form, we have 25 unique combinations +# - If we look at the number of combinations of property type, built form, and walls description, this jumps +# massively to 237 unique combinations +# - Adding roof description to the mix, we have 857 unique combinations +# - Adding floor description, we have 1278 unique combinations +# This doesn't even begin to consider the other variables that we have in the dataset, such as the property dimensions, +# location, and other factors. +# Ideally, we would perfectly separate these variables but this is not possible, given the constraint of needing ~450 +# archetypes. We will need to make some compromises here. This is where a clustering algorithm can help us. +# We don't end up with perfect separation but we can get a good enough separation to make the archetypes useful, and can +# base the archetypes on a number of energy performance metrics, as well as location and other factors. +# archetyped_asset_list[ +# ["property-type", "built-form", "walls-description", "roof-description", +# "floor-description"]].drop_duplicates().shape + property_type_archetypes = archetyped_asset_list[ - ["cluster", "rank", "property-type", "built-form", "walls-description"]] - -# Key variables for separation: -# - property-type -# - built-form -# - walls-description -# - roof-description - -clustering_features[["property-type", "built-form", "walls-description"]].drop_duplicates().shape - -clustering_features["walls-description"].value_counts() + ["cluster", "rank", "property-type", "built-form", "walls-description"] +] diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index caaf84a6..fa6551b7 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -14,6 +14,11 @@ import pandas as pd import time from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \ save_dataframe_to_s3_parquet, save_pickle_to_s3 +from sklearn.cluster import KMeans +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline +from scipy.spatial.distance import cdist load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") @@ -1090,6 +1095,26 @@ def concatenate_row(row): return ', '.join(row.dropna().replace('', None).dropna().astype(str)) +def adjust_clusters(cluster_allocation, total_clusters): + current_total = sum(cluster_allocation.values()) + adjustment = total_clusters - current_total + if adjustment > 0: + # Increase clusters, start from the largest group + for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]): + cluster_allocation[group] += 1 + adjustment -= 1 + if adjustment == 0: + break + elif adjustment < 0: + # Decrease clusters, start from the largest group + for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]): + cluster_allocation[group] -= 1 + adjustment += 1 + if adjustment == 0: + break + return cluster_allocation + + def compile_data_final(): # Updated version: @@ -1667,7 +1692,7 @@ def compile_data_final(): 'windows-description': WindowAttributes, 'lighting-description': LightingAttributes } - + for variable_to_clean in cleaned.keys(): unique_descriptions = property_attributes[variable_to_clean].unique() @@ -1691,28 +1716,45 @@ def compile_data_final(): descriptions_to_append = pd.DataFrame(descriptions_to_append) clean_df = pd.concat([clean_df, descriptions_to_append]) - starting_size = len(property_attributes) - property_attributes = property_attributes.merge( - clean_df, how="left", left_on=variable_to_clean, right_on="original_description" - ) - if starting_size != property_attributes.shape[0]: - raise Exception("something went wrong") - property_attributes = property_attributes.drop(columns=["original_description", "clean_description"]) - # Fill missings - for k in clean_df.columns: - if k in property_attributes.columns: - property_attributes[k] = property_attributes[k].fillna("missing") + clean_df = clean_df.rename( + columns={ + "thermal_transmittance": f"{variable_to_clean}_thermal_transmittance", + "is_assumed": f"{variable_to_clean}_is_assumed", + } + ) + + if 'thermal_transmittance_unit' in clean_df.columns: + clean_df = clean_df.drop(columns=['thermal_transmittance_unit']) + + starting_size = len(property_attributes) + property_attributes = property_attributes.merge( + clean_df, how="left", left_on=variable_to_clean, right_on="original_description" + ) + if starting_size != property_attributes.shape[0]: + raise Exception("something went wrong") + property_attributes = property_attributes.drop(columns=["original_description", "clean_description"]) + # Fill missings + for k in clean_df.columns: + if k in property_attributes.columns: + property_attributes[k] = property_attributes[k].fillna("missing") # We group some variables such as thermal transmittance for walls, roof, floors + # ranges = { + # "< 0.1": (0, 0.1), + # "0.1 - 0.3": (0.1, 0.3), + # "0.3 - 0.5": (0.3, 0.5), + # "0.5 - 0.7": (0.5, 0.7), + # "0.9 - 1": (0.9, 1), + # "1 - 1.5": (1, 1.5), + # "1.5 - 2": (1.5, 2), + # "2+": (2, 2.5) + # } + ranges = { "< 0.1": (0, 0.1), "0.1 - 0.3": (0.1, 0.3), "0.3 - 0.5": (0.3, 0.5), - "0.5 - 0.7": (0.5, 0.7), - "0.9 - 1": (0.9, 1), - "1 - 1.5": (1, 1.5), - "1.5 - 2": (1.5, 2), - "2+": (2, 2.5) + "0.5+": (0.5, 2.5), } # Generate the lookup table @@ -1733,7 +1775,7 @@ def compile_data_final(): ] for i, col in enumerate(thermal_transmittance_cols): # Perform the mapping - to_col = f"to_{i}" + to_col = f"to_{col}" property_attributes[col] = property_attributes[col].astype(str) property_attributes = property_attributes.merge( thermal_transmittance_lookup_table.rename(columns={"to": to_col}), @@ -1750,72 +1792,108 @@ def compile_data_final(): # Perform the mapping # CLUSTERING!! - - from sklearn.cluster import KMeans - from sklearn.preprocessing import StandardScaler, OneHotEncoder - from sklearn.compose import ColumnTransformer - from sklearn.pipeline import Pipeline - from scipy.spatial.distance import cdist - id_column = 'internal_id' - property_attributes.set_index(id_column, inplace=True) + grouping_columns = [ + 'is_cavity_wall', 'is_solid_brick', 'built-form', 'property-type' + ] # Define the preprocessing for numerical and categorical features numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist() categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist() + categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]] for col in categorical_features: property_attributes[col] = property_attributes[col].astype(str) - preprocessor = ColumnTransformer( - transformers=[ - ('num', StandardScaler(), numerical_features), - ('cat', OneHotEncoder(), categorical_features) + id_column = 'internal_id' + n_clusters = 450 + random_state = 0 + + training_data_grouped = property_attributes.groupby(grouping_columns) + group_sizes = {name: len(group) for name, group in training_data_grouped} + total_size = sum(group_sizes.values()) + cluster_allocation = { + name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items() + } + + # Adjust cluster allocation to ensure total clusters sum to 450 + cluster_allocation = adjust_clusters(cluster_allocation, n_clusters) + + # TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to + # collect the results of the clustering and then perform the transformations afterwards + + final_clusters = [] + for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)): + + group_n_clusters = cluster_allocation[group_variables] + group_data.set_index(id_column, inplace=True) + + preprocessor = ColumnTransformer( + transformers=[ + ('num', StandardScaler(), numerical_features), + ('cat', OneHotEncoder(), categorical_features) + ] + ) + + pipeline = Pipeline(steps=[('preprocessor', preprocessor), + ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))]) + + # Fit the pipeline to the data + pipeline.fit(group_data) + + # Transform the data using the fitted pipeline + processed_data = pipeline.named_steps['preprocessor'].transform(group_data) + + # Get cluster labels + group_data['cluster'] = pipeline.named_steps['kmeans'].labels_ + + # Get centroids (already in the same transformed space) + centroids = pipeline.named_steps['kmeans'].cluster_centers_ + + # if the data isn't an array, make it one + if not isinstance(processed_data, np.ndarray): + processed_data = processed_data.toarray() + + # Calculate distances from each point to the centroid of its cluster + distances_to_centroids = [ + cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0] + for i, label in enumerate(group_data['cluster']) ] - ) - pipeline = Pipeline(steps=[('preprocessor', preprocessor), - ('kmeans', KMeans(n_clusters=450, random_state=0))]) + group_data['distance_to_centroid'] = distances_to_centroids - # Fit the pipeline to the data - pipeline.fit(property_attributes) + # for cluster_id in group_data['cluster'].unique(): + # cluster_data = group_data[group_data['cluster'] == cluster_id] + # min_distance = cluster_data['distance_to_centroid'].min() + # print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}") + # if min_distance != 0: + # print(f"No point with zero distance found in cluster {cluster_id}") - # Transform the data using the fitted pipeline - processed_data = pipeline.named_steps['preprocessor'].transform(property_attributes) + # Ranking rows by distance within each cluster + group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first') - # Get cluster labels - property_attributes['cluster'] = pipeline.named_steps['kmeans'].labels_ + # Sorting to verify + group_data.sort_values(by=['cluster', 'rank'], inplace=True) + group_data.reset_index(inplace=True) - # Get centroids (already in the same transformed space) - centroids = pipeline.named_steps['kmeans'].cluster_centers_ + to_append = group_data[["internal_id", "cluster", "rank"]].copy() + to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables) + final_clusters.append(to_append) - processed_data = processed_data.toarray() + final_clusters = pd.concat(final_clusters) + # remap the clusters from the current names to 1 -> n_clusters - # Calculate distances from each point to the centroid of its cluster - distances_to_centroids = [ - cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0] - for i, label in enumerate(property_attributes['cluster']) - ] - - property_attributes['distance_to_centroid'] = distances_to_centroids - - for cluster_id in property_attributes['cluster'].unique(): - cluster_data = property_attributes[property_attributes['cluster'] == cluster_id] - min_distance = cluster_data['distance_to_centroid'].min() - print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}") - if min_distance != 0: - print(f"No point with zero distance found in cluster {cluster_id}") - - # Ranking rows by distance within each cluster - property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank( - method='first') - - # Sorting to verify - property_attributes.sort_values(by=['cluster', 'rank'], inplace=True) + cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())} + final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping) + final_clusters["cluster"] = final_clusters["cluster"].astype(str) ################################################ # Prepare outputs!!!! ################################################ + property_attributes.reset_index(inplace=True) + property_attributes = property_attributes.merge( + final_clusters, how="left", on="internal_id" + ) property_attributes["archetype_representative"] = property_attributes["rank"] == 1 asset_list_with_archetypes = asset_list.merge( @@ -1834,7 +1912,7 @@ def compile_data_final(): asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[ "archetype_representative"].fillna(False) - asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False) + asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False) stonewater_uprn_lookup = asset_list_with_archetypes[ ["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]