working on new clustering for stonewater

2026-06-08 11:17:27 +00:00 · 2024-07-23 18:14:21 +01:00 · 2024-07-23 18:14:21 +01:00 · 173b7ec434
commit 173b7ec434
parent 603b3e1db2
1 changed files with 138 additions and 66 deletions
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -19,6 +19,7 @@ from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from scipy.spatial.distance import cdist
+from sklearn.metrics import pairwise_distances_argmin_min

 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -2200,77 +2201,148 @@ def updated_version():
            # Fuel
            "Main Fuel",
            "Age",
-            "Total Floor Area"
+            "Total Floor Area",
            "representative_sap",
        ]
    ]

-    z = master_sheet_clustering_features[
-        ["Property Type", "walls_reduced", "roof_type", "roof_insulation_category", "Main Fuel", "Age"]
-    ].drop_duplicates()
+    def split_property_type(row):
+        parts = row.split(':')
+        property_type = parts[0].strip()
+        built_form = parts[1].strip() if len(parts) > 1 else ''
+        property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
+        return pd.Series([property_type, built_form, property_extended_feature])

-    # TODO: heating - remap
-    # Boiler: A rated Regular Boiler
-    # 1944
-    # Boiler: A rated Combi
-    # 1335
-    # Electric Storage Systems: High heat retention storage heaters
-    # 543
-    # Electric Storage Systems: Fan storage heaters
-    # 284
-    # Electric (direct acting) room heaters: Panel, convector or radiant heaters
-    # 253
-    # Boiler: C rated Regular Boiler
-    # 142
-    # Community Heating Systems: Community boilers only (RdSAP)
-    # 127
-    # Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C
-    # 126
-    # Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C
-    # 70
-    # Boiler: E rated Regular Boiler
-    # 62
-    # Boiler: E rated Combi
-    # 59
-    # Electric Storage Systems: Old (large volume) storage heaters
-    # 55
-    # Electric Storage Systems: Modern (slimline) storage heaters
-    # 49
-    # Boiler: B rated Regular Boiler
-    # 46
-    # Boiler: C rated Combi
-    # 44
-    # Heat Pump: Electric Heat pumps: Ground source heat pump in other cases
-    # 39
-    # Boiler: D rated Regular Boiler
-    # 16
-    # Community Heating Systems: Community CHP and boilers (RdSAP)
-    # 14
-    # Heat Pump: Electric Heat pumps: Air source heat pump in other cases
-    # 13
-    # Boiler: F rated Combi
-    # 12
-    # Boiler: G rated Regular Boiler
-    # 10
-    # Boiler: A rated Combi, System 2: Electric Storage Systems: High heat retention storage heaters
-    # 8
-    # Electric (direct acting) room heaters: Water- or oil-filled radiators
-    # 4
-    # Boiler: A rated Combi, System 2: Electric (direct acting) room heaters: Panel, convector or radiant heaters
-    # 3
-    # Boiler: D rated Combi
-    # 3
-    # Boiler: A rated CPSU
-    # 2
-    # Heat Pump: (from database)
-    # 1
-    # System 2: Boiler: G rated Regular Boiler, Boiler: A rated Combi
-    # 1
-    # No Heating
-    # 1
-    # Solid fuel room heaters: Open fire in grate
-    # 1
-    # Boiler: F rated Regular Boiler
+    clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
+        clustering_features['Property Type'].apply(split_property_type)
+    )
+    clustering_features = clustering_features.drop(columns=["Property Type"])
+
+    # These are the variables we MUST split by
+    grouping_columns = [
+        "property_type",
+        "walls_reduced",
+        "roof_type",
+        "Main Fuel"
+    ]
+
+    def combine_small_groups(clustering_features, grouping_columns, threshold=1):
+        # Identify small groups
+        group_sizes = clustering_features.groupby(grouping_columns).size()
+        small_groups = group_sizes[group_sizes <= threshold].index.tolist()
+
+        # Remove small groups from the original clustering_features
+        small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
+        clustering_features = clustering_features[
+            ~clustering_features.set_index(grouping_columns).index.isin(small_groups)
+        ]
+
+        if small_group_data.empty:
+            return clustering_features
+
+        # Combine small groups with the nearest available group
+        # Using Euclidean distance for numerical features as a simple measure of similarity
+        numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
+        small_group_centroids = small_group_data.groupby(grouping_columns)[numerical_features].mean()
+        large_group_centroids = clustering_features.groupby(grouping_columns)[numerical_features].mean()
+
+        closest_groups, _ = pairwise_distances_argmin_min(small_group_centroids.values, large_group_centroids.values)
+        closest_group_index = large_group_centroids.index[closest_groups]
+
+        combined_data = []
+        for small_group, closest_group in zip(small_groups, closest_group_index):
+            small_group_data.loc[
+                small_group_data.set_index(grouping_columns).index == small_group, grouping_columns] = closest_group
+            combined_data.append(small_group_data[small_group_data.set_index(grouping_columns).index == closest_group])
+
+        combined_data = pd.concat(combined_data)
+
+        combined_data = pd.concat([clustering_features, combined_data])
+
+        return combined_data
+
+    clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
+
+    ########################################################################
+    # Clustering
+    ########################################################################
+    numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
+    categorical_features = clustering_features.select_dtypes(include=['object', 'category']).columns.tolist()
+    categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
+
+    for col in categorical_features:
+        clustering_features[col] = clustering_features[col].astype(str)
+
+    id_column = 'internal_id'
+    n_clusters = 450
+    random_state = 0
+
+    training_data_grouped = clustering_features.groupby(grouping_columns)
+    group_sizes = {name: len(group) for name, group in training_data_grouped}
+    total_size = sum(group_sizes.values())
+    cluster_allocation = {
+        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
+    }
+
+    # Adjust cluster allocation to ensure total clusters sum to 450
+    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
+
+    final_clusters = []
+    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
+
+        group_n_clusters = cluster_allocation[group_variables]
+        group_data.set_index(id_column, inplace=True)
+
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', StandardScaler(), numerical_features),
+                ('cat', OneHotEncoder(), categorical_features)
+            ]
+        )
+
+        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
+                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
+
+        # Fit the pipeline to the data
+        pipeline.fit(group_data)
+
+        # Transform the data using the fitted pipeline
+        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
+
+        # Get cluster labels
+        group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
+
+        # Get centroids (already in the same transformed space)
+        centroids = pipeline.named_steps['kmeans'].cluster_centers_
+
+        # if the data isn't an array, make it one
+        if not isinstance(processed_data, np.ndarray):
+            processed_data = processed_data.toarray()
+
+        # Calculate distances from each point to the centroid of its cluster
+        distances_to_centroids = [
+            cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
+            for i, label in enumerate(group_data['cluster'])
+        ]
+
+        group_data['distance_to_centroid'] = distances_to_centroids
+
+        # Ranking rows by distance within each cluster
+        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
+
+        # Sorting to verify
+        group_data.sort_values(by=['cluster', 'rank'], inplace=True)
+        group_data.reset_index(inplace=True)
+
+        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
+        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
+        final_clusters.append(to_append)
+
+    final_clusters = pd.concat(final_clusters)
+    # remap the clusters from the current names to 1 -> n_clusters
+    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
+    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
+    final_clusters["cluster"] = final_clusters["cluster"].astype(str)


 def read_asset_list():