diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index e4818e2c..380bc36b 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -19,6 +19,7 @@ from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from scipy.spatial.distance import cdist +from sklearn.metrics import pairwise_distances_argmin_min load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") @@ -2200,77 +2201,148 @@ def updated_version(): # Fuel "Main Fuel", "Age", - "Total Floor Area" + "Total Floor Area", "representative_sap", ] ] - z = master_sheet_clustering_features[ - ["Property Type", "walls_reduced", "roof_type", "roof_insulation_category", "Main Fuel", "Age"] - ].drop_duplicates() + def split_property_type(row): + parts = row.split(':') + property_type = parts[0].strip() + built_form = parts[1].strip() if len(parts) > 1 else '' + property_extended_feature = parts[2].strip() if len(parts) > 2 else '' + return pd.Series([property_type, built_form, property_extended_feature]) - # TODO: heating - remap - # Boiler: A rated Regular Boiler - # 1944 - # Boiler: A rated Combi - # 1335 - # Electric Storage Systems: High heat retention storage heaters - # 543 - # Electric Storage Systems: Fan storage heaters - # 284 - # Electric (direct acting) room heaters: Panel, convector or radiant heaters - # 253 - # Boiler: C rated Regular Boiler - # 142 - # Community Heating Systems: Community boilers only (RdSAP) - # 127 - # Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C - # 126 - # Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C - # 70 - # Boiler: E rated Regular Boiler - # 62 - # Boiler: E rated Combi - # 59 - # Electric Storage Systems: Old (large volume) storage heaters - # 55 - # Electric Storage Systems: Modern (slimline) storage heaters - # 49 - # Boiler: B rated Regular Boiler - # 46 - # Boiler: C rated Combi - # 44 - # Heat Pump: Electric Heat pumps: Ground source heat pump in other cases - # 39 - # Boiler: D rated Regular Boiler - # 16 - # Community Heating Systems: Community CHP and boilers (RdSAP) - # 14 - # Heat Pump: Electric Heat pumps: Air source heat pump in other cases - # 13 - # Boiler: F rated Combi - # 12 - # Boiler: G rated Regular Boiler - # 10 - # Boiler: A rated Combi, System 2: Electric Storage Systems: High heat retention storage heaters - # 8 - # Electric (direct acting) room heaters: Water- or oil-filled radiators - # 4 - # Boiler: A rated Combi, System 2: Electric (direct acting) room heaters: Panel, convector or radiant heaters - # 3 - # Boiler: D rated Combi - # 3 - # Boiler: A rated CPSU - # 2 - # Heat Pump: (from database) - # 1 - # System 2: Boiler: G rated Regular Boiler, Boiler: A rated Combi - # 1 - # No Heating - # 1 - # Solid fuel room heaters: Open fire in grate - # 1 - # Boiler: F rated Regular Boiler + clustering_features[['property_type', 'built_form', 'property_extended_feature']] = ( + clustering_features['Property Type'].apply(split_property_type) + ) + clustering_features = clustering_features.drop(columns=["Property Type"]) + + # These are the variables we MUST split by + grouping_columns = [ + "property_type", + "walls_reduced", + "roof_type", + "Main Fuel" + ] + + def combine_small_groups(clustering_features, grouping_columns, threshold=1): + # Identify small groups + group_sizes = clustering_features.groupby(grouping_columns).size() + small_groups = group_sizes[group_sizes <= threshold].index.tolist() + + # Remove small groups from the original clustering_features + small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)] + clustering_features = clustering_features[ + ~clustering_features.set_index(grouping_columns).index.isin(small_groups) + ] + + if small_group_data.empty: + return clustering_features + + # Combine small groups with the nearest available group + # Using Euclidean distance for numerical features as a simple measure of similarity + numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist() + small_group_centroids = small_group_data.groupby(grouping_columns)[numerical_features].mean() + large_group_centroids = clustering_features.groupby(grouping_columns)[numerical_features].mean() + + closest_groups, _ = pairwise_distances_argmin_min(small_group_centroids.values, large_group_centroids.values) + closest_group_index = large_group_centroids.index[closest_groups] + + combined_data = [] + for small_group, closest_group in zip(small_groups, closest_group_index): + small_group_data.loc[ + small_group_data.set_index(grouping_columns).index == small_group, grouping_columns] = closest_group + combined_data.append(small_group_data[small_group_data.set_index(grouping_columns).index == closest_group]) + + combined_data = pd.concat(combined_data) + + combined_data = pd.concat([clustering_features, combined_data]) + + return combined_data + + clustering_features_combined = combine_small_groups(clustering_features, grouping_columns) + + ######################################################################## + # Clustering + ######################################################################## + numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist() + categorical_features = clustering_features.select_dtypes(include=['object', 'category']).columns.tolist() + categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]] + + for col in categorical_features: + clustering_features[col] = clustering_features[col].astype(str) + + id_column = 'internal_id' + n_clusters = 450 + random_state = 0 + + training_data_grouped = clustering_features.groupby(grouping_columns) + group_sizes = {name: len(group) for name, group in training_data_grouped} + total_size = sum(group_sizes.values()) + cluster_allocation = { + name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items() + } + + # Adjust cluster allocation to ensure total clusters sum to 450 + cluster_allocation = adjust_clusters(cluster_allocation, n_clusters) + + final_clusters = [] + for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)): + + group_n_clusters = cluster_allocation[group_variables] + group_data.set_index(id_column, inplace=True) + + preprocessor = ColumnTransformer( + transformers=[ + ('num', StandardScaler(), numerical_features), + ('cat', OneHotEncoder(), categorical_features) + ] + ) + + pipeline = Pipeline(steps=[('preprocessor', preprocessor), + ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))]) + + # Fit the pipeline to the data + pipeline.fit(group_data) + + # Transform the data using the fitted pipeline + processed_data = pipeline.named_steps['preprocessor'].transform(group_data) + + # Get cluster labels + group_data['cluster'] = pipeline.named_steps['kmeans'].labels_ + + # Get centroids (already in the same transformed space) + centroids = pipeline.named_steps['kmeans'].cluster_centers_ + + # if the data isn't an array, make it one + if not isinstance(processed_data, np.ndarray): + processed_data = processed_data.toarray() + + # Calculate distances from each point to the centroid of its cluster + distances_to_centroids = [ + cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0] + for i, label in enumerate(group_data['cluster']) + ] + + group_data['distance_to_centroid'] = distances_to_centroids + + # Ranking rows by distance within each cluster + group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first') + + # Sorting to verify + group_data.sort_values(by=['cluster', 'rank'], inplace=True) + group_data.reset_index(inplace=True) + + to_append = group_data[["internal_id", "cluster", "rank"]].copy() + to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables) + final_clusters.append(to_append) + + final_clusters = pd.concat(final_clusters) + # remap the clusters from the current names to 1 -> n_clusters + cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())} + final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping) + final_clusters["cluster"] = final_clusters["cluster"].astype(str) def read_asset_list():