diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index 380bc36b..7d0b6336 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -2064,7 +2064,8 @@ def updated_version(): ) clustering_features = asset_list[ - asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] + asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] & + ~pd.isnull(asset_list["uprn"]) ][ [ "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2", @@ -2233,32 +2234,44 @@ def updated_version(): # Remove small groups from the original clustering_features small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)] - clustering_features = clustering_features[ + clustering_features_ok = clustering_features[ ~clustering_features.set_index(grouping_columns).index.isin(small_groups) ] if small_group_data.empty: return clustering_features + # One-Hot Encode categorical variables + categorical_features = ( + clustering_features_ok.drop(columns=["internal_id"]) + .select_dtypes(include=['object', 'category']).columns.tolist() + ) + ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore') + ohe.fit(clustering_features_ok[categorical_features]) + # Combine small groups with the nearest available group - # Using Euclidean distance for numerical features as a simple measure of similarity - numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist() - small_group_centroids = small_group_data.groupby(grouping_columns)[numerical_features].mean() - large_group_centroids = clustering_features.groupby(grouping_columns)[numerical_features].mean() + small_group_ohe = ohe.transform(small_group_data[categorical_features]) + large_group_ohe = ohe.transform(clustering_features_ok[categorical_features]) - closest_groups, _ = pairwise_distances_argmin_min(small_group_centroids.values, large_group_centroids.values) - closest_group_index = large_group_centroids.index[closest_groups] + numerical_features = clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns.tolist() + small_group_numerical = small_group_data[numerical_features].values + large_group_numerical = clustering_features_ok[numerical_features].values - combined_data = [] + # Concatenate one-hot encoded categorical and numerical features + small_group_features = np.hstack([small_group_ohe, small_group_numerical]) + large_group_features = np.hstack([large_group_ohe, large_group_numerical]) + + # Calculate distances and find nearest groups + closest_groups, _ = pairwise_distances_argmin_min(small_group_features, large_group_features) + closest_group_index = clustering_features_ok.iloc[closest_groups].index + + # Update small groups to the nearest large group for small_group, closest_group in zip(small_groups, closest_group_index): - small_group_data.loc[ - small_group_data.set_index(grouping_columns).index == small_group, grouping_columns] = closest_group - combined_data.append(small_group_data[small_group_data.set_index(grouping_columns).index == closest_group]) - - combined_data = pd.concat(combined_data) - - combined_data = pd.concat([clustering_features, combined_data]) + small_group_mask = small_group_data.set_index(grouping_columns).index == small_group + small_group_data.loc[small_group_mask, grouping_columns] = clustering_features_ok.loc[ + closest_group, grouping_columns].values + combined_data = pd.concat([clustering_features_ok, small_group_data]) return combined_data clustering_features_combined = combine_small_groups(clustering_features, grouping_columns) @@ -2266,18 +2279,18 @@ def updated_version(): ######################################################################## # Clustering ######################################################################## - numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist() - categorical_features = clustering_features.select_dtypes(include=['object', 'category']).columns.tolist() + numerical_features = clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns.tolist() + categorical_features = clustering_features_combined.select_dtypes(include=['object', 'category']).columns.tolist() categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]] for col in categorical_features: - clustering_features[col] = clustering_features[col].astype(str) + clustering_features_combined[col] = clustering_features_combined[col].astype(str) id_column = 'internal_id' n_clusters = 450 random_state = 0 - training_data_grouped = clustering_features.groupby(grouping_columns) + training_data_grouped = clustering_features_combined.groupby(grouping_columns) group_sizes = {name: len(group) for name, group in training_data_grouped} total_size = sum(group_sizes.values()) cluster_allocation = { @@ -2344,6 +2357,46 @@ def updated_version(): final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping) final_clusters["cluster"] = final_clusters["cluster"].astype(str) + assigned_clusters = clustering_features_combined.merge( + final_clusters, how="left", on="internal_id" + ) + + assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1 + + asset_list_with_archetypes = asset_list.merge( + assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left", + on="internal_id" + ) + + # We populate the reasons for no archetype + # 1) If it's not a priority postcode + asset_list_with_archetypes["cluster"] = np.where( + ~asset_list_with_archetypes["is_priority_postcode"], + "NOT PRIORITY POSTCODE", + asset_list_with_archetypes["cluster"] + ) + + # 2) If it's EPC C or above + asset_list_with_archetypes["cluster"] = np.where( + asset_list_with_archetypes["is_epc_c_or_above"], + "EPC C OR ABOVE", + asset_list_with_archetypes["cluster"] + ) + + # If it's in Wave 2.1 + asset_list_with_archetypes["cluster"] = np.where( + asset_list_with_archetypes["In Osmosis Wave 2.1"], + "IN WAVE 2.1", + asset_list_with_archetypes["cluster"] + ) + + # Has missing uprn + asset_list_with_archetypes["cluster"] = np.where( + pd.isnull(asset_list_with_archetypes["uprn"]), + "MISSING UPRN", + asset_list_with_archetypes["cluster"] + ) + def read_asset_list(): asset_list = pd.read_excel(