creating output

2026-07-27 23:35:01 +00:00 · 2024-07-23 18:52:09 +01:00 · 2024-07-23 18:52:09 +01:00 · c9af04de73
commit c9af04de73
parent 173b7ec434
1 changed files with 73 additions and 20 deletions
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -2064,7 +2064,8 @@ def updated_version():
    )

    clustering_features = asset_list[
-        asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"]
+        asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
+        ~pd.isnull(asset_list["uprn"])
        ][
        [
            "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
@ -2233,32 +2234,44 @@ def updated_version():

        # Remove small groups from the original clustering_features
        small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
-        clustering_features = clustering_features[
+        clustering_features_ok = clustering_features[
            ~clustering_features.set_index(grouping_columns).index.isin(small_groups)
        ]

        if small_group_data.empty:
            return clustering_features

+        # One-Hot Encode categorical variables
+        categorical_features = (
+            clustering_features_ok.drop(columns=["internal_id"])
+            .select_dtypes(include=['object', 'category']).columns.tolist()
+        )
+        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+        ohe.fit(clustering_features_ok[categorical_features])
+
        # Combine small groups with the nearest available group
-        # Using Euclidean distance for numerical features as a simple measure of similarity
-        numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
-        small_group_centroids = small_group_data.groupby(grouping_columns)[numerical_features].mean()
-        large_group_centroids = clustering_features.groupby(grouping_columns)[numerical_features].mean()
+        small_group_ohe = ohe.transform(small_group_data[categorical_features])
+        large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])

-        closest_groups, _ = pairwise_distances_argmin_min(small_group_centroids.values, large_group_centroids.values)
-        closest_group_index = large_group_centroids.index[closest_groups]
+        numerical_features = clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns.tolist()
+        small_group_numerical = small_group_data[numerical_features].values
+        large_group_numerical = clustering_features_ok[numerical_features].values

-        combined_data = []
+        # Concatenate one-hot encoded categorical and numerical features
+        small_group_features = np.hstack([small_group_ohe, small_group_numerical])
+        large_group_features = np.hstack([large_group_ohe, large_group_numerical])
+
+        # Calculate distances and find nearest groups
+        closest_groups, _ = pairwise_distances_argmin_min(small_group_features, large_group_features)
+        closest_group_index = clustering_features_ok.iloc[closest_groups].index
+
+        # Update small groups to the nearest large group
        for small_group, closest_group in zip(small_groups, closest_group_index):
-            small_group_data.loc[
-                small_group_data.set_index(grouping_columns).index == small_group, grouping_columns] = closest_group
-            combined_data.append(small_group_data[small_group_data.set_index(grouping_columns).index == closest_group])
-
-        combined_data = pd.concat(combined_data)
-
-        combined_data = pd.concat([clustering_features, combined_data])
+            small_group_mask = small_group_data.set_index(grouping_columns).index == small_group
+            small_group_data.loc[small_group_mask, grouping_columns] = clustering_features_ok.loc[
+                closest_group, grouping_columns].values

+        combined_data = pd.concat([clustering_features_ok, small_group_data])
        return combined_data

    clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
@ -2266,18 +2279,18 @@ def updated_version():
    ########################################################################
    # Clustering
    ########################################################################
-    numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
-    categorical_features = clustering_features.select_dtypes(include=['object', 'category']).columns.tolist()
+    numerical_features = clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
+    categorical_features = clustering_features_combined.select_dtypes(include=['object', 'category']).columns.tolist()
    categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]

    for col in categorical_features:
-        clustering_features[col] = clustering_features[col].astype(str)
+        clustering_features_combined[col] = clustering_features_combined[col].astype(str)

    id_column = 'internal_id'
    n_clusters = 450
    random_state = 0

-    training_data_grouped = clustering_features.groupby(grouping_columns)
+    training_data_grouped = clustering_features_combined.groupby(grouping_columns)
    group_sizes = {name: len(group) for name, group in training_data_grouped}
    total_size = sum(group_sizes.values())
    cluster_allocation = {
@ -2344,6 +2357,46 @@ def updated_version():
    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
    final_clusters["cluster"] = final_clusters["cluster"].astype(str)

+    assigned_clusters = clustering_features_combined.merge(
+        final_clusters, how="left", on="internal_id"
+    )
+
+    assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1
+
+    asset_list_with_archetypes = asset_list.merge(
+        assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
+        on="internal_id"
+    )
+
+    # We populate the reasons for no archetype
+    # 1) If it's not a priority postcode
+    asset_list_with_archetypes["cluster"] = np.where(
+        ~asset_list_with_archetypes["is_priority_postcode"],
+        "NOT PRIORITY POSTCODE",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # 2) If it's EPC C or above
+    asset_list_with_archetypes["cluster"] = np.where(
+        asset_list_with_archetypes["is_epc_c_or_above"],
+        "EPC C OR ABOVE",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # If it's in Wave 2.1
+    asset_list_with_archetypes["cluster"] = np.where(
+        asset_list_with_archetypes["In Osmosis Wave 2.1"],
+        "IN WAVE 2.1",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # Has missing uprn
+    asset_list_with_archetypes["cluster"] = np.where(
+        pd.isnull(asset_list_with_archetypes["uprn"]),
+        "MISSING UPRN",
+        asset_list_with_archetypes["cluster"]
+    )
+

 def read_asset_list():
    asset_list = pd.read_excel(