Clean columns for Stonewater clustering

2026-07-27 23:35:01 +00:00 · 2024-06-27 14:17:39 +01:00 · 2024-06-27 14:17:39 +01:00 · 4e85d1380e
commit 4e85d1380e
parent e9366c72e8
1 changed files with 62 additions and 0 deletions
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -1668,6 +1668,7 @@ def compile_data_final():
        'lighting-description': LightingAttributes
    }
    for variable_to_clean in cleaned.keys():
+
        unique_descriptions = property_attributes[variable_to_clean].unique()
        clean_df = pd.DataFrame(cleaned[variable_to_clean])
        # Check if we have any
@ -1686,6 +1687,67 @@ def compile_data_final():
                }
                descriptions_to_append.append(to_append)

+            descriptions_to_append = pd.DataFrame(descriptions_to_append)
+            clean_df = pd.concat([clean_df, descriptions_to_append])
+
+            starting_size = len(property_attributes)
+            property_attributes = property_attributes.merge(
+                clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
+            )
+            if starting_size != property_attributes.shape[0]:
+                raise Exception("something went wrong")
+            property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
+            # Fill missing values in the columns merged in from clean_df
+            for k in clean_df.columns:
+                if k in property_attributes.columns:
+                    property_attributes[k] = property_attributes[k].fillna("missing")
+
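+    # Bucket thermal transmittance values (walls, roof, floors) into labelled ranges. NOTE(review): no range covers 0.7 - 0.9, so those values become "unknown" below - confirm this is intended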
+    ranges = {
+        "< 0.1": (0, 0.1),
+        "0.1 - 0.3": (0.1, 0.3),
+        "0.3 - 0.5": (0.3, 0.5),
+        "0.5 - 0.7": (0.5, 0.7),
+        "0.9 - 1": (0.9, 1),
+        "1 - 1.5": (1, 1.5),
+        "1.5 - 2": (1.5, 2),
+        "2+": (2, 2.5)
+    }
+
+    # Generate the lookup table
+    thermal_transmittance_lookup_table = []
+    for i in range(1, 251):
+        value = i / 100
+        for label, (low, high) in ranges.items():
+            if low < value <= high:
+                thermal_transmittance_lookup_table.append({"from": value, "to": label})
+                break
+
+    # Convert to a DataFrame and cast "from" to str so it can be merged against the stringified columns below
+    thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
+    thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
+
+    thermal_transmittance_cols = [
+        c for c in property_attributes.columns if "thermal_transmittance" in c and "unit" not in c
+    ]
+    for i, col in enumerate(thermal_transmittance_cols):
+        # Perform the mapping
+        to_col = f"to_{i}"
+        property_attributes[col] = property_attributes[col].astype(str)
+        property_attributes = property_attributes.merge(
+            thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
+            how="left",
+            left_on=col,
+            right_on="from",
+            suffixes=("", f"_{i}")
+        )
+        property_attributes = property_attributes.drop(columns=["from", col])
+        property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
+
+    # Drop the description columns that are the keys in cleaned
+    property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
+    # Perform the mapping
+
    # CLUSTERING!!

    from sklearn.cluster import KMeans