diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index b8e71ae7..8b878f26 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -1668,6 +1668,7 @@ def compile_data_final(): 'lighting-description': LightingAttributes } for variable_to_clean in cleaned.keys(): + unique_descriptions = property_attributes[variable_to_clean].unique() clean_df = pd.DataFrame(cleaned[variable_to_clean]) # Check if we have any @@ -1686,6 +1687,67 @@ def compile_data_final(): } descriptions_to_append.append(to_append) + descriptions_to_append = pd.DataFrame(descriptions_to_append) + clean_df = pd.concat([clean_df, descriptions_to_append]) + + starting_size = len(property_attributes) + property_attributes = property_attributes.merge( + clean_df, how="left", left_on=variable_to_clean, right_on="original_description" + ) + if starting_size != property_attributes.shape[0]: + raise Exception("something went wrong") + property_attributes = property_attributes.drop(columns=["original_description", "clean_description"]) + # Fill missings + for k in clean_df.columns: + if k in property_attributes.columns: + property_attributes[k] = property_attributes[k].fillna("missing") + + # We group some variables such as thermal transmittance for walls, roof, floors + ranges = { + "< 0.1": (0, 0.1), + "0.1 - 0.3": (0.1, 0.3), + "0.3 - 0.5": (0.3, 0.5), + "0.5 - 0.7": (0.5, 0.7), + "0.9 - 1": (0.9, 1), + "1 - 1.5": (1, 1.5), + "1.5 - 2": (1.5, 2), + "2+": (2, 2.5) + } + + # Generate the lookup table + thermal_transmittance_lookup_table = [] + for i in range(1, 251): + value = i / 100 + for label, (low, high) in ranges.items(): + if low < value <= high: + thermal_transmittance_lookup_table.append({"from": value, "to": label}) + break + + # Convert to DataFrame for display + thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table) + thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str) + + thermal_transmittance_cols = [ + c for c in property_attributes.columns if "thermal_transmittance" in c and "unit" not in c + ] + for i, col in enumerate(thermal_transmittance_cols): + # Perform the mapping + to_col = f"to_{i}" + property_attributes[col] = property_attributes[col].astype(str) + property_attributes = property_attributes.merge( + thermal_transmittance_lookup_table.rename(columns={"to": to_col}), + how="left", + left_on=col, + right_on="from", + suffixes=("", f"_{i}") + ) + property_attributes = property_attributes.drop(columns=["from", col]) + property_attributes[to_col] = property_attributes[to_col].fillna("unknown") + + # Drop the description columns that are the keys in cleaned + property_attributes = property_attributes.drop(columns=list(cleaned.keys())) + # Perform the mapping + # CLUSTERING!! from sklearn.cluster import KMeans