cleaning columns for stonewater clustering

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-27 14:17:39 +01:00
parent e9366c72e8
commit 4e85d1380e

View file

@ -1668,6 +1668,7 @@ def compile_data_final():
'lighting-description': LightingAttributes
}
for variable_to_clean in cleaned.keys():
unique_descriptions = property_attributes[variable_to_clean].unique()
clean_df = pd.DataFrame(cleaned[variable_to_clean])
# Check if we have any
@ -1686,6 +1687,67 @@ def compile_data_final():
}
descriptions_to_append.append(to_append)
descriptions_to_append = pd.DataFrame(descriptions_to_append)
clean_df = pd.concat([clean_df, descriptions_to_append])
starting_size = len(property_attributes)
property_attributes = property_attributes.merge(
clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
)
if starting_size != property_attributes.shape[0]:
raise Exception("something went wrong")
property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
# Fill missing values
for k in clean_df.columns:
if k in property_attributes.columns:
property_attributes[k] = property_attributes[k].fillna("missing")
# We group some variables such as thermal transmittance for walls, roof, floors.
# Each band is (exclusive lower bound, inclusive upper bound). The bands must be
# contiguous: any value that falls in a gap gets no row in the lookup table.
ranges = {
    "< 0.1": (0, 0.1),
    "0.1 - 0.3": (0.1, 0.3),
    "0.3 - 0.5": (0.3, 0.5),
    "0.5 - 0.7": (0.5, 0.7),
    # This band was previously missing, so 0.71 - 0.90 had no lookup entry.
    "0.7 - 0.9": (0.7, 0.9),
    "0.9 - 1": (0.9, 1),
    "1 - 1.5": (1, 1.5),
    "1.5 - 2": (1.5, 2),
    "2+": (2, 2.5),
}
# Generate the lookup table: one row per 0.01 step from 0.01 up to 2.50.
# NOTE: values above 2.5 (or not on a 0.01 step) have no row here.
thermal_transmittance_lookup_table = []
for i in range(1, 251):
    value = i / 100
    for label, (low, high) in ranges.items():
        if low < value <= high:
            thermal_transmittance_lookup_table.append({"from": value, "to": label})
            break
# Convert to a DataFrame with columns "from" (value) and "to" (band label);
# "from" is stored as a string, e.g. "0.8" or "1.0".
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
# Columns whose name contains "thermal_transmittance", skipping any whose name contains "unit"
thermal_transmittance_cols = [
    c
    for c in property_attributes.columns
    if "thermal_transmittance" in c and "unit" not in c
]
for i, col in enumerate(thermal_transmittance_cols):
    to_col = f"to_{i}"
    # The lookup is keyed on strings, so compare on the string form of the value
    property_attributes[col] = property_attributes[col].astype(str)
    # Name the band column after the position of the source column: to_0, to_1, ...
    banded_lookup = thermal_transmittance_lookup_table.rename(columns={"to": to_col})
    # Left join keeps every property row; the raw value and the join key are then dropped
    property_attributes = property_attributes.merge(
        banded_lookup,
        how="left",
        left_on=col,
        right_on="from",
        suffixes=("", f"_{i}"),
    ).drop(columns=["from", col])
    # Anything without an exact match in the lookup is labelled "unknown"
    property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
# Drop the description columns that are the keys in cleaned
property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
# Perform the mapping
# CLUSTERING!!
from sklearn.cluster import KMeans