mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added the grouped clustering
This commit is contained in:
parent
6f32aa672b
commit
4456ab29ee
2 changed files with 161 additions and 76 deletions
|
|
@ -11,7 +11,7 @@ In this script, we do the following things:
|
|||
import pandas as pd
|
||||
from utils.s3 import read_pickle_from_s3
|
||||
|
||||
archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes.csv")
|
||||
archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
|
||||
archetyped_asset_list = archetyped_asset_list[
|
||||
[
|
||||
"internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
|
||||
|
|
@ -34,15 +34,22 @@ archetyped_asset_list = archetyped_asset_list.merge(
|
|||
how="inner"
|
||||
)
|
||||
|
||||
# Look at number of combinations
|
||||
# - If we look at the number of combinations of property type & built form, we have 25 unique combinations
|
||||
# - If we look at the number of combinations of property type, built form, and walls description, this jumps
|
||||
# massively to 237 unique combinations
|
||||
# - Adding roof description to the mix, we have 857 unique combinations
|
||||
# - Adding floor description, we have 1278 unique combinations
|
||||
# This doesn't even begin to consider the other variables that we have in the dataset, such as the property dimensions,
|
||||
# location, and other factors.
|
||||
# Ideally, we would perfectly separate these variables but this is not possible, given the constraint of needing ~450
|
||||
# archetypes. We will need to make some compromises here. This is where a clustering algorithm can help us.
|
||||
# We don't end up with perfect separation but we can get a good enough separation to make the archetypes useful, and can
|
||||
# base the archetypes on a number of energy performance metrics, as well as location and other factors.
|
||||
# archetyped_asset_list[
|
||||
# ["property-type", "built-form", "walls-description", "roof-description",
|
||||
# "floor-description"]].drop_duplicates().shape
|
||||
|
||||
property_type_archetypes = archetyped_asset_list[
|
||||
["cluster", "rank", "property-type", "built-form", "walls-description"]]
|
||||
|
||||
# Key variables for separation:
|
||||
# - property-type
|
||||
# - built-form
|
||||
# - walls-description
|
||||
# - roof-description
|
||||
|
||||
clustering_features[["property-type", "built-form", "walls-description"]].drop_duplicates().shape
|
||||
|
||||
clustering_features["walls-description"].value_counts()
|
||||
["cluster", "rank", "property-type", "built-form", "walls-description"]
|
||||
]
|
||||
|
|
|
|||
|
|
@ -14,6 +14,11 @@ import pandas as pd
|
|||
import time
|
||||
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
|
||||
save_dataframe_to_s3_parquet, save_pickle_to_s3
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from scipy.spatial.distance import cdist
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
|
@ -1090,6 +1095,26 @@ def concatenate_row(row):
|
|||
return ', '.join(row.dropna().replace('', None).dropna().astype(str))
|
||||
|
||||
|
||||
def adjust_clusters(cluster_allocation, total_clusters):
|
||||
current_total = sum(cluster_allocation.values())
|
||||
adjustment = total_clusters - current_total
|
||||
if adjustment > 0:
|
||||
# Increase clusters, start from the largest group
|
||||
for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]):
|
||||
cluster_allocation[group] += 1
|
||||
adjustment -= 1
|
||||
if adjustment == 0:
|
||||
break
|
||||
elif adjustment < 0:
|
||||
# Decrease clusters, start from the largest group
|
||||
for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]):
|
||||
cluster_allocation[group] -= 1
|
||||
adjustment += 1
|
||||
if adjustment == 0:
|
||||
break
|
||||
return cluster_allocation
|
||||
|
||||
|
||||
def compile_data_final():
|
||||
# Updated version:
|
||||
|
||||
|
|
@ -1667,7 +1692,7 @@ def compile_data_final():
|
|||
'windows-description': WindowAttributes,
|
||||
'lighting-description': LightingAttributes
|
||||
}
|
||||
|
||||
|
||||
for variable_to_clean in cleaned.keys():
|
||||
|
||||
unique_descriptions = property_attributes[variable_to_clean].unique()
|
||||
|
|
@ -1691,28 +1716,45 @@ def compile_data_final():
|
|||
descriptions_to_append = pd.DataFrame(descriptions_to_append)
|
||||
clean_df = pd.concat([clean_df, descriptions_to_append])
|
||||
|
||||
starting_size = len(property_attributes)
|
||||
property_attributes = property_attributes.merge(
|
||||
clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
|
||||
)
|
||||
if starting_size != property_attributes.shape[0]:
|
||||
raise Exception("something went wrong")
|
||||
property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
|
||||
# Fill missings
|
||||
for k in clean_df.columns:
|
||||
if k in property_attributes.columns:
|
||||
property_attributes[k] = property_attributes[k].fillna("missing")
|
||||
clean_df = clean_df.rename(
|
||||
columns={
|
||||
"thermal_transmittance": f"{variable_to_clean}_thermal_transmittance",
|
||||
"is_assumed": f"{variable_to_clean}_is_assumed",
|
||||
}
|
||||
)
|
||||
|
||||
if 'thermal_transmittance_unit' in clean_df.columns:
|
||||
clean_df = clean_df.drop(columns=['thermal_transmittance_unit'])
|
||||
|
||||
starting_size = len(property_attributes)
|
||||
property_attributes = property_attributes.merge(
|
||||
clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
|
||||
)
|
||||
if starting_size != property_attributes.shape[0]:
|
||||
raise Exception("something went wrong")
|
||||
property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
|
||||
# Fill missings
|
||||
for k in clean_df.columns:
|
||||
if k in property_attributes.columns:
|
||||
property_attributes[k] = property_attributes[k].fillna("missing")
|
||||
|
||||
# We group some variables such as thermal transmittance for walls, roof, floors
|
||||
# ranges = {
|
||||
# "< 0.1": (0, 0.1),
|
||||
# "0.1 - 0.3": (0.1, 0.3),
|
||||
# "0.3 - 0.5": (0.3, 0.5),
|
||||
# "0.5 - 0.7": (0.5, 0.7),
|
||||
# "0.9 - 1": (0.9, 1),
|
||||
# "1 - 1.5": (1, 1.5),
|
||||
# "1.5 - 2": (1.5, 2),
|
||||
# "2+": (2, 2.5)
|
||||
# }
|
||||
|
||||
ranges = {
|
||||
"< 0.1": (0, 0.1),
|
||||
"0.1 - 0.3": (0.1, 0.3),
|
||||
"0.3 - 0.5": (0.3, 0.5),
|
||||
"0.5 - 0.7": (0.5, 0.7),
|
||||
"0.9 - 1": (0.9, 1),
|
||||
"1 - 1.5": (1, 1.5),
|
||||
"1.5 - 2": (1.5, 2),
|
||||
"2+": (2, 2.5)
|
||||
"0.5+": (0.5, 2.5),
|
||||
}
|
||||
|
||||
# Generate the lookup table
|
||||
|
|
@ -1733,7 +1775,7 @@ def compile_data_final():
|
|||
]
|
||||
for i, col in enumerate(thermal_transmittance_cols):
|
||||
# Perform the mapping
|
||||
to_col = f"to_{i}"
|
||||
to_col = f"to_{col}"
|
||||
property_attributes[col] = property_attributes[col].astype(str)
|
||||
property_attributes = property_attributes.merge(
|
||||
thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
|
||||
|
|
@ -1750,72 +1792,108 @@ def compile_data_final():
|
|||
# Perform the mapping
|
||||
|
||||
# CLUSTERING!!
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from scipy.spatial.distance import cdist
|
||||
id_column = 'internal_id'
|
||||
property_attributes.set_index(id_column, inplace=True)
|
||||
grouping_columns = [
|
||||
'is_cavity_wall', 'is_solid_brick', 'built-form', 'property-type'
|
||||
]
|
||||
|
||||
# Define the preprocessing for numerical and categorical features
|
||||
numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
|
||||
|
||||
for col in categorical_features:
|
||||
property_attributes[col] = property_attributes[col].astype(str)
|
||||
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
('num', StandardScaler(), numerical_features),
|
||||
('cat', OneHotEncoder(), categorical_features)
|
||||
id_column = 'internal_id'
|
||||
n_clusters = 450
|
||||
random_state = 0
|
||||
|
||||
training_data_grouped = property_attributes.groupby(grouping_columns)
|
||||
group_sizes = {name: len(group) for name, group in training_data_grouped}
|
||||
total_size = sum(group_sizes.values())
|
||||
cluster_allocation = {
|
||||
name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
|
||||
}
|
||||
|
||||
# Adjust cluster allocation to ensure total clusters sum to 450
|
||||
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
|
||||
|
||||
# TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to
|
||||
# collect the results of the clustering and then perform the transformations afterwards
|
||||
|
||||
final_clusters = []
|
||||
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
|
||||
|
||||
group_n_clusters = cluster_allocation[group_variables]
|
||||
group_data.set_index(id_column, inplace=True)
|
||||
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
('num', StandardScaler(), numerical_features),
|
||||
('cat', OneHotEncoder(), categorical_features)
|
||||
]
|
||||
)
|
||||
|
||||
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
||||
('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
|
||||
|
||||
# Fit the pipeline to the data
|
||||
pipeline.fit(group_data)
|
||||
|
||||
# Transform the data using the fitted pipeline
|
||||
processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
|
||||
|
||||
# Get cluster labels
|
||||
group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
|
||||
|
||||
# Get centroids (already in the same transformed space)
|
||||
centroids = pipeline.named_steps['kmeans'].cluster_centers_
|
||||
|
||||
# if the data isn't an array, make it one
|
||||
if not isinstance(processed_data, np.ndarray):
|
||||
processed_data = processed_data.toarray()
|
||||
|
||||
# Calculate distances from each point to the centroid of its cluster
|
||||
distances_to_centroids = [
|
||||
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
|
||||
for i, label in enumerate(group_data['cluster'])
|
||||
]
|
||||
)
|
||||
|
||||
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
||||
('kmeans', KMeans(n_clusters=450, random_state=0))])
|
||||
group_data['distance_to_centroid'] = distances_to_centroids
|
||||
|
||||
# Fit the pipeline to the data
|
||||
pipeline.fit(property_attributes)
|
||||
# for cluster_id in group_data['cluster'].unique():
|
||||
# cluster_data = group_data[group_data['cluster'] == cluster_id]
|
||||
# min_distance = cluster_data['distance_to_centroid'].min()
|
||||
# print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
|
||||
# if min_distance != 0:
|
||||
# print(f"No point with zero distance found in cluster {cluster_id}")
|
||||
|
||||
# Transform the data using the fitted pipeline
|
||||
processed_data = pipeline.named_steps['preprocessor'].transform(property_attributes)
|
||||
# Ranking rows by distance within each cluster
|
||||
group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
|
||||
|
||||
# Get cluster labels
|
||||
property_attributes['cluster'] = pipeline.named_steps['kmeans'].labels_
|
||||
# Sorting to verify
|
||||
group_data.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
group_data.reset_index(inplace=True)
|
||||
|
||||
# Get centroids (already in the same transformed space)
|
||||
centroids = pipeline.named_steps['kmeans'].cluster_centers_
|
||||
to_append = group_data[["internal_id", "cluster", "rank"]].copy()
|
||||
to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
|
||||
final_clusters.append(to_append)
|
||||
|
||||
processed_data = processed_data.toarray()
|
||||
final_clusters = pd.concat(final_clusters)
|
||||
# remap the clusters from the current names to 1 -> n_clusters
|
||||
|
||||
# Calculate distances from each point to the centroid of its cluster
|
||||
distances_to_centroids = [
|
||||
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
|
||||
for i, label in enumerate(property_attributes['cluster'])
|
||||
]
|
||||
|
||||
property_attributes['distance_to_centroid'] = distances_to_centroids
|
||||
|
||||
for cluster_id in property_attributes['cluster'].unique():
|
||||
cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
|
||||
min_distance = cluster_data['distance_to_centroid'].min()
|
||||
print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
|
||||
if min_distance != 0:
|
||||
print(f"No point with zero distance found in cluster {cluster_id}")
|
||||
|
||||
# Ranking rows by distance within each cluster
|
||||
property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(
|
||||
method='first')
|
||||
|
||||
# Sorting to verify
|
||||
property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
|
||||
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
|
||||
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
|
||||
|
||||
################################################
|
||||
# Prepare outputs!!!!
|
||||
################################################
|
||||
|
||||
property_attributes.reset_index(inplace=True)
|
||||
property_attributes = property_attributes.merge(
|
||||
final_clusters, how="left", on="internal_id"
|
||||
)
|
||||
property_attributes["archetype_representative"] = property_attributes["rank"] == 1
|
||||
|
||||
asset_list_with_archetypes = asset_list.merge(
|
||||
|
|
@ -1834,7 +1912,7 @@ def compile_data_final():
|
|||
asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
|
||||
"archetype_representative"].fillna(False)
|
||||
|
||||
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
|
||||
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False)
|
||||
|
||||
stonewater_uprn_lookup = asset_list_with_archetypes[
|
||||
["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue