working on new clustering for stonewater

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-23 18:14:21 +01:00
parent 603b3e1db2
commit 173b7ec434

View file

@ -19,6 +19,7 @@ from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances_argmin_min
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -2200,77 +2201,148 @@ def updated_version():
# Fuel
"Main Fuel",
"Age",
"Total Floor Area"
"Total Floor Area",
"representative_sap",
]
]
z = master_sheet_clustering_features[
["Property Type", "walls_reduced", "roof_type", "roof_insulation_category", "Main Fuel", "Age"]
].drop_duplicates()
def split_property_type(row):
parts = row.split(':')
property_type = parts[0].strip()
built_form = parts[1].strip() if len(parts) > 1 else ''
property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
return pd.Series([property_type, built_form, property_extended_feature])
# TODO: heating - remap
# Boiler: A rated Regular Boiler
# 1944
# Boiler: A rated Combi
# 1335
# Electric Storage Systems: High heat retention storage heaters
# 543
# Electric Storage Systems: Fan storage heaters
# 284
# Electric (direct acting) room heaters: Panel, convector or radiant heaters
# 253
# Boiler: C rated Regular Boiler
# 142
# Community Heating Systems: Community boilers only (RdSAP)
# 127
# Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C
# 126
# Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C
# 70
# Boiler: E rated Regular Boiler
# 62
# Boiler: E rated Combi
# 59
# Electric Storage Systems: Old (large volume) storage heaters
# 55
# Electric Storage Systems: Modern (slimline) storage heaters
# 49
# Boiler: B rated Regular Boiler
# 46
# Boiler: C rated Combi
# 44
# Heat Pump: Electric Heat pumps: Ground source heat pump in other cases
# 39
# Boiler: D rated Regular Boiler
# 16
# Community Heating Systems: Community CHP and boilers (RdSAP)
# 14
# Heat Pump: Electric Heat pumps: Air source heat pump in other cases
# 13
# Boiler: F rated Combi
# 12
# Boiler: G rated Regular Boiler
# 10
# Boiler: A rated Combi, System 2: Electric Storage Systems: High heat retention storage heaters
# 8
# Electric (direct acting) room heaters: Water- or oil-filled radiators
# 4
# Boiler: A rated Combi, System 2: Electric (direct acting) room heaters: Panel, convector or radiant heaters
# 3
# Boiler: D rated Combi
# 3
# Boiler: A rated CPSU
# 2
# Heat Pump: (from database)
# 1
# System 2: Boiler: G rated Regular Boiler, Boiler: A rated Combi
# 1
# No Heating
# 1
# Solid fuel room heaters: Open fire in grate
# 1
# Boiler: F rated Regular Boiler
clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
clustering_features['Property Type'].apply(split_property_type)
)
clustering_features = clustering_features.drop(columns=["Property Type"])
# These are the variables we MUST split by
grouping_columns = [
"property_type",
"walls_reduced",
"roof_type",
"Main Fuel"
]
def combine_small_groups(clustering_features, grouping_columns, threshold=1):
# Identify small groups
group_sizes = clustering_features.groupby(grouping_columns).size()
small_groups = group_sizes[group_sizes <= threshold].index.tolist()
# Remove small groups from the original clustering_features
small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
clustering_features = clustering_features[
~clustering_features.set_index(grouping_columns).index.isin(small_groups)
]
if small_group_data.empty:
return clustering_features
# Combine small groups with the nearest available group
# Using Euclidean distance for numerical features as a simple measure of similarity
numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
small_group_centroids = small_group_data.groupby(grouping_columns)[numerical_features].mean()
large_group_centroids = clustering_features.groupby(grouping_columns)[numerical_features].mean()
closest_groups, _ = pairwise_distances_argmin_min(small_group_centroids.values, large_group_centroids.values)
closest_group_index = large_group_centroids.index[closest_groups]
combined_data = []
for small_group, closest_group in zip(small_groups, closest_group_index):
small_group_data.loc[
small_group_data.set_index(grouping_columns).index == small_group, grouping_columns] = closest_group
combined_data.append(small_group_data[small_group_data.set_index(grouping_columns).index == closest_group])
combined_data = pd.concat(combined_data)
combined_data = pd.concat([clustering_features, combined_data])
return combined_data
clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
########################################################################
# Clustering
########################################################################
numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = clustering_features.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
for col in categorical_features:
clustering_features[col] = clustering_features[col].astype(str)
id_column = 'internal_id'
n_clusters = 450
random_state = 0
training_data_grouped = clustering_features.groupby(grouping_columns)
group_sizes = {name: len(group) for name, group in training_data_grouped}
total_size = sum(group_sizes.values())
cluster_allocation = {
name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
}
# Adjust cluster allocation to ensure total clusters sum to 450
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
final_clusters = []
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
group_n_clusters = cluster_allocation[group_variables]
group_data.set_index(id_column, inplace=True)
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numerical_features),
('cat', OneHotEncoder(), categorical_features)
]
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
# Fit the pipeline to the data
pipeline.fit(group_data)
# Transform the data using the fitted pipeline
processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
# Get cluster labels
group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
# Get centroids (already in the same transformed space)
centroids = pipeline.named_steps['kmeans'].cluster_centers_
# if the data isn't an array, make it one
if not isinstance(processed_data, np.ndarray):
processed_data = processed_data.toarray()
# Calculate distances from each point to the centroid of its cluster
distances_to_centroids = [
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
for i, label in enumerate(group_data['cluster'])
]
group_data['distance_to_centroid'] = distances_to_centroids
# Ranking rows by distance within each cluster
group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
# Sorting to verify
group_data.sort_values(by=['cluster', 'rank'], inplace=True)
group_data.reset_index(inplace=True)
to_append = group_data[["internal_id", "cluster", "rank"]].copy()
to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
final_clusters.append(to_append)
final_clusters = pd.concat(final_clusters)
# remap the clusters from the current names to 1 -> n_clusters
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
def read_asset_list():