mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on new clustering for stonewater
This commit is contained in:
parent
603b3e1db2
commit
173b7ec434
1 changed files with 138 additions and 66 deletions
|
|
@ -19,6 +19,7 @@ from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from scipy.spatial.distance import cdist
|
||||
from sklearn.metrics import pairwise_distances_argmin_min
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
|
@ -2200,77 +2201,148 @@ def updated_version():
|
|||
# Fuel
|
||||
"Main Fuel",
|
||||
"Age",
|
||||
"Total Floor Area"
|
||||
"Total Floor Area",
|
||||
"representative_sap",
|
||||
]
|
||||
]
|
||||
|
||||
z = master_sheet_clustering_features[
|
||||
["Property Type", "walls_reduced", "roof_type", "roof_insulation_category", "Main Fuel", "Age"]
|
||||
].drop_duplicates()
|
||||
def split_property_type(row):
|
||||
parts = row.split(':')
|
||||
property_type = parts[0].strip()
|
||||
built_form = parts[1].strip() if len(parts) > 1 else ''
|
||||
property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
|
||||
return pd.Series([property_type, built_form, property_extended_feature])
|
||||
|
||||
# TODO: heating - remap
|
||||
# Boiler: A rated Regular Boiler
|
||||
# 1944
|
||||
# Boiler: A rated Combi
|
||||
# 1335
|
||||
# Electric Storage Systems: High heat retention storage heaters
|
||||
# 543
|
||||
# Electric Storage Systems: Fan storage heaters
|
||||
# 284
|
||||
# Electric (direct acting) room heaters: Panel, convector or radiant heaters
|
||||
# 253
|
||||
# Boiler: C rated Regular Boiler
|
||||
# 142
|
||||
# Community Heating Systems: Community boilers only (RdSAP)
|
||||
# 127
|
||||
# Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C
|
||||
# 126
|
||||
# Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C
|
||||
# 70
|
||||
# Boiler: E rated Regular Boiler
|
||||
# 62
|
||||
# Boiler: E rated Combi
|
||||
# 59
|
||||
# Electric Storage Systems: Old (large volume) storage heaters
|
||||
# 55
|
||||
# Electric Storage Systems: Modern (slimline) storage heaters
|
||||
# 49
|
||||
# Boiler: B rated Regular Boiler
|
||||
# 46
|
||||
# Boiler: C rated Combi
|
||||
# 44
|
||||
# Heat Pump: Electric Heat pumps: Ground source heat pump in other cases
|
||||
# 39
|
||||
# Boiler: D rated Regular Boiler
|
||||
# 16
|
||||
# Community Heating Systems: Community CHP and boilers (RdSAP)
|
||||
# 14
|
||||
# Heat Pump: Electric Heat pumps: Air source heat pump in other cases
|
||||
# 13
|
||||
# Boiler: F rated Combi
|
||||
# 12
|
||||
# Boiler: G rated Regular Boiler
|
||||
# 10
|
||||
# Boiler: A rated Combi, System 2: Electric Storage Systems: High heat retention storage heaters
|
||||
# 8
|
||||
# Electric (direct acting) room heaters: Water- or oil-filled radiators
|
||||
# 4
|
||||
# Boiler: A rated Combi, System 2: Electric (direct acting) room heaters: Panel, convector or radiant heaters
|
||||
# 3
|
||||
# Boiler: D rated Combi
|
||||
# 3
|
||||
# Boiler: A rated CPSU
|
||||
# 2
|
||||
# Heat Pump: (from database)
|
||||
# 1
|
||||
# System 2: Boiler: G rated Regular Boiler, Boiler: A rated Combi
|
||||
# 1
|
||||
# No Heating
|
||||
# 1
|
||||
# Solid fuel room heaters: Open fire in grate
|
||||
# 1
|
||||
# Boiler: F rated Regular Boiler
|
||||
clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
|
||||
clustering_features['Property Type'].apply(split_property_type)
|
||||
)
|
||||
clustering_features = clustering_features.drop(columns=["Property Type"])
|
||||
|
||||
# These are the variables we MUST split by
|
||||
grouping_columns = [
|
||||
"property_type",
|
||||
"walls_reduced",
|
||||
"roof_type",
|
||||
"Main Fuel"
|
||||
]
|
||||
|
||||
def combine_small_groups(clustering_features, grouping_columns, threshold=1):
|
||||
# Identify small groups
|
||||
group_sizes = clustering_features.groupby(grouping_columns).size()
|
||||
small_groups = group_sizes[group_sizes <= threshold].index.tolist()
|
||||
|
||||
# Remove small groups from the original clustering_features
|
||||
small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
|
||||
clustering_features = clustering_features[
|
||||
~clustering_features.set_index(grouping_columns).index.isin(small_groups)
|
||||
]
|
||||
|
||||
if small_group_data.empty:
|
||||
return clustering_features
|
||||
|
||||
# Combine small groups with the nearest available group
|
||||
# Using Euclidean distance for numerical features as a simple measure of similarity
|
||||
numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
small_group_centroids = small_group_data.groupby(grouping_columns)[numerical_features].mean()
|
||||
large_group_centroids = clustering_features.groupby(grouping_columns)[numerical_features].mean()
|
||||
|
||||
closest_groups, _ = pairwise_distances_argmin_min(small_group_centroids.values, large_group_centroids.values)
|
||||
closest_group_index = large_group_centroids.index[closest_groups]
|
||||
|
||||
combined_data = []
|
||||
for small_group, closest_group in zip(small_groups, closest_group_index):
|
||||
small_group_data.loc[
|
||||
small_group_data.set_index(grouping_columns).index == small_group, grouping_columns] = closest_group
|
||||
combined_data.append(small_group_data[small_group_data.set_index(grouping_columns).index == closest_group])
|
||||
|
||||
combined_data = pd.concat(combined_data)
|
||||
|
||||
combined_data = pd.concat([clustering_features, combined_data])
|
||||
|
||||
return combined_data
|
||||
|
||||
clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
|
||||
|
||||
########################################################################
|
||||
# Clustering
|
||||
########################################################################
|
||||
numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
categorical_features = clustering_features.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
|
||||
|
||||
for col in categorical_features:
|
||||
clustering_features[col] = clustering_features[col].astype(str)
|
||||
|
||||
id_column = 'internal_id'
|
||||
n_clusters = 450
|
||||
random_state = 0
|
||||
|
||||
training_data_grouped = clustering_features.groupby(grouping_columns)
|
||||
group_sizes = {name: len(group) for name, group in training_data_grouped}
|
||||
total_size = sum(group_sizes.values())
|
||||
cluster_allocation = {
|
||||
name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
|
||||
}
|
||||
|
||||
# Adjust cluster allocation to ensure total clusters sum to 450
|
||||
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
|
||||
|
||||
final_clusters = []
|
||||
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
|
||||
|
||||
group_n_clusters = cluster_allocation[group_variables]
|
||||
group_data.set_index(id_column, inplace=True)
|
||||
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
('num', StandardScaler(), numerical_features),
|
||||
('cat', OneHotEncoder(), categorical_features)
|
||||
]
|
||||
)
|
||||
|
||||
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
||||
('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
|
||||
|
||||
# Fit the pipeline to the data
|
||||
pipeline.fit(group_data)
|
||||
|
||||
# Transform the data using the fitted pipeline
|
||||
processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
|
||||
|
||||
# Get cluster labels
|
||||
group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
|
||||
|
||||
# Get centroids (already in the same transformed space)
|
||||
centroids = pipeline.named_steps['kmeans'].cluster_centers_
|
||||
|
||||
# if the data isn't an array, make it one
|
||||
if not isinstance(processed_data, np.ndarray):
|
||||
processed_data = processed_data.toarray()
|
||||
|
||||
# Calculate distances from each point to the centroid of its cluster
|
||||
distances_to_centroids = [
|
||||
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
|
||||
for i, label in enumerate(group_data['cluster'])
|
||||
]
|
||||
|
||||
group_data['distance_to_centroid'] = distances_to_centroids
|
||||
|
||||
# Ranking rows by distance within each cluster
|
||||
group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
|
||||
|
||||
# Sorting to verify
|
||||
group_data.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
group_data.reset_index(inplace=True)
|
||||
|
||||
to_append = group_data[["internal_id", "cluster", "rank"]].copy()
|
||||
to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
|
||||
final_clusters.append(to_append)
|
||||
|
||||
final_clusters = pd.concat(final_clusters)
|
||||
# remap the clusters from the current names to 1 -> n_clusters
|
||||
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
|
||||
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
|
||||
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
|
||||
|
||||
|
||||
def read_asset_list():
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue