mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
creating output
This commit is contained in:
parent
173b7ec434
commit
c9af04de73
1 changed files with 73 additions and 20 deletions
|
|
@ -2064,7 +2064,8 @@ def updated_version():
|
|||
)
|
||||
|
||||
clustering_features = asset_list[
|
||||
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"]
|
||||
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
|
||||
~pd.isnull(asset_list["uprn"])
|
||||
][
|
||||
[
|
||||
"internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
|
||||
|
|
@ -2233,32 +2234,44 @@ def updated_version():
|
|||
|
||||
# Remove small groups from the original clustering_features
|
||||
small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
|
||||
clustering_features = clustering_features[
|
||||
clustering_features_ok = clustering_features[
|
||||
~clustering_features.set_index(grouping_columns).index.isin(small_groups)
|
||||
]
|
||||
|
||||
if small_group_data.empty:
|
||||
return clustering_features
|
||||
|
||||
# One-Hot Encode categorical variables
|
||||
categorical_features = (
|
||||
clustering_features_ok.drop(columns=["internal_id"])
|
||||
.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
)
|
||||
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
|
||||
ohe.fit(clustering_features_ok[categorical_features])
|
||||
|
||||
# Combine small groups with the nearest available group
|
||||
# Using Euclidean distance for numerical features as a simple measure of similarity
|
||||
numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
small_group_centroids = small_group_data.groupby(grouping_columns)[numerical_features].mean()
|
||||
large_group_centroids = clustering_features.groupby(grouping_columns)[numerical_features].mean()
|
||||
small_group_ohe = ohe.transform(small_group_data[categorical_features])
|
||||
large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])
|
||||
|
||||
closest_groups, _ = pairwise_distances_argmin_min(small_group_centroids.values, large_group_centroids.values)
|
||||
closest_group_index = large_group_centroids.index[closest_groups]
|
||||
numerical_features = clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
small_group_numerical = small_group_data[numerical_features].values
|
||||
large_group_numerical = clustering_features_ok[numerical_features].values
|
||||
|
||||
combined_data = []
|
||||
# Concatenate one-hot encoded categorical and numerical features
|
||||
small_group_features = np.hstack([small_group_ohe, small_group_numerical])
|
||||
large_group_features = np.hstack([large_group_ohe, large_group_numerical])
|
||||
|
||||
# Calculate distances and find nearest groups
|
||||
closest_groups, _ = pairwise_distances_argmin_min(small_group_features, large_group_features)
|
||||
closest_group_index = clustering_features_ok.iloc[closest_groups].index
|
||||
|
||||
# Update small groups to the nearest large group
|
||||
for small_group, closest_group in zip(small_groups, closest_group_index):
|
||||
small_group_data.loc[
|
||||
small_group_data.set_index(grouping_columns).index == small_group, grouping_columns] = closest_group
|
||||
combined_data.append(small_group_data[small_group_data.set_index(grouping_columns).index == closest_group])
|
||||
|
||||
combined_data = pd.concat(combined_data)
|
||||
|
||||
combined_data = pd.concat([clustering_features, combined_data])
|
||||
small_group_mask = small_group_data.set_index(grouping_columns).index == small_group
|
||||
small_group_data.loc[small_group_mask, grouping_columns] = clustering_features_ok.loc[
|
||||
closest_group, grouping_columns].values
|
||||
|
||||
combined_data = pd.concat([clustering_features_ok, small_group_data])
|
||||
return combined_data
|
||||
|
||||
clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
|
||||
|
|
@ -2266,18 +2279,18 @@ def updated_version():
|
|||
########################################################################
|
||||
# Clustering
|
||||
########################################################################
|
||||
numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
categorical_features = clustering_features.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
numerical_features = clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
categorical_features = clustering_features_combined.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
|
||||
|
||||
for col in categorical_features:
|
||||
clustering_features[col] = clustering_features[col].astype(str)
|
||||
clustering_features_combined[col] = clustering_features_combined[col].astype(str)
|
||||
|
||||
id_column = 'internal_id'
|
||||
n_clusters = 450
|
||||
random_state = 0
|
||||
|
||||
training_data_grouped = clustering_features.groupby(grouping_columns)
|
||||
training_data_grouped = clustering_features_combined.groupby(grouping_columns)
|
||||
group_sizes = {name: len(group) for name, group in training_data_grouped}
|
||||
total_size = sum(group_sizes.values())
|
||||
cluster_allocation = {
|
||||
|
|
@ -2344,6 +2357,46 @@ def updated_version():
|
|||
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
|
||||
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
|
||||
|
||||
assigned_clusters = clustering_features_combined.merge(
|
||||
final_clusters, how="left", on="internal_id"
|
||||
)
|
||||
|
||||
assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1
|
||||
|
||||
asset_list_with_archetypes = asset_list.merge(
|
||||
assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
|
||||
on="internal_id"
|
||||
)
|
||||
|
||||
# We populate the reasons for no archetype
|
||||
# 1) If it's not a priority postcode
|
||||
asset_list_with_archetypes["cluster"] = np.where(
|
||||
~asset_list_with_archetypes["is_priority_postcode"],
|
||||
"NOT PRIORITY POSTCODE",
|
||||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
# 2) If it's EPC C or above
|
||||
asset_list_with_archetypes["cluster"] = np.where(
|
||||
asset_list_with_archetypes["is_epc_c_or_above"],
|
||||
"EPC C OR ABOVE",
|
||||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
# If it's in Wave 2.1
|
||||
asset_list_with_archetypes["cluster"] = np.where(
|
||||
asset_list_with_archetypes["In Osmosis Wave 2.1"],
|
||||
"IN WAVE 2.1",
|
||||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
# Has missing uprn
|
||||
asset_list_with_archetypes["cluster"] = np.where(
|
||||
pd.isnull(asset_list_with_archetypes["uprn"]),
|
||||
"MISSING UPRN",
|
||||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
|
||||
def read_asset_list():
|
||||
asset_list = pd.read_excel(
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue