creating output

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-23 18:52:09 +01:00
parent 173b7ec434
commit c9af04de73

View file

@ -2064,7 +2064,8 @@ def updated_version():
)
clustering_features = asset_list[
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"]
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
~pd.isnull(asset_list["uprn"])
][
[
"internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
@ -2233,32 +2234,44 @@ def updated_version():
# Remove small groups from the original clustering_features
small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
clustering_features = clustering_features[
clustering_features_ok = clustering_features[
~clustering_features.set_index(grouping_columns).index.isin(small_groups)
]
if small_group_data.empty:
return clustering_features
# One-Hot Encode categorical variables
categorical_features = (
clustering_features_ok.drop(columns=["internal_id"])
.select_dtypes(include=['object', 'category']).columns.tolist()
)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(clustering_features_ok[categorical_features])
# Combine small groups with the nearest available group
# Using Euclidean distance for numerical features as a simple measure of similarity
numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
small_group_centroids = small_group_data.groupby(grouping_columns)[numerical_features].mean()
large_group_centroids = clustering_features.groupby(grouping_columns)[numerical_features].mean()
small_group_ohe = ohe.transform(small_group_data[categorical_features])
large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])
closest_groups, _ = pairwise_distances_argmin_min(small_group_centroids.values, large_group_centroids.values)
closest_group_index = large_group_centroids.index[closest_groups]
numerical_features = clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns.tolist()
small_group_numerical = small_group_data[numerical_features].values
large_group_numerical = clustering_features_ok[numerical_features].values
combined_data = []
# Concatenate one-hot encoded categorical and numerical features
small_group_features = np.hstack([small_group_ohe, small_group_numerical])
large_group_features = np.hstack([large_group_ohe, large_group_numerical])
# Calculate distances and find nearest groups
closest_groups, _ = pairwise_distances_argmin_min(small_group_features, large_group_features)
closest_group_index = clustering_features_ok.iloc[closest_groups].index
# Update small groups to the nearest large group
for small_group, closest_group in zip(small_groups, closest_group_index):
small_group_data.loc[
small_group_data.set_index(grouping_columns).index == small_group, grouping_columns] = closest_group
combined_data.append(small_group_data[small_group_data.set_index(grouping_columns).index == closest_group])
combined_data = pd.concat(combined_data)
combined_data = pd.concat([clustering_features, combined_data])
small_group_mask = small_group_data.set_index(grouping_columns).index == small_group
small_group_data.loc[small_group_mask, grouping_columns] = clustering_features_ok.loc[
closest_group, grouping_columns].values
combined_data = pd.concat([clustering_features_ok, small_group_data])
return combined_data
clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
@ -2266,18 +2279,18 @@ def updated_version():
########################################################################
# Clustering
########################################################################
numerical_features = clustering_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = clustering_features.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = clustering_features_combined.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
for col in categorical_features:
clustering_features[col] = clustering_features[col].astype(str)
clustering_features_combined[col] = clustering_features_combined[col].astype(str)
id_column = 'internal_id'
n_clusters = 450
random_state = 0
training_data_grouped = clustering_features.groupby(grouping_columns)
training_data_grouped = clustering_features_combined.groupby(grouping_columns)
group_sizes = {name: len(group) for name, group in training_data_grouped}
total_size = sum(group_sizes.values())
cluster_allocation = {
@ -2344,6 +2357,46 @@ def updated_version():
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
assigned_clusters = clustering_features_combined.merge(
final_clusters, how="left", on="internal_id"
)
assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1
asset_list_with_archetypes = asset_list.merge(
assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
on="internal_id"
)
# We populate the reasons for no archetype
# 1) If it's not a priority postcode
asset_list_with_archetypes["cluster"] = np.where(
~asset_list_with_archetypes["is_priority_postcode"],
"NOT PRIORITY POSTCODE",
asset_list_with_archetypes["cluster"]
)
# 2) If it's EPC C or above
asset_list_with_archetypes["cluster"] = np.where(
asset_list_with_archetypes["is_epc_c_or_above"],
"EPC C OR ABOVE",
asset_list_with_archetypes["cluster"]
)
# If it's in Wave 2.1
asset_list_with_archetypes["cluster"] = np.where(
asset_list_with_archetypes["In Osmosis Wave 2.1"],
"IN WAVE 2.1",
asset_list_with_archetypes["cluster"]
)
# Has missing uprn
asset_list_with_archetypes["cluster"] = np.where(
pd.isnull(asset_list_with_archetypes["uprn"]),
"MISSING UPRN",
asset_list_with_archetypes["cluster"]
)
def read_asset_list():
asset_list = pd.read_excel(