From 36b3b4bea55c136a34f46acb2cc13ed4a3aa2529 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 23 Jul 2024 20:07:50 +0100 Subject: [PATCH] revised clusterin --- .../stonewater/outputs 27th June 2024.py | 99 +++++++++++-------- etl/customers/stonewater/shdf_3_clustering.py | 64 ++++++++++-- 2 files changed, 112 insertions(+), 51 deletions(-) diff --git a/etl/customers/stonewater/outputs 27th June 2024.py b/etl/customers/stonewater/outputs 27th June 2024.py index 7a78469c..cf0c6478 100644 --- a/etl/customers/stonewater/outputs 27th June 2024.py +++ b/etl/customers/stonewater/outputs 27th June 2024.py @@ -11,8 +11,9 @@ In this script, we do the following things: import pandas as pd import json from utils.s3 import read_pickle_from_s3 +from backend.app.utils import sap_to_epc -stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv") +stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V3.csv") archetyped_asset_list = stonewater_asset_list[ [ "internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster", @@ -25,28 +26,15 @@ archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int) archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"]) # Read in and merge on clustering features -clustering_features = read_pickle_from_s3( - bucket_name="retrofit-data-dev", - s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl" +clustering_features = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", ) -# Move property-type and built-form to the first two columns -columns_to_move = ['property-type', 'built-form'] - -# Get the remaining columns -remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move] - -# Create the new column order -new_column_order = columns_to_move + remaining_columns - -# Reorder the DataFrame -clustering_features = clustering_features[new_column_order] - archetyped_asset_list = archetyped_asset_list.merge( - clustering_features, - on="internal_id", - how="inner" -) + clustering_features.drop(columns=['uprn', 'Address ID', "rank", "cluster", "archetype_representative"]), + left_on="internal_id", + right_on="Osm. ID" +).drop(columns=["Osm. ID"]) archetyped_asset_list = archetyped_asset_list.rename( columns={ @@ -82,12 +70,47 @@ archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64') # archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False) # We store the location data, which will be used for the mapping. We just need the longitude and latitude +stonewater_asset_list["uprn"] = stonewater_asset_list["uprn"].astype('Int64') + mapping_data = stonewater_asset_list[ stonewater_asset_list["archetype_representative"] -][["internal_id", "uprn", "standardised_address", "standardised_postcode"]] +][["internal_id", "uprn", "standardised_address", "standardised_postcode"]].merge( + archetyped_asset_list[["uprn", "Walls", "Roofs", "Main Fuel", "Heating", "Age", "Property Type"]], + how="left", + on="uprn" +) + +# We need to merge on longitude and latitude +spatial_data_to_uprn = read_pickle_from_s3( + s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", + bucket_name="retrofit-data-dev" +) + + +# Function to convert specific columns to bool dtype +def convert_specific_columns_to_bool(df, columns): + for column in columns: + if column in df.columns: + df[column] = df[column].astype(bool) + return df + + +spatial_data_to_uprn = [convert_specific_columns_to_bool( + df, ['conservation_status', 'is_listed_building', 'is_heritage_building'] +) for df in spatial_data_to_uprn] + +spatial_data_to_uprn = pd.concat(spatial_data_to_uprn) +spatial_data_to_uprn = spatial_data_to_uprn.drop( + columns=["partition", "filename"] +).rename(columns={"UPRN": "uprn"}) +spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str).astype("Int64") mapping_data = mapping_data.merge( - clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]], + spatial_data_to_uprn[ + ["uprn", "LONGITUDE", "LATITUDE", "conservation_status", "is_listed_building", "is_heritage_building"] + ], + how="left", + on="uprn" ) mapping_data = mapping_data.drop(columns=["internal_id"]) @@ -95,38 +118,28 @@ with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w") f.write(json.dumps(mapping_data.to_dict(orient="records"))) # We also include some data for visualising the breakdown of EPCS -proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index() # Invert the true and false -proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"] -proportion_of_real_epcs = proportion_of_real_epcs.rename( - columns={"estimated": "is_real_epc"} -) +# proportion_of_real_epcs = (~clustering_features["estimated"]).value_counts().to_frame().reset_index() +# proportion_of_real_epcs = proportion_of_real_epcs.rename( +# columns={"estimated": "is_real_epc"} +# ) +# +# with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f: +# f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records"))) -with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f: - f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records"))) +# Produce the breakdown of EPC ratings for properties to be surveyed +clustering_features["representative_epc"] = clustering_features["representative_sap"].apply(sap_to_epc) -# Produce the breakdown of EPC ratings epc_rating_breakdown = ( - clustering_features[~clustering_features["estimated"]]["current-energy-rating"] + clustering_features[clustering_features["archetype_representative"]]["representative_epc"] .value_counts() .to_frame() .reset_index() ) epc_rating_breakdown = epc_rating_breakdown.rename( - columns={"current-energy-rating": "EPC"} + columns={"index": "EPC", "representative_epc": "count"} ) with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f: f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records"))) - -epc_a_properties = clustering_features[ - (clustering_features["current-energy-rating"] == "A") - & (~clustering_features["estimated"]) - ] - -epc_a_properties = epc_a_properties.merge( - stonewater_asset_list, - on="internal_id", - how="inner" -) diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index 7d0b6336..18cfee79 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -1084,11 +1084,11 @@ def compile_data(): spatial_data_to_uprn = pd.concat(spatial_data_to_uprn) # TODO: Let's store this in s3 - save_data_to_s3( - data=json.dumps(spatial_data_to_uprn.to_dict("records")), - s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json", - bucket_name="retrofit-data-dev" - ) + # save_data_to_s3( + # data=json.dumps(spatial_data_to_uprn.to_dict("records")), + # s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json", + # bucket_name="retrofit-data-dev" + # ) # We merge this spatial data onto final EPCS @@ -2070,7 +2070,7 @@ def updated_version(): [ "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2", "city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date", - "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date" + "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date", ] ] @@ -2192,6 +2192,7 @@ def updated_version(): 'conservation_status', 'is_listed_building', 'is_heritage_building', + "county", # Walls "walls_reduced", # Roof @@ -2204,9 +2205,12 @@ def updated_version(): "Age", "Total Floor Area", "representative_sap", + "days_since_lodgement", ] ] + clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999) + def split_property_type(row): parts = row.split(':') property_type = parts[0].strip() @@ -2224,10 +2228,11 @@ def updated_version(): "property_type", "walls_reduced", "roof_type", - "Main Fuel" + "Main Fuel", + "county", ] - def combine_small_groups(clustering_features, grouping_columns, threshold=1): + def combine_small_groups(clustering_features, grouping_columns, threshold=2): # Identify small groups group_sizes = clustering_features.groupby(grouping_columns).size() small_groups = group_sizes[group_sizes <= threshold].index.tolist() @@ -2397,6 +2402,49 @@ def updated_version(): asset_list_with_archetypes["cluster"] ) + asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999) + asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str) + asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE") + + asset_list_with_archetypes["archetype_representative"] = ( + asset_list_with_archetypes["archetype_representative"].fillna(False) + ) + + asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.csv", index=False) + + # Produce the archetyping features + archetyping_features_csv = assigned_clusters[ + [ + "internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building", + "is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement" + ] + ].merge( + asset_list[ + ["internal_id", "uprn", "external_address_id"] + ], + how="left", + on="internal_id" + ).merge( + master_sheet_clustering_features, + how="left", + right_on="Address ID", + left_on="external_address_id" + ).drop(columns=["Address ID"]).rename( + columns={ + "internal_id": "Osm. ID", + "external_address_id": "Address ID", + } + ) + + archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True) + archetyping_features_csv.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False + ) + + representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]] + print(representatives["postal_region"].nunique()) + print(representatives["county"].nunique()) + def read_asset_list(): asset_list = pd.read_excel(