revised clustering

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-23 20:07:50 +01:00
parent 3ac9dcd366
commit 36b3b4bea5
2 changed files with 112 additions and 51 deletions

View file

@ -11,8 +11,9 @@ In this script, we do the following things:
import pandas as pd
import json
from utils.s3 import read_pickle_from_s3
from backend.app.utils import sap_to_epc
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V3.csv")
archetyped_asset_list = stonewater_asset_list[
[
"internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
@ -25,28 +26,15 @@ archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])
# Read in and merge on clustering features
clustering_features = read_pickle_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
clustering_features = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv",
)
# Move property-type and built-form to the first two columns
columns_to_move = ['property-type', 'built-form']
# Get the remaining columns
remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
# Create the new column order
new_column_order = columns_to_move + remaining_columns
# Reorder the DataFrame
clustering_features = clustering_features[new_column_order]
archetyped_asset_list = archetyped_asset_list.merge(
clustering_features,
on="internal_id",
how="inner"
)
clustering_features.drop(columns=['uprn', 'Address ID', "rank", "cluster", "archetype_representative"]),
left_on="internal_id",
right_on="Osm. ID"
).drop(columns=["Osm. ID"])
archetyped_asset_list = archetyped_asset_list.rename(
columns={
@ -82,12 +70,47 @@ archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
# We store the location data, which will be used for the mapping. We just need the longitude and latitude
stonewater_asset_list["uprn"] = stonewater_asset_list["uprn"].astype('Int64')
mapping_data = stonewater_asset_list[
stonewater_asset_list["archetype_representative"]
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]].merge(
archetyped_asset_list[["uprn", "Walls", "Roofs", "Main Fuel", "Heating", "Age", "Property Type"]],
how="left",
on="uprn"
)
# We need to merge on longitude and latitude
spatial_data_to_uprn = read_pickle_from_s3(
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
bucket_name="retrofit-data-dev"
)
# Function to convert specific columns to bool dtype
def convert_specific_columns_to_bool(df, columns):
    """Cast the named columns of ``df`` to plain ``bool`` dtype, in place.

    Columns listed in ``columns`` that are not present in ``df`` are skipped.
    Missing values (NaN / None) become ``False``.

    Args:
        df: DataFrame to modify. It is mutated and also returned.
        columns: Iterable of column names to convert when present.

    Returns:
        The same ``df`` object that was passed in.
    """
    for column in columns:
        if column not in df.columns:
            continue
        values = df[column]
        # A bare ``astype(bool)`` maps NaN to True (NaN is truthy) while None
        # in an object column maps to False. Masking with ``notna()`` makes
        # every missing value False, so an absent flag is never reported as
        # a positive one.
        df[column] = values.notna() & values.astype(bool)
    return df
spatial_data_to_uprn = [convert_specific_columns_to_bool(
df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
) for df in spatial_data_to_uprn]
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
spatial_data_to_uprn = spatial_data_to_uprn.drop(
columns=["partition", "filename"]
).rename(columns={"UPRN": "uprn"})
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str).astype("Int64")
mapping_data = mapping_data.merge(
clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
spatial_data_to_uprn[
["uprn", "LONGITUDE", "LATITUDE", "conservation_status", "is_listed_building", "is_heritage_building"]
],
how="left",
on="uprn"
)
mapping_data = mapping_data.drop(columns=["internal_id"])
@ -95,38 +118,28 @@ with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w")
f.write(json.dumps(mapping_data.to_dict(orient="records")))
# We also include some data for visualising the breakdown of EPCS
proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
# Invert the true and false
proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
proportion_of_real_epcs = proportion_of_real_epcs.rename(
columns={"estimated": "is_real_epc"}
)
# proportion_of_real_epcs = (~clustering_features["estimated"]).value_counts().to_frame().reset_index()
# proportion_of_real_epcs = proportion_of_real_epcs.rename(
# columns={"estimated": "is_real_epc"}
# )
#
# with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
# f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
# Produce the breakdown of EPC ratings for properties to be surveyed
clustering_features["representative_epc"] = clustering_features["representative_sap"].apply(sap_to_epc)
# Produce the breakdown of EPC ratings
epc_rating_breakdown = (
clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
clustering_features[clustering_features["archetype_representative"]]["representative_epc"]
.value_counts()
.to_frame()
.reset_index()
)
epc_rating_breakdown = epc_rating_breakdown.rename(
columns={"current-energy-rating": "EPC"}
columns={"index": "EPC", "representative_epc": "count"}
)
with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
epc_a_properties = clustering_features[
(clustering_features["current-energy-rating"] == "A")
& (~clustering_features["estimated"])
]
epc_a_properties = epc_a_properties.merge(
stonewater_asset_list,
on="internal_id",
how="inner"
)

View file

@ -1084,11 +1084,11 @@ def compile_data():
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
# TODO: Let's store this in s3
save_data_to_s3(
data=json.dumps(spatial_data_to_uprn.to_dict("records")),
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
bucket_name="retrofit-data-dev"
)
# save_data_to_s3(
# data=json.dumps(spatial_data_to_uprn.to_dict("records")),
# s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
# bucket_name="retrofit-data-dev"
# )
# We merge this spatial data onto final EPCS
@ -2070,7 +2070,7 @@ def updated_version():
[
"internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
"city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
"epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date"
"epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
]
]
@ -2192,6 +2192,7 @@ def updated_version():
'conservation_status',
'is_listed_building',
'is_heritage_building',
"county",
# Walls
"walls_reduced",
# Roof
@ -2204,9 +2205,12 @@ def updated_version():
"Age",
"Total Floor Area",
"representative_sap",
"days_since_lodgement",
]
]
clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)
def split_property_type(row):
parts = row.split(':')
property_type = parts[0].strip()
@ -2224,10 +2228,11 @@ def updated_version():
"property_type",
"walls_reduced",
"roof_type",
"Main Fuel"
"Main Fuel",
"county",
]
def combine_small_groups(clustering_features, grouping_columns, threshold=1):
def combine_small_groups(clustering_features, grouping_columns, threshold=2):
# Identify small groups
group_sizes = clustering_features.groupby(grouping_columns).size()
small_groups = group_sizes[group_sizes <= threshold].index.tolist()
@ -2397,6 +2402,49 @@ def updated_version():
asset_list_with_archetypes["cluster"]
)
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
asset_list_with_archetypes["archetype_representative"] = (
asset_list_with_archetypes["archetype_representative"].fillna(False)
)
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.csv", index=False)
# Produce the archetyping features
archetyping_features_csv = assigned_clusters[
[
"internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
"is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
]
].merge(
asset_list[
["internal_id", "uprn", "external_address_id"]
],
how="left",
on="internal_id"
).merge(
master_sheet_clustering_features,
how="left",
right_on="Address ID",
left_on="external_address_id"
).drop(columns=["Address ID"]).rename(
columns={
"internal_id": "Osm. ID",
"external_address_id": "Address ID",
}
)
archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
archetyping_features_csv.to_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
)
representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
print(representatives["postal_region"].nunique())
print(representatives["county"].nunique())
def read_asset_list():
asset_list = pd.read_excel(