mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
revised clustering
This commit is contained in:
parent
3ac9dcd366
commit
36b3b4bea5
2 changed files with 112 additions and 51 deletions
|
|
@ -11,8 +11,9 @@ In this script, we do the following things:
|
|||
import pandas as pd
|
||||
import json
|
||||
from utils.s3 import read_pickle_from_s3
|
||||
from backend.app.utils import sap_to_epc
|
||||
|
||||
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
|
||||
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V3.csv")
|
||||
archetyped_asset_list = stonewater_asset_list[
|
||||
[
|
||||
"internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
|
||||
|
|
@ -25,28 +26,15 @@ archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
|
|||
archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])
|
||||
|
||||
# Read in and merge on clustering features
|
||||
clustering_features = read_pickle_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
||||
clustering_features = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv",
|
||||
)
|
||||
|
||||
# Move property-type and built-form to the first two columns
|
||||
columns_to_move = ['property-type', 'built-form']
|
||||
|
||||
# Get the remaining columns
|
||||
remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
|
||||
|
||||
# Create the new column order
|
||||
new_column_order = columns_to_move + remaining_columns
|
||||
|
||||
# Reorder the DataFrame
|
||||
clustering_features = clustering_features[new_column_order]
|
||||
|
||||
archetyped_asset_list = archetyped_asset_list.merge(
|
||||
clustering_features,
|
||||
on="internal_id",
|
||||
how="inner"
|
||||
)
|
||||
clustering_features.drop(columns=['uprn', 'Address ID', "rank", "cluster", "archetype_representative"]),
|
||||
left_on="internal_id",
|
||||
right_on="Osm. ID"
|
||||
).drop(columns=["Osm. ID"])
|
||||
|
||||
archetyped_asset_list = archetyped_asset_list.rename(
|
||||
columns={
|
||||
|
|
@ -82,12 +70,47 @@ archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
|
|||
# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
|
||||
|
||||
# We store the location data, which will be used for the mapping. We just need the longitude and latitude
|
||||
stonewater_asset_list["uprn"] = stonewater_asset_list["uprn"].astype('Int64')
|
||||
|
||||
mapping_data = stonewater_asset_list[
|
||||
stonewater_asset_list["archetype_representative"]
|
||||
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
|
||||
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]].merge(
|
||||
archetyped_asset_list[["uprn", "Walls", "Roofs", "Main Fuel", "Heating", "Age", "Property Type"]],
|
||||
how="left",
|
||||
on="uprn"
|
||||
)
|
||||
|
||||
# We need to merge on longitude and latitude
|
||||
spatial_data_to_uprn = read_pickle_from_s3(
|
||||
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
|
||||
# Function to convert specific columns to bool dtype
|
||||
def convert_specific_columns_to_bool(df, columns):
    """Cast the named columns of ``df`` to ``bool`` dtype, in place.

    Args:
        df: DataFrame to update. It is modified in place.
        columns: Column names to convert. Names that are not present in
            ``df`` are skipped.

    Returns:
        The same ``df`` object, so the call can be used inside a
        comprehension.
    """
    for column in columns:
        if column in df.columns:
            # A plain ``astype(bool)`` turns NaN into True (NaN is truthy),
            # which would flag rows whose value is simply missing. Treat
            # missing values as False instead.
            df[column] = df[column].notna() & df[column].astype(bool)
    return df
|
||||
|
||||
|
||||
spatial_data_to_uprn = [convert_specific_columns_to_bool(
|
||||
df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
|
||||
) for df in spatial_data_to_uprn]
|
||||
|
||||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
spatial_data_to_uprn = spatial_data_to_uprn.drop(
|
||||
columns=["partition", "filename"]
|
||||
).rename(columns={"UPRN": "uprn"})
|
||||
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str).astype("Int64")
|
||||
|
||||
mapping_data = mapping_data.merge(
|
||||
clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
|
||||
spatial_data_to_uprn[
|
||||
["uprn", "LONGITUDE", "LATITUDE", "conservation_status", "is_listed_building", "is_heritage_building"]
|
||||
],
|
||||
how="left",
|
||||
on="uprn"
|
||||
)
|
||||
mapping_data = mapping_data.drop(columns=["internal_id"])
|
||||
|
||||
|
|
@ -95,38 +118,28 @@ with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w")
|
|||
f.write(json.dumps(mapping_data.to_dict(orient="records")))
|
||||
|
||||
# We also include some data for visualising the breakdown of EPCS
|
||||
proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
|
||||
# Invert the true and false
|
||||
proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
|
||||
proportion_of_real_epcs = proportion_of_real_epcs.rename(
|
||||
columns={"estimated": "is_real_epc"}
|
||||
)
|
||||
# proportion_of_real_epcs = (~clustering_features["estimated"]).value_counts().to_frame().reset_index()
|
||||
# proportion_of_real_epcs = proportion_of_real_epcs.rename(
|
||||
# columns={"estimated": "is_real_epc"}
|
||||
# )
|
||||
#
|
||||
# with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
|
||||
# f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
|
||||
|
||||
with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
|
||||
f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
|
||||
# Produce the breakdown of EPC ratings for properties to be surveyed
|
||||
clustering_features["representative_epc"] = clustering_features["representative_sap"].apply(sap_to_epc)
|
||||
|
||||
# Produce the breakdown of EPC ratings
|
||||
epc_rating_breakdown = (
|
||||
clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
|
||||
clustering_features[clustering_features["archetype_representative"]]["representative_epc"]
|
||||
.value_counts()
|
||||
.to_frame()
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
epc_rating_breakdown = epc_rating_breakdown.rename(
|
||||
columns={"current-energy-rating": "EPC"}
|
||||
columns={"index": "EPC", "representative_epc": "count"}
|
||||
)
|
||||
|
||||
with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
|
||||
f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
|
||||
|
||||
epc_a_properties = clustering_features[
|
||||
(clustering_features["current-energy-rating"] == "A")
|
||||
& (~clustering_features["estimated"])
|
||||
]
|
||||
|
||||
epc_a_properties = epc_a_properties.merge(
|
||||
stonewater_asset_list,
|
||||
on="internal_id",
|
||||
how="inner"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1084,11 +1084,11 @@ def compile_data():
|
|||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
|
||||
# TODO: Let's store this in s3
|
||||
save_data_to_s3(
|
||||
data=json.dumps(spatial_data_to_uprn.to_dict("records")),
|
||||
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(spatial_data_to_uprn.to_dict("records")),
|
||||
# s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
# We merge this spatial data onto final EPCS
|
||||
|
||||
|
|
@ -2070,7 +2070,7 @@ def updated_version():
|
|||
[
|
||||
"internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
|
||||
"city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
|
||||
"epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date"
|
||||
"epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
|
||||
]
|
||||
]
|
||||
|
||||
|
|
@ -2192,6 +2192,7 @@ def updated_version():
|
|||
'conservation_status',
|
||||
'is_listed_building',
|
||||
'is_heritage_building',
|
||||
"county",
|
||||
# Walls
|
||||
"walls_reduced",
|
||||
# Roof
|
||||
|
|
@ -2204,9 +2205,12 @@ def updated_version():
|
|||
"Age",
|
||||
"Total Floor Area",
|
||||
"representative_sap",
|
||||
"days_since_lodgement",
|
||||
]
|
||||
]
|
||||
|
||||
clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)
|
||||
|
||||
def split_property_type(row):
|
||||
parts = row.split(':')
|
||||
property_type = parts[0].strip()
|
||||
|
|
@ -2224,10 +2228,11 @@ def updated_version():
|
|||
"property_type",
|
||||
"walls_reduced",
|
||||
"roof_type",
|
||||
"Main Fuel"
|
||||
"Main Fuel",
|
||||
"county",
|
||||
]
|
||||
|
||||
def combine_small_groups(clustering_features, grouping_columns, threshold=1):
|
||||
def combine_small_groups(clustering_features, grouping_columns, threshold=2):
|
||||
# Identify small groups
|
||||
group_sizes = clustering_features.groupby(grouping_columns).size()
|
||||
small_groups = group_sizes[group_sizes <= threshold].index.tolist()
|
||||
|
|
@ -2397,6 +2402,49 @@ def updated_version():
|
|||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
|
||||
|
||||
asset_list_with_archetypes["archetype_representative"] = (
|
||||
asset_list_with_archetypes["archetype_representative"].fillna(False)
|
||||
)
|
||||
|
||||
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.csv", index=False)
|
||||
|
||||
# Produce the archetyping features
|
||||
archetyping_features_csv = assigned_clusters[
|
||||
[
|
||||
"internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
|
||||
"is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
|
||||
]
|
||||
].merge(
|
||||
asset_list[
|
||||
["internal_id", "uprn", "external_address_id"]
|
||||
],
|
||||
how="left",
|
||||
on="internal_id"
|
||||
).merge(
|
||||
master_sheet_clustering_features,
|
||||
how="left",
|
||||
right_on="Address ID",
|
||||
left_on="external_address_id"
|
||||
).drop(columns=["Address ID"]).rename(
|
||||
columns={
|
||||
"internal_id": "Osm. ID",
|
||||
"external_address_id": "Address ID",
|
||||
}
|
||||
)
|
||||
|
||||
archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
|
||||
archetyping_features_csv.to_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
|
||||
)
|
||||
|
||||
representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
|
||||
print(representatives["postal_region"].nunique())
|
||||
print(representatives["county"].nunique())
|
||||
|
||||
|
||||
def read_asset_list():
|
||||
asset_list = pd.read_excel(
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue