diff --git a/.idea/misc.xml b/.idea/misc.xml
index 1122b380..78660f34 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -4,6 +4,9 @@
+
+
+
diff --git a/etl/customers/stonewater/outputs 27th June 2024.py b/etl/customers/stonewater/outputs 27th June 2024.py
index d8bf43be..7a78469c 100644
--- a/etl/customers/stonewater/outputs 27th June 2024.py
+++ b/etl/customers/stonewater/outputs 27th June 2024.py
@@ -9,14 +9,16 @@ In this script, we do the following things:
3) Mapping of the archetypes
"""
import pandas as pd
+import json
from utils.s3 import read_pickle_from_s3
-archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
-archetyped_asset_list = archetyped_asset_list[
+stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
+archetyped_asset_list = stonewater_asset_list[
[
- "internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
+ "internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
+ "archetype_representative", "rank"
]
-]
+].copy()
archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"]
archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
# Sort
@@ -28,12 +30,38 @@ clustering_features = read_pickle_from_s3(
s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
)
+# Move property-type and built-form to the first two columns
+columns_to_move = ['property-type', 'built-form']
+
+# Get the remaining columns
+remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
+
+# Create the new column order
+new_column_order = columns_to_move + remaining_columns
+
+# Reorder the DataFrame
+clustering_features = clustering_features[new_column_order]
+
archetyped_asset_list = archetyped_asset_list.merge(
clustering_features,
on="internal_id",
how="inner"
)
+archetyped_asset_list = archetyped_asset_list.rename(
+ columns={
+ "internal_id": "Osm. ID",
+ "customer_asset_id": "Org. ref.",
+ "external_address_id": "Address ID",
+ "cluster": "Archetype ID",
+ "archetype_representative": "Archetype Representative",
+ "rank": "Archetype Group Rank",
+ }
+)
+archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
+# Create an extract of the features
+
+
# Look at number of combinations
# - If we look at the number of combinations of property type & built form, we have 25 unique combinations
# - If we look at the number of combinations of property type, built form, and walls description, this jumps
@@ -50,6 +78,55 @@ archetyped_asset_list = archetyped_asset_list.merge(
# ["property-type", "built-form", "walls-description", "roof-description",
# "floor-description"]].drop_duplicates().shape
-property_type_archetypes = archetyped_asset_list[
- ["cluster", "rank", "property-type", "built-form", "walls-description"]
-]
+# Save this as an Excel workbook
+# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
+
+# Store the location data used for the mapping: UPRN, address, postcode, longitude and latitude of each archetype representative
+mapping_data = stonewater_asset_list[
+ stonewater_asset_list["archetype_representative"]
+][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
+
+mapping_data = mapping_data.merge(
+ clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
+)
+mapping_data = mapping_data.drop(columns=["internal_id"])
+
+with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w") as f:
+ f.write(json.dumps(mapping_data.to_dict(orient="records")))
+
+# We also include some data for visualising the breakdown of real versus estimated EPCs
+proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
+# Invert the "estimated" flag so that True means a real (non-estimated) EPC
+proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
+proportion_of_real_epcs = proportion_of_real_epcs.rename(
+ columns={"estimated": "is_real_epc"}
+)
+
+with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
+ f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
+
+# Produce the breakdown of EPC ratings
+epc_rating_breakdown = (
+ clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
+ .value_counts()
+ .to_frame()
+ .reset_index()
+)
+
+epc_rating_breakdown = epc_rating_breakdown.rename(
+ columns={"current-energy-rating": "EPC"}
+)
+
+with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
+ f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
+
+epc_a_properties = clustering_features[
+ (clustering_features["current-energy-rating"] == "A")
+ & (~clustering_features["estimated"])
+ ]
+
+epc_a_properties = epc_a_properties.merge(
+ stonewater_asset_list,
+ on="internal_id",
+ how="inner"
+)
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
index fa6551b7..bdac5ec2 100644
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -678,7 +678,8 @@ def compile_data():
# )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
asset_list = pd.read_excel(
- "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
+ "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+ header=4
)
udprn_data = pd.read_excel(
@@ -1128,7 +1129,8 @@ def compile_data_final():
########################################################################
asset_list = pd.read_excel(
- "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
+ "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+ header=4
)
udprn_data = pd.read_excel(
@@ -1788,12 +1790,13 @@ def compile_data_final():
property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
# Drop the description columns that are the keys in cleaned
+ print("PUT ME BACK!!??")
property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
# Perform the mapping
# CLUSTERING!!
grouping_columns = [
- 'is_cavity_wall', 'is_solid_brick', 'built-form', 'property-type'
+ 'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
]
# Define the preprocessing for numerical and categorical features