minor

2026-07-27 23:35:01 +00:00 · 2024-07-01 13:35:00 +01:00 · 2024-07-01 13:35:00 +01:00 · 51333ff31a
commit 51333ff31a
parent c5693289c3
3 changed files with 93 additions and 10 deletions
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -4,6 +4,9 @@
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
+  <component name="PyCharmProfessionalAdvertiser">
+    <option name="shown" value="true" />
+  </component>
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/etl/customers/stonewater/outputs
+++ b/etl/customers/stonewater/outputs
@ -9,14 +9,16 @@ In this script, we do the following things:
 3) Mapping of the archetypes
 """
 import pandas as pd
+import json
 from utils.s3 import read_pickle_from_s3

-archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
-archetyped_asset_list = archetyped_asset_list[
+stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
+archetyped_asset_list = stonewater_asset_list[
    [
-        "internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
+        "internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
+        "archetype_representative", "rank"
    ]
-]
+].copy()
 archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"]
 archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
 # Sort
@ -28,12 +30,38 @@ clustering_features = read_pickle_from_s3(
    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
 )

+# Move property-type and built-form to the first two columns
+columns_to_move = ['property-type', 'built-form']
+
+# Get the remaining columns
+remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
+
+# Create the new column order
+new_column_order = columns_to_move + remaining_columns
+
+# Reorder the DataFrame
+clustering_features = clustering_features[new_column_order]
+
 archetyped_asset_list = archetyped_asset_list.merge(
    clustering_features,
    on="internal_id",
    how="inner"
 )

+archetyped_asset_list = archetyped_asset_list.rename(
+    columns={
+        "internal_id": "Osm. ID",
+        "customer_asset_id": "Org. ref.",
+        "external_address_id": "Address ID",
+        "cluster": "Archetype ID",
+        "archetype_representative": "Archetype Representative",
+        "rank": "Archetype Group Rank",
+    }
+)
+archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
+# Create an extract of the features
+
+
 # Look at number of combinations
 # - If we look at the number of combinations of property type & built form, we have 25 unique combinations
 # - If we look at the number of combinations of property type, built form, and walls description, this jumps
@ -50,6 +78,55 @@ archetyped_asset_list = archetyped_asset_list.merge(
 #     ["property-type", "built-form", "walls-description", "roof-description",
 #      "floor-description"]].drop_duplicates().shape

-property_type_archetypes = archetyped_asset_list[
-    ["cluster", "rank", "property-type", "built-form", "walls-description"]
-]
+# Save this as an Excel file
+# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
+
+# Store the location data used for the mapping. From the clustering features we only need the longitude and latitude
+mapping_data = stonewater_asset_list[
+    stonewater_asset_list["archetype_representative"]
+][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
+
+mapping_data = mapping_data.merge(
+    clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
+)
+mapping_data = mapping_data.drop(columns=["internal_id"])
+
+with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w") as f:
+    f.write(json.dumps(mapping_data.to_dict(orient="records")))
+
+# We also include some data for visualising the breakdown of EPCs
+proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
+# Invert the "estimated" flag so that True means a real (non-estimated) EPC
+proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
+proportion_of_real_epcs = proportion_of_real_epcs.rename(
+    columns={"estimated": "is_real_epc"}
+)
+
+with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
+    f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
+
+# Produce the breakdown of EPC ratings
+epc_rating_breakdown = (
+    clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
+    .value_counts()
+    .to_frame()
+    .reset_index()
+)
+
+epc_rating_breakdown = epc_rating_breakdown.rename(
+    columns={"current-energy-rating": "EPC"}
+)
+
+with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
+    f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
+
+epc_a_properties = clustering_features[
+    (clustering_features["current-energy-rating"] == "A")
+    & (~clustering_features["estimated"])
+    ]
+
+epc_a_properties = epc_a_properties.merge(
+    stonewater_asset_list,
+    on="internal_id",
+    how="inner"
+)
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -678,7 +678,8 @@ def compile_data():
    # )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})

    asset_list = pd.read_excel(
-        "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+        header=4
    )

    udprn_data = pd.read_excel(
@ -1128,7 +1129,8 @@ def compile_data_final():
    ########################################################################

    asset_list = pd.read_excel(
-        "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+        header=4
    )

    udprn_data = pd.read_excel(
@ -1788,12 +1790,13 @@ def compile_data_final():
        property_attributes[to_col] = property_attributes[to_col].fillna("unknown")

    # Drop the description columns that are the keys in cleaned
+    print("PUT ME BACK!!??")
    property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
    # Perform the mapping

    # CLUSTERING!!
    grouping_columns = [
-        'is_cavity_wall', 'is_solid_brick', 'built-form', 'property-type'
+        'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
    ]

    # Define the preprocessing for numerical and categorical features