From 603b3e1db2b5247c4bafc6165a3a2198892444df Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 23 Jul 2024 17:07:22 +0100
Subject: [PATCH] Stonewater WIP

---
 etl/bill_savings/data_collection.py           |   2 +-
 etl/customers/stonewater/shdf_3_clustering.py | 196 ++++++++++++++++++
 etl/sfr/epc_average_by_postcode.py            |   3 +
 3 files changed, 200 insertions(+), 1 deletion(-)

diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py
index f2a1a5c6..d2283ac4 100644
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@@ -133,7 +133,7 @@ def app():
     energy_consumption_data = []
     for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
         # Skip the first 50
-        if i < 245:
+        if i < 250:
             continue
 
         data = pd.read_csv(directory / "certificates.csv", low_memory=False)
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
index 93797db0..e4818e2c 100644
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -1995,6 +1995,29 @@ def updated_version():
     # Pull in the EPC data
     epc_data = read_epc_data(uprn_lookup_2)
 
+    # Pull in the spatial data to UPRN. NOTE(review): the "scustomers/" key prefix looks like a typo - confirm
+    spatial_data_to_uprn = read_pickle_from_s3(
+        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+        bucket_name="retrofit-data-dev"
+    )
+
+    # Cast named columns to bool. NOTE(review): astype(bool) turns NaN into True - confirm no missing values
+    def convert_specific_columns_to_bool(df, columns):
+        """Coerce the listed columns that exist in ``df`` to bool in place; return the same ``df``."""
+        for name in (c for c in columns if c in df.columns):
+            df[name] = df[name].astype(bool)
+        return df
+
+    spatial_data_to_uprn = [convert_specific_columns_to_bool(
+        df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
+    ) for df in spatial_data_to_uprn]
+
+    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+    spatial_data_to_uprn = spatial_data_to_uprn.drop(
+        columns=["partition", "filename"]
+    ).rename(columns={"UPRN": "uprn"})
+    spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
+
     ########################################################################
     # Prepare the data
     ########################################################################
@@ -2067,6 +2090,17 @@ def updated_version():
         clustering_features["parity_modelled_sap"]
     )
 
+    # We drop the final three characters of the postcode to give us the postal region. Dropping only two gives us 415
+    # values, which is too many
+    clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]
+
+    # Merge on spatial features
+    clustering_features = clustering_features.merge(
+        spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
+        how="left",
+        on="uprn"
+    )
+
     # incorect_epcs = clustering_features[
     #     clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
     # incorect_epcs = incorect_epcs[
@@ -2076,6 +2110,168 @@ def updated_version():
     # # Store data
     # incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
 
+    # We add in the key features, which are used for clustering
+    master_sheet_clustering_features = master_sheet[
+        ["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
+    ].copy()
+
+    # Step 1: Remap walls - we end up with 11 types
+    master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
+        {
+            "TimberFrame: AsBuilt": "Other wall type, as built",
+            "SystemBuilt: AsBuilt": "Other wall type, as built",
+            "Sandstone: AsBuilt": "Other wall type, as built",
+            "Sandstone: Internal": "Other wall type, internal or external",
+            "SystemBuilt: External": "Other wall type, internal or external",
+            "GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
+            "TimberFrame: Internal": "Other wall type, internal or external",
+            "Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
+            "SystemBuilt: Internal": "Other wall type, internal or external",
+            "Cavity: Internal": "Other wall type, internal or external",
+        }
+    )
+
+    # Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
+    #         gives us the insulation thickness
+
+    # Clean up incorrect values
+    master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
+        {
+            "PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
+            "PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
+            'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
+            'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
+            'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
+        }
+    )
+
+    master_sheet_clustering_features[['roof_type', 'roof_insulation_thickness']] = (
+        master_sheet_clustering_features['Roofs'].apply(
+            lambda x: pd.Series(x.split(':', 1) if ':' in x else [x, ''])
+        )
+    )
+
+    # Strip any extra whitespace
+    master_sheet_clustering_features['roof_type'] = master_sheet_clustering_features['roof_type'].str.strip()
+    master_sheet_clustering_features['roof_insulation_thickness'] = (
+        master_sheet_clustering_features['roof_insulation_thickness'].str.strip()
+    )
+
+    def map_thickness(thickness):  # bucket e.g. "300mm" around the 250mm mark
+        digits = thickness.replace('mm', '').replace('+', '').replace(' ', '')
+        try:
+            return "Above 250mm" if float(digits) > 250 else "Below 250mm"
+        except ValueError:
+            return thickness  # not numeric (e.g. empty string): keep the original text
+
+    master_sheet_clustering_features['roof_insulation_category'] = (
+        master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
+    )
+
+    # Ideas
+    # 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
+    # as a secondary category
+    # 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
+    # (could split on :, take first part)
+
+    clustering_features = clustering_features.merge(
+        master_sheet_clustering_features,
+        how="left",
+        on="Address ID"
+    )
+
+    # Reduce down to the final set of features we need
+    clustering_features = clustering_features[
+        [
+            "internal_id",
+            "Property Type",
+            # Location
+            "postal_region",
+            'conservation_status',
+            'is_listed_building',
+            'is_heritage_building',
+            # Walls
+            "walls_reduced",
+            # Roof
+            "roof_type",
+            "roof_insulation_category",
+            # Heating
+            "Heating",
+            # Fuel
+            "Main Fuel",
+            "Age",
+            "Total Floor Area",
+            "representative_sap",
+        ]
+    ]
+
+    z = master_sheet_clustering_features[
+        ["Property Type", "walls_reduced", "roof_type", "roof_insulation_category", "Main Fuel", "Age"]
+    ].drop_duplicates()
+
+    # TODO: heating - remap
+    # Boiler: A rated Regular Boiler
+    # 1944
+    # Boiler: A rated Combi
+    # 1335
+    # Electric Storage Systems: High heat retention storage heaters
+    # 543
+    # Electric Storage Systems: Fan storage heaters
+    # 284
+    # Electric (direct acting) room heaters: Panel, convector or radiant heaters
+    # 253
+    # Boiler: C rated Regular Boiler
+    # 142
+    # Community Heating Systems: Community boilers only (RdSAP)
+    # 127
+    # Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C
+    # 126
+    # Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C
+    # 70
+    # Boiler: E rated Regular Boiler
+    # 62
+    # Boiler: E rated Combi
+    # 59
+    # Electric Storage Systems: Old (large volume) storage heaters
+    # 55
+    # Electric Storage Systems: Modern (slimline) storage heaters
+    # 49
+    # Boiler: B rated Regular Boiler
+    # 46
+    # Boiler: C rated Combi
+    # 44
+    # Heat Pump: Electric Heat pumps: Ground source heat pump in other cases
+    # 39
+    # Boiler: D rated Regular Boiler
+    # 16
+    # Community Heating Systems: Community CHP and boilers (RdSAP)
+    # 14
+    # Heat Pump: Electric Heat pumps: Air source heat pump in other cases
+    # 13
+    # Boiler: F rated Combi
+    # 12
+    # Boiler: G rated Regular Boiler
+    # 10
+    # Boiler: A rated Combi, System 2: Electric Storage Systems: High heat retention storage heaters
+    # 8
+    # Electric (direct acting) room heaters: Water- or oil-filled radiators
+    # 4
+    # Boiler: A rated Combi, System 2: Electric (direct acting) room heaters: Panel, convector or radiant heaters
+    # 3
+    # Boiler: D rated Combi
+    # 3
+    # Boiler: A rated CPSU
+    # 2
+    # Heat Pump: (from database)
+    # 1
+    # System 2: Boiler: G rated Regular Boiler, Boiler: A rated Combi
+    # 1
+    # No Heating
+    # 1
+    # Solid fuel room heaters: Open fire in grate
+    # 1
+    # Boiler: F rated Regular Boiler
+
 
 def read_asset_list():
     asset_list = pd.read_excel(
diff --git a/etl/sfr/epc_average_by_postcode.py b/etl/sfr/epc_average_by_postcode.py
index 93683000..859f530c 100644
--- a/etl/sfr/epc_average_by_postcode.py
+++ b/etl/sfr/epc_average_by_postcode.py
@@ -75,3 +75,6 @@ def app():
         agg.insert(0, "Address", address1)
 
         collected_data.append(agg)
+
+    collected_df = pd.concat(collected_data)
+    collected_df.to_csv("EPC Averages SFR.csv", index=False)