From 603b3e1db2b5247c4bafc6165a3a2198892444df Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Tue, 23 Jul 2024 17:07:22 +0100
Subject: [PATCH] Stonewater WIP

---
 etl/bill_savings/data_collection.py           |   2 +-
 etl/customers/stonewater/shdf_3_clustering.py | 196 ++++++++++++++++++
 etl/sfr/epc_average_by_postcode.py            |   3 +
 3 files changed, 200 insertions(+), 1 deletion(-)

diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py
index f2a1a5c6..d2283ac4 100644
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@@ -133,7 +133,7 @@ def app():
     energy_consumption_data = []
     for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
         # Skip the first 50
-        if i < 245:
+        if i < 250:
             continue
 
         data = pd.read_csv(directory / "certificates.csv", low_memory=False)
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
index 93797db0..e4818e2c 100644
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -1995,6 +1995,29 @@ def updated_version():
     # Pull in the EPC data
     epc_data = read_epc_data(uprn_lookup_2)
 
+    # Pull in the spatial data to UPRN (unpickled as an iterable of DataFrames, concatenated below)
+    spatial_data_to_uprn = read_pickle_from_s3(
+        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",  # NOTE(review): "scustomers" - typo for "customers"? confirm
+        bucket_name="retrofit-data-dev"
+    )
+
+    # Cast the given columns (where present) to bool dtype, in place; the same frame is returned
+    def convert_specific_columns_to_bool(df, columns):
+        for column in columns:
+            if column in df.columns:
+                df[column] = df[column].astype(bool)  # NOTE(review): NaN casts to True - confirm no missing values
+        return df
+
+    spatial_data_to_uprn = [convert_specific_columns_to_bool(
+        df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
+    ) for df in spatial_data_to_uprn]
+
+    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+    spatial_data_to_uprn = spatial_data_to_uprn.drop(
+        columns=["partition", "filename"]
+    ).rename(columns={"UPRN": "uprn"})
+    spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
+
     ########################################################################
     # Prepare the data
     ########################################################################
@@ -2067,6 +2090,17 @@ def updated_version():
         clustering_features["parity_modelled_sap"]
     )
 
+    # We remove the final three characters from postcode to give us postal region. Removing two gives us 415 values which
+    # is too many
+    clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]
+
+    # Merge on spatial features
+    clustering_features = clustering_features.merge(
+        spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
+        how="left",
+        on="uprn"
+    )
+
     # incorect_epcs = clustering_features[
     #     clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
     # incorect_epcs = incorect_epcs[
@@ -2076,6 +2110,168 @@ def updated_version():
     # # Store data
     # incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
 
+    # We add in the key features, which are used for clustering
+    master_sheet_clustering_features = master_sheet[
+        ["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
+    ].copy()
+
+    # Step 1: Remap walls - we end up with 11 types
+    master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
+        {
+            "TimberFrame: AsBuilt": "Other wall type, as built",
+            "SystemBuilt: AsBuilt": "Other wall type, as built",
+            "Sandstone: AsBuilt": "Other wall type, as built",
+            "Sandstone: Internal": "Other wall type, internal or external",
+            "SystemBuilt: External": "Other wall type, internal or external",
+            "GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
+            "TimberFrame: Internal": "Other wall type, internal or external",
+            "Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
+            "SystemBuilt: Internal": "Other wall type, internal or external",
+            "Cavity: Internal": "Other wall type, internal or external",
+        }
+    )
+
+    # Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
+    # gives us the insulation thickness
+
+    # Clean incorrect values before splitting
+    master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
+        {
+            "PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
+            "PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
+            'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
+            'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
+            'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
+        }
+    )
+
+    master_sheet_clustering_features[['roof_type', 'roof_insulation_thickness']] = (
+        master_sheet_clustering_features['Roofs'].apply(
+            lambda x: pd.Series(x.split(':', 1) if ':' in x else [x, ''])
+        )
+    )
+
+    # Strip any extra whitespace
+    master_sheet_clustering_features['roof_type'] = master_sheet_clustering_features['roof_type'].str.strip()
+    master_sheet_clustering_features['roof_insulation_thickness'] = (
+        master_sheet_clustering_features['roof_insulation_thickness'].str.strip()
+    )
+
+    def map_thickness(thickness):
+        try:
+            value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
+            return "Above 250mm" if value > 250 else "Below 250mm"
+        except ValueError:  # NOTE(review): only non-numeric strings are handled; a non-string value would raise instead
+            return thickness  # Return the original value if it cannot be converted to a float
+
+    master_sheet_clustering_features['roof_insulation_category'] = (
+        master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
+    )
+
+    # Ideas
+    # 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
+    #    as a secondary category
+    # 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
+    #    (could split on :, take first part)
+
+    clustering_features = clustering_features.merge(
+        master_sheet_clustering_features,
+        how="left",
+        on="Address ID"
+    )
+
+    # Reduce down to the final set of features we need
+    clustering_features = clustering_features[
+        [
+            "internal_id",
+            "Property Type",
+            # Location
+            "postal_region",
+            'conservation_status',
+            'is_listed_building',
+            'is_heritage_building',
+            # Walls
+            "walls_reduced",
+            # Roof
+            "roof_type",
+            "roof_insulation_category",
+            # Heating
+            "Heating",
+            # Fuel
+            "Main Fuel",
+            "Age",
+            "Total Floor Area",
+            "representative_sap",
+        ]
+    ]
+
+    z = master_sheet_clustering_features[
+        ["Property Type", "walls_reduced", "roof_type", "roof_insulation_category", "Main Fuel", "Age"]
+    ].drop_duplicates()
+
+    # TODO: heating - remap
+    # Boiler: A rated Regular Boiler
+    # 1944
+    # Boiler: A rated Combi
+    # 1335
+    # Electric Storage Systems: High heat retention storage heaters
+    # 543
+    # Electric Storage Systems: Fan storage heaters
+    # 284
+    # Electric (direct acting) room heaters: Panel, convector or radiant heaters
+    # 253
+    # Boiler: C rated Regular Boiler
+    # 142
+    # Community Heating Systems: Community boilers only (RdSAP)
+    # 127
+    # Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C
+    # 126
+    # Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C
+    # 70
+    # Boiler: E rated Regular Boiler
+    # 62
+    # Boiler: E rated Combi
+    # 59
+    # Electric Storage Systems: Old (large volume) storage heaters
+    # 55
+    # Electric Storage Systems: Modern (slimline) storage heaters
+    # 49
+    # Boiler: B rated Regular Boiler
+    # 46
+    # Boiler: C rated Combi
+    # 44
+    # Heat Pump: Electric Heat pumps: Ground source heat pump in other cases
+    # 39
+    # Boiler: D rated Regular Boiler
+    # 16
+    # Community Heating Systems: Community CHP and boilers (RdSAP)
+    # 14
+    # Heat Pump: Electric Heat pumps: Air source heat pump in other cases
+    # 13
+    # Boiler: F rated Combi
+    # 12
+    # Boiler: G rated Regular Boiler
+    # 10
+    # Boiler: A rated Combi, System 2: Electric Storage Systems: High heat retention storage heaters
+    # 8
+    # Electric (direct acting) room heaters: Water- or oil-filled radiators
+    # 4
+    # Boiler: A rated Combi, System 2: Electric (direct acting) room heaters: Panel, convector or radiant heaters
+    # 3
+    # Boiler: D rated Combi
+    # 3
+    # Boiler: A rated CPSU
+    # 2
+    # Heat Pump: (from database)
+    # 1
+    # System 2: Boiler: G rated Regular Boiler, Boiler: A rated Combi
+    # 1
+    # No Heating
+    # 1
+    # Solid fuel room heaters: Open fire in grate
+    # 1
+    # Boiler: F rated Regular Boiler
+
 
 def read_asset_list():
     asset_list = pd.read_excel(
diff --git a/etl/sfr/epc_average_by_postcode.py b/etl/sfr/epc_average_by_postcode.py
index 93683000..859f530c 100644
--- a/etl/sfr/epc_average_by_postcode.py
+++ b/etl/sfr/epc_average_by_postcode.py
@@ -75,3 +75,6 @@ def app():
         agg.insert(0, "Address", address1)
 
         collected_data.append(agg)
+
+    collected_df = pd.concat(collected_data)
+    collected_df.to_csv("EPC Averages SFR.csv", index=False)