# Methods added to the SearchEpc class (backend/SearchEpc.py): EPC-history metadata helpers.


def check_attribute_variations(self):
    """Report whether the wall, roof or floor type has varied across this property's EPCs.

    The relevant description field of every EPC in ``self.older_epcs`` plus ``self.newest_epc`` is
    parsed with its attribute cleaner. A category counts as "varied" when any of its type flags
    takes more than one distinct value over those EPCs.

    :return: dict with the boolean keys ``has_wall_type_ever_varied``,
        ``has_roof_type_ever_varied`` and ``has_floor_type_ever_varied``.
    """
    attribute_map = {
        "walls-description": {
            "cleaner": WallAttributes,
            "attribute": [
                "is_cavity_wall", "is_solid_brick", "is_system_built", "is_timber_frame",
                "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone", "is_park_home"
            ],
            "name": "has_wall_type_ever_varied"
        },
        "roof-description": {
            "cleaner": RoofAttributes,
            "attribute": [
                "is_flat", "is_pitched", "is_roof_room", "is_thatched", "has_dwelling_above"
            ],
            "name": "has_roof_type_ever_varied"
        },
        "floor-description": {
            "cleaner": FloorAttributes,
            # NOTE(review): "is_to_external_air" used to be listed twice here; the duplicate was dropped.
            # Possibly a different flag was intended in its place - confirm against FloorAttributes.
            "attribute": [
                "is_to_unheated_space", "is_to_external_air", "is_suspended", "is_solid",
            ],
            "name": "has_floor_type_ever_varied"
        }
    }

    # older_epcs is initialised to None, so "no history" must not crash the iteration
    epcs = list(self.older_epcs or []) + [self.newest_epc]

    attribute_variations = {}
    for description_field, spec in attribute_map.items():
        cleaner = spec["cleaner"]
        type_timeline = pd.DataFrame([cleaner(epc[description_field]).process() for epc in epcs])
        # For each flag we check if the timeline has ever varied, i.e. has gone from true to false
        # (or back) on any EPC
        attribute_variations[spec["name"]] = any(
            type_timeline[col].nunique() > 1 for col in spec["attribute"]
        )

    return attribute_variations


def identify_flat_floor(self):
    """Classify the flat described by the newest EPC as ``"top"``, ``"mid"`` or ``"ground"`` floor.

    The decision uses the parsed roof and floor descriptions of ``self.newest_epc``.

    :return: ``"top"`` when there is no dwelling above, ``"mid"`` when there is a dwelling above and
        another property below, otherwise ``"ground"``.
    """
    # If there is no dwelling above, it is a top floor flat
    processed_roof = RoofAttributes(self.newest_epc["roof-description"]).process()
    if not processed_roof["has_dwelling_above"]:
        return "top"

    # We know that there is a dwelling above. If there's also a dwelling below, it is a mid floor flat
    processed_floor = FloorAttributes(self.newest_epc["floor-description"]).process()
    if processed_floor["another_property_below"]:
        return "mid"

    # Otherwise ground floor
    return "ground"


def get_metadata(self):
    """Derive summary metadata from the EPC history and store it on ``self.metadata``.

    The stored dict holds ``days_since_last_epc``, ``has_sap_ever_downgraded``, ``floor`` (only set
    when the newest EPC's property type is ``"Flat"``, otherwise ``None``) and the three flags
    returned by ``check_attribute_variations``.

    :raises ValueError: if ``self.newest_epc`` is None, i.e. no EPC data is available.
    """
    if self.newest_epc is None:
        raise ValueError("No EPC data available")

    # older_epcs is initialised to None, so treat a missing history as empty
    epcs = list(self.older_epcs or []) + [self.newest_epc]

    # We check if the property has ever been downgraded on SAP, by differencing consecutive scores.
    # NOTE(review): this presumes older_epcs is ordered oldest-first - confirm against find_property.
    sap_timeline = [int(epc["current-energy-efficiency"]) for epc in epcs]
    has_sap_ever_downgraded = bool(np.any(np.diff(sap_timeline) < 0))

    # We check if the wall/roof/floor type has ever varied over time
    attribute_variations = self.check_attribute_variations()

    # If the property is a flat, we distinguish between top, mid, ground floor
    floor = None
    if self.newest_epc["property-type"] == "Flat":
        floor = self.identify_flat_floor()

    self.metadata = {
        "days_since_last_epc": (pd.Timestamp.now() - pd.Timestamp(self.newest_epc["lodgement-date"])).days,
        "has_sap_ever_downgraded": has_sap_ever_downgraded,
        "floor": floor,
        **attribute_variations
    }
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Khalim Review - 240902 - KSQ - AIHA - SHDF Wave " + "3 bid - Supplementary information.xlsx", + sheet_name="All units information", + header=3 + ) + + # Remove the .eg row + data = data.tail(-1) + + # Remove the bottom 2 rows + data = data.head(-2) + data = data.reset_index(drop=True) + data["row_id"] = data.index + + ammendments = { + "12 11-18 Schonfeld Square": "12 Schonfeld Square", + "35 35-37 Schonfeld Square": "35 Schonfeld Square", + '77 Schonfeld Square': '77 Lordship Road', + "83 Lordship Road (Schonfeld Square)": "83 Lordship Road", + "A 80 Bethune Road": "80A Bethune Road", + "86B Bethune Road": "Flat B, 86 Bethune Road", + "22 Glendale Road": "22 Glendale Avenue", + "121 Southbourne Road": "121 Southbourne Grove", + } + + no_epc = [ + "80B Bethune Road", + "89B Manor Road", + "12 Monkville Avenue", + "9 Greenview", + ] + + property_type_map = { + "House, mid-terrace": "House", + "House, end terrace": "House", + "House, semi-detached": "House", + "House, detached": "House", + "Flat": "Flat", + } + + epc_data = [] + epc_metadata = [] + for _, home in tqdm(data.iterrows(), total=len(data)): + + # Build address 1 based on if there is: + # 1) Address letter or number + # 2) Street address + + modified = False + address1 = "" + address1_backup = "" + + if home["Address letter or number"] in ["A", "B", "C"]: + + house_no = home['Street address'].split(' ')[0] + street = ' '.join(home['Street address'].split(' ')[1:]) + address1 = f"{house_no}{home['Address letter or number']} {street}" + + address1_backup = f"Flat {home['Address letter or number']} {house_no} {street}" + modified = True + + else: + if not pd.isnull(home["Address letter or number"]): + address1 += f"{home['Address letter or number']} " + if not pd.isnull(home["Street address"]): + address1 += f"{home['Street address']}" + address1 = address1.strip() + + if address1.split(" ")[-1].lower() == "rd": + # Replace with road + address1 = 
address1.lower().replace(" rd", " road") + + # Specific ammendments + if address1 in ammendments: + address1 = ammendments[address1] + + if address1 in no_epc: + continue + + searcher = SearchEpc( + address1=address1, + postcode=home["Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=property_type_map[home["Property type"]] + ) + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None and modified: + searcher = SearchEpc( + address1=address1_backup, + postcode=home["Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=property_type_map[home["Property type"]] + ) + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + raise Exception("Not found") + + epc_data.append( + { + "row_id": home["row_id"], + **searcher.newest_epc + } + ) + + searcher.get_metadata() + + epc_metadata.append( + { + "row_id": home["row_id"], + "address": address1, + "postcode": home["Postcode"], + **searcher.metadata + } + ) + + epc_metadata = pd.DataFrame(epc_metadata) + epc_data = pd.DataFrame(epc_data) + + # Check matched addresses + matched_addresses = epc_metadata[["row_id", "address", "postcode"]].copy() + matched_addresses = matched_addresses.merge( + data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner" + ) + + # We look for differences between the asset list and the EPC data + comparison_cols = { + "Property type": [ + { + "epc_col": "property-type", + "map": property_type_map + }, + { + "epc_col": "built-form", + "map": { + "House, mid-terrace": "Mid-Terrace", + "House, end terrace": "End-Terrace", + "House, semi-detached": "Semi-Detached", + "House, detached": "Detached", + "Flat": "Flat", + } + } + ], + "Energy starting band (EPC)": [ + { + "epc_col": "current-energy-rating", + "map": {} + } + ], + "Wall type": [ + { + "epc_col": "walls-description", + "search_terms": { + "solid": "Solid brick", + "cavity": "Cavity wall", + "solid - internal lining": "Solid brick", + 
} + } + ], + "Roof type": [ + { + "epc_col": "roof-description", + "search_terms": { + "pitched": "Pitched", + "n/a - (flat above)": "another dwelling above" + } + } + ], + "Floor type": [ + { + "epc_col": "floor-description", + "search_terms": { + "solid": "Solid", + "suspended": "Suspended", + "solid - floating floor for services": "Solid" + } + } + ], + } + + import re + differences = [] + for asset_list_col, list_of_configs in comparison_cols.items(): + + if asset_list_col in ["Wall type", "Roof type", "Floor type"]: + config = list_of_configs[0] + # We handle this differently + remapped = data[["row_id", asset_list_col]].copy() + # Strip the asset list col incase of leading/trailing spaces + remapped[asset_list_col] = remapped[asset_list_col].str.strip() + remapped[asset_list_col] = remapped[asset_list_col].str.lower() + remapped = remapped.merge(epc_data[["row_id", config["epc_col"]]], on="row_id", how="inner") + # We do a search term check + remapped["Match"] = None + for search_term, epc_term in config["search_terms"].items(): + if "/" in search_term: + escaped_search_term = re.escape(search_term) + remapped.loc[remapped[asset_list_col].str.contains(escaped_search_term), "Match"] = ( + remapped.loc[ + remapped[asset_list_col].str.contains(escaped_search_term), config["epc_col"] + ].str.contains(epc_term) + ) + else: + remapped.loc[remapped[asset_list_col].str.contains(search_term), "Match"] = ( + remapped.loc[ + remapped[asset_list_col].str.contains(search_term), config["epc_col"] + ].str.contains(epc_term) + ) + + if pd.isnull(remapped["Match"]).sum(): + raise Exception("Not all matched") + + remapped["Match"] = remapped["Match"].astype(bool) + + if not all(remapped["Match"]): + differences.append( + { + "Column": asset_list_col, + "Differences": remapped[~remapped["Match"]], + } + ) + + continue + + for config in list_of_configs: + + remapped = data[["row_id", asset_list_col]].copy() + if config["map"]: + remapped[asset_list_col] = 
remapped[asset_list_col].map(config["map"]) + + # Merge on + remapped = remapped.merge(epc_data[["row_id", config["epc_col"]]], on="row_id", how="inner") + remapped["Match"] = remapped[asset_list_col] == remapped[config["epc_col"]] + if not all(remapped["Match"]): + differences.append( + { + "Column": asset_list_col, + "Differences": remapped[~remapped["Match"]], + } + ) + + # Check for property type + property_type_differences = differences[0]["Differences"].copy() + property_type_differences = property_type_differences.merge( + data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner" + ) + print(property_type_differences) + + # Check for built form + built_form_differences = differences[1]["Differences"].copy() + built_form_differences = built_form_differences[built_form_differences["Property type"] != "Flat"] + built_form_differences = built_form_differences.merge( + data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner" + ) + print(built_form_differences) + + # Check for energy rating + energy_rating_differences = differences[2]["Differences"].copy() + energy_rating_differences = energy_rating_differences.merge( + data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner" + ).merge( + epc_data[["row_id", "uprn"]], on="row_id", how="inner" + ) + print(energy_rating_differences) + + # Check for wall type + wall_type_differences = differences[3]["Differences"].copy() + wall_type_differences = wall_type_differences.merge( + data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner" + ).merge( + epc_data[["row_id", "uprn"]], on="row_id", how="inner" + ) + print(wall_type_differences) # Many wall type differences + + # Check for roof type + roof_type_differences = differences[4]["Differences"].copy() + roof_type_differences = roof_type_differences.merge( + data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner" 
+ ).merge( + epc_data[["row_id", "uprn"]], on="row_id", how="inner" + ) + print(roof_type_differences) # Many roof type differences + + # Check for floor type + floor_type_differences = differences[5]["Differences"].copy() + floor_type_differences = floor_type_differences.merge( + data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner" + ).merge( + epc_data[["row_id", "uprn"]], on="row_id", how="inner" + ) + print(floor_type_differences) # Many floor type differences + + # TODO: 47 Ashtead Road [100021024699] shows solid brick wall on EPC - is probably cavity wall + + # We have the EPC data. Let's check conservation area/historic/listed building status + portfolio_spatial_data = OpenUprnClient.get_spatial_data( + epc_data["uprn"].unique().tolist(), bucket_name="retrofit-data-dev" + ) + + portfolio_spatial_data["UPRN"] = portfolio_spatial_data["UPRN"].astype(str) + + spatial_data = data[["row_id", "Planning constraints"]].merge( + epc_data[["row_id", "uprn"]], on="row_id", how="left", + + ).merge( + portfolio_spatial_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]], + left_on="uprn", + right_on="UPRN", how="left" + ) + + spatial_data[ + (spatial_data["Planning constraints"] == "None") + ]["conservation_status"].value_counts() + + # One property is in a conservation area, that was not picked up in the asset data + print(spatial_data[ + (spatial_data["Planning constraints"] == "None") & + (spatial_data["conservation_status"] == True) + ].merge( + data[["row_id", "Address letter or number", "Street address", "Postcode"]], on="row_id", how="left" + )) + + # All properties match up apart from one where the asset data indicates it's in a conservation area, however + # the sparital data indicates it's not. 
There do not appear to be any listed/heritage buildings in the portfolio + + # Draft archetyping + archetyping_data = data[ + [ + "row_id", + "Energy starting band (EPC)", + "Property type", + "Property year built", + "Gross internal area (sqm)", + "Current heating system type", + "Wall type", + "Floor type", + "Roof type", + "Window type", + "Location (Floor)", + ] + ]