diff --git a/asset_list/utils.py b/asset_list/utils.py index fe2b7d14..c7d0cc0a 100644 --- a/asset_list/utils.py +++ b/asset_list/utils.py @@ -1,6 +1,8 @@ import time import random import pandas as pd + +from adhoc.investigation import newest_epc from backend.SearchEpc import SearchEpc from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc from tqdm import tqdm @@ -9,6 +11,132 @@ from utils.logger import setup_logger logger = setup_logger() +def get_data_for_property( + address1: str, + postcode: str, + full_address: str, + property_type: [str | None], + built_form: [str | None], + uprn: [str | float | None], + epc_auth_token: str, + find_my_epc_return_page: bool +): + """ + Utility function that will fetch the data for a single property + :return: + """ + + if property_type == "block of flats": + return None + + house_number = str(address1).strip() + full_address = full_address.strip() + house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) + if house_no is None: + house_no = house_number + + if pd.isnull(uprn): + uprn = None + + searcher = SearchEpc( + address1=str(house_no), + postcode=postcode, + auth_token=epc_auth_token, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5, + uprn=uprn + ) + # Force the skipping of estimating the EPC + # We check if the property was split + + searcher.ordnance_survey_client.property_type = property_type + searcher.ordnance_survey_client.built_form = built_form + searcher.find_property(skip_os=True) + + # Check if we have a flat or appartment + if searcher.newest_epc is None and uprn is None: + # Try again: + if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: + # Backup + add1 = full_address.split(",") + if len(add1) > 1: + add1 = add1[1].strip() + else: + # Try splitting on space + add1 = full_address.split(" ")[0].strip() + else: + add1 = str(house_number) + searcher = SearchEpc( + address1=add1, + 
postcode=postcode, + auth_token=epc_auth_token, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + + if ( + "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in + house_number.lower() + ): + searcher.ordnance_survey_client.property_type = "Flat" + + searcher.find_property(skip_os=True) + + # As a final resort, we estimate the EPC + if property_type is not None and searcher.newest_epc is None: + searcher.ordnance_survey_client.property_type = property_type + searcher.ordnance_survey_client.built_form = built_form + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + return None + + # Retrieve data from FindMyEPC + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], + postcode=searcher.newest_epc["postcode"] + ) + find_epc_response = find_epc_searcher.retrieve_newest_find_my_epc_data( + return_page=find_my_epc_return_page + ) + + except ValueError as e: + if "No EPC found" in str(e) and "address1" in searcher.newest_epc: + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_response = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e): + find_epc_response = ({}, None) if find_my_epc_return_page else ({}) + else: + logger.error(f"Error retrieving FindMyEPC data: {e}") + raise Exception(f"Error retrieving FindMyEPC data: {e}") + else: + find_epc_response = ({}, None) if find_my_epc_return_page else ({}) + except Exception as e: + raise Exception(f"Error retrieving FindMyEPC data: {e}") + + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + + find_my_epc_page = None + if find_my_epc_return_page: + find_my_epc_data, find_my_epc_page = find_epc_response + else: + find_my_epc_data = find_epc_response + + return newest_epc, older_epcs, find_my_epc_data, find_my_epc_page 
+ + def get_data( df, manual_uprn_map, diff --git a/backend/Property.py b/backend/Property.py index 609a9d75..e5639aa2 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1221,11 +1221,12 @@ class Property: None: "Natural Gas (Community Scheme)", "mains gas": "Natural Gas (Community Scheme)", "biomass": "Smokeless Fuel", + "electricity": "Electricity" } if self.main_fuel["fuel_type"] in fuel_map: # We assume when None as it's unknown self.heating_energy_source = fuel_map[self.main_fuel["fuel_type"]] else: - raise Exception("Implement me") + raise NotImplementedError(f"Unhandled fuel {self.main_fuel['fuel_type']}") if self.hotwater["heater_type"] is not None: self.hot_water_energy_source = heater_type_to_fuel[self.hotwater["heater_type"]] @@ -1247,7 +1248,7 @@ class Property: secondary_heating = self.data["secondheat-description"] self.hot_water_energy_source = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[secondary_heating]["fuel"] else: - raise Exception("Investiage me") + raise NotImplementedError(f"Investiage me - unhandled hot water fuel {fuel}") else: self.hot_water_energy_source = hotwater_appliance_to_fuel[self.hotwater["appliance"]] diff --git a/backend/tests/test_integration.py b/backend/tests/test_integration.py index 1ba80223..eadd0788 100644 --- a/backend/tests/test_integration.py +++ b/backend/tests/test_integration.py @@ -91,7 +91,7 @@ costs_by_floor_area = costs_by_floor_area.groupby("current-energy-efficiency")[ ].mean().reset_index() sample_epc_data = epc_data[pd.to_datetime(epc_data["LODGEMENT_DATE"]) >= "2015-01-01"].drop_duplicates("UPRN").sample( - 10000).reset_index(drop=True) + 20000).reset_index(drop=True) # TODO: In Property find_energy_sources, sort out biomass community heating - what fuel type # TODO: We might be able to remove find_energy_sources entirely and remove estimate_electrical_consumption. 
"""
This script prepares the raw data that was sent over by Peabody for production of
a standardised asset list

They have sent over just short of 100,000 properties and so, to make this easier, we will do the following
1) Break the data up into subsets of 25,000
2) Combine the data provided into a single list

The EPC / FindMyEPC data is fetched in batches of `batch_size` properties, one JSON file per batch, so
that an interrupted run can be resumed without re-fetching completed batches.
"""
import json
import os
import time

import pandas as pd

RAW_DATA_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
    "- Data Extracts for Domna.xlsx"
)

# Basic overview:
# 1) We have 10,634 postcodes. If we needed to make requests to the ordnance survey API for
#    all of these postcodes, it would cost at least £106, not accounting for double requests for postcodes
#    where we have more than 100 properties (WE DONT!)
# 2) This is on average 9.36 properties per postcode
# 3) The UPRN in the property_list matches to the Org Ref in the sustainability data. There
#    is an additional UPRN column in sustainability data which appears to be the ordnance survey UPRN
# 4) There appears to be some anomalous records, e.g. a flat with 543 m2 floor area and another flat
#    with 6m2 floor area
# 5) Based on the residential indicator, all properties appear to be resi
# 6) We should do some quick calcs on how much it might cost to fetch all of the solar API data
# 7) We have 8785 missing UPRNS, which we should potentially try and fill
# 8) In the backend, we should probably start storing the raw EPC input data to allow for much quicker
#    re-runs. All we really need to do is store the find my EPC data, perhaps against UPRN and RRN, as well
#    as the raw EPC data, against uprn. This will be useful for scenario re-builds and will be much much
#    quicker, as a starting point. Do we store in the database vs s3? TBC

# Proposed data fetching
# 1) grab properties with UPRN and fetch the associated EPC data & find my EPC data
# Some thoughts:
# S3 is quite cheap to query however we may incur some cost if we're making hundreds of thousands of calls
# to S3 to fetch data out of it. It's cheap to fetch data, if we aren't taking data out of S3, but we
# should consider this. This may influence whether or not we want to store each record individually
# against UPRN, or store against the 10,641 postcodes. We can fetch the data and store in a single
# large dump and then determine later if we want to split it up

# TODO: Handle properties without uprn
# TODO: I think we can json dump all of this, but check if we can load and re-use the page source

ARCHETYPE_COLUMNS = [
    "Type", "Attachment", "Construction Years", "Wall Construction", "Wall Insulation",
    "Roof Construction", "Roof Insulation", "Floor Construction", "Floor Insulation",
    "Glazing", "Heating", "Boiler Efficiency", "Main Fuel", "Controls Adequacy",
    "Floor Area Band",
]

# Maps the property types to the format recognised by the EPC api
property_type_map = {}
# Maps the build form to the format recognised by the OS api
built_form_map = {}

batch_size = 500
# Pause after this many fetched properties, to slow down and avoid hitting the API rate limit
SLEEP_EVERY = 50
SLEEP_SECONDS = 10

# TODO: SET
working_directory = ""


def classify_floor_area(x):
    """
    Band a total floor area (m2) into "0-72", "73-97", "98-199" or "200+".

    Missing values (None / NaN) return None; every comparison against NaN is False, so without this
    guard a missing area would fall through to "200+".
    """
    if pd.isnull(x):
        return None
    if x <= 72:
        return "0-72"
    if x <= 97:
        return "73-97"
    if x <= 199:
        return "98-199"
    return "200+"


def build_full_address(property_data):
    """Join the non-null "Address 1" / "Address 2" / "Address 3" values of a record with ", "."""
    components = [property_data[column] for column in ("Address 1", "Address 2", "Address 3")]
    # str() because a bare house number may be read from the spreadsheet as a number
    return ", ".join(str(component) for component in components if not pd.isnull(component))


def batch_file_name(start, size):
    """Name of the file that stores the batch starting at row offset `start`."""
    return f"batch_{start}_to_{start + size}"


def build_archetypes(sustainability_data):
    """
    Add the "Postal Region" and "Floor Area Band" columns to `sustainability_data` (in place) and
    return the distinct combinations of the archetype columns.
    """
    sustainability_data["Postal Region"] = sustainability_data["Postcode"].str.split(" ").str[0]
    sustainability_data["Floor Area Band"] = sustainability_data["Total Floor Area (m2)"].apply(classify_floor_area)
    return sustainability_data[ARCHETYPE_COLUMNS].drop_duplicates()


def main():
    """
    Read the Peabody extract and fetch the EPC / FindMyEPC data for every property, writing one JSON
    file per batch into `working_directory`. Batches whose file already exists are skipped.

    :raises ValueError: when `working_directory` has not been set
    """
    # Project / third-party imports are deferred so the helpers above can be imported without the
    # project environment being available
    from dotenv import load_dotenv
    from tqdm import tqdm

    from asset_list.utils import get_data_for_property
    from utils.logger import setup_logger

    logger = setup_logger()

    if not working_directory:
        raise ValueError("working_directory must be set before fetching data")

    load_dotenv(dotenv_path="backend/.env")
    epc_auth_token = os.getenv("EPC_AUTH_TOKEN")

    # A list of sheet names returns a dict of frames and only parses the workbook once
    sheets = pd.read_excel(RAW_DATA_PATH, sheet_name=["Properties", "Sustainability"])
    property_list = sheets["Properties"]
    sustainability_data = sheets["Sustainability"]

    n_postcodes = property_list["Post Code"].nunique()
    postcode_summary = property_list.groupby("Post Code")["UPRN"].count().reset_index()
    test_match = property_list.merge(sustainability_data, left_on="UPRN", right_on="Org Ref")
    archetypes = build_archetypes(sustainability_data)
    logger.info(
        f"{n_postcodes} postcodes, {postcode_summary['UPRN'].mean():.2f} properties per postcode on average, "
        f"{len(test_match)} property/sustainability matches, {len(archetypes)} archetypes"
    )

    download_contents = set(os.listdir(working_directory))

    for i in range(0, len(sustainability_data), batch_size):
        batch_name = batch_file_name(i, batch_size)
        # TODO: Check this
        if batch_name in download_contents:
            # Means we already have the data downloaded
            continue

        # Only the rows belonging to this batch
        batch_rows = sustainability_data.iloc[i:i + batch_size]
        batch_data = {}
        for n_fetched, (_, property_data) in enumerate(tqdm(batch_rows.iterrows(), total=len(batch_rows)), start=1):
            fetched_data = get_data_for_property(
                address1=property_data["Address 1"],
                postcode=property_data["Postcode"],
                full_address=build_full_address(property_data),
                # Unmapped types fall back to None, which get_data_for_property accepts
                property_type=property_type_map.get(property_data["Type"]),
                built_form=built_form_map.get(property_data["Attachment"]),
                uprn=property_data["UPRN"],
                epc_auth_token=epc_auth_token,
                find_my_epc_return_page=True,
            )

            # JSON object keys are strings; convert explicitly so numeric references serialise
            batch_data[str(property_data["Org Ref"])] = fetched_data

            if n_fetched % SLEEP_EVERY == 0:
                logger.info("Sleeping for 10 seconds to avoid hitting API rate limit")
                time.sleep(SLEEP_SECONDS)

        # Store the batch data in the wd. json.dump writes str, so the file must be opened in text mode
        # NOTE(review): assumes the fetched EPC records only contain JSON serialisable values - confirm
        with open(os.path.join(working_directory, batch_name), "w", encoding="utf-8") as f:
            json.dump(batch_data, f)


if __name__ == "__main__":
    main()