handled electric community heating

This commit is contained in:
Khalim Conn-Kowlessar 2025-11-14 21:10:21 +00:00
parent 4d30d6588d
commit 3624b34dd0
5 changed files with 282 additions and 4 deletions

View file

@ -1,6 +1,8 @@
import time
import random
import pandas as pd
from adhoc.investigation import newest_epc
from backend.SearchEpc import SearchEpc
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
from tqdm import tqdm
@ -9,6 +11,132 @@ from utils.logger import setup_logger
logger = setup_logger()
def get_data_for_property(
address1: str,
postcode: str,
full_address: str,
property_type: [str | None],
built_form: [str | None],
uprn: [str | float | None],
epc_auth_token: str,
find_my_epc_return_page: bool
):
"""
Utility function that will fetch the data for a single property
:return:
"""
if property_type == "block of flats":
return None
house_number = str(address1).strip()
full_address = full_address.strip()
house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode)
if house_no is None:
house_no = house_number
if pd.isnull(uprn):
uprn = None
searcher = SearchEpc(
address1=str(house_no),
postcode=postcode,
auth_token=epc_auth_token,
os_api_key="",
property_type=None,
fast=True,
full_address=full_address,
max_retries=5,
uprn=uprn
)
# Force the skipping of estimating the EPC
# We check if the property was split
searcher.ordnance_survey_client.property_type = property_type
searcher.ordnance_survey_client.built_form = built_form
searcher.find_property(skip_os=True)
# Check if we have a flat or appartment
if searcher.newest_epc is None and uprn is None:
# Try again:
if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None:
# Backup
add1 = full_address.split(",")
if len(add1) > 1:
add1 = add1[1].strip()
else:
# Try splitting on space
add1 = full_address.split(" ")[0].strip()
else:
add1 = str(house_number)
searcher = SearchEpc(
address1=add1,
postcode=postcode,
auth_token=epc_auth_token,
os_api_key="",
property_type=None,
fast=True,
full_address=full_address,
max_retries=5
)
if (
"flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in
house_number.lower()
):
searcher.ordnance_survey_client.property_type = "Flat"
searcher.find_property(skip_os=True)
# As a final resort, we estimate the EPC
if property_type is not None and searcher.newest_epc is None:
searcher.ordnance_survey_client.property_type = property_type
searcher.ordnance_survey_client.built_form = built_form
searcher.find_property(skip_os=True)
if searcher.newest_epc is None:
return None
# Retrieve data from FindMyEPC
try:
find_epc_searcher = RetrieveFindMyEpc(
address=searcher.newest_epc["address"],
postcode=searcher.newest_epc["postcode"]
)
find_epc_response = find_epc_searcher.retrieve_newest_find_my_epc_data(
return_page=find_my_epc_return_page
)
except ValueError as e:
if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
try:
find_epc_searcher = RetrieveFindMyEpc(
address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
)
find_epc_response = find_epc_searcher.retrieve_newest_find_my_epc_data()
except ValueError as e:
if "No EPC found" in str(e):
find_epc_response = ({}, None) if find_my_epc_return_page else ({})
else:
logger.error(f"Error retrieving FindMyEPC data: {e}")
raise Exception(f"Error retrieving FindMyEPC data: {e}")
else:
find_epc_response = ({}, None) if find_my_epc_return_page else ({})
except Exception as e:
raise Exception(f"Error retrieving FindMyEPC data: {e}")
newest_epc = searcher.newest_epc
older_epcs = searcher.older_epcs
find_my_epc_page = None
if find_my_epc_return_page:
find_my_epc_data, find_my_epc_page = find_epc_response
else:
find_my_epc_data = find_epc_response
return newest_epc, older_epcs, find_my_epc_data, find_my_epc_page
def get_data(
df,
manual_uprn_map,

View file

@ -1221,11 +1221,12 @@ class Property:
None: "Natural Gas (Community Scheme)",
"mains gas": "Natural Gas (Community Scheme)",
"biomass": "Smokeless Fuel",
"electricity": "Electricity"
}
if self.main_fuel["fuel_type"] in fuel_map: # We assume when None as it's unknown
self.heating_energy_source = fuel_map[self.main_fuel["fuel_type"]]
else:
raise Exception("Implement me")
raise NotImplementedError(f"Unhandled fuel {self.main_fuel['fuel_type']}")
if self.hotwater["heater_type"] is not None:
self.hot_water_energy_source = heater_type_to_fuel[self.hotwater["heater_type"]]
@ -1247,7 +1248,7 @@ class Property:
secondary_heating = self.data["secondheat-description"]
self.hot_water_energy_source = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[secondary_heating]["fuel"]
else:
raise Exception("Investiage me")
raise NotImplementedError(f"Investiage me - unhandled hot water fuel {fuel}")
else:
self.hot_water_energy_source = hotwater_appliance_to_fuel[self.hotwater["appliance"]]

View file

@ -91,7 +91,7 @@ costs_by_floor_area = costs_by_floor_area.groupby("current-energy-efficiency")[
].mean().reset_index()
sample_epc_data = epc_data[pd.to_datetime(epc_data["LODGEMENT_DATE"]) >= "2015-01-01"].drop_duplicates("UPRN").sample(
10000).reset_index(drop=True)
20000).reset_index(drop=True)
# TODO: In Property find_energy_sources, sort out biomass community heating - what fuel type
# TODO: We might be able to remove find_energy_sources entirely and remove estimate_electrical_consumption. It's used

View file

@ -0,0 +1,145 @@
"""
This script prepares the raw data that was sent over by Peabody for the production of
a standardised asset list
They have sent over just short of 100,000 properties and so, to make this easier, we will do the following
1) Break the data up into subsets of 25,000
2) Combine the data provided into a single list
"""
import json
import time
import os
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from asset_list.utils import get_data_for_property
from utils.logger import setup_logger
logger = setup_logger()
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")

# Both sheets live in the same workbook, so the path is defined once and the workbook is parsed in a
# single read_excel call (a list of sheet names returns a dict of DataFrames keyed by sheet name)
PEABODY_EXTRACT_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
    "- Data Extracts for Domna.xlsx"
)
peabody_sheets = pd.read_excel(PEABODY_EXTRACT_PATH, sheet_name=["Properties", "Sustainability"])
property_list = peabody_sheets["Properties"]
sustainability_data = peabody_sheets["Sustainability"]
# Basic overview:
# 1) We have 10,634 postcodes. If we needed to make requests to the ordnance survey API for
# all of these postcodes, it would cost at least £106, not accounting for double requests for postcodes
# where we have more than 100 properties (WE DONT!)
# 2) This is on average 9.36 properties per postcode
# 3) The UPRN in the property_list matches the Org Ref in the sustainability data. There
# is an additional UPRN column in the sustainability data which appears to be the ordnance survey UPRN
# 4) There appear to be some anomalous records, e.g. a flat with 543 m2 floor area and another flat
# with 6m2 floor area
# 5) Based on the residential indicator, all properties appear to be resi
# 6) We should do some quick calcs on how much it might cost to fetch all of the solar API data
# 7) We have 8785 missing UPRNS, which we should potentially try and fill
# 8) In the backend, we should probably start storing the raw EPC input data to allow for much quicker
# re-runs. All we really need to do is store the find my EPC data, perhaps against UPRN and RRN, as well
# as the raw EPC data, against uprn. This will be useful for scenario re-builds and will be much much
# quicker, as a starting point. Do we store in the database vs s3? TBC
# Number of distinct postcodes in the property list
n_postcodes = property_list["Post Code"].nunique()
# Count of (non-null) UPRNs per postcode, and the average of those counts
postcode_summary = property_list.groupby("Post Code", as_index=False)["UPRN"].count()
postcode_summary["UPRN"].mean()
# Check that the property list UPRN lines up with the Org Ref in the sustainability data
test_match = pd.merge(property_list, sustainability_data, left_on="UPRN", right_on="Org Ref")
def classify_floor_area(x):
    """
    Place a floor area (m2) into one of the floor area bands

    :param x: Floor area of the property. May be missing (None / NaN)
    :return: One of "0-72", "73-97", "98-199" or "200+", or "Unknown" when the floor area is missing
    """
    # NaN fails every comparison below, so without this check a missing floor area
    # would silently be reported as "200+"
    if pd.isnull(x):
        return "Unknown"
    if x <= 72:
        return "0-72"
    if x <= 97:
        return "73-97"
    if x <= 199:
        return "98-199"
    return "200+"
# The postal region is the part of the postcode before the space
sustainability_data["Postal Region"] = sustainability_data["Postcode"].str.split(" ").str[0]
sustainability_data["Floor Area Band"] = sustainability_data["Total Floor Area (m2)"].apply(classify_floor_area)

# An archetype is a unique combination of the following attributes
archetype_columns = [
    "Type",
    "Attachment",
    "Construction Years",
    "Wall Construction",
    "Wall Insulation",
    "Roof Construction",
    "Roof Insulation",
    "Floor Construction",
    "Floor Insulation",
    "Glazing",
    "Heating",
    "Boiler Efficiency",
    "Main Fuel",
    "Controls Adequacy",
    "Floor Area Band",
]
archetypes = sustainability_data[archetype_columns].drop_duplicates()

# Maps the property types to the format recognised by the EPC api
property_type_map = {}
# Maps the built form to the format recognised by the OS api
built_form_map = {}
# Proposed data fetching
# 1) grab properties with UPRN and fetch the associated EPC data & find my EPC data
# Some thoughts:
# S3 is quite cheap to query however we may incur some cost if we're making hundreds of thousands of calls
# to S3 to fetch data out of it. It's cheap to fetch data, if we aren't taking data out of S3, but we
# should consider this. This may influence whether or not we want to store each record individually
# against UPRN, or store against the 10,641 postcodes. We can fetch the data and store in a single
# large dump and then determine later if we want to split it up
# TODO: Handle properties without uprn
# TODO: I think we can json dump all of this, but check if we can load and re-use the page source
# TODO: Create batches?
batch_size = 500
# Start index of every batch
batch_indexes = list(range(0, len(sustainability_data), batch_size))
# TODO: SET
working_directory = ""
# os.path.join treats an empty directory as the current directory, so fall back to "." here to
# make the resume check look in the same place the batches are written to
download_contents = os.listdir(working_directory or ".")

for i in batch_indexes:
    batch_name = f"batch_{i}_to_{i + batch_size}"
    # TODO: Check this
    if batch_name in download_contents:
        # Means we already have the data downloaded
        continue

    # Only process the rows that belong to this batch
    batch = sustainability_data.iloc[i:i + batch_size]
    batch_data = {}
    for _, property_data in tqdm(batch.iterrows(), total=len(batch)):
        full_address_components = [
            str(x) for x in [property_data["Address 1"], property_data["Address 2"], property_data["Address 3"]]
            if not pd.isnull(x)
        ]
        full_address = ", ".join(full_address_components)

        fetched_data = get_data_for_property(
            address1=property_data["Address 1"],
            postcode=property_data["Postcode"],
            full_address=full_address,
            # The maps are still to be populated; anything unmapped is passed through as None
            property_type=property_type_map.get(property_data["Type"]),
            built_form=built_form_map.get(property_data["Attachment"]),
            uprn=property_data["UPRN"],
            epc_auth_token=EPC_AUTH_TOKEN,
            find_my_epc_return_page=True
        )
        batch_data[property_data["Org Ref"]] = fetched_data

        # TODO: We likely want to do something like this: to slow down
        # TODO: We also perhaps store the data in batches
        if len(batch_data) % 50 == 0 and len(batch_data) > 0:
            logger.info("Sleeping for 10 seconds to avoid hitting API rate limit")
            time.sleep(10)

    # Store the batch data in the wd. json.dump writes str, so the file must be opened in text mode
    with open(os.path.join(working_directory, batch_name), "w") as f:
        json.dump(batch_data, f)

View file

@ -371,7 +371,7 @@ class RetrieveFindMyEpc:
return all_find_my_epc_data
def retrieve_newest_find_my_epc_data(self, sap_2012_date=None):
def retrieve_newest_find_my_epc_data(self, sap_2012_date=None, return_page=False):
"""
For a post code and address, we pull out all the required data from the find my epc website
"""
@ -577,6 +577,10 @@ class RetrieveFindMyEpc:
**low_carbon_energy_sources,
}
if return_page:
# We return the page text as well, which can be parsed again later
return resulting_data, postcode_response.text
return resulting_data
def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None):