mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
handled electric community heating
This commit is contained in:
parent
4d30d6588d
commit
3624b34dd0
5 changed files with 282 additions and 4 deletions
|
|
@ -1,6 +1,8 @@
|
|||
import time
|
||||
import random
|
||||
import pandas as pd
|
||||
|
||||
from adhoc.investigation import newest_epc
|
||||
from backend.SearchEpc import SearchEpc
|
||||
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
|
||||
from tqdm import tqdm
|
||||
|
|
@ -9,6 +11,132 @@ from utils.logger import setup_logger
|
|||
logger = setup_logger()
|
||||
|
||||
|
||||
def get_data_for_property(
    address1: str,
    postcode: str,
    full_address: str,
    property_type: str | None,
    built_form: str | None,
    uprn: str | float | None,
    epc_auth_token: str,
    find_my_epc_return_page: bool
):
    """
    Utility function that will fetch the data for a single property

    :param address1: first address line; used as the house number/name for the EPC search
    :param postcode: postcode of the property
    :param full_address: full address string, used as a fallback source for the house number
    :param property_type: property type set on the searcher's ordnance survey client.
        "block of flats" short-circuits the lookup
    :param built_form: built form set on the searcher's ordnance survey client
    :param uprn: UPRN of the property; null-like values (None/NaN) are treated as missing
    :param epc_auth_token: auth token passed to SearchEpc
    :param find_my_epc_return_page: when True, the FindMyEPC page text is returned as well
    :return: None when the property is a block of flats or no EPC could be found, otherwise a
        tuple of (newest_epc, older_epcs, find_my_epc_data, find_my_epc_page).
        find_my_epc_page is None unless find_my_epc_return_page is True
    :raises Exception: when FindMyEPC retrieval fails for a reason other than a missing EPC
    """

    if property_type == "block of flats":
        return None

    house_number = str(address1).strip()
    full_address = full_address.strip()
    house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode)
    if house_no is None:
        house_no = house_number

    if pd.isnull(uprn):
        uprn = None

    searcher = SearchEpc(
        address1=str(house_no),
        postcode=postcode,
        auth_token=epc_auth_token,
        os_api_key="",
        property_type=None,
        fast=True,
        full_address=full_address,
        max_retries=5,
        uprn=uprn
    )
    # Force the skipping of estimating the EPC
    # We check if the property was split

    searcher.ordnance_survey_client.property_type = property_type
    searcher.ordnance_survey_client.built_form = built_form
    searcher.find_property(skip_os=True)

    # Check if we have a flat or apartment
    if searcher.newest_epc is None and uprn is None:
        # Try again:
        if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None:
            # Backup: take the second comma-separated component of the full address
            address_parts = full_address.split(",")
            if len(address_parts) > 1:
                add1 = address_parts[1].strip()
            else:
                # Try splitting on space
                add1 = full_address.split(" ")[0].strip()
        else:
            add1 = str(house_number)

        searcher = SearchEpc(
            address1=add1,
            postcode=postcode,
            auth_token=epc_auth_token,
            os_api_key="",
            property_type=None,
            fast=True,
            full_address=full_address,
            max_retries=5
        )

        house_number_lower = house_number.lower()
        if any(marker in house_number_lower for marker in ("flat", "apartment", "apt")):
            searcher.ordnance_survey_client.property_type = "Flat"

        searcher.find_property(skip_os=True)

    # As a final resort, we estimate the EPC
    if property_type is not None and searcher.newest_epc is None:
        searcher.ordnance_survey_client.property_type = property_type
        searcher.ordnance_survey_client.built_form = built_form
        searcher.find_property(skip_os=True)

    if searcher.newest_epc is None:
        return None

    # Shape of an "empty" FindMyEPC response; must match what the retrieval call returns for
    # the requested return_page mode so that the unpacking below works
    empty_response = ({}, None) if find_my_epc_return_page else {}

    # Retrieve data from FindMyEPC
    try:
        find_epc_searcher = RetrieveFindMyEpc(
            address=searcher.newest_epc["address"],
            postcode=searcher.newest_epc["postcode"]
        )
        find_epc_response = find_epc_searcher.retrieve_newest_find_my_epc_data(
            return_page=find_my_epc_return_page
        )

    except ValueError as e:
        if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
            # Retry using only the first address line
            try:
                find_epc_searcher = RetrieveFindMyEpc(
                    address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
                )
                # return_page must be forwarded here as well, otherwise a bare dict comes back
                # and the tuple unpacking below fails when the page was requested
                find_epc_response = find_epc_searcher.retrieve_newest_find_my_epc_data(
                    return_page=find_my_epc_return_page
                )
            except ValueError as retry_error:
                if "No EPC found" in str(retry_error):
                    find_epc_response = empty_response
                else:
                    logger.error(f"Error retrieving FindMyEPC data: {retry_error}")
                    raise Exception(f"Error retrieving FindMyEPC data: {retry_error}") from retry_error
        else:
            find_epc_response = empty_response
    except Exception as e:
        raise Exception(f"Error retrieving FindMyEPC data: {e}") from e

    newest_epc = searcher.newest_epc
    older_epcs = searcher.older_epcs

    find_my_epc_page = None
    if find_my_epc_return_page:
        find_my_epc_data, find_my_epc_page = find_epc_response
    else:
        find_my_epc_data = find_epc_response

    return newest_epc, older_epcs, find_my_epc_data, find_my_epc_page
|
||||
|
||||
|
||||
def get_data(
|
||||
df,
|
||||
manual_uprn_map,
|
||||
|
|
|
|||
|
|
@ -1221,11 +1221,12 @@ class Property:
|
|||
None: "Natural Gas (Community Scheme)",
|
||||
"mains gas": "Natural Gas (Community Scheme)",
|
||||
"biomass": "Smokeless Fuel",
|
||||
"electricity": "Electricity"
|
||||
}
|
||||
if self.main_fuel["fuel_type"] in fuel_map: # We assume when None as it's unknown
|
||||
self.heating_energy_source = fuel_map[self.main_fuel["fuel_type"]]
|
||||
else:
|
||||
raise Exception("Implement me")
|
||||
raise NotImplementedError(f"Unhandled fuel {self.main_fuel['fuel_type']}")
|
||||
|
||||
if self.hotwater["heater_type"] is not None:
|
||||
self.hot_water_energy_source = heater_type_to_fuel[self.hotwater["heater_type"]]
|
||||
|
|
@ -1247,7 +1248,7 @@ class Property:
|
|||
secondary_heating = self.data["secondheat-description"]
|
||||
self.hot_water_energy_source = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[secondary_heating]["fuel"]
|
||||
else:
|
||||
raise Exception("Investiage me")
|
||||
raise NotImplementedError(f"Investiage me - unhandled hot water fuel {fuel}")
|
||||
else:
|
||||
self.hot_water_energy_source = hotwater_appliance_to_fuel[self.hotwater["appliance"]]
|
||||
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ costs_by_floor_area = costs_by_floor_area.groupby("current-energy-efficiency")[
|
|||
].mean().reset_index()
|
||||
|
||||
sample_epc_data = epc_data[pd.to_datetime(epc_data["LODGEMENT_DATE"]) >= "2015-01-01"].drop_duplicates("UPRN").sample(
|
||||
10000).reset_index(drop=True)
|
||||
20000).reset_index(drop=True)
|
||||
|
||||
# TODO: In Property find_energy_sources, sort out biomass community heating - what fuel type
|
||||
# TODO: We might be able to remove find_energy_sources entirely and remove estimate_electrical_consumption. It's used
|
||||
|
|
|
|||
145
etl/customers/peabody/Nov 2025 Consulting Project/data_prep.py
Normal file
145
etl/customers/peabody/Nov 2025 Consulting Project/data_prep.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
"""
|
||||
This script prepares the raw data that was sent over by Peabody for production of
|
||||
a standardised asset list
|
||||
|
||||
They have sent over just short of 100,000 properties and so, to make this easier, we will do the following
|
||||
1) Break the data up into subsets of 25,000
|
||||
2) Combine the data provided into a single list
|
||||
"""
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from dotenv import load_dotenv
|
||||
from asset_list.utils import get_data_for_property
|
||||
from utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()

# Credentials for the EPC API are read from the backend's .env file
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")

# Both sheets are read from the same workbook sent over by Peabody
PEABODY_WORKBOOK = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
    "- Data Extracts for Domna.xlsx"
)

property_list = pd.read_excel(PEABODY_WORKBOOK, sheet_name="Properties")
sustainability_data = pd.read_excel(PEABODY_WORKBOOK, sheet_name="Sustainability")
|
||||
|
||||
# Basic overview:
|
||||
# 1) We have 10,634 postcodes. If we needed to make requests to the ordnance survey API for
|
||||
# all of these postcodes, it would cost at least £106, not accounting for double requests for postcodes
|
||||
# where we have more than 100 properties (WE DONT!)
|
||||
# 2) This is on average 9.36 properties per postcode
|
||||
# 3) The UPRN in the property_list matches to the Org Ref in the sustainability data. There
|
||||
# is an additional UPRN column in sustainability data which appears to be the ordnance survey UPRN
|
||||
# 4) There appears to be some anomalous records, e.g. a flat with 543 m2 floor area and another flat
|
||||
# with 6m2 floor area
|
||||
# 5) Based on the residential indicator, all properties appear to be resi
|
||||
# 6) We should do some quick calcs on how much it might cost to fetch all of the solar API data
|
||||
# 7) We have 8785 missing UPRNS, which we should potentially try and fill
|
||||
# 8) In the backend, we should probably start storing the raw EPC input data to allow for much quicker
|
||||
# re-runs. All we really need to do is store the find my EPC data, perhaps against UPRN and RRN, as well
|
||||
# as the raw EPC data, against uprn. This will be useful for scenario re-builds and will be much much
|
||||
# quicker, as a starting point. Do we store in the database vs s3? TBC
|
||||
|
||||
# Number of distinct postcodes in the property list
n_postcodes = property_list["Post Code"].nunique()

# UPRN count per postcode; the mean is evaluated for inspection only and is not stored
uprns_by_postcode = property_list.groupby("Post Code")["UPRN"]
postcode_summary = uprns_by_postcode.count().reset_index()
postcode_summary["UPRN"].mean()

# Check that the property list UPRN lines up with the sustainability data's Org Ref
test_match = pd.merge(property_list, sustainability_data, left_on="UPRN", right_on="Org Ref")
|
||||
|
||||
|
||||
def classify_floor_area(x):
    """
    Assign a floor area (m2) to one of the bands "0-72", "73-97", "98-199" or "200+".

    Each band's upper bound is inclusive, so a value between two labelled bands (e.g. 72.5)
    falls into the next band up.

    :param x: floor area in m2; may be missing (None/NaN)
    :return: the band label, or None when the floor area is missing
    """
    # NaN fails every comparison below and would otherwise be labelled "200+"
    if pd.isnull(x):
        return None

    if x <= 72:
        return "0-72"
    if x <= 97:
        return "73-97"
    if x <= 199:
        return "98-199"
    return "200+"
|
||||
|
||||
|
||||
# Postal region is the part of the postcode before the first space
sustainability_data["Postal Region"] = sustainability_data["Postcode"].str.split(" ").str.get(0)
sustainability_data["Floor Area Band"] = sustainability_data["Total Floor Area (m2)"].apply(classify_floor_area)

# Columns that together define a distinct archetype
archetype_columns = [
    "Type",
    "Attachment",
    "Construction Years",
    "Wall Construction",
    "Wall Insulation",
    "Roof Construction",
    "Roof Insulation",
    "Floor Construction",
    "Floor Insulation",
    "Glazing",
    "Heating",
    "Boiler Efficiency",
    "Main Fuel",
    "Controls Adequacy",
    "Floor Area Band",
]
archetypes = sustainability_data[archetype_columns].drop_duplicates()

# Maps the property types to the format recognised by the EPC api
property_type_map = {}
# Maps the build form to the format recognised by the OS api
built_form_map = {}
|
||||
|
||||
# Proposed data fetching
|
||||
# 1) Grab properties with UPRN and fetch the associated EPC data & find my EPC data
|
||||
# Some thoughts:
|
||||
# S3 is quite cheap to query however we may incur some cost if we're making hundreds of thousands of calls
|
||||
# to S3 to fetch data out of it. It's cheap to fetch data, if we aren't taking data out of S3, but we
|
||||
# should consider this. This may influence whether or not we want to store each record individually
|
||||
# against UPRN, or store against the 10,641 postcodes. We can fetch the data and store in a single
|
||||
# large dump and then determine later if we want to split it up
|
||||
|
||||
# TODO: Handle properties without uprn
|
||||
# TODO: I think we can json dump all of this, but check if we can load and re-use the page source
|
||||
# TODO: Create batches?
|
||||
|
||||
batch_size = 500
# Start row of every batch
batch_indexes = list(range(0, len(sustainability_data), batch_size))

# TODO: SET - os.listdir fails on an empty path, so this must point at a real directory
working_directory = ""
download_contents = os.listdir(working_directory)

for i in batch_indexes:

    batch_name = f"batch_{i}_to_{i + batch_size}"
    # TODO: Check this
    if batch_name in download_contents:
        # Means we already have the data downloaded
        continue

    # Only process the rows belonging to this batch, not the whole data set
    batch = sustainability_data.iloc[i:i + batch_size]

    batch_data = {}
    for _, property_data in tqdm(batch.iterrows(), total=len(batch)):
        os_uprn = property_data["UPRN"]
        address1 = property_data["Address 1"]
        postcode = property_data["Postcode"]
        full_address_components = [
            x for x in [property_data["Address 1"], property_data["Address 2"], property_data["Address 3"]]
            if not pd.isnull(x)
        ]
        full_address = ", ".join(full_address_components)

        fetched_data = get_data_for_property(
            address1=address1,
            postcode=postcode,
            full_address=full_address,
            property_type=property_type_map[property_data["Type"]],
            built_form=built_form_map[property_data["Attachment"]],
            uprn=os_uprn,
            epc_auth_token=EPC_AUTH_TOKEN,
            find_my_epc_return_page=True
        )

        batch_data[property_data["Org Ref"]] = fetched_data

        # TODO: We likely want to do something like this: to slow down
        # TODO: We also perhaps store the data in batches
        if len(batch_data) % 50 == 0 and len(batch_data) > 0:
            logger.info("Sleeping for 10 seconds to avoid hitting API rate limit")
            time.sleep(10)

    # Store the batch data in the wd. json.dump writes str, so the file must be opened in text mode
    with open(os.path.join(working_directory, batch_name), "w", encoding="utf-8") as f:
        json.dump(batch_data, f)
|
||||
|
|
@ -371,7 +371,7 @@ class RetrieveFindMyEpc:
|
|||
|
||||
return all_find_my_epc_data
|
||||
|
||||
def retrieve_newest_find_my_epc_data(self, sap_2012_date=None):
|
||||
def retrieve_newest_find_my_epc_data(self, sap_2012_date=None, return_page=False):
|
||||
"""
|
||||
For a post code and address, we pull out all the required data from the find my epc website
|
||||
"""
|
||||
|
|
@ -577,6 +577,10 @@ class RetrieveFindMyEpc:
|
|||
**low_carbon_energy_sources,
|
||||
}
|
||||
|
||||
if return_page:
|
||||
# We return the page text as well, which can be parsed again later
|
||||
return resulting_data, postcode_response.text
|
||||
|
||||
return resulting_data
|
||||
|
||||
def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None):
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue