mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
558 lines
20 KiB
Python
558 lines
20 KiB
Python
import os
|
|
import time
|
|
import re
|
|
|
|
import usaddress
|
|
import pandas as pd
|
|
import numpy as np
|
|
from epc_api.client import EpcClient
|
|
from backend.OrdnanceSurvey import OrdnanceSuveyClient
|
|
from utils.logger import setup_logger
|
|
from typing import List
|
|
from fuzzywuzzy import process
|
|
|
|
# Module-level logger; SearchEpc.get_epc uses it to report retry outcomes.
logger = setup_logger()
|
|
|
|
# Maps each EPC field to the dtype it is cast to before estimation. The dtype also selects
# the estimator used by SearchEpc.estimate_epc:
#   "Int64" -> weighted mean rounded to an integer
#   "float" -> weighted mean
#   "str"   -> value with the largest total weight
# Fields that are commented out are deliberately excluded from estimation.
vartypes = {
    'low-energy-fixed-light-count': "Int64",
    # 'address': 'str',
    # 'uprn-source': 'str',
    'floor-height': 'float',
    'heating-cost-potential': 'float',
    'unheated-corridor-length': 'float',
    'hot-water-cost-potential': 'float',
    'construction-age-band': 'str',
    'potential-energy-rating': 'str',
    'mainheat-energy-eff': 'str',
    'windows-env-eff': 'str',
    'lighting-energy-eff': 'str',
    'environment-impact-potential': "Int64",
    'glazed-type': 'str',
    'heating-cost-current': 'float',
    'address3': 'str',
    'mainheatcont-description': 'str',
    'sheating-energy-eff': 'str',
    'property-type': 'str',
    'local-authority-label': 'str',
    'fixed-lighting-outlets-count': "Int64",
    'energy-tariff': 'str',
    'mechanical-ventilation': 'str',
    # A cost is a numeric amount like every other "*-cost-*" field, so it must be averaged
    # rather than treated as a category.
    'hot-water-cost-current': 'float',
    'county': 'str',
    'postcode': 'str',
    'solar-water-heating-flag': 'str',
    'constituency': 'str',
    'co2-emissions-potential': 'float',
    'number-heated-rooms': 'float',
    'floor-description': 'str',
    'energy-consumption-potential': 'float',
    'local-authority': 'str',
    'built-form': 'str',
    'number-open-fireplaces': "Int64",
    'windows-description': 'str',
    'glazed-area': 'str',
    # 'inspection-date': str,
    'mains-gas-flag': 'str',
    'co2-emiss-curr-per-floor-area': 'float',
    'address1': 'str',
    'heat-loss-corridor': 'str',
    'flat-storey-count': "Int64",
    'constituency-label': 'str',
    'roof-energy-eff': 'str',
    'total-floor-area': 'float',
    'building-reference-number': 'str',
    'environment-impact-current': 'float',
    'co2-emissions-current': 'float',
    'roof-description': 'str',
    'floor-energy-eff': 'str',
    'number-habitable-rooms': 'float',
    'address2': 'str',
    'hot-water-env-eff': 'str',
    'posttown': 'str',
    'mainheatc-energy-eff': 'str',
    'main-fuel': 'str',
    'lighting-env-eff': 'str',
    'windows-energy-eff': 'str',
    'floor-env-eff': 'str',
    'sheating-env-eff': 'str',
    'lighting-description': 'str',
    'roof-env-eff': 'str',
    'walls-energy-eff': 'str',
    'photo-supply': 'float',
    'lighting-cost-potential': 'float',
    'mainheat-env-eff': 'str',
    'multi-glaze-proportion': 'float',
    'main-heating-controls': 'str',
    # 'lodgement-datetime',
    'flat-top-storey': 'str',
    'current-energy-rating': 'str',
    'secondheat-description': 'str',
    'walls-env-eff': 'str',
    'transaction-type': 'str',
    # 'uprn': "Int64",
    'current-energy-efficiency': 'float',
    'energy-consumption-current': 'float',
    'mainheat-description': 'str',
    'lighting-cost-current': 'float',
    # 'lodgement-date',
    'extension-count': "Int64",
    'mainheatc-env-eff': 'str',
    'lmk-key': 'str',
    'wind-turbine-count': "Int64",
    'tenure': 'str',
    'floor-level': 'str',
    'potential-energy-efficiency': "Int64",
    'hot-water-energy-eff': 'str',
    'low-energy-lighting': 'float',
    'walls-description': 'str',
    'hotwater-description': 'str',
}
|
|
|
|
|
|
class SearchEpc:
|
|
"""
|
|
Given address information about a home, this class is responsible for retrieving the EPC data associated
|
|
to the property.
|
|
|
|
For a home, we might have address lines 1, 2, 3 and 4, as well as a postcode.
|
|
|
|
Often, simply searching the EPC database with address line 1 and postcode will be enough to find
|
|
the property, but there are some cases where this is not true and we might need to utilise other
|
|
combinations about the home to find the property
|
|
"""
|
|
|
|
MAX_RETRIES = 5
|
|
|
|
SUCCESS = {
|
|
"status": 200,
|
|
"message": "success",
|
|
"error": None
|
|
}
|
|
|
|
NODATA = {
|
|
"status": 201,
|
|
"message": "No data",
|
|
"error": None
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
address1: str,
|
|
postcode: str,
|
|
auth_token: str,
|
|
os_api_key: str,
|
|
full_address: str | None = None,
|
|
max_retries: int = None,
|
|
uprn: [int, None] = None,
|
|
size=None,
|
|
):
|
|
"""
|
|
Address lines 1 and postcode are mandatory fields. The other address lines are optional
|
|
but can be used to find the epc for the home, if address1 and postcode are insufficient
|
|
:param address1: string, propery's address line 1
|
|
:param postcode: string, propery's postcode
|
|
:param full_address: string, optional parameter, the full address of the property
|
|
:param max_retries: int, optional, number of retries to make when searching the api
|
|
:param uprn: int, optional, the uprn of the property
|
|
:param size: int, optional, the number of results to return. If not provided, defaults to 25 which is the api's
|
|
default
|
|
"""
|
|
|
|
self.address1 = address1
|
|
self.postcode = postcode
|
|
self.full_address = full_address
|
|
self.uprn = uprn
|
|
self.house_number = self.get_house_number(self.address1)
|
|
self.numeric_house_number = self.extract_numeric_housenumber_part(self.house_number)
|
|
|
|
self.max_retries = max_retries if max_retries is not None else self.MAX_RETRIES
|
|
|
|
self.client = EpcClient(auth_token=auth_token)
|
|
self.ordnance_survey_client = OrdnanceSuveyClient(
|
|
address=self.address1, postcode=self.postcode, api_key=os_api_key
|
|
)
|
|
|
|
self.data = None
|
|
self.newest_epc = None
|
|
self.older_epcs = None
|
|
self.full_sap_epc = None
|
|
|
|
# These are the address and postcode values, which we store in the database
|
|
self.address_clean = None
|
|
self.postcode_clean = None
|
|
|
|
self.size = size if size is not None else 25
|
|
|
|
@classmethod
|
|
def get_house_number(cls, address: str) -> str | None:
|
|
"""
|
|
This method will use the usaddress library to parse an address and extract the house number
|
|
:return:
|
|
"""
|
|
|
|
parsed = usaddress.parse(address)
|
|
parsed_house_number = [x for x in parsed if (x[1] == "AddressNumber")]
|
|
parsed_house_number = parsed_house_number[0][0] if parsed_house_number else None
|
|
|
|
if parsed_house_number is None:
|
|
# Because usaddress isn't optimal for parsing addresses with 'Flat' as a prefix, we also add a custom
|
|
# approach
|
|
# Pattern to look for 'Flat' followed by a number, or just a number at the beginning
|
|
pattern = r'(?i)(?:flat\s*(\d+))|^\s*(\d+)'
|
|
|
|
match = re.search(pattern, address)
|
|
|
|
if match:
|
|
# Return the first non-None group found
|
|
return next(g for g in match.groups() if g is not None)
|
|
else:
|
|
return None
|
|
|
|
# Remove training commas
|
|
parsed_house_number = parsed_house_number.replace(",", "")
|
|
|
|
return parsed_house_number
|
|
|
|
@staticmethod
|
|
def extract_numeric_housenumber_part(house_number: str | None) -> int | None:
|
|
# Regular expression to find the first occurrence of one or more digits
|
|
|
|
if house_number is None:
|
|
return None
|
|
|
|
match = re.search(r'\d+', house_number)
|
|
|
|
if match:
|
|
return int(match.group())
|
|
else:
|
|
return None
|
|
|
|
def get_epc(self, params=None, size=None):
|
|
# Get the EPC data with retries
|
|
size = size if size is not None else self.size
|
|
if params is None:
|
|
if self.uprn:
|
|
params = {"uprn": self.uprn}
|
|
else:
|
|
params = {"address": self.address1, "postcode": self.postcode}
|
|
|
|
for retry in range(self.max_retries):
|
|
try:
|
|
|
|
if "uprn" in params:
|
|
# We use the direct call method inside, since we need to implement uprn as a valid
|
|
# parameter for the search function
|
|
url = os.path.join(self.client.domestic.host, "search")
|
|
response = self.client.domestic.call(method="get", url=url, params=params)
|
|
else:
|
|
response = self.client.domestic.search(params=params, size=size)
|
|
|
|
if response:
|
|
self.data = response
|
|
return self.SUCCESS
|
|
|
|
if retry > 0:
|
|
logger.info("Failed previous attempt but retry successful")
|
|
# If we got nothing, final try
|
|
if not response:
|
|
return {
|
|
"status": 204,
|
|
"message": "no data",
|
|
"error": None
|
|
}
|
|
|
|
return {
|
|
"status": 200,
|
|
"message": "success",
|
|
"error": None
|
|
}
|
|
|
|
except Exception as e:
|
|
if retry < self.max_retries - 1:
|
|
# If not the last retry, wait for 3 seconds before retrying
|
|
time.sleep(3)
|
|
else:
|
|
# If it's the last retry, we continue
|
|
return {
|
|
"status": 500,
|
|
"message": "Could not retrieve EPC data",
|
|
"error": str(e)
|
|
}
|
|
|
|
@staticmethod
|
|
def filter_rows(rows, property_type=None, address=None):
|
|
"""
|
|
This method should not be used when property_type and address are both not None
|
|
:param rows:
|
|
:param property_type:
|
|
:param address:
|
|
:return:
|
|
"""
|
|
# Given the results from the EPC api, attempts to reduce the number of rows
|
|
uprns = {r["uprn"] for r in rows}
|
|
|
|
if (property_type is None) and (address is None):
|
|
return rows
|
|
|
|
if len(uprns) == 1:
|
|
return rows
|
|
|
|
if property_type is not None:
|
|
# We can do a filter on the property type
|
|
rows_filtered = [r for r in rows if r["property-type"] == property_type]
|
|
|
|
if rows_filtered:
|
|
return rows_filtered
|
|
|
|
return rows
|
|
|
|
if address is not None:
|
|
# We can do a filter on the property type
|
|
best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
|
|
rows_filtered = [r for r in rows if r["address"] == best_match[0]]
|
|
|
|
if rows_filtered:
|
|
return rows_filtered
|
|
|
|
return rows
|
|
|
|
@staticmethod
|
|
def format_address(newest_epc):
|
|
"""
|
|
Format address and postcode for storage in the database
|
|
"""
|
|
postcode = newest_epc["postcode"]
|
|
address = newest_epc["address"]
|
|
|
|
# Format them
|
|
address = address.replace(postcode, "").strip()
|
|
address = address.rstrip(",").strip()
|
|
address = address.title()
|
|
|
|
postcode = postcode.upper()
|
|
|
|
return address, postcode
|
|
|
|
def extract_epc_data(self, property_type=None, address=None):
|
|
|
|
"""
|
|
Given a successful search, this method will format the data and return it
|
|
:return:
|
|
"""
|
|
|
|
if self.data is None:
|
|
raise ValueError("data is missing, run search first")
|
|
|
|
rows = self.data["rows"]
|
|
|
|
# We perform some checks on the rows
|
|
# Firstly, we should only have 1 urpn so if we have multiple, we'll need to filter down the
|
|
# property further
|
|
|
|
rows = self.filter_rows(rows, property_type=property_type, address=None)
|
|
rows = self.filter_rows(rows, property_type=None, address=address)
|
|
|
|
# We now check for a full sap epc:
|
|
full_sap_epc = [r for r in rows if r["transaction-type"] == "new dwelling"]
|
|
full_sap_epc = full_sap_epc[0] if full_sap_epc else {}
|
|
|
|
# Finally, we identify the newest epc and the rest, and then return
|
|
newest_epc, older_epcs = self.filter_newest_epc(list_of_epcs=rows)
|
|
|
|
# Retrieve postcode and address
|
|
address_epc, postcode_epc = self.format_address(newest_epc=newest_epc)
|
|
|
|
return newest_epc, older_epcs, full_sap_epc, address_epc, postcode_epc
|
|
|
|
@staticmethod
|
|
def filter_newest_epc(list_of_epcs: List):
|
|
newest_response = [
|
|
r for r in list_of_epcs if
|
|
r["lodgement-datetime"] == max([x["lodgement-datetime"] for x in list_of_epcs])
|
|
]
|
|
|
|
if not newest_response:
|
|
return {}, []
|
|
|
|
if len(newest_response) != 1:
|
|
# It is possible (but rare, and likely an error on EPC lodgement) that we have multiple EPCs that
|
|
# were lodged at the exact same time. In this case, we will take the first one
|
|
newest_response = [newest_response[0]]
|
|
|
|
older_epcs = [epc for epc in list_of_epcs if epc["lmk-key"] != newest_response[0]["lmk-key"]]
|
|
|
|
return newest_response[0], older_epcs
|
|
|
|
@staticmethod
|
|
def _get_epc_mode(col: str, epc_data: pd.DataFrame):
|
|
"""
|
|
Simple method to extract the mode value from the EPC data
|
|
:param col: name of the column to take the mode of
|
|
:param epc_data: pandas dataframe of epc data
|
|
"""
|
|
|
|
mode_value = epc_data[[col]].mode(dropna=True)
|
|
if len(mode_value) != 1:
|
|
raise NotImplementedError("TODO: Handle multiple modes")
|
|
mode_value = mode_value.iloc[0][col]
|
|
|
|
return mode_value
|
|
|
|
def estimate_epc(self, property_type, built_form):
|
|
"""
|
|
For a property that does not have an EPC, we retrieve the EPC data for the closest properties
|
|
and estimate the EPC for the property in question.
|
|
|
|
Note - do we have postcodes with just a single address? We would need to use a different approach
|
|
to find the closest homes
|
|
:param property_type: This is the property type of the property we are estimating, that can be retrieved from
|
|
the ordnance survey api
|
|
:param built_form: This is the built form of the property we are estimating, that can be retrieved from
|
|
the ordnance survey api
|
|
:return:
|
|
"""
|
|
|
|
# From the ordnance survey data, we want to determine the property type and then use only similar property
|
|
# types for the estimation
|
|
|
|
# We firstly get the first 100 properties for the postcode, from the EPC api
|
|
epc_reponse = self.get_epc(params={"postcode": self.postcode}, size=100)
|
|
if epc_reponse["status"] != 200:
|
|
raise Exception("Unable to find postcode data - investigate me")
|
|
|
|
epc_data = pd.DataFrame(self.data["rows"])
|
|
|
|
# We now get the newest EPC per uprn
|
|
epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
|
|
|
|
# For each record, parse the house number. We'll use this to identify the closest properties
|
|
epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
|
|
|
|
# We convert the house number fo a purely numeric format - This numeric house number will be used as
|
|
# a distance weight when estimating the EPC
|
|
epc_data["numeric_house_number"] = epc_data["house_number"].apply(
|
|
lambda house_num: self.extract_numeric_housenumber_part(house_num)
|
|
)
|
|
|
|
epc_data["house_number_distance"] = abs(epc_data["numeric_house_number"] - self.numeric_house_number)
|
|
epc_data["weight"] = 1 / epc_data["house_number_distance"]
|
|
|
|
epc_built_form = self._get_epc_mode(col="built-form", epc_data=epc_data)
|
|
epc_property_type = self._get_epc_mode(col="property-type", epc_data=epc_data)
|
|
|
|
# We check if the EPC built form is one of the terraced values. If the os_built_form is semi-detached,
|
|
# then we set it to be end terraced
|
|
if built_form == "Semi-Detached" and epc_built_form in ["End-Terraced", "Mid-Terraced"]:
|
|
estimation_built_form = "End-Terraced"
|
|
elif built_form == "":
|
|
estimation_built_form = epc_built_form
|
|
else:
|
|
estimation_built_form = built_form
|
|
|
|
estimation_property_type = epc_property_type if property_type == "" else property_type
|
|
|
|
# We filter the EPC data on just the property types we want to use
|
|
epc_data = epc_data[
|
|
(epc_data["built-form"] == estimation_built_form) & (epc_data["property-type"] == estimation_property_type)
|
|
]
|
|
|
|
# For each attribute, we need to determine the datatype and use an appropriate method
|
|
# to estimate.
|
|
estimated_epc = {}
|
|
for key, vartype in vartypes.items():
|
|
epc_data[key] = np.where(pd.isnull(epc_data[key]), None, epc_data[key])
|
|
epc_data[key] = np.where(epc_data[key] == "", None, epc_data[key])
|
|
epc_data[key] = epc_data[key].astype(vartype)
|
|
estimation_data = epc_data[[key, "weight"]]
|
|
estimation_data = estimation_data[~pd.isnull(estimation_data[key])]
|
|
|
|
if estimation_data.shape[0] == 0:
|
|
estimated_epc[key] = None
|
|
continue
|
|
|
|
if vartype == "Int64":
|
|
estimated_value = self._estimate_int(estimation_data, key)
|
|
elif vartype == "float":
|
|
estimated_value = self._estimate_float(estimation_data, key)
|
|
elif vartype == "str":
|
|
estimated_value = self._estimate_str(estimation_data, key)
|
|
else:
|
|
raise NotImplementedError("estimation method not implemented for type")
|
|
|
|
estimated_epc[key] = estimated_value
|
|
|
|
estimated_epc["postcode"] = self.postcode
|
|
estimated_epc["uprn"] = self.uprn
|
|
# Indicate that this epc was estimated
|
|
estimated_epc["estimated"] = True
|
|
|
|
return estimated_epc
|
|
|
|
@staticmethod
|
|
def _estimate_int(estimation_data, key):
|
|
return round(np.average(a=estimation_data[key], weights=estimation_data["weight"]))
|
|
|
|
@staticmethod
|
|
def _estimate_float(estimation_data, key):
|
|
return np.average(a=estimation_data[key], weights=estimation_data["weight"])
|
|
|
|
@staticmethod
|
|
def _estimate_str(estimation_data, key):
|
|
agg = estimation_data.groupby(key)["weight"].sum().reset_index()
|
|
agg = agg[agg["weight"] == agg["weight"].max()]
|
|
if agg.shape[0] != 1:
|
|
raise NotImplementedError("implement me")
|
|
|
|
return agg[key].values[0]
|
|
|
|
def find_property(self):
|
|
"""
|
|
This method will attempt to identify a property. It will, at first, use the EPC api to try and
|
|
find the EPC for the property and the associated UPRN. If this fails, it will use the Ordnance Survey API to
|
|
find the UPRN of the address.
|
|
|
|
Because no result may have been provided by the EPC api because of formatting issues with the address,
|
|
if the ordnance survey api is used and the uprn retrieved, the EPC api is queried again with the UPRN, just
|
|
as a final check to see if there is any EPC data.
|
|
|
|
If there is no EPC data, the epc data will be estimated based on the surrounding properties
|
|
"""
|
|
|
|
# Step 1: use the epc api to find the property and uprn
|
|
response = self.get_epc()
|
|
|
|
if response["status"] == 200:
|
|
(
|
|
self.newest_epc, self.older_epcs, self.full_sap_epc, self.address_clean, self.postcode_clean
|
|
) = self.extract_epc_data(address=self.full_address)
|
|
return
|
|
|
|
# Step 2: If we don't have an EPC, we use the ordnance survey api to find the uprn
|
|
os_response = self.ordnance_survey_client.get_places_api()
|
|
|
|
if os_response["status"] != 200:
|
|
# Investigate this if it happens
|
|
raise Exception("Unable to find property - investigate me")
|
|
|
|
# Step 3: Now that we have a urpn, do another check against the epc api, this time searching with the uprn
|
|
self.uprn = self.ordnance_survey_client.most_relevant_result["UPRN"]
|
|
response = self.get_epc()
|
|
if response["status"] == 200:
|
|
(
|
|
self.newest_epc, self.older_epcs, self.full_sap_epc, self.address_clean, self.postcode_clean
|
|
) = self.extract_epc_data(address=self.ordnance_survey_client.most_relevant_result["ADDRESS"])
|
|
return
|
|
|
|
# Step 4: If we still don't have an EPC, we estimate the EPC data
|
|
estimated_epc = self.estimate_epc(
|
|
property_type=self.ordnance_survey_client.property_type,
|
|
built_form=self.ordnance_survey_client.built_form
|
|
)
|
|
self.newest_epc = estimated_epc
|
|
self.older_epcs = []
|
|
self.full_sap_epc = {}
|
|
|
|
# Finally, set a standardised address 1 and postcode
|
|
self.address_clean = self.ordnance_survey_client.address_os
|
|
self.postcode_clean = self.ordnance_survey_client.postcode_os
|
|
return
|