Model/backend/SearchEpc.py
2024-01-02 15:44:42 +00:00

558 lines
20 KiB
Python

import os
import time
import re
import usaddress
import pandas as pd
import numpy as np
from epc_api.client import EpcClient
from backend.OrdnanceSurvey import OrdnanceSuveyClient
from utils.logger import setup_logger
from typing import List
from fuzzywuzzy import process
# Module-level logger, created once at import time by the project's setup_logger helper
logger = setup_logger()
# Maps each EPC field that can be estimated to the dtype it is cast to before estimation.
# The dtype also selects the estimation method used by SearchEpc.estimate_epc:
#   "Int64" -> weighted average, rounded to an integer
#   "float" -> weighted average
#   "str"   -> value with the largest total weight
# Fields that are commented out are deliberately excluded from estimation.
vartypes = {
    'low-energy-fixed-light-count': "Int64",
    # 'address': 'str',
    # 'uprn-source': 'str',
    'floor-height': 'float',
    'heating-cost-potential': 'float',
    'unheated-corridor-length': 'float',
    'hot-water-cost-potential': 'float',
    'construction-age-band': 'str',
    'potential-energy-rating': 'str',
    'mainheat-energy-eff': 'str',
    'windows-env-eff': 'str',
    'lighting-energy-eff': 'str',
    'environment-impact-potential': "Int64",
    'glazed-type': 'str',
    'heating-cost-current': 'float',
    'address3': 'str',
    'mainheatcont-description': 'str',
    'sheating-energy-eff': 'str',
    'property-type': 'str',
    'local-authority-label': 'str',
    'fixed-lighting-outlets-count': "Int64",
    'energy-tariff': 'str',
    'mechanical-ventilation': 'str',
    # A cost, like the other "*-cost-*" fields, so it is averaged rather than treated as a category
    'hot-water-cost-current': 'float',
    'county': 'str',
    'postcode': 'str',
    'solar-water-heating-flag': 'str',
    'constituency': 'str',
    'co2-emissions-potential': 'float',
    'number-heated-rooms': 'float',
    'floor-description': 'str',
    'energy-consumption-potential': 'float',
    'local-authority': 'str',
    'built-form': 'str',
    'number-open-fireplaces': "Int64",
    'windows-description': 'str',
    'glazed-area': 'str',
    # 'inspection-date': str,
    'mains-gas-flag': 'str',
    'co2-emiss-curr-per-floor-area': 'float',
    'address1': 'str',
    'heat-loss-corridor': 'str',
    'flat-storey-count': "Int64",
    'constituency-label': 'str',
    'roof-energy-eff': 'str',
    'total-floor-area': 'float',
    'building-reference-number': 'str',
    'environment-impact-current': 'float',
    'co2-emissions-current': 'float',
    'roof-description': 'str',
    'floor-energy-eff': 'str',
    'number-habitable-rooms': 'float',
    'address2': 'str',
    'hot-water-env-eff': 'str',
    'posttown': 'str',
    'mainheatc-energy-eff': 'str',
    'main-fuel': 'str',
    'lighting-env-eff': 'str',
    'windows-energy-eff': 'str',
    'floor-env-eff': 'str',
    'sheating-env-eff': 'str',
    'lighting-description': 'str',
    'roof-env-eff': 'str',
    'walls-energy-eff': 'str',
    'photo-supply': 'float',
    'lighting-cost-potential': 'float',
    'mainheat-env-eff': 'str',
    'multi-glaze-proportion': 'float',
    'main-heating-controls': 'str',
    # 'lodgement-datetime',
    'flat-top-storey': 'str',
    'current-energy-rating': 'str',
    'secondheat-description': 'str',
    'walls-env-eff': 'str',
    'transaction-type': 'str',
    # 'uprn': "Int64",
    'current-energy-efficiency': 'float',
    'energy-consumption-current': 'float',
    'mainheat-description': 'str',
    'lighting-cost-current': 'float',
    # 'lodgement-date',
    'extension-count': "Int64",
    'mainheatc-env-eff': 'str',
    'lmk-key': 'str',
    'wind-turbine-count': "Int64",
    'tenure': 'str',
    'floor-level': 'str',
    'potential-energy-efficiency': "Int64",
    'hot-water-energy-eff': 'str',
    'low-energy-lighting': 'float',
    'walls-description': 'str',
    'hotwater-description': 'str',
}
class SearchEpc:
"""
Given address information about a home, this class is responsible for retrieving the EPC data associated
to the property.
For a home, we might have address lines 1, 2, 3 and 4, as well as a postcode.
Often, simply searching the EPC database with address line 1 and postcode will be enough to find
the property, but there are some cases where this is not true and we might need to utilise other
combinations about the home to find the property
"""
MAX_RETRIES = 5
SUCCESS = {
"status": 200,
"message": "success",
"error": None
}
NODATA = {
"status": 201,
"message": "No data",
"error": None
}
def __init__(
self,
address1: str,
postcode: str,
auth_token: str,
os_api_key: str,
full_address: str | None = None,
max_retries: int = None,
uprn: [int, None] = None,
size=None,
):
"""
Address lines 1 and postcode are mandatory fields. The other address lines are optional
but can be used to find the epc for the home, if address1 and postcode are insufficient
:param address1: string, propery's address line 1
:param postcode: string, propery's postcode
:param full_address: string, optional parameter, the full address of the property
:param max_retries: int, optional, number of retries to make when searching the api
:param uprn: int, optional, the uprn of the property
:param size: int, optional, the number of results to return. If not provided, defaults to 25 which is the api's
default
"""
self.address1 = address1
self.postcode = postcode
self.full_address = full_address
self.uprn = uprn
self.house_number = self.get_house_number(self.address1)
self.numeric_house_number = self.extract_numeric_housenumber_part(self.house_number)
self.max_retries = max_retries if max_retries is not None else self.MAX_RETRIES
self.client = EpcClient(auth_token=auth_token)
self.ordnance_survey_client = OrdnanceSuveyClient(
address=self.address1, postcode=self.postcode, api_key=os_api_key
)
self.data = None
self.newest_epc = None
self.older_epcs = None
self.full_sap_epc = None
# These are the address and postcode values, which we store in the database
self.address_clean = None
self.postcode_clean = None
self.size = size if size is not None else 25
@classmethod
def get_house_number(cls, address: str) -> str | None:
"""
This method will use the usaddress library to parse an address and extract the house number
:return:
"""
parsed = usaddress.parse(address)
parsed_house_number = [x for x in parsed if (x[1] == "AddressNumber")]
parsed_house_number = parsed_house_number[0][0] if parsed_house_number else None
if parsed_house_number is None:
# Because usaddress isn't optimal for parsing addresses with 'Flat' as a prefix, we also add a custom
# approach
# Pattern to look for 'Flat' followed by a number, or just a number at the beginning
pattern = r'(?i)(?:flat\s*(\d+))|^\s*(\d+)'
match = re.search(pattern, address)
if match:
# Return the first non-None group found
return next(g for g in match.groups() if g is not None)
else:
return None
# Remove training commas
parsed_house_number = parsed_house_number.replace(",", "")
return parsed_house_number
@staticmethod
def extract_numeric_housenumber_part(house_number: str | None) -> int | None:
# Regular expression to find the first occurrence of one or more digits
if house_number is None:
return None
match = re.search(r'\d+', house_number)
if match:
return int(match.group())
else:
return None
def get_epc(self, params=None, size=None):
# Get the EPC data with retries
size = size if size is not None else self.size
if params is None:
if self.uprn:
params = {"uprn": self.uprn}
else:
params = {"address": self.address1, "postcode": self.postcode}
for retry in range(self.max_retries):
try:
if "uprn" in params:
# We use the direct call method inside, since we need to implement uprn as a valid
# parameter for the search function
url = os.path.join(self.client.domestic.host, "search")
response = self.client.domestic.call(method="get", url=url, params=params)
else:
response = self.client.domestic.search(params=params, size=size)
if response:
self.data = response
return self.SUCCESS
if retry > 0:
logger.info("Failed previous attempt but retry successful")
# If we got nothing, final try
if not response:
return {
"status": 204,
"message": "no data",
"error": None
}
return {
"status": 200,
"message": "success",
"error": None
}
except Exception as e:
if retry < self.max_retries - 1:
# If not the last retry, wait for 3 seconds before retrying
time.sleep(3)
else:
# If it's the last retry, we continue
return {
"status": 500,
"message": "Could not retrieve EPC data",
"error": str(e)
}
@staticmethod
def filter_rows(rows, property_type=None, address=None):
"""
This method should not be used when property_type and address are both not None
:param rows:
:param property_type:
:param address:
:return:
"""
# Given the results from the EPC api, attempts to reduce the number of rows
uprns = {r["uprn"] for r in rows}
if (property_type is None) and (address is None):
return rows
if len(uprns) == 1:
return rows
if property_type is not None:
# We can do a filter on the property type
rows_filtered = [r for r in rows if r["property-type"] == property_type]
if rows_filtered:
return rows_filtered
return rows
if address is not None:
# We can do a filter on the property type
best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
rows_filtered = [r for r in rows if r["address"] == best_match[0]]
if rows_filtered:
return rows_filtered
return rows
@staticmethod
def format_address(newest_epc):
"""
Format address and postcode for storage in the database
"""
postcode = newest_epc["postcode"]
address = newest_epc["address"]
# Format them
address = address.replace(postcode, "").strip()
address = address.rstrip(",").strip()
address = address.title()
postcode = postcode.upper()
return address, postcode
def extract_epc_data(self, property_type=None, address=None):
"""
Given a successful search, this method will format the data and return it
:return:
"""
if self.data is None:
raise ValueError("data is missing, run search first")
rows = self.data["rows"]
# We perform some checks on the rows
# Firstly, we should only have 1 urpn so if we have multiple, we'll need to filter down the
# property further
rows = self.filter_rows(rows, property_type=property_type, address=None)
rows = self.filter_rows(rows, property_type=None, address=address)
# We now check for a full sap epc:
full_sap_epc = [r for r in rows if r["transaction-type"] == "new dwelling"]
full_sap_epc = full_sap_epc[0] if full_sap_epc else {}
# Finally, we identify the newest epc and the rest, and then return
newest_epc, older_epcs = self.filter_newest_epc(list_of_epcs=rows)
# Retrieve postcode and address
address_epc, postcode_epc = self.format_address(newest_epc=newest_epc)
return newest_epc, older_epcs, full_sap_epc, address_epc, postcode_epc
@staticmethod
def filter_newest_epc(list_of_epcs: List):
newest_response = [
r for r in list_of_epcs if
r["lodgement-datetime"] == max([x["lodgement-datetime"] for x in list_of_epcs])
]
if not newest_response:
return {}, []
if len(newest_response) != 1:
# It is possible (but rare, and likely an error on EPC lodgement) that we have multiple EPCs that
# were lodged at the exact same time. In this case, we will take the first one
newest_response = [newest_response[0]]
older_epcs = [epc for epc in list_of_epcs if epc["lmk-key"] != newest_response[0]["lmk-key"]]
return newest_response[0], older_epcs
@staticmethod
def _get_epc_mode(col: str, epc_data: pd.DataFrame):
"""
Simple method to extract the mode value from the EPC data
:param col: name of the column to take the mode of
:param epc_data: pandas dataframe of epc data
"""
mode_value = epc_data[[col]].mode(dropna=True)
if len(mode_value) != 1:
raise NotImplementedError("TODO: Handle multiple modes")
mode_value = mode_value.iloc[0][col]
return mode_value
def estimate_epc(self, property_type, built_form):
"""
For a property that does not have an EPC, we retrieve the EPC data for the closest properties
and estimate the EPC for the property in question.
Note - do we have postcodes with just a single address? We would need to use a different approach
to find the closest homes
:param property_type: This is the property type of the property we are estimating, that can be retrieved from
the ordnance survey api
:param built_form: This is the built form of the property we are estimating, that can be retrieved from
the ordnance survey api
:return:
"""
# From the ordnance survey data, we want to determine the property type and then use only similar property
# types for the estimation
# We firstly get the first 100 properties for the postcode, from the EPC api
epc_reponse = self.get_epc(params={"postcode": self.postcode}, size=100)
if epc_reponse["status"] != 200:
raise Exception("Unable to find postcode data - investigate me")
epc_data = pd.DataFrame(self.data["rows"])
# We now get the newest EPC per uprn
epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
# For each record, parse the house number. We'll use this to identify the closest properties
epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
# We convert the house number fo a purely numeric format - This numeric house number will be used as
# a distance weight when estimating the EPC
epc_data["numeric_house_number"] = epc_data["house_number"].apply(
lambda house_num: self.extract_numeric_housenumber_part(house_num)
)
epc_data["house_number_distance"] = abs(epc_data["numeric_house_number"] - self.numeric_house_number)
epc_data["weight"] = 1 / epc_data["house_number_distance"]
epc_built_form = self._get_epc_mode(col="built-form", epc_data=epc_data)
epc_property_type = self._get_epc_mode(col="property-type", epc_data=epc_data)
# We check if the EPC built form is one of the terraced values. If the os_built_form is semi-detached,
# then we set it to be end terraced
if built_form == "Semi-Detached" and epc_built_form in ["End-Terraced", "Mid-Terraced"]:
estimation_built_form = "End-Terraced"
elif built_form == "":
estimation_built_form = epc_built_form
else:
estimation_built_form = built_form
estimation_property_type = epc_property_type if property_type == "" else property_type
# We filter the EPC data on just the property types we want to use
epc_data = epc_data[
(epc_data["built-form"] == estimation_built_form) & (epc_data["property-type"] == estimation_property_type)
]
# For each attribute, we need to determine the datatype and use an appropriate method
# to estimate.
estimated_epc = {}
for key, vartype in vartypes.items():
epc_data[key] = np.where(pd.isnull(epc_data[key]), None, epc_data[key])
epc_data[key] = np.where(epc_data[key] == "", None, epc_data[key])
epc_data[key] = epc_data[key].astype(vartype)
estimation_data = epc_data[[key, "weight"]]
estimation_data = estimation_data[~pd.isnull(estimation_data[key])]
if estimation_data.shape[0] == 0:
estimated_epc[key] = None
continue
if vartype == "Int64":
estimated_value = self._estimate_int(estimation_data, key)
elif vartype == "float":
estimated_value = self._estimate_float(estimation_data, key)
elif vartype == "str":
estimated_value = self._estimate_str(estimation_data, key)
else:
raise NotImplementedError("estimation method not implemented for type")
estimated_epc[key] = estimated_value
estimated_epc["postcode"] = self.postcode
estimated_epc["uprn"] = self.uprn
# Indicate that this epc was estimated
estimated_epc["estimated"] = True
return estimated_epc
@staticmethod
def _estimate_int(estimation_data, key):
return round(np.average(a=estimation_data[key], weights=estimation_data["weight"]))
@staticmethod
def _estimate_float(estimation_data, key):
return np.average(a=estimation_data[key], weights=estimation_data["weight"])
@staticmethod
def _estimate_str(estimation_data, key):
agg = estimation_data.groupby(key)["weight"].sum().reset_index()
agg = agg[agg["weight"] == agg["weight"].max()]
if agg.shape[0] != 1:
raise NotImplementedError("implement me")
return agg[key].values[0]
def find_property(self):
"""
This method will attempt to identify a property. It will, at first, use the EPC api to try and
find the EPC for the property and the associated UPRN. If this fails, it will use the Ordnance Survey API to
find the UPRN of the address.
Because no result may have been provided by the EPC api because of formatting issues with the address,
if the ordnance survey api is used and the uprn retrieved, the EPC api is queried again with the UPRN, just
as a final check to see if there is any EPC data.
If there is no EPC data, the epc data will be estimated based on the surrounding properties
"""
# Step 1: use the epc api to find the property and uprn
response = self.get_epc()
if response["status"] == 200:
(
self.newest_epc, self.older_epcs, self.full_sap_epc, self.address_clean, self.postcode_clean
) = self.extract_epc_data(address=self.full_address)
return
# Step 2: If we don't have an EPC, we use the ordnance survey api to find the uprn
os_response = self.ordnance_survey_client.get_places_api()
if os_response["status"] != 200:
# Investigate this if it happens
raise Exception("Unable to find property - investigate me")
# Step 3: Now that we have a urpn, do another check against the epc api, this time searching with the uprn
self.uprn = self.ordnance_survey_client.most_relevant_result["UPRN"]
response = self.get_epc()
if response["status"] == 200:
(
self.newest_epc, self.older_epcs, self.full_sap_epc, self.address_clean, self.postcode_clean
) = self.extract_epc_data()
return
# Step 4: If we still don't have an EPC, we estimate the EPC data
estimated_epc = self.estimate_epc(
property_type=self.ordnance_survey_client.property_type,
built_form=self.ordnance_survey_client.built_form
)
self.newest_epc = estimated_epc
self.older_epcs = []
self.full_sap_epc = {}
# Finally, set a standardised address 1 and postcode
self.address_clean = self.ordnance_survey_client.address_os
self.postcode_clean = self.ordnance_survey_client.postcode_os
return