# Model/backend/SearchEpc.py
# Khalim Conn-Kowlessar 1bbc89002c building ha7 pipeline
# 2023-12-23 13:57:51 +00:00
#
# 211 lines
# 7.3 KiB
# Python

import os
import time
from epc_api.client import EpcClient
from utils.logger import setup_logger
from typing import List
from fuzzywuzzy import process
# Module-level logger shared by the SearchEpc class defined below.
logger = setup_logger()
class SearchEpc:
    """
    Given address information about a home, this class is responsible for retrieving the EPC data associated
    to the property.

    For a home, we might have address lines 1, 2, 3 and 4, as well as a postcode.
    Often, simply searching the EPC database with address line 1 and postcode will be enough to find
    the property, but there are some cases where this is not true and we might need to utilise other
    combinations about the home to find the property.

    Typical usage is ``search()`` followed by ``retrieve()``: ``search`` stores the raw api response on
    ``self.data`` and ``retrieve`` splits that response into the newest EPC, the older EPCs and the
    full SAP ("new dwelling") EPC.
    """

    # Default number of attempts made against the api when max_retries is not supplied
    MAX_RETRIES = 5
    # Seconds to wait between two attempts when a request raised
    RETRY_DELAY_SECONDS = 3

    SUCCESS = {
        "status": 200,
        "message": "success",
        "error": None
    }
    # NOTE: search() does not return this constant; it reports an empty response with status 204.
    # The constant is kept unchanged for any external code that references it.
    NODATA = {
        "status": 201,
        "message": "No data",
        "error": None
    }

    def __init__(
        self,
        address1: str,
        postcode: str,
        address2: str = None,
        address3: str = None,
        address4: str = None,
        max_retries: int = None,
        uprn: int = None,
        size=None,
    ):
        """
        Address lines 1 and postcode are mandatory fields. The other address lines are optional
        but can be used to find the epc for the home, if address1 and postcode are insufficient

        :param address1: string, property's address line 1
        :param postcode: string, property's postcode
        :param address2: string, optional, property's address line 2
        :param address3: string, optional, property's address line 3
        :param address4: string, optional, property's address line 4
        :param max_retries: int, optional, number of retries to make when searching the api. Defaults to
            MAX_RETRIES
        :param uprn: int, optional, the uprn of the property. When provided (and truthy), search() looks the
            property up by uprn instead of by address and postcode
        :param size: int, optional, the number of results to return. If not provided, defaults to 25 which is
            the api's default
        """
        self.address1 = address1
        self.postcode = postcode
        self.address2 = address2
        self.address3 = address3
        self.address4 = address4
        self.uprn = uprn
        self.max_retries = max_retries if max_retries is not None else self.MAX_RETRIES
        self.client = EpcClient(auth_token=os.getenv("EPC_AUTH_TOKEN"))
        # Populated by search() with the raw api response
        self.data = None
        self.size = size if size is not None else 25

    def search(self):
        """
        Query the EPC api for this property, retrying when the request raises.

        When a uprn is set the api is queried by uprn, otherwise by address line 1 and postcode.
        An empty response is not retried: it is reported straight away as "no data".

        :return: dict with the keys "status", "message" and "error", where status is
            200 - data was found and stored on self.data
            204 - the api answered but returned nothing
            500 - every attempt raised; "error" holds the message of the last exception
        """
        # Always make at least one attempt, so the method never falls through and returns None
        attempts = max(1, self.max_retries)

        for retry in range(attempts):
            try:
                if self.uprn:
                    # We use the direct call method, since uprn is not implemented as a valid
                    # parameter of the client's search function.
                    # The url is joined with "/" explicitly: os.path.join uses the OS path separator,
                    # which is not guaranteed to be "/".
                    url = f"{self.client.domestic.host.rstrip('/')}/search"
                    response = self.client.domestic.call(method="get", url=url, params={"uprn": self.uprn})
                else:
                    response = self.client.domestic.search(
                        params={"address": self.address1, "postcode": self.postcode}, size=self.size
                    )
            except Exception as e:
                if retry < attempts - 1:
                    # If not the last retry, wait before retrying
                    time.sleep(self.RETRY_DELAY_SECONDS)
                    continue
                # It was the last retry, so we give up and report the failure
                return {
                    "status": 500,
                    "message": "Could not retrieve EPC data",
                    "error": str(e)
                }

            if not response:
                # TODO: Make a call to OS uprn service and get the address' uprn, just in case there is an
                #  issue with how we are searching the api
                return {
                    "status": 204,
                    "message": "no data",
                    "error": None
                }

            if retry > 0:
                logger.info("Failed previous attempt but retry successful")

            self.data = response
            # Return a copy so callers cannot mutate the shared class-level constant
            return dict(self.SUCCESS)

    @staticmethod
    def filter_rows(rows, property_type=None, address=None):
        """
        Given the results from the EPC api, attempts to reduce the number of rows when they span
        more than one uprn.

        This method should not be used when property_type and address are both not None: in that case
        only the property_type filter is applied and address is ignored.

        :param rows: list of EPC records (dicts) as returned by the api
        :param property_type: optional, keep only the rows whose "property-type" equals this value
        :param address: optional, keep only the rows whose "address" is the closest fuzzy match to this value
        :return: the filtered rows, or the rows untouched when no filter is given, when there is at most
            one uprn, or when the filter would remove every row
        """
        if (property_type is None) and (address is None):
            return rows

        uprns = {r["uprn"] for r in rows}
        # Zero or one uprn means there is nothing to disambiguate
        if len(uprns) <= 1:
            return rows

        logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")

        if property_type is not None:
            # We can do a filter on the property type
            rows_filtered = [r for r in rows if r["property-type"] == property_type]
            return rows_filtered or rows

        # We can do a filter on the address, keeping the rows that share the best matching address
        best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
        if best_match is None:
            return rows
        rows_filtered = [r for r in rows if r["address"] == best_match[0]]
        return rows_filtered or rows

    def retrieve(self, property_type=None, address=None):
        """
        Given a successful search, this method will format the data and return it

        :param property_type: optional, used to narrow down the rows when multiple uprns were found
        :param address: optional, used to narrow down the rows when multiple uprns were found
        :return: tuple (newest_epc, older_epcs, full_sap_epc), where full_sap_epc is the first row with
            transaction type "new dwelling", or {} when there is none
        :raises ValueError: when search() has not stored any data yet
        """
        if self.data is None:
            raise ValueError("data is missing, run search first")

        rows = self.data["rows"]

        # We perform some checks on the rows
        # Firstly, we should only have 1 uprn so if we have multiple, we'll need to filter down the
        # property further. The two filters are applied one after the other because filter_rows
        # only honours one of them per call.
        rows = self.filter_rows(rows, property_type=property_type, address=None)
        rows = self.filter_rows(rows, property_type=None, address=address)

        # We now check for a full sap epc:
        full_sap_epc = next((r for r in rows if r["transaction-type"] == "new dwelling"), {})

        # Finally, we identify the newest epc and the rest, and then return
        newest_epc, older_epcs = self.filter_newest_epc(list_of_epcs=rows)

        return newest_epc, older_epcs, full_sap_epc

    @staticmethod
    def filter_newest_epc(list_of_epcs: List):
        """
        Split a list of EPCs into the most recently lodged one and the rest.

        :param list_of_epcs: list of EPC records (dicts) holding "lodgement-datetime" and "lmk-key"
        :return: tuple (newest_epc, older_epcs); ({}, []) when the list is empty
        """
        if not list_of_epcs:
            return {}, []

        # It is possible (but rare, and likely an error on EPC lodgement) that we have multiple EPCs that
        # were lodged at the exact same time. In this case, we will take the first one, which is what
        # max() returns when several items are maximal.
        newest_epc = max(list_of_epcs, key=lambda epc: epc["lodgement-datetime"])

        older_epcs = [epc for epc in list_of_epcs if epc["lmk-key"] != newest_epc["lmk-key"]]

        return newest_epc, older_epcs