diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index 16c2a8c8..d8ea6b78 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -41,7 +41,9 @@ class SearchEpc:
         address2: str = None,
         address3: str = None,
         address4: str = None,
-        max_retries: int = None
+        max_retries: int = None,
+        uprn: int = None,
+        size: int = None,
     ):
         """
         Address lines 1 and postcode are mandatory fields. The other address lines are optional
@@ -51,6 +53,10 @@ class SearchEpc:
         :param address2: string, optional, propery's address line 2
         :param address3: string, optional, propery's address line 3
         :param address4: string, optional, propery's address line 4
+        :param max_retries: int, optional, number of retries to make when searching the api
+        :param uprn: int, optional, the uprn of the property
+        :param size: int, optional, the number of results to return. If not provided, defaults to 25 which is the api's
+        default
         """
 
         self.address1 = address1
@@ -58,6 +64,7 @@ class SearchEpc:
         self.address2 = address2
         self.address3 = address3
         self.address4 = address4
+        self.uprn = uprn
 
         self.max_retries = max_retries if max_retries is not None else self.MAX_RETRIES
 
@@ -65,14 +72,23 @@ class SearchEpc:
 
         self.data = None
 
+        self.size = size if size is not None else 25
+
 
     def search(self):
         # Get the EPC data with retries
         for retry in range(self.max_retries):
             try:
-                response = self.client.domestic.search(
-                    params={"address": self.address1, "postcode": self.postcode}
-                )
+
+                if self.uprn:
+                    # We use the direct call method inside, since we need to implement uprn as a valid
+                    # parameter for the search function
+                    url = f"{self.client.domestic.host.rstrip('/')}/search"
+                    response = self.client.domestic.call(method="get", url=url, params={"uprn": self.uprn})
+                else:
+                    response = self.client.domestic.search(
+                        params={"address": self.address1, "postcode": self.postcode}, size=self.size
+                    )
 
                 if response:
                     self.data = response
diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py
index 3c7ae901..ccceb05f 100644
--- a/etl/eligibility/ha_15_32/app.py
+++ b/etl/eligibility/ha_15_32/app.py
@@ -336,7 +336,9 @@ def merge_ha_15(asset_list, identified_addresses):
     return merged_data, dropped_identified_merge_keys
 
 
-def prepare_model_data_row(property_id, modelling_epc, cleaned, cleaning_data, created_at):
+def prepare_model_data_row(
+    property_id, modelling_epc, cleaned, cleaning_data, created_at, old_data=None, full_sap_epc=None
+):
     """
     This function prepares the data for modelling, in the same fashion as the recommendation engine
     With up-coming refactoring, this will change
@@ -350,6 +352,8 @@ def prepare_model_data_row(property_id, modelling_epc, cleaned, cleaning_data, c
         epc_client=None,
         data=modelling_epc
     )
+    p.old_data = old_data
+    p.full_sap_epc = full_sap_epc
     p.get_components(cleaned)
 
     # This is temp - this should happen after scoring
diff --git a/etl/eligibility/ha_15_32/ha4_app.py b/etl/eligibility/ha_15_32/ha4_app.py
new file mode 100644
index 00000000..4e87b5a6
--- /dev/null
+++ b/etl/eligibility/ha_15_32/ha4_app.py
@@ -0,0 +1,182 @@
+import msgpack
+from pathlib import Path
+from datetime import datetime
+import numpy as np
+import pandas as pd
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
+
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+def load_ha_4():
+    """
+    Load the HA 4 asset list csv into a DataFrame. This also widens the pandas display options
+    :return: DataFrame, the raw asset list
+    """
+    pd.set_option('display.max_rows', 500)
+    pd.set_option('display.max_columns', 500)
+    pd.set_option('display.width', 1000)
+
+    data = pd.read_csv("etl/eligibility/ha_15_32/HA 4 Asset List.csv", low_memory=False)
+    return data
+
+
+def standardise_ha_4(data):
+    """
+    Clean the location names of the asset list and drop the rows with an unusable postcode
+    :param data: DataFrame, the asset list, with "Location Name" and "Post Code" columns
+    :return: DataFrame, the cleaned asset list
+    """
+    # Location name contains some strings like {0664} which we remove
+    data['Location Name'] = data['Location Name'].str.replace(r'\{.*?\}', '', regex=True)
+
+    # Trim whitespace from either end of location name
+    data["Location Name"] = data["Location Name"].str.strip()
+
+    # Remove any unusable postcodes. We copy so that the assignment below does not write to a slice
+    data = data[data["Post Code"] != '\\\\'].copy()
+
+    # Some specific replacements
+    data["Location Name"] = np.where(
+        data["Location Name"] == "Calderbrook Pl & Cog La",
+        "Calderbrook Place",
+        data["Location Name"]
+    )
+
+    return data
+
+
+def get_ha_4_data(data, cleaned, cleaning_data, created_at):
+    """
+    Search the EPC api for every row of the asset list and check the eligibility of the newest EPC of each uprn
+    :param data: DataFrame, the standardised asset list, including a "row_id" column
+    :param cleaned: passed through to Eligibility and prepare_model_data_row
+    :param cleaning_data: passed through to prepare_model_data_row
+    :param created_at: passed through to prepare_model_data_row
+    :return: tuple of (scoring_data, results, nodata), where nodata holds the row ids for which no EPC was found
+    """
+    scoring_data = []
+    results = []
+    nodata = []
+    for _, property_meta in tqdm(data.iterrows(), total=len(data)):
+        # For many of the entries in this dataset, we're actually given an entire building, so we fetch EPCs for
+        # every property in the building
+        searcher = SearchEpc(
+            address1=property_meta["Address Line 1"],
+            postcode=property_meta["Post Code"],
+            size=1000
+        )
+        searcher.search()
+
+        if searcher.data is None:
+            searcher = SearchEpc(
+                address1=property_meta["Location Name"],
+                postcode=property_meta["Post Code"],
+                size=1000
+            )
+            searcher.search()
+
+        if searcher.data is None:
+            # Neither search returned anything, so we record the row and move on to the next one
+            nodata.append(property_meta["row_id"])
+            continue
+
+        epcs = searcher.data["rows"]
+        epcs = pd.DataFrame(epcs)
+
+        # Take the newest EPC by UPRN
+        epcs = epcs.sort_values(by=["lodgement-date"], ascending=False)
+        newest_epcs = epcs.drop_duplicates(subset=["uprn"], keep="first")
+
+        # For each EPC, we now check eligibility
+        for _, epc in newest_epcs.iterrows():
+            eligibility = Eligibility(epc=epc.to_dict(), cleaned=cleaned)
+            eligibility.check_gbis_warmfront()
+            eligibility.check_eco4_warmfront()
+
+            # If the house is not identified, we do a full gbis and eco4 check
+            eligibility.check_gbis()
+            eligibility.check_eco4()
+
+            if eligibility.eco4_warmfront["eligible"]:
+                # We get the other EPCs that were returned for the same uprn
+                old_data = epcs[
+                    (epcs["uprn"] == epc["uprn"]) &
+                    (epcs["lmk-key"] != epc["lmk-key"])
+                ].to_dict("records")
+
+                full_sap_epc = epcs[
+                    (epcs["uprn"] == epc["uprn"]) &
+                    (epcs["transaction-type"] == "new dwelling")
+                ].to_dict("records")
+
+                scoring_dictionary = prepare_model_data_row(
+                    property_id=property_meta["row_id"],
+                    modelling_epc=eligibility.epc,
+                    cleaned=cleaned,
+                    cleaning_data=cleaning_data,
+                    created_at=created_at,
+                    old_data=old_data,
+                    full_sap_epc=full_sap_epc
+                )
+                scoring_data.extend(scoring_dictionary)
+
+            results.append(
+                {
+                    "row_id": property_meta["row_id"],
+                    "gbis_eligible": eligibility.gbis_warmfront,
+                    "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                    "eco4_message": eligibility.eco4_warmfront["message"],
+                    "sap": float(eligibility.epc["current-energy-efficiency"]),
+                    "gbis_eligible_future": eligibility.gbis["eligible"],
+                    "gbis_eligible_future_message": eligibility.gbis["message"],
+                    "eco4_eligible_future": eligibility.eco4["eligible"],
+                    "eco4_eligible_future_message": eligibility.eco4["message"],
+                    # Property components
+                    "roof": eligibility.roof["clean_description"],
+                    "walls": eligibility.walls["clean_description"],
+                    "heating": eligibility.epc["mainheat-description"],
+                    "tenure": eligibility.tenure,
+                    "date_epc": eligibility.epc["lodgement-date"],
+                }
+            )
+
+    return scoring_data, results, nodata
+
+
+def app():
+    """
+    Load and standardise the HA 4 asset list, then read the cleaned EPC data and the cleaning dataset from s3
+    """
+    data = load_ha_4()
+
+    data = standardise_ha_4(data)
+
+    data["row_id"] = ["h4" + str(i) for i in range(0, len(data))]
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()