diff --git a/.idea/Model.iml b/.idea/Model.iml index b0f9c00d..3a3ec5a2 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 1122b380..605a6457 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 3bd2c84e..ff603cae 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -134,7 +134,7 @@ class SearchEpc: # Finally, we identify the newest epc and the rest, and then return newest_epc, older_epcs = self.filter_newest_epc(list_of_epcs=rows) - return newest_epc, older_epcs + return newest_epc, older_epcs, full_sap_epc @staticmethod def filter_newest_epc(list_of_epcs: List): @@ -142,9 +142,13 @@ class SearchEpc: r for r in list_of_epcs if r["lodgement-datetime"] == max([x["lodgement-datetime"] for x in list_of_epcs]) ] - if len(newest_response) > 1: + + if not newest_response: + return {}, [] + + if len(newest_response) != 1: raise Exception("More than one result found for this address - investigate me") - older_epcs = [epc for epc in ["rows"] if epc["lmk-key"] != newest_response[0]["lmk-key"]] + older_epcs = [epc for epc in list_of_epcs if epc["lmk-key"] != newest_response[0]["lmk-key"]] return newest_response[0], older_epcs diff --git a/etl/eligibility/MeasureSuitibility.py b/etl/eligibility/MeasureSuitibility.py index a9368f67..54deec4d 100644 --- a/etl/eligibility/MeasureSuitibility.py +++ b/etl/eligibility/MeasureSuitibility.py @@ -1,3 +1,8 @@ +from recommendations.recommendation_utils import convert_thickness_to_numeric +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes +from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes + + class MeasureSuitibility: """ Given the epc data about a property, this class holds the logic for determining if the home @@ -14,7 +19,65 @@ class MeasureSuitibility: self.epc = epc self.cleaned = cleaned - walls_description = 
self.epc["walls-description"] - # Get the cleaned version of the description + self.walls = self.parse_fabric("walls-description") + self.roof = self.parse_fabric("roof-description") - # def loft_insulation(self): + def parse_fabric(self, key): + + if "thermal transmittance" in self.epc[key]: + if key == "walls-description": + return WallAttributes(self.epc["walls-description"]).process() + + if key == "roof-description": + return RoofAttributes(self.epc["roof-description"]).process() + + raise ValueError("Invalid Key") + + # Get the cleaned version of the description + return [ + data for data in self.cleaned[key] if + data["original_description"] == self.epc[key] + ][0] + + def loft_insulation(self, loft_thickness_threshold: int = None): + """ + Given the description of the roof, this function determines whether or not the property is suitable for loft + insulation. A loft whose existing insulation has a thickness below loft_thickness_threshold is deemed to + be suitable for loft insulation + :param loft_thickness_threshold: Integer, Optional. If provided, any loft found with insulation lower than + this thickness is deemed to be suitable for loft insulation. 
If this + parameter is not provided, this method will default to the variable specified + in LOFT_INSULATION_THRESHOLD + """ + + loft_thickness_threshold = ( + self.LOFT_INSULATION_THRESHOLD if loft_thickness_threshold is None else loft_thickness_threshold + ) + + # We firstly check if the roof is a loft + is_loft = self.roof["is_pitched"] and (not self.roof["is_roof_room"]) + + if not is_loft: + return { + "suitablility": False, + "thickness": None + } + + # If it is a loft, we'll convert the textual thickness to a numerical value we can easily use + insulation_thickness = convert_thickness_to_numeric( + string_thickness=self.roof["insulation_thickness"], + is_pitched=self.roof["is_pitched"], + is_flat=self.roof["is_flat"] + ) + + if insulation_thickness > loft_thickness_threshold: + # Insulation is already thick enough + return { + "suitablility": False, + "thickness": insulation_thickness + } + + return { + "suitablility": True, + "thickness": insulation_thickness + } diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py index 34204aad..47426d5d 100644 --- a/etl/eligibility/ha_15_32/app.py +++ b/etl/eligibility/ha_15_32/app.py @@ -13,6 +13,9 @@ from utils.s3 import read_from_s3 from dotenv import load_dotenv from backend.SearchEpc import SearchEpc from backend.Property import Property +from etl.eligibility.MeasureSuitibility import MeasureSuitibility +from etl.epc.DataProcessor import DataProcessor +from backend.app.utils import read_parquet_from_s3 ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" @@ -335,34 +338,68 @@ def app(): ) cleaned = msgpack.unpackb(cleaned, raw=False) + cleaning_data = read_parquet_from_s3( + bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", + ) + # We want to retrieve EPCs for every single property + ha_data = ha32 + house_number_key = "Dwelling num" + address_key = "Street" + postcode_key = "Postcode" - ha32_scoring_data = [] - for _, house in 
tqdm(ha32.iterrows(), total=len(ha32)): - searcher = SearchEpc( - address1=" ".join([house["No."], house["Address"]]), - postcode=house["Postcode"] - ) + def get_data(ha_data, house_number_key, address_key, postcode_key): + ha_scoring_data = [] + for _, house in tqdm(ha_data.iterrows(), total=len(ha_data)): + searcher = SearchEpc( + address1=" ".join([house[house_number_key], house[address_key]]), + postcode=house[postcode_key] + ) - searcher.search() + searcher.search() - newest_epc, older_epcs = searcher.retrieve() - # We also want to get the penultimate epc - penultimate_epc, _ = searcher.filter_newest_epc(older_epcs) + newest_epc, older_epcs, _ = searcher.retrieve() + # We also want to get the penultimate epc + penultimate_epc, _ = searcher.filter_newest_epc(older_epcs) + if not penultimate_epc: + penultimate_epc = newest_epc - from etl.eligibility.MeasureSuitibility import MeasureSuitibility - suitability = MeasureSuitibility( - epc=newest_epc, cleaned=cleaned - ) + suitability = MeasureSuitibility( + epc=newest_epc, cleaned=cleaned + ) + suitable = suitability.loft_insulation() - from pprint import pprint - len(searcher.data["rows"]) + modelling_epc = newest_epc.copy() + if not suitable["suitablility"]: + # if unsuccessful with newest EPC, try penultimate + suitability = MeasureSuitibility( + epc=penultimate_epc, cleaned=cleaned + ) + suitable = suitability.loft_insulation() + modelling_epc = penultimate_epc.copy() - # TODO: Integegrate SearchEPC into the Property class - p = Property( - id=house["row_id"], - postcode=house["postcode"], - address1=house["address1"], - epc_client=None, - data=searcher.data - ) + if not suitable["suitablility"]: + raise ValueError("DO SOMETHING") + + p = Property( + id=house["row_id"], + postcode=modelling_epc["postcode"], + address1=modelling_epc["address1"], + epc_client=None, + data=modelling_epc + ) + ################################################################################ + # Prepare the data for modelling, in the 
same fashion as the engine + ################################################################################ + + p.get_components(cleaned) + # This is temp - this should happen after scoring + cleaned_property_data = DataProcessor.apply_averages_cleaning( + data_to_clean=pd.DataFrame([dict(**p.get_model_data(), LOCAL_AUTHORITY=p.data["local-authority"])]), + cleaning_data=cleaning_data, + cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + ) + p.set_number_lighting_outlets(cleaned_property_data) + + from pprint import pprint + len(searcher.data["rows"]) diff --git a/etl/eligibility/ha_15_32/requirements.txt b/etl/eligibility/ha_15_32/requirements.txt index 390d7de9..74fcd97f 100644 --- a/etl/eligibility/ha_15_32/requirements.txt +++ b/etl/eligibility/ha_15_32/requirements.txt @@ -1,3 +1,9 @@ pandas pydantic==1.10.11 epc-api-python==1.0.2 +msgpack +tqdm +python-dotenv +boto3 +textblob +pyarrow==12.0.1