diff --git a/backend/Property.py b/backend/Property.py index efc48531..92fc41e9 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -48,6 +48,8 @@ class Property(Definitions): self.postcode = postcode self.address1 = address1 self.data = data + self.old_data = None + self.uprn = None self.full_sap_epc = None self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None @@ -100,6 +102,10 @@ class Property(Definitions): ] if len(newest_response) > 1: raise Exception("More than one result found for this address - investigate me") + + # We'll keep the old EPCs in case they contain information not present on the newest one + self.old_data = [epc for epc in response["rows"] if epc["lmk-key"] != newest_response[0]["lmk-key"]] + response["rows"] = newest_response self.data = response["rows"][0] @@ -264,11 +270,9 @@ class Property(Definitions): self.set_mains_gas() self.set_floor_height() self.set_wall_area() - self.set_floor_area() self.set_age_band() - self.set_number_floors() - self.set_perimeter() + self.set_basic_property_attributes() self.set_wall_type() for description, attribute in cleaned.items(): @@ -478,16 +482,6 @@ class Property(Definitions): While we do not have the """ - def set_floor_area(self): - """ - Sets the floor area based on the EPC data - - """ - # We don't know the number of floors at the moment so we're going to assume 1 - # however this is something we'll need to use Verisk data for - - self.floor_area = float(self.data["total-floor-area"]) - def get_spatial_data(self, uprn_filenames): """ @@ -515,40 +509,34 @@ class Property(Definitions): # Pull out spatial features self.set_spatial(spatial) - def set_number_floors(self): + def set_basic_property_attributes(self): """ This method sets the number of floors of the property, using a simple approach based on an estimate for average room size, number of rooms and total floor area + + It sets the perimeter of the property, using a simple approach based on an estimate for average 
room size, + number of rooms and total floor area + + Also sets floor area, number of rooms, using backup cleaned values if this data is not present, based on + medians across the EPC data :return: """ - total_floor_area = float(self.data["total-floor-area"]) + self.floor_area = float(self.data["total-floor-area"]) + number_of_rooms = float(self.data["number-habitable-rooms"]) if self.data["property-type"] == "House": - self.number_of_floors = estimate_floors(total_floor_area, number_of_rooms) + self.number_of_floors = estimate_floors(self.floor_area, number_of_rooms) elif self.data["property-type"] == "Flat": self.number_of_floors = 1 else: raise NotImplementedError("Implement me") + # Method level, after the branch above: the perimeter estimate divides by number_of_floors + self.perimeter = estimate_perimeter( + self.floor_area / self.number_of_floors, number_of_rooms / self.number_of_floors + ) - def set_perimeter(self): - """ - This method sets the perimeter of the property, using a simple approach based on average room - size, number of rooms and total floor area - :return: - """ - - if not self.number_of_floors: - raise ValueError("Number of floors not set, run set_number_floors") - - total_floor_area = float(self.data["total-floor-area"]) - number_of_rooms = float(self.data["number-habitable-rooms"]) - - self.perimeter = estimate_perimeter( - total_floor_area / self.number_of_floors, number_of_rooms / self.number_of_floors - ) - def set_wall_type(self): """ This method sets the wall type of the property, using a simple approach based on the wall description diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index cbac55c3..1aded11d 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -61,6 +61,9 @@ async def trigger_plan(body: PlanTriggerRequest): uprn_filenames = read_dataframe_from_s3_parquet( bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet" ) + cleaning_data = read_parquet_from_s3( + bucket_name=get_settings().DATA_BUCKET, 
file_key="sap_change_model/cleaning_dataset.parquet", + ).rename(columns={"local-authority": "LOCAL_AUTHORITY"}) input_properties = [] for config in plan_input: @@ -94,6 +97,18 @@ async def trigger_plan(body: PlanTriggerRequest): if not input_properties: return Response(status_code=204) + local_property_data = [] + for p in input_properties: + local_property_data.append( + { + "id": p.id, + "uprn": p.uprn, + "data": p.data, + "full_sap_epc": p.full_sap_epc, + "old_data": p.old_data, + } + ) + logger.info("Getting EPC, and spatial data") for p in input_properties: p.search_address_epc() @@ -188,13 +203,6 @@ async def trigger_plan(body: PlanTriggerRequest): logger.info("Preparing data for scoring in sap change api") recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data) - # Clean the data - logger.info("Reading in cleaning dataset from s3") - cleaning_data = read_parquet_from_s3( - bucket_name=get_settings().DATA_BUCKET, - file_key="sap_change_model/cleaning_dataset.parquet", - ).rename(columns={"local-authority": "LOCAL_AUTHORITY"}) - # Merge the cleaning data onto recommendations_scoring_data # Perform the same cleaning as in the model diff --git a/backend/app/plan/temp_script_for_flight.py b/backend/app/plan/temp_script_for_flight.py new file mode 100644 index 00000000..1a251f26 --- /dev/null +++ b/backend/app/plan/temp_script_for_flight.py @@ -0,0 +1,14 @@ +import pickle + +local_data = { + "plan_input": plan_input, + "uprn_filenames": uprn_filenames, + "local_property_data": local_property_data, + "materials": materials, + "materials_by_type": materials_by_type, + "cleaned": cleaned, + "cleaning_data": cleaning_data, +} + +with open('local_data.pickle', 'wb') as f: + pickle.dump(local_data, f) diff --git a/etl/property_dimensions/__init__.py b/etl/property_dimensions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/property_dimensions/app.py b/etl/property_dimensions/app.py new file mode 100644 index 00000000..9e797308 --- /dev/null +++ b/etl/property_dimensions/app.py @@ 
-0,0 +1,53 @@ +""" +This is a simple application which estimates some of the basic dimensions of a property based on EPC +data which we can use as a proxy value if we don't have this information on the EPC +""" +import os +from pathlib import Path +import pandas as pd +from tqdm import tqdm +from etl.epc.settings import EARLIEST_EPC_DATE +from etl.epc.DataProcessor import DataProcessor +from BaseUtility import Definitions +from utils.s3 import save_dataframe_to_s3_parquet + +DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates" + +GROUPBY = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY", "CONSTRUCTION_AGE_BAND"] + +BUCKET = os.environ.get("BUCKET", "retrofit-data-dev") + + +def app(): + """Aggregate EPC certificates by GROUPBY (median habitable rooms, mean floor area) and save to S3.""" + directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] + + for directory in tqdm(directories): + data = pd.read_csv(directory / "certificates.csv", low_memory=False) + data = data[data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] + data = data[~pd.isnull(data["UPRN"])] + data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float) + + data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply( + DataProcessor.clean_construction_age_band + ) + data = data[~pd.isnull(data["CONSTRUCTION_AGE_BAND"])] + data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)] + data = data[~pd.isnull(data["TOTAL_FLOOR_AREA"])] + + df = ( + data.groupby(GROUPBY) + .agg({"NUMBER_HABITABLE_ROOMS": "median", "TOTAL_FLOOR_AREA": "mean"}) + .reset_index() + ) + + local_authority = data["LOCAL_AUTHORITY"].unique() + if len(local_authority) > 1: + raise Exception("More than one la in data") + local_authority = local_authority[0] + + save_dataframe_to_s3_parquet( + df=df, + bucket_name=BUCKET, + file_key=f"property_dimensions/{local_authority}.parquet", + ) diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py index 35665a92..71fa9e53 100644 --- 
a/recommendations/recommendation_utils.py +++ b/recommendations/recommendation_utils.py @@ -3,8 +3,6 @@ from copy import deepcopy import pandas as pd -from backend.Property import Property -from statistics import mean from recommendations.rdsap_tables import ( epc_wall_description_map, wall_uvalues_df, default_wall_thickness, table_s9 as s9, table_s10 as s10, table_s11 as s11