From c9720cd78cbb9dd0914f7b23b3d01aec18013dbc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 9 Aug 2024 12:03:58 +0100 Subject: [PATCH] Added KwhData client to router --- backend/app/plan/router.py | 9 +-- etl/bill_savings/EnergyConsumptionModel.py | 60 ------------------- etl/bill_savings/KwhData.py | 68 +++++++++++++++++++++- 3 files changed, 70 insertions(+), 67 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 925bb725..8a9cbd53 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -247,8 +247,8 @@ def create_epc_records(epc_searcher: SearchEpc, energy_assessment: dict): # We insert county into the epc, since right now this isn't something that we pull out from the energy # assessment - epc["county"] = epc_searcher.newest_epc["county"] - epc["constituency"] = epc_searcher.newest_epc["constituency"] + for col in ["county", "constituency", "constituency-label", "local-authority", "local-authority-label"]: + epc[col] = epc_searcher.newest_epc[col] # We check if the energy assessment is newer than the newest EPC if pd.to_datetime(energy_assessment_date) > pd.to_datetime(epc_searcher.newest_epc["inspection-date"]): @@ -433,10 +433,7 @@ async def trigger_plan(body: PlanTriggerRequest): model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at) - epcs_for_scoring = KwhData.transform( - data=pd.DataFrame([p.epc_record.original_epc for p in input_properties]), - cleaned=cleaned, - ) + epcs_for_scoring = KwhData().transform(data=KwhData().prepare_epc(input_properties), cleaned=cleaned) kwh_predictions = model_api.predict_all( df=epcs_for_scoring, diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py index 8aa0cbf8..25bd04ed 100644 --- a/etl/bill_savings/EnergyConsumptionModel.py +++ b/etl/bill_savings/EnergyConsumptionModel.py @@ -507,66 +507,6 @@ class EnergyConsumptionModel: return prediction - @staticmethod - def _prepare_new_data(p: Property): - """ - Given an instance of the property class, this method will ensure that the EPC is ready for scoring with the - kwh models. In the backend, we perform some cleaning and transformation on an EPC so we just ensure that the - data is in the format required by the model - :return: - """ - - epc = p.data.copy() - numeric_cols = [ - 'current-energy-efficiency', - 'potential-energy-efficiency', 'environment-impact-current', - 'environment-impact-potential', 'energy-consumption-current', - 'energy-consumption-potential', 'co2-emissions-current', - 'co2-emiss-curr-per-floor-area', 'co2-emissions-potential', - 'lighting-cost-current', 'lighting-cost-potential', - 'heating-cost-current', 'heating-cost-potential', - 'hot-water-cost-current', 'hot-water-cost-potential', - 'total-floor-area', 'multi-glaze-proportion', - 'extension-count', 'number-habitable-rooms', 'number-heated-rooms', - 'low-energy-lighting', 'number-open-fireplaces', - 'wind-turbine-count', 'unheated-corridor-length', - 'floor-height', 'photo-supply', 'fixed-lighting-outlets-count', - 'low-energy-fixed-light-count', - ] - for v in numeric_cols: - if epc[v] is not None: - epc[v] = float(epc[v]) - - bools_to_remap = ['mains-gas-flag', 'flat-top-storey'] - bool_map = { - True: "Y", - False: "N", - None: "N", - "Y": "Y", - "N": "N" - } - for v in bools_to_remap: - epc[v] = bool_map[epc[v]] - - no_data = { - "floor-level": "NODATA!", - "floor-energy-eff": "NO DATA!" - } - for v, fill_val in no_data.items(): - if pd.isnull(epc[v]): - epc[v] = fill_val - - return epc - - def prepare_new_data(self, input_properties: list[Property]): - scoring_data = pd.DataFrame([self._prepare_new_data(p) for p in input_properties]) - scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year - scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month - - scoring_data["id"] = scoring_data["uprn"].copy() - - return scoring_data - @staticmethod def calculate_percentage_decrease(start_efficiency, end_efficiency, consumption_averages): diff --git a/etl/bill_savings/KwhData.py b/etl/bill_savings/KwhData.py index 3c68f33f..39461c81 100644 --- a/etl/bill_savings/KwhData.py +++ b/etl/bill_savings/KwhData.py @@ -5,6 +5,7 @@ from datetime import datetime from tqdm import tqdm from utils.logger import setup_logger from utils.s3 import list_files_in_s3_folder, read_pickle_from_s3, save_dataframe_to_s3_parquet +from backend.Property import Property logger = setup_logger() @@ -29,7 +30,7 @@ class KwhData: 'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency' ] - def __init__(self, bucket): + def __init__(self, bucket=None): self.run_date = datetime.now().strftime("%Y-%m-%d") self.bucket = bucket self.data = None @@ -144,6 +145,8 @@ class KwhData: This method can be used to transform the training data, or new epcs within the backend engine :return: """ + if save and self.bucket is None: + raise Exception("bucket not set, cannot save data") # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features # in anticipation of the new model @@ -216,3 +219,66 @@ class KwhData: file_key=self.model_training_data_filepath, df=data ) + return + + return data + + @staticmethod + def _prepare_epc(p: Property): + """ + Given an instance of the property class, this method will ensure that the EPC is ready for scoring with the + kwh models. In the backend, we perform some cleaning and transformation on an EPC so we just ensure that the + data is in the format required by the model + :return: + """ + + epc = p.data.copy() + numeric_cols = [ + 'current-energy-efficiency', + 'potential-energy-efficiency', 'environment-impact-current', + 'environment-impact-potential', 'energy-consumption-current', + 'energy-consumption-potential', 'co2-emissions-current', + 'co2-emiss-curr-per-floor-area', 'co2-emissions-potential', + 'lighting-cost-current', 'lighting-cost-potential', + 'heating-cost-current', 'heating-cost-potential', + 'hot-water-cost-current', 'hot-water-cost-potential', + 'total-floor-area', 'multi-glaze-proportion', + 'extension-count', 'number-habitable-rooms', 'number-heated-rooms', + 'low-energy-lighting', 'number-open-fireplaces', + 'wind-turbine-count', 'unheated-corridor-length', + 'floor-height', 'photo-supply', 'fixed-lighting-outlets-count', + 'low-energy-fixed-light-count', + ] + for v in numeric_cols: + if epc[v] is not None: + epc[v] = float(epc[v]) + + bools_to_remap = ['mains-gas-flag', 'flat-top-storey'] + bool_map = { + True: "Y", + False: "N", + None: "N", + "Y": "Y", + "N": "N" + } + for v in bools_to_remap: + epc[v] = bool_map[epc[v]] + + no_data = { + "floor-level": "NODATA!", + "floor-energy-eff": "NO DATA!" + } + for v, fill_val in no_data.items(): + if pd.isnull(epc[v]): + epc[v] = fill_val + + return epc + + def prepare_epc(self, input_properties: list[Property]): + scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties]) + scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year + scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month + + scoring_data["id"] = scoring_data["uprn"].copy() + + return scoring_data