From d58a87af01f3aed03f5bc44be5e4ed217c41fb34 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 20 Oct 2023 11:46:38 +1100 Subject: [PATCH 1/2] Integrating new sap model process into backend --- backend/Property.py | 5 +++-- backend/app/plan/router.py | 25 ++++++++++++++++++++++--- etl/epc/DataProcessor.py | 14 ++++++++++---- etl/epc/settings.py | 3 ++- 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index 2b283e36..045b6220 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -4,6 +4,7 @@ import os import pandas as pd from etl.epc.DataProcessor import DataProcessor +from etl.epc.settings import POTENTIAL_COLUMNS, EFFICIENCY_FEATURES from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map from utils.logger import setup_logger from utils.s3 import read_dataframe_from_s3_parquet @@ -603,7 +604,7 @@ class Property(Definitions): @staticmethod def _extract_component(component_data, component_rename_cols, component_drop_cols, rename_prefix=None): for k in component_rename_cols: - component_data[f"{rename_prefix}_{k}"] = component_data[k] + component_data[f"{rename_prefix}_{k}"] = component_data.get(k) component_data = { k: v for k, v in component_data.items() if k not in component_drop_cols + component_rename_cols @@ -640,7 +641,7 @@ class Property(Definitions): # We'll need to clean second heating second_heating = self.data["secondheat-description"] - epc_raw_columns = [ + epc_raw_columns = POTENTIAL_COLUMNS + EFFICIENCY_FEATURES + [ 'TRANSACTION_TYPE', 'ENERGY_TARIFF', 'PROPERTY_TYPE', diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 8c8f309b..8a7591d8 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -125,14 +125,14 @@ async def trigger_plan(body: PlanTriggerRequest): # with open("input_properties.pickle", "rb") as f: # input_properties = pickle.load(f) # - # with open("cleaned.pickle", "rb") as f: - # cleaned = pickle.load(f) + # import pickle + # with open("new_sap_dataset.pickle", "rb") as f: + # new_sap_dataset = pickle.load(f) recommendations = {} recommendations_scoring_data = [] for p in input_properties: - property_recommendations = [] # Property recommendations @@ -234,6 +234,25 @@ async def trigger_plan(body: PlanTriggerRequest): ] ) + for c in new_sap_dataset.columns: + if c in ["UPRN", "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING"]: + continue + + if (new_sap_dataset[c].dtype.name in ["int64", "float64"]) & ( + recommendations_scoring_data[c].dtype.name in ["int64", "float64"] + ): + continue + + if c == "CONSTITUENCY": + if c not in recommendations_scoring_data: + raise Exception("wtf") + continue + + unique_vals = new_sap_dataset[c].unique() + scoring_unique_vals = recommendations_scoring_data[c].unique() + if not all(x in unique_vals for x in scoring_unique_vals): + raise Exception("") + sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at) file_location = sap_change_model_api.upload_scoring_data( df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index cb3de9f4..3ef485b8 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -16,7 +16,9 @@ from etl.epc.settings import ( fill_na_map, STARTING_SUFFIX_COMPONENT_COLS, NO_SUFFIX_COMPONENT_COLS, - ENDING_SUFFIX_COMPONENT_COLS + ENDING_SUFFIX_COMPONENT_COLS, + POTENTIAL_COLUMNS, + EFFICIENCY_FEATURES, ) from recommendations.rdsap_tables import FLOOR_LEVEL_MAP @@ -203,6 +205,8 @@ class DataProcessor: # Final re-casting after data transformed and prepared coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES + for k, v in coltypes.items(): + self.data[k] = self.data[k].astype(v) self.data = self.data.astype(coltypes) self.na_remapping() @@ -504,12 +508,14 @@ class DataProcessor: raise Exception("Suffix should be one of _STARTING or _ENDING") if suffix == "_STARTING": - starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix) - fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS].copy() + starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES].copy().add_suffix(suffix) + fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS + POTENTIAL_COLUMNS].copy() return pd.concat([starting_cols, fixed_cols], axis=1) - return self.data[ENDING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix) + return self.data[ + ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES + ].copy().add_suffix(suffix) def get_fixed_features(self) -> pd.DataFrame: """ diff --git a/etl/epc/settings.py b/etl/epc/settings.py index 93b8929b..60c079a5 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -127,7 +127,6 @@ COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [ ] POTENTIAL_COLUMNS = [ - 'POTENTIAL_ENERGY_RATING', 'POTENTIAL_ENERGY_EFFICIENCY', 'ENVIRONMENT_IMPACT_POTENTIAL', 'ENERGY_CONSUMPTION_POTENTIAL', @@ -195,6 +194,8 @@ COLUMNTYPES = { 'MAINHEATCONT_DESCRIPTION': 'object', 'EXTENSION_COUNT': 'float64', 'LODGEMENT_DATE': 'object', + **dict(zip(EFFICIENCY_FEATURES, ['object', ] * len(EFFICIENCY_FEATURES))), + **dict(zip(POTENTIAL_COLUMNS, ['float64', ] * len(POTENTIAL_COLUMNS))) } # For modelling, we don't allow records with more than 100 SAP points From f6724b5ce9a79ba20a96ab0928b358514ab0e2e8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 20 Oct 2023 16:45:46 +1100 Subject: [PATCH 2/2] implementing new prediction process --- backend/app/plan/router.py | 26 ++++---------------------- backend/app/plan/utils.py | 2 ++ 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 8a7591d8..fdbf155d 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -228,30 +228,12 @@ async def trigger_plan(body: PlanTriggerRequest): ).drop(columns=["LOCAL_AUTHORITY"]) recommendations_scoring_data = DataProcessor.clean_missings_after_description_process( - recommendations_scoring_data, [ - c for c in recommendations_scoring_data.columns if - ("thermal_transmittance" in c) or ("insulation_thickness" in c) - ] + recommendations_scoring_data, + ignore_cols=[c for c in recommendations_scoring_data.columns if ("thermal_transmittance" in c) or ( + "insulation_thickness" in c) or ("ENERGY_EFF" in c)] ) - for c in new_sap_dataset.columns: - if c in ["UPRN", "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING"]: - continue - - if (new_sap_dataset[c].dtype.name in ["int64", "float64"]) & ( - recommendations_scoring_data[c].dtype.name in ["int64", "float64"] - ): - continue - - if c == "CONSTITUENCY": - if c not in recommendations_scoring_data: - raise Exception("wtf") - continue - - unique_vals = new_sap_dataset[c].unique() - scoring_unique_vals = recommendations_scoring_data[c].unique() - if not all(x in unique_vals for x in scoring_unique_vals): - raise Exception("") + recommendations_scoring_data = DataProcessor.clean_efficiency_variables(recommendations_scoring_data) sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at) file_location = sap_change_model_api.upload_scoring_data( diff --git a/backend/app/plan/utils.py b/backend/app/plan/utils.py index 2d9659e1..c06d9293 100644 --- a/backend/app/plan/utils.py +++ b/backend/app/plan/utils.py @@ -130,6 +130,7 @@ def create_recommendation_scoring_data( # insulation thickness scoring_dict["walls_thermal_transmittance_ENDING"] = recommendation["new_u_value"] scoring_dict["walls_insulation_thickness_ENDING"] = "above average" + scoring_dict["WALLS_ENERGY_EFF_ENDING"] = "Good" else: if scoring_dict["walls_thermal_transmittance_ENDING"] is None: scoring_dict["walls_thermal_transmittance_ENDING"] = get_wall_u_value( @@ -151,6 +152,7 @@ def create_recommendation_scoring_data( scoring_dict["floor_thermal_transmittance_ENDING"] = recommendation["new_u_value"] # We don't really see above average for this in the training data scoring_dict["floor_insulation_thickness_ENDING"] = "average" + scoring_dict["FLOOR_ENERGY_EFF_ENDING"] = "Good" else: if scoring_dict["floor_thermal_transmittance_ENDING"] is None: scoring_dict["floor_thermal_transmittance_ENDING"] = get_floor_u_value(