From eb65ff538e1e1df8ef5d67fb10ac5475166fce65 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 8 Jul 2024 11:47:31 +0100 Subject: [PATCH] integrated scoring new data --- backend/Property.py | 3 ++- backend/app/plan/router.py | 18 +--------------- etl/bill_savings/EnergyConsumptionModel.py | 25 ++++++++++++++++------ etl/bill_savings/training.py | 3 ++- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index 76bea0a6..35c19034 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -596,7 +596,7 @@ class Property: ) self.set_energy_source() self.find_energy_sources() - self.set_current_energy_bill() + self.set_current_energy_bill(energy_consumption_client) def set_current_energy_bill(self, energy_consumption_client): """ @@ -611,6 +611,7 @@ class Property: ] for col in ["heating_kwh", "hot_water_kwh"]: scoring_df[col] = None + energy_consumption_client.data = None heating_prediction = energy_consumption_client.score_new_data( new_data=scoring_df, target="heating_kwh" diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 7c2d156b..0cf670c2 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -339,11 +339,6 @@ async def trigger_plan(body: PlanTriggerRequest): if not input_properties: return Response(status_code=204) - # TOOD: TEMP - store locally as pickle - # import pickle - # with open("input_properties.pkl", "wb") as f: - # pickle.dump(input_properties, f) - # The materials data could be cached or local so we don't need to make # consistent requests to the backend for # the same data @@ -363,21 +358,10 @@ async def trigger_plan(body: PlanTriggerRequest): "heating_kwh": f"model_directory/energy_consumption_model/heating_kwh_{dataset_version}.pkl", "hot_water_kwh": f"model_directory/energy_consumption_model/hot_water_kwh_{dataset_version}.pkl" }, + dummy_schema_path=f"model_directory/energy_consumption_model/dummy_schema_{dataset_version}.pkl", 
cleaned=cleaned ) - # Store all of these locally - # with open("temp_inputs.pkl", "wb") as f: - # pickle.dump({ - # "input_properties": input_properties, - # "materials": materials, - # "cleaned": cleaned, - # "uprn_filenames": uprn_filenames, - # "photo_supply_lookup": photo_supply_lookup, - # "floor_area_decile_thresholds": floor_area_decile_thresholds, - # "model_client": model_client - # }, f) - logger.info("Getting spatial data") for p in input_properties: p.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds, energy_consumption_client) diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py index c77001b3..59a68a56 100644 --- a/etl/bill_savings/EnergyConsumptionModel.py +++ b/etl/bill_savings/EnergyConsumptionModel.py @@ -46,7 +46,7 @@ class EnergyConsumptionModel: "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating" ] - def __init__(self, cleaned, model_paths=None, n_jobs=1): + def __init__(self, cleaned, model_paths=None, dummy_schema_path=None, n_jobs=1): self.cleaned = cleaned self.models = {} self.model_paths = model_paths or {} @@ -75,7 +75,15 @@ class EnergyConsumptionModel: if model_paths: for target, path in model_paths.items(): + # Read model self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path) + # Read dummy schema + + if dummy_schema_path: + self.dummy_schema = read_pickle_from_s3( + bucket_name="retrofit-model-directory-dev", + s3_file_name=dummy_schema_path + ) def read_dataset(self, file_path): """Reads the dataset from the specified file path.""" @@ -380,11 +388,13 @@ class EnergyConsumptionModel: bucket_name="retrofit-model-directory-dev", s3_file_name=f"model_directory/energy_consumption_model/{target}_{dataset_version}.pkl" ) + + def save_dummy_schema(self, dataset_version): logger.info("Saving dummy schema for target {target}") save_pickle_to_s3( self.dummy_schema, 
bucket_name="retrofit-model-directory-dev", - s3_file_name=f"model_directory/energy_consumption_model/{target}_{dataset_version}_dummy_schema.pkl" + s3_file_name=f"model_directory/energy_consumption_model/dummy_schema_{dataset_version}.pkl" ) def score_new_data(self, new_data, target): @@ -400,16 +410,19 @@ class EnergyConsumptionModel: self.data = new_data.copy() # Run feature engineering - # TODO: This needs to be dummied out according to the training data self.feature_engineering(drop_first=False) - # Select the transformed data - new_data_transformed = self.data[self.dummy_columns[target]].copy() + new_data_transformed = self.data.copy() + for col in self.dummy_schema: + if col not in new_data_transformed.columns: + new_data_transformed[col] = 0 + + new_data_transformed = new_data_transformed[self.dummy_schema] missed_dummies = [c for c in self.models[target].feature_names_in_ if c not in new_data_transformed.columns] zero_df = pd.DataFrame([dict(zip(missed_dummies, [0, ] * len(missed_dummies)))]) - new_data_transformed = pd.concat([new_data_transformed, zero_df], axis=1) + # When we dummy in this case, we run with drop_first = False so we may end up with some of those # first columns, we we'll need to dorp them new_data_transformed = new_data_transformed[self.models[target].feature_names_in_] diff --git a/etl/bill_savings/training.py b/etl/bill_savings/training.py index b1a939a1..df60298b 100644 --- a/etl/bill_savings/training.py +++ b/etl/bill_savings/training.py @@ -10,7 +10,7 @@ def handler(): :return: """ - dataset_version = "2024-07-05" + dataset_version = "2024-07-08" # Usage: cleaned = read_from_s3( @@ -23,6 +23,7 @@ def handler(): model = EnergyConsumptionModel(cleaned=cleaned, n_jobs=2) model.read_dataset(f'energy_consumption/{dataset_version}/energy_consumption_dataset.parquet') model.feature_engineering() + model.save_dummy_schema(dataset_version=dataset_version) # For heating_kwh model.split_dataset(target='heating_kwh')