diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py index 01dcce7a..8aa0cbf8 100644 --- a/etl/bill_savings/EnergyConsumptionModel.py +++ b/etl/bill_savings/EnergyConsumptionModel.py @@ -586,7 +586,7 @@ class EnergyConsumptionModel: def estimate_new_consumption(self, current_energy_efficiency, target_efficiency, current_consumption): """ - Given then consumption_averages dataset, which is produced as a result of the data_combining.py script, + Given then consumption_averages dataset, which is produced as a result of the training_data.py script, for the energy kwh models, this function will estimate the new consumption based on the current consumption, based on the expected reduction in consumption from the current rating to the target rating. :param current_energy_efficiency: diff --git a/etl/bill_savings/KwhData.py b/etl/bill_savings/KwhData.py index ad7a375a..3c68f33f 100644 --- a/etl/bill_savings/KwhData.py +++ b/etl/bill_savings/KwhData.py @@ -1,5 +1,6 @@ import re import pandas as pd +import numpy as np from datetime import datetime from tqdm import tqdm from utils.logger import setup_logger @@ -11,6 +12,23 @@ logger = setup_logger() class KwhData: COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"] + CATEGORICAL_COLUMNS = [ + "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms", + "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", + "built-form", + "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff", + "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description", + "county", + "windows-description", "windows-energy-eff", "flat-top-storey", + "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", + "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating" + ] + + NUMERICAL_COLUMNS = [ + 'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current', + 'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency' + ] + def __init__(self, bucket): self.run_date = datetime.now().strftime("%Y-%m-%d") self.bucket = bucket @@ -18,6 +36,7 @@ class KwhData: self.consumption_data_filepath = None self.consumption_averages_filepath = None + self.model_training_data_filepath = None @staticmethod def extract_kwh_value(text: str): @@ -116,3 +135,84 @@ class KwhData: ) self.data = df + + def transform( + self, data: pd.DataFrame, cleaned, new=False, save=False + ): + """ + Given the input EPCs, this method will transform the data into a format that can be used by the model + This method can be used to transform the training data, or new epcs within the backend engine + :return: + """ + + # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features + # in anticipation of the new model + + data["lodgement-date"] = pd.to_datetime(data["lodgement-date"]) + data["lodgement-year"] = data["lodgement-date"].dt.year + data["lodgement-month"] = data["lodgement-date"].dt.month + + # For walls, roof, floor description where we have average thermal transmittance, to avoid too many + # categories + # we group them + ranges = { + "lessthan 0.1": (0, 0.1), + "0.1 - 0.3": (0.1, 0.3), + "0.3 - 0.5": (0.3, 0.5), + "morethan 0.5": (0.5, 2.5), + } + + # Generate the lookup table + thermal_transmittance_lookup_table = [] + for i in range(1, 251): + value = i / 100 + for label, (low, high) in ranges.items(): + if low < value <= high: + thermal_transmittance_lookup_table.append({"from": value, "to": label}) + break + + # Convert to DataFrame for display + thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table) + thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str) + + # Apply the lookup table to the data + for feature in ["walls-description", "roof-description", "floor-description"]: + cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]] + # Round to 2 decimal places and convert to string + cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str) + + data = data.merge( + cleaned_df, + how="left", + left_on=feature, + right_on="original_description", + ) + # We now have the thermal transmittance in the data, which we can use to group with the lookup table + data = data.merge( + thermal_transmittance_lookup_table, + how="left", + left_on="thermal_transmittance", + right_on="from", + ) + # Where "to" is populated, replace feature with to + data[feature] = np.where( + ~pd.isnull(data["to"]), + data["to"], + data[feature] + ) + data = data.drop(columns=["original_description", "thermal_transmittance", "from", "to"]) + + data[self.NUMERICAL_COLUMNS] = data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric) + data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str) + + # Create new features: + data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area'] + + if save: + self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet" + logger.info(f"Storing energy consumption dataset in s3 at {self.consumption_data_filepath}") + save_dataframe_to_s3_parquet( + bucket_name=self.bucket, + file_key=self.model_training_data_filepath, + df=data + ) diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py index 0341b885..85a403f1 100644 --- a/etl/bill_savings/data_collection.py +++ b/etl/bill_savings/data_collection.py @@ -7,7 +7,7 @@ import inspect import pandas as pd from tqdm import tqdm from bs4 import BeautifulSoup -from etl.epc.settings import EARLIEST_EPC_DATE +from training_data.epc.settings import EARLIEST_EPC_DATE from pathlib import Path import numpy as np from utils.s3 import save_pickle_to_s3 diff --git a/etl/bill_savings/training.py b/etl/bill_savings/training.py index df60298b..5d89a79e 100644 --- a/etl/bill_savings/training.py +++ b/etl/bill_savings/training.py @@ -1,7 +1,7 @@ from pprint import pprint import msgpack from utils.s3 import read_from_s3 -from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel +from training_data.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel def handler(): diff --git a/etl/bill_savings/data_combining.py b/etl/bill_savings/training_data.py similarity index 51% rename from etl/bill_savings/data_combining.py rename to etl/bill_savings/training_data.py index 970c92bf..85b53bca 100644 --- a/etl/bill_savings/data_combining.py +++ b/etl/bill_savings/training_data.py @@ -1,4 +1,6 @@ +import msgpack from etl.bill_savings.KwhData import KwhData +from utils.s3 import read_from_s3 def app(): @@ -8,5 +10,13 @@ def app(): :return: """ + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + + cleaned = msgpack.unpackb(cleaned, raw=False) + kwh_data_client = KwhData(bucket="retrofit-datalake-dev") kwh_data_client.combine() + kwh_data_client.transform(data=kwh_data_client.data, cleaned=cleaned, save=True)